[med-svn] [gadgetron] 01/09: Imported Upstream version 3.7.5

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Fri Apr 24 18:41:14 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository gadgetron.

commit 066e6cd22febf6f2f8e2d15fc754a08424df061b
Author: Ghislain Antony Vaillant <ghisvail at gmail.com>
Date:   Fri Apr 24 12:02:07 2015 +0100

    Imported Upstream version 3.7.5
---
 .gitignore                                         |    26 +
 CMakeLists.txt                                     |   312 +
 LICENSE                                            |    29 +
 README                                             |    19 +
 apps/CMakeLists.txt                                |     6 +
 apps/clients/CMakeLists.txt                        |     7 +
 .../gadgetron_ismrmrd_client/CMakeLists.txt        |    24 +
 .../gadgetron_ismrmrd_client.cpp                   |  1158 ++
 apps/clients/utilities/CMakeLists.txt              |    49 +
 apps/clients/utilities/DependencyQueryReader.h     |   100 +
 apps/clients/utilities/gt_alive.cpp                |    69 +
 apps/clients/utilities/gt_query.cpp                |    99 +
 apps/clients/utilities/gtquery.xml                 |    33 +
 apps/clients/utilities/isalive.xml                 |    55 +
 apps/gadgetron/CMakeLists.txt                      |   104 +
 apps/gadgetron/EndGadget.h                         |    55 +
 apps/gadgetron/Gadget.cpp                          |    50 +
 apps/gadgetron/Gadget.h                            |   683 ++
 apps/gadgetron/GadgetContainerMessage.h            |   160 +
 apps/gadgetron/GadgetMessageInterface.h            |   236 +
 apps/gadgetron/GadgetServerAcceptor.cpp            |    55 +
 apps/gadgetron/GadgetServerAcceptor.h              |    31 +
 apps/gadgetron/GadgetStreamController.cpp          |   384 +
 apps/gadgetron/GadgetStreamController.h            |    50 +
 apps/gadgetron/GadgetStreamInterface.h             |   129 +
 apps/gadgetron/GadgetronExport.h                   |    38 +
 apps/gadgetron/gadgetbase_export.h                 |    16 +
 apps/gadgetron/gadgetron.xml.example               |    14 +
 apps/gadgetron/gadgetron_config.in                 |    13 +
 apps/gadgetron/gadgetron_info.cpp                  |   193 +
 apps/gadgetron/gadgetron_paths.h                   |    76 +
 apps/gadgetron/gadgetron_start.pl                  |    31 +
 apps/gadgetron/gadgetron_xml.cpp                   |    95 +
 apps/gadgetron/gadgetron_xml.h                     |   122 +
 apps/gadgetron/main.cpp                            |   196 +
 apps/gadgetron/pugiconfig.hpp                      |    72 +
 apps/gadgetron/pugixml.cpp                         | 10639 +++++++++++++++++++
 apps/gadgetron/pugixml.hpp                         |  1332 +++
 apps/gadgetron/schema/gadgetron.xsd                |    81 +
 apps/gadgetron/upstart/gadgetron.conf              |    36 +
 apps/gadgetron/webapp/CMakeLists.txt               |    27 +
 apps/gadgetron/webapp/gadgetron_web.conf           |    15 +
 apps/gadgetron/webapp/gadgetron_web.conf.in        |    16 +
 apps/gadgetron/webapp/gadgetron_web_app.cfg        |     8 +
 apps/gadgetron/webapp/gadgetron_web_app.in         |     8 +
 apps/gadgetron/webapp/gadgetron_web_app.py         |   182 +
 apps/gadgetron/webapp/gadgetron_web_ld.conf.in     |     2 +
 apps/gadgetron/webapp/main.cpp                     |     8 +
 apps/matlab/mexGT.h                                |   580 +
 apps/standalone/CMakeLists.txt                     |    13 +
 apps/standalone/cpu/CMakeLists.txt                 |    48 +
 apps/standalone/cpu/denoising/2d/CMakeLists.txt    |    26 +
 apps/standalone/cpu/denoising/2d/denoise_TV.cpp    |   117 +
 apps/standalone/cpu/denoising/CMakeLists.txt       |     1 +
 apps/standalone/cpu/gtplus/CMakeLists.txt          |    62 +
 .../cpu/gtplus/Matlab_compute_coil_map_2D.cpp      |   136 +
 .../cpu/gtplus/Matlab_compute_coil_map_3D.cpp      |   137 +
 .../cpu/gtplus/Matlab_gt_read_analyze.cpp          |   260 +
 .../cpu/gtplus/Matlab_gt_write_analyze.cpp         |   246 +
 apps/standalone/cpu/registration/2d/CMakeLists.txt |    46 +
 .../cpu/registration/2d/Matlab_register_CK_2d.cpp  |   197 +
 .../cpu/registration/2d/register_CK_2d.cpp         |   121 +
 .../cpu/registration/2d/register_HS_2d.cpp         |   110 +
 apps/standalone/cpu/registration/3d/CMakeLists.txt |    11 +
 .../cpu/registration/3d/register_CK_3d.cpp         |   115 +
 apps/standalone/cpu/registration/CMakeLists.txt    |    12 +
 apps/standalone/gpu/CMakeLists.txt                 |    22 +
 apps/standalone/gpu/ct/CMakeLists.txt              |     3 +
 .../gpu/ct/xray/CBCT_forwards_projection.cpp       |   252 +
 .../standalone/gpu/ct/xray/CBCT_reconstruct_CG.cpp |   204 +
 .../gpu/ct/xray/CBCT_reconstruct_FDK_3d.cpp        |   143 +
 .../gpu/ct/xray/CBCT_reconstruct_FDK_4d.cpp        |   157 +
 .../gpu/ct/xray/CBCT_reconstruct_NLCG.cpp          |   194 +
 .../standalone/gpu/ct/xray/CBCT_reconstruct_SB.cpp |   281 +
 apps/standalone/gpu/ct/xray/CMakeLists.txt         |    64 +
 apps/standalone/gpu/deblurring/2d/CMakeLists.txt   |    14 +
 apps/standalone/gpu/deblurring/2d/blur_2d.cpp      |   111 +
 apps/standalone/gpu/deblurring/2d/deblur_2d_cg.cpp |   109 +
 apps/standalone/gpu/deblurring/2d/deblur_2d_sb.cpp |   129 +
 apps/standalone/gpu/deblurring/3d/CMakeLists.txt   |    13 +
 apps/standalone/gpu/deblurring/3d/blur_3d.cpp      |   113 +
 apps/standalone/gpu/deblurring/3d/deblur_3d_cg.cpp |   114 +
 apps/standalone/gpu/deblurring/3d/deblur_3d_sb.cpp |   135 +
 apps/standalone/gpu/deblurring/CMakeLists.txt      |     2 +
 apps/standalone/gpu/denoising/2d/CMakeLists.txt    |    10 +
 apps/standalone/gpu/denoising/2d/denoise_TV.cpp    |   154 +
 apps/standalone/gpu/denoising/CMakeLists.txt       |     1 +
 apps/standalone/gpu/mri/CMakeLists.txt             |     2 +
 apps/standalone/gpu/mri/nfft/2d/CMakeLists.txt     |    17 +
 apps/standalone/gpu/mri/nfft/2d/main_cg.cpp        |   141 +
 apps/standalone/gpu/mri/nfft/2d/main_nfft.cpp      |   142 +
 apps/standalone/gpu/mri/nfft/2d/main_nffth.cpp     |   145 +
 apps/standalone/gpu/mri/nfft/2d/main_sb.cpp        |   175 +
 apps/standalone/gpu/mri/nfft/CMakeLists.txt        |     2 +
 apps/standalone/gpu/mri/nfft/ms2d/CMakeLists.txt   |     9 +
 apps/standalone/gpu/mri/nfft/ms2d/nfft_main.cpp    |   148 +
 .../standalone/gpu/mri/nfft/ms2d/nffth_generic.cpp |   161 +
 apps/standalone/gpu/mri/nfft/ms2d/nffth_main.cpp   |   173 +
 apps/standalone/gpu/mri/sense/CMakeLists.txt       |     2 +
 .../gpu/mri/sense/noncartesian/CMakeLists.txt      |    11 +
 .../gpu/mri/sense/noncartesian/generic_cg.cpp      |   200 +
 .../noncartesian/radial/2d_golden_ratio/.gitignore |     1 +
 .../radial/2d_golden_ratio/CMakeLists.txt          |    11 +
 .../radial/2d_golden_ratio/main_cg.cpp             |   291 +
 .../radial/2d_golden_ratio/main_gpbb.cpp           |   286 +
 .../radial/2d_golden_ratio/main_nlcg.cpp           |   341 +
 .../radial/2d_golden_ratio/main_sbc.cpp            |   335 +
 .../radial/2d_golden_ratio_gui/CMakeLists.txt      |    31 +
 .../radial/2d_golden_ratio_gui/GLReconWidget.cpp   |   222 +
 .../radial/2d_golden_ratio_gui/GLReconWidget.h     |    56 +
 .../radial/2d_golden_ratio_gui/UIconstants.h       |    10 +
 .../radial/2d_golden_ratio_gui/main.cpp            |    19 +
 .../radialSenseAppBaseMainWidget.ui                |   572 +
 .../radialSenseAppMainWidget.cpp                   |   690 ++
 .../2d_golden_ratio_gui/radialSenseAppMainWidget.h |   134 +
 .../radial/2d_golden_ratio_gui/reconBaseWidget.ui  |   303 +
 .../radial/2d_golden_ratio_gui/reconWidget.cpp     |     7 +
 .../radial/2d_golden_ratio_gui/reconWidget.h       |    13 +
 .../radial/2d_golden_ratio_kt/CMakeLists.txt       |     5 +
 .../radial/2d_golden_ratio_kt/main.cpp             |   314 +
 .../mri/sense/noncartesian/radial/CMakeLists.txt   |    10 +
 apps/standalone/gpu/registration/2d/CMakeLists.txt |    52 +
 .../gpu/registration/2d/register_CGHS_2d.cpp       |   134 +
 .../gpu/registration/2d/register_CK_2d.cpp         |   129 +
 .../gpu/registration/2d/register_HS_2d.cpp         |   122 +
 .../gpu/registration/2d/test_reg_sense_recon.cpp   |   568 +
 apps/standalone/gpu/registration/3d/CMakeLists.txt |    12 +
 .../gpu/registration/3d/register_CK_3d.cpp         |   124 +
 apps/standalone/gpu/registration/CMakeLists.txt    |     7 +
 chroot/CMakeLists.txt                              |   120 +
 chroot/README.rst                                  |    35 +
 chroot/chroot-manual.txt                           |   156 +
 chroot/copy-cuda-lib.sh.in                         |    34 +
 chroot/copy_file_and_dependencies                  |    25 +
 chroot/create_chroot.sh                            |    85 +
 chroot/create_chroot_base.sh                       |    46 +
 chroot/create_chroot_from_base.sh                  |   117 +
 chroot/enter-chroot-env.sh.in                      |     3 +
 chroot/gadgetron-dependency-query.sh.in            |    16 +
 chroot/gadgetron_chroot.conf                       |    26 +
 chroot/gadgetron_ismrmrd_client.sh.in              |    17 +
 chroot/generate_gadgetron_root                     |    45 +
 chroot/get_dependencies_for_binary                 |    12 +
 chroot/gt_alive.sh.in                              |    15 +
 chroot/install_chroot_image.sh                     |    91 +
 chroot/make_list_of_dependencies                   |    25 +
 chroot/mount.sh                                    |    64 +
 chroot/mount_image.sh                              |    85 +
 chroot/mount_image.sh.in                           |    85 +
 chroot/run-gadgetron-dependency-query.sh           |    31 +
 chroot/run-gadgetron-dependency-query.sh.in        |    31 +
 chroot/run-gadgetron_ismrmrd_client.sh             |    32 +
 chroot/run-gadgetron_ismrmrd_client.sh.in          |    32 +
 chroot/run-gt_alive.sh                             |    49 +
 chroot/run-gt_alive.sh.in                          |    49 +
 chroot/run-siemens_to_ismrmrd.sh                   |    31 +
 chroot/run-siemens_to_ismrmrd.sh.in                |    31 +
 chroot/run-webapp.sh.in                            |    11 +
 chroot/siemens_to_ismrmrd.sh.in                    |    16 +
 chroot/start-env.sh                                |    18 +
 chroot/start-env.sh.in                             |    21 +
 chroot/start-gadgetron-from-image.sh               |    49 +
 chroot/start-gadgetron-from-image.sh.in            |    49 +
 chroot/start-gadgetron.sh.in                       |    10 +
 chroot/start-webapp.sh                             |    18 +
 chroot/start-webapp.sh.in                          |    24 +
 chroot/start.sh                                    |    23 +
 chroot/start.sh.in                                 |    26 +
 chroot/stop.sh                                     |    51 +
 chroot/umount_image.sh                             |    51 +
 chroot/unique_lines_in_file                        |    14 +
 chroot/upstart-instructions.txt                    |    10 +
 cmake/CMakeLists.txt                               |    22 +
 cmake/FindACE.cmake                                |    90 +
 cmake/FindArmadillo.cmake                          |   100 +
 cmake/FindCUDA/cuda_compute_capability.c           |    42 +
 cmake/FindCUDA_advanced.cmake                      |    40 +
 cmake/FindDCMTK.cmake                              |   175 +
 cmake/FindFFTW3.cmake                              |   114 +
 cmake/FindGLEW.cmake                               |    53 +
 cmake/FindGMatlab.cmake                            |   116 +
 cmake/FindGadgetron.cmake                          |    40 +
 cmake/FindIsmrmrd.cmake                            |    28 +
 cmake/FindMKL.cmake                                |   128 +
 cmake/FindNumPy.cmake                              |   102 +
 cmake/FindOctave.cmake                             |    84 +
 cmake/InstallLinuxDependencies.cmake               |    24 +
 cmake/InstallWinDependencies.cmake                 |   137 +
 cmake/InstallWinGadgetron.bat                      |     6 +
 cmake/cpack_options.cmake.in                       |    34 +
 cmake/cpack_options_dependency.cmake.in            |    37 +
 cmake/cpack_options_web.cmake.in                   |    38 +
 cmake/debian/postinst                              |    18 +
 cmake/debian/prerm                                 |    13 +
 cmake/debian_web/postinst                          |     7 +
 cmake/debian_web/prerm                             |     7 +
 cmake/gadgetron_cpack.cmake                        |    40 +
 cmake/gadgetron_web_cpack.cmake                    |    32 +
 doc/.gitignore                                     |     7 +
 doc/CMakeLists.txt                                 |     6 +
 doc/doxygen/CMakeLists.txt                         |     8 +
 doc/doxygen/Doxyfile.in                            |  1757 +++
 doc/website/Gadgetron.png                          |   Bin 0 -> 34116 bytes
 doc/website/index.html                             |   146 +
 .../GadgetronWindowsInstallation.ps1               |   Bin 0 -> 18112 bytes
 gadgets/.gitignore                                 |     1 +
 gadgets/CMakeLists.txt                             |    91 +
 gadgets/cartesian/CMakeLists.txt                   |    34 +
 gadgets/cartesian/CartesianToGenericGadget.cpp     |    94 +
 gadgets/cartesian/CartesianToGenericGadget.h       |    42 +
 gadgets/cartesian/gadgetron_cartesian_export.h     |    11 +
 gadgets/dicom/CMakeLists.txt                       |   105 +
 gadgets/dicom/DicomFinishGadget.cpp                |   701 ++
 gadgets/dicom/DicomFinishGadget.h                  |   414 +
 gadgets/dicom/DicomImageWriter.cpp                 |   157 +
 gadgets/dicom/DicomImageWriter.h                   |    22 +
 gadgets/dicom/dicom.xml                            |    90 +
 gadgets/dicom/gadgetron_dicom_export.h             |    15 +
 gadgets/epi/CMakeLists.txt                         |    50 +
 gadgets/epi/CutXGadget.cpp                         |    91 +
 gadgets/epi/CutXGadget.h                           |    33 +
 gadgets/epi/EPICorrGadget.cpp                      |   192 +
 gadgets/epi/EPICorrGadget.h                        |    50 +
 gadgets/epi/EPIReconXGadget.cpp                    |   131 +
 gadgets/epi/EPIReconXGadget.h                      |    40 +
 gadgets/epi/FFTXGadget.cpp                         |    27 +
 gadgets/epi/FFTXGadget.h                           |    25 +
 gadgets/epi/epi.xml                                |   124 +
 gadgets/epi/epi_gtplus_grappa.xml                  |   483 +
 gadgets/epi/gadgetron_epi_export.h                 |    14 +
 gadgets/gpu/CMakeLists.txt                         |    42 +
 gadgets/gpu/cuFFTGadget.cpp                        |    26 +
 gadgets/gpu/cuFFTGadget.h                          |    33 +
 gadgets/grappa/CMakeLists.txt                      |    75 +
 gadgets/grappa/GrappaCalibrationBuffer.cpp         |   144 +
 gadgets/grappa/GrappaCalibrationBuffer.h           |   150 +
 gadgets/grappa/GrappaGadget.cpp                    |   416 +
 gadgets/grappa/GrappaGadget.h                      |    75 +
 gadgets/grappa/GrappaUnmixingGadget.cpp            |    70 +
 gadgets/grappa/GrappaUnmixingGadget.h              |    32 +
 gadgets/grappa/GrappaWeights.cpp                   |   111 +
 gadgets/grappa/GrappaWeights.h                     |    37 +
 gadgets/grappa/GrappaWeightsCalculator.cpp         |   470 +
 gadgets/grappa/GrappaWeightsCalculator.h           |    85 +
 gadgets/grappa/config/CMakeLists.txt               |     6 +
 gadgets/grappa/config/grappa.xml                   |    88 +
 gadgets/grappa/config/grappa_float.xml             |    93 +
 gadgets/grappa/config/grappa_float_cpu.xml         |    94 +
 gadgets/grappa/config/grappa_unoptimized.xml       |    69 +
 gadgets/grappa/config/grappa_unoptimized_float.xml |    73 +
 gadgets/grappa/gadgetron_grappa_export.h           |    14 +
 gadgets/gtPlus/CMakeLists.txt                      |   177 +
 gadgets/gtPlus/GadgetCloudJobMessageReadWrite.cpp  |    11 +
 gadgets/gtPlus/GadgetCloudJobMessageReadWrite.h    |   243 +
 gadgets/gtPlus/GadgetMRIHeaders.cpp                |   262 +
 gadgets/gtPlus/GadgetMRIHeadersExt.cpp             |   428 +
 .../gtPlus/GtPlusAccumulatorImageTriggerGadget.cpp |   746 ++
 .../gtPlus/GtPlusAccumulatorImageTriggerGadget.h   |   150 +
 .../GtPlusAccumulatorWorkOrderTriggerGadget.cpp    |  2412 +++++
 .../GtPlusAccumulatorWorkOrderTriggerGadget.h      |   297 +
 gadgets/gtPlus/GtPlusGadgetExport.h                |    16 +
 gadgets/gtPlus/GtPlusGadgetImageArray.cpp          |   685 ++
 gadgets/gtPlus/GtPlusGadgetImageArray.h            |    76 +
 gadgets/gtPlus/GtPlusGadgetOpenMP.cpp              |    71 +
 gadgets/gtPlus/GtPlusGadgetOpenMP.h                |    26 +
 gadgets/gtPlus/GtPlusImageReconGadget.cpp          |   714 ++
 gadgets/gtPlus/GtPlusImageReconGadget.h            |   134 +
 gadgets/gtPlus/GtPlusRecon2DTCloudPackage.h        |   301 +
 gadgets/gtPlus/GtPlusRecon2DTGadget.cpp            |   550 +
 gadgets/gtPlus/GtPlusRecon2DTGadget.h              |    63 +
 gadgets/gtPlus/GtPlusRecon2DTGadgetCloud.cpp       |   726 ++
 gadgets/gtPlus/GtPlusRecon2DTGadgetCloud.h         |    91 +
 gadgets/gtPlus/GtPlusRecon3DTGadget.cpp            |   453 +
 gadgets/gtPlus/GtPlusRecon3DTGadget.h              |   104 +
 gadgets/gtPlus/GtPlusReconGadget.cpp               |  1909 ++++
 gadgets/gtPlus/GtPlusReconGadget.h                 |   305 +
 gadgets/gtPlus/GtPlusReconGadgetUtil.cpp           |   711 ++
 gadgets/gtPlus/GtPlusReconGadgetUtil.h             |    77 +
 gadgets/gtPlus/GtPlusReconJob2DTGadget.cpp         |   201 +
 gadgets/gtPlus/GtPlusReconJob2DTGadget.h           |   103 +
 gadgets/gtPlus/GtPlusReconJob2DTGadgetCloud.cpp    |   790 ++
 gadgets/gtPlus/GtPlusReconJob2DTGadgetCloud.h      |   181 +
 gadgets/gtPlus/GtPlusReconJob3DTGadget.cpp         |   229 +
 gadgets/gtPlus/GtPlusReconJob3DTGadget.h           |   103 +
 gadgets/gtPlus/config/GT_2DT_Cartesian.xml         |   798 ++
 .../gtPlus/config/GT_2DT_Cartesian_CloudNode.xml   |    77 +
 gadgets/gtPlus/config/GT_2DT_Cartesian_Dicom.xml   |   804 ++
 ...GT_2DT_Cartesian_DualLayer_Gateway_L1SPIRIT.xml |   804 ++
 .../GT_2DT_Cartesian_DualLayer_Gateway_SPIRIT.xml  |   798 ++
 .../GT_2DT_Cartesian_FirstLayer_CloudNode.xml      |   279 +
 gadgets/gtPlus/config/GT_2DT_Cartesian_GFactor.xml |   799 ++
 .../config/GT_2DT_Cartesian_ImageTrigger_Dicom.xml |   826 ++
 .../gtPlus/config/GT_2DT_Cartesian_L1SPIRIT.xml    |   789 ++
 ...GT_2DT_Cartesian_PseudoReplica_SNRUnitRecon.xml |   766 ++
 gadgets/gtPlus/config/GT_2DT_Cartesian_SPIRIT.xml  |   789 ++
 .../GT_2DT_Cartesian_SingleLayer_CloudNode.xml     |   279 +
 gadgets/gtPlus/config/GT_2DT_FatWater.xml          |   649 ++
 gadgets/gtPlus/config/GT_2DT_HASTE.xml             |   757 ++
 gadgets/gtPlus/config/GT_2DT_HASTE_MOCO_AVE.xml    |  1033 ++
 gadgets/gtPlus/config/GT_2DT_LGE.xml               |   654 ++
 gadgets/gtPlus/config/GT_2DT_MOLLI.xml             |   649 ++
 gadgets/gtPlus/config/GT_2DT_MOLLI_Offline.xml     |   652 ++
 gadgets/gtPlus/config/GT_2DT_Perfusion.xml         |   655 ++
 ...T_2DT_PseudoReplica_SNRUnitRecon_DataExport.xml |    68 +
 .../config/GT_2DT_RTCine_L1SPIRIT_PhysioInterp.xml |   819 ++
 ...ine_L1SPIRIT_PhysioInterp_DualLayer_Gateway.xml |   828 ++
 gadgets/gtPlus/config/GT_2DT_RealTimeCine.xml      |   736 ++
 gadgets/gtPlus/config/GT_2DT_RealTimeFlow.xml      |   718 ++
 gadgets/gtPlus/config/GT_2DT_T2W.xml               |   654 ++
 gadgets/gtPlus/config/GT_3DT_Cartesian.xml         |   802 ++
 .../gtPlus/config/GT_3DT_Cartesian_CloudNode.xml   |    82 +
 gadgets/gtPlus/config/GT_3DT_Cartesian_GFactor.xml |   682 ++
 .../gtPlus/config/GT_3DT_Cartesian_L1SPIRIT.xml    |   806 ++
 gadgets/gtPlus/config/GT_3DT_Cartesian_SPIRIT.xml  |   828 ++
 .../GT_3DT_Cartesian_SingleLayer_L1SPIRIT.xml      |   806 ++
 gadgets/gtPlus/config/gtCloud/myCloud_2DT.txt      |     8 +
 .../config/gtCloud/myCloud_2DT_DualLayer.txt       |     8 +
 .../gtCloud/myCloud_2DT_DualLayer_FirstLayer.txt   |     8 +
 gadgets/gtPlus/config/gtCloud/myCloud_3DT.txt      |    12 +
 gadgets/hyper/CMRT.xml                             |    46 +
 gadgets/hyper/CMRT3D.xml                           |    44 +
 gadgets/hyper/CMRT3DGadget.cpp                     |   265 +
 gadgets/hyper/CMRT3DGadget.h                       |    43 +
 gadgets/hyper/CMRTGadget.cpp                       |   501 +
 gadgets/hyper/CMRTGadget.h                         |    79 +
 gadgets/hyper/CMakeLists.txt                       |    70 +
 gadgets/hyper/CSIGadget.cpp                        |   330 +
 gadgets/hyper/CSIGadget.h                          |    52 +
 gadgets/hyper/NFFT2D.xml                           |    36 +
 gadgets/hyper/NFFT2DGadget.cpp                     |   365 +
 gadgets/hyper/NFFT2DGadget.h                       |    56 +
 gadgets/hyper/gadgetron_hyper_export.h             |    14 +
 gadgets/hyper/gpuCSICoilEstimationGadget.cpp       |   271 +
 gadgets/hyper/gpuCSICoilEstimationGadget.h         |    56 +
 gadgets/interventional_mri/CMakeLists.txt          |    55 +
 .../DeviceChannelSplitterGadget.cpp                |    92 +
 .../DeviceChannelSplitterGadget.h                  |    45 +
 .../gadgetron_interventional_mri_export.h          |    14 +
 gadgets/interventional_mri/grappa_device.xml       |   104 +
 gadgets/matlab/BaseBufferGadget.m                  |    69 +
 gadgets/matlab/BaseGadget.m                        |    74 +
 gadgets/matlab/CMakeLists.txt                      |    57 +
 gadgets/matlab/MatlabBufferGadget.cpp              |   114 +
 gadgets/matlab/MatlabBufferGadget.h                |   185 +
 gadgets/matlab/MatlabCommandServer.java            |   129 +
 gadgets/matlab/MatlabGadget.cpp                    |   302 +
 gadgets/matlab/MatlabGadget.h                      |   203 +
 gadgets/matlab/MatlabUtils.cpp                     |   376 +
 gadgets/matlab/MatlabUtils.h                       |    47 +
 gadgets/matlab/accumulate_and_recon.m              |    94 +
 gadgets/matlab/bufferRecon.m                       |    88 +
 gadgets/matlab/gadgetron_matlab_export.h           |    23 +
 gadgets/matlab/mask_image.m                        |    27 +
 gadgets/matlab/matlab.xml                          |    63 +
 gadgets/matlab/matlabbuffer.xml                    |    71 +
 gadgets/matlab/matlabnoncartesian.xml              |    71 +
 gadgets/matlab/recon.m                             |    87 +
 gadgets/matlab/scale.m                             |    21 +
 gadgets/matlab/trajectoryScale.m                   |    69 +
 gadgets/moco/CMakeLists.txt                        |    94 +
 gadgets/moco/RegistrationAveragingGadget.h         |   328 +
 gadgets/moco/RegistrationScatteringGadget.h        |   375 +
 gadgets/moco/config/CMakeLists.txt                 |    13 +
 gadgets/moco/config/cpureg_cartesian_averaging.xml |   120 +
 gadgets/moco/config/gpureg_cartesian_averaging.xml |   120 +
 gadgets/moco/cpuRegistrationAveragingGadget.cpp    |    44 +
 gadgets/moco/cpuRegistrationAveragingGadget.h      |    25 +
 gadgets/moco/gadgetron_moco_export.h               |    14 +
 gadgets/moco/gpuRegistrationAveragingGadget.cpp    |    50 +
 gadgets/moco/gpuRegistrationAveragingGadget.h      |    26 +
 gadgets/moco/gpuRegistrationScatteringGadget.cpp   |    57 +
 gadgets/moco/gpuRegistrationScatteringGadget.h     |    26 +
 gadgets/mri_core/AccumulatorGadget.cpp             |   187 +
 gadgets/mri_core/AccumulatorGadget.h               |    38 +
 .../AcquisitionAccumulateTriggerGadget.cpp         |   403 +
 .../mri_core/AcquisitionAccumulateTriggerGadget.h  |    88 +
 gadgets/mri_core/AcquisitionFinishGadget.cpp       |    25 +
 gadgets/mri_core/AcquisitionFinishGadget.h         |    26 +
 gadgets/mri_core/AcquisitionPassthroughGadget.cpp  |    21 +
 gadgets/mri_core/AcquisitionPassthroughGadget.h    |    24 +
 gadgets/mri_core/AsymmetricEchoAdjustROGadget.cpp  |   138 +
 gadgets/mri_core/AsymmetricEchoAdjustROGadget.h    |    32 +
 gadgets/mri_core/AutoScaleGadget.cpp               |    81 +
 gadgets/mri_core/AutoScaleGadget.h                 |    35 +
 gadgets/mri_core/BucketToBufferGadget.cpp          |   622 ++
 gadgets/mri_core/BucketToBufferGadget.h            |    77 +
 gadgets/mri_core/CMakeLists.txt                    |   163 +
 gadgets/mri_core/CoilReductionGadget.cpp           |   122 +
 gadgets/mri_core/CoilReductionGadget.h             |    35 +
 gadgets/mri_core/CombineGadget.cpp                 |    69 +
 gadgets/mri_core/CombineGadget.h                   |    27 +
 gadgets/mri_core/ComplexToFloatGadget.cpp          |    90 +
 gadgets/mri_core/ComplexToFloatGadget.h            |    34 +
 gadgets/mri_core/CplxDumpGadget.cpp                |   137 +
 gadgets/mri_core/CplxDumpGadget.h                  |    34 +
 gadgets/mri_core/CropAndCombineGadget.cpp          |    70 +
 gadgets/mri_core/CropAndCombineGadget.h            |    25 +
 gadgets/mri_core/DependencyQueryGadget.cpp         |   194 +
 gadgets/mri_core/DependencyQueryGadget.h           |    57 +
 gadgets/mri_core/DependencyQueryWriter.cpp         |    70 +
 gadgets/mri_core/DependencyQueryWriter.h           |    28 +
 gadgets/mri_core/ExtractGadget.cpp                 |   117 +
 gadgets/mri_core/ExtractGadget.h                   |    64 +
 gadgets/mri_core/FFTGadget.cpp                     |   112 +
 gadgets/mri_core/FFTGadget.h                       |    26 +
 gadgets/mri_core/FloatToFixPointGadget.cpp         |   206 +
 gadgets/mri_core/FloatToFixPointGadget.h           |    84 +
 gadgets/mri_core/FlowPhaseSubtractionGadget.cpp    |   149 +
 gadgets/mri_core/FlowPhaseSubtractionGadget.h      |    38 +
 gadgets/mri_core/GadgetIsmrmrdReadWrite.cpp        |     6 +
 gadgets/mri_core/GadgetIsmrmrdReadWrite.h          |   155 +
 gadgets/mri_core/GadgetMRIHeaders.h                |   136 +
 gadgets/mri_core/ImageArraySplitGadget.cpp         |    81 +
 gadgets/mri_core/ImageArraySplitGadget.h           |    23 +
 gadgets/mri_core/ImageFinishGadget.cpp             |    31 +
 gadgets/mri_core/ImageFinishGadget.h               |    22 +
 gadgets/mri_core/ImageWriterGadget.cpp             |    52 +
 gadgets/mri_core/ImageWriterGadget.h               |    50 +
 gadgets/mri_core/IsmrmrdDumpGadget.cpp             |   136 +
 gadgets/mri_core/IsmrmrdDumpGadget.h               |    39 +
 gadgets/mri_core/MRIImageAttribWriter.cpp          |   149 +
 gadgets/mri_core/MRIImageAttribWriter.h            |   122 +
 gadgets/mri_core/MRIImageWriter.cpp                |   149 +
 gadgets/mri_core/MRIImageWriter.h                  |   122 +
 gadgets/mri_core/MaxwellCorrectionGadget.cpp       |   139 +
 gadgets/mri_core/MaxwellCorrectionGadget.h         |    35 +
 gadgets/mri_core/NoiseAdjustGadget.cpp             |   468 +
 gadgets/mri_core/NoiseAdjustGadget.h               |    74 +
 gadgets/mri_core/NoiseAdjustGadget_unoptimized.cpp |   219 +
 gadgets/mri_core/NoiseAdjustGadget_unoptimized.h   |    35 +
 gadgets/mri_core/PCACoilGadget.cpp                 |   300 +
 gadgets/mri_core/PCACoilGadget.h                   |    49 +
 gadgets/mri_core/PartialFourierAdjustROGadget.cpp  |   134 +
 gadgets/mri_core/PartialFourierAdjustROGadget.h    |    30 +
 gadgets/mri_core/PhysioInterpolationGadget.cpp     |   411 +
 gadgets/mri_core/PhysioInterpolationGadget.h       |    59 +
 gadgets/mri_core/RemoveROOversamplingGadget.cpp    |   141 +
 gadgets/mri_core/RemoveROOversamplingGadget.h      |    43 +
 gadgets/mri_core/SimpleReconGadget.cpp             |   154 +
 gadgets/mri_core/SimpleReconGadget.h               |    23 +
 gadgets/mri_core/Spline.h                          |   129 +
 gadgets/mri_core/WhiteNoiseInjectorGadget.cpp      |   195 +
 gadgets/mri_core/WhiteNoiseInjectorGadget.h        |    65 +
 gadgets/mri_core/default.xml                       |    80 +
 .../mri_core/default_measurement_dependencies.xml  |    34 +
 gadgets/mri_core/default_optimized.xml             |   123 +
 gadgets/mri_core/default_short.xml                 |    92 +
 gadgets/mri_core/gadgetron_mricore_export.h        |    14 +
 gadgets/mri_core/ismrmrd_dump.xml                  |    26 +
 gadgets/pmri/CMakeLists.txt                        |    76 +
 gadgets/pmri/GenericReconJob.h                     |    27 +
 gadgets/pmri/config/CMakeLists.txt                 |    16 +
 .../pmri/config/generic_gpu_ktsense_singleshot.xml |    97 +
 gadgets/pmri/config/generic_gpusense_cg.xml        |    95 +
 .../pmri/config/generic_gpusense_cg_singleshot.xml |    97 +
 .../config/generic_gpusense_nlcg_singleshot.xml    |    98 +
 .../pmri/config/generic_gpusense_sb_singleshot.xml |   101 +
 gadgets/pmri/gadgetron_gpupmri_export.h            |    14 +
 gadgets/pmri/gpuBufferSensePrepGadget.cpp          |   308 +
 gadgets/pmri/gpuBufferSensePrepGadget.h            |    47 +
 gadgets/pmri/gpuCgKtSenseGadget.cpp                |   365 +
 gadgets/pmri/gpuCgKtSenseGadget.h                  |    82 +
 gadgets/pmri/gpuCgSenseGadget.cpp                  |   245 +
 gadgets/pmri/gpuCgSenseGadget.h                    |    60 +
 gadgets/pmri/gpuCgSpiritGadget.cpp                 |   255 +
 gadgets/pmri/gpuCgSpiritGadget.h                   |    63 +
 gadgets/pmri/gpuGenericSensePrepGadget.cpp         |   933 ++
 gadgets/pmri/gpuGenericSensePrepGadget.h           |   143 +
 gadgets/pmri/gpuLALMSenseGadget.cpp                |   259 +
 gadgets/pmri/gpuLALMSenseGadget.h                  |    86 +
 gadgets/pmri/gpuNlcgSenseGadget.cpp                |   374 +
 gadgets/pmri/gpuNlcgSenseGadget.h                  |    94 +
 gadgets/pmri/gpuOsSenseGadget.cpp                  |   278 +
 gadgets/pmri/gpuOsSenseGadget.h                    |    82 +
 gadgets/pmri/gpuSbSenseGadget.cpp                  |   364 +
 gadgets/pmri/gpuSbSenseGadget.h                    |    89 +
 gadgets/pmri/gpuSenseGadget.cpp                    |   150 +
 gadgets/pmri/gpuSenseGadget.h                      |    58 +
 gadgets/python/CMakeLists.txt                      |    75 +
 .../GadgetInstrumentationStreamController.cpp      |   321 +
 .../python/GadgetInstrumentationStreamController.h |   144 +
 gadgets/python/GadgetReference.cpp                 |   109 +
 gadgets/python/GadgetReference.h                   |    36 +
 gadgets/python/GadgetronPythonMRI.cpp              |    37 +
 gadgets/python/PythonGadget.cpp                    |    71 +
 gadgets/python/PythonGadget.h                      |   209 +
 gadgets/python/config/CMakeLists.txt               |     2 +
 gadgets/python/config/python.xml                   |    66 +
 gadgets/python/config/python_short.xml             |    69 +
 gadgets/python/config/python_tpat_snr_scale.xml    |    71 +
 gadgets/python/examples/mixed_gadgets.py           |    67 +
 gadgets/python/examples/mixed_gadgets_gpu.py       |   181 +
 gadgets/python/examples/pure_python_demo.py        |    69 +
 gadgets/python/gadgetronpython_export.h            |    14 +
 gadgets/python/gadgets/CMakeLists.txt              |     8 +
 gadgets/python/gadgets/accumulate_and_recon.py     |    64 +
 gadgets/python/gadgets/gadgetron.py                |   177 +
 gadgets/python/gadgets/image_viewer.py             |    70 +
 gadgets/python/gadgets/remove_2x_oversampling.py   |    22 +
 gadgets/python/gadgets/rms_coil_combine.py         |    13 +
 gadgets/python/gadgets/tpat_snr_scale.py           |   281 +
 gadgets/python/kspaceandimage.py                   |    20 +
 gadgets/python/utils/CMakeLists.txt                |     3 +
 gadgets/python/utils/gadgetron_python_to_xml.py    |    83 +
 gadgets/python/utils/gadgetron_run_python_chain.py |    77 +
 gadgets/python/utils/gadgetron_xml_to_python.py    |    70 +
 gadgets/radial/CMakeLists.txt                      |    61 +
 gadgets/radial/RadialPhaseCorrectionGadget.cpp     |   310 +
 gadgets/radial/RadialPhaseCorrectionGadget.h       |    47 +
 gadgets/radial/config/CMakeLists.txt               |    33 +
 .../config/fixed_radial_mode0_gpu_ktsense.xml      |   131 +
 .../config/fixed_radial_mode0_gpusense_cg.xml      |   127 +
 .../fixed_radial_mode0_gpusense_cg_unoptimized.xml |   114 +
 .../config/fixed_radial_mode0_gpusense_sb.xml      |   137 +
 .../fixed_radial_mode0_gpusense_sb_unoptimized.xml |   124 +
 .../radial/config/fixed_radial_mode0_realtime.xml  |   123 +
 .../config/fixed_radial_mode1_gpu_ktsense.xml      |   131 +
 .../config/fixed_radial_mode1_gpusense_cg.xml      |   127 +
 .../fixed_radial_mode1_gpusense_cg_unoptimized.xml |   114 +
 .../config/fixed_radial_mode1_gpusense_sb.xml      |   137 +
 .../fixed_radial_mode1_gpusense_sb_unoptimized.xml |   124 +
 .../radial/config/fixed_radial_mode1_realtime.xml  |   123 +
 .../config/golden_radial_mode2_gpu_ktsense.xml     |   133 +
 .../config/golden_radial_mode2_gpusense_cg.xml     |   129 +
 ...golden_radial_mode2_gpusense_cg_unoptimized.xml |   116 +
 .../config/golden_radial_mode2_gpusense_nlcg.xml   |   132 +
 ...lden_radial_mode2_gpusense_nlcg_unoptimized.xml |   120 +
 .../config/golden_radial_mode2_gpusense_sb.xml     |   139 +
 ...golden_radial_mode2_gpusense_sb_unoptimized.xml |   126 +
 .../radial/config/golden_radial_mode2_realtime.xml |   124 +
 .../config/golden_radial_mode3_gpusense_cg.xml     |   129 +
 .../config/golden_radial_mode3_gpusense_sb.xml     |   139 +
 .../config/golden_radial_mode3_os_realtime.xml     |   162 +
 gadgets/radial/config/spirit.xml                   |    96 +
 gadgets/radial/gadgetron_radial_export.h           |    14 +
 gadgets/radial/gpuRadialPrepGadget.cpp             |   939 ++
 gadgets/radial/gpuRadialPrepGadget.h               |   222 +
 gadgets/radial/gpuRadialSensePrepGadget.cpp        |    89 +
 gadgets/radial/gpuRadialSensePrepGadget.h          |    31 +
 gadgets/radial/gpuRadialSpiritPrepGadget.cpp       |    98 +
 gadgets/radial/gpuRadialSpiritPrepGadget.h         |    33 +
 gadgets/radial/gpuRetroGatedSensePrepGadget.cpp    |   870 ++
 gadgets/radial/gpuRetroGatedSensePrepGadget.h      |   148 +
 gadgets/spiral/CMakeLists.txt                      |    45 +
 gadgets/spiral/SpiralToGenericGadget.cpp           |   218 +
 gadgets/spiral/SpiralToGenericGadget.h             |    50 +
 gadgets/spiral/config/CMakeLists.txt               |    16 +
 .../config/spiral_flow_generic_gpusense_cg.xml     |   131 +
 .../config/spiral_flow_generic_gpusense_sb.xml     |   141 +
 gadgets/spiral/config/spiral_flow_gpusense_cg.xml  |   123 +
 .../spiral/config/spiral_flow_gpusense_cg_ecg.xml  |   131 +
 .../config/spiral_flow_gpusense_cg_unoptimized.xml |   105 +
 gadgets/spiral/config/spiral_flow_gpusense_sb.xml  |   132 +
 .../config/spiral_flow_gpusense_sb_unoptimized.xml |   111 +
 gadgets/spiral/config/spiral_interactive.xml       |   106 +
 gadgets/spiral/gadgetron_spiral_export.h           |    11 +
 gadgets/spiral/gpuSpiralSensePrepGadget.cpp        |   687 ++
 gadgets/spiral/gpuSpiralSensePrepGadget.h          |   100 +
 gadgets/spiral/vds.cpp                             |   495 +
 gadgets/spiral/vds.h                               |    15 +
 gadgets/util/CMakeLists.txt                        |    28 +
 gadgets/util/ParameterRelayGadget.cpp              |    18 +
 gadgets/util/ParameterRelayGadget.h                |    18 +
 gadgets/util/gadgetron_util_gadgets_export.h       |    14 +
 test/CMakeLists.txt                                |    79 +
 test/cuNDArray_Vector_td_test.cpp                  |    50 +
 test/cuNDArray_blas_test.cpp                       |   156 +
 test/cuNDArray_elemwise_test.cpp                   |   379 +
 test/cuNDArray_operators_test.cpp                  |   243 +
 test/cuNDArray_test.cpp                            |    83 +
 test/cuNDArray_utils_test.cpp                      |   241 +
 test/cuNDFFT_test.cpp                              |    47 +
 test/cuVector_td_test_kernels.cu                   |   237 +
 test/cuVector_td_test_kernels.h                    |    18 +
 test/hoCuGTBLAS_test.cpp                           |    80 +
 test/hoCuNDArray_elemwise_test.cpp                 |   144 +
 test/hoNDArray_blas_test.cpp                       |   144 +
 test/hoNDArray_elemwise_test.cpp                   |   618 ++
 test/hoNDArray_operators_test.cpp                  |   249 +
 test/hoNDArray_utils_test.cpp                      |   186 +
 test/hoNDFFT_test.cpp                              |    46 +
 test/integration/.gitignore                        |     8 +
 test/integration/CMakeLists.txt                    |    20 +
 test/integration/cases/cpu_grappa_simple.cfg       |    31 +
 test/integration/cases/epi_2d.cfg                  |    30 +
 .../cases/gpu_fixed_radial_mode1_cg.cfg            |    30 +
 .../cases/gpu_fixed_radial_mode1_ktsense.cfg       |    30 +
 .../cases/gpu_fixed_radial_mode1_realtime.cfg      |    30 +
 .../cases/gpu_golden_radial_mode2_cg.cfg           |    30 +
 .../cases/gpu_golden_radial_mode2_ktsense.cfg      |    30 +
 .../cases/gpu_golden_radial_mode2_realtime.cfg     |    30 +
 .../cases/gpu_golden_radial_mode2_sb.cfg           |    30 +
 test/integration/cases/gpu_grappa_simple.cfg       |    31 +
 test/integration/cases/gpu_spiral.cfg              |    30 +
 test/integration/cases/gpu_spiral_sb.cfg           |    30 +
 test/integration/cases/gtplus_3D_head.cfg          |    31 +
 test/integration/cases/gtplus_FatWater.cfg         |    32 +
 test/integration/cases/gtplus_FetalHASTE.cfg       |    31 +
 test/integration/cases/gtplus_LGE.cfg              |    31 +
 test/integration/cases/gtplus_Perfusion.cfg        |    31 +
 test/integration/cases/gtplus_T2W.cfg              |    31 +
 test/integration/cases/gtplus_localizer.cfg        |    31 +
 test/integration/cases/gtplus_molli.cfg            |    31 +
 test/integration/cases/gtplus_real_time_cine.cfg   |    31 +
 .../cases/gtplus_real_time_cine_9slices.cfg        |    31 +
 test/integration/cases/gtplus_sasha.cfg            |    31 +
 .../cases/gtplus_snr_unit_recon_builtin_noise.cfg  |    31 +
 .../cases/gtplus_snr_unit_recon_ipat4.cfg          |    31 +
 .../gtplus_snr_unit_recon_prospective_cine.cfg     |    31 +
 .../cases/gtplus_snr_unit_recon_spat2_asym_pf.cfg  |    31 +
 .../cases/gtplus_snr_unit_recon_spat3.cfg          |    31 +
 .../cases/gtplus_snr_unit_recon_tpat3.cfg          |    31 +
 test/integration/cases/simple_gre.cfg              |    30 +
 test/integration/cases/simple_gre_3d.cfg           |    32 +
 test/integration/cases/simple_gre_python.cfg       |    30 +
 test/integration/data.txt                          |    68 +
 test/integration/get_data.py                       |    68 +
 test/integration/run_all_tests.py                  |    71 +
 test/integration/run_gadgetron_test.py             |   425 +
 test/tests.cpp                                     |    14 +
 test/unit/run_unit_tests.py                        |    62 +
 test/vector_td_test.cpp                            |   141 +
 toolboxes/CMakeLists.txt                           |    44 +
 toolboxes/cloudbus/CMakeLists.txt                  |    33 +
 toolboxes/cloudbus/CloudBus.cpp                    |   248 +
 toolboxes/cloudbus/CloudBus.h                      |   117 +
 toolboxes/cloudbus/cloudbus_export.h               |    14 +
 toolboxes/cloudbus/cloudbus_main.cpp               |    36 +
 toolboxes/core/CMakeLists.txt                      |    25 +
 toolboxes/core/GadgetronException.h                |    33 +
 toolboxes/core/GadgetronTimer.h                    |   110 +
 toolboxes/core/Gadgetron_enable_types.h            |    12 +
 toolboxes/core/NDArray.h                           |   802 ++
 toolboxes/core/complext.h                          |   320 +
 toolboxes/core/core_defines.h.in                   |    28 +
 toolboxes/core/cpu/CMakeLists.txt                  |    92 +
 toolboxes/core/cpu/algorithm/hoNDBSpline.h         |   191 +
 toolboxes/core/cpu/algorithm/hoNDBSpline.hxx       |  2133 ++++
 toolboxes/core/cpu/cpucore_export.h                |    18 +
 toolboxes/core/cpu/dummy.cpp                       |    18 +
 toolboxes/core/cpu/gadgetronmath.h                 |    26 +
 toolboxes/core/cpu/ho2DArray.h                     |    83 +
 toolboxes/core/cpu/ho2DArray.hxx                   |   261 +
 toolboxes/core/cpu/ho3DArray.h                     |    54 +
 toolboxes/core/cpu/ho3DArray.hxx                   |   287 +
 toolboxes/core/cpu/ho4DArray.h                     |    54 +
 toolboxes/core/cpu/ho4DArray.hxx                   |   313 +
 toolboxes/core/cpu/ho5DArray.h                     |    54 +
 toolboxes/core/cpu/ho5DArray.hxx                   |   345 +
 toolboxes/core/cpu/ho6DArray.h                     |    54 +
 toolboxes/core/cpu/ho6DArray.hxx                   |   392 +
 toolboxes/core/cpu/ho7DArray.h                     |    54 +
 toolboxes/core/cpu/ho7DArray.hxx                   |   427 +
 toolboxes/core/cpu/hoMatrix.cpp                    |   416 +
 toolboxes/core/cpu/hoMatrix.h                      |   109 +
 toolboxes/core/cpu/hoMatrix.hxx                    |   732 ++
 toolboxes/core/cpu/hoNDArray.h                     |   212 +
 toolboxes/core/cpu/hoNDArray.hxx                   |  1029 ++
 toolboxes/core/cpu/hoNDArray_fileio.h              |    66 +
 toolboxes/core/cpu/hoNDArray_utils.h               |   763 ++
 toolboxes/core/cpu/hoNDBoundaryHandler.h           |   276 +
 toolboxes/core/cpu/hoNDBoundaryHandler.hxx         |   497 +
 toolboxes/core/cpu/hoNDInterpolator.h              |   307 +
 toolboxes/core/cpu/hoNDInterpolatorBSpline.hxx     |   339 +
 toolboxes/core/cpu/hoNDInterpolatorLinear.hxx      |   874 ++
 .../core/cpu/hoNDInterpolatorNearestNeighbor.hxx   |    94 +
 toolboxes/core/cpu/hoNDObjectArray.h               |   200 +
 toolboxes/core/cpu/hoNDPoint.h                     |   337 +
 toolboxes/core/cpu/hostutils/CMakeLists.txt        |    20 +
 toolboxes/core/cpu/hostutils/FileInfo.h            |    54 +
 toolboxes/core/cpu/hostutils/hostutils_export.h    |    22 +
 toolboxes/core/cpu/hostutils/parameterparser.cpp   |   330 +
 toolboxes/core/cpu/hostutils/parameterparser.h     |    81 +
 toolboxes/core/cpu/hostutils/url_encode.h          |    47 +
 toolboxes/core/cpu/image/hoNDImage.h               |   513 +
 toolboxes/core/cpu/image/hoNDImage.hxx             |  2955 +++++
 toolboxes/core/cpu/image/hoNDImageAttrib.h         |   329 +
 toolboxes/core/cpu/image/hoNDImageContainer2D.h    |  1223 +++
 toolboxes/core/cpu/math/CMakeLists.txt             |    78 +
 toolboxes/core/cpu/math/cpucore_math_export.h      |    18 +
 toolboxes/core/cpu/math/hoArmadillo.h              |    89 +
 toolboxes/core/cpu/math/hoNDArray_elemwise.cpp     |  2936 +++++
 toolboxes/core/cpu/math/hoNDArray_elemwise.h       |   694 ++
 toolboxes/core/cpu/math/hoNDArray_linalg.cpp       |  1970 ++++
 toolboxes/core/cpu/math/hoNDArray_linalg.h         |    90 +
 toolboxes/core/cpu/math/hoNDArray_math.h           |     4 +
 toolboxes/core/cpu/math/hoNDArray_math_util.cpp    |  2178 ++++
 toolboxes/core/cpu/math/hoNDArray_math_util.h      |    27 +
 toolboxes/core/cpu/math/hoNDArray_reductions.cpp   |   939 ++
 toolboxes/core/cpu/math/hoNDArray_reductions.h     |   203 +
 toolboxes/core/cpu/math/hoNDImage_util.cpp         |   877 ++
 toolboxes/core/cpu/math/hoNDImage_util.h           |    75 +
 toolboxes/core/cpu/math/hoNDImage_util.hxx         |  1022 ++
 .../core/cpu/math/hoNDImage_util_instantiate.hxx   |    15 +
 toolboxes/core/gpu/CMakeLists.txt                  |    87 +
 toolboxes/core/gpu/CUBLASContextProvider.cpp       |   115 +
 toolboxes/core/gpu/CUBLASContextProvider.h         |    35 +
 toolboxes/core/gpu/GPUTimer.h                      |    73 +
 toolboxes/core/gpu/GadgetronCuException.h          |    15 +
 toolboxes/core/gpu/check_CUDA.h                    |    40 +
 toolboxes/core/gpu/cuNDArray.h                     |   878 ++
 toolboxes/core/gpu/cuNDArray_blas.cu               |   312 +
 toolboxes/core/gpu/cuNDArray_blas.h                |    51 +
 toolboxes/core/gpu/cuNDArray_elemwise.cu           |   703 ++
 toolboxes/core/gpu/cuNDArray_elemwise.h            |   255 +
 toolboxes/core/gpu/cuNDArray_fileio.h              |    10 +
 toolboxes/core/gpu/cuNDArray_kernels.cu            |   179 +
 toolboxes/core/gpu/cuNDArray_math.h                |     7 +
 toolboxes/core/gpu/cuNDArray_operators.cu          |   238 +
 toolboxes/core/gpu/cuNDArray_operators.h           |   168 +
 toolboxes/core/gpu/cuNDArray_reductions.cu         |   103 +
 toolboxes/core/gpu/cuNDArray_reductions.h          |    15 +
 toolboxes/core/gpu/cuNDArray_utils.cu              |   963 ++
 toolboxes/core/gpu/cuNDArray_utils.h               |   128 +
 toolboxes/core/gpu/cuSparseMatrix.cu               |   113 +
 toolboxes/core/gpu/cuSparseMatrix.h                |    49 +
 toolboxes/core/gpu/cudaDeviceManager.cpp           |   264 +
 toolboxes/core/gpu/cudaDeviceManager.h             |    89 +
 toolboxes/core/gpu/gpucore_export.h                |    18 +
 toolboxes/core/gpu/hoCuNDArray.h                   |   171 +
 toolboxes/core/gpu/hoCuNDArray_blas.cpp            |   260 +
 toolboxes/core/gpu/hoCuNDArray_blas.h              |    32 +
 toolboxes/core/gpu/hoCuNDArray_elemwise.h          |     8 +
 toolboxes/core/gpu/hoCuNDArray_math.h              |     5 +
 toolboxes/core/gpu/hoCuNDArray_utils.h             |    18 +
 toolboxes/core/gpu/radial_utilities.cu             |   481 +
 toolboxes/core/gpu/radial_utilities.h              |    41 +
 toolboxes/core/gpu/real_utilities_device.h         |    22 +
 toolboxes/core/gpu/setup_grid.h                    |    36 +
 toolboxes/core/real_utilities.h                    |    72 +
 toolboxes/core/vector_td.h                         |   317 +
 toolboxes/core/vector_td_io.h                      |    49 +
 toolboxes/core/vector_td_operators.h               |   435 +
 toolboxes/core/vector_td_utilities.h               |   495 +
 toolboxes/ct/CMakeLists.txt                        |     3 +
 toolboxes/ct/xray/CMakeLists.txt                   |     3 +
 toolboxes/ct/xray/gpu/CBCT_acquisition.h           |   298 +
 toolboxes/ct/xray/gpu/CBCT_binning.h               |   166 +
 toolboxes/ct/xray/gpu/CMakeLists.txt               |    49 +
 toolboxes/ct/xray/gpu/conebeam_projection.cu       |  1151 ++
 toolboxes/ct/xray/gpu/conebeam_projection.h        |    76 +
 toolboxes/ct/xray/gpu/float3x3.h                   |    66 +
 toolboxes/ct/xray/gpu/gpuxray_export.h             |    19 +
 .../ct/xray/gpu/hoCuConebeamProjectionOperator.cpp |   261 +
 .../ct/xray/gpu/hoCuConebeamProjectionOperator.h   |   147 +
 toolboxes/dwt/CMakeLists.txt                       |     3 +
 toolboxes/dwt/gpu/CMakeLists.txt                   |    37 +
 toolboxes/dwt/gpu/cuDWTOperator.h                  |   135 +
 toolboxes/dwt/gpu/cuHaarWaveletOperator.cu         |   365 +
 toolboxes/dwt/gpu/cuHaarWaveletOperator.h          |    31 +
 toolboxes/dwt/gpu/cuNDDWT.cu                       |   205 +
 toolboxes/dwt/gpu/cuNDDWT.h                        |    27 +
 toolboxes/fft/CMakeLists.txt                       |    11 +
 toolboxes/fft/cpu/CMakeLists.txt                   |    47 +
 toolboxes/fft/cpu/cpufft_export.h                  |    18 +
 toolboxes/fft/cpu/hoNDFFT.cpp                      |  1447 +++
 toolboxes/fft/cpu/hoNDFFT.h                        |   282 +
 toolboxes/fft/gpu/CMakeLists.txt                   |    35 +
 toolboxes/fft/gpu/cuNDFFT.cpp                      |   171 +
 toolboxes/fft/gpu/cuNDFFT.cu                       |    46 +
 toolboxes/fft/gpu/cuNDFFT.h                        |    48 +
 toolboxes/fft/gpu/gpufft_export.h                  |    18 +
 toolboxes/gadgettools/CMakeLists.txt               |    56 +
 toolboxes/gadgettools/GadgetCloudController.h      |   712 ++
 toolboxes/gadgettools/GadgetronCloudConnector.h    |   580 +
 toolboxes/gadgettools/GadgetronConnector.cpp       |   208 +
 toolboxes/gadgettools/GadgetronConnector.h         |   157 +
 toolboxes/gadgettools/GadgetronOSUtil.cpp          |    50 +
 toolboxes/gadgettools/GadgetronOSUtil.h            |    21 +
 toolboxes/gadgettools/GadgetronSlotContainer.h     |    53 +
 toolboxes/gadgettools/demo.xml                     |    43 +
 toolboxes/gadgettools/gadgettools_export.h         |    20 +
 toolboxes/gadgettools/ismrmrd/CMakeLists.txt       |     4 +
 .../gadgettools/ismrmrd/GadgetImageMessageReader.h |   196 +
 .../gadgettools/ismrmrd/GadgetImageMessageWriter.h |    78 +
 toolboxes/gadgettools/schema/gadgetron.xsd         |    54 +
 toolboxes/gadgettools/test_gadget_xml.cpp          |    32 +
 toolboxes/gtplus/CMakeLists.txt                    |   236 +
 toolboxes/gtplus/GtPlusExport.h                    |    16 +
 toolboxes/gtplus/GtPlusIOExport.h                  |    16 +
 .../FreeFormDeformation/gtplusBSplineFFD.h         |   820 ++
 .../FreeFormDeformation/gtplusBSplineFFD2D.h       |   597 ++
 .../FreeFormDeformation/gtplusBSplineFFD3D.h       |   740 ++
 .../FreeFormDeformation/gtplusBSplineFFD4D.h       |   905 ++
 .../algorithm/FreeFormDeformation/gtplusFFDBase.h  |  1978 ++++
 .../algorithm/FreeFormDeformation/gtplusMLFFD.h    |   436 +
 toolboxes/gtplus/algorithm/gtPlusAlgorithmBase.h   |    70 +
 .../gtplus/algorithm/gtPlusDataFidelityOperator.h  |   161 +
 toolboxes/gtplus/algorithm/gtPlusGRAPPA.h          |  1016 ++
 toolboxes/gtplus/algorithm/gtPlusOperator.h        |   284 +
 toolboxes/gtplus/algorithm/gtPlusSPIRIT.h          |  1258 +++
 .../gtplus/algorithm/gtPlusSPIRIT2DOperator.h      |   239 +
 .../gtplus/algorithm/gtPlusSPIRIT2DTOperator.h     |   353 +
 .../gtplus/algorithm/gtPlusSPIRIT3DOperator.h      |    98 +
 .../algorithm/gtPlusSPIRITNoNullSpace2DOperator.h  |    68 +
 .../algorithm/gtPlusSPIRITNoNullSpace2DTOperator.h |   285 +
 .../algorithm/gtPlusSPIRITNoNullSpace3DOperator.h  |    64 +
 .../algorithm/gtPlusSPIRITNoNullSpaceOperator.h    |   130 +
 toolboxes/gtplus/algorithm/gtPlusSPIRITOperator.h  |   396 +
 .../gtplus/algorithm/gtPlusWavelet2DOperator.h     |   374 +
 .../gtplus/algorithm/gtPlusWavelet3DOperator.h     |  1450 +++
 .../algorithm/gtPlusWaveletNoNullSpace2DOperator.h |   119 +
 .../algorithm/gtPlusWaveletNoNullSpace3DOperator.h |   121 +
 toolboxes/gtplus/algorithm/gtPlusWaveletOperator.h |   746 ++
 toolboxes/gtplus/matlab/gtMatlab.h                 |    88 +
 toolboxes/gtplus/matlab/gtMatlabConverter.h        |   266 +
 toolboxes/gtplus/matlab/gtMatlabConverterComplex.h |   184 +
 toolboxes/gtplus/matlab/gtMatlabImage.h            |   258 +
 toolboxes/gtplus/solver/gtPlusLSQRSolver.h         |   293 +
 toolboxes/gtplus/solver/gtPlusLinearSolver.h       |    93 +
 toolboxes/gtplus/solver/gtPlusNCGSolver.h          |   394 +
 toolboxes/gtplus/solver/gtPlusNonLinearSolver.h    |   122 +
 toolboxes/gtplus/solver/gtPlusSolver.h             |   146 +
 toolboxes/gtplus/ut/CMakeLists.txt                 |    57 +
 toolboxes/gtplus/ut/grappa_test.cpp                |   609 ++
 toolboxes/gtplus/ut/gtplus_ut.cpp                  |    16 +
 toolboxes/gtplus/util/gtPlusIOAnalyze.cpp          |   100 +
 toolboxes/gtplus/util/gtPlusIOAnalyze.h            |   937 ++
 toolboxes/gtplus/util/gtPlusIOBase.cpp             |   200 +
 toolboxes/gtplus/util/gtPlusIOBase.h               |   819 ++
 toolboxes/gtplus/util/gtPlusUtil.h                 |    95 +
 toolboxes/gtplus/util/gtPlusUtil.hxx               |   149 +
 toolboxes/gtplus/workflow/gtPlusCloudScheduler.cpp |   157 +
 toolboxes/gtplus/workflow/gtPlusCloudScheduler.h   |    54 +
 .../workflow/gtPlusISMRMRDReconCoilMapEstimation.h |   137 +
 .../gtplus/workflow/gtPlusISMRMRDReconUtil.cpp     |   864 ++
 toolboxes/gtplus/workflow/gtPlusISMRMRDReconUtil.h |   724 ++
 .../gtplus/workflow/gtPlusISMRMRDReconUtil.hxx     |  5359 ++++++++++
 .../gtplus/workflow/gtPlusISMRMRDReconWorkFlow.h   |   604 ++
 .../workflow/gtPlusISMRMRDReconWorkFlowCartesian.h |  1892 ++++
 .../gtPlusISMRMRDReconWorkFlowCartesian2DT.h       |   292 +
 .../gtPlusISMRMRDReconWorkFlowCartesian3DT.h       |   262 +
 .../gtplus/workflow/gtPlusISMRMRDReconWorkOrder.h  |  1273 +++
 .../workflow/gtPlusISMRMRDReconWorkOrder2DT.h      |   409 +
 .../workflow/gtPlusISMRMRDReconWorkOrder3DT.h      |   380 +
 .../gtplus/workflow/gtPlusISMRMRDReconWorker.h     |   615 ++
 .../gtplus/workflow/gtPlusISMRMRDReconWorker2DT.h  |  2825 +++++
 .../workflow/gtPlusISMRMRDReconWorker2DTGRAPPA.h   |   426 +
 .../gtPlusISMRMRDReconWorker2DTL1SPIRITNCG.h       |   355 +
 .../gtPlusISMRMRDReconWorker2DTNoAcceleration.h    |   155 +
 .../workflow/gtPlusISMRMRDReconWorker2DTSPIRIT.h   |   734 ++
 .../gtplus/workflow/gtPlusISMRMRDReconWorker3DT.h  |  2716 +++++
 .../workflow/gtPlusISMRMRDReconWorker3DTGRAPPA.h   |   642 ++
 .../gtPlusISMRMRDReconWorker3DTL1SPIRITNCG.h       |   787 ++
 .../gtPlusISMRMRDReconWorker3DTNoAcceleration.h    |   144 +
 .../workflow/gtPlusISMRMRDReconWorker3DTSPIRIT.h   |  1056 ++
 toolboxes/log/CMakeLists.txt                       |    10 +
 toolboxes/log/log.cpp                              |   206 +
 toolboxes/log/log.h                                |   195 +
 toolboxes/log/log_export.h                         |    14 +
 toolboxes/mri/CMakeLists.txt                       |     9 +
 toolboxes/mri/epi/CMakeLists.txt                   |    29 +
 toolboxes/mri/epi/EPIExport.h                      |    16 +
 toolboxes/mri/epi/EPIReconXObject.h                |    73 +
 toolboxes/mri/epi/EPIReconXObjectFlat.h            |   189 +
 toolboxes/mri/epi/EPIReconXObjectTrapezoid.h       |   240 +
 toolboxes/mri/hyper/CMRTOperator.cpp               |    21 +
 toolboxes/mri/hyper/CMRTOperator.h                 |   147 +
 toolboxes/mri/hyper/CMakeLists.txt                 |    44 +
 toolboxes/mri/hyper/CSIOperator.cpp                |    60 +
 toolboxes/mri/hyper/CSIOperator.h                  |    42 +
 toolboxes/mri/hyper/CSI_utils.cu                   |    97 +
 toolboxes/mri/hyper/CSI_utils.h                    |    35 +
 toolboxes/mri/hyper/CSfreqOperator.h               |    52 +
 .../mri/hyper/gadgetron_toolbox_hyper_export.h     |    11 +
 toolboxes/mri/pmri/CMakeLists.txt                  |     3 +
 toolboxes/mri/pmri/gpu/CMakeLists.txt              |    90 +
 toolboxes/mri/pmri/gpu/b1_map.cu                   |   733 ++
 toolboxes/mri/pmri/gpu/b1_map.h                    |    32 +
 toolboxes/mri/pmri/gpu/b1_map_NIH_Souheil.cu       |   647 ++
 toolboxes/mri/pmri/gpu/b1map_test.cu               |    48 +
 toolboxes/mri/pmri/gpu/cuBuffer.cpp                |   197 +
 toolboxes/mri/pmri/gpu/cuBuffer.h                  |    61 +
 toolboxes/mri/pmri/gpu/cuCartesianSenseOperator.cu |   133 +
 toolboxes/mri/pmri/gpu/cuCartesianSenseOperator.h  |    35 +
 .../mri/pmri/gpu/cuNonCartesianKtSenseOperator.cu  |    41 +
 .../mri/pmri/gpu/cuNonCartesianKtSenseOperator.h   |    27 +
 .../mri/pmri/gpu/cuNonCartesianSenseOperator.cu    |   102 +
 .../mri/pmri/gpu/cuNonCartesianSenseOperator.h     |    48 +
 toolboxes/mri/pmri/gpu/cuSenseBuffer.cpp           |    58 +
 toolboxes/mri/pmri/gpu/cuSenseBuffer.h             |    38 +
 toolboxes/mri/pmri/gpu/cuSenseBufferCg.cpp         |    89 +
 toolboxes/mri/pmri/gpu/cuSenseBufferCg.h           |    39 +
 toolboxes/mri/pmri/gpu/cuSenseOperator.cu          |    32 +
 toolboxes/mri/pmri/gpu/cuSenseOperator.h           |    31 +
 toolboxes/mri/pmri/gpu/cuSpiritBuffer.cpp          |    89 +
 toolboxes/mri/pmri/gpu/cuSpiritBuffer.h            |    43 +
 toolboxes/mri/pmri/gpu/cuSpiritOperator.h          |   141 +
 toolboxes/mri/pmri/gpu/gpupmri_export.h            |    19 +
 toolboxes/mri/pmri/gpu/htgrappa.cpp                |    65 +
 toolboxes/mri/pmri/gpu/htgrappa.cu                 |   838 ++
 toolboxes/mri/pmri/gpu/htgrappa.h                  |    36 +
 toolboxes/mri/pmri/gpu/htgrappa_test.cpp           |    64 +
 toolboxes/mri/pmri/gpu/osSenseOperator.h           |    97 +
 toolboxes/mri/pmri/gpu/senseOperator.h             |    48 +
 toolboxes/mri/pmri/gpu/sense_utilities.cu          |   146 +
 toolboxes/mri/pmri/gpu/sense_utilities.h           |    25 +
 toolboxes/mri/pmri/gpu/spirit_calibration.cu       |   363 +
 toolboxes/mri/pmri/gpu/spirit_calibration.h        |    22 +
 toolboxes/mri/pmri/gpu/trajectory_utils.cu         |    38 +
 toolboxes/mri/pmri/gpu/trajectory_utils.h          |    16 +
 toolboxes/mri_core/CMakeLists.txt                  |    57 +
 .../mri_core/mri_core_coil_map_estimation.cpp      |   610 ++
 toolboxes/mri_core/mri_core_coil_map_estimation.h  |    50 +
 toolboxes/mri_core/mri_core_data.h                 |   282 +
 toolboxes/mri_core/mri_core_def.h                  |    79 +
 toolboxes/mri_core/mri_core_export.h               |    18 +
 toolboxes/mri_core/mri_core_grappa.cpp             |   553 +
 toolboxes/mri_core/mri_core_grappa.h               |    80 +
 toolboxes/mri_core/mri_core_utility.cpp            |    13 +
 toolboxes/mri_core/mri_core_utility.h              |    14 +
 toolboxes/nfft/CMakeLists.txt                      |     3 +
 toolboxes/nfft/gpu/CMakeLists.txt                  |    47 +
 toolboxes/nfft/gpu/KaiserBessel_kernel.cu          |   127 +
 toolboxes/nfft/gpu/NFFT_C2NC_conv_kernel.cu        |   249 +
 toolboxes/nfft/gpu/NFFT_NC2C_atomic_conv_kernel.cu |   227 +
 toolboxes/nfft/gpu/NFFT_NC2C_conv_kernel.cu        |   142 +
 toolboxes/nfft/gpu/NFFT_preprocess_kernel.cu       |   171 +
 toolboxes/nfft/gpu/NFFT_sparseMatrix_kernel.cu     |   171 +
 toolboxes/nfft/gpu/cuNFFT.cu                       |  1455 +++
 toolboxes/nfft/gpu/cuNFFT.h                        |   294 +
 toolboxes/nfft/gpu/cuNFFTOperator.cu               |   118 +
 toolboxes/nfft/gpu/cuNFFTOperator.h                |    36 +
 toolboxes/nfft/gpu/gpunfft_export.h                |    19 +
 toolboxes/operators/CMakeLists.txt                 |    32 +
 toolboxes/operators/FFTOperator.h                  |    76 +
 toolboxes/operators/convolutionOperator.h          |   220 +
 toolboxes/operators/cpu/CMakeLists.txt             |    24 +
 toolboxes/operators/cpu/hoDiagonalOperator.h       |    20 +
 toolboxes/operators/cpu/hoDiagonalSumOperator.h    |    20 +
 toolboxes/operators/cpu/hoFFTOperator.h            |    29 +
 toolboxes/operators/cpu/hoIdentityOperator.h       |    28 +
 toolboxes/operators/cpu/hoImageOperator.h          |    58 +
 .../operators/cpu/hoPartialDerivativeOperator.h    |   107 +
 toolboxes/operators/cpu/hoTvOperator.h             |   117 +
 toolboxes/operators/cpu/hoTvPicsOperator.h         |    16 +
 toolboxes/operators/diagonalOperator.h             |    74 +
 toolboxes/operators/diagonalSumOperator.h          |    92 +
 toolboxes/operators/downsampleOperator.h           |    51 +
 toolboxes/operators/encodedImageOperator.h         |    48 +
 toolboxes/operators/encodingOperatorContainer.h    |   231 +
 toolboxes/operators/generalOperator.h              |    88 +
 toolboxes/operators/gpu/CMakeLists.txt             |    67 +
 toolboxes/operators/gpu/cuConvolutionOperator.cu   |    89 +
 toolboxes/operators/gpu/cuConvolutionOperator.h    |    28 +
 toolboxes/operators/gpu/cuDiagonalOperator.h       |    20 +
 toolboxes/operators/gpu/cuDiagonalSumOperator.h    |    20 +
 toolboxes/operators/gpu/cuDownsampleOperator.h     |    28 +
 toolboxes/operators/gpu/cuFFTOperator.h            |    29 +
 toolboxes/operators/gpu/cuIdentityOperator.h       |    28 +
 toolboxes/operators/gpu/cuImageOperator.h          |    66 +
 toolboxes/operators/gpu/cuLaplaceOperator.cu       |    95 +
 toolboxes/operators/gpu/cuLaplaceOperator.h        |    24 +
 .../gpu/cuMultiplicationOperatorContainer.h        |    23 +
 .../operators/gpu/cuPartialDerivativeOperator.cu   |   145 +
 .../operators/gpu/cuPartialDerivativeOperator.h    |    35 +
 .../operators/gpu/cuPartialDerivativeOperator2.cu  |   217 +
 .../operators/gpu/cuPartialDerivativeOperator2.h   |    27 +
 toolboxes/operators/gpu/cuTv1dOperator.cu          |   129 +
 toolboxes/operators/gpu/cuTv1dOperator.h           |    37 +
 toolboxes/operators/gpu/cuTvOperator.cu            |   141 +
 toolboxes/operators/gpu/cuTvOperator.h             |    41 +
 toolboxes/operators/gpu/cuTvPicsOperator.h         |    16 +
 toolboxes/operators/gpu/cuUpsampleOperator.h       |    28 +
 toolboxes/operators/gpu/gpuoperators_export.h      |    18 +
 toolboxes/operators/gpu/hoCuDiagonalOperator.h     |    20 +
 .../operators/gpu/hoCuEncodingOperatorContainer.h  |    22 +
 toolboxes/operators/gpu/hoCuIdentityOperator.h     |    28 +
 toolboxes/operators/gpu/hoCuOperator.h             |    55 +
 .../operators/gpu/hoCuPartialDerivativeOperator.h  |    90 +
 toolboxes/operators/gpu/hoCuTvOperator.h           |    84 +
 toolboxes/operators/gpu/hoCuTvPicsOperator.h       |    16 +
 toolboxes/operators/identityOperator.h             |    51 +
 toolboxes/operators/imageOperator.h                |    99 +
 toolboxes/operators/laplaceOperator.h              |    31 +
 toolboxes/operators/linearOperator.h               |    93 +
 .../operators/multiplicationOperatorContainer.h    |   163 +
 toolboxes/operators/partialDerivativeOperator.h    |    71 +
 toolboxes/operators/permutationOperator.h          |    48 +
 toolboxes/operators/subsetOperator.h               |    72 +
 toolboxes/operators/tvPicsOperator.h               |    44 +
 toolboxes/operators/upsampleOperator.h             |    51 +
 toolboxes/python/CMakeLists.txt                    |    43 +
 toolboxes/python/example/CMakeLists.txt            |     9 +
 toolboxes/python/example/demo.cpp                  |    87 +
 toolboxes/python/example/test_python.cpp           |    33 +
 toolboxes/python/python_converters.h               |    33 +
 toolboxes/python/python_export.h                   |    14 +
 toolboxes/python/python_hoNDArray_converter.h      |   103 +
 toolboxes/python/python_ismrmrd_converter.h        |   359 +
 toolboxes/python/python_numpy_wrappers.h           |    40 +
 toolboxes/python/python_toolbox.cpp                |   148 +
 toolboxes/python/python_toolbox.h                  |   154 +
 toolboxes/python/python_tuple_converter.h          |   120 +
 toolboxes/python/python_vector_converter.h         |    92 +
 toolboxes/registration/CMakeLists.txt              |     1 +
 toolboxes/registration/optical_flow/CMakeLists.txt |    32 +
 .../registration/optical_flow/cpu/CMakeLists.txt   |   131 +
 .../hoImageRegContainer2DRegistration.h            |  1418 +++
 .../registration/optical_flow/cpu/cpureg_export.h  |    14 +
 .../cpu/dissimilarity/hoImageRegDissimilarity.h    |   251 +
 .../hoImageRegDissimilarityHistogramBased.h        |   226 +
 .../hoImageRegDissimilarityLocalCCR.h              |   405 +
 .../hoImageRegDissimilarityMutualInformation.h     |   289 +
 ...geRegDissimilarityNormalizedMutualInformation.h |   173 +
 .../cpu/dissimilarity/hoImageRegDissimilaritySSD.h |   108 +
 .../optical_flow/cpu/hoCKOpticalFlowSolver.cpp     |   297 +
 .../optical_flow/cpu/hoCKOpticalFlowSolver.h       |    55 +
 .../optical_flow/cpu/hoHSOpticalFlowSolver.cpp     |   286 +
 .../optical_flow/cpu/hoHSOpticalFlowSolver.h       |    52 +
 .../optical_flow/cpu/hoLinearResampleOperator.cpp  |   203 +
 .../optical_flow/cpu/hoLinearResampleOperator.h    |    31 +
 .../cpu/hoLinearResampleOperator_eigen.cpp         |   206 +
 .../cpu/hoLinearResampleOperator_eigen.h           |    40 +
 .../optical_flow/cpu/hoOpticalFlowSolver.cpp       |   183 +
 .../optical_flow/cpu/hoOpticalFlowSolver.h         |    46 +
 ...ImageRegDeformationFieldBidirectionalRegister.h |   501 +
 .../register/hoImageRegDeformationFieldRegister.h  |   527 +
 .../cpu/register/hoImageRegNonParametricRegister.h |   148 +
 .../cpu/register/hoImageRegParametricRegister.h    |   408 +
 .../optical_flow/cpu/register/hoImageRegRegister.h |   651 ++
 ...hoImageRegDeformationFieldBidirectionalSolver.h |   602 ++
 .../cpu/solver/hoImageRegDeformationFieldSolver.h  |   673 ++
 .../cpu/solver/hoImageRegNonParametricSolver.h     |   162 +
 .../solver/hoImageRegParametricDownHillSolver.h    |   166 +
 .../hoImageRegParametricGradientDescentSolver.h    |   146 +
 .../cpu/solver/hoImageRegParametricSolver.h        |   326 +
 .../optical_flow/cpu/solver/hoImageRegSolver.h     |   210 +
 .../transformation/hoImageRegDeformationField.h    |   965 ++
 .../hoImageRegHomogenousTransformation.h           |   475 +
 .../hoImageRegNonParametricTransformation.h        |    82 +
 .../hoImageRegParametricTransformation.h           |   227 +
 .../hoImageRegRigid2DTransformation.h              |   380 +
 .../hoImageRegRigid3DTransformation.h              |   491 +
 .../cpu/transformation/hoImageRegTransformation.h  |   408 +
 .../optical_flow/cpu/warper/hoImageRegWarper.h     |   529 +
 .../registration/optical_flow/gpu/CMakeLists.txt   |    39 +
 .../registration/optical_flow/gpu/cuCGHSOFSolver.h |    67 +
 .../optical_flow/gpu/cuCKOpticalFlowSolver.cu      |   340 +
 .../optical_flow/gpu/cuCKOpticalFlowSolver.h       |    55 +
 .../optical_flow/gpu/cuHSOpticalFlowSolver.cu      |   326 +
 .../optical_flow/gpu/cuHSOpticalFlowSolver.h       |    52 +
 .../optical_flow/gpu/cuLinearResampleOperator.cu   |   265 +
 .../optical_flow/gpu/cuLinearResampleOperator.h    |    23 +
 .../optical_flow/gpu/cuOpticalFlowSolver.cu        |   303 +
 .../optical_flow/gpu/cuOpticalFlowSolver.h         |    50 +
 .../optical_flow/gpu/cuResampleOperator.cu         |   107 +
 .../optical_flow/gpu/cuResampleOperator.h          |    42 +
 .../optical_flow/gpu/cuResampleOperator_macros.h   |   248 +
 .../registration/optical_flow/gpu/gpureg_export.h  |    14 +
 .../optical_flow/multiresRegistrationSolver.h      |   263 +
 .../optical_flow/opticalFlowOperator.h             |    68 +
 .../registration/optical_flow/opticalFlowSolver.h  |   176 +
 .../registration/optical_flow/registrationSolver.h |   103 +
 .../registration/optical_flow/resampleOperator.h   |    42 +
 toolboxes/solvers/CMakeLists.txt                   |    35 +
 toolboxes/solvers/cgCallback.h                     |   198 +
 toolboxes/solvers/cgPreconditioner.h               |    46 +
 toolboxes/solvers/cgSolver.h                       |   412 +
 toolboxes/solvers/cpu/CMakeLists.txt               |    16 +
 toolboxes/solvers/cpu/hoCgPreconditioner.h         |    14 +
 toolboxes/solvers/cpu/hoCgSolver.h                 |    28 +
 toolboxes/solvers/cpu/hoGpBbSolver.h               |    21 +
 toolboxes/solvers/cpu/hoSbCgSolver.h               |    16 +
 toolboxes/solvers/cpu/hoSolverUtils.h              |    26 +
 toolboxes/solvers/eigenTester.h                    |   157 +
 toolboxes/solvers/gpBbSolver.h                     |   199 +
 toolboxes/solvers/gpSolver.h                       |   318 +
 toolboxes/solvers/gpu/CMakeLists.txt               |    51 +
 toolboxes/solvers/gpu/cuCgPreconditioner.h         |    14 +
 toolboxes/solvers/gpu/cuCgSolver.h                 |    30 +
 toolboxes/solvers/gpu/cuGpBbSolver.h               |    22 +
 toolboxes/solvers/gpu/cuLbfgsSolver.h              |    36 +
 toolboxes/solvers/gpu/cuLwSolver.h                 |    42 +
 toolboxes/solvers/gpu/cuNlcgSolver.h               |    24 +
 toolboxes/solvers/gpu/cuSbCgSolver.h               |    16 +
 toolboxes/solvers/gpu/cuSbLwSolver.h               |    27 +
 toolboxes/solvers/gpu/cuSbcCgSolver.h              |    14 +
 toolboxes/solvers/gpu/cuSbcLwSolver.h              |    29 +
 toolboxes/solvers/gpu/cuSolverUtils.cu             |   111 +
 toolboxes/solvers/gpu/cuSolverUtils.h              |    16 +
 toolboxes/solvers/gpu/gpusolvers_export.h          |    18 +
 toolboxes/solvers/gpu/hoCuCgSolver.h               |    34 +
 toolboxes/solvers/gpu/hoCuGpBbSolver.h             |    23 +
 toolboxes/solvers/gpu/hoCuNlcgSolver.h             |    35 +
 toolboxes/solvers/gpu/hoCuSbcCgSolver.h            |    16 +
 toolboxes/solvers/lbfgsSolver.h                    |   824 ++
 toolboxes/solvers/linearOperatorSolver.h           |    75 +
 toolboxes/solvers/lsqrSolver.h                     |   173 +
 toolboxes/solvers/lwSolver.h                       |   214 +
 toolboxes/solvers/nlcgSolver.h                     |   776 ++
 toolboxes/solvers/osLALMSolver.h                   |   279 +
 toolboxes/solvers/osMOMSolver.h                    |   216 +
 toolboxes/solvers/osSPSSolver.h                    |   180 +
 toolboxes/solvers/sbSolver.h                       |   842 ++
 toolboxes/solvers/sbcSolver.h                      |    96 +
 toolboxes/solvers/solver.h                         |    47 +
 1097 files changed, 229082 insertions(+)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..475173d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,26 @@
+*~
+*.so
+*.dll
+*.o
+*.cuo
+*#
+*.dylib
+*.real
+*.cplx
+inc/
+*.d
+!CMakeLists.txt
+bin/
+lib/*.py
+lib/*.pyc
+.DS_Store
+*.swp
+build/
+build_debug/
+*.pyc
+toolboxes/core/core_defines.h
+prod/
+external/
+test/integration/test_cases.txt
+*.h5
+tags
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..33a0b4e
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,312 @@
+cmake_minimum_required(VERSION 2.8)
+project(GADGETRON)
+
+# check the compiler version
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
+  if (NOT CMAKE_CXX_COMPILER_VERSION) #Compiler version is not set on Ubuntu 12.04 (gcc 4.6)
+    execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpversion OUTPUT_VARIABLE CMAKE_CXX_COMPILER_VERSION)
+  endif()
+  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8)
+    message("CXX COMPILER VERSION: ${CMAKE_CXX_COMPILER_ID} : ${CMAKE_CXX_COMPILER_VERSION}")
+    message(FATAL_ERROR "Gadgetron requires GCC version >= 4.8")
+  endif()
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.5)
+    message(FATAL_ERROR "Gadgetron requires Clang version >= 3.5")
+  endif()
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
+  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0)
+    message(FATAL_ERROR "Gadgetron requires MSVC 2013 or above ")
+  endif()
+else()
+  message(WARNING "Unsupported compiler!")
+endif()
+
+#VERSIONING AND INSTALL PATHS
+set(GADGETRON_VERSION_MAJOR 3)
+set(GADGETRON_VERSION_MINOR 7)
+set(GADGETRON_VERSION_PATCH 5) 
+set(GADGETRON_VERSION_STRING ${GADGETRON_VERSION_MAJOR}.${GADGETRON_VERSION_MINOR}.${GADGETRON_VERSION_PATCH})
+set(GADGETRON_SOVERSION ${GADGETRON_VERSION_MAJOR}.${GADGETRON_VERSION_MINOR})
+find_package(Git)
+if (GIT_FOUND)
+  execute_process(COMMAND ${GIT_EXECUTABLE} rev-parse HEAD WORKING_DIRECTORY
+      ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE GADGETRON_GIT_SHA1 ERROR_VARIABLE GADGETRON_GIT_STDERR)
+  string(STRIP "${GADGETRON_GIT_SHA1}" GADGETRON_GIT_SHA1)
+  string(LENGTH "${GADGETRON_GIT_SHA1}" GADGETRON_GIT_SHA1_LEN)
+  if(${GADGETRON_GIT_SHA1_LEN} LESS 40)
+    message(WARNING "Could not determine SHA-1 hash: ${GADGETRON_GIT_STDERR}")
+    set(GADGETRON_GIT_SHA1 "NA")
+  endif(${GADGETRON_GIT_SHA1_LEN} LESS 40)
+else()
+  set(GADGETRON_GIT_SHA1 "NA")
+endif()
+set(GADGETRON_INSTALL_CMAKE_PATH share/gadgetron/cmake)
+set(GADGETRON_INSTALL_CONFIG_PATH share/gadgetron/config)
+set(GADGETRON_INSTALL_MATLAB_PATH share/gadgetron/matlab)
+set(GADGETRON_INSTALL_PYTHON_MODULE_PATH share/gadgetron/python)
+set(GADGETRON_INSTALL_SCHEMA_PATH share/gadgetron/schema)
+set(GADGETRON_INSTALL_INCLUDE_PATH include/gadgetron)
+set(GADGETRON_INSTALL_CHROOT_SCRIPTS_PATH share/gadgetron/chroot)
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake)
+
+#Set the build type to Release if not specified
+IF(NOT CMAKE_BUILD_TYPE)
+  SET(CMAKE_BUILD_TYPE Release CACHE STRING
+      "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel."
+      FORCE)
+ENDIF(NOT CMAKE_BUILD_TYPE)
+
+# build options for 64 bits system
+if( CMAKE_SIZEOF_VOID_P EQUAL 8 )
+  message("64bit system is found")
+  set( HAS_64_BIT On CACHE BOOL "64bit build")
+else( CMAKE_SIZEOF_VOID_P EQUAL 8 )
+  message("32bit system is found")
+  set( HAS_64_BIT Off CACHE BOOL "64bit build")
+endif( CMAKE_SIZEOF_VOID_P EQUAL 8 )
+
+# whether to install dependencies
+OPTION(GADGETRON_INSTALL_DEPENDENCIES "Install gadgetron dependencies" Off)
+
+# build options for OpenMP support
+find_package(OpenMP)
+OPTION(USE_OPENMP "Use OpenMP" On)
+if (OPENMP_FOUND)
+  if(USE_OPENMP) 
+    message("OpenMP multithreading enabled")
+    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    ADD_DEFINITIONS(-DUSE_OMP)
+  else (USE_OPENMP)
+    message("OpenMP multithreading is supported, but disabled")
+  endif(USE_OPENMP) 
+else (OPENMP_FOUND)
+  message("OpenMP multithreading not supported")
+endif (OPENMP_FOUND)
+
+if (WIN32)
+  ADD_DEFINITIONS(-DWIN32 -D_WIN32 -D_WINDOWS)
+#  ADD_DEFINITIONS(-DUNICODE -D_UNICODE)
+  ADD_DEFINITIONS(-D_CRT_SECURE_NO_WARNINGS)
+  ADD_DEFINITIONS(-D_VARIADIC_MAX=10) #to fix compiler limitations in Visual Studio Express
+  ADD_DEFINITIONS("/wd4251") #disable warnings, 4251: needs to have dll-interface to be used by clients
+  ADD_DEFINITIONS("/wd4344") #disable warnings, 4344: behavior change: use of explicit template arguments
+  ADD_DEFINITIONS("/wd4996") #disable warnings, 4996: the POSIX name for this item is deprecated. Instead, use the ISO C++ conformant name
+  if ( HAS_64_BIT )
+    ADD_DEFINITIONS(-DWIN64 -D_WIN64)
+  endif ( HAS_64_BIT )
+  SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /MP")
+  SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W3")
+  SET (CMAKE_CXX_LINK_FLAGS "${CMAKE_CXX_LINK_FLAGS} /INCREMENTAL:NO")
+  SET (CMAKE_C_LINK_FLAGS "${CMAKE_C_LINK_FLAGS} /INCREMENTAL:NO")
+  SET (CMAKE_EXE_LINKER_FLAGS_DEBUG "/debug /INCREMENTAL:NO")
+  SET (CMAKE_SHARED_LINKER_FLAGS_DEBUG "/debug /INCREMENTAL:NO")
+  SET (CMAKE_STATIC_LINKER_FLAGS_DEBUG "/debug /INCREMENTAL:NO")
+  SET (CMAKE_MODULE_LINKER_FLAGS_DEBUG "/debug /INCREMENTAL:NO")
+  # The two flags below are to fix Windows problems related to multiply defined operators new/delete and some constructors that are defined in our headers
+#  SET (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /FORCE:MULTIPLE") 
+#  SET (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE") 
+else (WIN32)
+  if (UNIX)
+    if (APPLE)
+      SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -stdlib=libc++")
+    else (APPLE)
+      SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -std=c++11")
+    endif (APPLE)
+  endif (UNIX)
+endif (WIN32)
+
+include_directories(${CMAKE_SOURCE_DIR}/toolboxes/log)
+
+# whether to suppress compilation warnings
+OPTION(BUILD_SUPPRESS_WARNINGS "Build package while suppressing warnings" Off)
+if (BUILD_SUPPRESS_WARNINGS)
+  if (WIN32)
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W0")
+  else (WIN32)
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -w")
+  endif (WIN32)
+endif (BUILD_SUPPRESS_WARNINGS)
+
+set(Boost_USE_STATIC_LIBS OFF)
+set(Boost_USE_MULTITHREADED ON)
+set(Boost_USE_STATIC_RUNTIME OFF)
+# necessary for Windows and RHEL <=6 systems
+set(Boost_NO_BOOST_CMAKE ON)
+
+if(WIN32)
+  add_definitions( -DBOOST_ALL_NO_LIB )
+  add_definitions( -DBOOST_ALL_DYN_LINK )
+endif(WIN32)
+
+# We actually only use system and thread explicitly, but they require linking in date_time and chrono
+if (WIN32)
+  find_package(Boost COMPONENTS system thread date_time chrono program_options filesystem REQUIRED)
+else(WIN32)
+  find_package(Boost COMPONENTS system thread program_options filesystem REQUIRED)
+endif(WIN32)
+
+find_package(ACE)
+if(NOT ACE_FOUND)
+  MESSAGE("ACE not found. Only toolboxes and standalone applications are compiled. The streaming framework will NOT be compiled.")
+endif(NOT ACE_FOUND)
+
+find_package(CUDA_advanced)
+
+if (CUDA_FOUND)
+  ADD_DEFINITIONS(-DUSE_CUDA)
+  SET( GADGETRON_CUDA_FOUND_BOOL 1 )
+  include_directories( ${CUDA_INCLUDE_DIRS} )
+  if (NOT WIN32)
+    set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+  endif (NOT WIN32)
+  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} )
+  set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
+  #set(CUDA_VERBOSE_BUILD ON)
+
+  # Compile kernels for compute models 1.0 and 2.0 as default for Cuda 4.1
+  # Support compute model 3.0 from Cuda 4.2 and up
+  # Support compute model 3.5 from Cuda 5 and up
+
+  OPTION(GADGETRON_CUDA_ALL_COMPUTE_MODEL "Build CUDA components for all computing models" Off)
+  if (GADGETRON_CUDA_ALL_COMPUTE_MODEL)
+    MESSAGE("Compiling CUDA components to support compute model 2.0, 3.0 and 3.5") 
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-gencode arch=compute_20,code=sm_20")
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-gencode arch=compute_30,code=sm_30")
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-gencode arch=compute_35,code=sm_35")
+  endif (GADGETRON_CUDA_ALL_COMPUTE_MODEL)
+
+  MESSAGE("Compiling with ${CUDA_NVCC_FLAGS}")
+else (CUDA_FOUND)
+  MESSAGE("CUDA not found. CUDA components will not be compiled.")
+  SET( GADGETRON_CUDA_FOUND_BOOL 0 )
+  SET( CUDA_NVCC_FLAGS "NA")
+endif (CUDA_FOUND)
+
+find_package(GTest)
+#Add support for the default Ubuntu package of gtest (which is shipped as source and not pre-compiled)
+if (NOT GTEST_FOUND)
+  find_path(GTEST_SRC_DIR src/gtest.cc HINTS /usr/src/gtest)
+  find_path(GTEST_INCLUDE_DIRS gtest.h HINTS /usr/include/gtest)
+  if (GTEST_SRC_DIR AND GTEST_INCLUDE_DIRS)
+    MESSAGE("GTest src package found. Compiling as part of Gadgetron.")
+    add_subdirectory(${GTEST_SRC_DIR} ${CMAKE_BINARY_DIR}/gtest )
+    include_directories(${GTEST_INCLUDE_DIRS})
+    set(GTEST_FOUND 1)
+    set(GTEST_LIBRARIES gtest gtest_main)
+  endif (GTEST_SRC_DIR AND GTEST_INCLUDE_DIRS)
+endif (NOT GTEST_FOUND)
+
+find_package(Armadillo)
+# check whether the ILP64 MKL interface should be used
+if(ARMADILLO_FOUND)
+  ADD_DEFINITIONS(-DUSE_ARMADILLO)
+  set(ARMADILLO_BLAS_LONG_LONG FALSE)
+  if(EXISTS "${ARMADILLO_INCLUDE_DIR}/armadillo_bits/config.hpp")
+    # Read and parse armadillo config.hpp to find out whether BLAS uses long long
+    file(STRINGS "${ARMADILLO_INCLUDE_DIR}/armadillo_bits/config.hpp" _armadillo_blas_long_long REGEX "// #define ARMA_BLAS_LONG_LONG")
+    if ( NOT _armadillo_blas_long_long )
+      set(ARMADILLO_BLAS_LONG_LONG TRUE)
+      MESSAGE("Armadillo is found to use long long for BLAS calls")
+    else ( NOT _armadillo_blas_long_long )
+      MESSAGE("Armadillo found. Note that ARMADILLO_BLAS_LONG_LONG _must_ be defined in ${ARMADILLO_INCLUDE_DIR}/armadillo_bits/config.hpp to link against the MKL ILP64 interface.")
+    endif ( NOT _armadillo_blas_long_long )
+    unset(_armadillo_blas_long_long)
+  endif()
+else()
+  message("Armadillo not found. This will disable many toolboxes and gadgets.")
+endif()
+
+find_package(HDF5 1.8 COMPONENTS C CXX HL)
+if (HDF5_FOUND)
+  message("HDF5 Found")
+else()
+  message("HDF5 not found")
+endif()
+find_package(FFTW3 COMPONENTS single double)
+if (FFTW3_FOUND)
+  message("FFTW3 Found")
+else()
+  message("FFTW3 not found")
+endif()
+find_package(Ismrmrd)
+if (ISMRMRD_FOUND)
+  message("ISMRMRD Found")
+else()
+  message("ISMRMRD not found")
+endif()
+find_package(MKL)
+if (MKL_FOUND)
+  if ( MKL_VERSION_STRING VERSION_LESS 11.2.0 )
+    message(FATAL_ERROR "Gadgetron requires Intel MKL version >= 11.2.0")
+  endif ()
+endif (MKL_FOUND)
+find_package(BLAS)
+find_package(LAPACK)
+if (LAPACK_FOUND)
+  message("LAPACK Found")
+  ADD_DEFINITIONS(-DUSE_LAPACK)
+endif (LAPACK_FOUND)
+find_package(PythonLibs 2)
+find_package(NumPy)
+find_package(GMatlab)
+if (MATLAB_FOUND)
+  ADD_DEFINITIONS(-DUSE_MATLAB)
+endif (MATLAB_FOUND)
+
+message("Searching for OpenGL, GLEW, GLUT, and Qt. These libraries are only used in a single standalone application and are thus non-essential.")
+if(WIN32)
+  message("For Windows users in particular, for ease of installation we do not reccomend installing these libraries.")
+endif(WIN32)
+
+find_package(OpenGL)
+find_package(GLEW)
+find_package(GLUT)
+find_package(Qt4 4.6)
+
+add_subdirectory(toolboxes)
+add_subdirectory(apps)
+
+if (ACE_FOUND AND ISMRMRD_FOUND AND FFTW3_FOUND AND HDF5_FOUND)
+  add_subdirectory(gadgets)
+else()
+  message("Required dependencies for gadget compilation not found (ACE, ISMRMRD, FFTW3, HDF5).")
+endif()
+
+add_subdirectory(test)
+add_subdirectory(cmake)
+add_subdirectory(doc)
+add_subdirectory(chroot)
+
+# install dependencies for WIN32
+if (WIN32)
+  if (GADGETRON_INSTALL_DEPENDENCIES)
+    include(${CMAKE_SOURCE_DIR}/cmake/InstallWinDependencies.cmake)
+  endif (GADGETRON_INSTALL_DEPENDENCIES)
+endif (WIN32)
+
+if (UNIX)
+  if (NOT APPLE)
+    if (GADGETRON_INSTALL_DEPENDENCIES)
+      include(${CMAKE_SOURCE_DIR}/cmake/InstallLinuxDependencies.cmake)
+    endif (GADGETRON_INSTALL_DEPENDENCIES)
+  endif (NOT APPLE)
+endif (UNIX)
+
+#  ---   Main Library  (end) ----
+
+# Create package
+string(TOLOWER ${PROJECT_NAME} PROJECT_NAME_LOWER)
+include(${CMAKE_SOURCE_DIR}/cmake/gadgetron_cpack.cmake)
+if(CPACK_GENERATOR)
+  message(STATUS "Found CPack generators: ${CPACK_GENERATOR}")
+  configure_file("${CMAKE_SOURCE_DIR}/cmake/cpack_options.cmake.in" ${GADGETRON_CPACK_CFG_FILE} @ONLY)
+  set(CPACK_PROJECT_CONFIG_FILE ${GADGETRON_CPACK_CFG_FILE})  
+  include(CPack)
+endif(CPACK_GENERATOR)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..f704983
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,29 @@
+GADGETRON SOFTWARE LICENSE V1.0, NOVEMBER 2011
+
+PERMISSION IS HEREBY GRANTED, FREE OF CHARGE, TO ANY PERSON OBTAINING
+A COPY OF THIS SOFTWARE AND ASSOCIATED DOCUMENTATION FILES (THE
+"SOFTWARE"), TO DEAL IN THE SOFTWARE WITHOUT RESTRICTION, INCLUDING
+WITHOUT LIMITATION THE RIGHTS TO USE, COPY, MODIFY, MERGE, PUBLISH,
+DISTRIBUTE, SUBLICENSE, AND/OR SELL COPIES OF THE SOFTWARE, AND TO
+PERMIT PERSONS TO WHOM THE SOFTWARE IS FURNISHED TO DO SO, SUBJECT TO
+THE FOLLOWING CONDITIONS:
+
+THE ABOVE COPYRIGHT NOTICE, THIS PERMISSION NOTICE, AND THE LIMITATION
+OF LIABILITY BELOW SHALL BE INCLUDED IN ALL COPIES OR REDISTRIBUTIONS
+OF SUBSTANTIAL PORTIONS OF THE SOFTWARE.
+
+SOFTWARE IS BEING DEVELOPED IN PART AT THE NATIONAL HEART, LUNG, AND BLOOD
+INSTITUTE, NATIONAL INSTITUTES OF HEALTH BY AN EMPLOYEE OF THE FEDERAL
+GOVERNMENT IN THE COURSE OF HIS OFFICIAL DUTIES. PURSUANT TO TITLE 17, 
+SECTION 105 OF THE UNITED STATES CODE, THIS SOFTWARE IS NOT SUBJECT TO 
+COPYRIGHT PROTECTION AND IS IN THE PUBLIC DOMAIN. EXCEPT AS CONTAINED IN
+THIS NOTICE, THE NAME OF THE AUTHORS, THE NATIONAL HEART, LUNG, AND BLOOD
+INSTITUTE (NHLBI), OR THE NATIONAL INSTITUTES OF HEALTH (NIH) MAY NOT 
+BE USED TO ENDORSE OR PROMOTE PRODUCTS DERIVED FROM THIS SOFTWARE WITHOUT 
+SPECIFIC PRIOR WRITTEN PERMISSION FROM THE NHLBI OR THE NIH. THE SOFTWARE IS 
+PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
\ No newline at end of file
diff --git a/README b/README
new file mode 100644
index 0000000..d3b8af6
--- /dev/null
+++ b/README
@@ -0,0 +1,19 @@
+GADGETRON IMAGE RECONSTRUCTION FRAMEWORK
+
+Please read LICENSE file for licensing details.
+
+Detailed installation instructions and the manual are available at:
+
+http://gadgetron.github.io
+
+-------------------------------------
+General Building Instructions (on Unix platforms)
+
+mkdir build
+cd build
+cmake ../
+make
+sudo make install
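+
+To install to a location other than the system default, the install prefix
+can, for example, be passed to CMake at configure time:
+
+cmake -DCMAKE_INSTALL_PREFIX=<install path> ../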
+
+Please check manual for detailed instructions for your platform.
+
diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt
new file mode 100644
index 0000000..71946d5
--- /dev/null
+++ b/apps/CMakeLists.txt
@@ -0,0 +1,6 @@
+IF (ACE_FOUND )
+  add_subdirectory(gadgetron)
+ENDIF (ACE_FOUND )
+
+add_subdirectory(clients)
+add_subdirectory(standalone)
diff --git a/apps/clients/CMakeLists.txt b/apps/clients/CMakeLists.txt
new file mode 100644
index 0000000..b7f0b2a
--- /dev/null
+++ b/apps/clients/CMakeLists.txt
@@ -0,0 +1,7 @@
+IF(ACE_FOUND)
+  add_subdirectory(utilities)
+ENDIF()
+
+IF(ISMRMRD_FOUND AND HDF5_FOUND)
+  add_subdirectory(gadgetron_ismrmrd_client)
+ENDIF()
\ No newline at end of file
diff --git a/apps/clients/gadgetron_ismrmrd_client/CMakeLists.txt b/apps/clients/gadgetron_ismrmrd_client/CMakeLists.txt
new file mode 100644
index 0000000..d55d060
--- /dev/null
+++ b/apps/clients/gadgetron_ismrmrd_client/CMakeLists.txt
@@ -0,0 +1,24 @@
+find_package(Ismrmrd REQUIRED)
+
+set(Boost_NO_BOOST_CMAKE ON)
+
+if(WIN32)
+  find_package(Boost COMPONENTS program_options thread system date_time chrono REQUIRED)
+else(WIN32)
+  find_package(Boost COMPONENTS program_options thread system REQUIRED)
+endif(WIN32)
+
+if(WIN32)
+  link_directories(${Boost_LIBRARY_DIRS})
+endif(WIN32)
+
+include_directories(
+  ${Boost_INCLUDE_DIR} 
+  ${ISMRMRD_INCLUDE_DIR}
+  )
+
+add_executable(gadgetron_ismrmrd_client gadgetron_ismrmrd_client.cpp)
+
+target_link_libraries(gadgetron_ismrmrd_client ${ISMRMRD_LIBRARIES} ${Boost_LIBRARIES})
+
+install(TARGETS gadgetron_ismrmrd_client DESTINATION bin COMPONENT main)
diff --git a/apps/clients/gadgetron_ismrmrd_client/gadgetron_ismrmrd_client.cpp b/apps/clients/gadgetron_ismrmrd_client/gadgetron_ismrmrd_client.cpp
new file mode 100644
index 0000000..0a97328
--- /dev/null
+++ b/apps/clients/gadgetron_ismrmrd_client/gadgetron_ismrmrd_client.cpp
@@ -0,0 +1,1158 @@
+/*****************************************
+*  Standalone ISMRMRD Gadgetron Client  
+*
+* Author: Michael S. Hansen
+* 
+* Dependencies: ISMRMRD and Boost
+*
+*****************************************/
+
+//TODO:
+// -Blobs (for DICOM image support)
+//  - First implementation is in, but testing needed
+// -NIFTI and Analyze output
+// -Check on potential threading problem with asio socket 
+//    - having a reading and a writing thread is supposedly not safe, but seems to work here
+// -Add command line switch for controlling verbosity of output
+// -Static linking for standalone executable. 
+
+#include <boost/program_options.hpp>
+#include <boost/asio.hpp>
+#include <boost/thread/thread.hpp>
+#include <boost/thread/mutex.hpp>
+#include <boost/shared_ptr.hpp>
+
+#include <ismrmrd/ismrmrd.h>
+#include <ismrmrd/dataset.h>
+#include <ismrmrd/meta.h>
+
+#include <fstream>
+#include <streambuf>
+#include <time.h>
+#include <iomanip>
+#include <sstream>
+#include <iostream>
+#include <exception>
+#include <map>
+
+
+std::string get_date_time_string()
+{
+    time_t rawtime;
+    struct tm * timeinfo;
+    time ( &rawtime );
+    timeinfo = localtime ( &rawtime );
+
+    std::stringstream str;
+    str << timeinfo->tm_year+1900 << "-"
+        << std::setw(2) << std::setfill('0') << timeinfo->tm_mon+1 << "-"
+        << std::setw(2) << std::setfill('0') << timeinfo->tm_mday << " "
+        << std::setw(2) << std::setfill('0') << timeinfo->tm_hour << ":"
+        << std::setw(2) << std::setfill('0') << timeinfo->tm_min << ":"
+        << std::setw(2) << std::setfill('0') << timeinfo->tm_sec;
+
+    std::string ret = str.str();
+
+    return ret;
+}
+
+
+namespace po = boost::program_options;
+using boost::asio::ip::tcp;
+
+
+enum GadgetronMessageID {
+    GADGET_MESSAGE_INT_ID_MIN                             =   0,
+    GADGET_MESSAGE_CONFIG_FILE                            =   1,
+    GADGET_MESSAGE_CONFIG_SCRIPT                          =   2,
+    GADGET_MESSAGE_PARAMETER_SCRIPT                       =   3,
+    GADGET_MESSAGE_CLOSE                                  =   4,
+    GADGET_MESSAGE_INT_ID_MAX                             = 999,
+    GADGET_MESSAGE_EXT_ID_MIN                             = 1000,
+    GADGET_MESSAGE_ACQUISITION                            = 1001, /**< DEPRECATED */
+    GADGET_MESSAGE_NEW_MEASUREMENT                        = 1002, /**< DEPRECATED */
+    GADGET_MESSAGE_END_OF_SCAN                            = 1003, /**< DEPRECATED */
+    GADGET_MESSAGE_IMAGE_CPLX_FLOAT                       = 1004, /**< DEPRECATED */
+    GADGET_MESSAGE_IMAGE_REAL_FLOAT                       = 1005, /**< DEPRECATED */
+    GADGET_MESSAGE_IMAGE_REAL_USHORT                      = 1006, /**< DEPRECATED */
+    GADGET_MESSAGE_EMPTY                                  = 1007, /**< DEPRECATED */
+    GADGET_MESSAGE_ISMRMRD_ACQUISITION                    = 1008,
+    GADGET_MESSAGE_ISMRMRD_IMAGE_CPLX_FLOAT               = 1009, /**< DEPRECATED */
+    GADGET_MESSAGE_ISMRMRD_IMAGE_REAL_FLOAT               = 1010, /**< DEPRECATED */
+    GADGET_MESSAGE_ISMRMRD_IMAGE_REAL_USHORT              = 1011, /**< DEPRECATED */
+    GADGET_MESSAGE_DICOM                                  = 1012, /**< DEPRECATED */
+    GADGET_MESSAGE_CLOUD_JOB                              = 1013,
+    GADGET_MESSAGE_GADGETCLOUD_JOB                        = 1014,
+    GADGET_MESSAGE_ISMRMRD_IMAGEWITHATTRIB_CPLX_FLOAT     = 1015, /**< DEPRECATED */
+    GADGET_MESSAGE_ISMRMRD_IMAGEWITHATTRIB_REAL_FLOAT     = 1016, /**< DEPRECATED */
+    GADGET_MESSAGE_ISMRMRD_IMAGEWITHATTRIB_REAL_USHORT    = 1017, /**< DEPRECATED */
+    GADGET_MESSAGE_DICOM_WITHNAME                         = 1018,
+    GADGET_MESSAGE_DEPENDENCY_QUERY                       = 1019,
+    GADGET_MESSAGE_ISMRMRD_IMAGE_REAL_SHORT               = 1020, /**< DEPRECATED */
+    GADGET_MESSAGE_ISMRMRD_IMAGEWITHATTRIB_REAL_SHORT     = 1021, /**< DEPRECATED */
+    GADGET_MESSAGE_ISMRMRD_IMAGE                          = 1022,
+    GADGET_MESSAGE_EXT_ID_MAX                             = 4096
+};
+
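+// As used below, this mutex serializes creation of, and appends to, the
+// ISMRMRD::Dataset (HDF5 file) from the socket reader thread; it does not
+// guard the socket itself.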
+boost::mutex mtx;
+
+struct GadgetMessageIdentifier
+{
+    uint16_t id;
+};
+
+struct GadgetMessageConfigurationFile
+{
+    char configuration_file[1024];
+};
+
+struct GadgetMessageScript
+{
+    uint32_t script_length;
+};
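+
+// Wire framing, as a sketch inferred from the readers and writers below:
+// every message on the socket starts with a GadgetMessageIdentifier, followed
+// by a payload whose layout depends on the id. For example, requesting a
+// configuration file amounts to
+//
+//   GadgetMessageIdentifier id;
+//   id.id = GADGET_MESSAGE_CONFIG_FILE;
+//   GadgetMessageConfigurationFile cfg;   // zero-padded, fixed-size file name
+//   boost::asio::write(socket, boost::asio::buffer(&id,  sizeof(id)));
+//   boost::asio::write(socket, boost::asio::buffer(&cfg, sizeof(cfg)));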
+
+class GadgetronClientException : public std::exception
+{
+
+public:
+    GadgetronClientException(std::string msg)
+        : msg_(msg)
+    {
+
+    }
+
+    virtual ~GadgetronClientException() throw() {}
+
+    virtual const char* what() const throw()
+    {
+        return msg_.c_str();
+    }
+
+protected:
+    std::string msg_;
+};
+
+class GadgetronClientMessageReader
+{
+public:
+    virtual ~GadgetronClientMessageReader() {}
+
+    /**
+    Function must be implemented to read a specific message.
+    */
+    virtual void read(tcp::socket* s) = 0;
+
+};
+
+class GadgetronClientImageMessageReader : public GadgetronClientMessageReader
+{
+
+public:
+    GadgetronClientImageMessageReader(std::string filename, std::string groupname)
+        : file_name_(filename)
+        , group_name_(groupname)
+    {
+
+    }
+
+    ~GadgetronClientImageMessageReader() {
+    } 
+
+    template <typename T> 
+    void read_data_attrib(tcp::socket* stream, const ISMRMRD::ImageHeader& h, ISMRMRD::Image<T>& im)
+    {
+        im.setHead(h);
+
+        typedef unsigned long long size_t_type;
+
+        //Read meta attributes
+        size_t_type meta_attrib_length;
+        boost::asio::read(*stream, boost::asio::buffer(&meta_attrib_length, sizeof(size_t_type)));
+
+        if (meta_attrib_length>0)
+        {
+            std::string meta_attrib(meta_attrib_length, 0);
+            boost::asio::read(*stream, boost::asio::buffer(const_cast<char*>(meta_attrib.c_str()), meta_attrib_length));
+            im.setAttributeString(meta_attrib);
+        }
+
+        //Read image data
+        boost::asio::read(*stream, boost::asio::buffer(im.getDataPtr(), im.getDataSize()));
+        {
+            if (!dataset_) {
+
+                {
+                    mtx.lock();
+                    dataset_ = boost::shared_ptr<ISMRMRD::Dataset>(new ISMRMRD::Dataset(file_name_.c_str(), group_name_.c_str(), true)); // create if necessary 
+                    mtx.unlock();
+                }
+            }
+
+            std::stringstream st1;
+            st1 << "image_" << h.image_series_index;
+            std::string image_varname = st1.str();
+
+            {
+                mtx.lock();
+                //TODO should this be wrapped in a try/catch?
+                dataset_->appendImage(image_varname, im);
+                mtx.unlock();
+            }
+        }
+    }
+
+    virtual void read(tcp::socket* stream) 
+    {
+        //Read the image header from the socket
+        ISMRMRD::ImageHeader h;
+        boost::asio::read(*stream, boost::asio::buffer(&h,sizeof(ISMRMRD::ImageHeader)));
+
+        if (h.data_type == ISMRMRD::ISMRMRD_USHORT)
+        {
+            ISMRMRD::Image<unsigned short> im;
+            this->read_data_attrib(stream, h, im);
+        }
+        else if (h.data_type == ISMRMRD::ISMRMRD_SHORT)
+        {
+            ISMRMRD::Image<short> im;
+            this->read_data_attrib(stream, h, im);
+        }
+        else if (h.data_type == ISMRMRD::ISMRMRD_UINT)
+        {
+            ISMRMRD::Image<unsigned int> im;
+            this->read_data_attrib(stream, h, im);
+        }
+        else if (h.data_type == ISMRMRD::ISMRMRD_INT)
+        {
+            ISMRMRD::Image<int> im;
+            this->read_data_attrib(stream, h, im);
+        }
+        else if (h.data_type == ISMRMRD::ISMRMRD_FLOAT)
+        {
+            ISMRMRD::Image<float> im;
+            this->read_data_attrib(stream, h, im);
+        }
+        else if (h.data_type == ISMRMRD::ISMRMRD_DOUBLE)
+        {
+            ISMRMRD::Image<double> im;
+            this->read_data_attrib(stream, h, im);
+        }
+        else if (h.data_type == ISMRMRD::ISMRMRD_CXFLOAT)
+        {
+            ISMRMRD::Image< std::complex<float> > im;
+            this->read_data_attrib(stream, h, im);
+        }
+        else if (h.data_type == ISMRMRD::ISMRMRD_CXDOUBLE)
+        {
+            ISMRMRD::Image< std::complex<double> > im;
+            this->read_data_attrib(stream, h, im);
+        }
+        else
+        {
+            throw GadgetronClientException("Invalide image data type ... ");
+        }
+    }
+
+protected:
+    std::string group_name_;
+    std::string file_name_;
+    boost::shared_ptr<ISMRMRD::Dataset> dataset_;
+};
+
+// ----------------------------------------------------------------
+// for the analyze image format
+#ifdef DT_UNKNOWN
+    #undef DT_UNKNOWN
+#endif // DT_UNKNOWN
+
+enum AnalyzeDataType
+{
+    DT_ANA_UNKNOWN=0,
+
+    DT_NONE                    =0,
+    DT_UNKNOWN                 =0,     /* what it says, dude           */
+    DT_BINARY                  =1,     /* binary (1 bit/voxel)         */
+    DT_UNSIGNED_CHAR           =2,     /* unsigned char (8 bits/voxel) */
+    DT_SIGNED_SHORT            =4,     /* signed short (16 bits/voxel) */
+    DT_UNSIGNED_SHORT          =5,
+    DT_SIGNED_INT              =8,     /* signed int (32 bits/voxel)   */
+    DT_UNSIGNED_INT            =9,
+    DT_FLOAT                  =16,     /* float (32 bits/voxel)        */
+    DT_COMPLEX                =32,     /* complex (64 bits/voxel)      */
+    DT_DOUBLE                 =64,     /* double (64 bits/voxel)       */
+    DT_RGB                   =128,     /* RGB triple (24 bits/voxel)   */
+    DT_ALL                   =255,     /* not very useful (?)          */
+
+                                /*----- another set of names for the same ---*/
+    DT_UINT8                   =2,
+    DT_INT16                   =4,
+    DT_INT32                   =8,
+    DT_FLOAT32                =16,
+    DT_COMPLEX64              =32,
+    DT_FLOAT64                =64,
+    DT_RGB24                 =128,
+
+                                /*------------------- new codes for NIFTI ---*/
+    DT_INT8                  =256,     /* signed char (8 bits)         */
+    DT_UINT16                =512,     /* unsigned short (16 bits)     */
+    DT_UINT32                =768,     /* unsigned int (32 bits)       */
+    DT_INT64                =1024,     /* long long (64 bits)          */
+    DT_UINT64               =1280,     /* unsigned long long (64 bits) */
+    DT_FLOAT128             =1536,     /* long double (128 bits)       */
+    DT_COMPLEX128           =1792,     /* double pair (128 bits)       */
+    DT_COMPLEX256           =2048,     /* long double pair (256 bits)  */
+    DT_RGBA32               =2304,     /* 4 byte RGBA (32 bits/voxel)  */
+};
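+
+// These values mirror the Analyze 7.5 / NIFTI-1 datatype codes; the helper
+// below maps C++ types (via their RTTI names) onto them when writing headers.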
+
+AnalyzeDataType getDataTypeFromRTTI(const std::string& name)
+{
+    AnalyzeDataType analyzeDT = DT_ANA_UNKNOWN;
+
+    if ( name == typeid(unsigned char).name() )
+    {
+        analyzeDT = DT_UNSIGNED_CHAR;
+    }
+
+    if ( name == typeid(short).name() )
+    {
+        analyzeDT = DT_SIGNED_SHORT;
+    }
+
+    if ( name == typeid(unsigned short).name() )
+    {
+        analyzeDT = DT_UINT16;
+    }
+
+    if ( name == typeid(int).name() )
+    {
+        analyzeDT = DT_SIGNED_INT;
+    }
+
+    if ( name == typeid(unsigned int).name() )
+    {
+        analyzeDT = DT_UINT32;
+    }
+
+    if ( name == typeid(float).name() )
+    {
+        analyzeDT = DT_FLOAT;
+    }
+
+    if ( name == typeid(double).name() )
+    {
+        analyzeDT = DT_DOUBLE;
+    }
+
+    if ( name == typeid(long double).name() )
+    {
+        analyzeDT = DT_FLOAT128;
+    }
+
+    if ( name == typeid(std::complex<float>).name() )
+    {
+        analyzeDT = DT_COMPLEX;
+    }
+
+    if ( name == typeid(std::complex<double>).name() )
+    {
+        analyzeDT = DT_COMPLEX128;
+    }
+
+    if ( name == typeid(std::complex<long double>).name() )
+    {
+        analyzeDT = DT_COMPLEX256;
+    }
+
+    return analyzeDT;
+}
+
+struct header_key
+{
+    int sizeof_hdr;
+    char data_type[10];
+    char db_name[18];
+    int extents;
+    short int session_error;
+    char regular;
+    char hkey_un0;
+};
+
+struct image_dimension
+{
+    short int dim[8];
+    short int unused8;
+    short int unused9;
+    short int unused10;
+    short int unused11;
+    short int unused12;
+    short int unused13;
+    short int unused14;
+    short int datatype;
+    short int bitpix;
+    short int dim_un0;
+    float pixdim[8];
+    float vox_offset;
+    float funused1;
+    float funused2;
+    float funused3;
+    float cal_max;
+    float cal_min;
+    float compressed;
+    float verified;
+    int glmax,glmin;
+};
+
+struct data_history
+{
+    char descrip[80];
+    char aux_file[24];
+    char orient;
+    char originator[10];
+    char generated[10];
+    char scannum[10];
+    char patient_id[10];
+    char exp_date[10];
+    char exp_time[10];
+    char hist_un0[3];
+    int views;
+    int vols_added;
+    int start_field;
+    int field_skip;
+    int omax, omin;
+    int smax, smin;
+};
+
+// Analyze75 header has 348 bytes
+struct dsr
+{
+    struct header_key hk;
+    struct image_dimension dime;
+    struct data_history hist;
+};
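+
+// Layout note: header_key (40 bytes) + image_dimension (108 bytes) +
+// data_history (200 bytes) = 348 bytes on common ABIs, matching the
+// hk.sizeof_hdr value written below. If desired, this could be checked at
+// compile time with e.g.
+//   static_assert(sizeof(dsr) == 348, "Analyze75 header must be 348 bytes");
+// (illustrative check only, not part of the upstream source).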
+
+class IOAnalyze
+{
+public:
+
+    typedef dsr HeaderType;
+
+    IOAnalyze() {}
+    virtual ~IOAnalyze() {}
+
+    template <typename T> void array2Header(const std::vector<size_t>& dim, const std::vector<float>& pixelSize, HeaderType& header)
+    {
+        try
+        {
+            // set everything to zero
+            memset(&header, 0, sizeof(dsr));
+
+            // header_key
+            header.hk.sizeof_hdr = 348;
+            size_t i;
+            for (i=0; i<10; i++ ) header.hk.data_type[i] = 0;
+            for (i=0; i<18; i++ ) header.hk.db_name[i] = 0;
+            header.hk.extents = 16384;
+            header.hk.session_error = 0;
+            header.hk.regular = 'r';
+            header.hk.hkey_un0 = 0;
+
+            // image_dimension
+            size_t NDim = dim.size();
+
+            header.dime.dim[0] = (short)(NDim);
+            header.dime.dim[1] = (short)(dim[0]);
+
+            if ( NDim > 1 )
+                header.dime.dim[2] = (short)(dim[1]);
+            else
+                header.dime.dim[2] = 1;
+
+            if ( NDim > 2 )
+                header.dime.dim[3] = (short)(dim[2]);
+            else
+                header.dime.dim[3] = 1;
+
+            if ( NDim > 3 )
+                header.dime.dim[4] = (short)(dim[3]);
+            else
+                header.dime.dim[4] = 1;
+
+            if ( NDim > 4 )
+                header.dime.dim[5] = (short)(dim[4]);
+            else
+                header.dime.dim[5] = 1;
+
+            if ( NDim > 5 )
+                header.dime.dim[6] = (short)(dim[5]);
+            else
+                header.dime.dim[6] = 1;
+
+            if ( NDim > 6 )
+                header.dime.dim[7] = (short)(dim[6]);
+            else
+                header.dime.dim[7] = 1;
+
+            if ( NDim > 7 )
+                header.dime.unused8 = (short)(dim[7]);
+            else
+                header.dime.unused8 = 1;
+
+            if ( NDim > 8 )
+                header.dime.unused9 = (short)(dim[8]);
+            else
+                header.dime.unused9 = 1;
+
+            if ( NDim > 9 )
+                header.dime.unused10 = (short)(dim[9]);
+            else
+                header.dime.unused10 = 1;
+
+            header.dime.unused11 = 0;
+            header.dime.unused12 = 0;
+            header.dime.unused13 = 0;
+            header.dime.unused14 = 0;
+
+            std::string rttiID = std::string(typeid(T).name());
+            header.dime.datatype = (short)getDataTypeFromRTTI(rttiID);
+            header.dime.bitpix = (short)(8*sizeof(T));
+            header.dime.dim_un0 = 0;
+
+            // since the NDArray does not carry the pixel spacing
+            header.dime.pixdim[0] = 0;
+            if ( pixelSize.size() > 1 )
+                header.dime.pixdim[1] = pixelSize[0];
+            if ( pixelSize.size() > 2 )
+                header.dime.pixdim[2] = pixelSize[1];
+            if ( pixelSize.size() > 3 )
+                header.dime.pixdim[3] = pixelSize[2];
+            if ( pixelSize.size() > 4 )
+                header.dime.pixdim[4] = pixelSize[3];
+            if ( pixelSize.size() > 5 )
+                header.dime.pixdim[5] = pixelSize[4];
+            if ( pixelSize.size() > 6 )
+                header.dime.pixdim[6] = pixelSize[5];
+            if ( pixelSize.size() > 7 )
+                header.dime.pixdim[7] = pixelSize[6];
+
+            header.dime.vox_offset = 0;
+            header.dime.funused1 = 0;
+            header.dime.funused2 = 0;
+            header.dime.funused3 = 0;
+            header.dime.cal_max = 0;
+            header.dime.cal_min = 0;
+            header.dime.compressed = 0;
+            header.dime.verified = 0;
+            header.dime.glmax = 0;
+            header.dime.glmin = 0;
+
+            // data history
+            for (i=0; i<80; i++ ) header.hist.descrip[i] = 0;
+            for (i=0; i<24; i++ ) header.hist.aux_file[i] = 0;
+            header.hist.orient = 0;
+            for (i=0; i<10; i++ ) header.hist.originator[i] = 0;
+            for (i=0; i<10; i++ ) header.hist.generated[i] = 0;
+            for (i=0; i<10; i++ ) header.hist.scannum[i] = 0;
+            for (i=0; i<10; i++ ) header.hist.patient_id[i] = 0;
+            for (i=0; i<10; i++ ) header.hist.exp_date[i] = 0;
+            for (i=0; i<10; i++ ) header.hist.exp_time[i] = 0;
+            for (i=0; i<3; i++ ) header.hist.hist_un0[i] = 0;
+            header.hist.views = 0;
+            header.hist.vols_added = 0;
+            header.hist.start_field = 0;
+            header.hist.field_skip = 0;
+            header.hist.omax = 0;
+            header.hist.omin = 0;
+            header.hist.smax = 0;
+            header.hist.smin = 0;
+        }
+        catch(...)
+        {
+            throw GadgetronClientException("Errors in IOAnalyze::array2Analyze(dim, header) ... ");
+        }
+    }
+};
+
+class GadgetronClientAnalyzeImageMessageReader : public GadgetronClientMessageReader
+{
+
+public:
+
+    GadgetronClientAnalyzeImageMessageReader(const std::string& prefix = std::string("Image")) : prefix_(prefix)
+    {
+
+    }
+
+    ~GadgetronClientAnalyzeImageMessageReader() {
+    } 
+
+    template <typename T>
+    void read_data_attrib(tcp::socket* stream, const ISMRMRD::ImageHeader& h, ISMRMRD::Image<T>& im)
+    {
+        im.setHead(h);
+
+        std::cout << "Receiving image : " << h.image_series_index << " - " << h.image_index << std::endl;
+
+        typedef unsigned long long size_t_type;
+
+        std::ostringstream ostr;
+
+        if (!prefix_.empty())
+        {
+            ostr << prefix_ << "_";
+        }
+
+        ostr << "SLC" << h.slice << "_"
+            << "CON" << h.contrast << "_"
+            << "PHS" << h.phase << "_"
+            << "REP" << h.repetition << "_"
+            << "SET" << h.set << "_"
+            << "AVE" << h.average << "_"
+            << h.image_index
+            << "_" << h.image_series_index;
+
+        std::string filename = ostr.str();
+
+        //Read meta attributes
+        size_t_type meta_attrib_length;
+        boost::asio::read(*stream, boost::asio::buffer(&meta_attrib_length, sizeof(size_t_type)));
+
+        if (meta_attrib_length > 0)
+        {
+            std::string meta_attrib(meta_attrib_length, 0);
+            boost::asio::read(*stream, boost::asio::buffer(const_cast<char*>(meta_attrib.c_str()), meta_attrib_length));
+
+            // deserialize the meta attribute
+            ISMRMRD::MetaContainer imgAttrib;
+            ISMRMRD::deserialize(meta_attrib.c_str(), imgAttrib);
+
+            std::stringstream st3;
+            st3 << filename << ".attrib";
+            std::string meta_varname = st3.str();
+
+            std::ofstream outfile;
+            outfile.open(meta_varname.c_str(), std::ios::out | std::ios::binary);
+            outfile.write(meta_attrib.c_str(), meta_attrib_length);
+            outfile.close();
+        }
+
+        //Read data
+        boost::asio::read(*stream, boost::asio::buffer(im.getDataPtr(), im.getDataSize()));
+
+        // analyze header
+        std::stringstream st1;
+        st1 << filename << ".hdr";
+        std::string head_varname = st1.str();
+
+        std::vector<size_t> dim(3);
+        dim[0] = h.matrix_size[0];
+        dim[1] = h.matrix_size[1];
+        dim[2] = h.matrix_size[2];
+
+        std::vector<float> pixelSize(3);
+        pixelSize[0] = h.field_of_view[0] / h.matrix_size[0];
+        pixelSize[1] = h.field_of_view[1] / h.matrix_size[1];
+        pixelSize[2] = h.field_of_view[2] / h.matrix_size[2];
+
+        IOAnalyze hdr;
+        dsr header;
+        hdr.array2Header<T>(dim, pixelSize, header);
+
+        std::ofstream outfileHeader;
+        outfileHeader.open(head_varname.c_str(), std::ios::out | std::ios::binary);
+        outfileHeader.write(reinterpret_cast<const char*>(&header), sizeof(dsr));
+        outfileHeader.close();
+
+        // data
+        std::stringstream st2;
+        st2 << filename << ".img";
+        std::string img_varname = st2.str();
+
+        std::ofstream outfileData;
+        outfileData.open(img_varname.c_str(), std::ios::out | std::ios::binary);
+        outfileData.write(reinterpret_cast<const char*>(im.getDataPtr()), sizeof(T)*dim[0] * dim[1] * dim[2]);
+        outfileData.close();
+    }
+
+    virtual void read(tcp::socket* stream) 
+    {
+        //Read the image header from the socket
+        ISMRMRD::ImageHeader h;
+        boost::asio::read(*stream, boost::asio::buffer(&h,sizeof(ISMRMRD::ImageHeader)));
+
+        if (h.data_type == ISMRMRD::ISMRMRD_USHORT)
+        {
+            ISMRMRD::Image<unsigned short> im;
+            this->read_data_attrib(stream, h, im);
+        }
+        else if (h.data_type == ISMRMRD::ISMRMRD_SHORT)
+        {
+            ISMRMRD::Image<short> im;
+            this->read_data_attrib(stream, h, im);
+        }
+        else if (h.data_type == ISMRMRD::ISMRMRD_UINT)
+        {
+            ISMRMRD::Image<unsigned int> im;
+            this->read_data_attrib(stream, h, im);
+        }
+        else if (h.data_type == ISMRMRD::ISMRMRD_INT)
+        {
+            ISMRMRD::Image<int> im;
+            this->read_data_attrib(stream, h, im);
+        }
+        else if (h.data_type == ISMRMRD::ISMRMRD_FLOAT)
+        {
+            ISMRMRD::Image<float> im;
+            this->read_data_attrib(stream, h, im);
+        }
+        else if (h.data_type == ISMRMRD::ISMRMRD_DOUBLE)
+        {
+            ISMRMRD::Image<double> im;
+            this->read_data_attrib(stream, h, im);
+        }
+        else if (h.data_type == ISMRMRD::ISMRMRD_CXFLOAT)
+        {
+            ISMRMRD::Image< std::complex<float> > im;
+            this->read_data_attrib(stream, h, im);
+        }
+        else if (h.data_type == ISMRMRD::ISMRMRD_CXDOUBLE)
+        {
+            ISMRMRD::Image< std::complex<double> > im;
+            this->read_data_attrib(stream, h, im);
+        }
+        else
+        {
+            throw GadgetronClientException("Invalide image data type ... ");
+        }
+    }
+
+protected:
+
+    std::string prefix_;
+};
+
+// ----------------------------------------------------------------
+
+#define MAX_BLOBS_LOG_10    6
+
+class GadgetronClientBlobMessageReader 
+    : public GadgetronClientMessageReader
+{
+
+public:
+    GadgetronClientBlobMessageReader(std::string fileprefix, std::string filesuffix)
+        : number_of_calls_(0)
+        , file_prefix(fileprefix)
+        , file_suffix(filesuffix)
+
+    {
+
+    }
+
+    virtual ~GadgetronClientBlobMessageReader() {}
+
+    virtual void read(tcp::socket* socket) 
+    {
+
+        // MUST READ 32-bits
+        uint32_t nbytes;
+        boost::asio::read(*socket, boost::asio::buffer(&nbytes,sizeof(uint32_t)));
+
+        std::vector<char> data(nbytes,0);
+        boost::asio::read(*socket, boost::asio::buffer(&data[0],nbytes));
+
+        unsigned long long fileNameLen;
+        boost::asio::read(*socket, boost::asio::buffer(&fileNameLen,sizeof(unsigned long long)));
+
+        std::string filenameBuf(fileNameLen,0);
+        boost::asio::read(*socket, boost::asio::buffer(const_cast<char*>(filenameBuf.c_str()),fileNameLen));
+
+        typedef unsigned long long size_t_type;
+
+        size_t_type meta_attrib_length;
+        boost::asio::read(*socket, boost::asio::buffer(&meta_attrib_length, sizeof(size_t_type)));
+
+        std::string meta_attrib;
+        if (meta_attrib_length > 0)
+        {
+            std::string meta_attrib_socket(meta_attrib_length, 0);
+            boost::asio::read(*socket, boost::asio::buffer(const_cast<char*>(meta_attrib_socket.c_str()), meta_attrib_length));
+            meta_attrib = meta_attrib_socket;
+        }
+
+        std::stringstream filename;
+        std::string filename_attrib;
+
+        // Create the filename: (prefix_%06d.suffix)
+        filename << file_prefix << "_";
+        filename << std::setfill('0') << std::setw(MAX_BLOBS_LOG_10) << number_of_calls_;
+        filename_attrib = filename.str();
+        filename << "." << file_suffix;
+        filename_attrib.append("_attrib.xml");
+
+        std::cout << "Writing image " << filename.str() << std::endl;
+
+        std::ofstream outfile;
+        outfile.open(filename.str().c_str(), std::ios::out | std::ios::binary);
+
+        std::ofstream outfile_attrib;
+        if (meta_attrib_length > 0)
+        {
+            outfile_attrib.open(filename_attrib.c_str(), std::ios::out | std::ios::binary);
+        }
+
+        if (outfile.good())
+        {
+            /* write 'size' bytes starting at 'data's pointer */
+            outfile.write(&data[0], nbytes);
+            outfile.close();
+
+            if (meta_attrib_length > 0)
+            {
+                outfile_attrib.write(meta_attrib.c_str(), meta_attrib.length());
+                outfile_attrib.close();
+            }
+
+            number_of_calls_++;
+        }
+        else
+        {
+            throw GadgetronClientException("Unable to write blob to output file\n");
+        }
+    }
+
+protected:
+    size_t number_of_calls_;
+    std::string file_prefix;
+    std::string file_suffix;
+
+};
+
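+// GadgetronClientConnector drives the client side of the protocol: it opens
+// the TCP connection, spawns a reader thread that hands each incoming message
+// to the GadgetronClientMessageReader matching its id, and provides senders
+// for configuration, parameters and acquisitions. A typical call sequence
+// (sketch based on the member functions below; host, port and file names are
+// placeholders) is:
+//
+//   GadgetronClientConnector con;
+//   con.connect("localhost", "9002");
+//   con.send_gadgetron_configuration_file("default.xml");
+//   con.send_gadgetron_parameters(ismrmrd_xml_header);
+//   // ... one send_ismrmrd_acquisition(acq) per acquisition ...
+//   con.send_gadgetron_close();
+//   con.wait();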
+class GadgetronClientConnector
+{
+
+public:
+    GadgetronClientConnector() 
+        : socket_(0)
+    {
+
+    }
+
+    virtual ~GadgetronClientConnector() 
+    {
+        if (socket_) {
+            socket_->close();
+            delete socket_;
+        }
+    }
+
+    void read_task()
+    {
+        if (!socket_) {
+            throw GadgetronClientException("Unable to create socket.");
+        }
+
+        GadgetMessageIdentifier id;
+        while (socket_->is_open()) {
+            boost::asio::read(*socket_, boost::asio::buffer(&id,sizeof(GadgetMessageIdentifier)));
+
+            if (id.id == GADGET_MESSAGE_CLOSE) {
+                break;
+            }
+
+            GadgetronClientMessageReader* r = find_reader(id.id);
+
+            if (!r) {
+                std::cout << "Message received with ID: " << id.id << std::endl;
+                throw GadgetronClientException("Unknown Message ID");
+            } else {
+                r->read(socket_);
+            }
+        }
+    }
+
+    void wait() {
+        reader_thread_.join();
+    }
+
+    void connect(std::string hostname, std::string port)
+    {
+
+
+        tcp::resolver resolver(io_service);
+        tcp::resolver::query query(tcp::v4(), hostname.c_str(), port.c_str());
+        tcp::resolver::iterator endpoint_iterator = resolver.resolve(query);
+        tcp::resolver::iterator end;
+
+        socket_ = new tcp::socket(io_service);
+
+        if (!socket_) {
+            throw GadgetronClientException("Unable to create socket.");
+        }
+
+        //TODO:
+        //For newer versions of Boost, we should use
+        //   boost::asio::connect(*socket_, iterator);
+
+        boost::system::error_code error = boost::asio::error::host_not_found;
+        while (error && endpoint_iterator != end) {
+            socket_->close();
+            socket_->connect(*endpoint_iterator++, error);
+        }
+        if (error)
+            throw GadgetronClientException("Error connecting using socket.");
+
+        reader_thread_ = boost::thread(boost::bind(&GadgetronClientConnector::read_task, this));
+
+    }
+
+    void send_gadgetron_close() { 
+        if (!socket_) {
+            throw GadgetronClientException("Invalid socket.");
+        }
+        GadgetMessageIdentifier id;
+        id.id = GADGET_MESSAGE_CLOSE;    
+        boost::asio::write(*socket_, boost::asio::buffer(&id, sizeof(GadgetMessageIdentifier)));
+    }
+
+    void send_gadgetron_configuration_file(std::string config_xml_name) {
+
+        if (!socket_) {
+            throw GadgetronClientException("Invalid socket.");
+        }
+
+        GadgetMessageIdentifier id;
+        id.id = GADGET_MESSAGE_CONFIG_FILE;
+
+        GadgetMessageConfigurationFile ini;
+        memset(&ini,0,sizeof(GadgetMessageConfigurationFile));
+        strncpy(ini.configuration_file, config_xml_name.c_str(),config_xml_name.size());
+
+        boost::asio::write(*socket_, boost::asio::buffer(&id, sizeof(GadgetMessageIdentifier)));
+        boost::asio::write(*socket_, boost::asio::buffer(&ini, sizeof(GadgetMessageConfigurationFile)));
+
+    }
+
+    void send_gadgetron_configuration_script(std::string xml_string)
+    {
+        if (!socket_) {
+            throw GadgetronClientException("Invalid socket.");
+        }
+
+        GadgetMessageIdentifier id;
+        id.id = GADGET_MESSAGE_CONFIG_SCRIPT;
+
+        GadgetMessageScript conf;
+        conf.script_length = (uint32_t)xml_string.size()+1;
+
+        boost::asio::write(*socket_, boost::asio::buffer(&id, sizeof(GadgetMessageIdentifier)));
+        boost::asio::write(*socket_, boost::asio::buffer(&conf, sizeof(GadgetMessageScript)));
+        boost::asio::write(*socket_, boost::asio::buffer(xml_string.c_str(), conf.script_length));    
+
+    }
+
+
+    void send_gadgetron_parameters(std::string xml_string)
+    {
+        if (!socket_) {
+            throw GadgetronClientException("Invalid socket.");
+        }
+
+        GadgetMessageIdentifier id;
+        id.id = GADGET_MESSAGE_PARAMETER_SCRIPT;
+
+        GadgetMessageScript conf;
+        conf.script_length = (uint32_t)xml_string.size()+1;
+
+        boost::asio::write(*socket_, boost::asio::buffer(&id, sizeof(GadgetMessageIdentifier)));
+        boost::asio::write(*socket_, boost::asio::buffer(&conf, sizeof(GadgetMessageScript)));
+        boost::asio::write(*socket_, boost::asio::buffer(xml_string.c_str(), conf.script_length));    
+    }
+
+    void send_ismrmrd_acquisition(ISMRMRD::Acquisition& acq) 
+    {
+        if (!socket_) {
+            throw GadgetronClientException("Invalid socket.");
+        }
+
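+        // Message layout written below: GadgetMessageIdentifier, the fixed-size
+        // AcquisitionHeader, the trajectory floats (if any), then the complex sample
+        // data (two floats per sample per active channel).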
+        GadgetMessageIdentifier id;
+        id.id = GADGET_MESSAGE_ISMRMRD_ACQUISITION;
+
+        boost::asio::write(*socket_, boost::asio::buffer(&id, sizeof(GadgetMessageIdentifier)));
+        boost::asio::write(*socket_, boost::asio::buffer(&acq.getHead(), sizeof(ISMRMRD::AcquisitionHeader)));
+
+        unsigned long trajectory_elements = acq.getHead().trajectory_dimensions*acq.getHead().number_of_samples;
+        unsigned long data_elements = acq.getHead().active_channels*acq.getHead().number_of_samples;
+
+        if (trajectory_elements) {
+            boost::asio::write(*socket_, boost::asio::buffer(&acq.getTrajPtr()[0], sizeof(float)*trajectory_elements));
+        }
+
+
+        if (data_elements) {
+            boost::asio::write(*socket_, boost::asio::buffer(&acq.getDataPtr()[0], 2*sizeof(float)*data_elements));
+        }
+    }
+
+    void register_reader(unsigned short slot, boost::shared_ptr<GadgetronClientMessageReader> r) {
+        readers_[slot] = r;
+    }
+
+protected:
+    typedef std::map<unsigned short, boost::shared_ptr<GadgetronClientMessageReader> > maptype;
+
+    GadgetronClientMessageReader* find_reader(unsigned short r)
+    {
+        GadgetronClientMessageReader* ret = 0;
+
+        maptype::iterator it = readers_.find(r);
+
+        if (it != readers_.end()) {
+            ret = it->second.get();
+        }
+
+        return ret;
+    }
+
+    boost::asio::io_service io_service;
+    tcp::socket* socket_;
+    boost::thread reader_thread_;
+    maptype readers_;
+
+
+};
+
+
+int main(int argc, char **argv)
+{
+
+    std::string host_name;
+    std::string port;
+    std::string in_filename;
+    std::string out_filename;
+    std::string hdf5_in_group;
+    std::string hdf5_out_group;
+    std::string config_file;
+    std::string config_file_local;
+    std::string config_xml_local;
+    unsigned int loops;
+    std::string out_fileformat;
+
+    po::options_description desc("Allowed options");
+
+    desc.add_options()
+        ("help,h", "produce help message")
+        ("port,p", po::value<std::string>(&port)->default_value("9002"), "Port")
+        ("address,a", po::value<std::string>(&host_name)->default_value("localhost"), "Address (hostname) of Gadgetron host")
+        ("filename,f", po::value<std::string>(&in_filename), "Input file")
+        ("outfile,o", po::value<std::string>(&out_filename)->default_value("out.h5"), "Output file")
+        ("in-group,g", po::value<std::string>(&hdf5_in_group)->default_value("/dataset"), "Input data group")
+        ("out-group,G", po::value<std::string>(&hdf5_out_group)->default_value(get_date_time_string()), "Output group name")  
+        ("config,c", po::value<std::string>(&config_file)->default_value("default.xml"), "Configuration file (remote)")
+        ("config-local,C", po::value<std::string>(&config_file_local), "Configuration file (local)")
+        ("loops,l", po::value<unsigned int>(&loops)->default_value(1), "Loops")
+        ("outformat,F", po::value<std::string>(&out_fileformat)->default_value("h5"), "Out format, h5 for hdf5 and hdr for analyze image")
+        ;
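+
+    // Illustrative invocation (binary and file names are assumptions):
+    //   gadgetron_ismrmrd_client -a localhost -p 9002 -f data.h5 -c default.xml -o out.h5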
+
+    po::variables_map vm;
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
+
+    if (vm.count("help")) {
+        std::cout << desc << std::endl;
+        return 0;
+    }
+
+    if (!vm.count("filename")) {
+        std::cout << std::endl << std::endl << "\tYou must supply a filename" << std::endl << std::endl;
+        std::cout << desc << std::endl;
+        return -1;
+    }
+
+    if (vm.count("config-local")) {
+        std::ifstream t(config_file_local.c_str());
+        if (t) {
+            //Read in the file.
+            config_xml_local = std::string((std::istreambuf_iterator<char>(t)),
+                std::istreambuf_iterator<char>());
+        } else {
+            std::cout << "Unable to read local xml configuration: " << config_file_local  << std::endl;
+            return -1;
+        }
+    }
+
+    std::cout << "Gadgetron ISMRMRD client" << std::endl;
+
+    //Let's check if the files exist:
+    std::string hdf5_xml_varname = std::string(hdf5_in_group) + std::string("/xml");
+    std::string hdf5_data_varname = std::string(hdf5_in_group) + std::string("/data");
+
+
+    //TODO:
+    // Add check to see if input file exists
+
+    //Let's open the input file
+    ISMRMRD::Dataset ismrmrd_dataset(in_filename.c_str(), hdf5_in_group.c_str(), false);
+    // Read the header
+    std::string xml_config;
+    ismrmrd_dataset.readHeader(xml_config);
+
+
+    std::cout << "  -- host            :      " << host_name << std::endl;
+    std::cout << "  -- port            :      " << port << std::endl;
+    std::cout << "  -- hdf5 file  in   :      " << in_filename << std::endl;
+    std::cout << "  -- hdf5 group in   :      " << hdf5_in_group << std::endl;
+    std::cout << "  -- conf            :      " << config_file << std::endl;
+    std::cout << "  -- loop            :      " << loops << std::endl;
+    std::cout << "  -- hdf5 file out   :      " << out_filename << std::endl;
+    std::cout << "  -- hdf5 group out  :      " << hdf5_out_group << std::endl;
+
+
+    GadgetronClientConnector con;
+
+    if ( out_fileformat == "hdr" )
+    {
+        con.register_reader(GADGET_MESSAGE_ISMRMRD_IMAGE, boost::shared_ptr<GadgetronClientMessageReader>(new GadgetronClientAnalyzeImageMessageReader(hdf5_out_group)));
+    }
+    else
+    {
+        con.register_reader(GADGET_MESSAGE_ISMRMRD_IMAGE, boost::shared_ptr<GadgetronClientMessageReader>(new GadgetronClientImageMessageReader(out_filename, hdf5_out_group)));
+    }
+
+    con.register_reader(GADGET_MESSAGE_DICOM_WITHNAME, boost::shared_ptr<GadgetronClientMessageReader>(new GadgetronClientBlobMessageReader(std::string(hdf5_out_group), std::string("dcm"))));
+
+    try {
+        con.connect(host_name,port);
+        if (vm.count("config-local")) {
+            con.send_gadgetron_configuration_script(config_xml_local);
+        } else {
+            con.send_gadgetron_configuration_file(config_file);
+        }
+        con.send_gadgetron_parameters(xml_config);
+
+        uint32_t acquisitions = 0;
+        {
+            mtx.lock();
+            acquisitions = ismrmrd_dataset.getNumberOfAcquisitions();
+            mtx.unlock();
+        }
+
+        ISMRMRD::Acquisition acq_tmp;
+        for (uint32_t i = 0; i < acquisitions; i++) {
+            {
+                {
+                    boost::mutex::scoped_lock scoped_lock(mtx);
+                    ismrmrd_dataset.readAcquisition(i, acq_tmp);
+                }
+                con.send_ismrmrd_acquisition(acq_tmp);
+            }
+        }
+
+        con.send_gadgetron_close();
+        con.wait();
+
+    } catch (std::exception& ex) {
+        std::cout << "Error caught: " << ex.what() << std::endl;
+    }
+
+    return 0;
+}
diff --git a/apps/clients/utilities/CMakeLists.txt b/apps/clients/utilities/CMakeLists.txt
new file mode 100644
index 0000000..c02622d
--- /dev/null
+++ b/apps/clients/utilities/CMakeLists.txt
@@ -0,0 +1,49 @@
+set(Boost_NO_BOOST_CMAKE ON)
+
+if(WIN32)
+  find_package(Boost COMPONENTS thread system date_time chrono REQUIRED)
+else(WIN32)
+  find_package(Boost COMPONENTS thread system REQUIRED)
+endif(WIN32)
+
+if(WIN32)
+  link_directories(${Boost_LIBRARY_DIRS})
+endif(WIN32)
+
+include_directories(
+    ${CMAKE_SOURCE_DIR}/apps/gadgetron
+    ${CMAKE_BINARY_DIR}/apps/gadgetron
+    ${CMAKE_SOURCE_DIR}/gadgets/mri_core
+    ${CMAKE_SOURCE_DIR}/toolboxes/core
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/algorithm
+    ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools
+    ${ACE_INCLUDE_DIR} 
+    ${Boost_INCLUDE_DIR} 
+    ${ISMRMRD_INCLUDE_DIR}
+    )
+
+add_executable(gt_alive gt_alive.cpp)
+add_executable(gtdependencyquery gt_query.cpp DependencyQueryReader.h gtquery.xml)
+
+target_link_libraries(gt_alive gadgetron_toolbox_cpucore 
+                               gadgetron_toolbox_gadgettools 
+			       gadgetron_toolbox_log
+                               optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} 
+                               ${Boost_LIBRARIES} 
+                               ${ISMRMRD_LIBRARIES} )
+
+target_link_libraries(gtdependencyquery gadgetron_toolbox_cpucore 
+                                        gadgetron_toolbox_gadgettools 
+					gadgetron_toolbox_log
+                                        optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} 
+                                        ${Boost_LIBRARIES} 
+                                        ${ISMRMRD_LIBRARIES} )
+
+install(TARGETS gt_alive gtdependencyquery DESTINATION bin COMPONENT main)
+install(FILES DependencyQueryReader.h DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+install(FILES isalive.xml gtquery.xml DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH} COMPONENT main)
diff --git a/apps/clients/utilities/DependencyQueryReader.h b/apps/clients/utilities/DependencyQueryReader.h
new file mode 100644
index 0000000..67c6977
--- /dev/null
+++ b/apps/clients/utilities/DependencyQueryReader.h
@@ -0,0 +1,100 @@
+
+/** \file   DependencyQueryReader.h
+    \brief  Implement the reader to write the dependency query results into a file
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include <fstream>
+#include <iomanip>
+
+#include "GadgetMessageInterface.h"
+#include "ismrmrd/meta.h"
+
+namespace Gadgetron
+{
+
+class DependencyQueryReader : public GadgetMessageReader
+{
+    public:
+
+        DependencyQueryReader(std::string filename) : number_of_calls_(0) , filename_(filename)
+        {
+        }
+
+        virtual ~DependencyQueryReader()
+        {
+        }
+
+        virtual ACE_Message_Block* read(ACE_SOCK_Stream* socket)
+        {
+            ssize_t recv_count = 0;
+
+            typedef unsigned long long size_t_type;
+
+            size_t_type len(0);
+            if ( ( recv_count = socket->recv_n( &len, sizeof(size_t_type)) ) <= 0 )
+            {
+	      GERROR("DependencyQueryReader, failed to read query results length\n");
+	      return 0;
+            }
+
+            char* buf = NULL;
+            try
+            {
+                buf = new char[len];
+                if ( buf == NULL )
+                {
+		  GERROR("DependencyQueryReader, failed to allocate buffer\n");
+		  return 0;
+                }
+
+                memset(buf, '\0', len);
+                memcpy(buf, &len, sizeof(size_t_type));
+            }
+            catch (std::runtime_error &err)
+            {
+                GEXCEPTION(err,"DependencyQueryReader, failed to allocate buffer\n");
+                return 0;
+            }
+
+            if ( ( recv_count = socket->recv_n( buf, len) ) <= 0 )
+            {
+	      GERROR("DependencyQueryReader, failed to read query results\n");
+	      delete [] buf;
+	      return 0;
+            }
+
+            std::ofstream outfile;
+            outfile.open (filename_.c_str(), std::ios::out|std::ios::binary);
+
+            if (outfile.good())
+            {
+                outfile.write(buf, len);
+                outfile.close();
+                number_of_calls_++;
+            }
+            else
+            {
+                delete[] buf;
+
+                GERROR_STREAM("File " << filename_ << " is not good for writing\n");
+                return 0;
+            }
+
+            delete[] buf;
+
+            // The GadgetronConnector expects an ACE_Message_Block* (NOT NULL)
+            ACE_Message_Block *mb = new ACE_Message_Block();
+
+            return mb;
+        }
+
+    protected:
+
+        size_t number_of_calls_;
+        std::string filename_;
+};
+
+} // namespace Gadgetron
diff --git a/apps/clients/utilities/gt_alive.cpp b/apps/clients/utilities/gt_alive.cpp
new file mode 100644
index 0000000..c138bd0
--- /dev/null
+++ b/apps/clients/utilities/gt_alive.cpp
@@ -0,0 +1,69 @@
+#include "GadgetronConnector.h"
+#include "GadgetMRIHeaders.h"
+#include "GadgetContainerMessage.h"
+#include "FileInfo.h"
+
+#include <ace/SOCK_Acceptor.h>
+#include <ace/Addr.h>
+#include <ace/INET_Addr.h>
+#include <ace/Log_Msg.h>
+#include <ace/Get_Opt.h>
+#include <ace/OS_NS_string.h>
+
+#include <fstream>
+#include <time.h>
+#include <iomanip>
+#include <iostream>
+
+using namespace Gadgetron;
+
+int ACE_TMAIN(int argc, ACE_TCHAR *argv[] )
+{
+	GadgetronConnector con;
+
+	std::string host("localhost");
+	std::string port("9002");
+
+	ACE_TCHAR hostname[1024];
+	//We will do a little trick to figure out what the hostname would be according to ACE
+	ACE_SOCK_Acceptor listener (ACE_Addr::sap_any);
+	ACE_INET_Addr addr;
+	listener.get_local_addr (addr);
+	ACE_OS_String::strncpy(hostname, addr.get_host_name(), 1024);
+
+	host = std::string(hostname);
+        
+	if (argc > 1) {
+		host = std::string(argv[1]);
+	}
+
+	if (argc > 2) {
+		port = std::string(argv[2]);
+	}
+
+	if (con.open(host,port) != 0) {
+	  GERROR("Unable to connect to the Gadgetron host\n");
+	  return -1;
+	}
+
+	//Tell Gadgetron which XML configuration to run.
+	if (con.send_gadgetron_configuration_file(std::string("isalive.xml")) != 0) {
+	  GERROR("Unable to send XML configuration to the Gadgetron host\n");
+	  return -1;
+	}
+
+
+	GadgetContainerMessage<GadgetMessageIdentifier>* m1 =
+			new GadgetContainerMessage<GadgetMessageIdentifier>();
+
+	m1->getObjectPtr()->id = GADGET_MESSAGE_CLOSE;
+
+	if (con.putq(m1) == -1) {
+	  GERROR("Unable to put CLOSE package on queue\n");
+	  return -1;
+	}
+
+	con.wait();
+
+	return 0;
+}
diff --git a/apps/clients/utilities/gt_query.cpp b/apps/clients/utilities/gt_query.cpp
new file mode 100644
index 0000000..a93b842
--- /dev/null
+++ b/apps/clients/utilities/gt_query.cpp
@@ -0,0 +1,99 @@
+#include "GadgetronConnector.h"
+#include "GadgetMRIHeaders.h"
+#include "GadgetContainerMessage.h"
+#include "DependencyQueryReader.h"
+
+#include <ace/Log_Msg.h>
+#include <ace/Get_Opt.h>
+#include <ace/OS_NS_string.h>
+
+#include <fstream>
+#include <time.h>
+#include <iomanip>
+#include <sstream>
+
+using namespace Gadgetron;
+
+static void usage()
+{
+    using namespace std;
+    std::ostringstream outs;
+
+    outs << "Query the gadgetron server for the stored dependency measurements" << endl;
+    outs << "gtdependencyquery   -p <PORT>                      (default 9002)" << endl;
+    outs << "                    -h <HOST>                      (default localhost)" << endl;
+    outs << "                    -o <Query out file>            (default dependency.xml)" << endl;
+    outs << std::ends; 
+
+    GDEBUG_STREAM(outs.str());
+}
+
+int ACE_TMAIN(int argc, ACE_TCHAR *argv[] )
+{
+    GadgetronConnector con;
+
+    std::string host("localhost");
+    std::string port("9002");
+    std::string out("dependency.xml");
+
+    static const ACE_TCHAR options[] = ACE_TEXT(":p:h:o:");
+
+    ACE_Get_Opt cmd_opts(argc, argv, options);
+
+    int option;
+    while ((option = cmd_opts()) != EOF)
+    {
+        switch (option) {
+        case 'p':
+            port = std::string(cmd_opts.opt_arg());
+            break;
+        case 'h':
+            host = std::string(cmd_opts.opt_arg());
+            break;
+        case 'o':
+            out = std::string(cmd_opts.opt_arg());
+            break;
+        case ':':
+            usage();
+            GERROR("-%c requires an argument.\n", cmd_opts.opt_opt());
+	    return -1;
+            break;
+        default:
+            usage();
+            GERROR("Command line parse error\n");
+	    return -1;
+            break;
+        }
+    }
+
+    if (con.open(host,port) != 0)
+    {
+      GERROR("Unable to connect to the Gadgetron host\n");
+      return -1;
+    }
+
+    // need to register a reader
+    con.register_reader(GADGET_MESSAGE_DEPENDENCY_QUERY, new DependencyQueryReader(out));
+
+    //Tell Gadgetron which XML configuration to run.
+    if (con.send_gadgetron_configuration_file(std::string("gtquery.xml")) != 0)
+    {
+      GERROR("Unable to send XML configuration to the Gadgetron host\n");
+      return -1;
+    }
+
+    GadgetContainerMessage<GadgetMessageIdentifier>* m1 =
+            new GadgetContainerMessage<GadgetMessageIdentifier>();
+
+    m1->getObjectPtr()->id = GADGET_MESSAGE_CLOSE;
+
+    if (con.putq(m1) == -1)
+    {
+      GERROR("Unable to put CLOSE package on queue\n");
+      return -1;
+    }
+
+    con.wait();
+
+    return 0;
+}
diff --git a/apps/clients/utilities/gtquery.xml b/apps/clients/utilities/gtquery.xml
new file mode 100644
index 0000000..86e6a3d
--- /dev/null
+++ b/apps/clients/utilities/gtquery.xml
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <writer>
+        <slot>1019</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>DependencyQueryWriter</classname>
+    </writer>
+
+    <gadget>
+        <name>DependencyQuery</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>DependencyQueryGadget</classname>
+
+        <!-- If set to true, check the file creation time and delete old stored files -->
+        <property>
+            <name>clean_storage_while_query</name>
+            <value>true</value>
+        </property>
+
+        <!-- Maximal time, in hours, that a stored file is kept.
+             If clean_storage_while_query == true, this limit is used to
+             determine which files are deleted
+         -->
+        <property>
+            <name>time_limit_in_storage</name>
+            <value>24.0</value>
+        </property>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/apps/clients/utilities/isalive.xml b/apps/clients/utilities/isalive.xml
new file mode 100644
index 0000000..27c33fc
--- /dev/null
+++ b/apps/clients/utilities/isalive.xml
@@ -0,0 +1,55 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+	<!--        
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetroncore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+  
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetroncore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetroncore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetroncore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+  
+    <gadget>
+      <name>Acc</name>
+      <dll>gadgetroncore</dll>
+      <classname>AccumulatorGadget</classname>
+    </gadget>
+    <gadget>
+      <name>FFT</name>
+      <dll>gadgetroncore</dll>
+      <classname>FFTGadget</classname>
+    </gadget>
+    <gadget>
+      <name>CropCombine</name>
+      <dll>gadgetroncore</dll>
+      <classname>CropAndCombineGadget</classname>
+    </gadget>
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetroncore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>  
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetroncore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+	-->
+</gadgetronStreamConfiguration>
diff --git a/apps/gadgetron/CMakeLists.txt b/apps/gadgetron/CMakeLists.txt
new file mode 100644
index 0000000..2739630
--- /dev/null
+++ b/apps/gadgetron/CMakeLists.txt
@@ -0,0 +1,104 @@
+configure_file(gadgetron_config.in gadgetron_config.h)
+
+include_directories(
+  ${CMAKE_CURRENT_BINARY_DIR}
+  ${CMAKE_SOURCE_DIR}/apps/gadgetron
+  ${CMAKE_SOURCE_DIR}/toolboxes/cloudbus
+  ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/algorithm
+  ${Boost_INCLUDE_DIR}
+  ${ACE_INCLUDE_DIR}
+  )
+
+if (CUDA_FOUND)
+  include_directories(${CUDA_INCLUDE_DIRS})
+endif(CUDA_FOUND)
+
+add_executable(gadgetron 
+  main.cpp 
+  GadgetServerAcceptor.h
+  GadgetServerAcceptor.cpp 
+  GadgetStreamController.h
+  EndGadget.h 
+  Gadget.h 
+  GadgetContainerMessage.h 
+  GadgetMessageInterface.h 
+  GadgetronExport.h 
+  gadgetron_xml.h
+  )
+
+target_link_libraries(gadgetron 
+  gadgetron_gadgetbase
+  gadgetron_toolbox_log
+  gadgetron_toolbox_gadgettools gadgetron_toolbox_cloudbus 
+  optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} 
+ )
+
+find_package(PythonLibs)
+find_package(NumPy)
+find_package(Boost COMPONENTS python)
+
+if (Boost_PYTHON_FOUND AND PYTHONLIBS_FOUND AND NUMPY_FOUND)
+  add_definitions("-DCOMPILING_WITH_PYTHON_SUPPORT")
+endif()
+
+add_executable(gadgetron_info
+  gadgetron_info.cpp
+)
+
+target_link_libraries(gadgetron_info 
+  gadgetron_toolbox_log
+  gadgetron_gadgetbase
+  optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} 
+ )
+
+if (CUDA_FOUND)
+  target_link_libraries(gadgetron_info ${CUDA_LIBRARIES})
+endif(CUDA_FOUND)
+
+add_library(gadgetron_gadgetbase SHARED
+  Gadget.cpp
+  GadgetStreamController.cpp
+  gadgetron_xml.cpp
+  pugixml.cpp  
+)
+
+target_link_libraries(gadgetron_gadgetbase
+  optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY}
+  gadgetron_toolbox_gadgettools
+  gadgetron_toolbox_log
+)
+
+set_target_properties(gadgetron_gadgetbase PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+set_target_properties (gadgetron_gadgetbase PROPERTIES COMPILE_DEFINITIONS "__BUILD_GADGETRON_GADGETBASE__")
+
+install(TARGETS gadgetron gadgetron_info DESTINATION bin COMPONENT main)
+install(TARGETS gadgetron_gadgetbase DESTINATION lib COMPONENT main)
+
+install(FILES
+  gadgetbase_export.h
+  EndGadget.h
+  Gadget.h
+  GadgetContainerMessage.h
+  GadgetMessageInterface.h
+  GadgetronExport.h
+  gadgetron_paths.h
+  gadgetron_xml.h
+  GadgetServerAcceptor.h
+  GadgetStreamController.h
+  GadgetStreamInterface.h
+  ${CMAKE_CURRENT_BINARY_DIR}/gadgetron_config.h
+  DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main) 
+
+install(FILES 
+  gadgetron.xml.example
+  DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH} COMPONENT main)
+
+install(FILES 
+  schema/gadgetron.xsd
+  DESTINATION ${GADGETRON_INSTALL_SCHEMA_PATH} COMPONENT main)
+
+add_subdirectory(webapp)
diff --git a/apps/gadgetron/EndGadget.h b/apps/gadgetron/EndGadget.h
new file mode 100644
index 0000000..0b6bdb9
--- /dev/null
+++ b/apps/gadgetron/EndGadget.h
@@ -0,0 +1,55 @@
+/*
+ * EndGadget.h
+ *
+ *  Created on: Nov 3, 2011
+ *      Author: hansenms
+ */
+
+#ifndef ENDGADGET_H_
+#define ENDGADGET_H_
+
+#include "Gadget.h"
+#include "GadgetMessageInterface.h"
+
+namespace Gadgetron{
+class EndGadget : public Gadget
+{
+	virtual int close(unsigned long flags)
+	{
+		GDEBUG("Close called in EndGadget with flags %d\n", flags);
+
+		GadgetContainerMessage<GadgetMessageIdentifier>* mb =
+				new GadgetContainerMessage<GadgetMessageIdentifier>();
+
+		mb->getObjectPtr()->id = GADGET_MESSAGE_CLOSE;
+
+		if (controller_->output_ready(mb) < 0) {
+			return GADGET_FAIL;
+		}
+
+		GDEBUG("Calling close in base class  with flags %d\n", flags);
+		return Gadget::close(flags);
+	}
+
+protected:
+	virtual int process(ACE_Message_Block *m)
+	{
+		m->release();
+		return 0;
+	}
+
+	virtual int next_step(ACE_Message_Block *m)
+	{
+		m->release();
+		return 0;
+	}
+
+	virtual int process_config(ACE_Message_Block * m) {
+		m->release();
+		return 0;
+	}
+
+};
+}
+
+#endif /* ENDGADGET_H_ */
diff --git a/apps/gadgetron/Gadget.cpp b/apps/gadgetron/Gadget.cpp
new file mode 100644
index 0000000..86c4638
--- /dev/null
+++ b/apps/gadgetron/Gadget.cpp
@@ -0,0 +1,50 @@
+#include "Gadget.h"
+#include "GadgetStreamController.h"
+
+namespace Gadgetron
+{
+  boost::shared_ptr<std::string> Gadget::get_string_value(const char* name, unsigned int recursive) {
+    const unsigned int recursive_limit = 10;
+    if (recursive > recursive_limit) {
+      GDEBUG("Recursive level %d exceeds maimum limit (%d) in Gadget::get_string_value(...)\n", recursive, recursive_limit);
+      return boost::shared_ptr<std::string>(new std::string(""));
+    }
+
+    std::string str_val;
+    GadgetPropertyBase* p = find_property(name);
+    if (using_properties_) {
+      if (!p) {
+	GERROR("Property %s\n", name);
+	throw std::runtime_error("Attempting to access non existent property on Gadget");
+      }
+      str_val = std::string(p->string_value());
+    } else {
+      std::map<std::string,std::string>::iterator it;
+      parameter_mutex_.acquire();
+      it = parameters_.find(std::string(name));
+      parameter_mutex_.release();
+      if (it != parameters_.end()) {
+	str_val = it->second;
+      }
+    }
+
+    //If string contains an @ sign, we should look for this parameter on another gadget
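+    // (e.g. a value of "oversampling@Acc" resolves the "oversampling" property on the
+    //  gadget named "Acc"; names here are purely illustrative)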
+    size_t at_pos = str_val.find('@');
+    if (at_pos != std::string::npos) {
+      //There was an @ sign, which means look for that parameter on another gadget
+      std::string parm = str_val.substr(0,at_pos);
+      std::string gadget = str_val.substr(at_pos+1);
+	  
+      Gadget* ref_gadget = this->controller_->find_gadget(gadget.c_str());
+      
+      if (ref_gadget) {
+	recursive++;
+	return ref_gadget->get_string_value(parm.c_str(), recursive);
+      }
+    } else {
+      return boost::shared_ptr<std::string>(new std::string(str_val));
+    }
+
+    return boost::shared_ptr<std::string>(new std::string(""));
+  }
+}
diff --git a/apps/gadgetron/Gadget.h b/apps/gadgetron/Gadget.h
new file mode 100644
index 0000000..74b0a8d
--- /dev/null
+++ b/apps/gadgetron/Gadget.h
@@ -0,0 +1,683 @@
+#ifndef GADGET_H
+#define GADGET_H
+#pragma once
+
+#include <ace/OS_NS_stdlib.h>
+#include <ace/Task.h>
+#include <ace/Stream.h>
+#include <ace/Module.h>
+#include <ace/OS_Memory.h>
+#include <ace/Svc_Handler.h>
+#include <ace/SOCK_Stream.h>
+
+#include <map>
+#include <string>
+#include <boost/shared_ptr.hpp>
+
+#include "gadgetbase_export.h"
+#include "GadgetContainerMessage.h"
+#include "GadgetronExport.h"
+#include "gadgetron_config.h"
+#include "log.h"
+
+#include <stdexcept>
+
+#define GADGET_FAIL -1
+#define GADGET_OK    0
+
+namespace Gadgetron{
+
+  class GadgetPropertyBase
+  {
+  public:
+  GadgetPropertyBase(const char* name, const char* type_string, const char* description)
+    : name_(name)
+    , type_str_(type_string)
+    , description_(description)
+    , str_value_("")
+    , is_reference_(false)
+    , reference_gadget_("") 
+    , reference_property_("") 
+    {
+      
+    }
+    
+    virtual const char* name()
+    {
+      return name_.c_str();
+    }
+
+    virtual const char* string_value()
+    {
+      return str_value_.c_str();
+    }
+
+    virtual void string_value(const char* value)
+    {
+      str_value_ = value;
+      size_t at_pos = str_value_.find('@');
+      if (at_pos != std::string::npos) {
+	//There was an @ sign, which means look for that parameter on another gadget
+	reference_property_ = str_value_.substr(0,at_pos);
+	reference_gadget_   = str_value_.substr(at_pos+1);
+	is_reference_ = true;
+      }
+    }
+
+    virtual const char* type_string()
+    {
+      return type_str_.c_str();
+    }
+
+    virtual const char* description()
+    {
+      return description_.c_str();
+    }
+
+    virtual const char* limits_description()
+    {
+      return "";
+    }
+
+  protected:
+    std::string name_;
+    std::string type_str_;
+    std::string description_;
+    std::string str_value_;
+    bool is_reference_;
+    std::string reference_gadget_;
+    std::string reference_property_;
+  };
+  
+    //Forward declarations
+    class GadgetStreamInterface;
+  
+    class EXPORTGADGETBASE Gadget : public ACE_Task<ACE_MT_SYNCH>
+    {
+
+    public:
+        typedef ACE_Task<ACE_MT_SYNCH> inherited;
+
+        enum
+        {
+            GADGET_MESSAGE_CONFIG = (ACE_Message_Block::USER_FLAGS << 1)
+        };
+
+        Gadget()
+            : inherited()
+	    , using_properties_(false)
+            , desired_threads_(1)
+            , pass_on_undesired_data_(false)
+            , controller_(0)
+	    , parameter_mutex_("GadgetParameterMutex")
+        {
+
+	  gadgetron_version_ = std::string(GADGETRON_VERSION_STRING) + std::string(" (") + 
+	    std::string(GADGETRON_GIT_SHA1_HASH) + std::string(")");
+	  
+        }
+
+        virtual ~Gadget()
+        {
+	  if (this->module()) {
+            GDEBUG("Shutting down Gadget (%s)\n", this->module()->name());
+	  }
+        }
+
+
+        virtual int init(void)
+        {
+            return 0;
+        }
+
+        virtual int open(void* = 0)
+        {
+
+            int t = this->get_int_value("threads");
+            if (t > 0) {
+                GDEBUG("Setting number of threads of gadget %s to %d\n", this->module()->name(), t);
+                this->desired_threads(t);
+            }
+
+	    this->pass_on_undesired_data(this->get_bool_value("pass_on_undesired_data"));
+
+            return this->activate( THR_NEW_LWP | THR_JOINABLE,
+                this->desired_threads() );
+        }
+
+        int put(ACE_Message_Block *m, ACE_Time_Value* timeout = 0)
+        {
+            return this->putq(m, timeout);
+        }
+
+        virtual unsigned int desired_threads()
+        {
+            return desired_threads_;
+        }
+
+        virtual void desired_threads(unsigned int t)
+        {
+            desired_threads_ = t;
+        }
+
+	virtual bool pass_on_undesired_data()
+	{
+	  return pass_on_undesired_data_;
+	}
+
+	virtual void pass_on_undesired_data(bool d)
+	{
+	  pass_on_undesired_data_ = d;
+	}
+
+        virtual void set_controller(GadgetStreamInterface* controller) {
+            controller_ = controller;
+        }
+
+        virtual GadgetStreamInterface* get_controller()
+        {
+            return controller_;
+        }
+
+        virtual int close(unsigned long flags)
+        {
+            GDEBUG("Gadget (%s) Close Called with flags = %d\n", this->module()->name(), flags);
+            int rval = 0;
+            if (flags == 1) {
+                ACE_Message_Block *hangup = new ACE_Message_Block();
+                hangup->msg_type( ACE_Message_Block::MB_HANGUP );
+                if (this->putq(hangup) == -1) {
+                    hangup->release();
+                    GDEBUG("Gadget (%s) failed to put hang up message on queue\n", this->module()->name());
+                    return GADGET_FAIL;
+                }
+                GDEBUG("Gadget (%s) waiting for thread to finish\n", this->module()->name());
+                rval = this->wait();
+                GDEBUG("Gadget (%s) thread finished\n", this->module()->name());
+                controller_ = 0;
+            }
+            return rval;
+        }
+
+        virtual int svc(void)
+        {
+            for (ACE_Message_Block *m = 0; ;) {
+
+                //GDEBUG("Waiting for message in Gadget (%s)\n", this->module()->name());
+                if (this->getq(m) == -1) {
+                    GDEBUG("Gadget (%s) failed to get message from queue\n", this->module()->name());
+                    return GADGET_FAIL;
+                }
+                //GDEBUG("Message Received in Gadget (%s)\n", this->module()->name());
+
+                //If this is a hangup message, we are done, put the message back on the queue before breaking
+                if (m->msg_type() == ACE_Message_Block::MB_HANGUP) {
+		  //GDEBUG("Gadget (%s) Hangup message encountered\n", this->module()->name());
+                    if (this->putq(m) == -1) {
+                        GDEBUG("Gadget (%s) failed to put hang up message on queue (for other threads)\n", this->module()->name());
+                        return GADGET_FAIL;
+                    }
+                    //GDEBUG("Gadget (%s) breaking loop\n", this->module()->name());
+                    break;
+                }
+
+
+                //Is this config info, if so call appropriate process function
+                if (m->flags() & GADGET_MESSAGE_CONFIG) {
+
+                    int success;
+                    try{ success = this->process_config(m); }
+                    catch (std::runtime_error& err){
+                        GEXCEPTION(err,"Gadget::process_config() failed\n");
+                        success = -1;
+                    }
+
+                    if (success == -1) {
+                        m->release();
+                        this->flush();
+                        GDEBUG("Gadget (%s) process config failed\n", this->module()->name());
+                        return GADGET_FAIL;
+
+                    }
+
+                    //Push this onto next gadgets queue, other gadgets may need this configuration information
+                    if (this->next()) {
+                        if (this->next()->putq(m) == -1) {
+                            m->release();
+                            GDEBUG("Gadget (%s) process config failed to put config on dowstream gadget\n", this->module()->name());
+                            return GADGET_FAIL;
+                        }
+                    }
+                    continue;
+                }
+
+                int success;
+                try{ success = this->process(m); }
+                catch (std::runtime_error& err){
+                    GEXCEPTION(err,"Gadget::process() failed\n");
+                    success = -1;
+                }
+
+                if (success == -1) {
+                    m->release();
+                    this->flush();
+                    GDEBUG("Gadget (%s) process failed\n", this->module()->name());
+                    return GADGET_FAIL;
+                }
+            }
+            return 0;
+        }
+
+        virtual int set_parameter(const char* name, const char* val, bool trigger = true) {
+	  boost::shared_ptr<std::string> old_value = get_string_value(name);
+	  GadgetPropertyBase* p = this->find_property(name);
+
+	  if (p) {
+	    p->string_value(val);
+	  } else {
+	    if (using_properties_) {
+	      throw std::runtime_error("Attempting to set non-registered property while operaying in forced using_properties mode");
+	    }
+	  }
+
+	  parameter_mutex_.acquire();
+	  parameters_[std::string(name)] = std::string(val);
+	  parameter_mutex_.release();
+
+	  if (trigger) {
+	    return parameter_changed(std::string(name), std::string(val), *old_value);
+	  }
+
+	  return 0;
+        }
+
+        virtual int get_bool_value(const char* name) {
+            return (0 == ACE_OS::strcmp(get_string_value(name)->c_str(), "true"));
+        }
+
+        virtual int get_int_value(const char* name) {
+            return ACE_OS::atoi(get_string_value(name)->c_str());
+        }
+
+        virtual double get_double_value(const char* name) {
+            return ACE_OS::atof(get_string_value(name)->c_str());
+        }
+
+	boost::shared_ptr<std::string> get_string_value(const char* name, unsigned int recursive = 0);
+
+        /**
+        *  This trigger function is called whenever set_parameter is called with trigger = true.
+        */
+        virtual int parameter_changed(std::string name, std::string new_value, std::string old_value)
+        {
+            return GADGET_OK;
+        }
+
+	void print_properties()
+	{
+	  for (std::vector<GadgetPropertyBase*>::iterator it = properties_.begin(); it != properties_.end(); it++)
+	    {
+	      GDEBUG("Parameter with name: %s\n", (*it)->name());
+	    }
+	}
+
+	int get_number_of_properties()
+	{
+	  return properties_.size();
+	}
+
+	GadgetPropertyBase* get_property_by_index(size_t i)
+	{
+	  if (i >= properties_.size()) {
+	    return 0;
+	  }
+	  return properties_[i];
+	}
+
+	GadgetPropertyBase* find_property(const char* name)
+	{
+	  GadgetPropertyBase* p = 0;
+	  parameter_mutex_.acquire();
+	  for (std::vector<GadgetPropertyBase*>::iterator it = properties_.begin(); it != properties_.end(); it++) {
+	    if (std::string(name) == std::string((*it)->name())) {
+	      p = *it;
+	      break;
+	    }
+	  }
+	  parameter_mutex_.release();
+	  return p;
+	}
+	void register_property(GadgetPropertyBase* p, bool using_properties = true)
+	{
+	  parameter_mutex_.acquire();
+	  properties_.push_back(p);
+	  using_properties_ = using_properties;
+	  parameter_mutex_.release();
+	}
+
+	const char* get_gadgetron_version() {
+	  return gadgetron_version_.c_str();
+	}
+
+    protected:
+	std::vector<GadgetPropertyBase*> properties_;
+	bool using_properties_;
+
+        virtual int next_step(ACE_Message_Block *m)
+        {
+            return this->put_next(m);//next()->putq(m);
+        }
+
+        virtual int process(ACE_Message_Block * m) = 0;
+
+        virtual int process_config(ACE_Message_Block * m) {
+            return 0;
+        }
+
+        unsigned int desired_threads_;
+        bool pass_on_undesired_data_;
+        GadgetStreamInterface* controller_;
+	ACE_Thread_Mutex parameter_mutex_;
+    private:
+        std::map<std::string, std::string> parameters_;
+	std::string gadgetron_version_;
+    };
+
+
+    template <typename T> class GadgetPropertyLimits
+    {
+    public:
+      virtual bool within_limits(T& v) = 0;
+      virtual const char* limits_description() = 0;
+    };
+
+    template <typename T> class GadgetPropertyLimitsNoLimits
+      : public GadgetPropertyLimits<T>
+    {
+    public:
+      virtual bool within_limits(T& v) {
+	return true;
+      }
+
+      virtual const char* limits_description() {
+	return "";
+      }
+    };
+
+    template <typename T> class GadgetPropertyLimitsEnumeration
+      : public GadgetPropertyLimits<T>
+    {
+    public:
+      GadgetPropertyLimitsEnumeration(std::initializer_list<T> valid_vals) {
+	valid_vals_.insert(valid_vals_.end(), valid_vals.begin(), valid_vals.end());
+      }
+      
+      virtual bool within_limits(T& v) 
+      {
+	typename std::vector<T>::iterator it;
+	it = find(valid_vals_.begin(), valid_vals_.end(), v);
+	if (it != valid_vals_.end()) return true;
+	return false;
+      }
+
+      virtual const char* limits_description() 
+      {
+	if (!limits_desc_.size()) {
+	  std::stringstream strstream;
+	  typename std::vector<T>::iterator it;
+	  it = valid_vals_.begin();
+	  if (it != valid_vals_.end()) {
+	    strstream << "[";	
+	    strstream << *it;
+	    it++;
+	    while (it != valid_vals_.end()) {
+	      strstream << ", " << *it;
+	      it++;
+	    }
+	    strstream << "]";
+	  }
+	  limits_desc_ = strstream.str();
+	}
+	return limits_desc_.c_str();
+      }
+
+    protected:
+      std::vector<T> valid_vals_;
+      std::string limits_desc_;
+    };
+
+    template <typename T> class GadgetPropertyLimitsRange
+      : public GadgetPropertyLimits<T>
+    {
+    public:
+      GadgetPropertyLimitsRange(T min_val, T max_val) 
+	: min_(min_val)
+	, max_(max_val)
+      {
+      }
+
+      virtual bool within_limits(T& v) 
+      {
+	return ( (v >= min_) && (v <= max_) ); 
+      }
+
+      virtual const char* limits_description() 
+      {
+	if (!limits_desc_.size()) {
+	  std::stringstream strstream;
+	  strstream << "[" << min_ << ":" << max_ << "]" << std::endl;
+	  limits_desc_ = strstream.str();
+	}
+	return limits_desc_.c_str();
+      }
+            
+    protected:
+      T min_;
+      T max_;
+      std::string limits_desc_;
+    };
+    template <typename T, typename L> class GadgetProperty
+      : public GadgetPropertyBase
+      {
+      public:
+      GadgetProperty(const char* name, const char* type_string, const char* description,
+		     Gadget* g, T default_value, L limits, bool force_using_properties = true)
+	: GadgetPropertyBase(name,type_string,description)
+	, g_(g)
+	, limits_(limits)
+	{
+	  g_->register_property(this, force_using_properties);
+	  this->value(default_value);
+	}
+	
+	T value()
+	{
+	  if (is_reference_) {
+	    boost::shared_ptr<std::string> val = this->g_->get_string_value(this->name());
+	    std::stringstream(*val) >> std::boolalpha >> value_;
+	  }
+	  return value_;
+	}
+	
+	void value(T v)
+	{
+	  value_ = v;
+	  std::stringstream strstream;
+	  strstream << std::boolalpha << v;
+	  strstream >> str_value_;
+	  is_reference_ = false;
+	  if (!limits_.within_limits(v)) {
+	    GERROR("Property: %s, value: %s, limits:%s\n", this->name(), str_value_.c_str(), this->limits_.limits_description());
+	    throw std::runtime_error("Value assigned outside limit range");
+	  }
+	}
+
+	virtual void string_value(const char* val)
+	{
+	  GadgetPropertyBase::string_value(val);
+
+	  if (!is_reference_)
+	  {
+	    T tmp;
+	    std::stringstream(val) >> std::boolalpha >> tmp;
+	    this->value(tmp);
+	  }
+	}
+
+
+	bool operator==(const T &v) const
+	{
+	  return this->value() == v;
+	}
+	
+	virtual const char* limits_description()
+	{
+	  return limits_.limits_description();
+	}
+
+      protected:
+	T value_;
+	L limits_;
+	Gadget* g_;
+      };
+    
+#define GADGET_PROPERTY(varname, vartype, description, defaultvalue) GadgetProperty<vartype, GadgetPropertyLimitsNoLimits<vartype> > varname{#varname,#vartype, description, this, defaultvalue, GadgetPropertyLimitsNoLimits<vartype>()}
+#define GADGET_PROPERTY_NO_FORCE(varname, vartype, description, defaultvalue) GadgetProperty<vartype, GadgetPropertyLimitsNoLimits<vartype> > varname{#varname,#vartype, description, this, defaultvalue, GadgetPropertyLimitsNoLimits<vartype>(), false}
+#define GADGET_PROPERTY_LIMITS(varname, vartype, description, defaultvalue, limitstype, ...) GadgetProperty<vartype, limitstype<vartype> > varname{#varname,#vartype, description, this, defaultvalue, limitstype<vartype>{ __VA_ARGS__ }}
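+// Illustrative (hypothetical) declaration using the range-limited variant:
+//   GADGET_PROPERTY_LIMITS(acceleration_factor, int, "Acceleration factor", 2, GadgetPropertyLimitsRange, 1, 8);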
+ 
+    class BasicPropertyGadget : public Gadget
+    {
+
+    protected:
+      GADGET_PROPERTY_NO_FORCE(using_cloudbus,bool,"Indicates whether the cloudbus is in use and available", false);
+      GADGET_PROPERTY_NO_FORCE(pass_on_undesired_data,bool, "If true, data not matching the process function will be passed to next Gadget", false);
+      GADGET_PROPERTY_NO_FORCE(threads,int, "Number of threads to run in this Gadget", 1);
+#ifdef _WIN32
+      GADGET_PROPERTY_NO_FORCE(workingDirectory, std::string, "Where to store temporary files", "c:\\temp\\gadgetron\\");
+#else
+      GADGET_PROPERTY_NO_FORCE(workingDirectory, std::string, "Where to store temporary files", "/tmp/gadgetron/");
+#endif // _WIN32
+    }; 
+
+    template <class P1> class Gadget1 : public BasicPropertyGadget
+    {
+
+    protected:
+        int process(ACE_Message_Block* mb)
+        {
+            GadgetContainerMessage<P1>* m = AsContainerMessage<P1>(mb);
+
+            if (!m) {
+                if (!pass_on_undesired_data_) {
+		  GERROR("Gadget1::process, conversion of message block");
+		  return -1;
+                } else {
+                    return (this->next()->putq(mb));
+                }
+
+            }
+
+            return this->process(m);
+        }
+
+        virtual int process(GadgetContainerMessage<P1>* m) = 0;
+
+    };
+
+    template <class P1, class P2> class Gadget2 : public BasicPropertyGadget
+    {
+
+    protected:
+        int process(ACE_Message_Block* mb)
+        {
+
+            GadgetContainerMessage<P1>* m1 = AsContainerMessage<P1>(mb);
+
+            GadgetContainerMessage<P2>* m2 = 0;
+            if (m1) {
+                m2 = AsContainerMessage<P2>(m1->cont());
+            }
+
+            if (!m1 || !m2) {
+                if (!pass_on_undesired_data_) {
+		  GERROR("%s -> %s, (%s, %s, %p, %p), (%s, %s, %p, %p)\n",
+                        this->module()->name(),
+                        "Gadget2::process, Conversion of Message Block Failed",
+                        typeid(GadgetContainerMessage<P1>*).name(),
+                        typeid(m1).name(),
+                        mb,
+                        m1,
+                        typeid(GadgetContainerMessage<P2>*).name(),
+                        typeid(m2).name(),
+                        mb->cont(),
+                        m2);
+                    return -1;
+                } else {
+                    return (this->next()->putq(mb));
+                }
+            }
+
+            return this->process(m1,m2);
+        }
+
+        virtual int process(GadgetContainerMessage<P1>* m1, GadgetContainerMessage<P2>* m2) = 0;
+
+    };
+
+
+    template <class P1, class P2, class P3> class Gadget3 : public BasicPropertyGadget
+    {
+
+    protected:
+        int process(ACE_Message_Block* mb)
+        {
+
+            GadgetContainerMessage<P1>* m1 = AsContainerMessage<P1>(mb);
+
+            GadgetContainerMessage<P2>* m2 = 0;
+            if (m1) {
+                m2 = AsContainerMessage<P2>(m1->cont());
+            }
+
+            GadgetContainerMessage<P3>* m3 = 0;
+            if (m2) {
+                m3 = AsContainerMessage<P3>(m2->cont());
+            }
+
+            if (!m1 || !m2 || !m3) {
+                if (!pass_on_undesired_data_) {
+		  GERROR("%s -> %s, (%s, %s, %p), (%s, %s, %p), (%s, %s, %p)\n",
+                        this->module()->name(),
+                        "Gadget3::process, Conversion of Message Block Failed",
+                        typeid(GadgetContainerMessage<P1>*).name(),
+                        typeid(m1).name(),
+                        m1,
+                        typeid(GadgetContainerMessage<P2>*).name(),
+                        typeid(m2).name(),
+                        m2,
+                        typeid(GadgetContainerMessage<P3>*).name(),
+                        typeid(m3).name(),
+                        m3);
+                    return -1;
+                } else {
+                    return (this->next()->putq(mb));
+                }
+            }
+
+            return this->process(m1,m2,m3);
+        }
+
+        virtual int process(GadgetContainerMessage<P1>* m1, GadgetContainerMessage<P2>* m2, GadgetContainerMessage<P3>* m3) = 0;
+
+    };
+
+/* Macros for handling dynamic linking */
+// #define GADGET_DECLARE(GADGET) GADGETRON_LOADABLE_DECLARE(GADGET)
+// #define GADGET_FACTORY_DECLARE(GADGET) GADGETRON_LOADABLE_FACTORY_DECLARE(Gadget,GADGET)
+
+#define GADGET_DECLARE(GADGET) 
+#define GADGET_FACTORY_DECLARE(GADGET) GADGETRON_LOADABLE_FACTORY_DECLARE(Gadget,GADGET)
+
+}
+
+#endif //GADGET_H
diff --git a/apps/gadgetron/GadgetContainerMessage.h b/apps/gadgetron/GadgetContainerMessage.h
new file mode 100644
index 0000000..eb848f2
--- /dev/null
+++ b/apps/gadgetron/GadgetContainerMessage.h
@@ -0,0 +1,160 @@
+#ifndef GADGETCONTAINERMESSAGE_H
+#define GADGETCONTAINERMESSAGE_H
+#pragma once
+
+#include <ace/Message_Block.h>
+#include <string>
+
+namespace Gadgetron{
+/**
+   The purpose of this class is to provide a type independent interface to all ContainerMessages
+
+   This interface is able to set a magic number for each type which is later on used
+   instead of RTTI to "safely" cast to the right GadgetContainerMessage type
+
+ */
+class GadgetContainerMessageBase : public ACE_Message_Block
+{
+  typedef ACE_Message_Block base;
+  
+ public:
+
+  enum { CONTAINER_MESSAGE_BLOCK = (ACE_Message_Block::USER_FLAGS << 2) };
+
+  GadgetContainerMessageBase(size_t size) : base(size)
+  {
+    set_flags(CONTAINER_MESSAGE_BLOCK); //Mark this message block as a container, so that we know it is safe to type cast it.
+  }
+
+  GadgetContainerMessageBase(ACE_Data_Block* d)
+    : base(d)
+  {
+    set_flags(CONTAINER_MESSAGE_BLOCK);
+  }
+  
+
+#ifdef WIN32
+  std::string getTypeID() { return type_magic_id_; }
+  template <class T> static std::string magic_number_for_type() { return std::string(typeid(T).name()); } 
+
+protected:
+  std::string type_magic_id_;
+
+#else
+
+  int getTypeID() { return type_magic_id_; }
+
+  template <class T> static int magic_number_for_type(){
+    //Will only get set once for each instantiation of this function
+    static int result(next_magic_type_number()); 
+    return result;
+  }
+
+ protected:
+  int type_magic_id_;
+
+  //Utility function for incrementing the magic number for types.
+  static int next_magic_type_number()
+  {
+    static int magic(0);
+    return magic++;
+  }	 
+#endif  
+};
+
+template <class T> class GadgetContainerMessage : public GadgetContainerMessageBase
+{
+  typedef GadgetContainerMessageBase base;
+
+
+public:
+  /**
+   *  Constructor, passing on input arguments to the contained class.
+   * @param xs Variadic arguments to the contained class
+   */
+  template<typename... X> GadgetContainerMessage(X... xs)
+  :base(sizeof(T)), content_(0)
+   {
+	 //Using placement new to put the new object at the ACE_Message_Block location
+    content_ = new (this->wr_ptr()) T{xs...};
+
+    //Advance the write pointer appropriately.
+    this->wr_ptr(sizeof(T));
+
+    //Assign type ID that will allow us to safely cast this message.
+    type_magic_id_ = magic_number_for_type<T>();
+
+
+   }
+
+
+  GadgetContainerMessage(ACE_Data_Block* d)
+    : base(d)
+  {
+    type_magic_id_ = magic_number_for_type<T>();
+    content_ = reinterpret_cast<T*>(this->rd_ptr());
+  }
+
+  virtual ~GadgetContainerMessage() 
+  {
+    //ACE_Message_Block will take care of deallocating space for the object itself;
+  }
+
+  virtual ACE_Message_Block* release()
+  {    
+    //In case the object contained in this object has allocated memory on the heap, it must be destroyed
+    if (this->reference_count() <= 1) {
+      if (content_) content_->~T();
+    } 
+    if (cont_) {
+      cont_->release();
+      cont_ = 0;
+    }
+    return ACE_Message_Block::release();
+  }
+
+  T* getObjectPtr() 
+  {
+    return content_;
+  }
+
+  virtual GadgetContainerMessage<T>* duplicate() 
+  {
+    GadgetContainerMessage<T>* nb = new GadgetContainerMessage<T>(this->data_block()->duplicate());
+    nb->rd_ptr (this->rd_ptr_);
+    nb->wr_ptr (this->wr_ptr_);
+    if (this->cont_) {
+      nb->cont_ = this->cont_->duplicate();
+    }
+    return nb;
+  }
+
+protected:
+  T* content_;
+}; 
+
+/**
+   This function replaces the slower dynamic_cast which we would otherwise rely on.
+   The speed of dynamic_cast varies greatly from platform to platform.
+
+   This function is less safe since it assumes casting to ContainerMessageBase is OK
+   when a certain flag is set on the ACE_Message_Block. If some user decides to use that flag
+   for other purposes, it could cause major problems that are hard to debug.
+
+   TODO: Find a more elegant solution for this.
+*/
+template <class T> GadgetContainerMessage<T>* AsContainerMessage(ACE_Message_Block* mb)
+{
+  if (!mb || !(mb->flags() & GadgetContainerMessageBase::CONTAINER_MESSAGE_BLOCK)) {
+    return 0;
+  }
+
+  GadgetContainerMessageBase* mbb = reinterpret_cast<GadgetContainerMessageBase*>(mb);
+  if (mbb->getTypeID() != GadgetContainerMessageBase::magic_number_for_type<T>()) {
+    return 0;
+  }
+
+  return reinterpret_cast<GadgetContainerMessage<T>* >(mbb);
+}
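+// Illustrative use (MyHeader is a hypothetical payload type):
+//   GadgetContainerMessage<MyHeader>* m = AsContainerMessage<MyHeader>(mb);
+//   if (!m) { /* mb does not carry a MyHeader payload; handle or pass on */ }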
+}
+#endif  //GADGETCONTAINERMESSAGE_H
diff --git a/apps/gadgetron/GadgetMessageInterface.h b/apps/gadgetron/GadgetMessageInterface.h
new file mode 100644
index 0000000..2bf5252
--- /dev/null
+++ b/apps/gadgetron/GadgetMessageInterface.h
@@ -0,0 +1,236 @@
+#ifndef GADGETMESSAGEINTERFACE_H
+#define GADGETMESSAGEINTERFACE_H
+
+#include "GadgetContainerMessage.h"
+#include "GadgetronExport.h"
+#include "Gadget.h"
+
+#include <ace/SOCK_Stream.h>
+#include <ace/Basic_Types.h>
+#include <map>
+
+namespace Gadgetron
+{
+
+enum GadgetronMessageID {
+  GADGET_MESSAGE_INT_ID_MIN       =   0,
+  GADGET_MESSAGE_CONFIG_FILE      =   1,
+  GADGET_MESSAGE_CONFIG_SCRIPT    =   2,
+  GADGET_MESSAGE_PARAMETER_SCRIPT =   3,
+  GADGET_MESSAGE_CLOSE            =   4,
+  GADGET_MESSAGE_INT_ID_MAX       = 999
+};
+
+struct GadgetMessageIdentifier
+{
+  ACE_UINT16 id;
+};
+
+struct GadgetMessageConfigurationFile
+{
+  char configuration_file[1024];
+};
+
+struct GadgetMessageScript
+{
+  ACE_UINT32 script_length;
+};
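+
+// On the wire, each message starts with a GadgetMessageIdentifier; its id selects the
+// payload that follows (for example GadgetMessageConfigurationFile or GadgetMessageScript).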
+
+
+/**
+   Interface for classes capable of reading a specific message
+
+   This is an abstract class; an implementation must be provided for each message type.
+ */
+class GadgetMessageReader
+{
+ public:
+	virtual ~GadgetMessageReader() {}
+
+  /**
+     Function must be implemented to read a specific message.
+   */
+  virtual ACE_Message_Block* read(ACE_SOCK_Stream* stream) = 0;
+
+};
+
+/**
+   Interface for classes capable of writing a specific message to a socket.
+   This is an abstract class; an implementation must be provided for each message type.
+ */
+class GadgetMessageWriter
+{
+ public:
+	virtual ~GadgetMessageWriter() {}
+
+   /**
+     Function must be implemented to write a specific message.
+   */
+  virtual int write(ACE_SOCK_Stream* stream, ACE_Message_Block* mb) = 0;
+};
+
+class GadgetMessageWriterContainer
+{
+ public:
+  virtual ~GadgetMessageWriterContainer() {
+    clear();
+  }
+
+
+  GadgetMessageWriter* find(ACE_UINT16 slot) {
+    std::map< ACE_UINT16, GadgetMessageWriter* >::iterator it;
+
+    it = map_.find(slot);
+    GadgetMessageWriter* ret = 0;
+    if (it != map_.end()) {
+      ret = it->second;
+    }
+    return ret;
+  }
+
+  int insert ( unsigned short slot, GadgetMessageWriter* dispatcher) {
+    std::map< ACE_UINT16, GadgetMessageWriter* >::iterator it;
+
+    it = map_.find(slot);
+    if (it != map_.end()) {
+      delete it->second;
+      it->second = dispatcher;
+    } else {
+      map_[slot] = dispatcher;
+    }
+    return GADGET_OK;
+  }
+
+  int clear()
+  {
+    std::map< ACE_UINT16, GadgetMessageWriter* >::iterator it;
+    for (it = map_.begin(); it != map_.end(); it++) {
+      delete it->second;
+     }
+    map_.clear();
+    return 0;
+  }
+
+ protected:
+  std::map<ACE_UINT16, GadgetMessageWriter*> map_;
+};
+
+
+class GadgetMessageReaderContainer
+{
+ public:
+  virtual ~GadgetMessageReaderContainer() {
+    clear();
+  }
+
+
+  GadgetMessageReader* find(ACE_UINT16 slot) {
+    std::map< ACE_UINT16, GadgetMessageReader* >::iterator it;
+
+    it = map_.find(slot);
+    GadgetMessageReader* ret = 0;
+    if (it != map_.end()) {
+      ret = it->second;
+    }
+    return ret;
+  }
+
+  int insert ( unsigned short slot, GadgetMessageReader* dispatcher) {
+    std::map< ACE_UINT16, GadgetMessageReader* >::iterator it;
+
+    it = map_.find(slot);
+    if (it != map_.end()) {
+      delete it->second;
+      it->second = dispatcher;
+    } else {
+      map_[slot] = dispatcher;
+    }
+    return GADGET_OK;
+  }
+
+  int clear()
+  {
+    std::map< ACE_UINT16, GadgetMessageReader* >::iterator it;
+
+    for (it = map_.begin(); it != map_.end(); it++) {
+      delete it->second;
+     }
+    map_.clear();
+    return 0;
+  }
+ protected:
+  std::map<ACE_UINT16, GadgetMessageReader*> map_;
+};
+
+class GadgetMessageConfigFileReader : public GadgetMessageReader
+{
+ public:
+  virtual ACE_Message_Block* read(ACE_SOCK_STREAM* stream) {
+
+    GadgetContainerMessage<GadgetMessageConfigurationFile>* mb1 =
+      new GadgetContainerMessage<GadgetMessageConfigurationFile>();
+    
+    if (!mb1) {
+      GDEBUG("Unable to allocate GadgetMessageConfigurationFile\n");
+      return 0;
+    }
+
+    ssize_t recv_cnt = 0;
+    if ((recv_cnt = stream->recv_n (mb1->getObjectPtr(), sizeof(GadgetMessageConfigurationFile))) <= 0) {
+      GDEBUG("Unable to read configuration file information\n");
+      mb1->release();
+      return 0;
+    }
+
+    return mb1;
+  }
+};
+
+
+class GadgetMessageScriptReader : public GadgetMessageReader
+{
+ public:
+  virtual ACE_Message_Block* read(ACE_SOCK_STREAM* stream) {
+
+    GadgetMessageScript ms;
+
+    ssize_t recv_cnt = 0;
+    if ((recv_cnt = stream->recv_n (&ms, sizeof(GadgetMessageScript))) <= 0) {
+      GDEBUG("Unable to read script message header\n");
+      return 0;
+    }
+    
+    ACE_Message_Block* mb = new ACE_Message_Block(ms.script_length);
+
+    if ((recv_cnt = stream->recv_n (mb->wr_ptr(), ms.script_length)) <= 0) {
+      GERROR("Unable to read script\n");
+      mb->release();
+      return 0;
+    }
+    mb->wr_ptr(ms.script_length);
+    mb->set_flags(Gadget::GADGET_MESSAGE_CONFIG);
+
+    return mb;
+  }
+};
+
+/* Macros for handling dynamic linking */
+
+//#define GADGETRON_READER_DECLARE(READER) \
+//  GADGETRON_LOADABLE_DECLARE(READER)
+
+#define GADGETRON_READER_DECLARE(READER) 
+
+#define GADGETRON_READER_FACTORY_DECLARE(READER)	\
+  GADGETRON_LOADABLE_FACTORY_DECLARE(GadgetMessageReader, READER)
+
+//#define GADGETRON_WRITER_DECLARE(WRITER) \
+//  GADGETRON_LOADABLE_DECLARE(WRITER)
+
+#define GADGETRON_WRITER_DECLARE(WRITER) 
+
+#define GADGETRON_WRITER_FACTORY_DECLARE(WRITER)	\
+  GADGETRON_LOADABLE_FACTORY_DECLARE(GadgetMessageWriter, WRITER)
+
+}
+
+#endif //GADGETMESSAGEINTERFACE_H
diff --git a/apps/gadgetron/GadgetServerAcceptor.cpp b/apps/gadgetron/GadgetServerAcceptor.cpp
new file mode 100644
index 0000000..7a9dd38
--- /dev/null
+++ b/apps/gadgetron/GadgetServerAcceptor.cpp
@@ -0,0 +1,55 @@
+#include "GadgetServerAcceptor.h"
+#include "GadgetStreamController.h"
+
+using namespace Gadgetron;
+
+GadgetServerAcceptor::~GadgetServerAcceptor ()
+{
+  this->handle_close (ACE_INVALID_HANDLE, 0);
+}
+
+int GadgetServerAcceptor::open (const ACE_INET_Addr &listen_addr)
+{
+  if (this->acceptor_.open (listen_addr, 1) == -1) {
+    GERROR("error opening acceptor\n");
+    return -1;
+  }
+
+  return this->reactor ()->register_handler(this, ACE_Event_Handler::ACCEPT_MASK);
+}
+
+int GadgetServerAcceptor::handle_input (ACE_HANDLE)
+{
+  GadgetStreamController *controller;
+
+  ACE_NEW_RETURN (controller, GadgetStreamController, -1);
+
+  auto_ptr<GadgetStreamController> p (controller);
+
+  controller->set_global_gadget_parameters(global_gadget_parameters_);
+
+  if (this->acceptor_.accept (controller->peer ()) == -1) {
+    GERROR("Failed to accept controller connection\n"); 
+    return -1;
+  }
+  
+  p.release ();
+  controller->reactor (this->reactor ());
+  if (controller->open () == -1)
+    controller->handle_close (ACE_INVALID_HANDLE, 0);
+  return 0;
+}
+
+int GadgetServerAcceptor::handle_close (ACE_HANDLE, ACE_Reactor_Mask)
+{
+  GDEBUG("GadgetServerAcceptor::handle_close\n");
+  GDEBUG("Close Data Acceptor\n");
+
+  if (this->acceptor_.get_handle () != ACE_INVALID_HANDLE) {
+    ACE_Reactor_Mask m = 
+      ACE_Event_Handler::ACCEPT_MASK | ACE_Event_Handler::DONT_CALL;
+    this->reactor ()->remove_handler (this, m);
+    this->acceptor_.close ();
+  }
+  return 0;
+}
diff --git a/apps/gadgetron/GadgetServerAcceptor.h b/apps/gadgetron/GadgetServerAcceptor.h
new file mode 100644
index 0000000..3df5bca
--- /dev/null
+++ b/apps/gadgetron/GadgetServerAcceptor.h
@@ -0,0 +1,31 @@
+#ifndef _GADGETSERVERACCEPTOR_H
+#define _GADGETSERVERACCEPTOR_H
+
+#include "ace/SOCK_Acceptor.h"
+#include "ace/Reactor.h"
+#include <string>
+#include <map>
+
+namespace Gadgetron{
+class GadgetServerAcceptor : public ACE_Event_Handler
+{
+public:
+  virtual ~GadgetServerAcceptor ();
+
+  int open (const ACE_INET_Addr &listen_addr);
+
+  virtual ACE_HANDLE get_handle (void) const
+    { return this->acceptor_.get_handle (); }
+
+  virtual int handle_input (ACE_HANDLE fd = ACE_INVALID_HANDLE);
+
+  virtual int handle_close (ACE_HANDLE handle,
+                            ACE_Reactor_Mask close_mask);
+
+  std::map<std::string, std::string> global_gadget_parameters_;
+
+protected:
+  ACE_SOCK_Acceptor acceptor_;
+};
+}
+#endif //_GADGETSERVERACCEPTOR_H
diff --git a/apps/gadgetron/GadgetStreamController.cpp b/apps/gadgetron/GadgetStreamController.cpp
new file mode 100644
index 0000000..661752b
--- /dev/null
+++ b/apps/gadgetron/GadgetStreamController.cpp
@@ -0,0 +1,384 @@
+#include "ace/OS_NS_stdlib.h"
+#include "ace/OS_NS_string.h"
+#include "ace/OS_NS_stdio.h"
+#include "ace/OS_NS_netdb.h"
+
+#include "GadgetStreamController.h"
+#include "GadgetContainerMessage.h"
+#include "GadgetMessageInterface.h"
+#include "GadgetronConnector.h"
+#include "Gadget.h"
+#include "EndGadget.h"
+#include "gadgetron_config.h"
+
+#include "gadgetron_xml.h"
+#include "url_encode.h"
+
+#include <complex>
+#include <fstream>
+
+using namespace Gadgetron;
+
+GadgetStreamController::GadgetStreamController()
+  : GadgetStreamInterface()
+  , notifier_ (0, this, ACE_Event_Handler::WRITE_MASK)
+  , writer_task_(&this->peer())
+{
+}
+
+int GadgetStreamController::open (void)
+{
+	//We set up the controller's message queue so that enqueuing a packet triggers a write.
+	this->notifier_.reactor (this->reactor ());
+	this->msg_queue ()->notification_strategy (&this->notifier_);
+
+	ACE_TCHAR peer_name[MAXHOSTNAMELEN];
+	ACE_INET_Addr peer_addr;
+	if (peer().get_remote_addr (peer_addr) == 0 &&
+	    peer_addr.addr_to_string (peer_name, MAXHOSTNAMELEN) == 0) {
+	  GINFO("Connection from %s\n", peer_name);
+	}
+
+	//We need these basic readers in order to receive the stream configuration
+	readers_.insert(GADGET_MESSAGE_CONFIG_FILE,
+			new GadgetMessageConfigFileReader());
+
+	readers_.insert(GADGET_MESSAGE_CONFIG_SCRIPT,
+			new GadgetMessageScriptReader());
+
+	readers_.insert(GADGET_MESSAGE_PARAMETER_SCRIPT,
+			new GadgetMessageScriptReader());
+
+	GadgetModule *head = 0;
+	GadgetModule *tail = 0;
+
+	if (tail == 0) {
+		Gadget* eg = new EndGadget();
+		if (eg) {
+			eg->set_controller(this);
+		}
+		
+		ACE_NEW_RETURN(tail,
+			       ACE_Module<ACE_MT_SYNCH>( ACE_TEXT("EndGadget"),
+							 eg ),
+			       -1);
+
+		stream_.open(0,head,tail);
+	}
+
+	this->writer_task_.open();
+
+	return this->reactor ()->register_handler(this,
+			ACE_Event_Handler::READ_MASK);// | ACE_Event_Handler::WRITE_MASK);
+}
+
+
+int GadgetStreamController::handle_input (ACE_HANDLE)
+{
+	//Reading sequence:
+	GadgetMessageIdentifier id;
+	ssize_t recv_cnt = 0;
+	if ((recv_cnt = peer().recv_n (&id, sizeof(GadgetMessageIdentifier))) <= 0) {
+	  GERROR("GadgetStreamController, unable to read message identifier\n");
+	  return -1;
+	}
+
+	if (id.id == GADGET_MESSAGE_CLOSE) {
+	  GDEBUG("Received close signal from client. Closing stream...\n");
+	  stream_.close(1); //Shutdown gadgets and wait for them
+	  GDEBUG("Stream closed\n");
+	  GDEBUG("Closing writer task\n");
+	  this->writer_task_.close(1);
+	  GDEBUG("Writer task closed\n");
+	  return 0;
+	}
+
+	GadgetMessageReader* r = readers_.find(id.id);
+
+	if (!r) {
+	  GERROR("Unrecognized Message ID received: %d\n", id.id);
+	  return GADGET_FAIL;
+	}
+
+	ACE_Message_Block* mb = r->read(&peer());
+
+	if (!mb) {
+	  GERROR("GadgetMessageReader returned null pointer\n");
+	  return GADGET_FAIL;
+	}
+
+	//We need to handle some special cases to make sure that we can get a stream set up.
+	if (id.id == GADGET_MESSAGE_CONFIG_FILE) {
+	  GadgetContainerMessage<GadgetMessageConfigurationFile>* cfgm =
+	    AsContainerMessage<GadgetMessageConfigurationFile>(mb);
+
+	  if (!cfgm) {
+	    GERROR("Failed to cast message block to configuration file\n");
+	    mb->release();
+	    return GADGET_FAIL;
+	  } else {
+	    if (this->configure_from_file(std::string(cfgm->getObjectPtr()->configuration_file)) != GADGET_OK) {
+	      GERROR("GadgetStream configuration failed\n");
+	      mb->release();
+	      return GADGET_FAIL;
+	    } else {
+	      mb->release();
+	      return GADGET_OK;
+	    }
+	  }
+	} else if (id.id == GADGET_MESSAGE_CONFIG_SCRIPT) {
+	  std::string xml_config(mb->rd_ptr(), mb->length());
+	  if (this->configure(xml_config) != GADGET_OK) {
+	    GERROR("GadgetStream configuration failed\n");
+	    mb->release();
+	    return GADGET_FAIL;
+	  } else {
+	    mb->release();
+	    return GADGET_OK;
+	  }
+	}
+
+	ACE_Time_Value wait = ACE_OS::gettimeofday() + ACE_Time_Value(0,10000); //10ms from now
+	if (stream_.put(mb, &wait) == -1) {
+	  GERROR("Failed to put message on stream before timeout, EWOULDBLOCK=%d\n",  ACE_OS::last_error () ==  EWOULDBLOCK);
+	  mb->release();
+	  return GADGET_FAIL;
+	}
+
+	return GADGET_OK;
+}
+
+
+int GadgetStreamController::output_ready(ACE_Message_Block* mb) 
+{ 
+	int res = this->writer_task_.putq(mb);
+	return res;
+}
+
+
+
+int GadgetStreamController::handle_close (ACE_HANDLE, ACE_Reactor_Mask mask)
+{
+  GDEBUG("handle_close called\n");
+  
+  if (mask == ACE_Event_Handler::WRITE_MASK)
+    return 0;
+
+  GINFO("Shutting down stream and closing up shop...\n");
+  
+  this->stream_.close();
+  
+  mask = ACE_Event_Handler::ALL_EVENTS_MASK |
+    ACE_Event_Handler::DONT_CALL;
+  
+  this->reactor ()->remove_handler (this, mask);
+  
+  //Empty output queue in case there is something on it.
+  int messages_dropped = this->msg_queue ()->flush();
+  
+  if (messages_dropped) {
+    GDEBUG("Flushed %d messages from output queue\n", messages_dropped);
+    this->reactor ()->handle_events(); //Flush any remaining events before we delete this Stream Controller
+  }
+
+  // Remove all readers and writers
+  //writers_.clear();
+  readers_.clear();
+  
+  //Clear DLL handles (to make DLLs unload if needed)
+  for (unsigned int i = 0; i < dll_handles_.size(); i++) {
+#if defined WIN32
+    dll_handles_[i]->close(0); //On windows we will not unload the DLLs even when there are no more refs
+#else 
+    dll_handles_[i]->close(0); //On Unix/Mac it seems to be OK to do this
+#endif
+  }
+  dll_handles_.clear();
+  
+  GINFO("Stream is closed\n");
+
+  delete this;
+  return 0;
+}
+
+
+
+int GadgetStreamController::configure_from_file(std::string config_xml_filename)
+{
+  ACE_TCHAR config_file_name[4096];
+  ACE_OS::sprintf(config_file_name, "%s/%s/%s", gadgetron_home_.c_str(), GADGETRON_CONFIG_PATH, config_xml_filename.c_str());
+  
+  GINFO("Running configuration: %s\n", config_file_name);
+
+  std::ifstream file (config_file_name, std::ios::in|std::ios::binary|std::ios::ate);
+  if (file.is_open()) {
+    size_t size = file.tellg();
+    char* buffer = new char [size];
+    if (!buffer) {
+      GERROR("Unable to create temporary buffer for configuration file\n");
+      return GADGET_FAIL;
+    }
+    file.seekg (0, std::ios::beg);
+    file.read (buffer, size);
+    file.close();
+    std::string xml_file_contents(buffer,size);
+    delete[] buffer;
+
+    return configure(xml_file_contents);
+    
+  } else {
+    GERROR("Unable to open configuration file: %s\n", config_file_name);
+    return GADGET_FAIL;
+  }
+  
+  return GADGET_OK;
+}
+
+int GadgetStreamController::configure(std::string config_xml_string)
+{
+
+  GadgetronXML::GadgetStreamConfiguration cfg;
+  try {
+    deserialize(config_xml_string.c_str(), cfg);  
+  }  catch (const std::runtime_error& e) {
+    GERROR("Failed to parse Gadget Stream Configuration: %s\n", e.what());
+    return GADGET_FAIL;
+  }
+
+  GINFO("Found %d readers\n", cfg.reader.size());
+  GINFO("Found %d writers\n", cfg.writer.size());
+  GINFO("Found %d gadgets\n", cfg.gadget.size());
+  
+  //Configuration of readers
+  for (std::vector<GadgetronXML::Reader>::iterator i = cfg.reader.begin();
+       i != cfg.reader.end();
+       ++i) 
+    {
+
+      long slot = 0;
+      std::string dllname("");
+      std::string classname("");
+
+      slot = i->slot;
+      dllname = i->dll;
+      classname = i->classname;
+
+      GINFO("--Found reader declaration\n");
+      GINFO("  Reader dll: %s\n", dllname.c_str());
+      GINFO("  Reader class: %s\n", classname.c_str());
+      GINFO("  Reader slot: %d\n", slot);
+
+      GadgetMessageReader* r =
+	load_dll_component<GadgetMessageReader>(dllname.c_str(),
+						classname.c_str());
+      
+      if (!r) {
+	GERROR("Failed to load GadgetMessageReader from DLL\n");
+	return GADGET_FAIL;
+      }
+      
+      readers_.insert(slot, r);
+      
+    }	
+  //Configuration of readers end
+
+
+  //Configuration of writers
+  for (std::vector<GadgetronXML::Writer>::iterator i = cfg.writer.begin();
+       i != cfg.writer.end();
+       ++i) 
+    {
+      long slot = 0;
+      std::string dllname("");
+      std::string classname("");
+      
+      slot = i->slot;
+      dllname = i->dll;
+      classname = i->classname;
+
+      GINFO("--Found writer declaration\n");
+      GINFO("  Writer dll: %s\n", dllname.c_str());
+      GINFO("  Writer class: %s\n", classname.c_str());
+      GINFO("  Writer slot: %d\n", slot);
+      
+      GadgetMessageWriter* w =
+	load_dll_component<GadgetMessageWriter>(dllname.c_str(),
+						classname.c_str());
+      
+      if (!w) {
+	GERROR("Failed to load GadgetMessageWriter from DLL\n");
+	return GADGET_FAIL;
+      }
+      
+      writer_task_.register_writer(slot, w);
+    }
+  //Configuration of writers end
+
+  //Let's configure the stream
+  GDEBUG("Processing %d gadgets in reverse order\n",cfg.gadget.size());
+
+  for (std::vector<GadgetronXML::Gadget>::reverse_iterator i = cfg.gadget.rbegin();
+       i != cfg.gadget.rend();
+       ++i) 
+    {
+      std::string gadgetname("");
+      std::string dllname("");
+      std::string classname("");
+
+      gadgetname = i->name;
+      dllname = i->dll;
+      classname = i->classname;
+
+      GINFO("--Found gadget declaration\n");
+      GINFO("  Gadget Name: %s\n", gadgetname.c_str());
+      GINFO("  Gadget dll: %s\n", dllname.c_str());
+      GINFO("  Gadget class: %s\n", classname.c_str());
+
+      GadgetModule* m = create_gadget_module(dllname.c_str(),
+					     classname.c_str(),
+					     gadgetname.c_str());
+      
+      if (!m) {
+	GERROR("Failed to create GadgetModule from %s:%s\n",
+	       classname.c_str(),
+	       dllname.c_str());
+	return GADGET_FAIL;
+      }
+      
+      Gadget* g = dynamic_cast<Gadget*>(m->writer());//Get the gadget out of the module
+      
+      GINFO("  Gadget parameters: %d\n", i->property.size());
+      for (std::vector<GadgetronXML::GadgetronParameter>::iterator p = i->property.begin();
+	   p != i->property.end();
+	   ++p)
+	{
+	  std::string pname(p->name);
+	  std::string pval(p->value);
+	  GINFO("Setting parameter %s = %s\n", pname.c_str(),pval.c_str());
+	  g->set_parameter(pname.c_str(),pval.c_str(),false);
+	}
+      
+        // set the global gadget parameters for every gadget
+      std::map<std::string, std::string>::const_iterator iter;
+      for ( iter=global_gadget_parameters_.begin(); iter!=global_gadget_parameters_.end(); iter++ )
+        {
+	  std::string key = iter->first;
+	  std::string value = iter->second;
+	  g->set_parameter(key.c_str(), value.c_str(), false);
+        }
+
+      if (stream_.push(m) < 0) {
+	GERROR("Failed to push Gadget %s onto stream\n", gadgetname.c_str());
+	delete m;
+	return GADGET_FAIL;
+      }
+      
+    }
+
+  GINFO("Gadget Stream configured\n");
+  stream_configured_ = true;
+
+  return GADGET_OK;
+}
+
+
diff --git a/apps/gadgetron/GadgetStreamController.h b/apps/gadgetron/GadgetStreamController.h
new file mode 100644
index 0000000..5223745
--- /dev/null
+++ b/apps/gadgetron/GadgetStreamController.h
@@ -0,0 +1,50 @@
+#ifndef GADGETSTREAMCONTROLLER_H
+#define GADGETSTREAMCONTROLLER_H
+
+#include "ace/Log_Msg.h"
+#include "ace/Reactor.h"
+#include "ace/SOCK_Stream.h"
+#include "ace/Message_Queue.h"
+#include "ace/Svc_Handler.h"
+#include "ace/Reactor_Notification_Strategy.h"
+
+#include <complex>
+#include <vector>
+
+#include "gadgetbase_export.h"
+#include "GadgetronConnector.h"
+#include "GadgetStreamInterface.h"
+
+
+namespace Gadgetron{
+
+class EXPORTGADGETBASE GadgetStreamController 
+  : public ACE_Svc_Handler<ACE_SOCK_STREAM, ACE_MT_SYNCH>
+  , public GadgetStreamInterface
+{
+public:
+  GadgetStreamController();
+
+  virtual ~GadgetStreamController()
+    { 
+    }
+
+  int open (void);
+
+
+  virtual int handle_input (ACE_HANDLE fd = ACE_INVALID_HANDLE);
+  virtual int handle_close (ACE_HANDLE handle,
+                            ACE_Reactor_Mask close_mask);
+
+  virtual int output_ready(ACE_Message_Block* mb);
+
+private:
+  WriterTask writer_task_;
+  ACE_Reactor_Notification_Strategy notifier_;
+  GadgetMessageReaderContainer readers_;
+  virtual int configure(std::string config_xml_string);
+  virtual int configure_from_file(std::string config_xml_filename);
+};
+
+}
+#endif //GADGETSTREAMCONTROLLER_H
diff --git a/apps/gadgetron/GadgetStreamInterface.h b/apps/gadgetron/GadgetStreamInterface.h
new file mode 100644
index 0000000..09e0a77
--- /dev/null
+++ b/apps/gadgetron/GadgetStreamInterface.h
@@ -0,0 +1,129 @@
+#ifndef GADGETSTREAMINTERFACE_H
+#define GADGETSTREAMINTERFACE_H
+
+#include "ace/Stream.h"
+#include "ace/DLL.h"
+#include "ace/DLL_Manager.h"
+
+#include "gadgetron_paths.h"
+#include "Gadget.h"
+
+typedef ACE_Module<ACE_MT_SYNCH> GadgetModule;
+
+/**
+   Abstract class for a structure containing a stream of Gadgets.
+
+ */
+namespace Gadgetron {
+
+  class GadgetStreamInterface
+  {
+  public:
+    GadgetStreamInterface()
+      : stream_configured_(false)
+    {  
+      gadgetron_home_ = get_gadgetron_home();
+    } 
+
+    virtual int output_ready(ACE_Message_Block* mb) = 0;
+
+    virtual Gadget* find_gadget(std::string gadget_name)
+    {
+      GadgetModule* gm = stream_.find(gadget_name.c_str());
+      
+      if (gm) {
+	Gadget* g = dynamic_cast<Gadget*>(gm->writer());
+	return g;
+      } else {
+	GDEBUG("Gadget with name %s not found! Returning null pointer\n", gadget_name.c_str());
+      }
+      
+      return 0;
+    }
+
+    void set_global_gadget_parameters(const std::map<std::string, std::string>& globalGadgetPara)
+    {
+      global_gadget_parameters_ = globalGadgetPara;
+    }
+
+  protected:
+    ACE_Stream<ACE_MT_SYNCH> stream_;
+    bool stream_configured_;  
+    std::vector<ACE_DLL_Handle*> dll_handles_;
+    std::map<std::string, std::string> global_gadget_parameters_;
+    std::string gadgetron_home_;
+    
+    virtual GadgetModule * create_gadget_module(const char* DLL, const char* gadget, const char* gadget_module_name)
+    {
+
+      Gadget* g = load_dll_component<Gadget>(DLL,gadget);
+      
+      if (!g) {
+	GERROR("Failed to load gadget using factory\n");
+	return 0;
+      }
+      
+      g->set_controller(this);
+      
+      GadgetModule *module = 0;
+      ACE_NEW_RETURN (module,
+		      GadgetModule (gadget_module_name, g),
+		      0);
+      
+      return module;
+    }
+
+    template <class T>  T* load_dll_component(const char* DLL, const char* component_name)
+    {
+      ACE_DLL_Manager* dllmgr = ACE_DLL_Manager::instance();
+      
+      ACE_DLL_Handle* dll = 0;
+      ACE_SHLIB_HANDLE dll_handle = 0;
+      
+      ACE_TCHAR dllname[1024];
+#if defined(WIN32) && defined(_DEBUG)
+      ACE_OS::sprintf(dllname, "%s%sd",ACE_DLL_PREFIX, DLL);
+#else
+      ACE_OS::sprintf(dllname, "%s%s",ACE_DLL_PREFIX, DLL);
+#endif
+
+      ACE_TCHAR factoryname[1024];
+      ACE_OS::sprintf(factoryname, "make_%s", component_name);
+      
+      dll = dllmgr->open_dll (dllname, ACE_DEFAULT_SHLIB_MODE, dll_handle );
+      
+      if (!dll) {
+	GERROR("Failed to load DLL, Possible reasons: \n");
+	GERROR("   * Name of DLL is wrong in XML file \n");
+	GERROR("   * Path of DLL is not in your DLL search path (LD_LIBRARY_PATH on Unix)\n");
+	GERROR("   * Path of other DLLs that this DLL depends on is not in the search path\n");
+	return 0;
+      } else {
+	dll_handles_.push_back(dll);
+      }
+
+      //Function pointer
+      typedef T* (*ComponentCreator) (void);
+      
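+      // The symbol address is converted via an integer type because a direct
+      // reinterpret_cast from void* to a function pointer is not portable C++.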
+      void *void_ptr = dll->symbol (factoryname);
+      ptrdiff_t tmp = reinterpret_cast<ptrdiff_t> (void_ptr);
+      ComponentCreator cc = reinterpret_cast<ComponentCreator> (tmp);
+      
+      if (cc == 0) {
+	GERROR("Failed to load factory (%s) from DLL (%s)\n", factoryname, dllname);
+	return 0;
+      }
+      
+      T* c = cc();
+      
+      if (!c) {
+	GERROR("Failed to create component using factory\n");
+	return 0;
+      }
+      
+      return c;
+    }
+  };
+}
+
+#endif //GADGETSTREAMINTERFACE_H
diff --git a/apps/gadgetron/GadgetronExport.h b/apps/gadgetron/GadgetronExport.h
new file mode 100644
index 0000000..f9b2ff0
--- /dev/null
+++ b/apps/gadgetron/GadgetronExport.h
@@ -0,0 +1,38 @@
+#ifndef GADGETRONEXPORT_H
+#define GADGETRONEXPORT_H
+#pragma once
+
+#if defined (WIN32)
+#ifdef __BUILD_GADGETS__
+#define GADGETEXPORT __declspec(dllexport)
+#else
+#define GADGETEXPORT __declspec(dllimport)
+#endif
+#else
+#define GADGETEXPORT
+#endif
+
+//In header file add this macro
+//#define GADGETRON_LOADABLE_DECLARE(COMPONENT)                   \
+//  void *operator new (size_t bytes);                            \
+//  void operator delete (void *ptr);                             \
+//  void *operator new(size_t s, void * p) { return p; }
+
+//In CPP file add this macro add the end
+#define GADGETRON_LOADABLE_FACTORY_DECLARE(CLASS, COMPONENT)	\
+extern "C" GADGETEXPORT CLASS * make_##COMPONENT (void);        \
+CLASS * make_##COMPONENT (void)       				\
+{							       	\
+  return new COMPONENT;                                         \
+}                                                               \
+/*void * COMPONENT ::operator new (size_t bytes)                  \
+{                                                               \
+  return ::new char[bytes];                                     \
+}                                                               \
+void COMPONENT ::operator delete (void *ptr)                    \
+{                                                               \
+  delete [] static_cast <char *> (ptr);                         \
+}*/ 
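+
+/*
+   Usage sketch (illustrative; MyReader is a placeholder class name). In the .cpp file of a
+   shared library one would write, e.g.:
+
+     GADGETRON_LOADABLE_FACTORY_DECLARE(GadgetMessageReader, MyReader)
+
+   which expands to an exported extern "C" factory
+
+     GadgetMessageReader* make_MyReader(void) { return new MyReader; }
+
+   that GadgetStreamInterface::load_dll_component() resolves at run time by looking up the
+   symbol "make_<classname>" in the DLL named in the stream configuration XML.
+*/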
+
+
+#endif
diff --git a/apps/gadgetron/gadgetbase_export.h b/apps/gadgetron/gadgetbase_export.h
new file mode 100644
index 0000000..d90ecb9
--- /dev/null
+++ b/apps/gadgetron/gadgetbase_export.h
@@ -0,0 +1,16 @@
+#ifndef GADGETBASE_EXPORT_H_
+#define GADGETBASE_EXPORT_H_
+
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_GADGETBASE__) || defined (gadgetron_gadgetbase_EXPORTS)
+#define EXPORTGADGETBASE __declspec(dllexport)
+#else
+#define EXPORTGADGETBASE __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETBASE
+#endif
+
+
+#endif /* GADGETBASE_EXPORT_H_ */
diff --git a/apps/gadgetron/gadgetron.xml.example b/apps/gadgetron/gadgetron.xml.example
new file mode 100644
index 0000000..fdc20c1
--- /dev/null
+++ b/apps/gadgetron/gadgetron.xml.example
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+        
+  <port>9002</port>
+
+  <cloudBus>
+    <multiCastAddress>224.2.2.9</multiCastAddress>
+    <port>4148</port>
+  </cloudBus>
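+
+  <!-- Optional global Gadget parameters (illustrative example; the value shown is the
+       built-in default working directory on Unix):
+  <globalGadgetParameter>
+    <name>workingDirectory</name>
+    <value>/tmp/gadgetron/</value>
+  </globalGadgetParameter>
+  -->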
+  
+</gadgetronConfiguration>
+  
diff --git a/apps/gadgetron/gadgetron_config.in b/apps/gadgetron/gadgetron_config.in
new file mode 100644
index 0000000..eea1ac1
--- /dev/null
+++ b/apps/gadgetron/gadgetron_config.in
@@ -0,0 +1,13 @@
+#ifndef GADGETRON_CONFIG_H
+#define GADGETRON_CONFIG_H
+
+#define GADGETRON_VERSION_MAJOR @GADGETRON_VERSION_MAJOR@
+#define GADGETRON_VERSION_MINOR @GADGETRON_VERSION_MINOR@
+#define GADGETRON_VERSION_PATCH @GADGETRON_VERSION_PATCH@
+#define GADGETRON_VERSION_STRING "@GADGETRON_VERSION_STRING@"
+#define GADGETRON_CONFIG_PATH "@GADGETRON_INSTALL_CONFIG_PATH@"
+#define GADGETRON_PYTHON_PATH "@GADGETRON_INSTALL_PYTHON_MODULE_PATH@"
+#define GADGETRON_GIT_SHA1_HASH "@GADGETRON_GIT_SHA1@"
+#define GADGETRON_CUDA_NVCC_FLAGS "@CUDA_NVCC_FLAGS@"
+
+#endif //GADGETRON_CONFIG_H
diff --git a/apps/gadgetron/gadgetron_info.cpp b/apps/gadgetron/gadgetron_info.cpp
new file mode 100644
index 0000000..7f11fac
--- /dev/null
+++ b/apps/gadgetron/gadgetron_info.cpp
@@ -0,0 +1,193 @@
+//#include "ace/OS_NS_stdlib.h"
+#include "ace/OS_NS_string.h"
+//#include "ace/OS_NS_stdio.h"
+#include "ace/DLL.h"
+#include "ace/DLL_Manager.h"
+//#include "ace/OS_NS_netdb.h"
+
+#include "gadgetron_config.h"
+#include "Gadget.h"
+
+#include <iostream>
+
+
+using namespace Gadgetron;
+
+#if defined(_WIN32)
+  #include <Windows.h>
+#elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__))
+  #include <unistd.h>
+  #include <sys/types.h>
+  #include <sys/param.h>
+#endif
+#if defined(BSD)
+#include <sys/sysctl.h>
+#endif
+
+#if USE_CUDA
+// CUDA-C includes
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif
+
+size_t get_system_memory_size()
+{
+#if defined(_WIN32)
+  MEMORYSTATUSEX status;
+  status.dwLength = sizeof(status);
+  GlobalMemoryStatusEx( &status );
+  return (size_t)status.ullTotalPhys;
+#else //Unix variant
+
+#if defined(CTL_HW) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM64)) //Mac
+  int mib[2];
+  mib[0] = CTL_HW;
+
+#if defined(HW_MEMSIZE)
+  mib[1] = HW_MEMSIZE;
+#elif defined(HW_PHYSMEM64)
+  mib[1] = HW_PHYSMEM64;
+#endif
+
+  int64_t size = 0;
+  size_t len = sizeof( size );
+  if ( sysctl( mib, 2, &size, &len, NULL, 0 ) == 0 )
+    return (size_t)size;
+  return 0L;
+
+#elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) //Linux
+  return (size_t)sysconf( _SC_PHYS_PAGES ) *
+    (size_t)sysconf( _SC_PAGESIZE );
+  
+#endif //Mac
+
+#endif //WIN32
+  return 0L;
+}
+
+int main(int argc, char** argv)
+{
+  std::cout << "Gadgetron Version Info" << std::endl;
+  std::cout << "  -- Version            : " << GADGETRON_VERSION_STRING << std::endl;
+  std::cout << "  -- Git SHA1           : " << GADGETRON_GIT_SHA1_HASH << std::endl;
+  std::cout << "  -- System Memory size : " << get_system_memory_size()/(1024*1024) << " MB" << std::endl;
+
+#if defined COMPILING_WITH_PYTHON_SUPPORT
+  std::cout << "  -- Python Support     : YES" << std::endl;
+#else
+  std::cout << "  -- Python Support     : NO" << std::endl; 
+#endif
+
+#if defined USE_CUDA
+  std::cout << "  -- CUDA Support       : YES (" << GADGETRON_CUDA_NVCC_FLAGS << ")" << std::endl;
+  int deviceCount = 0;
+  cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
+  
+  if (error_id != cudaSuccess) {
+    std::cout << "    * Unable to get device count" << std::endl;
+  } else {
+    std::cout << "    * Number of CUDA capable devices: " << deviceCount << std::endl;
+    if (deviceCount) {
+      
+      int dev, driverVersion = 0, runtimeVersion = 0;
+      for (dev = 0; dev < deviceCount; ++dev) {
+	
+        cudaSetDevice(dev);
+	
+        cudaDeviceProp deviceProp;
+        cudaGetDeviceProperties(&deviceProp, dev);
+        cudaDriverGetVersion(&driverVersion);
+        cudaRuntimeGetVersion(&runtimeVersion);
+
+	std::cout << "      - Device " << dev << ": " << deviceProp.name << std::endl; 
+	std::cout << "         + CUDA Driver Version / Runtime Version: " 
+		  <<  (driverVersion/1000)  << "." << (driverVersion%100)/10 << "/" 
+		  <<  (runtimeVersion/1000) << "." << (runtimeVersion%100)/10 << std::endl;
+	std::cout << "         + CUDA Capability Major/Minor version number: " <<  deviceProp.major << "." << deviceProp.minor << std::endl;
+	std::cout << "         + Total amount of global GPU memory: " << (float)deviceProp.totalGlobalMem/1048576.0f << " MB" << std::endl;
+      }
+    }
+  }
+#else
+  std::cout << "  -- CUDA Support       : NO" << std::endl; 
+#endif
+    
+  std::cout << std::endl;
+
+  if (argc == 1) {
+    return 0;
+  }
+
+  if ((argc == 2) || (argc > 3)) {
+    std::cout << "Invalid number of arguments (" << argc -1 << ")." << std::endl;
+    std::cout << "Usage (gadget library info):" << std::endl;
+    std::cout << " -- gadgetron_info <SHARED LIB> <GADGET_INFO>" << std::endl;
+    return -1; 
+  }
+
+  const char* DLL = argv[1];
+  const char* component_name = argv[2];
+
+  //We must be investigating a certain gadget
+  std::cout << "Examining Gadget (SHARED LIB): " << component_name << " (" << DLL << ")" << std::endl;
+
+  //Attempt to load Gadget
+  //ACE_DLL_Manager* dllmgr = ACE_DLL_Manager::instance();
+  
+  ACE_DLL_Handle dll;// = 0;
+  ACE_SHLIB_HANDLE dll_handle = 0;
+  
+  ACE_TCHAR dllname[1024];
+#if defined(WIN32) && defined(_DEBUG)
+  ACE_OS::sprintf(dllname, "%s%sd",ACE_DLL_PREFIX, DLL);
+#else
+  ACE_OS::sprintf(dllname, "%s%s",ACE_DLL_PREFIX, DLL);
+#endif
+
+  ACE_TCHAR factoryname[1024];
+  ACE_OS::sprintf(factoryname, "make_%s", component_name);
+  
+  if (dll.open(dllname, ACE_DEFAULT_SHLIB_MODE, dll_handle )) {
+    std::cout << "Failed to load DLL (" << DLL << "), Possible reasons:" << std::endl;
+    std::cout << "   - Name of DLL is wrong" << std::endl;
+    std::cout << "   - Path of DLL is not in your DLL search path (LD_LIBRARY_PATH on Unix)" << std::endl;
+    std::cout << "   - Path of other DLLs that this DLL depends on is not in the search path" << std::endl;
+    std::cout << "" << std::endl;
+    std::cout << "Set environment variable ACE_DEBUG=1 to get more information" << std::endl << std::endl; 
+    return 0;
+  } 
+
+  //Function pointer
+  typedef Gadget* (*ComponentCreator) (void);
+
+  void *void_ptr = dll.symbol (factoryname);
+  ptrdiff_t tmp = reinterpret_cast<ptrdiff_t> (void_ptr);
+  ComponentCreator cc = reinterpret_cast<ComponentCreator> (tmp);
+  
+  if (cc == 0) {
+    std::cout << "Failed to load factory (" << factoryname << ") from DLL (" << dllname << ")" << std::endl;
+    return -1;
+  }
+  
+  Gadget* g = cc();
+  if (!g) {
+    std::cout << "Failed to create component using factory" << std::endl;
+    return 0;
+  }
+
+  std::cout << "  -- Gadget compiled against Gadgetron version " << g->get_gadgetron_version() << std::endl;
+  std::cout << "    -- Properties:" << std::endl;
+  int number_of_properties = g->get_number_of_properties();
+  for (int i = 0; i < number_of_properties; i++) {
+    GadgetPropertyBase* p = g->get_property_by_index(i);
+    if (p) {
+      std::cout << "      * " << p->name() << " (" << p->type_string() << "): " << p->description() << std::endl;
+      if (std::string(p->limits_description()) != std::string("")) {
+	std::cout << "        LIMITS: " << p->limits_description() << std::endl;
+      } 
+    }
+  }
+  delete g;
+
+  return 0;
+}
diff --git a/apps/gadgetron/gadgetron_paths.h b/apps/gadgetron/gadgetron_paths.h
new file mode 100644
index 0000000..6969a35
--- /dev/null
+++ b/apps/gadgetron/gadgetron_paths.h
@@ -0,0 +1,76 @@
+#ifndef GADGETRON_PATHS_H
+#define GADGETRON_PATHS_H
+
+#include <limits.h>
+#include <string>
+#include "log.h"
+
+#ifdef _WIN32
+#include <windows.h>
+#include <Shlwapi.h>
+#pragma comment(lib, "shlwapi.lib")
+#else
+#include <unistd.h>
+#endif // _WIN32
+
+#ifdef __APPLE__
+#include <mach-o/dyld.h>/* _NSGetExecutablePath */
+#endif
+
+#define MAX_GADGETRON_HOME_LENGTH 1024
+
+namespace Gadgetron
+{
+  inline std::string get_gadgetron_home()
+  {
+#if defined  __APPLE__
+    char path[PATH_MAX];
+    uint32_t size = sizeof(path);
+    char resolved[PATH_MAX];
+    if ((_NSGetExecutablePath(path, &size) == 0) && (realpath(path, resolved) != NULL)) {
+      std::string s1(resolved);
+      return s1.substr(0, s1.find_last_of("\\/")) + std::string("/../");
+    } else {
+      GDEBUG_STREAM("Unable to determine GADGETRON_HOME" << std::endl);
+      return std::string("");
+    }
+#elif defined _WIN32 || _WIN64
+    // Full path to the executable (including the executable file)
+    char fullPath[MAX_GADGETRON_HOME_LENGTH];	
+    // Full path to the executable (without executable file)
+    char *rightPath;
+    // Will contain exe path
+    HMODULE hModule = GetModuleHandle(NULL);
+    if (hModule != NULL)
+      {
+	// When passing NULL to GetModuleHandle, it returns handle of exe itself
+	GetModuleFileName(hModule, fullPath, (sizeof(fullPath))); 
+	rightPath = fullPath;
+	PathRemoveFileSpec(rightPath);
+	for(int i = 0; i < strlen(rightPath); i++)
+	  if(rightPath[i] == '\\') rightPath[i] = '/';
+
+	std::string s1(rightPath);
+	return s1 + std::string("/../");
+      }
+    else
+      {
+        GDEBUG_STREAM("The path to the executable is NULL" << std::endl);
+        return std::string("");
+      }
+#else //Probably some NIX where readlink should work
+    char buff[MAX_GADGETRON_HOME_LENGTH];
+    ssize_t len = ::readlink("/proc/self/exe", buff, sizeof(buff)-1);
+    if (len != -1) {
+      buff[len] = '\0';
+      std::string s1(buff);
+      return s1.substr(0, s1.find_last_of("\\/")) + std::string("/../");
+    } else {
+      GDEBUG_STREAM("Unable to determine GADGETRON_HOME" << std::endl);
+      return std::string("");
+    }
+#endif
+  }
+}
+
+#endif //GADGETRON_PATHS_H
diff --git a/apps/gadgetron/gadgetron_start.pl b/apps/gadgetron/gadgetron_start.pl
new file mode 100644
index 0000000..d9d6d7e
--- /dev/null
+++ b/apps/gadgetron/gadgetron_start.pl
@@ -0,0 +1,31 @@
+#!/usr/bin/perl
+
+use Cwd 'abs_path';
+use FindBin '$Bin';
+$gadgetron_home = $Bin . "/../";
+
+print "gadgetron_home: $gadgetron_home\n";
+
+my $executable = "$gadgetron_home/bin/gadgetron";
+my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst)=localtime(time);
+my $timestring = sprintf "%4d%02d%02d_%02d%02d%02d",$year+1900,$mon+1,$mday,$hour,$min,$sec;
+
+print "Time string: $timestring\n";
+
+
+$ENV{'GADGETRON_HOME'} = $gadgetron_home;
+$ENV{'LD_LIBRARY_PATH'} = "/usr/local/lib:/usr/local/cuda/lib64:/usr/local/cula/lib64:" . $gadgetron_home . "/lib";
+
+$exe_command = "killall -9 gadgetron";
+system($exe_command);
+sleep(1);
+
+$exe_command = "mkdir -p log";
+system($exe_command);
+
+$logfilename = "log/gadgetron_log_$timestring" . ".txt";
+$exe_command = "nohup $executable > $logfilename 2> $logfilename < /dev/null &" ;
+system($exe_command);
+
+sleep(1);
+
diff --git a/apps/gadgetron/gadgetron_xml.cpp b/apps/gadgetron/gadgetron_xml.cpp
new file mode 100644
index 0000000..839ba1c
--- /dev/null
+++ b/apps/gadgetron/gadgetron_xml.cpp
@@ -0,0 +1,95 @@
+#include "gadgetron_xml.h"
+#include "pugixml.hpp"
+#include <stdexcept>
+#include <cstdlib>
+
+namespace GadgetronXML
+{
+
+  void deserialize(const char* xml_config, GadgetronConfiguration& h)
+  {
+    pugi::xml_document doc;
+    pugi::xml_parse_result result = doc.load(xml_config);
+    pugi::xml_node root = doc.child("gadgetronConfiguration");
+
+    if (!root) {
+      throw std::runtime_error("gadgetronConfiguration element not found in configuration file");
+    }
+    
+    pugi::xml_node port = root.child("port");
+    if (!port) {
+      throw std::runtime_error("Port not found in Gadgetron configuration");
+    }
+
+    h.port = port.child_value();
+
+    pugi::xml_node p = root.child("globalGadgetParameter");
+    while (p) {
+      GadgetronParameter pp;
+      pp.name = p.child_value("name");
+      pp.value = p.child_value("value");
+      h.globalGadgetParameter.push_back(pp);
+      p = p.next_sibling("globalGadgetParameter");
+    }
+
+    pugi::xml_node b = root.child("cloudBus");
+    if (b) {
+      CloudBus cb;
+      cb.multiCastAddress = b.child_value("multiCastAddress");
+      cb.port = static_cast<unsigned int>(std::atoi(b.child_value("port")));
+      h.cloudBus = cb;
+    }
+    
+  }
+
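+  /*
+     Illustrative sketch of the XML layout this function parses (all dll/classname/parameter
+     values below are placeholders, not shipped components):
+
+       <gadgetronStreamConfiguration>
+         <reader>
+           <slot>1001</slot>
+           <dll>my_reader_lib</dll>
+           <classname>MyMessageReader</classname>
+         </reader>
+         <writer>
+           <slot>1002</slot>
+           <dll>my_writer_lib</dll>
+           <classname>MyMessageWriter</classname>
+         </writer>
+         <gadget>
+           <name>MyGadget</name>
+           <dll>my_gadget_lib</dll>
+           <classname>MyGadget</classname>
+           <property>
+             <name>some_parameter</name>
+             <value>42</value>
+           </property>
+         </gadget>
+       </gadgetronStreamConfiguration>
+  */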
+  void deserialize(const char* xml_config, GadgetStreamConfiguration& cfg)
+  {
+    pugi::xml_document doc;
+    pugi::xml_parse_result result = doc.load(xml_config);
+    pugi::xml_node root = doc.child("gadgetronStreamConfiguration");
+
+    if (!root) {
+      throw std::runtime_error("gadgetronStreamConfiguration element not found in configuration file");
+    }
+
+    pugi::xml_node reader = root.child("reader");
+    while (reader) {
+      Reader r;
+      r.slot = static_cast<unsigned short>(std::atoi(reader.child_value("slot")));
+      r.dll = reader.child_value("dll");
+      r.classname = reader.child_value("classname");
+      cfg.reader.push_back(r);
+      reader = reader.next_sibling("reader");
+    }
+    
+    pugi::xml_node writer = root.child("writer");
+    while (writer) {
+      Writer w;
+      w.slot = static_cast<unsigned short>(std::atoi(writer.child_value("slot")));
+      w.dll = writer.child_value("dll");
+      w.classname = writer.child_value("classname");
+      cfg.writer.push_back(w);
+      writer = writer.next_sibling("writer");
+    }
+
+    pugi::xml_node gadget = root.child("gadget");
+    while (gadget) {
+      Gadget g;
+      g.name = gadget.child_value("name");
+      g.dll = gadget.child_value("dll");
+      g.classname = gadget.child_value("classname");
+      
+      pugi::xml_node property = gadget.child("property");
+      while (property) {
+	GadgetronParameter p;
+	p.name = property.child_value("name");
+	p.value = property.child_value("value");
+	g.property.push_back(p);
+	property = property.next_sibling("property");
+      }
+
+      cfg.gadget.push_back(g);
+      gadget = gadget.next_sibling("gadget");
+    }
+  }
+}
diff --git a/apps/gadgetron/gadgetron_xml.h b/apps/gadgetron/gadgetron_xml.h
new file mode 100644
index 0000000..896bed2
--- /dev/null
+++ b/apps/gadgetron/gadgetron_xml.h
@@ -0,0 +1,122 @@
+#ifndef GADGETRON_XML_H
+#define GADGETRON_XML_H
+
+#include <string>
+#include <vector>
+#include <stdexcept>
+#include "gadgetbase_export.h"
+
+namespace GadgetronXML
+{
+  template <typename T> class Optional
+  {
+  public:
+    Optional()
+      : present_(false)
+    {
+
+    }
+
+    Optional(const T&v) {
+      present_ = true;
+      value_ = v;      
+    }
+
+    const Optional& operator=(const T& v) {
+      present_ = true;
+      value_ = v;
+      return *this;
+    }
+
+    const T* operator->() const {
+      return &value_;
+    }
+
+    const T& operator*() const {
+      return value_;
+    }
+
+    operator bool() const {
+      return present_;
+    }
+
+    bool is_present() const {
+      return present_;
+    }
+
+    T& get() {
+      if (!present_) {
+	throw std::runtime_error("Access optional value, which has not been set");
+      }
+      return value_;
+    }
+    
+    T& operator()() {
+      return get();
+    }
+
+    void set(const T& v) {
+      present_ = true;
+      value_ = v;
+    }
+
+  protected:
+    bool present_;
+    T value_;
+
+  }; 
+
+
+  struct GadgetronParameter
+  {
+    std::string name;
+    std::string value;
+  };
+
+  struct CloudBus
+  {
+    std::string multiCastAddress;
+    unsigned int port;
+  };
+
+
+  struct GadgetronConfiguration
+  {
+    std::string port;
+    std::vector<GadgetronParameter> globalGadgetParameter;
+    Optional<CloudBus> cloudBus;    
+  };
+
+  void EXPORTGADGETBASE deserialize(const char* xml_config, GadgetronConfiguration& h);
+  
+  struct Reader
+  {
+    unsigned short slot;
+    std::string dll;
+    std::string classname;
+  };
+
+  typedef Reader Writer;
+  
+  struct Gadget
+  {
+    std::string name;
+    std::string dll;
+    std::string classname;
+    std::vector<GadgetronParameter> property;
+  };
+
+  struct GadgetStreamConfiguration
+  {
+    std::vector<Reader> reader;
+    std::vector<Writer> writer;
+    std::vector<Gadget> gadget;
+  };
+
+  void EXPORTGADGETBASE deserialize(const char* xml, GadgetStreamConfiguration& cfg);
+
+};
+
+#endif //GADGETRON_XML_H
+
+
diff --git a/apps/gadgetron/main.cpp b/apps/gadgetron/main.cpp
new file mode 100644
index 0000000..7ab954c
--- /dev/null
+++ b/apps/gadgetron/main.cpp
@@ -0,0 +1,196 @@
+#include "GadgetServerAcceptor.h"
+#include "FileInfo.h"
+#include "url_encode.h"
+#include "gadgetron_xml.h"
+#include "gadgetron_config.h"
+#include "gadgetron_paths.h"
+#include "CloudBus.h"
+
+#include <ace/Log_Msg.h>
+#include <ace/Service_Config.h>
+#include <ace/Reactor.h>
+#include <ace/Get_Opt.h>
+#include <ace/OS_NS_string.h>
+#include <iostream>
+#include <string>
+#include <fstream>
+#include <streambuf>
+
+
+#ifdef _WIN32
+#include <windows.h>
+#include <Shlwapi.h>
+#pragma comment(lib, "shlwapi.lib")
+#else
+#include <sys/types.h>
+#include <sys/stat.h>
+#endif // _WIN32
+
+#include <boost/filesystem.hpp>
+using namespace boost::filesystem;
+
+using namespace Gadgetron;
+
+#define GT_WORKING_DIRECTORY "workingDirectory"
+
+namespace Gadgetron {
+
+
+  bool create_folder_with_all_permissions(const std::string& workingdirectory)
+  {
+    if ( !boost::filesystem::exists(workingdirectory) )
+      {
+        boost::filesystem::path workingPath(workingdirectory);
+        if ( !boost::filesystem::create_directory(workingPath) )
+	  {
+	    GERROR("Error creating the working directory.\n");
+	    return false;
+	  }
+
+        // set the permission for the folder
+#ifdef _WIN32
+	try
+	  {
+	    boost::filesystem::permissions(workingPath, all_all);
+	  }
+	catch(...)
+	  {
+	    GERROR("Error changing the permission of the working directory.\n");
+	    return false;
+	  }
+#else
+	// in case an older version of boost is used on a non-Windows system,
+	// fall back to the chmod system call
+	int res = chmod(workingPath.string().c_str(), S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IWGRP|S_IXGRP|S_IROTH|S_IWOTH|S_IXOTH);
+	if ( res != 0 )
+	  {
+	    GERROR("Error changing the permission of the working directory.\n");
+	    return false;
+	  }
+#endif // _WIN32
+      }
+
+    return true;
+  }
+
+}
+
+void print_usage()
+{
+  GINFO("Usage: \n");
+  GINFO("gadgetron   -p <PORT>                      (default 9002)       \n");
+}
+
+int ACE_TMAIN(int argc, ACE_TCHAR *argv[])
+{
+  std::string  gadgetron_home = get_gadgetron_home();
+
+  if (gadgetron_home.size() == 0) {
+    GERROR("GADGETRON_HOME variable not set.\n");
+    return -1;
+  }
+
+  std::string gcfg = gadgetron_home + std::string("/") + std::string(GADGETRON_CONFIG_PATH) + std::string("/gadgetron.xml");
+  if (!FileInfo(gcfg).exists()) {
+    GERROR("Gadgetron configuration file %s not found.\n", gcfg.c_str());
+    return -1;
+  }
+
+
+  ACE_TCHAR port_no[1024];
+  std::map<std::string, std::string> gadget_parameters;
+
+  // the working directory of gadgetron should always be set
+  bool workingDirectorySet = false;
+
+  GadgetronXML::GadgetronConfiguration c;
+  try
+    {
+      std::ifstream t(gcfg.c_str());
+      std::string gcfg_text((std::istreambuf_iterator<char>(t)),
+			    std::istreambuf_iterator<char>());
+      
+      GadgetronXML::deserialize(gcfg_text.c_str(), c);
+      ACE_OS_String::strncpy(port_no, c.port.c_str(), 1024);
+
+      for (std::vector<GadgetronXML::GadgetronParameter>::iterator it = c.globalGadgetParameter.begin();
+	   it != c.globalGadgetParameter.end();
+	   ++it)
+	{
+	  std::string key = it->name;
+	  std::string value = it->value;
+      
+	  gadget_parameters[key] = value;
+	  
+	  if ( key == std::string(GT_WORKING_DIRECTORY) ) workingDirectorySet = true;
+        }
+    }  catch (std::runtime_error& e) {
+    GERROR("XML Parse Error: %s\n", e.what());
+    GERROR("Error parsing configuration file %s.\n", gcfg.c_str());
+    return -1;
+  }
+
+  static const ACE_TCHAR options[] = ACE_TEXT(":p:");
+  ACE_Get_Opt cmd_opts(argc, argv, options);
+
+  int option;
+  while ((option = cmd_opts()) != EOF) {
+    switch (option) {
+    case 'p':
+      ACE_OS_String::strncpy(port_no, cmd_opts.opt_arg(), 1024);
+      break;
+    case ':':
+      print_usage();
+      GERROR("-%c requires an argument.\n", cmd_opts.opt_opt());
+      return -1;
+      break;
+    default:
+      print_usage();
+      GERROR("Command line parse error\n");
+      return -1;
+      break;
+    }
+  }
+
+  if (c.cloudBus) {
+    GINFO("Starting cloudBus: %s:%d\n", 
+	  c.cloudBus->multiCastAddress.c_str(), c.cloudBus->port);
+    Gadgetron::CloudBus::set_mcast_address(c.cloudBus->multiCastAddress.c_str());
+    Gadgetron::CloudBus::set_mcast_port(c.cloudBus->port);
+    Gadgetron::CloudBus::set_gadgetron_port(std::atoi(port_no));
+    Gadgetron::CloudBus* cb = Gadgetron::CloudBus::instance();//This actually starts the bus.
+    gadget_parameters["using_cloudbus"] = std::string("true"); //This is our message to the Gadgets that we have activated the bus
+  }
+
+
+  // if the working directory is not set, use the default path
+  if ( !workingDirectorySet )
+    {
+#ifdef _WIN32
+      gadget_parameters[std::string(GT_WORKING_DIRECTORY)] = std::string("c:\\temp\\gadgetron\\");
+#else
+      gadget_parameters[std::string(GT_WORKING_DIRECTORY)] = std::string("/tmp/gadgetron/");
+#endif // _WIN32
+    }
+
+  // check and create workingdirectory
+  std::string workingDirectory = gadget_parameters[std::string(GT_WORKING_DIRECTORY)];
+  if ( !Gadgetron::create_folder_with_all_permissions(workingDirectory) )
+    {
+      GERROR("Gadgetron creating working directory %s failed ... \n", workingDirectory.c_str());
+      return -1;
+    }
+
+  GINFO("Configuring services, Running on port %s\n", port_no);
+
+  ACE_INET_Addr port_to_listen (port_no);
+  GadgetServerAcceptor acceptor;
+  acceptor.global_gadget_parameters_ = gadget_parameters;
+  acceptor.reactor (ACE_Reactor::instance ());
+  if (acceptor.open (port_to_listen) == -1)
+    return 1;
+
+  ACE_Reactor::instance()->run_reactor_event_loop ();
+
+  return 0;
+}
diff --git a/apps/gadgetron/pugiconfig.hpp b/apps/gadgetron/pugiconfig.hpp
new file mode 100644
index 0000000..56f1d22
--- /dev/null
+++ b/apps/gadgetron/pugiconfig.hpp
@@ -0,0 +1,72 @@
+/**
+ * pugixml parser - version 1.4
+ * --------------------------------------------------------
+ * Copyright (C) 2006-2014, by Arseny Kapoulkine (arseny.kapoulkine at gmail.com)
+ * Report bugs and download new versions at http://pugixml.org/
+ *
+ * This library is distributed under the MIT License. See notice at the end
+ * of this file.
+ *
+ * This work is based on the pugxml parser, which is:
+ * Copyright (C) 2003, by Kristen Wegner (kristen at tima.net)
+ */
+
+#ifndef HEADER_PUGICONFIG_HPP
+#define HEADER_PUGICONFIG_HPP
+
+// Uncomment this to enable wchar_t mode
+// #define PUGIXML_WCHAR_MODE
+
+// Uncomment this to disable XPath
+// #define PUGIXML_NO_XPATH
+
+// Uncomment this to disable STL
+// #define PUGIXML_NO_STL
+
+// Uncomment this to disable exceptions
+// #define PUGIXML_NO_EXCEPTIONS
+
+// Set this to control attributes for public classes/functions, i.e.:
+// #define PUGIXML_API __declspec(dllexport) // to export all public symbols from DLL
+// #define PUGIXML_CLASS __declspec(dllimport) // to import all classes from DLL
+// #define PUGIXML_FUNCTION __fastcall // to set calling conventions to all public functions to fastcall
+// In absence of PUGIXML_CLASS/PUGIXML_FUNCTION definitions PUGIXML_API is used instead
+
+// Tune these constants to adjust memory-related behavior
+// #define PUGIXML_MEMORY_PAGE_SIZE 32768
+// #define PUGIXML_MEMORY_OUTPUT_STACK 10240
+// #define PUGIXML_MEMORY_XPATH_PAGE_SIZE 4096
+
+// Uncomment this to switch to header-only version
+// #define PUGIXML_HEADER_ONLY
+// #include "pugixml.cpp"
+
+// Uncomment this to enable long long support
+// #define PUGIXML_HAS_LONG_LONG
+
+#endif
+
+/**
+ * Copyright (c) 2006-2014 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
diff --git a/apps/gadgetron/pugixml.cpp b/apps/gadgetron/pugixml.cpp
new file mode 100644
index 0000000..754f92f
--- /dev/null
+++ b/apps/gadgetron/pugixml.cpp
@@ -0,0 +1,10639 @@
+/**
+ * pugixml parser - version 1.4
+ * --------------------------------------------------------
+ * Copyright (C) 2006-2014, by Arseny Kapoulkine (arseny.kapoulkine at gmail.com)
+ * Report bugs and download new versions at http://pugixml.org/
+ *
+ * This library is distributed under the MIT License. See notice at the end
+ * of this file.
+ *
+ * This work is based on the pugxml parser, which is:
+ * Copyright (C) 2003, by Kristen Wegner (kristen at tima.net)
+ */
+
+#ifndef SOURCE_PUGIXML_CPP
+#define SOURCE_PUGIXML_CPP
+
+#include "pugixml.hpp"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#ifdef PUGIXML_WCHAR_MODE
+#	include <wchar.h>
+#endif
+
+#ifndef PUGIXML_NO_XPATH
+#	include <math.h>
+#	include <float.h>
+#	ifdef PUGIXML_NO_EXCEPTIONS
+#		include <setjmp.h>
+#	endif
+#endif
+
+#ifndef PUGIXML_NO_STL
+#	include <istream>
+#	include <ostream>
+#	include <string>
+#endif
+
+// For placement new
+#include <new>
+
+#ifdef _MSC_VER
+#	pragma warning(push)
+#	pragma warning(disable: 4127) // conditional expression is constant
+#	pragma warning(disable: 4324) // structure was padded due to __declspec(align())
+#	pragma warning(disable: 4611) // interaction between '_setjmp' and C++ object destruction is non-portable
+#	pragma warning(disable: 4702) // unreachable code
+#	pragma warning(disable: 4996) // this function or variable may be unsafe
+#	pragma warning(disable: 4793) // function compiled as native: presence of '_setjmp' makes a function unmanaged
+#endif
+
+#ifdef __INTEL_COMPILER
+#	pragma warning(disable: 177) // function was declared but never referenced 
+#	pragma warning(disable: 279) // controlling expression is constant
+#	pragma warning(disable: 1478 1786) // function was declared "deprecated"
+#	pragma warning(disable: 1684) // conversion from pointer to same-sized integral type
+#endif
+
+#if defined(__BORLANDC__) && defined(PUGIXML_HEADER_ONLY)
+#	pragma warn -8080 // symbol is declared but never used; disabling this inside push/pop bracket does not make the warning go away
+#endif
+
+#ifdef __BORLANDC__
+#	pragma option push
+#	pragma warn -8008 // condition is always false
+#	pragma warn -8066 // unreachable code
+#endif
+
+#ifdef __SNC__
+// Using diag_push/diag_pop does not disable the warnings inside templates due to a compiler bug
+#	pragma diag_suppress=178 // function was declared but never referenced
+#	pragma diag_suppress=237 // controlling expression is constant
+#endif
+
+// Inlining controls
+#if defined(_MSC_VER) && _MSC_VER >= 1300
+#	define PUGI__NO_INLINE __declspec(noinline)
+#elif defined(__GNUC__)
+#	define PUGI__NO_INLINE __attribute__((noinline))
+#else
+#	define PUGI__NO_INLINE 
+#endif
+
+// Simple static assertion
+#define PUGI__STATIC_ASSERT(cond) { static const char condition_failed[(cond) ? 1 : -1] = {0}; (void)condition_failed[0]; }
+
+// Digital Mars C++ bug workaround for passing char loaded from memory via stack
+#ifdef __DMC__
+#	define PUGI__DMC_VOLATILE volatile
+#else
+#	define PUGI__DMC_VOLATILE
+#endif
+
+// Borland C++ bug workaround for not defining ::memcpy depending on header include order (can't always use std::memcpy because some compilers don't have it at all)
+#if defined(__BORLANDC__) && !defined(__MEM_H_USING_LIST)
+using std::memcpy;
+using std::memmove;
+#endif
+
+// In some environments MSVC is a compiler but the CRT lacks certain MSVC-specific features
+#if defined(_MSC_VER) && !defined(__S3E__)
+#	define PUGI__MSVC_CRT_VERSION _MSC_VER
+#endif
+
+#ifdef PUGIXML_HEADER_ONLY
+#	define PUGI__NS_BEGIN namespace pugi { namespace impl {
+#	define PUGI__NS_END } }
+#	define PUGI__FN inline
+#	define PUGI__FN_NO_INLINE inline
+#else
+#	if defined(_MSC_VER) && _MSC_VER < 1300 // MSVC6 seems to have an amusing bug with anonymous namespaces inside namespaces
+#		define PUGI__NS_BEGIN namespace pugi { namespace impl {
+#		define PUGI__NS_END } }
+#	else
+#		define PUGI__NS_BEGIN namespace pugi { namespace impl { namespace {
+#		define PUGI__NS_END } } }
+#	endif
+#	define PUGI__FN
+#	define PUGI__FN_NO_INLINE PUGI__NO_INLINE
+#endif
+
+// uintptr_t
+#if !defined(_MSC_VER) || _MSC_VER >= 1600
+#	include <stdint.h>
+#else
+#	ifndef _UINTPTR_T_DEFINED
+// No native uintptr_t in MSVC6 and in some WinCE versions
+typedef size_t uintptr_t;
+#define _UINTPTR_T_DEFINED
+#	endif
+PUGI__NS_BEGIN
+	typedef unsigned __int8 uint8_t;
+	typedef unsigned __int16 uint16_t;
+	typedef unsigned __int32 uint32_t;
+PUGI__NS_END
+#endif
+
+// Memory allocation
+PUGI__NS_BEGIN
+	PUGI__FN void* default_allocate(size_t size)
+	{
+		return malloc(size);
+	}
+
+	PUGI__FN void default_deallocate(void* ptr)
+	{
+		free(ptr);
+	}
+
+	template <typename T>
+	struct xml_memory_management_function_storage
+	{
+		static allocation_function allocate;
+		static deallocation_function deallocate;
+	};
+
+	template <typename T> allocation_function xml_memory_management_function_storage<T>::allocate = default_allocate;
+	template <typename T> deallocation_function xml_memory_management_function_storage<T>::deallocate = default_deallocate;
+
+	typedef xml_memory_management_function_storage<int> xml_memory;
+PUGI__NS_END
+
+// String utilities
+PUGI__NS_BEGIN
+	// Get string length
+	PUGI__FN size_t strlength(const char_t* s)
+	{
+		assert(s);
+
+	#ifdef PUGIXML_WCHAR_MODE
+		return wcslen(s);
+	#else
+		return strlen(s);
+	#endif
+	}
+
+	// Compare two strings
+	PUGI__FN bool strequal(const char_t* src, const char_t* dst)
+	{
+		assert(src && dst);
+
+	#ifdef PUGIXML_WCHAR_MODE
+		return wcscmp(src, dst) == 0;
+	#else
+		return strcmp(src, dst) == 0;
+	#endif
+	}
+
+	// Compare lhs with [rhs_begin, rhs_end)
+	PUGI__FN bool strequalrange(const char_t* lhs, const char_t* rhs, size_t count)
+	{
+		for (size_t i = 0; i < count; ++i)
+			if (lhs[i] != rhs[i])
+				return false;
+	
+		return lhs[count] == 0;
+	}
+
+	// Get length of wide string, even if CRT lacks wide character support
+	PUGI__FN size_t strlength_wide(const wchar_t* s)
+	{
+		assert(s);
+
+	#ifdef PUGIXML_WCHAR_MODE
+		return wcslen(s);
+	#else
+		const wchar_t* end = s;
+		while (*end) end++;
+		return static_cast<size_t>(end - s);
+	#endif
+	}
+
+#ifdef PUGIXML_WCHAR_MODE
+	// Convert string to wide string, assuming all symbols are ASCII
+	PUGI__FN void widen_ascii(wchar_t* dest, const char* source)
+	{
+		for (const char* i = source; *i; ++i) *dest++ = *i;
+		*dest = 0;
+	}
+#endif
+PUGI__NS_END
+
+#if !defined(PUGIXML_NO_STL) || !defined(PUGIXML_NO_XPATH)
+// auto_ptr-like buffer holder for exception recovery
+PUGI__NS_BEGIN
+	struct buffer_holder
+	{
+		void* data;
+		void (*deleter)(void*);
+
+		buffer_holder(void* data_, void (*deleter_)(void*)): data(data_), deleter(deleter_)
+		{
+		}
+
+		~buffer_holder()
+		{
+			if (data) deleter(data);
+		}
+
+		void* release()
+		{
+			void* result = data;
+			data = 0;
+			return result;
+		}
+	};
+PUGI__NS_END
+#endif
+
+PUGI__NS_BEGIN
+	static const size_t xml_memory_page_size =
+	#ifdef PUGIXML_MEMORY_PAGE_SIZE
+		PUGIXML_MEMORY_PAGE_SIZE
+	#else
+		32768
+	#endif
+		;
+
+	static const uintptr_t xml_memory_page_alignment = 32;
+	static const uintptr_t xml_memory_page_pointer_mask = ~(xml_memory_page_alignment - 1);
+	static const uintptr_t xml_memory_page_name_allocated_mask = 16;
+	static const uintptr_t xml_memory_page_value_allocated_mask = 8;
+	static const uintptr_t xml_memory_page_type_mask = 7;
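+
+	// Node and attribute structures store a single uintptr_t 'header' that tags the
+	// owning page pointer with per-object flags: pages are 32-byte aligned, so the low
+	// five bits are free and hold the node type (bits 0-2) plus flags recording whether
+	// the value (bit 3) and name (bit 4) strings were allocated on the heap.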
+
+	struct xml_allocator;
+
+	struct xml_memory_page
+	{
+		static xml_memory_page* construct(void* memory)
+		{
+			if (!memory) return 0; //$ redundant, left for performance
+
+			xml_memory_page* result = static_cast<xml_memory_page*>(memory);
+
+			result->allocator = 0;
+			result->memory = 0;
+			result->prev = 0;
+			result->next = 0;
+			result->busy_size = 0;
+			result->freed_size = 0;
+
+			return result;
+		}
+
+		xml_allocator* allocator;
+
+		void* memory;
+
+		xml_memory_page* prev;
+		xml_memory_page* next;
+
+		size_t busy_size;
+		size_t freed_size;
+
+		char data[1];
+	};
+
+	struct xml_memory_string_header
+	{
+		uint16_t page_offset; // offset from page->data
+		uint16_t full_size; // 0 if string occupies whole page
+	};
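+
+	// Heap-allocated strings are prefixed with this header: page_offset lets
+	// deallocate_string walk back from the string to its owning page, and full_size
+	// records the rounded-up allocation size (0 means the string occupies the whole
+	// page, in which case the page's busy_size is used instead).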
+
+	struct xml_allocator
+	{
+		xml_allocator(xml_memory_page* root): _root(root), _busy_size(root->busy_size)
+		{
+		}
+
+		xml_memory_page* allocate_page(size_t data_size)
+		{
+			size_t size = offsetof(xml_memory_page, data) + data_size;
+
+			// allocate block with some alignment, leaving memory for worst-case padding
+			void* memory = xml_memory::allocate(size + xml_memory_page_alignment);
+			if (!memory) return 0;
+
+			// align upwards to page boundary
+			void* page_memory = reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(memory) + (xml_memory_page_alignment - 1)) & ~(xml_memory_page_alignment - 1));
+
+			// prepare page structure
+			xml_memory_page* page = xml_memory_page::construct(page_memory);
+			assert(page);
+
+			page->memory = memory;
+			page->allocator = _root->allocator;
+
+			return page;
+		}
+
+		static void deallocate_page(xml_memory_page* page)
+		{
+			xml_memory::deallocate(page->memory);
+		}
+
+		void* allocate_memory_oob(size_t size, xml_memory_page*& out_page);
+
+		void* allocate_memory(size_t size, xml_memory_page*& out_page)
+		{
+			if (_busy_size + size > xml_memory_page_size) return allocate_memory_oob(size, out_page);
+
+			void* buf = _root->data + _busy_size;
+
+			_busy_size += size;
+
+			out_page = _root;
+
+			return buf;
+		}
+
+		void deallocate_memory(void* ptr, size_t size, xml_memory_page* page)
+		{
+			if (page == _root) page->busy_size = _busy_size;
+
+			assert(ptr >= page->data && ptr < page->data + page->busy_size);
+			(void)!ptr;
+
+			page->freed_size += size;
+			assert(page->freed_size <= page->busy_size);
+
+			if (page->freed_size == page->busy_size)
+			{
+				if (page->next == 0)
+				{
+					assert(_root == page);
+
+					// top page freed, just reset sizes
+					page->busy_size = page->freed_size = 0;
+					_busy_size = 0;
+				}
+				else
+				{
+					assert(_root != page);
+					assert(page->prev);
+
+					// remove from the list
+					page->prev->next = page->next;
+					page->next->prev = page->prev;
+
+					// deallocate
+					deallocate_page(page);
+				}
+			}
+		}
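+
+		// Note that memory inside a page is never reused; deallocation only grows
+		// freed_size, and a page is released (or, for the current page, reset) once
+		// everything that was allocated from it has been freed.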
+
+		char_t* allocate_string(size_t length)
+		{
+			// allocate memory for string and header block
+			size_t size = sizeof(xml_memory_string_header) + length * sizeof(char_t);
+			
+			// round size up to pointer alignment boundary
+			size_t full_size = (size + (sizeof(void*) - 1)) & ~(sizeof(void*) - 1);
+
+			xml_memory_page* page;
+			xml_memory_string_header* header = static_cast<xml_memory_string_header*>(allocate_memory(full_size, page));
+
+			if (!header) return 0;
+
+			// setup header
+			ptrdiff_t page_offset = reinterpret_cast<char*>(header) - page->data;
+
+			assert(page_offset >= 0 && page_offset < (1 << 16));
+			header->page_offset = static_cast<uint16_t>(page_offset);
+
+			// full_size == 0 for large strings that occupy the whole page
+			assert(full_size < (1 << 16) || (page->busy_size == full_size && page_offset == 0));
+			header->full_size = static_cast<uint16_t>(full_size < (1 << 16) ? full_size : 0);
+
+			// round-trip through void* to avoid 'cast increases required alignment of target type' warning
+			// header is guaranteed a pointer-sized alignment, which should be enough for char_t
+			return static_cast<char_t*>(static_cast<void*>(header + 1));
+		}
+
+		void deallocate_string(char_t* string)
+		{
+			// this function casts pointers through void* to avoid 'cast increases required alignment of target type' warnings
+			// we're guaranteed the proper (pointer-sized) alignment on the input string if it was allocated via allocate_string
+
+			// get header
+			xml_memory_string_header* header = static_cast<xml_memory_string_header*>(static_cast<void*>(string)) - 1;
+
+			// deallocate
+			size_t page_offset = offsetof(xml_memory_page, data) + header->page_offset;
+			xml_memory_page* page = reinterpret_cast<xml_memory_page*>(static_cast<void*>(reinterpret_cast<char*>(header) - page_offset));
+
+			// if full_size == 0 then this string occupies the whole page
+			size_t full_size = header->full_size == 0 ? page->busy_size : header->full_size;
+
+			deallocate_memory(header, full_size, page);
+		}
+
+		xml_memory_page* _root;
+		size_t _busy_size;
+	};
+
+	PUGI__FN_NO_INLINE void* xml_allocator::allocate_memory_oob(size_t size, xml_memory_page*& out_page)
+	{
+		const size_t large_allocation_threshold = xml_memory_page_size / 4;
+
+		xml_memory_page* page = allocate_page(size <= large_allocation_threshold ? xml_memory_page_size : size);
+		out_page = page;
+
+		if (!page) return 0;
+
+		if (size <= large_allocation_threshold)
+		{
+			_root->busy_size = _busy_size;
+
+			// insert page at the end of linked list
+			page->prev = _root;
+			_root->next = page;
+			_root = page;
+
+			_busy_size = size;
+		}
+		else
+		{
+			// insert page before the end of linked list, so that it is deleted as soon as possible
+			// the last page is not deleted even if it's empty (see deallocate_memory)
+			assert(_root->prev);
+
+			page->prev = _root->prev;
+			page->next = _root;
+
+			_root->prev->next = page;
+			_root->prev = page;
+		}
+
+		// allocate inside page
+		page->busy_size = size;
+
+		return page->data;
+	}
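+
+	// Out-of-band path: requests up to a quarter of the page size start a fresh current
+	// page, while larger requests get a dedicated page linked just before the current
+	// one so that it can be released as soon as its single block is freed.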
+PUGI__NS_END
+
+namespace pugi
+{
+	/// A 'name=value' XML attribute structure.
+	struct xml_attribute_struct
+	{
+		/// Default ctor
+		xml_attribute_struct(impl::xml_memory_page* page): header(reinterpret_cast<uintptr_t>(page)), name(0), value(0), prev_attribute_c(0), next_attribute(0)
+		{
+		}
+
+		uintptr_t header;
+
+		char_t* name;	///< Pointer to attribute name.
+		char_t*	value;	///< Pointer to attribute value.
+
+		xml_attribute_struct* prev_attribute_c;	///< Previous attribute (cyclic list)
+		xml_attribute_struct* next_attribute;	///< Next attribute
+	};
+
+	/// An XML document tree node.
+	struct xml_node_struct
+	{
+		/// Default ctor
+		/// \param type - node type
+		xml_node_struct(impl::xml_memory_page* page, xml_node_type type): header(reinterpret_cast<uintptr_t>(page) | (type - 1)), parent(0), name(0), value(0), first_child(0), prev_sibling_c(0), next_sibling(0), first_attribute(0)
+		{
+		}
+
+		uintptr_t header;
+
+		xml_node_struct*		parent;					///< Pointer to parent
+
+		char_t*					name;					///< Pointer to element name.
+		char_t*					value;					///< Pointer to any associated string data.
+
+		xml_node_struct*		first_child;			///< First child
+		
+		xml_node_struct*		prev_sibling_c;			///< Left brother (cyclic list)
+		xml_node_struct*		next_sibling;			///< Right brother
+		
+		xml_attribute_struct*	first_attribute;		///< First attribute
+	};
+}
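+
+// Sibling and attribute lists use a compact cyclic convention: prev_sibling_c /
+// prev_attribute_c of the first element points to the last one, while the next
+// pointers are null-terminated. This allows O(1) append (see append_node and
+// append_attribute_ll below) without storing an explicit last-child pointer.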
+
+PUGI__NS_BEGIN
+	struct xml_extra_buffer
+	{
+		char_t* buffer;
+		xml_extra_buffer* next;
+	};
+
+	struct xml_document_struct: public xml_node_struct, public xml_allocator
+	{
+		xml_document_struct(xml_memory_page* page): xml_node_struct(page, node_document), xml_allocator(page), buffer(0), extra_buffers(0)
+		{
+		}
+
+		const char_t* buffer;
+
+		xml_extra_buffer* extra_buffers;
+	};
+
+	inline xml_allocator& get_allocator(const xml_node_struct* node)
+	{
+		assert(node);
+
+		return *reinterpret_cast<xml_memory_page*>(node->header & xml_memory_page_pointer_mask)->allocator;
+	}
+PUGI__NS_END
+
+// Low-level DOM operations
+PUGI__NS_BEGIN
+	inline xml_attribute_struct* allocate_attribute(xml_allocator& alloc)
+	{
+		xml_memory_page* page;
+		void* memory = alloc.allocate_memory(sizeof(xml_attribute_struct), page);
+
+		return new (memory) xml_attribute_struct(page);
+	}
+
+	inline xml_node_struct* allocate_node(xml_allocator& alloc, xml_node_type type)
+	{
+		xml_memory_page* page;
+		void* memory = alloc.allocate_memory(sizeof(xml_node_struct), page);
+
+		return new (memory) xml_node_struct(page, type);
+	}
+
+	inline void destroy_attribute(xml_attribute_struct* a, xml_allocator& alloc)
+	{
+		uintptr_t header = a->header;
+
+		if (header & impl::xml_memory_page_name_allocated_mask) alloc.deallocate_string(a->name);
+		if (header & impl::xml_memory_page_value_allocated_mask) alloc.deallocate_string(a->value);
+
+		alloc.deallocate_memory(a, sizeof(xml_attribute_struct), reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask));
+	}
+
+	inline void destroy_node(xml_node_struct* n, xml_allocator& alloc)
+	{
+		uintptr_t header = n->header;
+
+		if (header & impl::xml_memory_page_name_allocated_mask) alloc.deallocate_string(n->name);
+		if (header & impl::xml_memory_page_value_allocated_mask) alloc.deallocate_string(n->value);
+
+		for (xml_attribute_struct* attr = n->first_attribute; attr; )
+		{
+			xml_attribute_struct* next = attr->next_attribute;
+
+			destroy_attribute(attr, alloc);
+
+			attr = next;
+		}
+
+		for (xml_node_struct* child = n->first_child; child; )
+		{
+			xml_node_struct* next = child->next_sibling;
+
+			destroy_node(child, alloc);
+
+			child = next;
+		}
+
+		alloc.deallocate_memory(n, sizeof(xml_node_struct), reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask));
+	}
+
+	PUGI__FN_NO_INLINE xml_node_struct* append_node(xml_node_struct* node, xml_allocator& alloc, xml_node_type type = node_element)
+	{
+		xml_node_struct* child = allocate_node(alloc, type);
+		if (!child) return 0;
+
+		child->parent = node;
+
+		xml_node_struct* first_child = node->first_child;
+			
+		if (first_child)
+		{
+			xml_node_struct* last_child = first_child->prev_sibling_c;
+
+			last_child->next_sibling = child;
+			child->prev_sibling_c = last_child;
+			first_child->prev_sibling_c = child;
+		}
+		else
+		{
+			node->first_child = child;
+			child->prev_sibling_c = child;
+		}
+			
+		return child;
+	}
+
+	PUGI__FN_NO_INLINE xml_attribute_struct* append_attribute_ll(xml_node_struct* node, xml_allocator& alloc)
+	{
+		xml_attribute_struct* a = allocate_attribute(alloc);
+		if (!a) return 0;
+
+		xml_attribute_struct* first_attribute = node->first_attribute;
+
+		if (first_attribute)
+		{
+			xml_attribute_struct* last_attribute = first_attribute->prev_attribute_c;
+
+			last_attribute->next_attribute = a;
+			a->prev_attribute_c = last_attribute;
+			first_attribute->prev_attribute_c = a;
+		}
+		else
+		{
+			node->first_attribute = a;
+			a->prev_attribute_c = a;
+		}
+			
+		return a;
+	}
+PUGI__NS_END
+
+// Helper classes for code generation
+PUGI__NS_BEGIN
+	struct opt_false
+	{
+		enum { value = 0 };
+	};
+
+	struct opt_true
+	{
+		enum { value = 1 };
+	};
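+
+	// These compile-time booleans are passed as template parameters instead of runtime
+	// flags, so option checks inside the hot parsing and conversion loops fold to
+	// constants and the dead branches are removed by the compiler.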
+PUGI__NS_END
+
+// Unicode utilities
+PUGI__NS_BEGIN
+	inline uint16_t endian_swap(uint16_t value)
+	{
+		return static_cast<uint16_t>(((value & 0xff) << 8) | (value >> 8));
+	}
+
+	inline uint32_t endian_swap(uint32_t value)
+	{
+		return ((value & 0xff) << 24) | ((value & 0xff00) << 8) | ((value & 0xff0000) >> 8) | (value >> 24);
+	}
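+
+	// Each target encoding is described by a pair of traits types: a *_counter that
+	// measures the output size (first pass) and a *_writer that emits code units
+	// (second pass). Both expose low() for BMP code points and high() for supplementary
+	// ones, so the shared utf_decoder loops below work for any output.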
+
+	struct utf8_counter
+	{
+		typedef size_t value_type;
+
+		static value_type low(value_type result, uint32_t ch)
+		{
+			// U+0000..U+007F
+			if (ch < 0x80) return result + 1;
+			// U+0080..U+07FF
+			else if (ch < 0x800) return result + 2;
+			// U+0800..U+FFFF
+			else return result + 3;
+		}
+
+		static value_type high(value_type result, uint32_t)
+		{
+			// U+10000..U+10FFFF
+			return result + 4;
+		}
+	};
+
+	struct utf8_writer
+	{
+		typedef uint8_t* value_type;
+
+		static value_type low(value_type result, uint32_t ch)
+		{
+			// U+0000..U+007F
+			if (ch < 0x80)
+			{
+				*result = static_cast<uint8_t>(ch);
+				return result + 1;
+			}
+			// U+0080..U+07FF
+			else if (ch < 0x800)
+			{
+				result[0] = static_cast<uint8_t>(0xC0 | (ch >> 6));
+				result[1] = static_cast<uint8_t>(0x80 | (ch & 0x3F));
+				return result + 2;
+			}
+			// U+0800..U+FFFF
+			else
+			{
+				result[0] = static_cast<uint8_t>(0xE0 | (ch >> 12));
+				result[1] = static_cast<uint8_t>(0x80 | ((ch >> 6) & 0x3F));
+				result[2] = static_cast<uint8_t>(0x80 | (ch & 0x3F));
+				return result + 3;
+			}
+		}
+
+		static value_type high(value_type result, uint32_t ch)
+		{
+			// U+10000..U+10FFFF
+			result[0] = static_cast<uint8_t>(0xF0 | (ch >> 18));
+			result[1] = static_cast<uint8_t>(0x80 | ((ch >> 12) & 0x3F));
+			result[2] = static_cast<uint8_t>(0x80 | ((ch >> 6) & 0x3F));
+			result[3] = static_cast<uint8_t>(0x80 | (ch & 0x3F));
+			return result + 4;
+		}
+
+		static value_type any(value_type result, uint32_t ch)
+		{
+			return (ch < 0x10000) ? low(result, ch) : high(result, ch);
+		}
+	};
+
+	struct utf16_counter
+	{
+		typedef size_t value_type;
+
+		static value_type low(value_type result, uint32_t)
+		{
+			return result + 1;
+		}
+
+		static value_type high(value_type result, uint32_t)
+		{
+			return result + 2;
+		}
+	};
+
+	struct utf16_writer
+	{
+		typedef uint16_t* value_type;
+
+		static value_type low(value_type result, uint32_t ch)
+		{
+			*result = static_cast<uint16_t>(ch);
+
+			return result + 1;
+		}
+
+		static value_type high(value_type result, uint32_t ch)
+		{
+			uint32_t msh = static_cast<uint32_t>(ch - 0x10000) >> 10;
+			uint32_t lsh = static_cast<uint32_t>(ch - 0x10000) & 0x3ff;
+
+			result[0] = static_cast<uint16_t>(0xD800 + msh);
+			result[1] = static_cast<uint16_t>(0xDC00 + lsh);
+
+			return result + 2;
+		}
+
+		static value_type any(value_type result, uint32_t ch)
+		{
+			return (ch < 0x10000) ? low(result, ch) : high(result, ch);
+		}
+	};
+
+	struct utf32_counter
+	{
+		typedef size_t value_type;
+
+		static value_type low(value_type result, uint32_t)
+		{
+			return result + 1;
+		}
+
+		static value_type high(value_type result, uint32_t)
+		{
+			return result + 1;
+		}
+	};
+
+	struct utf32_writer
+	{
+		typedef uint32_t* value_type;
+
+		static value_type low(value_type result, uint32_t ch)
+		{
+			*result = ch;
+
+			return result + 1;
+		}
+
+		static value_type high(value_type result, uint32_t ch)
+		{
+			*result = ch;
+
+			return result + 1;
+		}
+
+		static value_type any(value_type result, uint32_t ch)
+		{
+			*result = ch;
+
+			return result + 1;
+		}
+	};
+
+	struct latin1_writer
+	{
+		typedef uint8_t* value_type;
+
+		static value_type low(value_type result, uint32_t ch)
+		{
+			*result = static_cast<uint8_t>(ch > 255 ? '?' : ch);
+
+			return result + 1;
+		}
+
+		static value_type high(value_type result, uint32_t ch)
+		{
+			(void)ch;
+
+			*result = '?';
+
+			return result + 1;
+		}
+	};
+
+	template <size_t size> struct wchar_selector;
+
+	template <> struct wchar_selector<2>
+	{
+		typedef uint16_t type;
+		typedef utf16_counter counter;
+		typedef utf16_writer writer;
+	};
+
+	template <> struct wchar_selector<4>
+	{
+		typedef uint32_t type;
+		typedef utf32_counter counter;
+		typedef utf32_writer writer;
+	};
+
+	typedef wchar_selector<sizeof(wchar_t)>::counter wchar_counter;
+	typedef wchar_selector<sizeof(wchar_t)>::writer wchar_writer;
+
+	template <typename Traits, typename opt_swap = opt_false> struct utf_decoder
+	{
+		static inline typename Traits::value_type decode_utf8_block(const uint8_t* data, size_t size, typename Traits::value_type result)
+		{
+			const uint8_t utf8_byte_mask = 0x3f;
+
+			while (size)
+			{
+				uint8_t lead = *data;
+
+				// 0xxxxxxx -> U+0000..U+007F
+				if (lead < 0x80)
+				{
+					result = Traits::low(result, lead);
+					data += 1;
+					size -= 1;
+
+					// process aligned single-byte (ascii) blocks
+					if ((reinterpret_cast<uintptr_t>(data) & 3) == 0)
+					{
+						// round-trip through void* to silence 'cast increases required alignment of target type' warnings
+						while (size >= 4 && (*static_cast<const uint32_t*>(static_cast<const void*>(data)) & 0x80808080) == 0)
+						{
+							result = Traits::low(result, data[0]);
+							result = Traits::low(result, data[1]);
+							result = Traits::low(result, data[2]);
+							result = Traits::low(result, data[3]);
+							data += 4;
+							size -= 4;
+						}
+					}
+				}
+				// 110xxxxx -> U+0080..U+07FF
+				else if (static_cast<unsigned int>(lead - 0xC0) < 0x20 && size >= 2 && (data[1] & 0xc0) == 0x80)
+				{
+					result = Traits::low(result, ((lead & ~0xC0) << 6) | (data[1] & utf8_byte_mask));
+					data += 2;
+					size -= 2;
+				}
+				// 1110xxxx -> U+0800..U+FFFF
+				else if (static_cast<unsigned int>(lead - 0xE0) < 0x10 && size >= 3 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80)
+				{
+					result = Traits::low(result, ((lead & ~0xE0) << 12) | ((data[1] & utf8_byte_mask) << 6) | (data[2] & utf8_byte_mask));
+					data += 3;
+					size -= 3;
+				}
+				// 11110xxx -> U+10000..U+10FFFF
+				else if (static_cast<unsigned int>(lead - 0xF0) < 0x08 && size >= 4 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80 && (data[3] & 0xc0) == 0x80)
+				{
+					result = Traits::high(result, ((lead & ~0xF0) << 18) | ((data[1] & utf8_byte_mask) << 12) | ((data[2] & utf8_byte_mask) << 6) | (data[3] & utf8_byte_mask));
+					data += 4;
+					size -= 4;
+				}
+				// 10xxxxxx or 11111xxx -> invalid
+				else
+				{
+					data += 1;
+					size -= 1;
+				}
+			}
+
+			return result;
+		}
+
+		static inline typename Traits::value_type decode_utf16_block(const uint16_t* data, size_t size, typename Traits::value_type result)
+		{
+			const uint16_t* end = data + size;
+
+			while (data < end)
+			{
+				unsigned int lead = opt_swap::value ? endian_swap(*data) : *data;
+
+				// U+0000..U+D7FF
+				if (lead < 0xD800)
+				{
+					result = Traits::low(result, lead);
+					data += 1;
+				}
+				// U+E000..U+FFFF
+				else if (static_cast<unsigned int>(lead - 0xE000) < 0x2000)
+				{
+					result = Traits::low(result, lead);
+					data += 1;
+				}
+				// surrogate pair lead
+				else if (static_cast<unsigned int>(lead - 0xD800) < 0x400 && data + 1 < end)
+				{
+					uint16_t next = opt_swap::value ? endian_swap(data[1]) : data[1];
+
+					if (static_cast<unsigned int>(next - 0xDC00) < 0x400)
+					{
+						result = Traits::high(result, 0x10000 + ((lead & 0x3ff) << 10) + (next & 0x3ff));
+						data += 2;
+					}
+					else
+					{
+						data += 1;
+					}
+				}
+				else
+				{
+					data += 1;
+				}
+			}
+
+			return result;
+		}
+
+		static inline typename Traits::value_type decode_utf32_block(const uint32_t* data, size_t size, typename Traits::value_type result)
+		{
+			const uint32_t* end = data + size;
+
+			while (data < end)
+			{
+				uint32_t lead = opt_swap::value ? endian_swap(*data) : *data;
+
+				// U+0000..U+FFFF
+				if (lead < 0x10000)
+				{
+					result = Traits::low(result, lead);
+					data += 1;
+				}
+				// U+10000..U+10FFFF
+				else
+				{
+					result = Traits::high(result, lead);
+					data += 1;
+				}
+			}
+
+			return result;
+		}
+
+		static inline typename Traits::value_type decode_latin1_block(const uint8_t* data, size_t size, typename Traits::value_type result)
+		{
+			for (size_t i = 0; i < size; ++i)
+			{
+				result = Traits::low(result, data[i]);
+			}
+
+			return result;
+		}
+
+		static inline typename Traits::value_type decode_wchar_block_impl(const uint16_t* data, size_t size, typename Traits::value_type result)
+		{
+			return decode_utf16_block(data, size, result);
+		}
+
+		static inline typename Traits::value_type decode_wchar_block_impl(const uint32_t* data, size_t size, typename Traits::value_type result)
+		{
+			return decode_utf32_block(data, size, result);
+		}
+
+		static inline typename Traits::value_type decode_wchar_block(const wchar_t* data, size_t size, typename Traits::value_type result)
+		{
+			return decode_wchar_block_impl(reinterpret_cast<const wchar_selector<sizeof(wchar_t)>::type*>(data), size, result);
+		}
+	};
+
+	template <typename T> PUGI__FN void convert_utf_endian_swap(T* result, const T* data, size_t length)
+	{
+		for (size_t i = 0; i < length; ++i) result[i] = endian_swap(data[i]);
+	}
+
+#ifdef PUGIXML_WCHAR_MODE
+	PUGI__FN void convert_wchar_endian_swap(wchar_t* result, const wchar_t* data, size_t length)
+	{
+		for (size_t i = 0; i < length; ++i) result[i] = static_cast<wchar_t>(endian_swap(static_cast<wchar_selector<sizeof(wchar_t)>::type>(data[i])));
+	}
+#endif
+PUGI__NS_END
+
+PUGI__NS_BEGIN
+	enum chartype_t
+	{
+		ct_parse_pcdata = 1,	// \0, &, \r, <
+		ct_parse_attr = 2,		// \0, &, \r, ', "
+		ct_parse_attr_ws = 4,	// \0, &, \r, ', ", \n, tab
+		ct_space = 8,			// \r, \n, space, tab
+		ct_parse_cdata = 16,	// \0, ], >, \r
+		ct_parse_comment = 32,	// \0, -, >, \r
+		ct_symbol = 64,			// Any symbol > 127, a-z, A-Z, 0-9, _, :, -, .
+		ct_start_symbol = 128	// Any symbol > 127, a-z, A-Z, _, :
+	};
+
+	static const unsigned char chartype_table[256] =
+	{
+		55,  0,   0,   0,   0,   0,   0,   0,      0,   12,  12,  0,   0,   63,  0,   0,   // 0-15
+		0,   0,   0,   0,   0,   0,   0,   0,      0,   0,   0,   0,   0,   0,   0,   0,   // 16-31
+		8,   0,   6,   0,   0,   0,   7,   6,      0,   0,   0,   0,   0,   96,  64,  0,   // 32-47
+		64,  64,  64,  64,  64,  64,  64,  64,     64,  64,  192, 0,   1,   0,   48,  0,   // 48-63
+		0,   192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192, // 64-79
+		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 0,   0,   16,  0,   192, // 80-95
+		0,   192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192, // 96-111
+		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 0, 0, 0, 0, 0,           // 112-127
+
+		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192, // 128+
+		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192
+	};
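+
+	// Each table entry is the OR of the chartype_t flags that apply to that byte; for
+	// example entry 0 (the '\0' terminator) is 55 = ct_parse_pcdata (1) + ct_parse_attr (2)
+	// + ct_parse_attr_ws (4) + ct_parse_cdata (16) + ct_parse_comment (32), so that a
+	// null byte stops every scanning loop.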
+
+	enum chartypex_t
+	{
+		ctx_special_pcdata = 1,   // Any symbol >= 0 and < 32 (except \t, \r, \n), &, <, >
+		ctx_special_attr = 2,     // Any symbol >= 0 and < 32 (except \t), &, <, >, "
+		ctx_start_symbol = 4,	  // Any symbol > 127, a-z, A-Z, _
+		ctx_digit = 8,			  // 0-9
+		ctx_symbol = 16			  // Any symbol > 127, a-z, A-Z, 0-9, _, -, .
+	};
+	
+	static const unsigned char chartypex_table[256] =
+	{
+		3,  3,  3,  3,  3,  3,  3,  3,     3,  0,  2,  3,  3,  2,  3,  3,     // 0-15
+		3,  3,  3,  3,  3,  3,  3,  3,     3,  3,  3,  3,  3,  3,  3,  3,     // 16-31
+		0,  0,  2,  0,  0,  0,  3,  0,     0,  0,  0,  0,  0, 16, 16,  0,     // 32-47
+		24, 24, 24, 24, 24, 24, 24, 24,    24, 24, 0,  0,  3,  0,  3,  0,     // 48-63
+
+		0,  20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,    // 64-79
+		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 0,  0,  0,  0,  20,    // 80-95
+		0,  20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,    // 96-111
+		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 0,  0,  0,  0,  0,     // 112-127
+
+		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,    // 128+
+		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20
+	};
+	
+#ifdef PUGIXML_WCHAR_MODE
+	#define PUGI__IS_CHARTYPE_IMPL(c, ct, table) ((static_cast<unsigned int>(c) < 128 ? table[static_cast<unsigned int>(c)] : table[128]) & (ct))
+#else
+	#define PUGI__IS_CHARTYPE_IMPL(c, ct, table) (table[static_cast<unsigned char>(c)] & (ct))
+#endif
+
+	#define PUGI__IS_CHARTYPE(c, ct) PUGI__IS_CHARTYPE_IMPL(c, ct, chartype_table)
+	#define PUGI__IS_CHARTYPEX(c, ct) PUGI__IS_CHARTYPE_IMPL(c, ct, chartypex_table)
+
+	PUGI__FN bool is_little_endian()
+	{
+		unsigned int ui = 1;
+
+		return *reinterpret_cast<unsigned char*>(&ui) == 1;
+	}
+
+	PUGI__FN xml_encoding get_wchar_encoding()
+	{
+		PUGI__STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4);
+
+		if (sizeof(wchar_t) == 2)
+			return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+		else 
+			return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+	}
+
+	PUGI__FN xml_encoding guess_buffer_encoding(uint8_t d0, uint8_t d1, uint8_t d2, uint8_t d3)
+	{
+		// look for BOM in first few bytes
+		if (d0 == 0 && d1 == 0 && d2 == 0xfe && d3 == 0xff) return encoding_utf32_be;
+		if (d0 == 0xff && d1 == 0xfe && d2 == 0 && d3 == 0) return encoding_utf32_le;
+		if (d0 == 0xfe && d1 == 0xff) return encoding_utf16_be;
+		if (d0 == 0xff && d1 == 0xfe) return encoding_utf16_le;
+		if (d0 == 0xef && d1 == 0xbb && d2 == 0xbf) return encoding_utf8;
+
+		// look for <, <? or <?xm in various encodings
+		if (d0 == 0 && d1 == 0 && d2 == 0 && d3 == 0x3c) return encoding_utf32_be;
+		if (d0 == 0x3c && d1 == 0 && d2 == 0 && d3 == 0) return encoding_utf32_le;
+		if (d0 == 0 && d1 == 0x3c && d2 == 0 && d3 == 0x3f) return encoding_utf16_be;
+		if (d0 == 0x3c && d1 == 0 && d2 == 0x3f && d3 == 0) return encoding_utf16_le;
+		if (d0 == 0x3c && d1 == 0x3f && d2 == 0x78 && d3 == 0x6d) return encoding_utf8;
+
+		// look for a utf16 '<' followed by a node name (this heuristic may fail, but it beats assuming utf8, since a utf8 interpretation would hit the embedded zero byte this early)
+		if (d0 == 0 && d1 == 0x3c) return encoding_utf16_be;
+		if (d0 == 0x3c && d1 == 0) return encoding_utf16_le;
+
+		// no known BOM detected, assume utf8
+		return encoding_utf8;
+	}
+
+	PUGI__FN xml_encoding get_buffer_encoding(xml_encoding encoding, const void* contents, size_t size)
+	{
+		// replace wchar encoding with utf implementation
+		if (encoding == encoding_wchar) return get_wchar_encoding();
+
+		// replace utf16 encoding with utf16 with specific endianness
+		if (encoding == encoding_utf16) return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+		// replace utf32 encoding with utf32 with specific endianness
+		if (encoding == encoding_utf32) return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+		// only do autodetection if no explicit encoding is requested
+		if (encoding != encoding_auto) return encoding;
+
+		// skip encoding autodetection if input buffer is too small
+		if (size < 4) return encoding_utf8;
+
+		// try to guess encoding (based on XML specification, Appendix F.1)
+		const uint8_t* data = static_cast<const uint8_t*>(contents);
+
+		PUGI__DMC_VOLATILE uint8_t d0 = data[0], d1 = data[1], d2 = data[2], d3 = data[3];
+
+		return guess_buffer_encoding(d0, d1, d2, d3);
+	}
+
+	PUGI__FN bool get_mutable_buffer(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
+	{
+		size_t length = size / sizeof(char_t);
+
+		if (is_mutable)
+		{
+			out_buffer = static_cast<char_t*>(const_cast<void*>(contents));
+			out_length = length;
+		}
+		else
+		{
+			char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+			if (!buffer) return false;
+
+			memcpy(buffer, contents, length * sizeof(char_t));
+			buffer[length] = 0;
+
+			out_buffer = buffer;
+			out_length = length + 1;
+		}
+
+		return true;
+	}
+
+#ifdef PUGIXML_WCHAR_MODE
+	PUGI__FN bool need_endian_swap_utf(xml_encoding le, xml_encoding re)
+	{
+		return (le == encoding_utf16_be && re == encoding_utf16_le) || (le == encoding_utf16_le && re == encoding_utf16_be) ||
+			   (le == encoding_utf32_be && re == encoding_utf32_le) || (le == encoding_utf32_le && re == encoding_utf32_be);
+	}
+
+	PUGI__FN bool convert_buffer_endian_swap(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
+	{
+		const char_t* data = static_cast<const char_t*>(contents);
+		size_t length = size / sizeof(char_t);
+
+		if (is_mutable)
+		{
+			char_t* buffer = const_cast<char_t*>(data);
+
+			convert_wchar_endian_swap(buffer, data, length);
+
+			out_buffer = buffer;
+			out_length = length;
+		}
+		else
+		{
+			char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+			if (!buffer) return false;
+
+			convert_wchar_endian_swap(buffer, data, length);
+			buffer[length] = 0;
+
+			out_buffer = buffer;
+			out_length = length + 1;
+		}
+
+		return true;
+	}
+
+	PUGI__FN bool convert_buffer_utf8(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size)
+	{
+		const uint8_t* data = static_cast<const uint8_t*>(contents);
+		size_t data_length = size;
+
+		// first pass: get length in wchar_t units
+		size_t length = utf_decoder<wchar_counter>::decode_utf8_block(data, data_length, 0);
+
+		// allocate buffer of suitable length
+		char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+		if (!buffer) return false;
+
+		// second pass: convert utf8 input to wchar_t
+		wchar_writer::value_type obegin = reinterpret_cast<wchar_writer::value_type>(buffer);
+		wchar_writer::value_type oend = utf_decoder<wchar_writer>::decode_utf8_block(data, data_length, obegin);
+
+		assert(oend == obegin + length);
+		*oend = 0;
+
+		out_buffer = buffer;
+		out_length = length + 1;
+
+		return true;
+	}
+
+	template <typename opt_swap> PUGI__FN bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
+	{
+		const uint16_t* data = static_cast<const uint16_t*>(contents);
+		size_t data_length = size / sizeof(uint16_t);
+
+		// first pass: get length in wchar_t units
+		size_t length = utf_decoder<wchar_counter, opt_swap>::decode_utf16_block(data, data_length, 0);
+
+		// allocate buffer of suitable length
+		char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+		if (!buffer) return false;
+
+		// second pass: convert utf16 input to wchar_t
+		wchar_writer::value_type obegin = reinterpret_cast<wchar_writer::value_type>(buffer);
+		wchar_writer::value_type oend = utf_decoder<wchar_writer, opt_swap>::decode_utf16_block(data, data_length, obegin);
+
+		assert(oend == obegin + length);
+		*oend = 0;
+
+		out_buffer = buffer;
+		out_length = length + 1;
+
+		return true;
+	}
+
+	template <typename opt_swap> PUGI__FN bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
+	{
+		const uint32_t* data = static_cast<const uint32_t*>(contents);
+		size_t data_length = size / sizeof(uint32_t);
+
+		// first pass: get length in wchar_t units
+		size_t length = utf_decoder<wchar_counter, opt_swap>::decode_utf32_block(data, data_length, 0);
+
+		// allocate buffer of suitable length
+		char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+		if (!buffer) return false;
+
+		// second pass: convert utf32 input to wchar_t
+		wchar_writer::value_type obegin = reinterpret_cast<wchar_writer::value_type>(buffer);
+		wchar_writer::value_type oend = utf_decoder<wchar_writer, opt_swap>::decode_utf32_block(data, data_length, obegin);
+
+		assert(oend == obegin + length);
+		*oend = 0;
+
+		out_buffer = buffer;
+		out_length = length + 1;
+
+		return true;
+	}
+
+	PUGI__FN bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size)
+	{
+		const uint8_t* data = static_cast<const uint8_t*>(contents);
+		size_t data_length = size;
+
+		// get length in wchar_t units
+		size_t length = data_length;
+
+		// allocate buffer of suitable length
+		char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+		if (!buffer) return false;
+
+		// convert latin1 input to wchar_t
+		wchar_writer::value_type obegin = reinterpret_cast<wchar_writer::value_type>(buffer);
+		wchar_writer::value_type oend = utf_decoder<wchar_writer>::decode_latin1_block(data, data_length, obegin);
+
+		assert(oend == obegin + length);
+		*oend = 0;
+
+		out_buffer = buffer;
+		out_length = length + 1;
+
+		return true;
+	}
+
+	PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable)
+	{
+		// get native encoding
+		xml_encoding wchar_encoding = get_wchar_encoding();
+
+		// fast path: no conversion required
+		if (encoding == wchar_encoding) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
+
+		// only endian-swapping is required
+		if (need_endian_swap_utf(encoding, wchar_encoding)) return convert_buffer_endian_swap(out_buffer, out_length, contents, size, is_mutable);
+
+		// source encoding is utf8
+		if (encoding == encoding_utf8) return convert_buffer_utf8(out_buffer, out_length, contents, size);
+
+		// source encoding is utf16
+		if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
+		{
+			xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+			return (native_encoding == encoding) ?
+				convert_buffer_utf16(out_buffer, out_length, contents, size, opt_false()) :
+				convert_buffer_utf16(out_buffer, out_length, contents, size, opt_true());
+		}
+
+		// source encoding is utf32
+		if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
+		{
+			xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+			return (native_encoding == encoding) ?
+				convert_buffer_utf32(out_buffer, out_length, contents, size, opt_false()) :
+				convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true());
+		}
+
+		// source encoding is latin1
+		if (encoding == encoding_latin1) return convert_buffer_latin1(out_buffer, out_length, contents, size);
+
+		assert(!"Invalid encoding");
+		return false;
+	}
+#else
+	template <typename opt_swap> PUGI__FN bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
+	{
+		const uint16_t* data = static_cast<const uint16_t*>(contents);
+		size_t data_length = size / sizeof(uint16_t);
+
+		// first pass: get length in utf8 units
+		size_t length = utf_decoder<utf8_counter, opt_swap>::decode_utf16_block(data, data_length, 0);
+
+		// allocate buffer of suitable length
+		char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+		if (!buffer) return false;
+
+		// second pass: convert utf16 input to utf8
+		uint8_t* obegin = reinterpret_cast<uint8_t*>(buffer);
+		uint8_t* oend = utf_decoder<utf8_writer, opt_swap>::decode_utf16_block(data, data_length, obegin);
+
+		assert(oend == obegin + length);
+		*oend = 0;
+
+		out_buffer = buffer;
+		out_length = length + 1;
+
+		return true;
+	}
+
+	template <typename opt_swap> PUGI__FN bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
+	{
+		const uint32_t* data = static_cast<const uint32_t*>(contents);
+		size_t data_length = size / sizeof(uint32_t);
+
+		// first pass: get length in utf8 units
+		size_t length = utf_decoder<utf8_counter, opt_swap>::decode_utf32_block(data, data_length, 0);
+
+		// allocate buffer of suitable length
+		char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+		if (!buffer) return false;
+
+		// second pass: convert utf32 input to utf8
+		uint8_t* obegin = reinterpret_cast<uint8_t*>(buffer);
+		uint8_t* oend = utf_decoder<utf8_writer, opt_swap>::decode_utf32_block(data, data_length, obegin);
+
+		assert(oend == obegin + length);
+		*oend = 0;
+
+		out_buffer = buffer;
+		out_length = length + 1;
+
+		return true;
+	}
+
+	PUGI__FN size_t get_latin1_7bit_prefix_length(const uint8_t* data, size_t size)
+	{
+		for (size_t i = 0; i < size; ++i)
+			if (data[i] > 127)
+				return i;
+
+		return size;
+	}
+
+	PUGI__FN bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
+	{
+		const uint8_t* data = static_cast<const uint8_t*>(contents);
+		size_t data_length = size;
+
+		// get size of prefix that does not need utf8 conversion
+		size_t prefix_length = get_latin1_7bit_prefix_length(data, data_length);
+		assert(prefix_length <= data_length);
+
+		const uint8_t* postfix = data + prefix_length;
+		size_t postfix_length = data_length - prefix_length;
+
+		// if no conversion is needed, just return the original buffer
+		if (postfix_length == 0) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
+
+		// first pass: get length in utf8 units
+		size_t length = prefix_length + utf_decoder<utf8_counter>::decode_latin1_block(postfix, postfix_length, 0);
+
+		// allocate buffer of suitable length
+		char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+		if (!buffer) return false;
+
+		// second pass: convert latin1 input to utf8
+		memcpy(buffer, data, prefix_length);
+
+		uint8_t* obegin = reinterpret_cast<uint8_t*>(buffer);
+		uint8_t* oend = utf_decoder<utf8_writer>::decode_latin1_block(postfix, postfix_length, obegin + prefix_length);
+
+		assert(oend == obegin + length);
+		*oend = 0;
+
+		out_buffer = buffer;
+		out_length = length + 1;
+
+		return true;
+	}
+
+	PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable)
+	{
+		// fast path: no conversion required
+		if (encoding == encoding_utf8) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
+
+		// source encoding is utf16
+		if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
+		{
+			xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+			return (native_encoding == encoding) ?
+				convert_buffer_utf16(out_buffer, out_length, contents, size, opt_false()) :
+				convert_buffer_utf16(out_buffer, out_length, contents, size, opt_true());
+		}
+
+		// source encoding is utf32
+		if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
+		{
+			xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+			return (native_encoding == encoding) ?
+				convert_buffer_utf32(out_buffer, out_length, contents, size, opt_false()) :
+				convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true());
+		}
+
+		// source encoding is latin1
+		if (encoding == encoding_latin1) return convert_buffer_latin1(out_buffer, out_length, contents, size, is_mutable);
+
+		assert(!"Invalid encoding");
+		return false;
+	}
+#endif
+
+	PUGI__FN size_t as_utf8_begin(const wchar_t* str, size_t length)
+	{
+		// get length in utf8 characters
+		return utf_decoder<utf8_counter>::decode_wchar_block(str, length, 0);
+	}
+
+	PUGI__FN void as_utf8_end(char* buffer, size_t size, const wchar_t* str, size_t length)
+	{
+		// convert to utf8
+		uint8_t* begin = reinterpret_cast<uint8_t*>(buffer);
+		uint8_t* end = utf_decoder<utf8_writer>::decode_wchar_block(str, length, begin);
+	
+		assert(begin + size == end);
+		(void)!end;
+
+		// zero-terminate
+		buffer[size] = 0;
+	}
+	
+#ifndef PUGIXML_NO_STL
+	PUGI__FN std::string as_utf8_impl(const wchar_t* str, size_t length)
+	{
+		// first pass: get length in utf8 characters
+		size_t size = as_utf8_begin(str, length);
+
+		// allocate resulting string
+		std::string result;
+		result.resize(size);
+
+		// second pass: convert to utf8
+		if (size > 0) as_utf8_end(&result[0], size, str, length);
+
+		return result;
+	}
+
+	PUGI__FN std::basic_string<wchar_t> as_wide_impl(const char* str, size_t size)
+	{
+		const uint8_t* data = reinterpret_cast<const uint8_t*>(str);
+
+		// first pass: get length in wchar_t units
+		size_t length = utf_decoder<wchar_counter>::decode_utf8_block(data, size, 0);
+
+		// allocate resulting string
+		std::basic_string<wchar_t> result;
+		result.resize(length);
+
+		// second pass: convert to wchar_t
+		if (length > 0)
+		{
+			wchar_writer::value_type begin = reinterpret_cast<wchar_writer::value_type>(&result[0]);
+			wchar_writer::value_type end = utf_decoder<wchar_writer>::decode_utf8_block(data, size, begin);
+
+			assert(begin + length == end);
+			(void)!end;
+		}
+
+		return result;
+	}
+#endif
+
+	inline bool strcpy_insitu_allow(size_t length, uintptr_t allocated, char_t* target)
+	{
+		assert(target);
+		size_t target_length = strlength(target);
+
+		// always reuse document buffer memory if possible
+		if (!allocated) return target_length >= length;
+
+		// reuse heap memory if waste is not too great
+		const size_t reuse_threshold = 32;
+
+		return target_length >= length && (target_length < reuse_threshold || target_length - length < target_length / 2);
+	}
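+
+	// In other words: heap buffers shorter than 32 characters are reused whenever the
+	// new string fits, and longer ones are reused only while more than half of the
+	// existing allocation stays occupied, which bounds the memory wasted by repeated
+	// shrinking assignments.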
+
+	PUGI__FN bool strcpy_insitu(char_t*& dest, uintptr_t& header, uintptr_t header_mask, const char_t* source)
+	{
+		assert(header);
+
+		size_t source_length = strlength(source);
+
+		if (source_length == 0)
+		{
+			// empty string and null pointer are equivalent, so just deallocate old memory
+			xml_allocator* alloc = reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask)->allocator;
+
+			if (header & header_mask) alloc->deallocate_string(dest);
+			
+			// mark the string as not allocated
+			dest = 0;
+			header &= ~header_mask;
+
+			return true;
+		}
+		else if (dest && strcpy_insitu_allow(source_length, header & header_mask, dest))
+		{
+			// we can reuse old buffer, so just copy the new data (including zero terminator)
+			memcpy(dest, source, (source_length + 1) * sizeof(char_t));
+			
+			return true;
+		}
+		else
+		{
+			xml_allocator* alloc = reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask)->allocator;
+
+			// allocate new buffer
+			char_t* buf = alloc->allocate_string(source_length + 1);
+			if (!buf) return false;
+
+			// copy the string (including zero terminator)
+			memcpy(buf, source, (source_length + 1) * sizeof(char_t));
+
+			// deallocate old buffer (*after* the above to protect against overlapping memory and/or allocation failures)
+			if (header & header_mask) alloc->deallocate_string(dest);
+			
+			// the string is now allocated, so set the flag
+			dest = buf;
+			header |= header_mask;
+
+			return true;
+		}
+	}
+
+	struct gap
+	{
+		char_t* end;
+		size_t size;
+			
+		gap(): end(0), size(0)
+		{
+		}
+			
+		// Push new gap, move s count bytes further (skipping the gap).
+		// Collapse previous gap.
+		void push(char_t*& s, size_t count)
+		{
+			if (end) // there was a gap already; collapse it
+			{
+				// Move [old_gap_end, new_gap_start) to [old_gap_start, ...)
+				assert(s >= end);
+				memmove(end - size, end, reinterpret_cast<char*>(s) - reinterpret_cast<char*>(end));
+			}
+				
+			s += count; // end of current gap
+				
+			// "merge" two gaps
+			end = s;
+			size += count;
+		}
+			
+		// Collapse all gaps, return past-the-end pointer
+		char_t* flush(char_t* s)
+		{
+			if (end)
+			{
+				// Move [old_gap_end, current_pos) to [old_gap_start, ...)
+				assert(s >= end);
+				memmove(end - size, end, reinterpret_cast<char*>(s) - reinterpret_cast<char*>(end));
+
+				return s - size;
+			}
+			else return s;
+		}
+	};
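+
+	// The gap helper supports destructive in-place conversion: entity expansion and EOL
+	// normalization only ever shrink the text, so the holes they leave are merged into a
+	// single gap. Each push() slides only the text between consecutive holes and flush()
+	// compacts the final tail, so every character is moved at most once.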
+	
+	PUGI__FN char_t* strconv_escape(char_t* s, gap& g)
+	{
+		char_t* stre = s + 1;
+
+		switch (*stre)
+		{
+			case '#':	// &#...
+			{
+				unsigned int ucsc = 0;
+
+				if (stre[1] == 'x') // &#x... (hex code)
+				{
+					stre += 2;
+
+					char_t ch = *stre;
+
+					if (ch == ';') return stre;
+
+					for (;;)
+					{
+						if (static_cast<unsigned int>(ch - '0') <= 9)
+							ucsc = 16 * ucsc + (ch - '0');
+						else if (static_cast<unsigned int>((ch | ' ') - 'a') <= 5)
+							ucsc = 16 * ucsc + ((ch | ' ') - 'a' + 10);
+						else if (ch == ';')
+							break;
+						else // cancel
+							return stre;
+
+						ch = *++stre;
+					}
+					
+					++stre;
+				}
+				else	// &#... (dec code)
+				{
+					char_t ch = *++stre;
+
+					if (ch == ';') return stre;
+
+					for (;;)
+					{
+						if (static_cast<unsigned int>(static_cast<unsigned int>(ch) - '0') <= 9)
+							ucsc = 10 * ucsc + (ch - '0');
+						else if (ch == ';')
+							break;
+						else // cancel
+							return stre;
+
+						ch = *++stre;
+					}
+					
+					++stre;
+				}
+
+			#ifdef PUGIXML_WCHAR_MODE
+				s = reinterpret_cast<char_t*>(wchar_writer::any(reinterpret_cast<wchar_writer::value_type>(s), ucsc));
+			#else
+				s = reinterpret_cast<char_t*>(utf8_writer::any(reinterpret_cast<uint8_t*>(s), ucsc));
+			#endif
+					
+				g.push(s, stre - s);
+				return stre;
+			}
+
+			case 'a':	// &a
+			{
+				++stre;
+
+				if (*stre == 'm') // &am
+				{
+					if (*++stre == 'p' && *++stre == ';') // &
+					{
+						*s++ = '&';
+						++stre;
+							
+						g.push(s, stre - s);
+						return stre;
+					}
+				}
+				else if (*stre == 'p') // &ap
+				{
+					if (*++stre == 'o' && *++stre == 's' && *++stre == ';') // '
+					{
+						*s++ = '\'';
+						++stre;
+
+						g.push(s, stre - s);
+						return stre;
+					}
+				}
+				break;
+			}
+
+			case 'g': // &g
+			{
+				if (*++stre == 't' && *++stre == ';') // >
+				{
+					*s++ = '>';
+					++stre;
+					
+					g.push(s, stre - s);
+					return stre;
+				}
+				break;
+			}
+
+			case 'l': // &l
+			{
+				if (*++stre == 't' && *++stre == ';') // <
+				{
+					*s++ = '<';
+					++stre;
+						
+					g.push(s, stre - s);
+					return stre;
+				}
+				break;
+			}
+
+			case 'q': // &q
+			{
+				if (*++stre == 'u' && *++stre == 'o' && *++stre == 't' && *++stre == ';') // "
+				{
+					*s++ = '"';
+					++stre;
+					
+					g.push(s, stre - s);
+					return stre;
+				}
+				break;
+			}
+
+			default:
+				break;
+		}
+		
+		return stre;
+	}
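+
+	// strconv_escape expands numeric character references (&#...; and &#x...;) and the
+	// five predefined entities (&amp; &apos; &gt; &lt; &quot;) in place, recording the
+	// leftover characters as a gap; unrecognized sequences are left untouched.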
+
+	// Utility macro for last character handling
+	#define ENDSWITH(c, e) ((c) == (e) || ((c) == 0 && endch == (e)))
+
+	PUGI__FN char_t* strconv_comment(char_t* s, char_t endch)
+	{
+		gap g;
+		
+		while (true)
+		{
+			while (!PUGI__IS_CHARTYPE(*s, ct_parse_comment)) ++s;
+		
+			if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
+			{
+				*s++ = '\n'; // replace first one with 0x0a
+				
+				if (*s == '\n') g.push(s, 1);
+			}
+			else if (s[0] == '-' && s[1] == '-' && ENDSWITH(s[2], '>')) // comment ends here
+			{
+				*g.flush(s) = 0;
+				
+				return s + (s[2] == '>' ? 3 : 2);
+			}
+			else if (*s == 0)
+			{
+				return 0;
+			}
+			else ++s;
+		}
+	}
+
+	PUGI__FN char_t* strconv_cdata(char_t* s, char_t endch)
+	{
+		gap g;
+			
+		while (true)
+		{
+			while (!PUGI__IS_CHARTYPE(*s, ct_parse_cdata)) ++s;
+			
+			if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
+			{
+				*s++ = '\n'; // replace first one with 0x0a
+				
+				if (*s == '\n') g.push(s, 1);
+			}
+			else if (s[0] == ']' && s[1] == ']' && ENDSWITH(s[2], '>')) // CDATA ends here
+			{
+				*g.flush(s) = 0;
+				
+				return s + 1;
+			}
+			else if (*s == 0)
+			{
+				return 0;
+			}
+			else ++s;
+		}
+	}
+	
+	typedef char_t* (*strconv_pcdata_t)(char_t*);
+		
+	template <typename opt_trim, typename opt_eol, typename opt_escape> struct strconv_pcdata_impl
+	{
+		static char_t* parse(char_t* s)
+		{
+			gap g;
+
+			char_t* begin = s;
+
+			while (true)
+			{
+				while (!PUGI__IS_CHARTYPE(*s, ct_parse_pcdata)) ++s;
+					
+				if (*s == '<') // PCDATA ends here
+				{
+					char_t* end = g.flush(s);
+
+					if (opt_trim::value)
+						while (end > begin && PUGI__IS_CHARTYPE(end[-1], ct_space))
+							--end;
+
+					*end = 0;
+					
+					return s + 1;
+				}
+				else if (opt_eol::value && *s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
+				{
+					*s++ = '\n'; // replace first one with 0x0a
+					
+					if (*s == '\n') g.push(s, 1);
+				}
+				else if (opt_escape::value && *s == '&')
+				{
+					s = strconv_escape(s, g);
+				}
+				else if (*s == 0)
+				{
+					char_t* end = g.flush(s);
+
+					if (opt_trim::value)
+						while (end > begin && PUGI__IS_CHARTYPE(end[-1], ct_space))
+							--end;
+
+					*end = 0;
+
+					return s;
+				}
+				else ++s;
+			}
+		}
+	};
+	
+	PUGI__FN strconv_pcdata_t get_strconv_pcdata(unsigned int optmask)
+	{
+		PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_trim_pcdata == 0x0800);
+
+		switch (((optmask >> 4) & 3) | ((optmask >> 9) & 4)) // get bitmask for flags (eol escapes trim)
+		{
+		case 0: return strconv_pcdata_impl<opt_false, opt_false, opt_false>::parse;
+		case 1: return strconv_pcdata_impl<opt_false, opt_false, opt_true>::parse;
+		case 2: return strconv_pcdata_impl<opt_false, opt_true, opt_false>::parse;
+		case 3: return strconv_pcdata_impl<opt_false, opt_true, opt_true>::parse;
+		case 4: return strconv_pcdata_impl<opt_true, opt_false, opt_false>::parse;
+		case 5: return strconv_pcdata_impl<opt_true, opt_false, opt_true>::parse;
+		case 6: return strconv_pcdata_impl<opt_true, opt_true, opt_false>::parse;
+		case 7: return strconv_pcdata_impl<opt_true, opt_true, opt_true>::parse;
+		default: assert(false); return 0; // should not get here
+		}
+	}
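+
+	// The three relevant parse flags are packed into a 3-bit index (bit 0 = escapes,
+	// bit 1 = eol, bit 2 = trim_pcdata) that selects one of eight pre-instantiated
+	// parsers; for example index 5 (trim + escapes, no eol) maps to
+	// strconv_pcdata_impl<opt_true, opt_false, opt_true>.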
+
+	typedef char_t* (*strconv_attribute_t)(char_t*, char_t);
+	
+	template <typename opt_escape> struct strconv_attribute_impl
+	{
+		static char_t* parse_wnorm(char_t* s, char_t end_quote)
+		{
+			gap g;
+
+			// trim leading whitespaces
+			if (PUGI__IS_CHARTYPE(*s, ct_space))
+			{
+				char_t* str = s;
+				
+				do ++str;
+				while (PUGI__IS_CHARTYPE(*str, ct_space));
+				
+				g.push(s, str - s);
+			}
+
+			while (true)
+			{
+				while (!PUGI__IS_CHARTYPE(*s, ct_parse_attr_ws | ct_space)) ++s;
+				
+				if (*s == end_quote)
+				{
+					char_t* str = g.flush(s);
+					
+					do *str-- = 0;
+					while (PUGI__IS_CHARTYPE(*str, ct_space));
+				
+					return s + 1;
+				}
+				else if (PUGI__IS_CHARTYPE(*s, ct_space))
+				{
+					*s++ = ' ';
+		
+					if (PUGI__IS_CHARTYPE(*s, ct_space))
+					{
+						char_t* str = s + 1;
+						while (PUGI__IS_CHARTYPE(*str, ct_space)) ++str;
+						
+						g.push(s, str - s);
+					}
+				}
+				else if (opt_escape::value && *s == '&')
+				{
+					s = strconv_escape(s, g);
+				}
+				else if (!*s)
+				{
+					return 0;
+				}
+				else ++s;
+			}
+		}
+
+		static char_t* parse_wconv(char_t* s, char_t end_quote)
+		{
+			gap g;
+
+			while (true)
+			{
+				while (!PUGI__IS_CHARTYPE(*s, ct_parse_attr_ws)) ++s;
+				
+				if (*s == end_quote)
+				{
+					*g.flush(s) = 0;
+				
+					return s + 1;
+				}
+				else if (PUGI__IS_CHARTYPE(*s, ct_space))
+				{
+					if (*s == '\r')
+					{
+						*s++ = ' ';
+				
+						if (*s == '\n') g.push(s, 1);
+					}
+					else *s++ = ' ';
+				}
+				else if (opt_escape::value && *s == '&')
+				{
+					s = strconv_escape(s, g);
+				}
+				else if (!*s)
+				{
+					return 0;
+				}
+				else ++s;
+			}
+		}
+
+		static char_t* parse_eol(char_t* s, char_t end_quote)
+		{
+			gap g;
+
+			while (true)
+			{
+				while (!PUGI__IS_CHARTYPE(*s, ct_parse_attr)) ++s;
+				
+				if (*s == end_quote)
+				{
+					*g.flush(s) = 0;
+				
+					return s + 1;
+				}
+				else if (*s == '\r')
+				{
+					*s++ = '\n';
+					
+					if (*s == '\n') g.push(s, 1);
+				}
+				else if (opt_escape::value && *s == '&')
+				{
+					s = strconv_escape(s, g);
+				}
+				else if (!*s)
+				{
+					return 0;
+				}
+				else ++s;
+			}
+		}
+
+		static char_t* parse_simple(char_t* s, char_t end_quote)
+		{
+			gap g;
+
+			while (true)
+			{
+				while (!PUGI__IS_CHARTYPE(*s, ct_parse_attr)) ++s;
+				
+				if (*s == end_quote)
+				{
+					*g.flush(s) = 0;
+				
+					return s + 1;
+				}
+				else if (opt_escape::value && *s == '&')
+				{
+					s = strconv_escape(s, g);
+				}
+				else if (!*s)
+				{
+					return 0;
+				}
+				else ++s;
+			}
+		}
+	};
+
+	PUGI__FN strconv_attribute_t get_strconv_attribute(unsigned int optmask)
+	{
+		PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wconv_attribute == 0x40 && parse_wnorm_attribute == 0x80);
+		
+		switch ((optmask >> 4) & 15) // get bitmask for flags (wconv wnorm eol escapes)
+		{
+		case 0:  return strconv_attribute_impl<opt_false>::parse_simple;
+		case 1:  return strconv_attribute_impl<opt_true>::parse_simple;
+		case 2:  return strconv_attribute_impl<opt_false>::parse_eol;
+		case 3:  return strconv_attribute_impl<opt_true>::parse_eol;
+		case 4:  return strconv_attribute_impl<opt_false>::parse_wconv;
+		case 5:  return strconv_attribute_impl<opt_true>::parse_wconv;
+		case 6:  return strconv_attribute_impl<opt_false>::parse_wconv;
+		case 7:  return strconv_attribute_impl<opt_true>::parse_wconv;
+		case 8:  return strconv_attribute_impl<opt_false>::parse_wnorm;
+		case 9:  return strconv_attribute_impl<opt_true>::parse_wnorm;
+		case 10: return strconv_attribute_impl<opt_false>::parse_wnorm;
+		case 11: return strconv_attribute_impl<opt_true>::parse_wnorm;
+		case 12: return strconv_attribute_impl<opt_false>::parse_wnorm;
+		case 13: return strconv_attribute_impl<opt_true>::parse_wnorm;
+		case 14: return strconv_attribute_impl<opt_false>::parse_wnorm;
+		case 15: return strconv_attribute_impl<opt_true>::parse_wnorm;
+		default: assert(false); return 0; // should not get here
+		}
+	}
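+
+	// Dispatch mirrors get_strconv_pcdata: the low bit selects escape handling, and the
+	// strongest whitespace mode wins (wnorm subsumes wconv and eol handling, wconv
+	// subsumes eol), which is why several case labels map to the same parser.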
+
+	inline xml_parse_result make_parse_result(xml_parse_status status, ptrdiff_t offset = 0)
+	{
+		xml_parse_result result;
+		result.status = status;
+		result.offset = offset;
+
+		return result;
+	}
+
+	struct xml_parser
+	{
+		xml_allocator alloc;
+		char_t* error_offset;
+		xml_parse_status error_status;
+		
+		// Parser utilities.
+		#define PUGI__SKIPWS()			{ while (PUGI__IS_CHARTYPE(*s, ct_space)) ++s; }
+		#define PUGI__OPTSET(OPT)			( optmsk & (OPT) )
+		#define PUGI__PUSHNODE(TYPE)		{ cursor = append_node(cursor, alloc, TYPE); if (!cursor) PUGI__THROW_ERROR(status_out_of_memory, s); }
+		#define PUGI__POPNODE()			{ cursor = cursor->parent; }
+		#define PUGI__SCANFOR(X)			{ while (*s != 0 && !(X)) ++s; }
+		#define PUGI__SCANWHILE(X)		{ while ((X)) ++s; }
+		#define PUGI__ENDSEG()			{ ch = *s; *s = 0; ++s; }
+		#define PUGI__THROW_ERROR(err, m)	return error_offset = m, error_status = err, static_cast<char_t*>(0)
+		#define PUGI__CHECK_ERROR(err, m)	{ if (*s == 0) PUGI__THROW_ERROR(err, m); }
+		
+		xml_parser(const xml_allocator& alloc_): alloc(alloc_), error_offset(0), error_status(status_ok)
+		{
+		}
+
+		// DOCTYPE consists of nested sections of the following possible types:
+		// <!-- ... -->, <? ... ?>, "...", '...'
+		// <![...]]>
+		// <!...>
+		// The first group cannot contain nested groups;
+		// the second group can contain nested groups of the same type;
+		// the third group can contain all other groups.
+		char_t* parse_doctype_primitive(char_t* s)
+		{
+			if (*s == '"' || *s == '\'')
+			{
+				// quoted string
+				char_t ch = *s++;
+				PUGI__SCANFOR(*s == ch);
+				if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+				s++;
+			}
+			else if (s[0] == '<' && s[1] == '?')
+			{
+				// <? ... ?>
+				s += 2;
+				PUGI__SCANFOR(s[0] == '?' && s[1] == '>'); // no need for ENDSWITH because ?> can't terminate proper doctype
+				if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+				s += 2;
+			}
+			else if (s[0] == '<' && s[1] == '!' && s[2] == '-' && s[3] == '-')
+			{
+				s += 4;
+				PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && s[2] == '>'); // no need for ENDSWITH because --> can't terminate proper doctype
+				if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+				s += 4;
+			}
+			else PUGI__THROW_ERROR(status_bad_doctype, s);
+
+			return s;
+		}
+
+		char_t* parse_doctype_ignore(char_t* s)
+		{
+			assert(s[0] == '<' && s[1] == '!' && s[2] == '[');
+			s++;
+
+			while (*s)
+			{
+				if (s[0] == '<' && s[1] == '!' && s[2] == '[')
+				{
+					// nested ignore section
+					s = parse_doctype_ignore(s);
+					if (!s) return s;
+				}
+				else if (s[0] == ']' && s[1] == ']' && s[2] == '>')
+				{
+					// ignore section end
+					s += 3;
+
+					return s;
+				}
+				else s++;
+			}
+
+			PUGI__THROW_ERROR(status_bad_doctype, s);
+		}
+
+		char_t* parse_doctype_group(char_t* s, char_t endch, bool toplevel)
+		{
+			assert((s[0] == '<' || s[0] == 0) && s[1] == '!');
+			s++;
+
+			while (*s)
+			{
+				if (s[0] == '<' && s[1] == '!' && s[2] != '-')
+				{
+					if (s[2] == '[')
+					{
+						// ignore
+						s = parse_doctype_ignore(s);
+						if (!s) return s;
+					}
+					else
+					{
+						// some control group
+						s = parse_doctype_group(s, endch, false);
+						if (!s) return s;
+
+						// skip >
+						assert(*s == '>');
+						s++;
+					}
+				}
+				else if (s[0] == '<' || s[0] == '"' || s[0] == '\'')
+				{
+					// unknown tag (forbidden), or some primitive group
+					s = parse_doctype_primitive(s);
+					if (!s) return s;
+				}
+				else if (*s == '>')
+				{
+					return s;
+				}
+				else s++;
+			}
+
+			if (!toplevel || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s);
+
+			return s;
+		}
+
+		char_t* parse_exclamation(char_t* s, xml_node_struct* cursor, unsigned int optmsk, char_t endch)
+		{
+			// parse node contents, starting with exclamation mark
+			++s;
+
+			if (*s == '-') // '<!-...'
+			{
+				++s;
+
+				if (*s == '-') // '<!--...'
+				{
+					++s;
+
+					if (PUGI__OPTSET(parse_comments))
+					{
+						PUGI__PUSHNODE(node_comment); // Append a new node on the tree.
+						cursor->value = s; // Save the offset.
+					}
+
+					if (PUGI__OPTSET(parse_eol) && PUGI__OPTSET(parse_comments))
+					{
+						s = strconv_comment(s, endch);
+
+						if (!s) PUGI__THROW_ERROR(status_bad_comment, cursor->value);
+					}
+					else
+					{
+						// Scan for terminating '-->'.
+						PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && ENDSWITH(s[2], '>'));
+						PUGI__CHECK_ERROR(status_bad_comment, s);
+
+						if (PUGI__OPTSET(parse_comments))
+							*s = 0; // Zero-terminate this segment at the first terminating '-'.
+
+						s += (s[2] == '>' ? 3 : 2); // Step over the '\0->'.
+					}
+				}
+				else PUGI__THROW_ERROR(status_bad_comment, s);
+			}
+			else if (*s == '[')
+			{
+				// '<![CDATA[...'
+				if (*++s=='C' && *++s=='D' && *++s=='A' && *++s=='T' && *++s=='A' && *++s == '[')
+				{
+					++s;
+
+					if (PUGI__OPTSET(parse_cdata))
+					{
+						PUGI__PUSHNODE(node_cdata); // Append a new node on the tree.
+						cursor->value = s; // Save the offset.
+
+						if (PUGI__OPTSET(parse_eol))
+						{
+							s = strconv_cdata(s, endch);
+
+							if (!s) PUGI__THROW_ERROR(status_bad_cdata, cursor->value);
+						}
+						else
+						{
+							// Scan for terminating ']]>'.
+							PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && ENDSWITH(s[2], '>'));
+							PUGI__CHECK_ERROR(status_bad_cdata, s);
+
+							*s++ = 0; // Zero-terminate this segment.
+						}
+					}
+					else // Flagged for discard, but we still have to scan for the terminator.
+					{
+						// Scan for terminating ']]>'.
+						PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && ENDSWITH(s[2], '>'));
+						PUGI__CHECK_ERROR(status_bad_cdata, s);
+
+						++s;
+					}
+
+					s += (s[1] == '>' ? 2 : 1); // Step over the last ']>'.
+				}
+				else PUGI__THROW_ERROR(status_bad_cdata, s);
+			}
+			else if (s[0] == 'D' && s[1] == 'O' && s[2] == 'C' && s[3] == 'T' && s[4] == 'Y' && s[5] == 'P' && ENDSWITH(s[6], 'E'))
+			{
+				s -= 2;
+
+				if (cursor->parent) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+				char_t* mark = s + 9;
+
+				s = parse_doctype_group(s, endch, true);
+				if (!s) return s;
+
+				assert((*s == 0 && endch == '>') || *s == '>');
+				if (*s) *s++ = 0;
+
+				if (PUGI__OPTSET(parse_doctype))
+				{
+					while (PUGI__IS_CHARTYPE(*mark, ct_space)) ++mark;
+
+					PUGI__PUSHNODE(node_doctype);
+
+					cursor->value = mark;
+
+					PUGI__POPNODE();
+				}
+			}
+			else if (*s == 0 && endch == '-') PUGI__THROW_ERROR(status_bad_comment, s);
+			else if (*s == 0 && endch == '[') PUGI__THROW_ERROR(status_bad_cdata, s);
+			else PUGI__THROW_ERROR(status_unrecognized_tag, s);
+
+			return s;
+		}
+
+		char_t* parse_question(char_t* s, xml_node_struct*& ref_cursor, unsigned int optmsk, char_t endch)
+		{
+			// load into registers
+			xml_node_struct* cursor = ref_cursor;
+			char_t ch = 0;
+
+			// parse node contents, starting with question mark
+			++s;
+
+			// read PI target
+			char_t* target = s;
+
+			if (!PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_pi, s);
+
+			PUGI__SCANWHILE(PUGI__IS_CHARTYPE(*s, ct_symbol));
+			PUGI__CHECK_ERROR(status_bad_pi, s);
+
+			// determine node type; stricmp / strcasecmp is not portable
+			bool declaration = (target[0] | ' ') == 'x' && (target[1] | ' ') == 'm' && (target[2] | ' ') == 'l' && target + 3 == s;
+
+			if (declaration ? PUGI__OPTSET(parse_declaration) : PUGI__OPTSET(parse_pi))
+			{
+				if (declaration)
+				{
+					// disallow non top-level declarations
+					if (cursor->parent) PUGI__THROW_ERROR(status_bad_pi, s);
+
+					PUGI__PUSHNODE(node_declaration);
+				}
+				else
+				{
+					PUGI__PUSHNODE(node_pi);
+				}
+
+				cursor->name = target;
+
+				PUGI__ENDSEG();
+
+				// parse value/attributes
+				if (ch == '?')
+				{
+					// empty node
+					if (!ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_pi, s);
+					s += (*s == '>');
+
+					PUGI__POPNODE();
+				}
+				else if (PUGI__IS_CHARTYPE(ch, ct_space))
+				{
+					PUGI__SKIPWS();
+
+					// scan for tag end
+					char_t* value = s;
+
+					PUGI__SCANFOR(s[0] == '?' && ENDSWITH(s[1], '>'));
+					PUGI__CHECK_ERROR(status_bad_pi, s);
+
+					if (declaration)
+					{
+						// replace ending ? with / so that 'element' terminates properly
+						*s = '/';
+
+						// we exit from this function with cursor at node_declaration, which is a signal to parse() to go to LOC_ATTRIBUTES
+						s = value;
+					}
+					else
+					{
+						// store value and step over >
+						cursor->value = value;
+						PUGI__POPNODE();
+
+						PUGI__ENDSEG();
+
+						s += (*s == '>');
+					}
+				}
+				else PUGI__THROW_ERROR(status_bad_pi, s);
+			}
+			else
+			{
+				// scan for tag end
+				PUGI__SCANFOR(s[0] == '?' && ENDSWITH(s[1], '>'));
+				PUGI__CHECK_ERROR(status_bad_pi, s);
+
+				s += (s[1] == '>' ? 2 : 1);
+			}
+
+			// store from registers
+			ref_cursor = cursor;
+
+			return s;
+		}
+
+		char_t* parse_tree(char_t* s, xml_node_struct* root, unsigned int optmsk, char_t endch)
+		{
+			strconv_attribute_t strconv_attribute = get_strconv_attribute(optmsk);
+			strconv_pcdata_t strconv_pcdata = get_strconv_pcdata(optmsk);
+			
+			char_t ch = 0;
+			xml_node_struct* cursor = root;
+			char_t* mark = s;
+
+			while (*s != 0)
+			{
+				if (*s == '<')
+				{
+					++s;
+
+				LOC_TAG:
+					if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) // '<#...'
+					{
+						PUGI__PUSHNODE(node_element); // Append a new node to the tree.
+
+						cursor->name = s;
+
+						PUGI__SCANWHILE(PUGI__IS_CHARTYPE(*s, ct_symbol)); // Scan for a terminator.
+						PUGI__ENDSEG(); // Save char in 'ch', terminate & step over.
+
+						if (ch == '>')
+						{
+							// end of tag
+						}
+						else if (PUGI__IS_CHARTYPE(ch, ct_space))
+						{
+						LOC_ATTRIBUTES:
+							while (true)
+							{
+								PUGI__SKIPWS(); // Eat any whitespace.
+						
+								if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) // <... #...
+								{
+									xml_attribute_struct* a = append_attribute_ll(cursor, alloc); // Make space for this attribute.
+									if (!a) PUGI__THROW_ERROR(status_out_of_memory, s);
+
+									a->name = s; // Save the offset.
+
+									PUGI__SCANWHILE(PUGI__IS_CHARTYPE(*s, ct_symbol)); // Scan for a terminator.
+									PUGI__CHECK_ERROR(status_bad_attribute, s); //$ redundant, left for performance
+
+									PUGI__ENDSEG(); // Save char in 'ch', terminate & step over.
+									PUGI__CHECK_ERROR(status_bad_attribute, s); //$ redundant, left for performance
+
+									if (PUGI__IS_CHARTYPE(ch, ct_space))
+									{
+										PUGI__SKIPWS(); // Eat any whitespace.
+										PUGI__CHECK_ERROR(status_bad_attribute, s); //$ redundant, left for performance
+
+										ch = *s;
+										++s;
+									}
+									
+									if (ch == '=') // '<... #=...'
+									{
+										PUGI__SKIPWS(); // Eat any whitespace.
+
+										if (*s == '"' || *s == '\'') // '<... #="...'
+										{
+											ch = *s; // Save quote char to avoid breaking on "''" -or- '""'.
+											++s; // Step over the quote.
+											a->value = s; // Save the offset.
+
+											s = strconv_attribute(s, ch);
+										
+											if (!s) PUGI__THROW_ERROR(status_bad_attribute, a->value);
+
+											// After this line the loop continues from the start;
+											// Whitespaces, / and > are ok, symbols and EOF are wrong,
+											// everything else will be detected
+											if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_attribute, s);
+										}
+										else PUGI__THROW_ERROR(status_bad_attribute, s);
+									}
+									else PUGI__THROW_ERROR(status_bad_attribute, s);
+								}
+								else if (*s == '/')
+								{
+									++s;
+									
+									if (*s == '>')
+									{
+										PUGI__POPNODE();
+										s++;
+										break;
+									}
+									else if (*s == 0 && endch == '>')
+									{
+										PUGI__POPNODE();
+										break;
+									}
+									else PUGI__THROW_ERROR(status_bad_start_element, s);
+								}
+								else if (*s == '>')
+								{
+									++s;
+
+									break;
+								}
+								else if (*s == 0 && endch == '>')
+								{
+									break;
+								}
+								else PUGI__THROW_ERROR(status_bad_start_element, s);
+							}
+
+							// !!!
+						}
+						else if (ch == '/') // '<#.../'
+						{
+							if (!ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_start_element, s);
+
+							PUGI__POPNODE(); // Pop.
+
+							s += (*s == '>');
+						}
+						else if (ch == 0)
+						{
+							// we stepped over null terminator, backtrack & handle closing tag
+							--s;
+							
+							if (endch != '>') PUGI__THROW_ERROR(status_bad_start_element, s);
+						}
+						else PUGI__THROW_ERROR(status_bad_start_element, s);
+					}
+					else if (*s == '/')
+					{
+						++s;
+
+						char_t* name = cursor->name;
+						if (!name) PUGI__THROW_ERROR(status_end_element_mismatch, s);
+						
+						while (PUGI__IS_CHARTYPE(*s, ct_symbol))
+						{
+							if (*s++ != *name++) PUGI__THROW_ERROR(status_end_element_mismatch, s);
+						}
+
+						if (*name)
+						{
+							if (*s == 0 && name[0] == endch && name[1] == 0) PUGI__THROW_ERROR(status_bad_end_element, s);
+							else PUGI__THROW_ERROR(status_end_element_mismatch, s);
+						}
+							
+						PUGI__POPNODE(); // Pop.
+
+						PUGI__SKIPWS();
+
+						if (*s == 0)
+						{
+							if (endch != '>') PUGI__THROW_ERROR(status_bad_end_element, s);
+						}
+						else
+						{
+							if (*s != '>') PUGI__THROW_ERROR(status_bad_end_element, s);
+							++s;
+						}
+					}
+					else if (*s == '?') // '<?...'
+					{
+						s = parse_question(s, cursor, optmsk, endch);
+						if (!s) return s;
+
+						assert(cursor);
+						if ((cursor->header & xml_memory_page_type_mask) + 1 == node_declaration) goto LOC_ATTRIBUTES;
+					}
+					else if (*s == '!') // '<!...'
+					{
+						s = parse_exclamation(s, cursor, optmsk, endch);
+						if (!s) return s;
+					}
+					else if (*s == 0 && endch == '?') PUGI__THROW_ERROR(status_bad_pi, s);
+					else PUGI__THROW_ERROR(status_unrecognized_tag, s);
+				}
+				else
+				{
+					mark = s; // Save this offset while searching for a terminator.
+
+					PUGI__SKIPWS(); // Eat whitespace if no genuine PCDATA here.
+
+					if (*s == '<' || !*s)
+					{
+						// We skipped some whitespace characters because otherwise we would take the tag branch instead of PCDATA one
+						assert(mark != s);
+
+						if (!PUGI__OPTSET(parse_ws_pcdata | parse_ws_pcdata_single) || PUGI__OPTSET(parse_trim_pcdata))
+						{
+							continue;
+						}
+						else if (PUGI__OPTSET(parse_ws_pcdata_single))
+						{
+							if (s[0] != '<' || s[1] != '/' || cursor->first_child) continue;
+						}
+					}
+
+					if (!PUGI__OPTSET(parse_trim_pcdata))
+						s = mark;
+							
+					if (cursor->parent || PUGI__OPTSET(parse_fragment))
+					{
+						PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree.
+						cursor->value = s; // Save the offset.
+
+						s = strconv_pcdata(s);
+								
+						PUGI__POPNODE(); // Pop since this is a standalone.
+						
+						if (!*s) break;
+					}
+					else
+					{
+						PUGI__SCANFOR(*s == '<'); // '...<'
+						if (!*s) break;
+						
+						++s;
+					}
+
+					// We're after '<'
+					goto LOC_TAG;
+				}
+			}
+
+			// check that last tag is closed
+			if (cursor != root) PUGI__THROW_ERROR(status_end_element_mismatch, s);
+
+			return s;
+		}
+
+	#ifdef PUGIXML_WCHAR_MODE
+		static char_t* parse_skip_bom(char_t* s)
+		{
+			unsigned int bom = 0xfeff;
+			return (s[0] == static_cast<wchar_t>(bom)) ? s + 1 : s;
+		}
+	#else
+		static char_t* parse_skip_bom(char_t* s)
+		{
+			return (s[0] == '\xef' && s[1] == '\xbb' && s[2] == '\xbf') ? s + 3 : s;
+		}
+	#endif
+
+		static bool has_element_node_siblings(xml_node_struct* node)
+		{
+			while (node)
+			{
+				xml_node_type type = static_cast<xml_node_type>((node->header & impl::xml_memory_page_type_mask) + 1);
+				if (type == node_element) return true;
+
+				node = node->next_sibling;
+			}
+
+			return false;
+		}
+
+		static xml_parse_result parse(char_t* buffer, size_t length, xml_document_struct* xmldoc, xml_node_struct* root, unsigned int optmsk)
+		{
+			// allocator object is a part of document object
+			xml_allocator& alloc = *static_cast<xml_allocator*>(xmldoc);
+
+			// early-out for empty documents
+			if (length == 0)
+				return make_parse_result(PUGI__OPTSET(parse_fragment) ? status_ok : status_no_document_element);
+
+			// get last child of the root before parsing
+			xml_node_struct* last_root_child = root->first_child ? root->first_child->prev_sibling_c : 0;
+	
+			// create parser on stack
+			xml_parser parser(alloc);
+
+			// save last character and make buffer zero-terminated (speeds up parsing)
+			char_t endch = buffer[length - 1];
+			buffer[length - 1] = 0;
+			
+			// skip BOM to make sure it does not end up as part of parse output
+			char_t* buffer_data = parse_skip_bom(buffer);
+
+			// perform actual parsing
+			parser.parse_tree(buffer_data, root, optmsk, endch);
+
+			// update allocator state
+			alloc = parser.alloc;
+
+			xml_parse_result result = make_parse_result(parser.error_status, parser.error_offset ? parser.error_offset - buffer : 0);
+			assert(result.offset >= 0 && static_cast<size_t>(result.offset) <= length);
+
+			if (result)
+			{
+				// since we removed last character, we have to handle the only possible false positive (stray <)
+				if (endch == '<')
+					return make_parse_result(status_unrecognized_tag, length - 1);
+
+				// check if there are any element nodes parsed
+				xml_node_struct* first_root_child_parsed = last_root_child ? last_root_child->next_sibling : root->first_child;
+
+				if (!PUGI__OPTSET(parse_fragment) && !has_element_node_siblings(first_root_child_parsed))
+					return make_parse_result(status_no_document_element, length - 1);
+			}
+			else
+			{
+				// roll back offset if it occurs on a null terminator in the source buffer
+				if (result.offset > 0 && static_cast<size_t>(result.offset) == length - 1 && endch == 0)
+					result.offset--;
+			}
+
+			return result;
+		}
+	};
+
+	// Output facilities
+	PUGI__FN xml_encoding get_write_native_encoding()
+	{
+	#ifdef PUGIXML_WCHAR_MODE
+		return get_wchar_encoding();
+	#else
+		return encoding_utf8;
+	#endif
+	}
+
+	PUGI__FN xml_encoding get_write_encoding(xml_encoding encoding)
+	{
+		// replace wchar encoding with utf implementation
+		if (encoding == encoding_wchar) return get_wchar_encoding();
+
+		// replace utf16 encoding with utf16 with specific endianness
+		if (encoding == encoding_utf16) return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+		// replace utf32 encoding with utf32 with specific endianness
+		if (encoding == encoding_utf32) return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+		// only do autodetection if no explicit encoding is requested
+		if (encoding != encoding_auto) return encoding;
+
+		// assume utf8 encoding
+		return encoding_utf8;
+	}
+
+#ifdef PUGIXML_WCHAR_MODE
+	PUGI__FN size_t get_valid_length(const char_t* data, size_t length)
+	{
+		assert(length > 0);
+
+		// discard last character if it's the lead of a surrogate pair 
+		return (sizeof(wchar_t) == 2 && static_cast<unsigned int>(static_cast<uint16_t>(data[length - 1]) - 0xD800) < 0x400) ? length - 1 : length;
+	}
+
+	PUGI__FN size_t convert_buffer_output(char_t* r_char, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding)
+	{
+		// only endian-swapping is required
+		if (need_endian_swap_utf(encoding, get_wchar_encoding()))
+		{
+			convert_wchar_endian_swap(r_char, data, length);
+
+			return length * sizeof(char_t);
+		}
+	
+		// convert to utf8
+		if (encoding == encoding_utf8)
+		{
+			uint8_t* dest = r_u8;
+			uint8_t* end = utf_decoder<utf8_writer>::decode_wchar_block(data, length, dest);
+
+			return static_cast<size_t>(end - dest);
+		}
+
+		// convert to utf16
+		if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
+		{
+			uint16_t* dest = r_u16;
+
+			// convert to native utf16
+			uint16_t* end = utf_decoder<utf16_writer>::decode_wchar_block(data, length, dest);
+
+			// swap if necessary
+			xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+			if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
+
+			return static_cast<size_t>(end - dest) * sizeof(uint16_t);
+		}
+
+		// convert to utf32
+		if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
+		{
+			uint32_t* dest = r_u32;
+
+			// convert to native utf32
+			uint32_t* end = utf_decoder<utf32_writer>::decode_wchar_block(data, length, dest);
+
+			// swap if necessary
+			xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+			if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
+
+			return static_cast<size_t>(end - dest) * sizeof(uint32_t);
+		}
+
+		// convert to latin1
+		if (encoding == encoding_latin1)
+		{
+			uint8_t* dest = r_u8;
+			uint8_t* end = utf_decoder<latin1_writer>::decode_wchar_block(data, length, dest);
+
+			return static_cast<size_t>(end - dest);
+		}
+
+		assert(!"Invalid encoding");
+		return 0;
+	}
+#else
+	PUGI__FN size_t get_valid_length(const char_t* data, size_t length)
+	{
+		assert(length > 4);
+
+		for (size_t i = 1; i <= 4; ++i)
+		{
+			uint8_t ch = static_cast<uint8_t>(data[length - i]);
+
+			// either a standalone character or a leading one
+			if ((ch & 0xc0) != 0x80) return length - i;
+		}
+
+		// there are four non-leading characters at the end, sequence tail is broken so might as well process the whole chunk
+		return length;
+	}
+
+	PUGI__FN size_t convert_buffer_output(char_t* /* r_char */, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding)
+	{
+		if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
+		{
+			uint16_t* dest = r_u16;
+
+			// convert to native utf16
+			uint16_t* end = utf_decoder<utf16_writer>::decode_utf8_block(reinterpret_cast<const uint8_t*>(data), length, dest);
+
+			// swap if necessary
+			xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+			if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
+
+			return static_cast<size_t>(end - dest) * sizeof(uint16_t);
+		}
+
+		if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
+		{
+			uint32_t* dest = r_u32;
+
+			// convert to native utf32
+			uint32_t* end = utf_decoder<utf32_writer>::decode_utf8_block(reinterpret_cast<const uint8_t*>(data), length, dest);
+
+			// swap if necessary
+			xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+			if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
+
+			return static_cast<size_t>(end - dest) * sizeof(uint32_t);
+		}
+
+		if (encoding == encoding_latin1)
+		{
+			uint8_t* dest = r_u8;
+			uint8_t* end = utf_decoder<latin1_writer>::decode_utf8_block(reinterpret_cast<const uint8_t*>(data), length, dest);
+
+			return static_cast<size_t>(end - dest);
+		}
+
+		assert(!"Invalid encoding");
+		return 0;
+	}
+#endif
+
+	class xml_buffered_writer
+	{
+		xml_buffered_writer(const xml_buffered_writer&);
+		xml_buffered_writer& operator=(const xml_buffered_writer&);
+
+	public:
+		xml_buffered_writer(xml_writer& writer_, xml_encoding user_encoding): writer(writer_), bufsize(0), encoding(get_write_encoding(user_encoding))
+		{
+			PUGI__STATIC_ASSERT(bufcapacity >= 8);
+		}
+
+		~xml_buffered_writer()
+		{
+			flush();
+		}
+
+		void flush()
+		{
+			flush(buffer, bufsize);
+			bufsize = 0;
+		}
+
+		void flush(const char_t* data, size_t size)
+		{
+			if (size == 0) return;
+
+			// fast path, just write data
+			if (encoding == get_write_native_encoding())
+				writer.write(data, size * sizeof(char_t));
+			else
+			{
+				// convert chunk
+				size_t result = convert_buffer_output(scratch.data_char, scratch.data_u8, scratch.data_u16, scratch.data_u32, data, size, encoding);
+				assert(result <= sizeof(scratch));
+
+				// write data
+				writer.write(scratch.data_u8, result);
+			}
+		}
+
+		void write(const char_t* data, size_t length)
+		{
+			if (bufsize + length > bufcapacity)
+			{
+				// flush the remaining buffer contents
+				flush();
+
+				// handle large chunks
+				if (length > bufcapacity)
+				{
+					if (encoding == get_write_native_encoding())
+					{
+						// fast path, can just write data chunk
+						writer.write(data, length * sizeof(char_t));
+						return;
+					}
+
+					// need to convert in suitable chunks
+					while (length > bufcapacity)
+					{
+						// get chunk size by selecting such number of characters that are guaranteed to fit into scratch buffer
+						// and form a complete codepoint sequence (i.e. discard start of last codepoint if necessary)
+						size_t chunk_size = get_valid_length(data, bufcapacity);
+
+						// convert chunk and write
+						flush(data, chunk_size);
+
+						// iterate
+						data += chunk_size;
+						length -= chunk_size;
+					}
+
+					// small tail is copied below
+					bufsize = 0;
+				}
+			}
+
+			memcpy(buffer + bufsize, data, length * sizeof(char_t));
+			bufsize += length;
+		}
+
+		void write(const char_t* data)
+		{
+			write(data, strlength(data));
+		}
+
+		void write(char_t d0)
+		{
+			if (bufsize + 1 > bufcapacity) flush();
+
+			buffer[bufsize + 0] = d0;
+			bufsize += 1;
+		}
+
+		void write(char_t d0, char_t d1)
+		{
+			if (bufsize + 2 > bufcapacity) flush();
+
+			buffer[bufsize + 0] = d0;
+			buffer[bufsize + 1] = d1;
+			bufsize += 2;
+		}
+
+		void write(char_t d0, char_t d1, char_t d2)
+		{
+			if (bufsize + 3 > bufcapacity) flush();
+
+			buffer[bufsize + 0] = d0;
+			buffer[bufsize + 1] = d1;
+			buffer[bufsize + 2] = d2;
+			bufsize += 3;
+		}
+
+		void write(char_t d0, char_t d1, char_t d2, char_t d3)
+		{
+			if (bufsize + 4 > bufcapacity) flush();
+
+			buffer[bufsize + 0] = d0;
+			buffer[bufsize + 1] = d1;
+			buffer[bufsize + 2] = d2;
+			buffer[bufsize + 3] = d3;
+			bufsize += 4;
+		}
+
+		void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4)
+		{
+			if (bufsize + 5 > bufcapacity) flush();
+
+			buffer[bufsize + 0] = d0;
+			buffer[bufsize + 1] = d1;
+			buffer[bufsize + 2] = d2;
+			buffer[bufsize + 3] = d3;
+			buffer[bufsize + 4] = d4;
+			bufsize += 5;
+		}
+
+		void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4, char_t d5)
+		{
+			if (bufsize + 6 > bufcapacity) flush();
+
+			buffer[bufsize + 0] = d0;
+			buffer[bufsize + 1] = d1;
+			buffer[bufsize + 2] = d2;
+			buffer[bufsize + 3] = d3;
+			buffer[bufsize + 4] = d4;
+			buffer[bufsize + 5] = d5;
+			bufsize += 6;
+		}
+
+		// utf8 maximum expansion: x4 (-> utf32)
+		// utf16 maximum expansion: x2 (-> utf32)
+		// utf32 maximum expansion: x1
+		enum
+		{
+			bufcapacitybytes =
+			#ifdef PUGIXML_MEMORY_OUTPUT_STACK
+				PUGIXML_MEMORY_OUTPUT_STACK
+			#else
+				10240
+			#endif
+			,
+			bufcapacity = bufcapacitybytes / (sizeof(char_t) + 4)
+		};
+
+		char_t buffer[bufcapacity];
+
+		union
+		{
+			uint8_t data_u8[4 * bufcapacity];
+			uint16_t data_u16[2 * bufcapacity];
+			uint32_t data_u32[bufcapacity];
+			char_t data_char[bufcapacity];
+		} scratch;
+
+		xml_writer& writer;
+		size_t bufsize;
+		xml_encoding encoding;
+	};
+
+	PUGI__FN void text_output_escaped(xml_buffered_writer& writer, const char_t* s, chartypex_t type)
+	{
+		while (*s)
+		{
+			const char_t* prev = s;
+			
+			// While *s is a usual symbol
+			while (!PUGI__IS_CHARTYPEX(*s, type)) ++s;
+		
+			writer.write(prev, static_cast<size_t>(s - prev));
+
+			switch (*s)
+			{
+				case 0: break;
+				case '&':
+					writer.write('&', 'a', 'm', 'p', ';');
+					++s;
+					break;
+				case '<':
+					writer.write('&', 'l', 't', ';');
+					++s;
+					break;
+				case '>':
+					writer.write('&', 'g', 't', ';');
+					++s;
+					break;
+				case '"':
+					writer.write('&', 'q', 'u', 'o', 't', ';');
+					++s;
+					break;
+				default: // s is not a usual symbol
+				{
+					unsigned int ch = static_cast<unsigned int>(*s++);
+					assert(ch < 32);
+
+					writer.write('&', '#', static_cast<char_t>((ch / 10) + '0'), static_cast<char_t>((ch % 10) + '0'), ';');
+				}
+			}
+		}
+	}
+
+	PUGI__FN void text_output(xml_buffered_writer& writer, const char_t* s, chartypex_t type, unsigned int flags)
+	{
+		if (flags & format_no_escapes)
+			writer.write(s);
+		else
+			text_output_escaped(writer, s, type);
+	}
+
+	PUGI__FN void text_output_cdata(xml_buffered_writer& writer, const char_t* s)
+	{
+		do
+		{
+			writer.write('<', '!', '[', 'C', 'D');
+			writer.write('A', 'T', 'A', '[');
+
+			const char_t* prev = s;
+
+			// look for ]]> sequence - we can't output it as is since it terminates CDATA
+			while (*s && !(s[0] == ']' && s[1] == ']' && s[2] == '>')) ++s;
+
+			// skip ]] if we stopped at ]]>, > will go to the next CDATA section
+			if (*s) s += 2;
+
+			writer.write(prev, static_cast<size_t>(s - prev));
+
+			writer.write(']', ']', '>');
+		}
+		while (*s);
+	}
+
+	PUGI__FN void node_output_attributes(xml_buffered_writer& writer, const xml_node& node, unsigned int flags)
+	{
+		const char_t* default_name = PUGIXML_TEXT(":anonymous");
+
+		for (xml_attribute a = node.first_attribute(); a; a = a.next_attribute())
+		{
+			writer.write(' ');
+			writer.write(a.name()[0] ? a.name() : default_name);
+			writer.write('=', '"');
+
+			text_output(writer, a.value(), ctx_special_attr, flags);
+
+			writer.write('"');
+		}
+	}
+
+	PUGI__FN void node_output(xml_buffered_writer& writer, const xml_node& node, const char_t* indent, unsigned int flags, unsigned int depth)
+	{
+		const char_t* default_name = PUGIXML_TEXT(":anonymous");
+
+		if ((flags & format_indent) != 0 && (flags & format_raw) == 0)
+			for (unsigned int i = 0; i < depth; ++i) writer.write(indent);
+
+		switch (node.type())
+		{
+		case node_document:
+		{
+			for (xml_node n = node.first_child(); n; n = n.next_sibling())
+				node_output(writer, n, indent, flags, depth);
+			break;
+		}
+			
+		case node_element:
+		{
+			const char_t* name = node.name()[0] ? node.name() : default_name;
+
+			writer.write('<');
+			writer.write(name);
+
+			node_output_attributes(writer, node, flags);
+
+			if (flags & format_raw)
+			{
+				if (!node.first_child())
+					writer.write(' ', '/', '>');
+				else
+				{
+					writer.write('>');
+
+					for (xml_node n = node.first_child(); n; n = n.next_sibling())
+						node_output(writer, n, indent, flags, depth + 1);
+
+					writer.write('<', '/');
+					writer.write(name);
+					writer.write('>');
+				}
+			}
+			else if (!node.first_child())
+				writer.write(' ', '/', '>', '\n');
+			else if (node.first_child() == node.last_child() && (node.first_child().type() == node_pcdata || node.first_child().type() == node_cdata))
+			{
+				writer.write('>');
+
+				if (node.first_child().type() == node_pcdata)
+					text_output(writer, node.first_child().value(), ctx_special_pcdata, flags);
+				else
+					text_output_cdata(writer, node.first_child().value());
+
+				writer.write('<', '/');
+				writer.write(name);
+				writer.write('>', '\n');
+			}
+			else
+			{
+				writer.write('>', '\n');
+				
+				for (xml_node n = node.first_child(); n; n = n.next_sibling())
+					node_output(writer, n, indent, flags, depth + 1);
+
+				if ((flags & format_indent) != 0 && (flags & format_raw) == 0)
+					for (unsigned int i = 0; i < depth; ++i) writer.write(indent);
+				
+				writer.write('<', '/');
+				writer.write(name);
+				writer.write('>', '\n');
+			}
+
+			break;
+		}
+		
+		case node_pcdata:
+			text_output(writer, node.value(), ctx_special_pcdata, flags);
+			if ((flags & format_raw) == 0) writer.write('\n');
+			break;
+
+		case node_cdata:
+			text_output_cdata(writer, node.value());
+			if ((flags & format_raw) == 0) writer.write('\n');
+			break;
+
+		case node_comment:
+			writer.write('<', '!', '-', '-');
+			writer.write(node.value());
+			writer.write('-', '-', '>');
+			if ((flags & format_raw) == 0) writer.write('\n');
+			break;
+
+		case node_pi:
+		case node_declaration:
+			writer.write('<', '?');
+			writer.write(node.name()[0] ? node.name() : default_name);
+
+			if (node.type() == node_declaration)
+			{
+				node_output_attributes(writer, node, flags);
+			}
+			else if (node.value()[0])
+			{
+				writer.write(' ');
+				writer.write(node.value());
+			}
+
+			writer.write('?', '>');
+			if ((flags & format_raw) == 0) writer.write('\n');
+			break;
+
+		case node_doctype:
+			writer.write('<', '!', 'D', 'O', 'C');
+			writer.write('T', 'Y', 'P', 'E');
+
+			if (node.value()[0])
+			{
+				writer.write(' ');
+				writer.write(node.value());
+			}
+
+			writer.write('>');
+			if ((flags & format_raw) == 0) writer.write('\n');
+			break;
+
+		default:
+			assert(!"Invalid node type");
+		}
+	}
+
+	inline bool has_declaration(const xml_node& node)
+	{
+		for (xml_node child = node.first_child(); child; child = child.next_sibling())
+		{
+			xml_node_type type = child.type();
+
+			if (type == node_declaration) return true;
+			if (type == node_element) return false;
+		}
+
+		return false;
+	}
+
+	inline bool allow_insert_child(xml_node_type parent, xml_node_type child)
+	{
+		if (parent != node_document && parent != node_element) return false;
+		if (child == node_document || child == node_null) return false;
+		if (parent != node_document && (child == node_declaration || child == node_doctype)) return false;
+
+		return true;
+	}
+
+	PUGI__FN void recursive_copy_skip(xml_node& dest, const xml_node& source, const xml_node& skip)
+	{
+		assert(dest.type() == source.type());
+
+		switch (source.type())
+		{
+		case node_element:
+		{
+			dest.set_name(source.name());
+
+			for (xml_attribute a = source.first_attribute(); a; a = a.next_attribute())
+				dest.append_attribute(a.name()).set_value(a.value());
+
+			for (xml_node c = source.first_child(); c; c = c.next_sibling())
+			{
+				if (c == skip) continue;
+
+				xml_node cc = dest.append_child(c.type());
+				assert(cc);
+
+				recursive_copy_skip(cc, c, skip);
+			}
+
+			break;
+		}
+
+		case node_pcdata:
+		case node_cdata:
+		case node_comment:
+		case node_doctype:
+			dest.set_value(source.value());
+			break;
+
+		case node_pi:
+			dest.set_name(source.name());
+			dest.set_value(source.value());
+			break;
+
+		case node_declaration:
+		{
+			dest.set_name(source.name());
+
+			for (xml_attribute a = source.first_attribute(); a; a = a.next_attribute())
+				dest.append_attribute(a.name()).set_value(a.value());
+
+			break;
+		}
+
+		default:
+			assert(!"Invalid node type");
+		}
+	}
+
+	inline bool is_text_node(xml_node_struct* node)
+	{
+		xml_node_type type = static_cast<xml_node_type>((node->header & impl::xml_memory_page_type_mask) + 1);
+
+		return type == node_pcdata || type == node_cdata;
+	}
+
+	// get value with conversion functions
+	PUGI__FN int get_integer_base(const char_t* value)
+	{
+		const char_t* s = value;
+
+		while (PUGI__IS_CHARTYPE(*s, ct_space))
+			s++;
+
+		if (*s == '-')
+			s++;
+
+		return (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) ? 16 : 10;
+	}
+
+	PUGI__FN int get_value_int(const char_t* value, int def)
+	{
+		if (!value) return def;
+
+		int base = get_integer_base(value);
+
+	#ifdef PUGIXML_WCHAR_MODE
+		return static_cast<int>(wcstol(value, 0, base));
+	#else
+		return static_cast<int>(strtol(value, 0, base));
+	#endif
+	}
+
+	PUGI__FN unsigned int get_value_uint(const char_t* value, unsigned int def)
+	{
+		if (!value) return def;
+
+		int base = get_integer_base(value);
+
+	#ifdef PUGIXML_WCHAR_MODE
+		return static_cast<unsigned int>(wcstoul(value, 0, base));
+	#else
+		return static_cast<unsigned int>(strtoul(value, 0, base));
+	#endif
+	}
+
+	PUGI__FN double get_value_double(const char_t* value, double def)
+	{
+		if (!value) return def;
+
+	#ifdef PUGIXML_WCHAR_MODE
+		return wcstod(value, 0);
+	#else
+		return strtod(value, 0);
+	#endif
+	}
+
+	PUGI__FN float get_value_float(const char_t* value, float def)
+	{
+		if (!value) return def;
+
+	#ifdef PUGIXML_WCHAR_MODE
+		return static_cast<float>(wcstod(value, 0));
+	#else
+		return static_cast<float>(strtod(value, 0));
+	#endif
+	}
+
+	PUGI__FN bool get_value_bool(const char_t* value, bool def)
+	{
+		if (!value) return def;
+
+		// only look at first char
+		char_t first = *value;
+
+		// 1*, t* (true), T* (True), y* (yes), Y* (YES)
+		return (first == '1' || first == 't' || first == 'T' || first == 'y' || first == 'Y');
+	}
+
+#ifdef PUGIXML_HAS_LONG_LONG
+	PUGI__FN long long get_value_llong(const char_t* value, long long def)
+	{
+		if (!value) return def;
+
+		int base = get_integer_base(value);
+
+	#ifdef PUGIXML_WCHAR_MODE
+		#ifdef PUGI__MSVC_CRT_VERSION
+			return _wcstoi64(value, 0, base);
+		#else
+			return wcstoll(value, 0, base);
+		#endif
+	#else
+		#ifdef PUGI__MSVC_CRT_VERSION
+			return _strtoi64(value, 0, base);
+		#else
+			return strtoll(value, 0, base);
+		#endif
+	#endif
+	}
+
+	PUGI__FN unsigned long long get_value_ullong(const char_t* value, unsigned long long def)
+	{
+		if (!value) return def;
+
+		int base = get_integer_base(value);
+
+	#ifdef PUGIXML_WCHAR_MODE
+		#ifdef PUGI__MSVC_CRT_VERSION
+			return _wcstoui64(value, 0, base);
+		#else
+			return wcstoull(value, 0, base);
+		#endif
+	#else
+		#ifdef PUGI__MSVC_CRT_VERSION
+			return _strtoui64(value, 0, base);
+		#else
+			return strtoull(value, 0, base);
+		#endif
+	#endif
+	}
+#endif
+
+	// set value with conversion functions
+	PUGI__FN bool set_value_buffer(char_t*& dest, uintptr_t& header, uintptr_t header_mask, char (&buf)[128])
+	{
+	#ifdef PUGIXML_WCHAR_MODE
+		char_t wbuf[128];
+		impl::widen_ascii(wbuf, buf);
+
+		return strcpy_insitu(dest, header, header_mask, wbuf);
+	#else
+		return strcpy_insitu(dest, header, header_mask, buf);
+	#endif
+	}
+
+	PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, int value)
+	{
+		char buf[128];
+		sprintf(buf, "%d", value);
+	
+		return set_value_buffer(dest, header, header_mask, buf);
+	}
+
+	PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, unsigned int value)
+	{
+		char buf[128];
+		sprintf(buf, "%u", value);
+
+		return set_value_buffer(dest, header, header_mask, buf);
+	}
+
+	PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, double value)
+	{
+		char buf[128];
+		sprintf(buf, "%g", value);
+
+		return set_value_buffer(dest, header, header_mask, buf);
+	}
+	
+	PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, bool value)
+	{
+		return strcpy_insitu(dest, header, header_mask, value ? PUGIXML_TEXT("true") : PUGIXML_TEXT("false"));
+	}
+
+#ifdef PUGIXML_HAS_LONG_LONG
+	PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, long long value)
+	{
+		char buf[128];
+		sprintf(buf, "%lld", value);
+	
+		return set_value_buffer(dest, header, header_mask, buf);
+	}
+
+	PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, unsigned long long value)
+	{
+		char buf[128];
+		sprintf(buf, "%llu", value);
+	
+		return set_value_buffer(dest, header, header_mask, buf);
+	}
+#endif
+
+	// we need to get length of entire file to load it in memory; the only (relatively) sane way to do it is via seek/tell trick
+	PUGI__FN xml_parse_status get_file_size(FILE* file, size_t& out_result)
+	{
+	#if defined(PUGI__MSVC_CRT_VERSION) && PUGI__MSVC_CRT_VERSION >= 1400 && !defined(_WIN32_WCE)
+		// there are 64-bit versions of fseek/ftell, let's use them
+		typedef __int64 length_type;
+
+		_fseeki64(file, 0, SEEK_END);
+		length_type length = _ftelli64(file);
+		_fseeki64(file, 0, SEEK_SET);
+	#elif defined(__MINGW32__) && !defined(__NO_MINGW_LFS) && !defined(__STRICT_ANSI__)
+		// there are 64-bit versions of fseek/ftell, let's use them
+		typedef off64_t length_type;
+
+		fseeko64(file, 0, SEEK_END);
+		length_type length = ftello64(file);
+		fseeko64(file, 0, SEEK_SET);
+	#else
+		// if this is a 32-bit OS, long is enough; if this is a unix system, long is 64-bit, which is enough; otherwise we can't do anything anyway.
+		typedef long length_type;
+
+		fseek(file, 0, SEEK_END);
+		length_type length = ftell(file);
+		fseek(file, 0, SEEK_SET);
+	#endif
+
+		// check for I/O errors
+		if (length < 0) return status_io_error;
+		
+		// check for overflow
+		size_t result = static_cast<size_t>(length);
+
+		if (static_cast<length_type>(result) != length) return status_out_of_memory;
+
+		// finalize
+		out_result = result;
+
+		return status_ok;
+	}
+
+	PUGI__FN size_t zero_terminate_buffer(void* buffer, size_t size, xml_encoding encoding) 
+	{
+		// We only need to zero-terminate if encoding conversion does not do it for us
+	#ifdef PUGIXML_WCHAR_MODE
+		xml_encoding wchar_encoding = get_wchar_encoding();
+
+		if (encoding == wchar_encoding || need_endian_swap_utf(encoding, wchar_encoding))
+		{
+			size_t length = size / sizeof(char_t);
+
+			static_cast<char_t*>(buffer)[length] = 0;
+			return (length + 1) * sizeof(char_t);
+		}
+	#else
+		if (encoding == encoding_utf8)
+		{
+			static_cast<char*>(buffer)[size] = 0;
+			return size + 1;
+		}
+	#endif
+
+		return size;
+	}
+
+	PUGI__FN xml_parse_result load_file_impl(xml_document& doc, FILE* file, unsigned int options, xml_encoding encoding)
+	{
+		if (!file) return make_parse_result(status_file_not_found);
+
+		// get file size (can result in I/O errors)
+		size_t size = 0;
+		xml_parse_status size_status = get_file_size(file, size);
+
+		if (size_status != status_ok)
+		{
+			fclose(file);
+			return make_parse_result(size_status);
+		}
+		
+		size_t max_suffix_size = sizeof(char_t);
+
+		// allocate buffer for the whole file
+		char* contents = static_cast<char*>(xml_memory::allocate(size + max_suffix_size));
+
+		if (!contents)
+		{
+			fclose(file);
+			return make_parse_result(status_out_of_memory);
+		}
+
+		// read file in memory
+		size_t read_size = fread(contents, 1, size, file);
+		fclose(file);
+
+		if (read_size != size)
+		{
+			xml_memory::deallocate(contents);
+			return make_parse_result(status_io_error);
+		}
+
+		xml_encoding real_encoding = get_buffer_encoding(encoding, contents, size);
+		
+		return doc.load_buffer_inplace_own(contents, zero_terminate_buffer(contents, size, real_encoding), options, real_encoding);
+	}
+
+#ifndef PUGIXML_NO_STL
+	template <typename T> struct xml_stream_chunk
+	{
+		static xml_stream_chunk* create()
+		{
+			void* memory = xml_memory::allocate(sizeof(xml_stream_chunk));
+			
+			return new (memory) xml_stream_chunk();
+		}
+
+		static void destroy(void* ptr)
+		{
+			xml_stream_chunk* chunk = static_cast<xml_stream_chunk*>(ptr);
+
+			// free chunk chain
+			while (chunk)
+			{
+				xml_stream_chunk* next = chunk->next;
+				xml_memory::deallocate(chunk);
+				chunk = next;
+			}
+		}
+
+		xml_stream_chunk(): next(0), size(0)
+		{
+		}
+
+		xml_stream_chunk* next;
+		size_t size;
+
+		T data[xml_memory_page_size / sizeof(T)];
+	};
+
+	template <typename T> PUGI__FN xml_parse_status load_stream_data_noseek(std::basic_istream<T>& stream, void** out_buffer, size_t* out_size)
+	{
+		buffer_holder chunks(0, xml_stream_chunk<T>::destroy);
+
+		// read file to a chunk list
+		size_t total = 0;
+		xml_stream_chunk<T>* last = 0;
+
+		while (!stream.eof())
+		{
+			// allocate new chunk
+			xml_stream_chunk<T>* chunk = xml_stream_chunk<T>::create();
+			if (!chunk) return status_out_of_memory;
+
+			// append chunk to list
+			if (last) last = last->next = chunk;
+			else chunks.data = last = chunk;
+
+			// read data to chunk
+			stream.read(chunk->data, static_cast<std::streamsize>(sizeof(chunk->data) / sizeof(T)));
+			chunk->size = static_cast<size_t>(stream.gcount()) * sizeof(T);
+
+			// read may set failbit | eofbit in case gcount() is less than read length, so check for other I/O errors
+			if (stream.bad() || (!stream.eof() && stream.fail())) return status_io_error;
+
+			// guard against huge files (chunk size is small enough to make this overflow check work)
+			if (total + chunk->size < total) return status_out_of_memory;
+			total += chunk->size;
+		}
+
+		size_t max_suffix_size = sizeof(char_t);
+
+		// copy chunk list to a contiguous buffer
+		char* buffer = static_cast<char*>(xml_memory::allocate(total + max_suffix_size));
+		if (!buffer) return status_out_of_memory;
+
+		char* write = buffer;
+
+		for (xml_stream_chunk<T>* chunk = static_cast<xml_stream_chunk<T>*>(chunks.data); chunk; chunk = chunk->next)
+		{
+			assert(write + chunk->size <= buffer + total);
+			memcpy(write, chunk->data, chunk->size);
+			write += chunk->size;
+		}
+
+		assert(write == buffer + total);
+
+		// return buffer
+		*out_buffer = buffer;
+		*out_size = total;
+
+		return status_ok;
+	}
+
+	template <typename T> PUGI__FN xml_parse_status load_stream_data_seek(std::basic_istream<T>& stream, void** out_buffer, size_t* out_size)
+	{
+		// get length of remaining data in stream
+		typename std::basic_istream<T>::pos_type pos = stream.tellg();
+		stream.seekg(0, std::ios::end);
+		std::streamoff length = stream.tellg() - pos;
+		stream.seekg(pos);
+
+		if (stream.fail() || pos < 0) return status_io_error;
+
+		// guard against huge files
+		size_t read_length = static_cast<size_t>(length);
+
+		if (static_cast<std::streamsize>(read_length) != length || length < 0) return status_out_of_memory;
+
+		size_t max_suffix_size = sizeof(char_t);
+
+		// read stream data into memory (guard against stream exceptions with buffer holder)
+		buffer_holder buffer(xml_memory::allocate(read_length * sizeof(T) + max_suffix_size), xml_memory::deallocate);
+		if (!buffer.data) return status_out_of_memory;
+
+		stream.read(static_cast<T*>(buffer.data), static_cast<std::streamsize>(read_length));
+
+		// read may set failbit | eofbit in case gcount() is less than read_length (i.e. line ending conversion), so check for other I/O errors
+		if (stream.bad() || (!stream.eof() && stream.fail())) return status_io_error;
+
+		// return buffer
+		size_t actual_length = static_cast<size_t>(stream.gcount());
+		assert(actual_length <= read_length);
+		
+		*out_buffer = buffer.release();
+		*out_size = actual_length * sizeof(T);
+
+		return status_ok;
+	}
+
+	template <typename T> PUGI__FN xml_parse_result load_stream_impl(xml_document& doc, std::basic_istream<T>& stream, unsigned int options, xml_encoding encoding)
+	{
+		void* buffer = 0;
+		size_t size = 0;
+		xml_parse_status status = status_ok;
+
+		// if stream has an error bit set, bail out (otherwise tellg() can fail and we'll clear error bits)
+		if (stream.fail()) return make_parse_result(status_io_error);
+
+		// load stream to memory (using seek-based implementation if possible, since it's faster and takes less memory)
+		if (stream.tellg() < 0)
+		{
+			stream.clear(); // clear error flags that could be set by a failing tellg
+			status = load_stream_data_noseek(stream, &buffer, &size);
+		}
+		else
+			status = load_stream_data_seek(stream, &buffer, &size);
+
+		if (status != status_ok) return make_parse_result(status);
+
+		xml_encoding real_encoding = get_buffer_encoding(encoding, buffer, size);
+		
+		return doc.load_buffer_inplace_own(buffer, zero_terminate_buffer(buffer, size, real_encoding), options, real_encoding);
+	}
+#endif
+
+#if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__) || (defined(__MINGW32__) && !defined(__STRICT_ANSI__))
+	PUGI__FN FILE* open_file_wide(const wchar_t* path, const wchar_t* mode)
+	{
+		return _wfopen(path, mode);
+	}
+#else
+	PUGI__FN char* convert_path_heap(const wchar_t* str)
+	{
+		assert(str);
+
+		// first pass: get length in utf8 characters
+		size_t length = strlength_wide(str);
+		size_t size = as_utf8_begin(str, length);
+
+		// allocate resulting string
+		char* result = static_cast<char*>(xml_memory::allocate(size + 1));
+		if (!result) return 0;
+
+		// second pass: convert to utf8
+		as_utf8_end(result, size, str, length);
+
+		return result;
+	}
+
+	PUGI__FN FILE* open_file_wide(const wchar_t* path, const wchar_t* mode)
+	{
+		// there is no standard function to open wide paths, so our best bet is to try utf8 path
+		char* path_utf8 = convert_path_heap(path);
+		if (!path_utf8) return 0;
+
+		// convert mode to ASCII (we mirror _wfopen interface)
+		char mode_ascii[4] = {0};
+		for (size_t i = 0; mode[i]; ++i) mode_ascii[i] = static_cast<char>(mode[i]);
+
+		// try to open the utf8 path
+		FILE* result = fopen(path_utf8, mode_ascii);
+
+		// free dummy buffer
+		xml_memory::deallocate(path_utf8);
+
+		return result;
+	}
+#endif
+
+	PUGI__FN bool save_file_impl(const xml_document& doc, FILE* file, const char_t* indent, unsigned int flags, xml_encoding encoding)
+	{
+		if (!file) return false;
+
+		xml_writer_file writer(file);
+		doc.save(writer, indent, flags, encoding);
+
+		int result = ferror(file);
+
+		fclose(file);
+
+		return result == 0;
+	}
+
+	PUGI__FN xml_parse_result load_buffer_impl(xml_document_struct* doc, xml_node_struct* root, void* contents, size_t size, unsigned int options, xml_encoding encoding, bool is_mutable, bool own, char_t** out_buffer)
+	{
+		// check input buffer
+		assert(contents || size == 0);
+
+		// get actual encoding
+		xml_encoding buffer_encoding = impl::get_buffer_encoding(encoding, contents, size);
+
+		// get private buffer
+		char_t* buffer = 0;
+		size_t length = 0;
+
+		if (!impl::convert_buffer(buffer, length, buffer_encoding, contents, size, is_mutable)) return impl::make_parse_result(status_out_of_memory);
+		
+		// delete original buffer if we performed a conversion
+		if (own && buffer != contents && contents) impl::xml_memory::deallocate(contents);
+
+		// store buffer for offset_debug
+		doc->buffer = buffer;
+
+		// parse
+		xml_parse_result res = impl::xml_parser::parse(buffer, length, doc, root, options);
+
+		// remember encoding
+		res.encoding = buffer_encoding;
+
+		// grab onto buffer if it's our buffer, user is responsible for deallocating contents himself
+		if (own || buffer != contents) *out_buffer = buffer;
+
+		return res;
+	}
+PUGI__NS_END
+
+namespace pugi
+{
+	PUGI__FN xml_writer_file::xml_writer_file(void* file_): file(file_)
+	{
+	}
+
+	PUGI__FN void xml_writer_file::write(const void* data, size_t size)
+	{
+		size_t result = fwrite(data, 1, size, static_cast<FILE*>(file));
+		(void)!result; // unfortunately we can't do proper error handling here
+	}
+
+#ifndef PUGIXML_NO_STL
+	PUGI__FN xml_writer_stream::xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream): narrow_stream(&stream), wide_stream(0)
+	{
+	}
+
+	PUGI__FN xml_writer_stream::xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream): narrow_stream(0), wide_stream(&stream)
+	{
+	}
+
+	PUGI__FN void xml_writer_stream::write(const void* data, size_t size)
+	{
+		if (narrow_stream)
+		{
+			assert(!wide_stream);
+			narrow_stream->write(reinterpret_cast<const char*>(data), static_cast<std::streamsize>(size));
+		}
+		else
+		{
+			assert(wide_stream);
+			assert(size % sizeof(wchar_t) == 0);
+
+			wide_stream->write(reinterpret_cast<const wchar_t*>(data), static_cast<std::streamsize>(size / sizeof(wchar_t)));
+		}
+	}
+#endif
+
+	PUGI__FN xml_tree_walker::xml_tree_walker(): _depth(0)
+	{
+	}
+	
+	PUGI__FN xml_tree_walker::~xml_tree_walker()
+	{
+	}
+
+	PUGI__FN int xml_tree_walker::depth() const
+	{
+		return _depth;
+	}
+
+	PUGI__FN bool xml_tree_walker::begin(xml_node&)
+	{
+		return true;
+	}
+
+	PUGI__FN bool xml_tree_walker::end(xml_node&)
+	{
+		return true;
+	}
+
+	PUGI__FN xml_attribute::xml_attribute(): _attr(0)
+	{
+	}
+
+	PUGI__FN xml_attribute::xml_attribute(xml_attribute_struct* attr): _attr(attr)
+	{
+	}
+
+	PUGI__FN static void unspecified_bool_xml_attribute(xml_attribute***)
+	{
+	}
+
+	PUGI__FN xml_attribute::operator xml_attribute::unspecified_bool_type() const
+	{
+		return _attr ? unspecified_bool_xml_attribute : 0;
+	}
+
+	PUGI__FN bool xml_attribute::operator!() const
+	{
+		return !_attr;
+	}
+
+	PUGI__FN bool xml_attribute::operator==(const xml_attribute& r) const
+	{
+		return (_attr == r._attr);
+	}
+	
+	PUGI__FN bool xml_attribute::operator!=(const xml_attribute& r) const
+	{
+		return (_attr != r._attr);
+	}
+
+	PUGI__FN bool xml_attribute::operator<(const xml_attribute& r) const
+	{
+		return (_attr < r._attr);
+	}
+	
+	PUGI__FN bool xml_attribute::operator>(const xml_attribute& r) const
+	{
+		return (_attr > r._attr);
+	}
+	
+	PUGI__FN bool xml_attribute::operator<=(const xml_attribute& r) const
+	{
+		return (_attr <= r._attr);
+	}
+	
+	PUGI__FN bool xml_attribute::operator>=(const xml_attribute& r) const
+	{
+		return (_attr >= r._attr);
+	}
+
+	PUGI__FN xml_attribute xml_attribute::next_attribute() const
+	{
+		return _attr ? xml_attribute(_attr->next_attribute) : xml_attribute();
+	}
+
+	PUGI__FN xml_attribute xml_attribute::previous_attribute() const
+	{
+		return _attr && _attr->prev_attribute_c->next_attribute ? xml_attribute(_attr->prev_attribute_c) : xml_attribute();
+	}
+
+	PUGI__FN const char_t* xml_attribute::as_string(const char_t* def) const
+	{
+		return (_attr && _attr->value) ? _attr->value : def;
+	}
+
+	PUGI__FN int xml_attribute::as_int(int def) const
+	{
+		return impl::get_value_int(_attr ? _attr->value : 0, def);
+	}
+
+	PUGI__FN unsigned int xml_attribute::as_uint(unsigned int def) const
+	{
+		return impl::get_value_uint(_attr ? _attr->value : 0, def);
+	}
+
+	PUGI__FN double xml_attribute::as_double(double def) const
+	{
+		return impl::get_value_double(_attr ? _attr->value : 0, def);
+	}
+
+	PUGI__FN float xml_attribute::as_float(float def) const
+	{
+		return impl::get_value_float(_attr ? _attr->value : 0, def);
+	}
+
+	PUGI__FN bool xml_attribute::as_bool(bool def) const
+	{
+		return impl::get_value_bool(_attr ? _attr->value : 0, def);
+	}
+
+#ifdef PUGIXML_HAS_LONG_LONG
+	PUGI__FN long long xml_attribute::as_llong(long long def) const
+	{
+		return impl::get_value_llong(_attr ? _attr->value : 0, def);
+	}
+
+	PUGI__FN unsigned long long xml_attribute::as_ullong(unsigned long long def) const
+	{
+		return impl::get_value_ullong(_attr ? _attr->value : 0, def);
+	}
+#endif
+
+	PUGI__FN bool xml_attribute::empty() const
+	{
+		return !_attr;
+	}
+
+	PUGI__FN const char_t* xml_attribute::name() const
+	{
+		return (_attr && _attr->name) ? _attr->name : PUGIXML_TEXT("");
+	}
+
+	PUGI__FN const char_t* xml_attribute::value() const
+	{
+		return (_attr && _attr->value) ? _attr->value : PUGIXML_TEXT("");
+	}
+
+	PUGI__FN size_t xml_attribute::hash_value() const
+	{
+		return static_cast<size_t>(reinterpret_cast<uintptr_t>(_attr) / sizeof(xml_attribute_struct));
+	}
+
+	PUGI__FN xml_attribute_struct* xml_attribute::internal_object() const
+	{
+		return _attr;
+	}
+
+	PUGI__FN xml_attribute& xml_attribute::operator=(const char_t* rhs)
+	{
+		set_value(rhs);
+		return *this;
+	}
+	
+	PUGI__FN xml_attribute& xml_attribute::operator=(int rhs)
+	{
+		set_value(rhs);
+		return *this;
+	}
+
+	PUGI__FN xml_attribute& xml_attribute::operator=(unsigned int rhs)
+	{
+		set_value(rhs);
+		return *this;
+	}
+
+	PUGI__FN xml_attribute& xml_attribute::operator=(double rhs)
+	{
+		set_value(rhs);
+		return *this;
+	}
+	
+	PUGI__FN xml_attribute& xml_attribute::operator=(bool rhs)
+	{
+		set_value(rhs);
+		return *this;
+	}
+
+#ifdef PUGIXML_HAS_LONG_LONG
+	PUGI__FN xml_attribute& xml_attribute::operator=(long long rhs)
+	{
+		set_value(rhs);
+		return *this;
+	}
+
+	PUGI__FN xml_attribute& xml_attribute::operator=(unsigned long long rhs)
+	{
+		set_value(rhs);
+		return *this;
+	}
+#endif
+
+	PUGI__FN bool xml_attribute::set_name(const char_t* rhs)
+	{
+		if (!_attr) return false;
+		
+		return impl::strcpy_insitu(_attr->name, _attr->header, impl::xml_memory_page_name_allocated_mask, rhs);
+	}
+		
+	PUGI__FN bool xml_attribute::set_value(const char_t* rhs)
+	{
+		if (!_attr) return false;
+
+		return impl::strcpy_insitu(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+	}
+
+	PUGI__FN bool xml_attribute::set_value(int rhs)
+	{
+		if (!_attr) return false;
+
+		return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+	}
+
+	PUGI__FN bool xml_attribute::set_value(unsigned int rhs)
+	{
+		if (!_attr) return false;
+
+		return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+	}
+
+	PUGI__FN bool xml_attribute::set_value(double rhs)
+	{
+		if (!_attr) return false;
+
+		return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+	}
+	
+	PUGI__FN bool xml_attribute::set_value(bool rhs)
+	{
+		if (!_attr) return false;
+
+		return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+	}
+
+#ifdef PUGIXML_HAS_LONG_LONG
+	PUGI__FN bool xml_attribute::set_value(long long rhs)
+	{
+		if (!_attr) return false;
+
+		return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+	}
+
+	PUGI__FN bool xml_attribute::set_value(unsigned long long rhs)
+	{
+		if (!_attr) return false;
+
+		return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+	}
+#endif
+
+#ifdef __BORLANDC__
+	PUGI__FN bool operator&&(const xml_attribute& lhs, bool rhs)
+	{
+		return (bool)lhs && rhs;
+	}
+
+	PUGI__FN bool operator||(const xml_attribute& lhs, bool rhs)
+	{
+		return (bool)lhs || rhs;
+	}
+#endif
+
+	PUGI__FN xml_node::xml_node(): _root(0)
+	{
+	}
+
+	PUGI__FN xml_node::xml_node(xml_node_struct* p): _root(p)
+	{
+	}
+	
+	PUGI__FN static void unspecified_bool_xml_node(xml_node***)
+	{
+	}
+
+	PUGI__FN xml_node::operator xml_node::unspecified_bool_type() const
+	{
+		return _root ? unspecified_bool_xml_node : 0;
+	}
+
+	PUGI__FN bool xml_node::operator!() const
+	{
+		return !_root;
+	}
+
+	PUGI__FN xml_node::iterator xml_node::begin() const
+	{
+		return iterator(_root ? _root->first_child : 0, _root);
+	}
+
+	PUGI__FN xml_node::iterator xml_node::end() const
+	{
+		return iterator(0, _root);
+	}
+	
+	PUGI__FN xml_node::attribute_iterator xml_node::attributes_begin() const
+	{
+		return attribute_iterator(_root ? _root->first_attribute : 0, _root);
+	}
+
+	PUGI__FN xml_node::attribute_iterator xml_node::attributes_end() const
+	{
+		return attribute_iterator(0, _root);
+	}
+	
+	PUGI__FN xml_object_range<xml_node_iterator> xml_node::children() const
+	{
+		return xml_object_range<xml_node_iterator>(begin(), end());
+	}
+
+	PUGI__FN xml_object_range<xml_named_node_iterator> xml_node::children(const char_t* name_) const
+	{
+		return xml_object_range<xml_named_node_iterator>(xml_named_node_iterator(child(name_)._root, _root, name_), xml_named_node_iterator(0, _root, name_));
+	}
+
+	PUGI__FN xml_object_range<xml_attribute_iterator> xml_node::attributes() const
+	{
+		return xml_object_range<xml_attribute_iterator>(attributes_begin(), attributes_end());
+	}
+
+	PUGI__FN bool xml_node::operator==(const xml_node& r) const
+	{
+		return (_root == r._root);
+	}
+
+	PUGI__FN bool xml_node::operator!=(const xml_node& r) const
+	{
+		return (_root != r._root);
+	}
+
+	PUGI__FN bool xml_node::operator<(const xml_node& r) const
+	{
+		return (_root < r._root);
+	}
+	
+	PUGI__FN bool xml_node::operator>(const xml_node& r) const
+	{
+		return (_root > r._root);
+	}
+	
+	PUGI__FN bool xml_node::operator<=(const xml_node& r) const
+	{
+		return (_root <= r._root);
+	}
+	
+	PUGI__FN bool xml_node::operator>=(const xml_node& r) const
+	{
+		return (_root >= r._root);
+	}
+
+	PUGI__FN bool xml_node::empty() const
+	{
+		return !_root;
+	}
+	
+	PUGI__FN const char_t* xml_node::name() const
+	{
+		return (_root && _root->name) ? _root->name : PUGIXML_TEXT("");
+	}
+
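+	// the node type is stored as (type - 1) in the low bits of the header word, hence the + 1 on extraction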
+	PUGI__FN xml_node_type xml_node::type() const
+	{
+		return _root ? static_cast<xml_node_type>((_root->header & impl::xml_memory_page_type_mask) + 1) : node_null;
+	}
+	
+	PUGI__FN const char_t* xml_node::value() const
+	{
+		return (_root && _root->value) ? _root->value : PUGIXML_TEXT("");
+	}
+	
+	PUGI__FN xml_node xml_node::child(const char_t* name_) const
+	{
+		if (!_root) return xml_node();
+
+		for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+			if (i->name && impl::strequal(name_, i->name)) return xml_node(i);
+
+		return xml_node();
+	}
+
+	PUGI__FN xml_attribute xml_node::attribute(const char_t* name_) const
+	{
+		if (!_root) return xml_attribute();
+
+		for (xml_attribute_struct* i = _root->first_attribute; i; i = i->next_attribute)
+			if (i->name && impl::strequal(name_, i->name))
+				return xml_attribute(i);
+		
+		return xml_attribute();
+	}
+	
+	PUGI__FN xml_node xml_node::next_sibling(const char_t* name_) const
+	{
+		if (!_root) return xml_node();
+		
+		for (xml_node_struct* i = _root->next_sibling; i; i = i->next_sibling)
+			if (i->name && impl::strequal(name_, i->name)) return xml_node(i);
+
+		return xml_node();
+	}
+
+	PUGI__FN xml_node xml_node::next_sibling() const
+	{
+		if (!_root) return xml_node();
+		
+		if (_root->next_sibling) return xml_node(_root->next_sibling);
+		else return xml_node();
+	}
+
+	PUGI__FN xml_node xml_node::previous_sibling(const char_t* name_) const
+	{
+		if (!_root) return xml_node();
+		
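+		// sibling links are cyclic at the front: the first child's prev_sibling_c points to the last child (whose next_sibling is null), which is what terminates this backwards walk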
+		for (xml_node_struct* i = _root->prev_sibling_c; i->next_sibling; i = i->prev_sibling_c)
+			if (i->name && impl::strequal(name_, i->name)) return xml_node(i);
+
+		return xml_node();
+	}
+
+	PUGI__FN xml_node xml_node::previous_sibling() const
+	{
+		if (!_root) return xml_node();
+		
+		if (_root->prev_sibling_c->next_sibling) return xml_node(_root->prev_sibling_c);
+		else return xml_node();
+	}
+
+	PUGI__FN xml_node xml_node::parent() const
+	{
+		return _root ? xml_node(_root->parent) : xml_node();
+	}
+
+	PUGI__FN xml_node xml_node::root() const
+	{
+		if (!_root) return xml_node();
+
+		impl::xml_memory_page* page = reinterpret_cast<impl::xml_memory_page*>(_root->header & impl::xml_memory_page_pointer_mask);
+
+		return xml_node(static_cast<impl::xml_document_struct*>(page->allocator));
+	}
+
+	PUGI__FN xml_text xml_node::text() const
+	{
+		return xml_text(_root);
+	}
+
+	PUGI__FN const char_t* xml_node::child_value() const
+	{
+		if (!_root) return PUGIXML_TEXT("");
+		
+		for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+			if (i->value && impl::is_text_node(i))
+				return i->value;
+
+		return PUGIXML_TEXT("");
+	}
+
+	PUGI__FN const char_t* xml_node::child_value(const char_t* name_) const
+	{
+		return child(name_).child_value();
+	}
+
+	PUGI__FN xml_attribute xml_node::first_attribute() const
+	{
+		return _root ? xml_attribute(_root->first_attribute) : xml_attribute();
+	}
+
+	PUGI__FN xml_attribute xml_node::last_attribute() const
+	{
+		return _root && _root->first_attribute ? xml_attribute(_root->first_attribute->prev_attribute_c) : xml_attribute();
+	}
+
+	PUGI__FN xml_node xml_node::first_child() const
+	{
+		return _root ? xml_node(_root->first_child) : xml_node();
+	}
+
+	PUGI__FN xml_node xml_node::last_child() const
+	{
+		return _root && _root->first_child ? xml_node(_root->first_child->prev_sibling_c) : xml_node();
+	}
+
+	PUGI__FN bool xml_node::set_name(const char_t* rhs)
+	{
+		switch (type())
+		{
+		case node_pi:
+		case node_declaration:
+		case node_element:
+			return impl::strcpy_insitu(_root->name, _root->header, impl::xml_memory_page_name_allocated_mask, rhs);
+
+		default:
+			return false;
+		}
+	}
+		
+	PUGI__FN bool xml_node::set_value(const char_t* rhs)
+	{
+		switch (type())
+		{
+		case node_pi:
+		case node_cdata:
+		case node_pcdata:
+		case node_comment:
+		case node_doctype:
+			return impl::strcpy_insitu(_root->value, _root->header, impl::xml_memory_page_value_allocated_mask, rhs);
+
+		default:
+			return false;
+		}
+	}
+
+	PUGI__FN xml_attribute xml_node::append_attribute(const char_t* name_)
+	{
+		if (type() != node_element && type() != node_declaration) return xml_attribute();
+		
+		xml_attribute a(impl::append_attribute_ll(_root, impl::get_allocator(_root)));
+		a.set_name(name_);
+		
+		return a;
+	}
+
+	PUGI__FN xml_attribute xml_node::prepend_attribute(const char_t* name_)
+	{
+		if (type() != node_element && type() != node_declaration) return xml_attribute();
+		
+		xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root)));
+		if (!a) return xml_attribute();
+
+		a.set_name(name_);
+		
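+		// attribute links are cyclic at the front: the first attribute's prev_attribute_c points to the last attribute, while the last attribute's next_attribute stays null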
+		xml_attribute_struct* head = _root->first_attribute;
+
+		if (head)
+		{
+			a._attr->prev_attribute_c = head->prev_attribute_c;
+			head->prev_attribute_c = a._attr;
+		}
+		else
+			a._attr->prev_attribute_c = a._attr;
+		
+		a._attr->next_attribute = head;
+		_root->first_attribute = a._attr;
+				
+		return a;
+	}
+
+	PUGI__FN xml_attribute xml_node::insert_attribute_before(const char_t* name_, const xml_attribute& attr)
+	{
+		if ((type() != node_element && type() != node_declaration) || attr.empty()) return xml_attribute();
+		
+		// check that attribute belongs to *this
+		xml_attribute_struct* cur = attr._attr;
+
+		while (cur->prev_attribute_c->next_attribute) cur = cur->prev_attribute_c;
+
+		if (cur != _root->first_attribute) return xml_attribute();
+
+		xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root)));
+		if (!a) return xml_attribute();
+
+		a.set_name(name_);
+
+		if (attr._attr->prev_attribute_c->next_attribute)
+			attr._attr->prev_attribute_c->next_attribute = a._attr;
+		else
+			_root->first_attribute = a._attr;
+		
+		a._attr->prev_attribute_c = attr._attr->prev_attribute_c;
+		a._attr->next_attribute = attr._attr;
+		attr._attr->prev_attribute_c = a._attr;
+				
+		return a;
+	}
+
+	PUGI__FN xml_attribute xml_node::insert_attribute_after(const char_t* name_, const xml_attribute& attr)
+	{
+		if ((type() != node_element && type() != node_declaration) || attr.empty()) return xml_attribute();
+		
+		// check that attribute belongs to *this
+		xml_attribute_struct* cur = attr._attr;
+
+		while (cur->prev_attribute_c->next_attribute) cur = cur->prev_attribute_c;
+
+		if (cur != _root->first_attribute) return xml_attribute();
+
+		xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root)));
+		if (!a) return xml_attribute();
+
+		a.set_name(name_);
+
+		if (attr._attr->next_attribute)
+			attr._attr->next_attribute->prev_attribute_c = a._attr;
+		else
+			_root->first_attribute->prev_attribute_c = a._attr;
+		
+		a._attr->next_attribute = attr._attr->next_attribute;
+		a._attr->prev_attribute_c = attr._attr;
+		attr._attr->next_attribute = a._attr;
+
+		return a;
+	}
+
+	PUGI__FN xml_attribute xml_node::append_copy(const xml_attribute& proto)
+	{
+		if (!proto) return xml_attribute();
+
+		xml_attribute result = append_attribute(proto.name());
+		result.set_value(proto.value());
+
+		return result;
+	}
+
+	PUGI__FN xml_attribute xml_node::prepend_copy(const xml_attribute& proto)
+	{
+		if (!proto) return xml_attribute();
+
+		xml_attribute result = prepend_attribute(proto.name());
+		result.set_value(proto.value());
+
+		return result;
+	}
+
+	PUGI__FN xml_attribute xml_node::insert_copy_after(const xml_attribute& proto, const xml_attribute& attr)
+	{
+		if (!proto) return xml_attribute();
+
+		xml_attribute result = insert_attribute_after(proto.name(), attr);
+		result.set_value(proto.value());
+
+		return result;
+	}
+
+	PUGI__FN xml_attribute xml_node::insert_copy_before(const xml_attribute& proto, const xml_attribute& attr)
+	{
+		if (!proto) return xml_attribute();
+
+		xml_attribute result = insert_attribute_before(proto.name(), attr);
+		result.set_value(proto.value());
+
+		return result;
+	}
+
+	PUGI__FN xml_node xml_node::append_child(xml_node_type type_)
+	{
+		if (!impl::allow_insert_child(this->type(), type_)) return xml_node();
+		
+		xml_node n(impl::append_node(_root, impl::get_allocator(_root), type_));
+
+		if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
+
+		return n;
+	}
+
+	PUGI__FN xml_node xml_node::prepend_child(xml_node_type type_)
+	{
+		if (!impl::allow_insert_child(this->type(), type_)) return xml_node();
+		
+		xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+		if (!n) return xml_node();
+
+		n._root->parent = _root;
+
+		xml_node_struct* head = _root->first_child;
+
+		if (head)
+		{
+			n._root->prev_sibling_c = head->prev_sibling_c;
+			head->prev_sibling_c = n._root;
+		}
+		else
+			n._root->prev_sibling_c = n._root;
+		
+		n._root->next_sibling = head;
+		_root->first_child = n._root;
+				
+		if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
+
+		return n;
+	}
+
+	PUGI__FN xml_node xml_node::insert_child_before(xml_node_type type_, const xml_node& node)
+	{
+		if (!impl::allow_insert_child(this->type(), type_)) return xml_node();
+		if (!node._root || node._root->parent != _root) return xml_node();
+	
+		xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+		if (!n) return xml_node();
+
+		n._root->parent = _root;
+		
+		if (node._root->prev_sibling_c->next_sibling)
+			node._root->prev_sibling_c->next_sibling = n._root;
+		else
+			_root->first_child = n._root;
+		
+		n._root->prev_sibling_c = node._root->prev_sibling_c;
+		n._root->next_sibling = node._root;
+		node._root->prev_sibling_c = n._root;
+
+		if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
+
+		return n;
+	}
+
+	PUGI__FN xml_node xml_node::insert_child_after(xml_node_type type_, const xml_node& node)
+	{
+		if (!impl::allow_insert_child(this->type(), type_)) return xml_node();
+		if (!node._root || node._root->parent != _root) return xml_node();
+	
+		xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+		if (!n) return xml_node();
+
+		n._root->parent = _root;
+	
+		if (node._root->next_sibling)
+			node._root->next_sibling->prev_sibling_c = n._root;
+		else
+			_root->first_child->prev_sibling_c = n._root;
+		
+		n._root->next_sibling = node._root->next_sibling;
+		n._root->prev_sibling_c = node._root;
+		node._root->next_sibling = n._root;
+
+		if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
+
+		return n;
+	}
+
+	PUGI__FN xml_node xml_node::append_child(const char_t* name_)
+	{
+		xml_node result = append_child(node_element);
+
+		result.set_name(name_);
+
+		return result;
+	}
+
+	PUGI__FN xml_node xml_node::prepend_child(const char_t* name_)
+	{
+		xml_node result = prepend_child(node_element);
+
+		result.set_name(name_);
+
+		return result;
+	}
+
+	PUGI__FN xml_node xml_node::insert_child_after(const char_t* name_, const xml_node& node)
+	{
+		xml_node result = insert_child_after(node_element, node);
+
+		result.set_name(name_);
+
+		return result;
+	}
+
+	PUGI__FN xml_node xml_node::insert_child_before(const char_t* name_, const xml_node& node)
+	{
+		xml_node result = insert_child_before(node_element, node);
+
+		result.set_name(name_);
+
+		return result;
+	}
+
+	PUGI__FN xml_node xml_node::append_copy(const xml_node& proto)
+	{
+		xml_node result = append_child(proto.type());
+
+		if (result) impl::recursive_copy_skip(result, proto, result);
+
+		return result;
+	}
+
+	PUGI__FN xml_node xml_node::prepend_copy(const xml_node& proto)
+	{
+		xml_node result = prepend_child(proto.type());
+
+		if (result) impl::recursive_copy_skip(result, proto, result);
+
+		return result;
+	}
+
+	PUGI__FN xml_node xml_node::insert_copy_after(const xml_node& proto, const xml_node& node)
+	{
+		xml_node result = insert_child_after(proto.type(), node);
+
+		if (result) impl::recursive_copy_skip(result, proto, result);
+
+		return result;
+	}
+
+	PUGI__FN xml_node xml_node::insert_copy_before(const xml_node& proto, const xml_node& node)
+	{
+		xml_node result = insert_child_before(proto.type(), node);
+
+		if (result) impl::recursive_copy_skip(result, proto, result);
+
+		return result;
+	}
+
+	PUGI__FN bool xml_node::remove_attribute(const char_t* name_)
+	{
+		return remove_attribute(attribute(name_));
+	}
+
+	PUGI__FN bool xml_node::remove_attribute(const xml_attribute& a)
+	{
+		if (!_root || !a._attr) return false;
+
+		// check that attribute belongs to *this
+		xml_attribute_struct* attr = a._attr;
+
+		while (attr->prev_attribute_c->next_attribute) attr = attr->prev_attribute_c;
+
+		if (attr != _root->first_attribute) return false;
+
+		if (a._attr->next_attribute) a._attr->next_attribute->prev_attribute_c = a._attr->prev_attribute_c;
+		else if (_root->first_attribute) _root->first_attribute->prev_attribute_c = a._attr->prev_attribute_c;
+		
+		if (a._attr->prev_attribute_c->next_attribute) a._attr->prev_attribute_c->next_attribute = a._attr->next_attribute;
+		else _root->first_attribute = a._attr->next_attribute;
+
+		impl::destroy_attribute(a._attr, impl::get_allocator(_root));
+
+		return true;
+	}
+
+	PUGI__FN bool xml_node::remove_child(const char_t* name_)
+	{
+		return remove_child(child(name_));
+	}
+
+	PUGI__FN bool xml_node::remove_child(const xml_node& n)
+	{
+		if (!_root || !n._root || n._root->parent != _root) return false;
+
+		if (n._root->next_sibling) n._root->next_sibling->prev_sibling_c = n._root->prev_sibling_c;
+		else if (_root->first_child) _root->first_child->prev_sibling_c = n._root->prev_sibling_c;
+		
+		if (n._root->prev_sibling_c->next_sibling) n._root->prev_sibling_c->next_sibling = n._root->next_sibling;
+		else _root->first_child = n._root->next_sibling;
+		
+		impl::destroy_node(n._root, impl::get_allocator(_root));
+
+		return true;
+	}
+
+	PUGI__FN xml_parse_result xml_node::append_buffer(const void* contents, size_t size, unsigned int options, xml_encoding encoding)
+	{
+		// append_buffer is only valid for elements/documents
+		if (!impl::allow_insert_child(type(), node_element)) return impl::make_parse_result(status_append_invalid_root);
+
+		// get document node
+		impl::xml_document_struct* doc = static_cast<impl::xml_document_struct*>(root()._root);
+		assert(doc);
+		
+		// get extra buffer element (we'll store the document fragment buffer there so that we can deallocate it later)
+		impl::xml_memory_page* page = 0;
+		impl::xml_extra_buffer* extra = static_cast<impl::xml_extra_buffer*>(doc->allocate_memory(sizeof(impl::xml_extra_buffer), page));
+		(void)page;
+
+		if (!extra) return impl::make_parse_result(status_out_of_memory);
+
+		// save name; name of the root has to be NULL before parsing - otherwise closing node mismatches will not be detected at the top level
+		char_t* rootname = _root->name;
+		_root->name = 0;
+
+		// parse
+		char_t* buffer = 0;
+		xml_parse_result res = impl::load_buffer_impl(doc, _root, const_cast<void*>(contents), size, options, encoding, false, false, &buffer);
+
+		// restore name
+		_root->name = rootname;
+
+		// add extra buffer to the list
+		extra->buffer = buffer;
+		extra->next = doc->extra_buffers;
+		doc->extra_buffers = extra;
+
+		return res;
+	}
+
+	PUGI__FN xml_node xml_node::find_child_by_attribute(const char_t* name_, const char_t* attr_name, const char_t* attr_value) const
+	{
+		if (!_root) return xml_node();
+		
+		for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+			if (i->name && impl::strequal(name_, i->name))
+			{
+				for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute)
+					if (a->name && impl::strequal(attr_name, a->name) && impl::strequal(attr_value, a->value ? a->value : PUGIXML_TEXT("")))
+						return xml_node(i);
+			}
+
+		return xml_node();
+	}
+
+	PUGI__FN xml_node xml_node::find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const
+	{
+		if (!_root) return xml_node();
+		
+		for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+			for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute)
+				if (a->name && impl::strequal(attr_name, a->name) && impl::strequal(attr_value, a->value ? a->value : PUGIXML_TEXT("")))
+					return xml_node(i);
+
+		return xml_node();
+	}
+
+#ifndef PUGIXML_NO_STL
+	PUGI__FN string_t xml_node::path(char_t delimiter) const
+	{
+		xml_node cursor = *this; // Make a copy.
+		
+		string_t result = cursor.name();
+
+		while (cursor.parent())
+		{
+			cursor = cursor.parent();
+			
+			string_t temp = cursor.name();
+			temp += delimiter;
+			temp += result;
+			result.swap(temp);
+		}
+
+		return result;
+	}
+#endif
+
+	PUGI__FN xml_node xml_node::first_element_by_path(const char_t* path_, char_t delimiter) const
+	{
+		xml_node found = *this; // Current search context.
+
+		if (!_root || !path_ || !path_[0]) return found;
+
+		if (path_[0] == delimiter)
+		{
+			// Absolute path; e.g. '/foo/bar'
+			found = found.root();
+			++path_;
+		}
+
+		const char_t* path_segment = path_;
+
+		while (*path_segment == delimiter) ++path_segment;
+
+		const char_t* path_segment_end = path_segment;
+
+		while (*path_segment_end && *path_segment_end != delimiter) ++path_segment_end;
+
+		if (path_segment == path_segment_end) return found;
+
+		const char_t* next_segment = path_segment_end;
+
+		while (*next_segment == delimiter) ++next_segment;
+
+		if (*path_segment == '.' && path_segment + 1 == path_segment_end)
+			return found.first_element_by_path(next_segment, delimiter);
+		else if (*path_segment == '.' && *(path_segment+1) == '.' && path_segment + 2 == path_segment_end)
+			return found.parent().first_element_by_path(next_segment, delimiter);
+		else
+		{
+			for (xml_node_struct* j = found._root->first_child; j; j = j->next_sibling)
+			{
+				if (j->name && impl::strequalrange(j->name, path_segment, static_cast<size_t>(path_segment_end - path_segment)))
+				{
+					xml_node subsearch = xml_node(j).first_element_by_path(next_segment, delimiter);
+
+					if (subsearch) return subsearch;
+				}
+			}
+
+			return xml_node();
+		}
+	}
+
+	PUGI__FN bool xml_node::traverse(xml_tree_walker& walker)
+	{
+		walker._depth = -1;
+		
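+		// pre-order depth-first walk; _depth is -1 for the begin()/end() callbacks and counts the nesting level of cur below *this (0 for direct children) inside the loop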
+		xml_node arg_begin = *this;
+		if (!walker.begin(arg_begin)) return false;
+
+		xml_node cur = first_child();
+				
+		if (cur)
+		{
+			++walker._depth;
+
+			do 
+			{
+				xml_node arg_for_each = cur;
+				if (!walker.for_each(arg_for_each))
+					return false;
+						
+				if (cur.first_child())
+				{
+					++walker._depth;
+					cur = cur.first_child();
+				}
+				else if (cur.next_sibling())
+					cur = cur.next_sibling();
+				else
+				{
+					// Borland C++ workaround
+					while (!cur.next_sibling() && cur != *this && !cur.parent().empty())
+					{
+						--walker._depth;
+						cur = cur.parent();
+					}
+						
+					if (cur != *this)
+						cur = cur.next_sibling();
+				}
+			}
+			while (cur && cur != *this);
+		}
+
+		assert(walker._depth == -1);
+
+		xml_node arg_end = *this;
+		return walker.end(arg_end);
+	}
+
+	PUGI__FN size_t xml_node::hash_value() const
+	{
+		return static_cast<size_t>(reinterpret_cast<uintptr_t>(_root) / sizeof(xml_node_struct));
+	}
+
+	PUGI__FN xml_node_struct* xml_node::internal_object() const
+	{
+		return _root;
+	}
+
+	PUGI__FN void xml_node::print(xml_writer& writer, const char_t* indent, unsigned int flags, xml_encoding encoding, unsigned int depth) const
+	{
+		if (!_root) return;
+
+		impl::xml_buffered_writer buffered_writer(writer, encoding);
+
+		impl::node_output(buffered_writer, *this, indent, flags, depth);
+	}
+
+#ifndef PUGIXML_NO_STL
+	PUGI__FN void xml_node::print(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent, unsigned int flags, xml_encoding encoding, unsigned int depth) const
+	{
+		xml_writer_stream writer(stream);
+
+		print(writer, indent, flags, encoding, depth);
+	}
+
+	PUGI__FN void xml_node::print(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent, unsigned int flags, unsigned int depth) const
+	{
+		xml_writer_stream writer(stream);
+
+		print(writer, indent, flags, encoding_wchar, depth);
+	}
+#endif
+
+	PUGI__FN ptrdiff_t xml_node::offset_debug() const
+	{
+		xml_node_struct* r = root()._root;
+
+		if (!r) return -1;
+
+		const char_t* buffer = static_cast<impl::xml_document_struct*>(r)->buffer;
+
+		if (!buffer) return -1;
+
+		switch (type())
+		{
+		case node_document:
+			return 0;
+
+		case node_element:
+		case node_declaration:
+		case node_pi:
+			return (_root->header & impl::xml_memory_page_name_allocated_mask) ? -1 : _root->name - buffer;
+
+		case node_pcdata:
+		case node_cdata:
+		case node_comment:
+		case node_doctype:
+			return (_root->header & impl::xml_memory_page_value_allocated_mask) ? -1 : _root->value - buffer;
+
+		default:
+			return -1;
+		}
+	}
+
+#ifdef __BORLANDC__
+	PUGI__FN bool operator&&(const xml_node& lhs, bool rhs)
+	{
+		return (bool)lhs && rhs;
+	}
+
+	PUGI__FN bool operator||(const xml_node& lhs, bool rhs)
+	{
+		return (bool)lhs || rhs;
+	}
+#endif
+
+	PUGI__FN xml_text::xml_text(xml_node_struct* root): _root(root)
+	{
+	}
+
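+	// locate the backing text node: _root itself if it is pcdata/cdata, otherwise its first pcdata/cdata child (0 if none)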
+	PUGI__FN xml_node_struct* xml_text::_data() const
+	{
+		if (!_root || impl::is_text_node(_root)) return _root;
+
+		for (xml_node_struct* node = _root->first_child; node; node = node->next_sibling)
+			if (impl::is_text_node(node))
+				return node;
+
+		return 0;
+	}
+
+	PUGI__FN xml_node_struct* xml_text::_data_new()
+	{
+		xml_node_struct* d = _data();
+		if (d) return d;
+
+		return xml_node(_root).append_child(node_pcdata).internal_object();
+	}
+
+	PUGI__FN xml_text::xml_text(): _root(0)
+	{
+	}
+
+	PUGI__FN static void unspecified_bool_xml_text(xml_text***)
+	{
+	}
+
+	PUGI__FN xml_text::operator xml_text::unspecified_bool_type() const
+	{
+		return _data() ? unspecified_bool_xml_text : 0;
+	}
+
+	PUGI__FN bool xml_text::operator!() const
+	{
+		return !_data();
+	}
+
+	PUGI__FN bool xml_text::empty() const
+	{
+		return _data() == 0;
+	}
+
+	PUGI__FN const char_t* xml_text::get() const
+	{
+		xml_node_struct* d = _data();
+
+		return (d && d->value) ? d->value : PUGIXML_TEXT("");
+	}
+
+	PUGI__FN const char_t* xml_text::as_string(const char_t* def) const
+	{
+		xml_node_struct* d = _data();
+
+		return (d && d->value) ? d->value : def;
+	}
+
+	PUGI__FN int xml_text::as_int(int def) const
+	{
+		xml_node_struct* d = _data();
+
+		return impl::get_value_int(d ? d->value : 0, def);
+	}
+
+	PUGI__FN unsigned int xml_text::as_uint(unsigned int def) const
+	{
+		xml_node_struct* d = _data();
+
+		return impl::get_value_uint(d ? d->value : 0, def);
+	}
+
+	PUGI__FN double xml_text::as_double(double def) const
+	{
+		xml_node_struct* d = _data();
+
+		return impl::get_value_double(d ? d->value : 0, def);
+	}
+
+	PUGI__FN float xml_text::as_float(float def) const
+	{
+		xml_node_struct* d = _data();
+
+		return impl::get_value_float(d ? d->value : 0, def);
+	}
+
+	PUGI__FN bool xml_text::as_bool(bool def) const
+	{
+		xml_node_struct* d = _data();
+
+		return impl::get_value_bool(d ? d->value : 0, def);
+	}
+
+#ifdef PUGIXML_HAS_LONG_LONG
+	PUGI__FN long long xml_text::as_llong(long long def) const
+	{
+		xml_node_struct* d = _data();
+
+		return impl::get_value_llong(d ? d->value : 0, def);
+	}
+
+	PUGI__FN unsigned long long xml_text::as_ullong(unsigned long long def) const
+	{
+		xml_node_struct* d = _data();
+
+		return impl::get_value_ullong(d ? d->value : 0, def);
+	}
+#endif
+
+	PUGI__FN bool xml_text::set(const char_t* rhs)
+	{
+		xml_node_struct* dn = _data_new();
+
+		return dn ? impl::strcpy_insitu(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+	}
+
+	PUGI__FN bool xml_text::set(int rhs)
+	{
+		xml_node_struct* dn = _data_new();
+
+		return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+	}
+
+	PUGI__FN bool xml_text::set(unsigned int rhs)
+	{
+		xml_node_struct* dn = _data_new();
+
+		return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+	}
+
+	PUGI__FN bool xml_text::set(double rhs)
+	{
+		xml_node_struct* dn = _data_new();
+
+		return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+	}
+
+	PUGI__FN bool xml_text::set(bool rhs)
+	{
+		xml_node_struct* dn = _data_new();
+
+		return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+	}
+
+#ifdef PUGIXML_HAS_LONG_LONG
+	PUGI__FN bool xml_text::set(long long rhs)
+	{
+		xml_node_struct* dn = _data_new();
+
+		return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+	}
+
+	PUGI__FN bool xml_text::set(unsigned long long rhs)
+	{
+		xml_node_struct* dn = _data_new();
+
+		return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+	}
+#endif
+
+	PUGI__FN xml_text& xml_text::operator=(const char_t* rhs)
+	{
+		set(rhs);
+		return *this;
+	}
+
+	PUGI__FN xml_text& xml_text::operator=(int rhs)
+	{
+		set(rhs);
+		return *this;
+	}
+
+	PUGI__FN xml_text& xml_text::operator=(unsigned int rhs)
+	{
+		set(rhs);
+		return *this;
+	}
+
+	PUGI__FN xml_text& xml_text::operator=(double rhs)
+	{
+		set(rhs);
+		return *this;
+	}
+
+	PUGI__FN xml_text& xml_text::operator=(bool rhs)
+	{
+		set(rhs);
+		return *this;
+	}
+
+#ifdef PUGIXML_HAS_LONG_LONG
+	PUGI__FN xml_text& xml_text::operator=(long long rhs)
+	{
+		set(rhs);
+		return *this;
+	}
+
+	PUGI__FN xml_text& xml_text::operator=(unsigned long long rhs)
+	{
+		set(rhs);
+		return *this;
+	}
+#endif
+
+	PUGI__FN xml_node xml_text::data() const
+	{
+		return xml_node(_data());
+	}
+
+#ifdef __BORLANDC__
+	PUGI__FN bool operator&&(const xml_text& lhs, bool rhs)
+	{
+		return (bool)lhs && rhs;
+	}
+
+	PUGI__FN bool operator||(const xml_text& lhs, bool rhs)
+	{
+		return (bool)lhs || rhs;
+	}
+#endif
+
+	PUGI__FN xml_node_iterator::xml_node_iterator()
+	{
+	}
+
+	PUGI__FN xml_node_iterator::xml_node_iterator(const xml_node& node): _wrap(node), _parent(node.parent())
+	{
+	}
+
+	PUGI__FN xml_node_iterator::xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent): _wrap(ref), _parent(parent)
+	{
+	}
+
+	PUGI__FN bool xml_node_iterator::operator==(const xml_node_iterator& rhs) const
+	{
+		return _wrap._root == rhs._wrap._root && _parent._root == rhs._parent._root;
+	}
+	
+	PUGI__FN bool xml_node_iterator::operator!=(const xml_node_iterator& rhs) const
+	{
+		return _wrap._root != rhs._wrap._root || _parent._root != rhs._parent._root;
+	}
+
+	PUGI__FN xml_node& xml_node_iterator::operator*() const
+	{
+		assert(_wrap._root);
+		return _wrap;
+	}
+
+	PUGI__FN xml_node* xml_node_iterator::operator->() const
+	{
+		assert(_wrap._root);
+		return const_cast<xml_node*>(&_wrap); // BCC32 workaround
+	}
+
+	PUGI__FN const xml_node_iterator& xml_node_iterator::operator++()
+	{
+		assert(_wrap._root);
+		_wrap._root = _wrap._root->next_sibling;
+		return *this;
+	}
+
+	PUGI__FN xml_node_iterator xml_node_iterator::operator++(int)
+	{
+		xml_node_iterator temp = *this;
+		++*this;
+		return temp;
+	}
+
+	PUGI__FN const xml_node_iterator& xml_node_iterator::operator--()
+	{
+		_wrap = _wrap._root ? _wrap.previous_sibling() : _parent.last_child();
+		return *this;
+	}
+
+	PUGI__FN xml_node_iterator xml_node_iterator::operator--(int)
+	{
+		xml_node_iterator temp = *this;
+		--*this;
+		return temp;
+	}
+
+	PUGI__FN xml_attribute_iterator::xml_attribute_iterator()
+	{
+	}
+
+	PUGI__FN xml_attribute_iterator::xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent): _wrap(attr), _parent(parent)
+	{
+	}
+
+	PUGI__FN xml_attribute_iterator::xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent): _wrap(ref), _parent(parent)
+	{
+	}
+
+	PUGI__FN bool xml_attribute_iterator::operator==(const xml_attribute_iterator& rhs) const
+	{
+		return _wrap._attr == rhs._wrap._attr && _parent._root == rhs._parent._root;
+	}
+	
+	PUGI__FN bool xml_attribute_iterator::operator!=(const xml_attribute_iterator& rhs) const
+	{
+		return _wrap._attr != rhs._wrap._attr || _parent._root != rhs._parent._root;
+	}
+
+	PUGI__FN xml_attribute& xml_attribute_iterator::operator*() const
+	{
+		assert(_wrap._attr);
+		return _wrap;
+	}
+
+	PUGI__FN xml_attribute* xml_attribute_iterator::operator->() const
+	{
+		assert(_wrap._attr);
+		return const_cast<xml_attribute*>(&_wrap); // BCC32 workaround
+	}
+
+	PUGI__FN const xml_attribute_iterator& xml_attribute_iterator::operator++()
+	{
+		assert(_wrap._attr);
+		_wrap._attr = _wrap._attr->next_attribute;
+		return *this;
+	}
+
+	PUGI__FN xml_attribute_iterator xml_attribute_iterator::operator++(int)
+	{
+		xml_attribute_iterator temp = *this;
+		++*this;
+		return temp;
+	}
+
+	PUGI__FN const xml_attribute_iterator& xml_attribute_iterator::operator--()
+	{
+		_wrap = _wrap._attr ? _wrap.previous_attribute() : _parent.last_attribute();
+		return *this;
+	}
+
+	PUGI__FN xml_attribute_iterator xml_attribute_iterator::operator--(int)
+	{
+		xml_attribute_iterator temp = *this;
+		--*this;
+		return temp;
+	}
+
+	PUGI__FN xml_named_node_iterator::xml_named_node_iterator(): _name(0)
+	{
+	}
+
+	PUGI__FN xml_named_node_iterator::xml_named_node_iterator(const xml_node& node, const char_t* name): _wrap(node), _parent(node.parent()), _name(name)
+	{
+	}
+
+	PUGI__FN xml_named_node_iterator::xml_named_node_iterator(xml_node_struct* ref, xml_node_struct* parent, const char_t* name): _wrap(ref), _parent(parent), _name(name)
+	{
+	}
+
+	PUGI__FN bool xml_named_node_iterator::operator==(const xml_named_node_iterator& rhs) const
+	{
+		return _wrap._root == rhs._wrap._root && _parent._root == rhs._parent._root;
+	}
+
+	PUGI__FN bool xml_named_node_iterator::operator!=(const xml_named_node_iterator& rhs) const
+	{
+		return _wrap._root != rhs._wrap._root || _parent._root != rhs._parent._root;
+	}
+
+	PUGI__FN xml_node& xml_named_node_iterator::operator*() const
+	{
+		assert(_wrap._root);
+		return _wrap;
+	}
+
+	PUGI__FN xml_node* xml_named_node_iterator::operator->() const
+	{
+		assert(_wrap._root);
+		return const_cast<xml_node*>(&_wrap); // BCC32 workaround
+	}
+
+	PUGI__FN const xml_named_node_iterator& xml_named_node_iterator::operator++()
+	{
+		assert(_wrap._root);
+		_wrap = _wrap.next_sibling(_name);
+		return *this;
+	}
+
+	PUGI__FN xml_named_node_iterator xml_named_node_iterator::operator++(int)
+	{
+		xml_named_node_iterator temp = *this;
+		++*this;
+		return temp;
+	}
+
+	PUGI__FN const xml_named_node_iterator& xml_named_node_iterator::operator--()
+	{
+		if (_wrap._root)
+			_wrap = _wrap.previous_sibling(_name);
+		else
+		{
+			_wrap = _parent.last_child();
+
+			if (!impl::strequal(_wrap.name(), _name))
+				_wrap = _wrap.previous_sibling(_name);
+		}
+
+		return *this;
+	}
+
+	PUGI__FN xml_named_node_iterator xml_named_node_iterator::operator--(int)
+	{
+		xml_named_node_iterator temp = *this;
+		--*this;
+		return temp;
+	}
+
+	PUGI__FN xml_parse_result::xml_parse_result(): status(status_internal_error), offset(0), encoding(encoding_auto)
+	{
+	}
+
+	PUGI__FN xml_parse_result::operator bool() const
+	{
+		return status == status_ok;
+	}
+
+	PUGI__FN const char* xml_parse_result::description() const
+	{
+		switch (status)
+		{
+		case status_ok: return "No error";
+
+		case status_file_not_found: return "File was not found";
+		case status_io_error: return "Error reading from file/stream";
+		case status_out_of_memory: return "Could not allocate memory";
+		case status_internal_error: return "Internal error occurred";
+
+		case status_unrecognized_tag: return "Could not determine tag type";
+
+		case status_bad_pi: return "Error parsing document declaration/processing instruction";
+		case status_bad_comment: return "Error parsing comment";
+		case status_bad_cdata: return "Error parsing CDATA section";
+		case status_bad_doctype: return "Error parsing document type declaration";
+		case status_bad_pcdata: return "Error parsing PCDATA section";
+		case status_bad_start_element: return "Error parsing start element tag";
+		case status_bad_attribute: return "Error parsing element attribute";
+		case status_bad_end_element: return "Error parsing end element tag";
+		case status_end_element_mismatch: return "Start-end tags mismatch";
+
+		case status_append_invalid_root: return "Unable to append nodes: root is not an element or document";
+
+		case status_no_document_element: return "No document element found";
+
+		default: return "Unknown error";
+		}
+	}
+
+	PUGI__FN xml_document::xml_document(): _buffer(0)
+	{
+		create();
+	}
+
+	PUGI__FN xml_document::~xml_document()
+	{
+		destroy();
+	}
+
+	PUGI__FN void xml_document::reset()
+	{
+		destroy();
+		create();
+	}
+
+	PUGI__FN void xml_document::reset(const xml_document& proto)
+	{
+		reset();
+
+		for (xml_node cur = proto.first_child(); cur; cur = cur.next_sibling())
+			append_copy(cur);
+	}
+
+	PUGI__FN void xml_document::create()
+	{
+		assert(!_root);
+
+		// initialize sentinel page
+		PUGI__STATIC_ASSERT(sizeof(impl::xml_memory_page) + sizeof(impl::xml_document_struct) + impl::xml_memory_page_alignment <= sizeof(_memory));
+
+		// align upwards to page boundary
+		void* page_memory = reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(_memory) + (impl::xml_memory_page_alignment - 1)) & ~(impl::xml_memory_page_alignment - 1));
+
+		// prepare page structure
+		impl::xml_memory_page* page = impl::xml_memory_page::construct(page_memory);
+		assert(page);
+
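+		// the sentinel page is marked as full so the allocator never serves requests from it; the document node itself is constructed directly in its data area below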
+		page->busy_size = impl::xml_memory_page_size;
+
+		// allocate new root
+		_root = new (page->data) impl::xml_document_struct(page);
+		_root->prev_sibling_c = _root;
+
+		// setup sentinel page
+		page->allocator = static_cast<impl::xml_document_struct*>(_root);
+	}
+
+	PUGI__FN void xml_document::destroy()
+	{
+		assert(_root);
+
+		// destroy static storage
+		if (_buffer)
+		{
+			impl::xml_memory::deallocate(_buffer);
+			_buffer = 0;
+		}
+
+		// destroy extra buffers (note: no need to destroy linked list nodes, they're allocated using document allocator)
+		for (impl::xml_extra_buffer* extra = static_cast<impl::xml_document_struct*>(_root)->extra_buffers; extra; extra = extra->next)
+		{
+			if (extra->buffer) impl::xml_memory::deallocate(extra->buffer);
+		}
+
+		// destroy dynamic storage, leave sentinel page (it's in static memory)
+		impl::xml_memory_page* root_page = reinterpret_cast<impl::xml_memory_page*>(_root->header & impl::xml_memory_page_pointer_mask);
+		assert(root_page && !root_page->prev && !root_page->memory);
+
+		for (impl::xml_memory_page* page = root_page->next; page; )
+		{
+			impl::xml_memory_page* next = page->next;
+
+			impl::xml_allocator::deallocate_page(page);
+
+			page = next;
+		}
+
+		_root = 0;
+	}
+
+#ifndef PUGIXML_NO_STL
+	PUGI__FN xml_parse_result xml_document::load(std::basic_istream<char, std::char_traits<char> >& stream, unsigned int options, xml_encoding encoding)
+	{
+		reset();
+
+		return impl::load_stream_impl(*this, stream, options, encoding);
+	}
+
+	PUGI__FN xml_parse_result xml_document::load(std::basic_istream<wchar_t, std::char_traits<wchar_t> >& stream, unsigned int options)
+	{
+		reset();
+
+		return impl::load_stream_impl(*this, stream, options, encoding_wchar);
+	}
+#endif
+
+	PUGI__FN xml_parse_result xml_document::load(const char_t* contents, unsigned int options)
+	{
+		// Force native encoding (skip autodetection)
+	#ifdef PUGIXML_WCHAR_MODE
+		xml_encoding encoding = encoding_wchar;
+	#else
+		xml_encoding encoding = encoding_utf8;
+	#endif
+
+		return load_buffer(contents, impl::strlength(contents) * sizeof(char_t), options, encoding);
+	}
+
+	PUGI__FN xml_parse_result xml_document::load_file(const char* path_, unsigned int options, xml_encoding encoding)
+	{
+		reset();
+
+		FILE* file = fopen(path_, "rb");
+
+		return impl::load_file_impl(*this, file, options, encoding);
+	}
+
+	PUGI__FN xml_parse_result xml_document::load_file(const wchar_t* path_, unsigned int options, xml_encoding encoding)
+	{
+		reset();
+
+		FILE* file = impl::open_file_wide(path_, L"rb");
+
+		return impl::load_file_impl(*this, file, options, encoding);
+	}
+
+	PUGI__FN xml_parse_result xml_document::load_buffer(const void* contents, size_t size, unsigned int options, xml_encoding encoding)
+	{
+		reset();
+
+		return impl::load_buffer_impl(static_cast<impl::xml_document_struct*>(_root), _root, const_cast<void*>(contents), size, options, encoding, false, false, &_buffer);
+	}
+
+	PUGI__FN xml_parse_result xml_document::load_buffer_inplace(void* contents, size_t size, unsigned int options, xml_encoding encoding)
+	{
+		reset();
+
+		return impl::load_buffer_impl(static_cast<impl::xml_document_struct*>(_root), _root, contents, size, options, encoding, true, false, &_buffer);
+	}
+		
+	PUGI__FN xml_parse_result xml_document::load_buffer_inplace_own(void* contents, size_t size, unsigned int options, xml_encoding encoding)
+	{
+		reset();
+
+		return impl::load_buffer_impl(static_cast<impl::xml_document_struct*>(_root), _root, contents, size, options, encoding, true, true, &_buffer);
+	}
+
+	PUGI__FN void xml_document::save(xml_writer& writer, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+	{
+		impl::xml_buffered_writer buffered_writer(writer, encoding);
+
+		if ((flags & format_write_bom) && encoding != encoding_latin1)
+		{
+			// BOM always represents the codepoint U+FEFF, so just write it in native encoding
+		#ifdef PUGIXML_WCHAR_MODE
+			unsigned int bom = 0xfeff;
+			buffered_writer.write(static_cast<wchar_t>(bom));
+		#else
+			buffered_writer.write('\xef', '\xbb', '\xbf');
+		#endif
+		}
+
+		if (!(flags & format_no_declaration) && !impl::has_declaration(*this))
+		{
+			buffered_writer.write(PUGIXML_TEXT("<?xml version=\"1.0\""));
+			if (encoding == encoding_latin1) buffered_writer.write(PUGIXML_TEXT(" encoding=\"ISO-8859-1\""));
+			buffered_writer.write('?', '>');
+			if (!(flags & format_raw)) buffered_writer.write('\n');
+		}
+
+		impl::node_output(buffered_writer, *this, indent, flags, 0);
+	}
+
+#ifndef PUGIXML_NO_STL
+	PUGI__FN void xml_document::save(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+	{
+		xml_writer_stream writer(stream);
+
+		save(writer, indent, flags, encoding);
+	}
+
+	PUGI__FN void xml_document::save(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent, unsigned int flags) const
+	{
+		xml_writer_stream writer(stream);
+
+		save(writer, indent, flags, encoding_wchar);
+	}
+#endif
+
+	PUGI__FN bool xml_document::save_file(const char* path_, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+	{
+		FILE* file = fopen(path_, (flags & format_save_file_text) ? "w" : "wb");
+		return impl::save_file_impl(*this, file, indent, flags, encoding);
+	}
+
+	PUGI__FN bool xml_document::save_file(const wchar_t* path_, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+	{
+		FILE* file = impl::open_file_wide(path_, (flags & format_save_file_text) ? L"w" : L"wb");
+		return impl::save_file_impl(*this, file, indent, flags, encoding);
+	}
+
+	PUGI__FN xml_node xml_document::document_element() const
+	{
+		assert(_root);
+
+		for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+			if ((i->header & impl::xml_memory_page_type_mask) + 1 == node_element)
+				return xml_node(i);
+
+		return xml_node();
+	}
+
+#ifndef PUGIXML_NO_STL
+	PUGI__FN std::string PUGIXML_FUNCTION as_utf8(const wchar_t* str)
+	{
+		assert(str);
+
+		return impl::as_utf8_impl(str, impl::strlength_wide(str));
+	}
+
+	PUGI__FN std::string PUGIXML_FUNCTION as_utf8(const std::basic_string<wchar_t>& str)
+	{
+		return impl::as_utf8_impl(str.c_str(), str.size());
+	}
+	
+	PUGI__FN std::basic_string<wchar_t> PUGIXML_FUNCTION as_wide(const char* str)
+	{
+		assert(str);
+
+		return impl::as_wide_impl(str, strlen(str));
+	}
+	
+	PUGI__FN std::basic_string<wchar_t> PUGIXML_FUNCTION as_wide(const std::string& str)
+	{
+		return impl::as_wide_impl(str.c_str(), str.size());
+	}
+#endif
+
+	PUGI__FN void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate)
+	{
+		impl::xml_memory::allocate = allocate;
+		impl::xml_memory::deallocate = deallocate;
+	}
+
+	PUGI__FN allocation_function PUGIXML_FUNCTION get_memory_allocation_function()
+	{
+		return impl::xml_memory::allocate;
+	}
+
+	PUGI__FN deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function()
+	{
+		return impl::xml_memory::deallocate;
+	}
+}
+
+#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC))
+namespace std
+{
+	// Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier)
+	PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_node_iterator&)
+	{
+		return std::bidirectional_iterator_tag();
+	}
+
+	PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_attribute_iterator&)
+	{
+		return std::bidirectional_iterator_tag();
+	}
+
+	PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_named_node_iterator&)
+	{
+		return std::bidirectional_iterator_tag();
+	}
+}
+#endif
+
+#if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC)
+namespace std
+{
+	// Workarounds for (non-standard) iterator category detection
+	PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_node_iterator&)
+	{
+		return std::bidirectional_iterator_tag();
+	}
+
+	PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_attribute_iterator&)
+	{
+		return std::bidirectional_iterator_tag();
+	}
+
+	PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_named_node_iterator&)
+	{
+		return std::bidirectional_iterator_tag();
+	}
+}
+#endif
+
+#ifndef PUGIXML_NO_XPATH
+
+// STL replacements
+PUGI__NS_BEGIN
+	struct equal_to
+	{
+		template <typename T> bool operator()(const T& lhs, const T& rhs) const
+		{
+			return lhs == rhs;
+		}
+	};
+
+	struct not_equal_to
+	{
+		template <typename T> bool operator()(const T& lhs, const T& rhs) const
+		{
+			return lhs != rhs;
+		}
+	};
+
+	struct less
+	{
+		template <typename T> bool operator()(const T& lhs, const T& rhs) const
+		{
+			return lhs < rhs;
+		}
+	};
+
+	struct less_equal
+	{
+		template <typename T> bool operator()(const T& lhs, const T& rhs) const
+		{
+			return lhs <= rhs;
+		}
+	};
+
+	template <typename T> void swap(T& lhs, T& rhs)
+	{
+		T temp = lhs;
+		lhs = rhs;
+		rhs = temp;
+	}
+
+	template <typename I, typename Pred> I min_element(I begin, I end, const Pred& pred)
+	{
+		I result = begin;
+
+		for (I it = begin + 1; it != end; ++it)
+			if (pred(*it, *result))
+				result = it;
+
+		return result;
+	}
+
+	template <typename I> void reverse(I begin, I end)
+	{
+		while (end - begin > 1) swap(*begin++, *--end);
+	}
+
+	template <typename I> I unique(I begin, I end)
+	{
+		// fast skip head
+		while (end - begin > 1 && *begin != *(begin + 1)) begin++;
+
+		if (begin == end) return begin;
+
+		// last written element
+		I write = begin++; 
+
+		// merge unique elements
+		while (begin != end)
+		{
+			if (*begin != *write)
+				*++write = *begin++;
+			else
+				begin++;
+		}
+
+		// past-the-end (write points to live element)
+		return write + 1;
+	}
+
+	template <typename I> void copy_backwards(I begin, I end, I target)
+	{
+		while (begin != end) *--target = *--end;
+	}
+
+	template <typename I, typename Pred, typename T> void insertion_sort(I begin, I end, const Pred& pred, T*)
+	{
+		assert(begin != end);
+
+		for (I it = begin + 1; it != end; ++it)
+		{
+			T val = *it;
+
+			if (pred(val, *begin))
+			{
+				// move to front
+				copy_backwards(begin, it, it + 1);
+				*begin = val;
+			}
+			else
+			{
+				I hole = it;
+
+				// move hole backwards
+				while (pred(val, *(hole - 1)))
+				{
+					*hole = *(hole - 1);
+					hole--;
+				}
+
+				// fill hole with element
+				*hole = val;
+			}
+		}
+	}
+
+	// std variant for elements with ==
+	template <typename I, typename Pred> void partition(I begin, I middle, I end, const Pred& pred, I* out_eqbeg, I* out_eqend)
+	{
+		I eqbeg = middle, eqend = middle + 1;
+
+		// expand equal range
+		while (eqbeg != begin && *(eqbeg - 1) == *eqbeg) --eqbeg;
+		while (eqend != end && *eqend == *eqbeg) ++eqend;
+
+		// process outer elements
+		I ltend = eqbeg, gtbeg = eqend;
+
+		for (;;)
+		{
+			// find the element from the right side that belongs to the left one
+			for (; gtbeg != end; ++gtbeg)
+				if (!pred(*eqbeg, *gtbeg))
+				{
+					if (*gtbeg == *eqbeg) swap(*gtbeg, *eqend++);
+					else break;
+				}
+
+			// find the element from the left side that belongs to the right one
+			for (; ltend != begin; --ltend)
+				if (!pred(*(ltend - 1), *eqbeg))
+				{
+					if (*eqbeg == *(ltend - 1)) swap(*(ltend - 1), *--eqbeg);
+					else break;
+				}
+
+			// scanned all elements
+			if (gtbeg == end && ltend == begin)
+			{
+				*out_eqbeg = eqbeg;
+				*out_eqend = eqend;
+				return;
+			}
+
+			// make room for elements by moving equal area
+			if (gtbeg == end)
+			{
+				if (--ltend != --eqbeg) swap(*ltend, *eqbeg);
+				swap(*eqbeg, *--eqend);
+			}
+			else if (ltend == begin)
+			{
+				if (eqend != gtbeg) swap(*eqbeg, *eqend);
+				++eqend;
+				swap(*gtbeg++, *eqbeg++);
+			}
+			else swap(*gtbeg++, *--ltend);
+		}
+	}
+
+	template <typename I, typename Pred> void median3(I first, I middle, I last, const Pred& pred)
+	{
+		if (pred(*middle, *first)) swap(*middle, *first);
+		if (pred(*last, *middle)) swap(*last, *middle);
+		if (pred(*middle, *first)) swap(*middle, *first);
+	}
+
+	template <typename I, typename Pred> void median(I first, I middle, I last, const Pred& pred)
+	{
+		if (last - first <= 40)
+		{
+			// median of three for small chunks
+			median3(first, middle, last, pred);
+		}
+		else
+		{
+			// median of nine
+			size_t step = (last - first + 1) / 8;
+
+			median3(first, first + step, first + 2 * step, pred);
+			median3(middle - step, middle, middle + step, pred);
+			median3(last - 2 * step, last - step, last, pred);
+			median3(first + step, middle, last - step, pred);
+		}
+	}
+
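+	// quicksort: median-of-3 pivot selection (median-of-9 for ranges longer than 40), three-way partition, recursion into the smaller half while iterating on the larger one, insertion sort for chunks of 32 elements or fewer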
+	template <typename I, typename Pred> void sort(I begin, I end, const Pred& pred)
+	{
+		// sort large chunks
+		while (end - begin > 32)
+		{
+			// find median element
+			I middle = begin + (end - begin) / 2;
+			median(begin, middle, end - 1, pred);
+
+			// partition in three chunks (< = >)
+			I eqbeg, eqend;
+			partition(begin, middle, end, pred, &eqbeg, &eqend);
+
+			// loop on larger half
+			if (eqbeg - begin > end - eqend)
+			{
+				sort(eqend, end, pred);
+				end = eqbeg;
+			}
+			else
+			{
+				sort(begin, eqbeg, pred);
+				begin = eqend;
+			}
+		}
+
+		// insertion sort small chunk
+		if (begin != end) insertion_sort(begin, end, pred, &*begin);
+	}
+PUGI__NS_END
+
+// Allocator used for AST and evaluation stacks
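+// (a bump allocator over a singly-linked list of memory blocks: revert() rolls back to a captured state, release() frees every block except the initial one)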
+PUGI__NS_BEGIN
+	struct xpath_memory_block
+	{	
+		xpath_memory_block* next;
+
+		char data[
+	#ifdef PUGIXML_MEMORY_XPATH_PAGE_SIZE
+			PUGIXML_MEMORY_XPATH_PAGE_SIZE
+	#else
+			4096
+	#endif
+		];
+	};
+		
+	class xpath_allocator
+	{
+		xpath_memory_block* _root;
+		size_t _root_size;
+
+	public:
+	#ifdef PUGIXML_NO_EXCEPTIONS
+		jmp_buf* error_handler;
+	#endif
+
+		xpath_allocator(xpath_memory_block* root, size_t root_size = 0): _root(root), _root_size(root_size)
+		{
+		#ifdef PUGIXML_NO_EXCEPTIONS
+			error_handler = 0;
+		#endif
+		}
+		
+		void* allocate_nothrow(size_t size)
+		{
+			const size_t block_capacity = sizeof(_root->data);
+
+			// align size so that we're able to store pointers in subsequent blocks
+			size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+
+			if (_root_size + size <= block_capacity)
+			{
+				void* buf = _root->data + _root_size;
+				_root_size += size;
+				return buf;
+			}
+			else
+			{
+				size_t block_data_size = (size > block_capacity) ? size : block_capacity;
+				size_t block_size = block_data_size + offsetof(xpath_memory_block, data);
+
+				xpath_memory_block* block = static_cast<xpath_memory_block*>(xml_memory::allocate(block_size));
+				if (!block) return 0;
+				
+				block->next = _root;
+				
+				_root = block;
+				_root_size = size;
+				
+				return block->data;
+			}
+		}
+
+		void* allocate(size_t size)
+		{
+			void* result = allocate_nothrow(size);
+
+			if (!result)
+			{
+			#ifdef PUGIXML_NO_EXCEPTIONS
+				assert(error_handler);
+				longjmp(*error_handler, 1);
+			#else
+				throw std::bad_alloc();
+			#endif
+			}
+
+			return result;
+		}
+
+		void* reallocate(void* ptr, size_t old_size, size_t new_size)
+		{
+			// align size so that we're able to store pointers in subsequent blocks
+			old_size = (old_size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+			new_size = (new_size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+
+			// we can only reallocate the last object
+			assert(ptr == 0 || static_cast<char*>(ptr) + old_size == _root->data + _root_size);
+
+			// roll back the root size so that, as far as the allocator is concerned, the old object was never allocated
+			bool only_object = (_root_size == old_size);
+
+			if (ptr) _root_size -= old_size;
+
+			// allocate a new version (this will obviously reuse the memory if possible)
+			void* result = allocate(new_size);
+			assert(result);
+
+			// we have a new block
+			if (result != ptr && ptr)
+			{
+				// copy old data
+				assert(new_size >= old_size);
+				memcpy(result, ptr, old_size);
+
+				// free the previous page if it had no other objects
+				if (only_object)
+				{
+					assert(_root->data == result);
+					assert(_root->next);
+
+					xpath_memory_block* next = _root->next->next;
+
+					if (next)
+					{
+						// deallocate the whole page, unless it was the first one
+						xml_memory::deallocate(_root->next);
+						_root->next = next;
+					}
+				}
+			}
+
+			return result;
+		}
+
+		void revert(const xpath_allocator& state)
+		{
+			// free all new pages
+			xpath_memory_block* cur = _root;
+
+			while (cur != state._root)
+			{
+				xpath_memory_block* next = cur->next;
+
+				xml_memory::deallocate(cur);
+
+				cur = next;
+			}
+
+			// restore state
+			_root = state._root;
+			_root_size = state._root_size;
+		}
+
+		void release()
+		{
+			xpath_memory_block* cur = _root;
+			assert(cur);
+
+			while (cur->next)
+			{
+				xpath_memory_block* next = cur->next;
+
+				xml_memory::deallocate(cur);
+
+				cur = next;
+			}
+		}
+	};
+
+	struct xpath_allocator_capture
+	{
+		xpath_allocator_capture(xpath_allocator* alloc): _target(alloc), _state(*alloc)
+		{
+		}
+
+		~xpath_allocator_capture()
+		{
+			_target->revert(_state);
+		}
+
+		xpath_allocator* _target;
+		xpath_allocator _state;
+	};
+
+	struct xpath_stack
+	{
+		xpath_allocator* result;
+		xpath_allocator* temp;
+	};
+
+	struct xpath_stack_data
+	{
+		xpath_memory_block blocks[2];
+		xpath_allocator result;
+		xpath_allocator temp;
+		xpath_stack stack;
+
+	#ifdef PUGIXML_NO_EXCEPTIONS
+		jmp_buf error_handler;
+	#endif
+
+		xpath_stack_data(): result(blocks + 0), temp(blocks + 1)
+		{
+			blocks[0].next = blocks[1].next = 0;
+
+			stack.result = &result;
+			stack.temp = &temp;
+
+		#ifdef PUGIXML_NO_EXCEPTIONS
+			result.error_handler = temp.error_handler = &error_handler;
+		#endif
+		}
+
+		~xpath_stack_data()
+		{
+			result.release();
+			temp.release();
+		}
+	};
+PUGI__NS_END
+
+// String class
+PUGI__NS_BEGIN
+	class xpath_string
+	{
+		const char_t* _buffer;
+		bool _uses_heap;
+
+		static char_t* duplicate_string(const char_t* string, size_t length, xpath_allocator* alloc)
+		{
+			char_t* result = static_cast<char_t*>(alloc->allocate((length + 1) * sizeof(char_t)));
+			assert(result);
+
+			memcpy(result, string, length * sizeof(char_t));
+			result[length] = 0;
+
+			return result;
+		}
+
+		static char_t* duplicate_string(const char_t* string, xpath_allocator* alloc)
+		{
+			return duplicate_string(string, strlength(string), alloc);
+		}
+
+	public:
+		xpath_string(): _buffer(PUGIXML_TEXT("")), _uses_heap(false)
+		{
+		}
+
+		explicit xpath_string(const char_t* str, xpath_allocator* alloc)
+		{
+			bool empty_ = (*str == 0);
+
+			_buffer = empty_ ? PUGIXML_TEXT("") : duplicate_string(str, alloc);
+			_uses_heap = !empty_;
+		}
+
+		explicit xpath_string(const char_t* str, bool use_heap): _buffer(str), _uses_heap(use_heap)
+		{
+		}
+
+		xpath_string(const char_t* begin, const char_t* end, xpath_allocator* alloc)
+		{
+			assert(begin <= end);
+
+			bool empty_ = (begin == end);
+
+			_buffer = empty_ ? PUGIXML_TEXT("") : duplicate_string(begin, static_cast<size_t>(end - begin), alloc);
+			_uses_heap = !empty_;
+		}
+
+		void append(const xpath_string& o, xpath_allocator* alloc)
+		{
+			// skip empty sources
+			if (!*o._buffer) return;
+
+			// fast append for constant empty target and constant source
+			if (!*_buffer && !_uses_heap && !o._uses_heap)
+			{
+				_buffer = o._buffer;
+			}
+			else
+			{
+				// need to make heap copy
+				size_t target_length = strlength(_buffer);
+				size_t source_length = strlength(o._buffer);
+				size_t result_length = target_length + source_length;
+
+				// allocate new buffer
+				char_t* result = static_cast<char_t*>(alloc->reallocate(_uses_heap ? const_cast<char_t*>(_buffer) : 0, (target_length + 1) * sizeof(char_t), (result_length + 1) * sizeof(char_t)));
+				assert(result);
+
+				// append first string to the new buffer in case there was no reallocation
+				if (!_uses_heap) memcpy(result, _buffer, target_length * sizeof(char_t));
+
+				// append second string to the new buffer
+				memcpy(result + target_length, o._buffer, source_length * sizeof(char_t));
+				result[result_length] = 0;
+
+				// finalize
+				_buffer = result;
+				_uses_heap = true;
+			}
+		}
+
+		const char_t* c_str() const
+		{
+			return _buffer;
+		}
+
+		size_t length() const
+		{
+			return strlength(_buffer);
+		}
+		
+		char_t* data(xpath_allocator* alloc)
+		{
+			// make private heap copy
+			if (!_uses_heap)
+			{
+				_buffer = duplicate_string(_buffer, alloc);
+				_uses_heap = true;
+			}
+
+			return const_cast<char_t*>(_buffer);
+		}
+
+		bool empty() const
+		{
+			return *_buffer == 0;
+		}
+
+		bool operator==(const xpath_string& o) const
+		{
+			return strequal(_buffer, o._buffer);
+		}
+
+		bool operator!=(const xpath_string& o) const
+		{
+			return !strequal(_buffer, o._buffer);
+		}
+
+		bool uses_heap() const
+		{
+			return _uses_heap;
+		}
+	};
+
+	PUGI__FN xpath_string xpath_string_const(const char_t* str)
+	{
+		return xpath_string(str, false);
+	}
+PUGI__NS_END
+
+PUGI__NS_BEGIN
+	PUGI__FN bool starts_with(const char_t* string, const char_t* pattern)
+	{
+		while (*pattern && *string == *pattern)
+		{
+			string++;
+			pattern++;
+		}
+
+		return *pattern == 0;
+	}
+
+	PUGI__FN const char_t* find_char(const char_t* s, char_t c)
+	{
+	#ifdef PUGIXML_WCHAR_MODE
+		return wcschr(s, c);
+	#else
+		return strchr(s, c);
+	#endif
+	}
+
+	PUGI__FN const char_t* find_substring(const char_t* s, const char_t* p)
+	{
+	#ifdef PUGIXML_WCHAR_MODE
+		// MSVC6 wcsstr bug workaround (if s is empty it always returns 0)
+		return (*p == 0) ? s : wcsstr(s, p);
+	#else
+		return strstr(s, p);
+	#endif
+	}
+
+	// Converts symbol to lower case, if it is an ASCII one
+	PUGI__FN char_t tolower_ascii(char_t ch)
+	{
+		return static_cast<unsigned int>(ch - 'A') < 26 ? static_cast<char_t>(ch | ' ') : ch;
+	}
+
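+	// XPath string-value: for attributes and text-like nodes this is the stored value; for elements and documents it is the concatenation of all descendant pcdata/cdata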
+	PUGI__FN xpath_string string_value(const xpath_node& na, xpath_allocator* alloc)
+	{
+		if (na.attribute())
+			return xpath_string_const(na.attribute().value());
+		else
+		{
+			const xml_node& n = na.node();
+
+			switch (n.type())
+			{
+			case node_pcdata:
+			case node_cdata:
+			case node_comment:
+			case node_pi:
+				return xpath_string_const(n.value());
+			
+			case node_document:
+			case node_element:
+			{
+				xpath_string result;
+
+				xml_node cur = n.first_child();
+				
+				while (cur && cur != n)
+				{
+					if (cur.type() == node_pcdata || cur.type() == node_cdata)
+						result.append(xpath_string_const(cur.value()), alloc);
+
+					if (cur.first_child())
+						cur = cur.first_child();
+					else if (cur.next_sibling())
+						cur = cur.next_sibling();
+					else
+					{
+						while (!cur.next_sibling() && cur != n)
+							cur = cur.parent();
+
+						if (cur != n) cur = cur.next_sibling();
+					}
+				}
+				
+				return result;
+			}
+			
+			default:
+				return xpath_string();
+			}
+		}
+	}
+	
+	PUGI__FN unsigned int node_height(xml_node n)
+	{
+		unsigned int result = 0;
+		
+		while (n)
+		{
+			++result;
+			n = n.parent();
+		}
+		
+		return result;
+	}
+	
+	PUGI__FN bool node_is_before(xml_node ln, unsigned int lh, xml_node rn, unsigned int rh)
+	{
+		// normalize heights
+		for (unsigned int i = rh; i < lh; i++) ln = ln.parent();
+		for (unsigned int j = lh; j < rh; j++) rn = rn.parent();
+		
+		// one node is the ancestor of the other
+		if (ln == rn) return lh < rh;
+		
+		// find common ancestor
+		while (ln.parent() != rn.parent())
+		{
+			ln = ln.parent();
+			rn = rn.parent();
+		}
+
+		// there is no common ancestor (the shared parent is null), nodes are from different documents
+		if (!ln.parent()) return ln < rn;
+
+		// determine sibling order
+		for (; ln; ln = ln.next_sibling())
+			if (ln == rn)
+				return true;
+				
+		return false;
+	}
+
+	PUGI__FN bool node_is_ancestor(xml_node parent, xml_node node)
+	{
+		while (node && node != parent) node = node.parent();
+
+		return parent && node == parent;
+	}
+
+	PUGI__FN const void* document_order(const xpath_node& xnode)
+	{
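+		// optimization: names/values without the *_allocated_mask bit still point into the
+		// document's original parse buffer, so their addresses increase in document order;
+		// returning 0 makes the caller fall back to the slower structural comparison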
+		xml_node_struct* node = xnode.node().internal_object();
+
+		if (node)
+		{
+			if (node->name && (node->header & xml_memory_page_name_allocated_mask) == 0) return node->name;
+			if (node->value && (node->header & xml_memory_page_value_allocated_mask) == 0) return node->value;
+			return 0;
+		}
+
+		xml_attribute_struct* attr = xnode.attribute().internal_object();
+
+		if (attr)
+		{
+			if ((attr->header & xml_memory_page_name_allocated_mask) == 0) return attr->name;
+			if ((attr->header & xml_memory_page_value_allocated_mask) == 0) return attr->value;
+			return 0;
+		}
+
+		return 0;
+	}
+	
+	struct document_order_comparator
+	{
+		bool operator()(const xpath_node& lhs, const xpath_node& rhs) const
+		{
+			// optimized document order based check
+			const void* lo = document_order(lhs);
+			const void* ro = document_order(rhs);
+
+			if (lo && ro) return lo < ro;
+
+			// slow comparison
+			xml_node ln = lhs.node(), rn = rhs.node();
+
+			// compare attributes
+			if (lhs.attribute() && rhs.attribute())
+			{
+				// shared parent
+				if (lhs.parent() == rhs.parent())
+				{
+					// determine sibling order
+					for (xml_attribute a = lhs.attribute(); a; a = a.next_attribute())
+						if (a == rhs.attribute())
+							return true;
+					
+					return false;
+				}
+				
+				// compare attribute parents
+				ln = lhs.parent();
+				rn = rhs.parent();
+			}
+			else if (lhs.attribute())
+			{
+				// attributes go after the parent element
+				if (lhs.parent() == rhs.node()) return false;
+				
+				ln = lhs.parent();
+			}
+			else if (rhs.attribute())
+			{
+				// attributes go after the parent element
+				if (rhs.parent() == lhs.node()) return true;
+				
+				rn = rhs.parent();
+			}
+
+			if (ln == rn) return false;
+			
+			unsigned int lh = node_height(ln);
+			unsigned int rh = node_height(rn);
+			
+			return node_is_before(ln, lh, rn, rh);
+		}
+	};
+
+	struct duplicate_comparator
+	{
+		bool operator()(const xpath_node& lhs, const xpath_node& rhs) const
+		{
+			if (lhs.attribute()) return rhs.attribute() ? lhs.attribute() < rhs.attribute() : true;
+			else return rhs.attribute() ? false : lhs.node() < rhs.node();
+		}
+	};
+	
+	PUGI__FN double gen_nan()
+	{
+	#if defined(__STDC_IEC_559__) || ((FLT_RADIX - 0 == 2) && (FLT_MAX_EXP - 0 == 128) && (FLT_MANT_DIG - 0 == 24))
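+		// the array size evaluates to -1 (a compile error) if float and uint32_t differ in size,
+		// acting as a static assertion; 0x7fc00000 is the IEEE 754 single-precision quiet NaN pattern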
+		union { float f; uint32_t i; } u[sizeof(float) == sizeof(uint32_t) ? 1 : -1];
+		u[0].i = 0x7fc00000;
+		return u[0].f;
+	#else
+		// fallback
+		const volatile double zero = 0.0;
+		return zero / zero;
+	#endif
+	}
+	
+	PUGI__FN bool is_nan(double value)
+	{
+	#if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__)
+		return !!_isnan(value);
+	#elif defined(fpclassify) && defined(FP_NAN)
+		return fpclassify(value) == FP_NAN;
+	#else
+		// fallback
+		const volatile double v = value;
+		return v != v;
+	#endif
+	}
+	
+	PUGI__FN const char_t* convert_number_to_string_special(double value)
+	{
+	#if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__)
+		if (_finite(value)) return (value == 0) ? PUGIXML_TEXT("0") : 0;
+		if (_isnan(value)) return PUGIXML_TEXT("NaN");
+		return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity");
+	#elif defined(fpclassify) && defined(FP_NAN) && defined(FP_INFINITE) && defined(FP_ZERO)
+		switch (fpclassify(value))
+		{
+		case FP_NAN:
+			return PUGIXML_TEXT("NaN");
+
+		case FP_INFINITE:
+			return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity");
+
+		case FP_ZERO:
+			return PUGIXML_TEXT("0");
+
+		default:
+			return 0;
+		}
+	#else
+		// fallback
+		const volatile double v = value;
+
+		if (v == 0) return PUGIXML_TEXT("0");
+		if (v != v) return PUGIXML_TEXT("NaN");
+		if (v * 2 == v) return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity");
+		return 0;
+	#endif
+	}
+	
+	PUGI__FN bool convert_number_to_boolean(double value)
+	{
+		return (value != 0 && !is_nan(value));
+	}
+	
+	PUGI__FN void truncate_zeros(char* begin, char* end)
+	{
+		while (begin != end && end[-1] == '0') end--;
+
+		*end = 0;
+	}
+
+	// gets mantissa digits in the form of 0.xxxxx with 0. implied and the exponent
+#if defined(PUGI__MSVC_CRT_VERSION) && PUGI__MSVC_CRT_VERSION >= 1400 && !defined(_WIN32_WCE)
+	PUGI__FN void convert_number_to_mantissa_exponent(double value, char* buffer, size_t buffer_size, char** out_mantissa, int* out_exponent)
+	{
+		// get base values
+		int sign, exponent;
+		_ecvt_s(buffer, buffer_size, value, DBL_DIG + 1, &exponent, &sign);
+
+		// truncate redundant zeros
+		truncate_zeros(buffer, buffer + strlen(buffer));
+
+		// fill results
+		*out_mantissa = buffer;
+		*out_exponent = exponent;
+	}
+#else
+	PUGI__FN void convert_number_to_mantissa_exponent(double value, char* buffer, size_t buffer_size, char** out_mantissa, int* out_exponent)
+	{
+		// get a scientific notation value with IEEE DBL_DIG decimals
+		sprintf(buffer, "%.*e", DBL_DIG, value);
+		assert(strlen(buffer) < buffer_size);
+		(void)!buffer_size;
+
+		// get the exponent (possibly negative)
+		char* exponent_string = strchr(buffer, 'e');
+		assert(exponent_string);
+
+		int exponent = atoi(exponent_string + 1);
+
+		// extract mantissa string: skip sign
+		char* mantissa = buffer[0] == '-' ? buffer + 1 : buffer;
+		assert(mantissa[0] != '0' && mantissa[1] == '.');
+
+		// divide mantissa by 10 to eliminate integer part
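+		// e.g. for 1234.5 the buffer holds "1.2345...e+03"; moving the leading digit past the
+		// decimal point and bumping the exponent yields mantissa "12345" and exponent 4,
+		// i.e. 0.12345 * 10^4 (trailing zeros are stripped below)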
+		mantissa[1] = mantissa[0];
+		mantissa++;
+		exponent++;
+
+		// remove extra mantissa digits and zero-terminate mantissa
+		truncate_zeros(mantissa, exponent_string);
+
+		// fill results
+		*out_mantissa = mantissa;
+		*out_exponent = exponent;
+	}
+#endif
+
+	PUGI__FN xpath_string convert_number_to_string(double value, xpath_allocator* alloc)
+	{
+		// try special number conversion
+		const char_t* special = convert_number_to_string_special(value);
+		if (special) return xpath_string_const(special);
+
+		// get mantissa + exponent form
+		char mantissa_buffer[32];
+
+		char* mantissa;
+		int exponent;
+		convert_number_to_mantissa_exponent(value, mantissa_buffer, sizeof(mantissa_buffer), &mantissa, &exponent);
+
+		// allocate a buffer of suitable length for the number
+		size_t result_size = strlen(mantissa_buffer) + (exponent > 0 ? exponent : -exponent) + 4;
+		char_t* result = static_cast<char_t*>(alloc->allocate(sizeof(char_t) * result_size));
+		assert(result);
+
+		// make the number!
+		char_t* s = result;
+
+		// sign
+		if (value < 0) *s++ = '-';
+
+		// integer part
+		if (exponent <= 0)
+		{
+			*s++ = '0';
+		}
+		else
+		{
+			while (exponent > 0)
+			{
+				assert(*mantissa == 0 || static_cast<unsigned int>(static_cast<unsigned int>(*mantissa) - '0') <= 9);
+				*s++ = *mantissa ? *mantissa++ : '0';
+				exponent--;
+			}
+		}
+
+		// fractional part
+		if (*mantissa)
+		{
+			// decimal point
+			*s++ = '.';
+
+			// extra zeroes from negative exponent
+			while (exponent < 0)
+			{
+				*s++ = '0';
+				exponent++;
+			}
+
+			// extra mantissa digits
+			while (*mantissa)
+			{
+				assert(static_cast<unsigned int>(*mantissa - '0') <= 9);
+				*s++ = *mantissa++;
+			}
+		}
+
+		// zero-terminate
+		assert(s < result + result_size);
+		*s = 0;
+
+		return xpath_string(result, true);
+	}
+	
+	PUGI__FN bool check_string_to_number_format(const char_t* string)
+	{
+		// parse leading whitespace
+		while (PUGI__IS_CHARTYPE(*string, ct_space)) ++string;
+
+		// parse sign
+		if (*string == '-') ++string;
+
+		if (!*string) return false;
+
+		// if there is no integer part, there should be a decimal part with at least one digit
+		if (!PUGI__IS_CHARTYPEX(string[0], ctx_digit) && (string[0] != '.' || !PUGI__IS_CHARTYPEX(string[1], ctx_digit))) return false;
+
+		// parse integer part
+		while (PUGI__IS_CHARTYPEX(*string, ctx_digit)) ++string;
+
+		// parse decimal part
+		if (*string == '.')
+		{
+			++string;
+
+			while (PUGI__IS_CHARTYPEX(*string, ctx_digit)) ++string;
+		}
+
+		// parse trailing whitespace
+		while (PUGI__IS_CHARTYPE(*string, ct_space)) ++string;
+
+		return *string == 0;
+	}
+
+	PUGI__FN double convert_string_to_number(const char_t* string)
+	{
+		// check string format
+		if (!check_string_to_number_format(string)) return gen_nan();
+
+		// parse string
+	#ifdef PUGIXML_WCHAR_MODE
+		return wcstod(string, 0);
+	#else
+		return atof(string);
+	#endif
+	}
+
+	PUGI__FN bool convert_string_to_number_scratch(char_t (&buffer)[32], const char_t* begin, const char_t* end, double* out_result)
+	{
+		size_t length = static_cast<size_t>(end - begin);
+		char_t* scratch = buffer;
+
+		if (length >= sizeof(buffer) / sizeof(buffer[0]))
+		{
+			// need to make dummy on-heap copy
+			scratch = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+			if (!scratch) return false;
+		}
+
+		// copy string to zero-terminated buffer and perform conversion
+		memcpy(scratch, begin, length * sizeof(char_t));
+		scratch[length] = 0;
+
+		*out_result = convert_string_to_number(scratch);
+
+		// free dummy buffer
+		if (scratch != buffer) xml_memory::deallocate(scratch);
+
+		return true;
+	}
+	
+	PUGI__FN double round_nearest(double value)
+	{
+		return floor(value + 0.5);
+	}
+
+	PUGI__FN double round_nearest_nzero(double value)
+	{
+		// same as round_nearest, but returns -0 for [-0.5, -0]
+		// ceil is used to differentiate between +0 and -0 (we return -0 for [-0.5, -0] and +0 for +0)
+		return (value >= -0.5 && value <= 0) ? ceil(value) : floor(value + 0.5);
+	}
+	
+	PUGI__FN const char_t* qualified_name(const xpath_node& node)
+	{
+		return node.attribute() ? node.attribute().name() : node.node().name();
+	}
+	
+	PUGI__FN const char_t* local_name(const xpath_node& node)
+	{
+		const char_t* name = qualified_name(node);
+		const char_t* p = find_char(name, ':');
+		
+		return p ? p + 1 : name;
+	}
+
+	struct namespace_uri_predicate
+	{
+		const char_t* prefix;
+		size_t prefix_length;
+
+		namespace_uri_predicate(const char_t* name)
+		{
+			const char_t* pos = find_char(name, ':');
+
+			prefix = pos ? name : 0;
+			prefix_length = pos ? static_cast<size_t>(pos - name) : 0;
+		}
+
+		bool operator()(const xml_attribute& a) const
+		{
+			const char_t* name = a.name();
+
+			if (!starts_with(name, PUGIXML_TEXT("xmlns"))) return false;
+
+			return prefix ? name[5] == ':' && strequalrange(name + 6, prefix, prefix_length) : name[5] == 0;
+		}
+	};
+
+	PUGI__FN const char_t* namespace_uri(const xml_node& node)
+	{
+		namespace_uri_predicate pred = node.name();
+		
+		xml_node p = node;
+		
+		while (p)
+		{
+			xml_attribute a = p.find_attribute(pred);
+			
+			if (a) return a.value();
+			
+			p = p.parent();
+		}
+		
+		return PUGIXML_TEXT("");
+	}
+
+	PUGI__FN const char_t* namespace_uri(const xml_attribute& attr, const xml_node& parent)
+	{
+		namespace_uri_predicate pred = attr.name();
+		
+		// Default namespace does not apply to attributes
+		if (!pred.prefix) return PUGIXML_TEXT("");
+		
+		xml_node p = parent;
+		
+		while (p)
+		{
+			xml_attribute a = p.find_attribute(pred);
+			
+			if (a) return a.value();
+			
+			p = p.parent();
+		}
+		
+		return PUGIXML_TEXT("");
+	}
+
+	PUGI__FN const char_t* namespace_uri(const xpath_node& node)
+	{
+		return node.attribute() ? namespace_uri(node.attribute(), node.parent()) : namespace_uri(node.node());
+	}
+
+	PUGI__FN void normalize_space(char_t* buffer)
+	{
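+		// in-place implementation of XPath normalize-space(): runs of whitespace collapse to a
+		// single space and leading/trailing whitespace is removed, e.g. "  a \t b  " -> "a b"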
+		char_t* write = buffer;
+
+		for (char_t* it = buffer; *it; )
+		{
+			char_t ch = *it++;
+
+			if (PUGI__IS_CHARTYPE(ch, ct_space))
+			{
+				// replace whitespace sequence with single space
+				while (PUGI__IS_CHARTYPE(*it, ct_space)) it++;
+
+				// avoid leading spaces
+				if (write != buffer) *write++ = ' ';
+			}
+			else *write++ = ch;
+		}
+
+		// remove trailing space
+		if (write != buffer && PUGI__IS_CHARTYPE(write[-1], ct_space)) write--;
+
+		// zero-terminate
+		*write = 0;
+	}
+
+	PUGI__FN void translate(char_t* buffer, const char_t* from, const char_t* to)
+	{
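+		// in-place implementation of XPath translate(): a character found in 'from' is replaced by
+		// the character at the same index in 'to', or dropped when 'to' is shorter; all other
+		// characters are kept, e.g. translate("--abc--", "ab-", "AB") yields "ABc"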
+		size_t to_length = strlength(to);
+
+		char_t* write = buffer;
+
+		while (*buffer)
+		{
+			PUGI__DMC_VOLATILE char_t ch = *buffer++;
+
+			const char_t* pos = find_char(from, ch);
+
+			if (!pos)
+				*write++ = ch; // do not process
+			else if (static_cast<size_t>(pos - from) < to_length)
+				*write++ = to[pos - from]; // replace
+		}
+
+		// zero-terminate
+		*write = 0;
+	}
+
+	struct xpath_variable_boolean: xpath_variable
+	{
+		xpath_variable_boolean(): value(false)
+		{
+		}
+
+		bool value;
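+		// trailing-array idiom: the variable name is stored inline right past the struct, in the
+		// extra memory allocated by new_xpath_variable below (same for the other variable types)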
+		char_t name[1];
+	};
+
+	struct xpath_variable_number: xpath_variable
+	{
+		xpath_variable_number(): value(0)
+		{
+		}
+
+		double value;
+		char_t name[1];
+	};
+
+	struct xpath_variable_string: xpath_variable
+	{
+		xpath_variable_string(): value(0)
+		{
+		}
+
+		~xpath_variable_string()
+		{
+			if (value) xml_memory::deallocate(value);
+		}
+
+		char_t* value;
+		char_t name[1];
+	};
+
+	struct xpath_variable_node_set: xpath_variable
+	{
+		xpath_node_set value;
+		char_t name[1];
+	};
+
+	static const xpath_node_set dummy_node_set;
+
+	PUGI__FN unsigned int hash_string(const char_t* str)
+	{
+		// Jenkins one-at-a-time hash (http://en.wikipedia.org/wiki/Jenkins_hash_function#one-at-a-time)
+		unsigned int result = 0;
+
+		while (*str)
+		{
+			result += static_cast<unsigned int>(*str++);
+			result += result << 10;
+			result ^= result >> 6;
+		}
+	
+		result += result << 3;
+		result ^= result >> 11;
+		result += result << 15;
+	
+		return result;
+	}
+
+	template <typename T> PUGI__FN T* new_xpath_variable(const char_t* name)
+	{
+		size_t length = strlength(name);
+		if (length == 0) return 0; // empty variable names are invalid
+
+		// $$ we can't use offsetof(T, name) because T is non-POD, so we just allocate additional length characters
+		void* memory = xml_memory::allocate(sizeof(T) + length * sizeof(char_t));
+		if (!memory) return 0;
+
+		T* result = new (memory) T();
+
+		memcpy(result->name, name, (length + 1) * sizeof(char_t));
+
+		return result;
+	}
+
+	PUGI__FN xpath_variable* new_xpath_variable(xpath_value_type type, const char_t* name)
+	{
+		switch (type)
+		{
+		case xpath_type_node_set:
+			return new_xpath_variable<xpath_variable_node_set>(name);
+
+		case xpath_type_number:
+			return new_xpath_variable<xpath_variable_number>(name);
+
+		case xpath_type_string:
+			return new_xpath_variable<xpath_variable_string>(name);
+
+		case xpath_type_boolean:
+			return new_xpath_variable<xpath_variable_boolean>(name);
+
+		default:
+			return 0;
+		}
+	}
+
+	template <typename T> PUGI__FN void delete_xpath_variable(T* var)
+	{
+		var->~T();
+		xml_memory::deallocate(var);
+	}
+
+	PUGI__FN void delete_xpath_variable(xpath_value_type type, xpath_variable* var)
+	{
+		switch (type)
+		{
+		case xpath_type_node_set:
+			delete_xpath_variable(static_cast<xpath_variable_node_set*>(var));
+			break;
+
+		case xpath_type_number:
+			delete_xpath_variable(static_cast<xpath_variable_number*>(var));
+			break;
+
+		case xpath_type_string:
+			delete_xpath_variable(static_cast<xpath_variable_string*>(var));
+			break;
+
+		case xpath_type_boolean:
+			delete_xpath_variable(static_cast<xpath_variable_boolean*>(var));
+			break;
+
+		default:
+			assert(!"Invalid variable type");
+		}
+	}
+
+	PUGI__FN xpath_variable* get_variable_scratch(char_t (&buffer)[32], xpath_variable_set* set, const char_t* begin, const char_t* end)
+	{
+		size_t length = static_cast<size_t>(end - begin);
+		char_t* scratch = buffer;
+
+		if (length >= sizeof(buffer) / sizeof(buffer[0]))
+		{
+			// need to make dummy on-heap copy
+			scratch = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+			if (!scratch) return 0;
+		}
+
+		// copy string to zero-terminated buffer and perform lookup
+		memcpy(scratch, begin, length * sizeof(char_t));
+		scratch[length] = 0;
+
+		xpath_variable* result = set->get(scratch);
+
+		// free dummy buffer
+		if (scratch != buffer) xml_memory::deallocate(scratch);
+
+		return result;
+	}
+PUGI__NS_END
+
+// Internal node set class
+PUGI__NS_BEGIN
+	PUGI__FN xpath_node_set::type_t xpath_sort(xpath_node* begin, xpath_node* end, xpath_node_set::type_t type, bool rev)
+	{
+		xpath_node_set::type_t order = rev ? xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted;
+
+		if (type == xpath_node_set::type_unsorted)
+		{
+			sort(begin, end, document_order_comparator());
+
+			type = xpath_node_set::type_sorted;
+		}
+		
+		if (type != order) reverse(begin, end);
+			
+		return order;
+	}
+
+	PUGI__FN xpath_node xpath_first(const xpath_node* begin, const xpath_node* end, xpath_node_set::type_t type)
+	{
+		if (begin == end) return xpath_node();
+
+		switch (type)
+		{
+		case xpath_node_set::type_sorted:
+			return *begin;
+
+		case xpath_node_set::type_sorted_reverse:
+			return *(end - 1);
+
+		case xpath_node_set::type_unsorted:
+			return *min_element(begin, end, document_order_comparator());
+
+		default:
+			assert(!"Invalid node set type");
+			return xpath_node();
+		}
+	}
+
+	class xpath_node_set_raw
+	{
+		xpath_node_set::type_t _type;
+
+		xpath_node* _begin;
+		xpath_node* _end;
+		xpath_node* _eos;
+
+	public:
+		xpath_node_set_raw(): _type(xpath_node_set::type_unsorted), _begin(0), _end(0), _eos(0)
+		{
+		}
+
+		xpath_node* begin() const
+		{
+			return _begin;
+		}
+
+		xpath_node* end() const
+		{
+			return _end;
+		}
+
+		bool empty() const
+		{
+			return _begin == _end;
+		}
+
+		size_t size() const
+		{
+			return static_cast<size_t>(_end - _begin);
+		}
+
+		xpath_node first() const
+		{
+			return xpath_first(_begin, _end, _type);
+		}
+
+		void push_back(const xpath_node& node, xpath_allocator* alloc)
+		{
+			if (_end == _eos)
+			{
+				size_t capacity = static_cast<size_t>(_eos - _begin);
+
+				// get new capacity (1.5x rule)
+				size_t new_capacity = capacity + capacity / 2 + 1;
+
+				// reallocate the old array or allocate a new one
+				xpath_node* data = static_cast<xpath_node*>(alloc->reallocate(_begin, capacity * sizeof(xpath_node), new_capacity * sizeof(xpath_node)));
+				assert(data);
+
+				// finalize
+				_begin = data;
+				_end = data + capacity;
+				_eos = data + new_capacity;
+			}
+
+			*_end++ = node;
+		}
+
+		void append(const xpath_node* begin_, const xpath_node* end_, xpath_allocator* alloc)
+		{
+			size_t size_ = static_cast<size_t>(_end - _begin);
+			size_t capacity = static_cast<size_t>(_eos - _begin);
+			size_t count = static_cast<size_t>(end_ - begin_);
+
+			if (size_ + count > capacity)
+			{
+				// reallocate the old array or allocate a new one
+				xpath_node* data = static_cast<xpath_node*>(alloc->reallocate(_begin, capacity * sizeof(xpath_node), (size_ + count) * sizeof(xpath_node)));
+				assert(data);
+
+				// finalize
+				_begin = data;
+				_end = data + size_;
+				_eos = data + size_ + count;
+			}
+
+			memcpy(_end, begin_, count * sizeof(xpath_node));
+			_end += count;
+		}
+
+		void sort_do()
+		{
+			_type = xpath_sort(_begin, _end, _type, false);
+		}
+
+		void truncate(xpath_node* pos)
+		{
+			assert(_begin <= pos && pos <= _end);
+
+			_end = pos;
+		}
+
+		void remove_duplicates()
+		{
+			if (_type == xpath_node_set::type_unsorted)
+				sort(_begin, _end, duplicate_comparator());
+		
+			_end = unique(_begin, _end);
+		}
+
+		xpath_node_set::type_t type() const
+		{
+			return _type;
+		}
+
+		void set_type(xpath_node_set::type_t value)
+		{
+			_type = value;
+		}
+	};
+PUGI__NS_END
+
+PUGI__NS_BEGIN
+	struct xpath_context
+	{
+		xpath_node n;
+		size_t position, size;
+
+		xpath_context(const xpath_node& n_, size_t position_, size_t size_): n(n_), position(position_), size(size_)
+		{
+		}
+	};
+
+	enum lexeme_t
+	{
+		lex_none = 0,
+		lex_equal,
+		lex_not_equal,
+		lex_less,
+		lex_greater,
+		lex_less_or_equal,
+		lex_greater_or_equal,
+		lex_plus,
+		lex_minus,
+		lex_multiply,
+		lex_union,
+		lex_var_ref,
+		lex_open_brace,
+		lex_close_brace,
+		lex_quoted_string,
+		lex_number,
+		lex_slash,
+		lex_double_slash,
+		lex_open_square_brace,
+		lex_close_square_brace,
+		lex_string,
+		lex_comma,
+		lex_axis_attribute,
+		lex_dot,
+		lex_double_dot,
+		lex_double_colon,
+		lex_eof
+	};
+
+	struct xpath_lexer_string
+	{
+		const char_t* begin;
+		const char_t* end;
+
+		xpath_lexer_string(): begin(0), end(0)
+		{
+		}
+
+		bool operator==(const char_t* other) const
+		{
+			size_t length = static_cast<size_t>(end - begin);
+
+			return strequalrange(other, begin, length);
+		}
+	};
+
+	class xpath_lexer
+	{
+		const char_t* _cur;
+		const char_t* _cur_lexeme_pos;
+		xpath_lexer_string _cur_lexeme_contents;
+
+		lexeme_t _cur_lexeme;
+
+	public:
+		explicit xpath_lexer(const char_t* query): _cur(query)
+		{
+			next();
+		}
+		
+		const char_t* state() const
+		{
+			return _cur;
+		}
+		
+		void next()
+		{
+			const char_t* cur = _cur;
+
+			while (PUGI__IS_CHARTYPE(*cur, ct_space)) ++cur;
+
+			// save lexeme position for error reporting
+			_cur_lexeme_pos = cur;
+
+			switch (*cur)
+			{
+			case 0:
+				_cur_lexeme = lex_eof;
+				break;
+			
+			case '>':
+				if (*(cur+1) == '=')
+				{
+					cur += 2;
+					_cur_lexeme = lex_greater_or_equal;
+				}
+				else
+				{
+					cur += 1;
+					_cur_lexeme = lex_greater;
+				}
+				break;
+
+			case '<':
+				if (*(cur+1) == '=')
+				{
+					cur += 2;
+					_cur_lexeme = lex_less_or_equal;
+				}
+				else
+				{
+					cur += 1;
+					_cur_lexeme = lex_less;
+				}
+				break;
+
+			case '!':
+				if (*(cur+1) == '=')
+				{
+					cur += 2;
+					_cur_lexeme = lex_not_equal;
+				}
+				else
+				{
+					_cur_lexeme = lex_none;
+				}
+				break;
+
+			case '=':
+				cur += 1;
+				_cur_lexeme = lex_equal;
+
+				break;
+			
+			case '+':
+				cur += 1;
+				_cur_lexeme = lex_plus;
+
+				break;
+
+			case '-':
+				cur += 1;
+				_cur_lexeme = lex_minus;
+
+				break;
+
+			case '*':
+				cur += 1;
+				_cur_lexeme = lex_multiply;
+
+				break;
+
+			case '|':
+				cur += 1;
+				_cur_lexeme = lex_union;
+
+				break;
+			
+			case '$':
+				cur += 1;
+
+				if (PUGI__IS_CHARTYPEX(*cur, ctx_start_symbol))
+				{
+					_cur_lexeme_contents.begin = cur;
+
+					while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
+
+					if (cur[0] == ':' && PUGI__IS_CHARTYPEX(cur[1], ctx_symbol)) // qname
+					{
+						cur++; // :
+
+						while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
+					}
+
+					_cur_lexeme_contents.end = cur;
+				
+					_cur_lexeme = lex_var_ref;
+				}
+				else
+				{
+					_cur_lexeme = lex_none;
+				}
+
+				break;
+
+			case '(':
+				cur += 1;
+				_cur_lexeme = lex_open_brace;
+
+				break;
+
+			case ')':
+				cur += 1;
+				_cur_lexeme = lex_close_brace;
+
+				break;
+			
+			case '[':
+				cur += 1;
+				_cur_lexeme = lex_open_square_brace;
+
+				break;
+
+			case ']':
+				cur += 1;
+				_cur_lexeme = lex_close_square_brace;
+
+				break;
+
+			case ',':
+				cur += 1;
+				_cur_lexeme = lex_comma;
+
+				break;
+
+			case '/':
+				if (*(cur+1) == '/')
+				{
+					cur += 2;
+					_cur_lexeme = lex_double_slash;
+				}
+				else
+				{
+					cur += 1;
+					_cur_lexeme = lex_slash;
+				}
+				break;
+		
+			case '.':
+				if (*(cur+1) == '.')
+				{
+					cur += 2;
+					_cur_lexeme = lex_double_dot;
+				}
+				else if (PUGI__IS_CHARTYPEX(*(cur+1), ctx_digit))
+				{
+					_cur_lexeme_contents.begin = cur; // .
+
+					++cur;
+
+					while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++;
+
+					_cur_lexeme_contents.end = cur;
+					
+					_cur_lexeme = lex_number;
+				}
+				else
+				{
+					cur += 1;
+					_cur_lexeme = lex_dot;
+				}
+				break;
+
+			case '@':
+				cur += 1;
+				_cur_lexeme = lex_axis_attribute;
+
+				break;
+
+			case '"':
+			case '\'':
+			{
+				char_t terminator = *cur;
+
+				++cur;
+
+				_cur_lexeme_contents.begin = cur;
+				while (*cur && *cur != terminator) cur++;
+				_cur_lexeme_contents.end = cur;
+				
+				if (!*cur)
+					_cur_lexeme = lex_none;
+				else
+				{
+					cur += 1;
+					_cur_lexeme = lex_quoted_string;
+				}
+
+				break;
+			}
+
+			case ':':
+				if (*(cur+1) == ':')
+				{
+					cur += 2;
+					_cur_lexeme = lex_double_colon;
+				}
+				else
+				{
+					_cur_lexeme = lex_none;
+				}
+				break;
+
+			default:
+				if (PUGI__IS_CHARTYPEX(*cur, ctx_digit))
+				{
+					_cur_lexeme_contents.begin = cur;
+
+					while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++;
+				
+					if (*cur == '.')
+					{
+						cur++;
+
+						while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++;
+					}
+
+					_cur_lexeme_contents.end = cur;
+
+					_cur_lexeme = lex_number;
+				}
+				else if (PUGI__IS_CHARTYPEX(*cur, ctx_start_symbol))
+				{
+					_cur_lexeme_contents.begin = cur;
+
+					while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
+
+					if (cur[0] == ':')
+					{
+						if (cur[1] == '*') // namespace test ncname:*
+						{
+							cur += 2; // :*
+						}
+						else if (PUGI__IS_CHARTYPEX(cur[1], ctx_symbol)) // namespace test qname
+						{
+							cur++; // :
+
+							while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
+						}
+					}
+
+					_cur_lexeme_contents.end = cur;
+				
+					_cur_lexeme = lex_string;
+				}
+				else
+				{
+					_cur_lexeme = lex_none;
+				}
+			}
+
+			_cur = cur;
+		}
+
+		lexeme_t current() const
+		{
+			return _cur_lexeme;
+		}
+
+		const char_t* current_pos() const
+		{
+			return _cur_lexeme_pos;
+		}
+
+		const xpath_lexer_string& contents() const
+		{
+			assert(_cur_lexeme == lex_var_ref || _cur_lexeme == lex_number || _cur_lexeme == lex_string || _cur_lexeme == lex_quoted_string);
+
+			return _cur_lexeme_contents;
+		}
+	};
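+	// callers drive the lexer one token at a time: next() advances, current() and contents()
+	// expose the token, and current_pos() is kept for error reporting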
+
+	enum ast_type_t
+	{
+		ast_unknown,
+		ast_op_or,						// left or right
+		ast_op_and,						// left and right
+		ast_op_equal,					// left = right
+		ast_op_not_equal,				// left != right
+		ast_op_less,					// left < right
+		ast_op_greater,					// left > right
+		ast_op_less_or_equal,			// left <= right
+		ast_op_greater_or_equal,		// left >= right
+		ast_op_add,						// left + right
+		ast_op_subtract,				// left - right
+		ast_op_multiply,				// left * right
+		ast_op_divide,					// left / right
+		ast_op_mod,						// left % right
+		ast_op_negate,					// -left
+		ast_op_union,					// left | right
+		ast_predicate,					// apply predicate to set; next points to next predicate
+		ast_filter,						// select * from left where right
+		ast_filter_posinv,				// select * from left where right; proximity position invariant
+		ast_string_constant,			// string constant
+		ast_number_constant,			// number constant
+		ast_variable,					// variable
+		ast_func_last,					// last()
+		ast_func_position,				// position()
+		ast_func_count,					// count(left)
+		ast_func_id,					// id(left)
+		ast_func_local_name_0,			// local-name()
+		ast_func_local_name_1,			// local-name(left)
+		ast_func_namespace_uri_0,		// namespace-uri()
+		ast_func_namespace_uri_1,		// namespace-uri(left)
+		ast_func_name_0,				// name()
+		ast_func_name_1,				// name(left)
+		ast_func_string_0,				// string()
+		ast_func_string_1,				// string(left)
+		ast_func_concat,				// concat(left, right, siblings)
+		ast_func_starts_with,			// starts-with(left, right)
+		ast_func_contains,				// contains(left, right)
+		ast_func_substring_before,		// substring-before(left, right)
+		ast_func_substring_after,		// substring-after(left, right)
+		ast_func_substring_2,			// substring(left, right)
+		ast_func_substring_3,			// substring(left, right, third)
+		ast_func_string_length_0,		// string-length()
+		ast_func_string_length_1,		// string-length(left)
+		ast_func_normalize_space_0,		// normalize-space()
+		ast_func_normalize_space_1,		// normalize-space(left)
+		ast_func_translate,				// translate(left, right, third)
+		ast_func_boolean,				// boolean(left)
+		ast_func_not,					// not(left)
+		ast_func_true,					// true()
+		ast_func_false,					// false()
+		ast_func_lang,					// lang(left)
+		ast_func_number_0,				// number()
+		ast_func_number_1,				// number(left)
+		ast_func_sum,					// sum(left)
+		ast_func_floor,					// floor(left)
+		ast_func_ceiling,				// ceiling(left)
+		ast_func_round,					// round(left)
+		ast_step,						// process set left with step
+		ast_step_root					// select root node
+	};
+
+	enum axis_t
+	{
+		axis_ancestor,
+		axis_ancestor_or_self,
+		axis_attribute,
+		axis_child,
+		axis_descendant,
+		axis_descendant_or_self,
+		axis_following,
+		axis_following_sibling,
+		axis_namespace,
+		axis_parent,
+		axis_preceding,
+		axis_preceding_sibling,
+		axis_self
+	};
+	
+	enum nodetest_t
+	{
+		nodetest_none,
+		nodetest_name,
+		nodetest_type_node,
+		nodetest_type_comment,
+		nodetest_type_pi,
+		nodetest_type_text,
+		nodetest_pi,
+		nodetest_all,
+		nodetest_all_in_namespace
+	};
+
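+	// helper that exposes an axis as a compile-time constant; step_fill/step_do below take it as
+	// a template parameter and switch on T::axis, so each axis gets its own instantiation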
+	template <axis_t N> struct axis_to_type
+	{
+		static const axis_t axis;
+	};
+
+	template <axis_t N> const axis_t axis_to_type<N>::axis = N;
+		
+	class xpath_ast_node
+	{
+	private:
+		// node type
+		char _type;
+		char _rettype;
+
+		// for ast_step / ast_predicate
+		char _axis;
+		char _test;
+
+		// tree node structure
+		xpath_ast_node* _left;
+		xpath_ast_node* _right;
+		xpath_ast_node* _next;
+
+		union
+		{
+			// value for ast_string_constant
+			const char_t* string;
+			// value for ast_number_constant
+			double number;
+			// variable for ast_variable
+			xpath_variable* variable;
+			// node test for ast_step (node name/namespace/node type/pi target)
+			const char_t* nodetest;
+		} _data;
+
+		xpath_ast_node(const xpath_ast_node&);
+		xpath_ast_node& operator=(const xpath_ast_node&);
+
+		template <class Comp> static bool compare_eq(xpath_ast_node* lhs, xpath_ast_node* rhs, const xpath_context& c, const xpath_stack& stack, const Comp& comp)
+		{
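+			// XPath 1.0 comparison rules: two scalars are compared after conversion to boolean,
+			// number or string (in that order of priority); two node-sets compare true if any pair
+			// of string-values satisfies the comparison; a scalar against a node-set is compared
+			// with each node's string-value (or with the set's boolean conversion for booleans)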
+			xpath_value_type lt = lhs->rettype(), rt = rhs->rettype();
+
+			if (lt != xpath_type_node_set && rt != xpath_type_node_set)
+			{
+				if (lt == xpath_type_boolean || rt == xpath_type_boolean)
+					return comp(lhs->eval_boolean(c, stack), rhs->eval_boolean(c, stack));
+				else if (lt == xpath_type_number || rt == xpath_type_number)
+					return comp(lhs->eval_number(c, stack), rhs->eval_number(c, stack));
+				else if (lt == xpath_type_string || rt == xpath_type_string)
+				{
+					xpath_allocator_capture cr(stack.result);
+
+					xpath_string ls = lhs->eval_string(c, stack);
+					xpath_string rs = rhs->eval_string(c, stack);
+
+					return comp(ls, rs);
+				}
+			}
+			else if (lt == xpath_type_node_set && rt == xpath_type_node_set)
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				xpath_node_set_raw ls = lhs->eval_node_set(c, stack);
+				xpath_node_set_raw rs = rhs->eval_node_set(c, stack);
+
+				for (const xpath_node* li = ls.begin(); li != ls.end(); ++li)
+					for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+					{
+						xpath_allocator_capture cri(stack.result);
+
+						if (comp(string_value(*li, stack.result), string_value(*ri, stack.result)))
+							return true;
+					}
+
+				return false;
+			}
+			else
+			{
+				if (lt == xpath_type_node_set)
+				{
+					swap(lhs, rhs);
+					swap(lt, rt);
+				}
+
+				if (lt == xpath_type_boolean)
+					return comp(lhs->eval_boolean(c, stack), rhs->eval_boolean(c, stack));
+				else if (lt == xpath_type_number)
+				{
+					xpath_allocator_capture cr(stack.result);
+
+					double l = lhs->eval_number(c, stack);
+					xpath_node_set_raw rs = rhs->eval_node_set(c, stack);
+
+					for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+					{
+						xpath_allocator_capture cri(stack.result);
+
+						if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str())))
+							return true;
+					}
+
+					return false;
+				}
+				else if (lt == xpath_type_string)
+				{
+					xpath_allocator_capture cr(stack.result);
+
+					xpath_string l = lhs->eval_string(c, stack);
+					xpath_node_set_raw rs = rhs->eval_node_set(c, stack);
+
+					for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+					{
+						xpath_allocator_capture cri(stack.result);
+
+						if (comp(l, string_value(*ri, stack.result)))
+							return true;
+					}
+
+					return false;
+				}
+			}
+
+			assert(!"Wrong types");
+			return false;
+		}
+
+		template <class Comp> static bool compare_rel(xpath_ast_node* lhs, xpath_ast_node* rhs, const xpath_context& c, const xpath_stack& stack, const Comp& comp)
+		{
+			xpath_value_type lt = lhs->rettype(), rt = rhs->rettype();
+
+			if (lt != xpath_type_node_set && rt != xpath_type_node_set)
+				return comp(lhs->eval_number(c, stack), rhs->eval_number(c, stack));
+			else if (lt == xpath_type_node_set && rt == xpath_type_node_set)
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				xpath_node_set_raw ls = lhs->eval_node_set(c, stack);
+				xpath_node_set_raw rs = rhs->eval_node_set(c, stack);
+
+				for (const xpath_node* li = ls.begin(); li != ls.end(); ++li)
+				{
+					xpath_allocator_capture cri(stack.result);
+
+					double l = convert_string_to_number(string_value(*li, stack.result).c_str());
+
+					for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+					{
+						xpath_allocator_capture crii(stack.result);
+
+						if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str())))
+							return true;
+					}
+				}
+
+				return false;
+			}
+			else if (lt != xpath_type_node_set && rt == xpath_type_node_set)
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				double l = lhs->eval_number(c, stack);
+				xpath_node_set_raw rs = rhs->eval_node_set(c, stack);
+
+				for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+				{
+					xpath_allocator_capture cri(stack.result);
+
+					if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str())))
+						return true;
+				}
+
+				return false;
+			}
+			else if (lt == xpath_type_node_set && rt != xpath_type_node_set)
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				xpath_node_set_raw ls = lhs->eval_node_set(c, stack);
+				double r = rhs->eval_number(c, stack);
+
+				for (const xpath_node* li = ls.begin(); li != ls.end(); ++li)
+				{
+					xpath_allocator_capture cri(stack.result);
+
+					if (comp(convert_string_to_number(string_value(*li, stack.result).c_str()), r))
+						return true;
+				}
+
+				return false;
+			}
+			else
+			{
+				assert(!"Wrong types");
+				return false;
+			}
+		}
+
+		void apply_predicate(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack)
+		{
+			assert(ns.size() >= first);
+
+			size_t i = 1;
+			size_t size = ns.size() - first;
+				
+			xpath_node* last = ns.begin() + first;
+				
+			// remove_if... or well, sort of
+			for (xpath_node* it = last; it != ns.end(); ++it, ++i)
+			{
+				xpath_context c(*it, i, size);
+			
+				if (expr->rettype() == xpath_type_number)
+				{
+					if (expr->eval_number(c, stack) == i)
+						*last++ = *it;
+				}
+				else if (expr->eval_boolean(c, stack))
+					*last++ = *it;
+			}
+			
+			ns.truncate(last);
+		}
+
+		void apply_predicates(xpath_node_set_raw& ns, size_t first, const xpath_stack& stack)
+		{
+			if (ns.size() == first) return;
+			
+			for (xpath_ast_node* pred = _right; pred; pred = pred->_next)
+			{
+				apply_predicate(ns, first, pred->_left, stack);
+			}
+		}
+
+		void step_push(xpath_node_set_raw& ns, const xml_attribute& a, const xml_node& parent, xpath_allocator* alloc)
+		{
+			if (!a) return;
+
+			const char_t* name = a.name();
+
+			// There are no attribute nodes corresponding to attributes that declare namespaces
+			// That is, "xmlns:..." or "xmlns"
+			if (starts_with(name, PUGIXML_TEXT("xmlns")) && (name[5] == 0 || name[5] == ':')) return;
+			
+			switch (_test)
+			{
+			case nodetest_name:
+				if (strequal(name, _data.nodetest)) ns.push_back(xpath_node(a, parent), alloc);
+				break;
+				
+			case nodetest_type_node:
+			case nodetest_all:
+				ns.push_back(xpath_node(a, parent), alloc);
+				break;
+				
+			case nodetest_all_in_namespace:
+				if (starts_with(name, _data.nodetest))
+					ns.push_back(xpath_node(a, parent), alloc);
+				break;
+			
+			default:
+				;
+			}
+		}
+		
+		void step_push(xpath_node_set_raw& ns, const xml_node& n, xpath_allocator* alloc)
+		{
+			if (!n) return;
+
+			switch (_test)
+			{
+			case nodetest_name:
+				if (n.type() == node_element && strequal(n.name(), _data.nodetest)) ns.push_back(n, alloc);
+				break;
+				
+			case nodetest_type_node:
+				ns.push_back(n, alloc);
+				break;
+				
+			case nodetest_type_comment:
+				if (n.type() == node_comment)
+					ns.push_back(n, alloc);
+				break;
+				
+			case nodetest_type_text:
+				if (n.type() == node_pcdata || n.type() == node_cdata)
+					ns.push_back(n, alloc);
+				break;
+				
+			case nodetest_type_pi:
+				if (n.type() == node_pi)
+					ns.push_back(n, alloc);
+				break;
+									
+			case nodetest_pi:
+				if (n.type() == node_pi && strequal(n.name(), _data.nodetest))
+					ns.push_back(n, alloc);
+				break;
+				
+			case nodetest_all:
+				if (n.type() == node_element)
+					ns.push_back(n, alloc);
+				break;
+				
+			case nodetest_all_in_namespace:
+				if (n.type() == node_element && starts_with(n.name(), _data.nodetest))
+					ns.push_back(n, alloc);
+				break;
+
+			default:
+				assert(!"Unknown node test");
+			} 
+		}
+
+		template <class T> void step_fill(xpath_node_set_raw& ns, const xml_node& n, xpath_allocator* alloc, T)
+		{
+			const axis_t axis = T::axis;
+
+			switch (axis)
+			{
+			case axis_attribute:
+			{
+				for (xml_attribute a = n.first_attribute(); a; a = a.next_attribute())
+					step_push(ns, a, n, alloc);
+				
+				break;
+			}
+			
+			case axis_child:
+			{
+				for (xml_node c = n.first_child(); c; c = c.next_sibling())
+					step_push(ns, c, alloc);
+					
+				break;
+			}
+			
+			case axis_descendant:
+			case axis_descendant_or_self:
+			{
+				if (axis == axis_descendant_or_self)
+					step_push(ns, n, alloc);
+					
+				xml_node cur = n.first_child();
+				
+				while (cur && cur != n)
+				{
+					step_push(ns, cur, alloc);
+					
+					if (cur.first_child())
+						cur = cur.first_child();
+					else if (cur.next_sibling())
+						cur = cur.next_sibling();
+					else
+					{
+						while (!cur.next_sibling() && cur != n)
+							cur = cur.parent();
+					
+						if (cur != n) cur = cur.next_sibling();
+					}
+				}
+				
+				break;
+			}
+			
+			case axis_following_sibling:
+			{
+				for (xml_node c = n.next_sibling(); c; c = c.next_sibling())
+					step_push(ns, c, alloc);
+				
+				break;
+			}
+			
+			case axis_preceding_sibling:
+			{
+				for (xml_node c = n.previous_sibling(); c; c = c.previous_sibling())
+					step_push(ns, c, alloc);
+				
+				break;
+			}
+			
+			case axis_following:
+			{
+				xml_node cur = n;
+
+				// exit from this node so that we don't include descendants
+				while (cur && !cur.next_sibling()) cur = cur.parent();
+				cur = cur.next_sibling();
+
+				for (;;)
+				{
+					step_push(ns, cur, alloc);
+
+					if (cur.first_child())
+						cur = cur.first_child();
+					else if (cur.next_sibling())
+						cur = cur.next_sibling();
+					else
+					{
+						while (cur && !cur.next_sibling()) cur = cur.parent();
+						cur = cur.next_sibling();
+
+						if (!cur) break;
+					}
+				}
+
+				break;
+			}
+
+			case axis_preceding:
+			{
+				xml_node cur = n;
+
+				while (cur && !cur.previous_sibling()) cur = cur.parent();
+				cur = cur.previous_sibling();
+
+				for (;;)
+				{
+					if (cur.last_child())
+						cur = cur.last_child();
+					else
+					{
+						// leaf node, can't be ancestor
+						step_push(ns, cur, alloc);
+
+						if (cur.previous_sibling())
+							cur = cur.previous_sibling();
+						else
+						{
+							do 
+							{
+								cur = cur.parent();
+								if (!cur) break;
+
+								if (!node_is_ancestor(cur, n)) step_push(ns, cur, alloc);
+							}
+							while (!cur.previous_sibling());
+
+							cur = cur.previous_sibling();
+
+							if (!cur) break;
+						}
+					}
+				}
+
+				break;
+			}
+			
+			case axis_ancestor:
+			case axis_ancestor_or_self:
+			{
+				if (axis == axis_ancestor_or_self)
+					step_push(ns, n, alloc);
+
+				xml_node cur = n.parent();
+				
+				while (cur)
+				{
+					step_push(ns, cur, alloc);
+					
+					cur = cur.parent();
+				}
+				
+				break;
+			}
+
+			case axis_self:
+			{
+				step_push(ns, n, alloc);
+
+				break;
+			}
+
+			case axis_parent:
+			{
+				if (n.parent()) step_push(ns, n.parent(), alloc);
+
+				break;
+			}
+				
+			default:
+				assert(!"Unimplemented axis");
+			}
+		}
+		
+		template <class T> void step_fill(xpath_node_set_raw& ns, const xml_attribute& a, const xml_node& p, xpath_allocator* alloc, T v)
+		{
+			const axis_t axis = T::axis;
+
+			switch (axis)
+			{
+			case axis_ancestor:
+			case axis_ancestor_or_self:
+			{
+				if (axis == axis_ancestor_or_self && _test == nodetest_type_node) // reject attributes based on principal node type test
+					step_push(ns, a, p, alloc);
+
+				xml_node cur = p;
+				
+				while (cur)
+				{
+					step_push(ns, cur, alloc);
+					
+					cur = cur.parent();
+				}
+				
+				break;
+			}
+
+			case axis_descendant_or_self:
+			case axis_self:
+			{
+				if (_test == nodetest_type_node) // reject attributes based on principal node type test
+					step_push(ns, a, p, alloc);
+
+				break;
+			}
+
+			case axis_following:
+			{
+				xml_node cur = p;
+				
+				for (;;)
+				{
+					if (cur.first_child())
+						cur = cur.first_child();
+					else if (cur.next_sibling())
+						cur = cur.next_sibling();
+					else
+					{
+						while (cur && !cur.next_sibling()) cur = cur.parent();
+						cur = cur.next_sibling();
+						
+						if (!cur) break;
+					}
+
+					step_push(ns, cur, alloc);
+				}
+
+				break;
+			}
+
+			case axis_parent:
+			{
+				step_push(ns, p, alloc);
+
+				break;
+			}
+
+			case axis_preceding:
+			{
+				// preceding:: axis does not include attribute nodes and attribute ancestors (they are the same as parent's ancestors), so we can reuse node preceding
+				step_fill(ns, p, alloc, v);
+				break;
+			}
+			
+			default:
+				assert(!"Unimplemented axis");
+			}
+		}
+		
+		template <class T> xpath_node_set_raw step_do(const xpath_context& c, const xpath_stack& stack, T v)
+		{
+			const axis_t axis = T::axis;
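+			// axes that are still meaningful when the context node is an attribute; for the
+			// remaining axes an attribute context contributes nothing to the result set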
+			bool attributes = (axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_descendant_or_self || axis == axis_following || axis == axis_parent || axis == axis_preceding || axis == axis_self);
+
+			xpath_node_set_raw ns;
+			ns.set_type((axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_preceding || axis == axis_preceding_sibling) ? xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted);
+
+			if (_left)
+			{
+				xpath_node_set_raw s = _left->eval_node_set(c, stack);
+
+				// self axis preserves the original order
+				if (axis == axis_self) ns.set_type(s.type());
+
+				for (const xpath_node* it = s.begin(); it != s.end(); ++it)
+				{
+					size_t size = ns.size();
+
+					// in general, all axes generate elements in a particular order, but there is no order guarantee if axis is applied to two nodes
+					if (axis != axis_self && size != 0) ns.set_type(xpath_node_set::type_unsorted);
+					
+					if (it->node())
+						step_fill(ns, it->node(), stack.result, v);
+					else if (attributes)
+						step_fill(ns, it->attribute(), it->parent(), stack.result, v);
+						
+					apply_predicates(ns, size, stack);
+				}
+			}
+			else
+			{
+				if (c.n.node())
+					step_fill(ns, c.n.node(), stack.result, v);
+				else if (attributes)
+					step_fill(ns, c.n.attribute(), c.n.parent(), stack.result, v);
+				
+				apply_predicates(ns, 0, stack);
+			}
+
+			// child, attribute and self axes always generate a unique set of nodes
+			// for other axes, if the set stayed sorted it stayed unique, because the traversal algorithms do not visit the same node twice
+			if (axis != axis_child && axis != axis_attribute && axis != axis_self && ns.type() == xpath_node_set::type_unsorted)
+				ns.remove_duplicates();
+
+			return ns;
+		}
+		
+	public:
+		xpath_ast_node(ast_type_t type, xpath_value_type rettype_, const char_t* value):
+			_type(static_cast<char>(type)), _rettype(static_cast<char>(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0)
+		{
+			assert(type == ast_string_constant);
+			_data.string = value;
+		}
+
+		xpath_ast_node(ast_type_t type, xpath_value_type rettype_, double value):
+			_type(static_cast<char>(type)), _rettype(static_cast<char>(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0)
+		{
+			assert(type == ast_number_constant);
+			_data.number = value;
+		}
+		
+		xpath_ast_node(ast_type_t type, xpath_value_type rettype_, xpath_variable* value):
+			_type(static_cast<char>(type)), _rettype(static_cast<char>(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0)
+		{
+			assert(type == ast_variable);
+			_data.variable = value;
+		}
+		
+		xpath_ast_node(ast_type_t type, xpath_value_type rettype_, xpath_ast_node* left = 0, xpath_ast_node* right = 0):
+			_type(static_cast<char>(type)), _rettype(static_cast<char>(rettype_)), _axis(0), _test(0), _left(left), _right(right), _next(0)
+		{
+		}
+
+		xpath_ast_node(ast_type_t type, xpath_ast_node* left, axis_t axis, nodetest_t test, const char_t* contents):
+			_type(static_cast<char>(type)), _rettype(xpath_type_node_set), _axis(static_cast<char>(axis)), _test(static_cast<char>(test)), _left(left), _right(0), _next(0)
+		{
+			_data.nodetest = contents;
+		}
+
+		void set_next(xpath_ast_node* value)
+		{
+			_next = value;
+		}
+
+		void set_right(xpath_ast_node* value)
+		{
+			_right = value;
+		}
+
+		bool eval_boolean(const xpath_context& c, const xpath_stack& stack)
+		{
+			switch (_type)
+			{
+			case ast_op_or:
+				return _left->eval_boolean(c, stack) || _right->eval_boolean(c, stack);
+				
+			case ast_op_and:
+				return _left->eval_boolean(c, stack) && _right->eval_boolean(c, stack);
+				
+			case ast_op_equal:
+				return compare_eq(_left, _right, c, stack, equal_to());
+
+			case ast_op_not_equal:
+				return compare_eq(_left, _right, c, stack, not_equal_to());
+	
+			case ast_op_less:
+				return compare_rel(_left, _right, c, stack, less());
+			
+			case ast_op_greater:
+				return compare_rel(_right, _left, c, stack, less());
+
+			case ast_op_less_or_equal:
+				return compare_rel(_left, _right, c, stack, less_equal());
+			
+			case ast_op_greater_or_equal:
+				return compare_rel(_right, _left, c, stack, less_equal());
+
+			case ast_func_starts_with:
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				xpath_string lr = _left->eval_string(c, stack);
+				xpath_string rr = _right->eval_string(c, stack);
+
+				return starts_with(lr.c_str(), rr.c_str());
+			}
+
+			case ast_func_contains:
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				xpath_string lr = _left->eval_string(c, stack);
+				xpath_string rr = _right->eval_string(c, stack);
+
+				return find_substring(lr.c_str(), rr.c_str()) != 0;
+			}
+
+			case ast_func_boolean:
+				return _left->eval_boolean(c, stack);
+				
+			case ast_func_not:
+				return !_left->eval_boolean(c, stack);
+				
+			case ast_func_true:
+				return true;
+				
+			case ast_func_false:
+				return false;
+
+			case ast_func_lang:
+			{
+				if (c.n.attribute()) return false;
+				
+				xpath_allocator_capture cr(stack.result);
+
+				xpath_string lang = _left->eval_string(c, stack);
+				
+				for (xml_node n = c.n.node(); n; n = n.parent())
+				{
+					xml_attribute a = n.attribute(PUGIXML_TEXT("xml:lang"));
+					
+					if (a)
+					{
+						const char_t* value = a.value();
+						
+						// strnicmp / strncasecmp is not portable
+						for (const char_t* lit = lang.c_str(); *lit; ++lit)
+						{
+							if (tolower_ascii(*lit) != tolower_ascii(*value)) return false;
+							++value;
+						}
+						
+						return *value == 0 || *value == '-';
+					}
+				}
+				
+				return false;
+			}
+
+			case ast_variable:
+			{
+				assert(_rettype == _data.variable->type());
+
+				if (_rettype == xpath_type_boolean)
+					return _data.variable->get_boolean();
+
+				// fallthrough to type conversion
+			}
+
+			default:
+			{
+				switch (_rettype)
+				{
+				case xpath_type_number:
+					return convert_number_to_boolean(eval_number(c, stack));
+					
+				case xpath_type_string:
+				{
+					xpath_allocator_capture cr(stack.result);
+
+					return !eval_string(c, stack).empty();
+				}
+					
+				case xpath_type_node_set:				
+				{
+					xpath_allocator_capture cr(stack.result);
+
+					return !eval_node_set(c, stack).empty();
+				}
+
+				default:
+					assert(!"Wrong expression for return type boolean");
+					return false;
+				}
+			}
+			}
+		}
+
+		double eval_number(const xpath_context& c, const xpath_stack& stack)
+		{
+			switch (_type)
+			{
+			case ast_op_add:
+				return _left->eval_number(c, stack) + _right->eval_number(c, stack);
+				
+			case ast_op_subtract:
+				return _left->eval_number(c, stack) - _right->eval_number(c, stack);
+
+			case ast_op_multiply:
+				return _left->eval_number(c, stack) * _right->eval_number(c, stack);
+
+			case ast_op_divide:
+				return _left->eval_number(c, stack) / _right->eval_number(c, stack);
+
+			case ast_op_mod:
+				return fmod(_left->eval_number(c, stack), _right->eval_number(c, stack));
+
+			case ast_op_negate:
+				return -_left->eval_number(c, stack);
+
+			case ast_number_constant:
+				return _data.number;
+
+			case ast_func_last:
+				return static_cast<double>(c.size);
+			
+			case ast_func_position:
+				return static_cast<double>(c.position);
+
+			case ast_func_count:
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				return static_cast<double>(_left->eval_node_set(c, stack).size());
+			}
+			
+			case ast_func_string_length_0:
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				return static_cast<double>(string_value(c.n, stack.result).length());
+			}
+			
+			case ast_func_string_length_1:
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				return static_cast<double>(_left->eval_string(c, stack).length());
+			}
+			
+			case ast_func_number_0:
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				return convert_string_to_number(string_value(c.n, stack.result).c_str());
+			}
+			
+			case ast_func_number_1:
+				return _left->eval_number(c, stack);
+
+			case ast_func_sum:
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				double r = 0;
+				
+				xpath_node_set_raw ns = _left->eval_node_set(c, stack);
+				
+				for (const xpath_node* it = ns.begin(); it != ns.end(); ++it)
+				{
+					xpath_allocator_capture cri(stack.result);
+
+					r += convert_string_to_number(string_value(*it, stack.result).c_str());
+				}
+			
+				return r;
+			}
+
+			case ast_func_floor:
+			{
+				double r = _left->eval_number(c, stack);
+				
+				return r == r ? floor(r) : r;
+			}
+
+			case ast_func_ceiling:
+			{
+				double r = _left->eval_number(c, stack);
+				
+				return r == r ? ceil(r) : r;
+			}
+
+			case ast_func_round:
+				return round_nearest_nzero(_left->eval_number(c, stack));
+			
+			case ast_variable:
+			{
+				assert(_rettype == _data.variable->type());
+
+				if (_rettype == xpath_type_number)
+					return _data.variable->get_number();
+
+				// fallthrough to type conversion
+			}
+
+			default:
+			{
+				switch (_rettype)
+				{
+				case xpath_type_boolean:
+					return eval_boolean(c, stack) ? 1 : 0;
+					
+				case xpath_type_string:
+				{
+					xpath_allocator_capture cr(stack.result);
+
+					return convert_string_to_number(eval_string(c, stack).c_str());
+				}
+					
+				case xpath_type_node_set:
+				{
+					xpath_allocator_capture cr(stack.result);
+
+					return convert_string_to_number(eval_string(c, stack).c_str());
+				}
+					
+				default:
+					assert(!"Wrong expression for return type number");
+					return 0;
+				}
+				
+			}
+			}
+		}
+		
+		xpath_string eval_string_concat(const xpath_context& c, const xpath_stack& stack)
+		{
+			assert(_type == ast_func_concat);
+
+			xpath_allocator_capture ct(stack.temp);
+
+			// count the number of strings to concatenate
+			size_t count = 1;
+			for (xpath_ast_node* nc = _right; nc; nc = nc->_next) count++;
+
+			// gather all strings
+			xpath_string static_buffer[4];
+			xpath_string* buffer = static_buffer;
+
+			// allocate on-heap for large concats
+			if (count > sizeof(static_buffer) / sizeof(static_buffer[0]))
+			{
+				buffer = static_cast<xpath_string*>(stack.temp->allocate(count * sizeof(xpath_string)));
+				assert(buffer);
+			}
+
+			// evaluate all strings to temporary stack
+			xpath_stack swapped_stack = {stack.temp, stack.result};
+
+			buffer[0] = _left->eval_string(c, swapped_stack);
+
+			size_t pos = 1;
+			for (xpath_ast_node* n = _right; n; n = n->_next, ++pos) buffer[pos] = n->eval_string(c, swapped_stack);
+			assert(pos == count);
+
+			// get total length
+			size_t length = 0;
+			for (size_t i = 0; i < count; ++i) length += buffer[i].length();
+
+			// create final string
+			char_t* result = static_cast<char_t*>(stack.result->allocate((length + 1) * sizeof(char_t)));
+			assert(result);
+
+			char_t* ri = result;
+
+			for (size_t j = 0; j < count; ++j)
+				for (const char_t* bi = buffer[j].c_str(); *bi; ++bi)
+					*ri++ = *bi;
+
+			*ri = 0;
+
+			return xpath_string(result, true);
+		}
+
+		xpath_string eval_string(const xpath_context& c, const xpath_stack& stack)
+		{
+			switch (_type)
+			{
+			case ast_string_constant:
+				return xpath_string_const(_data.string);
+			
+			case ast_func_local_name_0:
+			{
+				xpath_node na = c.n;
+				
+				return xpath_string_const(local_name(na));
+			}
+
+			case ast_func_local_name_1:
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				xpath_node_set_raw ns = _left->eval_node_set(c, stack);
+				xpath_node na = ns.first();
+				
+				return xpath_string_const(local_name(na));
+			}
+
+			case ast_func_name_0:
+			{
+				xpath_node na = c.n;
+				
+				return xpath_string_const(qualified_name(na));
+			}
+
+			case ast_func_name_1:
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				xpath_node_set_raw ns = _left->eval_node_set(c, stack);
+				xpath_node na = ns.first();
+				
+				return xpath_string_const(qualified_name(na));
+			}
+
+			case ast_func_namespace_uri_0:
+			{
+				xpath_node na = c.n;
+				
+				return xpath_string_const(namespace_uri(na));
+			}
+
+			case ast_func_namespace_uri_1:
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				xpath_node_set_raw ns = _left->eval_node_set(c, stack);
+				xpath_node na = ns.first();
+				
+				return xpath_string_const(namespace_uri(na));
+			}
+
+			case ast_func_string_0:
+				return string_value(c.n, stack.result);
+
+			case ast_func_string_1:
+				return _left->eval_string(c, stack);
+
+			case ast_func_concat:
+				return eval_string_concat(c, stack);
+
+			case ast_func_substring_before:
+			{
+				xpath_allocator_capture cr(stack.temp);
+
+				xpath_stack swapped_stack = {stack.temp, stack.result};
+
+				xpath_string s = _left->eval_string(c, swapped_stack);
+				xpath_string p = _right->eval_string(c, swapped_stack);
+
+				const char_t* pos = find_substring(s.c_str(), p.c_str());
+				
+				return pos ? xpath_string(s.c_str(), pos, stack.result) : xpath_string();
+			}
+			
+			case ast_func_substring_after:
+			{
+				xpath_allocator_capture cr(stack.temp);
+
+				xpath_stack swapped_stack = {stack.temp, stack.result};
+
+				xpath_string s = _left->eval_string(c, swapped_stack);
+				xpath_string p = _right->eval_string(c, swapped_stack);
+				
+				const char_t* pos = find_substring(s.c_str(), p.c_str());
+				if (!pos) return xpath_string();
+
+				const char_t* result = pos + p.length();
+
+				return s.uses_heap() ? xpath_string(result, stack.result) : xpath_string_const(result);
+			}
+
+			case ast_func_substring_2:
+			{
+				xpath_allocator_capture cr(stack.temp);
+
+				xpath_stack swapped_stack = {stack.temp, stack.result};
+
+				xpath_string s = _left->eval_string(c, swapped_stack);
+				size_t s_length = s.length();
+
+				double first = round_nearest(_right->eval_number(c, stack));
+				
+				if (is_nan(first)) return xpath_string(); // NaN
+				else if (first >= s_length + 1) return xpath_string();
+				
+				size_t pos = first < 1 ? 1 : static_cast<size_t>(first);
+				assert(1 <= pos && pos <= s_length + 1);
+
+				const char_t* rbegin = s.c_str() + (pos - 1);
+				
+				return s.uses_heap() ? xpath_string(rbegin, stack.result) : xpath_string_const(rbegin);
+			}
+			
+			case ast_func_substring_3:
+			{
+				xpath_allocator_capture cr(stack.temp);
+
+				xpath_stack swapped_stack = {stack.temp, stack.result};
+
+				xpath_string s = _left->eval_string(c, swapped_stack);
+				size_t s_length = s.length();
+
+				double first = round_nearest(_right->eval_number(c, stack));
+				double last = first + round_nearest(_right->_next->eval_number(c, stack));
+				
+				if (is_nan(first) || is_nan(last)) return xpath_string();
+				else if (first >= s_length + 1) return xpath_string();
+				else if (first >= last) return xpath_string();
+				else if (last < 1) return xpath_string();
+				
+				size_t pos = first < 1 ? 1 : static_cast<size_t>(first);
+				size_t end = last >= s_length + 1 ? s_length + 1 : static_cast<size_t>(last);
+
+				assert(1 <= pos && pos <= end && end <= s_length + 1);
+				const char_t* rbegin = s.c_str() + (pos - 1);
+				const char_t* rend = s.c_str() + (end - 1);
+
+				return (end == s_length + 1 && !s.uses_heap()) ? xpath_string_const(rbegin) : xpath_string(rbegin, rend, stack.result);
+			}
+
+			case ast_func_normalize_space_0:
+			{
+				xpath_string s = string_value(c.n, stack.result);
+
+				normalize_space(s.data(stack.result));
+
+				return s;
+			}
+
+			case ast_func_normalize_space_1:
+			{
+				xpath_string s = _left->eval_string(c, stack);
+
+				normalize_space(s.data(stack.result));
+			
+				return s;
+			}
+
+			case ast_func_translate:
+			{
+				xpath_allocator_capture cr(stack.temp);
+
+				xpath_stack swapped_stack = {stack.temp, stack.result};
+
+				xpath_string s = _left->eval_string(c, stack);
+				xpath_string from = _right->eval_string(c, swapped_stack);
+				xpath_string to = _right->_next->eval_string(c, swapped_stack);
+
+				translate(s.data(stack.result), from.c_str(), to.c_str());
+
+				return s;
+			}
+
+			case ast_variable:
+			{
+				assert(_rettype == _data.variable->type());
+
+				if (_rettype == xpath_type_string)
+					return xpath_string_const(_data.variable->get_string());
+
+				// fallthrough to type conversion
+			}
+
+			default:
+			{
+				switch (_rettype)
+				{
+				case xpath_type_boolean:
+					return xpath_string_const(eval_boolean(c, stack) ? PUGIXML_TEXT("true") : PUGIXML_TEXT("false"));
+					
+				case xpath_type_number:
+					return convert_number_to_string(eval_number(c, stack), stack.result);
+					
+				case xpath_type_node_set:
+				{
+					xpath_allocator_capture cr(stack.temp);
+
+					xpath_stack swapped_stack = {stack.temp, stack.result};
+
+					xpath_node_set_raw ns = eval_node_set(c, swapped_stack);
+					return ns.empty() ? xpath_string() : string_value(ns.first(), stack.result);
+				}
+				
+				default:
+					assert(!"Wrong expression for return type string");
+					return xpath_string();
+				}
+			}
+			}
+		}
+
+		xpath_node_set_raw eval_node_set(const xpath_context& c, const xpath_stack& stack)
+		{
+			switch (_type)
+			{
+			case ast_op_union:
+			{
+				xpath_allocator_capture cr(stack.temp);
+
+				xpath_stack swapped_stack = {stack.temp, stack.result};
+
+				xpath_node_set_raw ls = _left->eval_node_set(c, swapped_stack);
+				xpath_node_set_raw rs = _right->eval_node_set(c, stack);
+				
+				// we can optimize merging two sorted sets, but this is a very rare operation, so don't bother
+				rs.set_type(xpath_node_set::type_unsorted);
+
+				rs.append(ls.begin(), ls.end(), stack.result);
+				rs.remove_duplicates();
+				
+				return rs;
+			}
+
+			case ast_filter:
+			case ast_filter_posinv:
+			{
+				xpath_node_set_raw set = _left->eval_node_set(c, stack);
+
+				// either the expression is a number or it contains a position() call; sort by document order
+				if (_type == ast_filter) set.sort_do();
+
+				apply_predicate(set, 0, _right, stack);
+			
+				return set;
+			}
+			
+			case ast_func_id:
+				return xpath_node_set_raw();
+			
+			case ast_step:
+			{
+				switch (_axis)
+				{
+				case axis_ancestor:
+					return step_do(c, stack, axis_to_type<axis_ancestor>());
+					
+				case axis_ancestor_or_self:
+					return step_do(c, stack, axis_to_type<axis_ancestor_or_self>());
+
+				case axis_attribute:
+					return step_do(c, stack, axis_to_type<axis_attribute>());
+
+				case axis_child:
+					return step_do(c, stack, axis_to_type<axis_child>());
+				
+				case axis_descendant:
+					return step_do(c, stack, axis_to_type<axis_descendant>());
+
+				case axis_descendant_or_self:
+					return step_do(c, stack, axis_to_type<axis_descendant_or_self>());
+
+				case axis_following:
+					return step_do(c, stack, axis_to_type<axis_following>());
+				
+				case axis_following_sibling:
+					return step_do(c, stack, axis_to_type<axis_following_sibling>());
+				
+				case axis_namespace:
+					// the namespace axis is not supported
+					return xpath_node_set_raw();
+				
+				case axis_parent:
+					return step_do(c, stack, axis_to_type<axis_parent>());
+				
+				case axis_preceding:
+					return step_do(c, stack, axis_to_type<axis_preceding>());
+
+				case axis_preceding_sibling:
+					return step_do(c, stack, axis_to_type<axis_preceding_sibling>());
+				
+				case axis_self:
+					return step_do(c, stack, axis_to_type<axis_self>());
+
+				default:
+					assert(!"Unknown axis");
+					return xpath_node_set_raw();
+				}
+			}
+
+			case ast_step_root:
+			{
+				assert(!_right); // root step can't have any predicates
+
+				xpath_node_set_raw ns;
+
+				ns.set_type(xpath_node_set::type_sorted);
+
+				if (c.n.node()) ns.push_back(c.n.node().root(), stack.result);
+				else if (c.n.attribute()) ns.push_back(c.n.parent().root(), stack.result);
+
+				return ns;
+			}
+
+			case ast_variable:
+			{
+				assert(_rettype == _data.variable->type());
+
+				if (_rettype == xpath_type_node_set)
+				{
+					const xpath_node_set& s = _data.variable->get_node_set();
+
+					xpath_node_set_raw ns;
+
+					ns.set_type(s.type());
+					ns.append(s.begin(), s.end(), stack.result);
+
+					return ns;
+				}
+
+				// fallthrough to type conversion
+			}
+
+			default:
+				assert(!"Wrong expression for return type node set");
+				return xpath_node_set_raw();
+			}
+		}
+		
+		bool is_posinv()
+		{
+			switch (_type)
+			{
+			case ast_func_position:
+				return false;
+
+			case ast_string_constant:
+			case ast_number_constant:
+			case ast_variable:
+				return true;
+
+			case ast_step:
+			case ast_step_root:
+				return true;
+
+			case ast_predicate:
+			case ast_filter:
+			case ast_filter_posinv:
+				return true;
+
+			default:
+				if (_left && !_left->is_posinv()) return false;
+				
+				for (xpath_ast_node* n = _right; n; n = n->_next)
+					if (!n->is_posinv()) return false;
+					
+				return true;
+			}
+		}
+
+		xpath_value_type rettype() const
+		{
+			return static_cast<xpath_value_type>(_rettype);
+		}
+	};
+
+	struct xpath_parser
+	{
+		xpath_allocator* _alloc;
+		xpath_lexer _lexer;
+
+		const char_t* _query;
+		xpath_variable_set* _variables;
+
+		xpath_parse_result* _result;
+
+		char_t _scratch[32];
+
+	#ifdef PUGIXML_NO_EXCEPTIONS
+		jmp_buf _error_handler;
+	#endif
+
+		void throw_error(const char* message)
+		{
+			_result->error = message;
+			_result->offset = _lexer.current_pos() - _query;
+
+		#ifdef PUGIXML_NO_EXCEPTIONS
+			longjmp(_error_handler, 1);
+		#else
+			throw xpath_exception(*_result);
+		#endif
+		}
+
+		void throw_error_oom()
+		{
+		#ifdef PUGIXML_NO_EXCEPTIONS
+			throw_error("Out of memory");
+		#else
+			throw std::bad_alloc();
+		#endif
+		}
+
+		void* alloc_node()
+		{
+			void* result = _alloc->allocate_nothrow(sizeof(xpath_ast_node));
+
+			if (!result) throw_error_oom();
+
+			return result;
+		}
+
+		const char_t* alloc_string(const xpath_lexer_string& value)
+		{
+			if (value.begin)
+			{
+				size_t length = static_cast<size_t>(value.end - value.begin);
+
+				char_t* c = static_cast<char_t*>(_alloc->allocate_nothrow((length + 1) * sizeof(char_t)));
+				if (!c) throw_error_oom();
+				assert(c); // workaround for clang static analysis
+
+				memcpy(c, value.begin, length * sizeof(char_t));
+				c[length] = 0;
+
+				return c;
+			}
+			else return 0;
+		}
+
+		xpath_ast_node* parse_function_helper(ast_type_t type0, ast_type_t type1, size_t argc, xpath_ast_node* args[2])
+		{
+			assert(argc <= 1);
+
+			if (argc == 1 && args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set");
+
+			return new (alloc_node()) xpath_ast_node(argc == 0 ? type0 : type1, xpath_type_string, args[0]);
+		}
+
+		xpath_ast_node* parse_function(const xpath_lexer_string& name, size_t argc, xpath_ast_node* args[2])
+		{
+			switch (name.begin[0])
+			{
+			case 'b':
+				if (name == PUGIXML_TEXT("boolean") && argc == 1)
+					return new (alloc_node()) xpath_ast_node(ast_func_boolean, xpath_type_boolean, args[0]);
+					
+				break;
+			
+			case 'c':
+				if (name == PUGIXML_TEXT("count") && argc == 1)
+				{
+					if (args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set");
+					return new (alloc_node()) xpath_ast_node(ast_func_count, xpath_type_number, args[0]);
+				}
+				else if (name == PUGIXML_TEXT("contains") && argc == 2)
+					return new (alloc_node()) xpath_ast_node(ast_func_contains, xpath_type_boolean, args[0], args[1]);
+				else if (name == PUGIXML_TEXT("concat") && argc >= 2)
+					return new (alloc_node()) xpath_ast_node(ast_func_concat, xpath_type_string, args[0], args[1]);
+				else if (name == PUGIXML_TEXT("ceiling") && argc == 1)
+					return new (alloc_node()) xpath_ast_node(ast_func_ceiling, xpath_type_number, args[0]);
+					
+				break;
+			
+			case 'f':
+				if (name == PUGIXML_TEXT("false") && argc == 0)
+					return new (alloc_node()) xpath_ast_node(ast_func_false, xpath_type_boolean);
+				else if (name == PUGIXML_TEXT("floor") && argc == 1)
+					return new (alloc_node()) xpath_ast_node(ast_func_floor, xpath_type_number, args[0]);
+					
+				break;
+			
+			case 'i':
+				if (name == PUGIXML_TEXT("id") && argc == 1)
+					return new (alloc_node()) xpath_ast_node(ast_func_id, xpath_type_node_set, args[0]);
+					
+				break;
+			
+			case 'l':
+				if (name == PUGIXML_TEXT("last") && argc == 0)
+					return new (alloc_node()) xpath_ast_node(ast_func_last, xpath_type_number);
+				else if (name == PUGIXML_TEXT("lang") && argc == 1)
+					return new (alloc_node()) xpath_ast_node(ast_func_lang, xpath_type_boolean, args[0]);
+				else if (name == PUGIXML_TEXT("local-name") && argc <= 1)
+					return parse_function_helper(ast_func_local_name_0, ast_func_local_name_1, argc, args);
+			
+				break;
+			
+			case 'n':
+				if (name == PUGIXML_TEXT("name") && argc <= 1)
+					return parse_function_helper(ast_func_name_0, ast_func_name_1, argc, args);
+				else if (name == PUGIXML_TEXT("namespace-uri") && argc <= 1)
+					return parse_function_helper(ast_func_namespace_uri_0, ast_func_namespace_uri_1, argc, args);
+				else if (name == PUGIXML_TEXT("normalize-space") && argc <= 1)
+					return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_normalize_space_0 : ast_func_normalize_space_1, xpath_type_string, args[0], args[1]);
+				else if (name == PUGIXML_TEXT("not") && argc == 1)
+					return new (alloc_node()) xpath_ast_node(ast_func_not, xpath_type_boolean, args[0]);
+				else if (name == PUGIXML_TEXT("number") && argc <= 1)
+					return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_number_0 : ast_func_number_1, xpath_type_number, args[0]);
+			
+				break;
+			
+			case 'p':
+				if (name == PUGIXML_TEXT("position") && argc == 0)
+					return new (alloc_node()) xpath_ast_node(ast_func_position, xpath_type_number);
+				
+				break;
+			
+			case 'r':
+				if (name == PUGIXML_TEXT("round") && argc == 1)
+					return new (alloc_node()) xpath_ast_node(ast_func_round, xpath_type_number, args[0]);
+
+				break;
+			
+			case 's':
+				if (name == PUGIXML_TEXT("string") && argc <= 1)
+					return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_string_0 : ast_func_string_1, xpath_type_string, args[0]);
+				else if (name == PUGIXML_TEXT("string-length") && argc <= 1)
+					return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_string_length_0 : ast_func_string_length_1, xpath_type_number, args[0]);
+				else if (name == PUGIXML_TEXT("starts-with") && argc == 2)
+					return new (alloc_node()) xpath_ast_node(ast_func_starts_with, xpath_type_boolean, args[0], args[1]);
+				else if (name == PUGIXML_TEXT("substring-before") && argc == 2)
+					return new (alloc_node()) xpath_ast_node(ast_func_substring_before, xpath_type_string, args[0], args[1]);
+				else if (name == PUGIXML_TEXT("substring-after") && argc == 2)
+					return new (alloc_node()) xpath_ast_node(ast_func_substring_after, xpath_type_string, args[0], args[1]);
+				else if (name == PUGIXML_TEXT("substring") && (argc == 2 || argc == 3))
+					return new (alloc_node()) xpath_ast_node(argc == 2 ? ast_func_substring_2 : ast_func_substring_3, xpath_type_string, args[0], args[1]);
+				else if (name == PUGIXML_TEXT("sum") && argc == 1)
+				{
+					if (args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set");
+					return new (alloc_node()) xpath_ast_node(ast_func_sum, xpath_type_number, args[0]);
+				}
+
+				break;
+			
+			case 't':
+				if (name == PUGIXML_TEXT("translate") && argc == 3)
+					return new (alloc_node()) xpath_ast_node(ast_func_translate, xpath_type_string, args[0], args[1]);
+				else if (name == PUGIXML_TEXT("true") && argc == 0)
+					return new (alloc_node()) xpath_ast_node(ast_func_true, xpath_type_boolean);
+					
+				break;
+
+			default:
+				break;
+			}
+
+			throw_error("Unrecognized function or wrong parameter count");
+
+			return 0;
+		}
+
+		axis_t parse_axis_name(const xpath_lexer_string& name, bool& specified)
+		{
+			specified = true;
+
+			switch (name.begin[0])
+			{
+			case 'a':
+				if (name == PUGIXML_TEXT("ancestor"))
+					return axis_ancestor;
+				else if (name == PUGIXML_TEXT("ancestor-or-self"))
+					return axis_ancestor_or_self;
+				else if (name == PUGIXML_TEXT("attribute"))
+					return axis_attribute;
+				
+				break;
+			
+			case 'c':
+				if (name == PUGIXML_TEXT("child"))
+					return axis_child;
+				
+				break;
+			
+			case 'd':
+				if (name == PUGIXML_TEXT("descendant"))
+					return axis_descendant;
+				else if (name == PUGIXML_TEXT("descendant-or-self"))
+					return axis_descendant_or_self;
+				
+				break;
+			
+			case 'f':
+				if (name == PUGIXML_TEXT("following"))
+					return axis_following;
+				else if (name == PUGIXML_TEXT("following-sibling"))
+					return axis_following_sibling;
+				
+				break;
+			
+			case 'n':
+				if (name == PUGIXML_TEXT("namespace"))
+					return axis_namespace;
+				
+				break;
+			
+			case 'p':
+				if (name == PUGIXML_TEXT("parent"))
+					return axis_parent;
+				else if (name == PUGIXML_TEXT("preceding"))
+					return axis_preceding;
+				else if (name == PUGIXML_TEXT("preceding-sibling"))
+					return axis_preceding_sibling;
+				
+				break;
+			
+			case 's':
+				if (name == PUGIXML_TEXT("self"))
+					return axis_self;
+				
+				break;
+
+			default:
+				break;
+			}
+
+			specified = false;
+			return axis_child;
+		}
+
+		nodetest_t parse_node_test_type(const xpath_lexer_string& name)
+		{
+			switch (name.begin[0])
+			{
+			case 'c':
+				if (name == PUGIXML_TEXT("comment"))
+					return nodetest_type_comment;
+
+				break;
+
+			case 'n':
+				if (name == PUGIXML_TEXT("node"))
+					return nodetest_type_node;
+
+				break;
+
+			case 'p':
+				if (name == PUGIXML_TEXT("processing-instruction"))
+					return nodetest_type_pi;
+
+				break;
+
+			case 't':
+				if (name == PUGIXML_TEXT("text"))
+					return nodetest_type_text;
+
+				break;
+			
+			default:
+				break;
+			}
+
+			return nodetest_none;
+		}
+
+		// PrimaryExpr ::= VariableReference | '(' Expr ')' | Literal | Number | FunctionCall
+		xpath_ast_node* parse_primary_expression()
+		{
+			switch (_lexer.current())
+			{
+			case lex_var_ref:
+			{
+				xpath_lexer_string name = _lexer.contents();
+
+				if (!_variables)
+					throw_error("Unknown variable: variable set is not provided");
+
+				xpath_variable* var = get_variable_scratch(_scratch, _variables, name.begin, name.end);
+
+				if (!var)
+					throw_error("Unknown variable: variable set does not contain the given name");
+
+				_lexer.next();
+
+				return new (alloc_node()) xpath_ast_node(ast_variable, var->type(), var);
+			}
+
+			case lex_open_brace:
+			{
+				_lexer.next();
+
+				xpath_ast_node* n = parse_expression();
+
+				if (_lexer.current() != lex_close_brace)
+					throw_error("Unmatched braces");
+
+				_lexer.next();
+
+				return n;
+			}
+
+			case lex_quoted_string:
+			{
+				const char_t* value = alloc_string(_lexer.contents());
+
+				xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_string_constant, xpath_type_string, value);
+				_lexer.next();
+
+				return n;
+			}
+
+			case lex_number:
+			{
+				double value = 0;
+
+				if (!convert_string_to_number_scratch(_scratch, _lexer.contents().begin, _lexer.contents().end, &value))
+					throw_error_oom();
+
+				xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_number_constant, xpath_type_number, value);
+				_lexer.next();
+
+				return n;
+			}
+
+			case lex_string:
+			{
+				xpath_ast_node* args[2] = {0};
+				size_t argc = 0;
+				
+				xpath_lexer_string function = _lexer.contents();
+				_lexer.next();
+				
+				xpath_ast_node* last_arg = 0;
+				
+				if (_lexer.current() != lex_open_brace)
+					throw_error("Unrecognized function call");
+				_lexer.next();
+
+				if (_lexer.current() != lex_close_brace)
+					args[argc++] = parse_expression();
+
+				while (_lexer.current() != lex_close_brace)
+				{
+					if (_lexer.current() != lex_comma)
+						throw_error("No comma between function arguments");
+					_lexer.next();
+					
+					xpath_ast_node* n = parse_expression();
+					
+					if (argc < 2) args[argc] = n;
+					else last_arg->set_next(n);
+
+					argc++;
+					last_arg = n;
+				}
+				
+				_lexer.next();
+
+				return parse_function(function, argc, args);
+			}
+
+			default:
+				throw_error("Unrecognizable primary expression");
+
+				return 0;
+			}
+		}
+		
+		// FilterExpr ::= PrimaryExpr | FilterExpr Predicate
+		// Predicate ::= '[' PredicateExpr ']'
+		// PredicateExpr ::= Expr
+		xpath_ast_node* parse_filter_expression()
+		{
+			xpath_ast_node* n = parse_primary_expression();
+
+			while (_lexer.current() == lex_open_square_brace)
+			{
+				_lexer.next();
+
+				xpath_ast_node* expr = parse_expression();
+
+				if (n->rettype() != xpath_type_node_set) throw_error("Predicate has to be applied to node set");
+
+				bool posinv = expr->rettype() != xpath_type_number && expr->is_posinv();
+
+				n = new (alloc_node()) xpath_ast_node(posinv ? ast_filter_posinv : ast_filter, xpath_type_node_set, n, expr);
+
+				if (_lexer.current() != lex_close_square_brace)
+					throw_error("Unmatched square brace");
+			
+				_lexer.next();
+			}
+			
+			return n;
+		}
+		
+		// Step ::= AxisSpecifier NodeTest Predicate* | AbbreviatedStep
+		// AxisSpecifier ::= AxisName '::' | '@'?
+		// NodeTest ::= NameTest | NodeType '(' ')' | 'processing-instruction' '(' Literal ')'
+		// NameTest ::= '*' | NCName ':' '*' | QName
+		// AbbreviatedStep ::= '.' | '..'
+		xpath_ast_node* parse_step(xpath_ast_node* set)
+		{
+			if (set && set->rettype() != xpath_type_node_set)
+				throw_error("Step has to be applied to node set");
+
+			bool axis_specified = false;
+			axis_t axis = axis_child; // implied child axis
+
+			if (_lexer.current() == lex_axis_attribute)
+			{
+				axis = axis_attribute;
+				axis_specified = true;
+				
+				_lexer.next();
+			}
+			else if (_lexer.current() == lex_dot)
+			{
+				_lexer.next();
+				
+				return new (alloc_node()) xpath_ast_node(ast_step, set, axis_self, nodetest_type_node, 0);
+			}
+			else if (_lexer.current() == lex_double_dot)
+			{
+				_lexer.next();
+				
+				return new (alloc_node()) xpath_ast_node(ast_step, set, axis_parent, nodetest_type_node, 0);
+			}
+		
+			nodetest_t nt_type = nodetest_none;
+			xpath_lexer_string nt_name;
+			
+			if (_lexer.current() == lex_string)
+			{
+				// node name test
+				nt_name = _lexer.contents();
+				_lexer.next();
+
+				// was it an axis name?
+				if (_lexer.current() == lex_double_colon)
+				{
+					// parse axis name
+					if (axis_specified) throw_error("Two axis specifiers in one step");
+
+					axis = parse_axis_name(nt_name, axis_specified);
+
+					if (!axis_specified) throw_error("Unknown axis");
+
+					// read actual node test
+					_lexer.next();
+
+					if (_lexer.current() == lex_multiply)
+					{
+						nt_type = nodetest_all;
+						nt_name = xpath_lexer_string();
+						_lexer.next();
+					}
+					else if (_lexer.current() == lex_string)
+					{
+						nt_name = _lexer.contents();
+						_lexer.next();
+					}
+					else throw_error("Unrecognized node test");
+				}
+				
+				if (nt_type == nodetest_none)
+				{
+					// node type test or processing-instruction
+					if (_lexer.current() == lex_open_brace)
+					{
+						_lexer.next();
+						
+						if (_lexer.current() == lex_close_brace)
+						{
+							_lexer.next();
+
+							nt_type = parse_node_test_type(nt_name);
+
+							if (nt_type == nodetest_none) throw_error("Unrecognized node type");
+							
+							nt_name = xpath_lexer_string();
+						}
+						else if (nt_name == PUGIXML_TEXT("processing-instruction"))
+						{
+							if (_lexer.current() != lex_quoted_string)
+								throw_error("Only literals are allowed as arguments to processing-instruction()");
+						
+							nt_type = nodetest_pi;
+							nt_name = _lexer.contents();
+							_lexer.next();
+							
+							if (_lexer.current() != lex_close_brace)
+								throw_error("Unmatched brace near processing-instruction()");
+							_lexer.next();
+						}
+						else
+							throw_error("Unmatched brace near node type test");
+
+					}
+					// QName or NCName:*
+					else
+					{
+						if (nt_name.end - nt_name.begin > 2 && nt_name.end[-2] == ':' && nt_name.end[-1] == '*') // NCName:*
+						{
+							nt_name.end--; // erase *
+							
+							nt_type = nodetest_all_in_namespace;
+						}
+						else nt_type = nodetest_name;
+					}
+				}
+			}
+			else if (_lexer.current() == lex_multiply)
+			{
+				nt_type = nodetest_all;
+				_lexer.next();
+			}
+			else throw_error("Unrecognized node test");
+			
+			xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step, set, axis, nt_type, alloc_string(nt_name));
+			
+			xpath_ast_node* last = 0;
+			
+			while (_lexer.current() == lex_open_square_brace)
+			{
+				_lexer.next();
+				
+				xpath_ast_node* expr = parse_expression();
+
+				xpath_ast_node* pred = new (alloc_node()) xpath_ast_node(ast_predicate, xpath_type_node_set, expr);
+				
+				if (_lexer.current() != lex_close_square_brace)
+					throw_error("Unmatched square brace");
+				_lexer.next();
+				
+				if (last) last->set_next(pred);
+				else n->set_right(pred);
+				
+				last = pred;
+			}
+			
+			return n;
+		}
+		
+		// RelativeLocationPath ::= Step | RelativeLocationPath '/' Step | RelativeLocationPath '//' Step
+		xpath_ast_node* parse_relative_location_path(xpath_ast_node* set)
+		{
+			xpath_ast_node* n = parse_step(set);
+			
+			while (_lexer.current() == lex_slash || _lexer.current() == lex_double_slash)
+			{
+				lexeme_t l = _lexer.current();
+				_lexer.next();
+
+				if (l == lex_double_slash)
+					n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0);
+				
+				n = parse_step(n);
+			}
+			
+			return n;
+		}
+		
+		// LocationPath ::= RelativeLocationPath | AbsoluteLocationPath
+		// AbsoluteLocationPath ::= '/' RelativeLocationPath? | '//' RelativeLocationPath
+		xpath_ast_node* parse_location_path()
+		{
+			if (_lexer.current() == lex_slash)
+			{
+				_lexer.next();
+				
+				xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step_root, xpath_type_node_set);
+
+				// relative location path can start from axis_attribute, dot, double_dot, multiply and string lexemes; any other lexeme means standalone root path
+				lexeme_t l = _lexer.current();
+
+				if (l == lex_string || l == lex_axis_attribute || l == lex_dot || l == lex_double_dot || l == lex_multiply)
+					return parse_relative_location_path(n);
+				else
+					return n;
+			}
+			else if (_lexer.current() == lex_double_slash)
+			{
+				_lexer.next();
+				
+				xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step_root, xpath_type_node_set);
+				n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0);
+				
+				return parse_relative_location_path(n);
+			}
+
+			// else clause moved outside of if because of bogus warning 'control may reach end of non-void function being inlined' in gcc 4.0.1
+			return parse_relative_location_path(0);
+		}
+		
+		// PathExpr ::= LocationPath
+		//				| FilterExpr
+		//				| FilterExpr '/' RelativeLocationPath
+		//				| FilterExpr '//' RelativeLocationPath
+		// UnionExpr ::= PathExpr | UnionExpr '|' PathExpr
+		// UnaryExpr ::= UnionExpr | '-' UnaryExpr
+		xpath_ast_node* parse_path_or_unary_expression()
+		{
+			// Clarification.
+			// PathExpr begins with either LocationPath or FilterExpr.
+			// FilterExpr begins with PrimaryExpr.
+			// PrimaryExpr begins with '$' if it is a variable reference, with '(' if it is a
+			// parenthesized expression, or with a string literal, number constant or function call.
+
+			if (_lexer.current() == lex_var_ref || _lexer.current() == lex_open_brace || 
+				_lexer.current() == lex_quoted_string || _lexer.current() == lex_number ||
+				_lexer.current() == lex_string)
+			{
+				if (_lexer.current() == lex_string)
+				{
+					// This is either a function call or the start of a location path; if no '(' follows, proceed with a location path
+					const char_t* state = _lexer.state();
+					
+					while (PUGI__IS_CHARTYPE(*state, ct_space)) ++state;
+					
+					if (*state != '(') return parse_location_path();
+
+					// This looks like a function call; however this still can be a node-test. Check it.
+					if (parse_node_test_type(_lexer.contents()) != nodetest_none) return parse_location_path();
+				}
+				
+				xpath_ast_node* n = parse_filter_expression();
+
+				if (_lexer.current() == lex_slash || _lexer.current() == lex_double_slash)
+				{
+					lexeme_t l = _lexer.current();
+					_lexer.next();
+					
+					if (l == lex_double_slash)
+					{
+						if (n->rettype() != xpath_type_node_set) throw_error("Step has to be applied to node set");
+
+						n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0);
+					}
+	
+					// select from location path
+					return parse_relative_location_path(n);
+				}
+
+				return n;
+			}
+			else if (_lexer.current() == lex_minus)
+			{
+				_lexer.next();
+
+				// precedence 7+ - only parses union expressions
+				xpath_ast_node* expr = parse_expression_rec(parse_path_or_unary_expression(), 7);
+
+				return new (alloc_node()) xpath_ast_node(ast_op_negate, xpath_type_number, expr);
+			}
+			else
+				return parse_location_path();
+		}
+
+		struct binary_op_t
+		{
+			ast_type_t asttype;
+			xpath_value_type rettype;
+			int precedence;
+
+			binary_op_t(): asttype(ast_unknown), rettype(xpath_type_none), precedence(0)
+			{
+			}
+
+			binary_op_t(ast_type_t asttype_, xpath_value_type rettype_, int precedence_): asttype(asttype_), rettype(rettype_), precedence(precedence_)
+			{
+			}
+
+			static binary_op_t parse(xpath_lexer& lexer)
+			{
+				switch (lexer.current())
+				{
+				case lex_string:
+					if (lexer.contents() == PUGIXML_TEXT("or"))
+						return binary_op_t(ast_op_or, xpath_type_boolean, 1);
+					else if (lexer.contents() == PUGIXML_TEXT("and"))
+						return binary_op_t(ast_op_and, xpath_type_boolean, 2);
+					else if (lexer.contents() == PUGIXML_TEXT("div"))
+						return binary_op_t(ast_op_divide, xpath_type_number, 6);
+					else if (lexer.contents() == PUGIXML_TEXT("mod"))
+						return binary_op_t(ast_op_mod, xpath_type_number, 6);
+					else
+						return binary_op_t();
+
+				case lex_equal:
+					return binary_op_t(ast_op_equal, xpath_type_boolean, 3);
+
+				case lex_not_equal:
+					return binary_op_t(ast_op_not_equal, xpath_type_boolean, 3);
+
+				case lex_less:
+					return binary_op_t(ast_op_less, xpath_type_boolean, 4);
+
+				case lex_greater:
+					return binary_op_t(ast_op_greater, xpath_type_boolean, 4);
+
+				case lex_less_or_equal:
+					return binary_op_t(ast_op_less_or_equal, xpath_type_boolean, 4);
+
+				case lex_greater_or_equal:
+					return binary_op_t(ast_op_greater_or_equal, xpath_type_boolean, 4);
+
+				case lex_plus:
+					return binary_op_t(ast_op_add, xpath_type_number, 5);
+
+				case lex_minus:
+					return binary_op_t(ast_op_subtract, xpath_type_number, 5);
+
+				case lex_multiply:
+					return binary_op_t(ast_op_multiply, xpath_type_number, 6);
+
+				case lex_union:
+					return binary_op_t(ast_op_union, xpath_type_node_set, 7);
+
+				default:
+					return binary_op_t();
+				}
+			}
+		};
+
+		xpath_ast_node* parse_expression_rec(xpath_ast_node* lhs, int limit)
+		{
+			binary_op_t op = binary_op_t::parse(_lexer);
+
+			while (op.asttype != ast_unknown && op.precedence >= limit)
+			{
+				_lexer.next();
+
+				xpath_ast_node* rhs = parse_path_or_unary_expression();
+
+				binary_op_t nextop = binary_op_t::parse(_lexer);
+
+				while (nextop.asttype != ast_unknown && nextop.precedence > op.precedence)
+				{
+					rhs = parse_expression_rec(rhs, nextop.precedence);
+
+					nextop = binary_op_t::parse(_lexer);
+				}
+
+				if (op.asttype == ast_op_union && (lhs->rettype() != xpath_type_node_set || rhs->rettype() != xpath_type_node_set))
+					throw_error("Union operator has to be applied to node sets");
+
+				lhs = new (alloc_node()) xpath_ast_node(op.asttype, op.rettype, lhs, rhs);
+
+				op = binary_op_t::parse(_lexer);
+			}
+
+			return lhs;
+		}
+
+		// Expr ::= OrExpr
+		// OrExpr ::= AndExpr | OrExpr 'or' AndExpr
+		// AndExpr ::= EqualityExpr | AndExpr 'and' EqualityExpr
+		// EqualityExpr ::= RelationalExpr
+		//					| EqualityExpr '=' RelationalExpr
+		//					| EqualityExpr '!=' RelationalExpr
+		// RelationalExpr ::= AdditiveExpr
+		//					  | RelationalExpr '<' AdditiveExpr
+		//					  | RelationalExpr '>' AdditiveExpr
+		//					  | RelationalExpr '<=' AdditiveExpr
+		//					  | RelationalExpr '>=' AdditiveExpr
+		// AdditiveExpr ::= MultiplicativeExpr
+		//					| AdditiveExpr '+' MultiplicativeExpr
+		//					| AdditiveExpr '-' MultiplicativeExpr
+		// MultiplicativeExpr ::= UnaryExpr
+		//						  | MultiplicativeExpr '*' UnaryExpr
+		//						  | MultiplicativeExpr 'div' UnaryExpr
+		//						  | MultiplicativeExpr 'mod' UnaryExpr
+		xpath_ast_node* parse_expression()
+		{
+			return parse_expression_rec(parse_path_or_unary_expression(), 0);
+		}
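+
+		// Illustrative note (editor's addition, not upstream pugixml code): parse_expression_rec
+		// implements precedence climbing over the binary_op_t table above. For a query such as
+		// "1 + 2 * 3" it parses "1", sees '+' (precedence 5), parses "2", then sees '*'
+		// (precedence 6 > 5) and recurses so that "2 * 3" is reduced first, yielding
+		// ast_op_add(1, ast_op_multiply(2, 3)) - the usual arithmetic grouping.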
+
+		xpath_parser(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result): _alloc(alloc), _lexer(query), _query(query), _variables(variables), _result(result)
+		{
+		}
+
+		xpath_ast_node* parse()
+		{
+			xpath_ast_node* result = parse_expression();
+			
+			if (_lexer.current() != lex_eof)
+			{
+				// there are still unparsed tokens left, error
+				throw_error("Incorrect query");
+			}
+			
+			return result;
+		}
+
+		static xpath_ast_node* parse(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result)
+		{
+			xpath_parser parser(query, variables, alloc, result);
+
+		#ifdef PUGIXML_NO_EXCEPTIONS
+			int error = setjmp(parser._error_handler);
+
+			return (error == 0) ? parser.parse() : 0;
+		#else
+			return parser.parse();
+		#endif
+		}
+	};
+
+	struct xpath_query_impl
+	{
+		static xpath_query_impl* create()
+		{
+			void* memory = xml_memory::allocate(sizeof(xpath_query_impl));
+
+			return new (memory) xpath_query_impl();
+		}
+
+		static void destroy(void* ptr)
+		{
+			if (!ptr) return;
+			
+			// free all allocated pages
+			static_cast<xpath_query_impl*>(ptr)->alloc.release();
+
+			// free allocator memory (with the first page)
+			xml_memory::deallocate(ptr);
+		}
+
+		xpath_query_impl(): root(0), alloc(&block)
+		{
+			block.next = 0;
+		}
+
+		xpath_ast_node* root;
+		xpath_allocator alloc;
+		xpath_memory_block block;
+	};
+
+	PUGI__FN xpath_string evaluate_string_impl(xpath_query_impl* impl, const xpath_node& n, xpath_stack_data& sd)
+	{
+		if (!impl) return xpath_string();
+
+	#ifdef PUGIXML_NO_EXCEPTIONS
+		if (setjmp(sd.error_handler)) return xpath_string();
+	#endif
+
+		xpath_context c(n, 1, 1);
+
+		return impl->root->eval_string(c, sd.stack);
+	}
+PUGI__NS_END
+
+namespace pugi
+{
+#ifndef PUGIXML_NO_EXCEPTIONS
+	PUGI__FN xpath_exception::xpath_exception(const xpath_parse_result& result_): _result(result_)
+	{
+		assert(_result.error);
+	}
+	
+	PUGI__FN const char* xpath_exception::what() const throw()
+	{
+		return _result.error;
+	}
+
+	PUGI__FN const xpath_parse_result& xpath_exception::result() const
+	{
+		return _result;
+	}
+#endif
+	
+	PUGI__FN xpath_node::xpath_node()
+	{
+	}
+		
+	PUGI__FN xpath_node::xpath_node(const xml_node& node_): _node(node_)
+	{
+	}
+		
+	PUGI__FN xpath_node::xpath_node(const xml_attribute& attribute_, const xml_node& parent_): _node(attribute_ ? parent_ : xml_node()), _attribute(attribute_)
+	{
+	}
+
+	PUGI__FN xml_node xpath_node::node() const
+	{
+		return _attribute ? xml_node() : _node;
+	}
+		
+	PUGI__FN xml_attribute xpath_node::attribute() const
+	{
+		return _attribute;
+	}
+	
+	PUGI__FN xml_node xpath_node::parent() const
+	{
+		return _attribute ? _node : _node.parent();
+	}
+
+	PUGI__FN static void unspecified_bool_xpath_node(xpath_node***)
+	{
+	}
+
+	PUGI__FN xpath_node::operator xpath_node::unspecified_bool_type() const
+	{
+		return (_node || _attribute) ? unspecified_bool_xpath_node : 0;
+	}
+	
+	PUGI__FN bool xpath_node::operator!() const
+	{
+		return !(_node || _attribute);
+	}
+
+	PUGI__FN bool xpath_node::operator==(const xpath_node& n) const
+	{
+		return _node == n._node && _attribute == n._attribute;
+	}
+	
+	PUGI__FN bool xpath_node::operator!=(const xpath_node& n) const
+	{
+		return _node != n._node || _attribute != n._attribute;
+	}
+
+#ifdef __BORLANDC__
+	PUGI__FN bool operator&&(const xpath_node& lhs, bool rhs)
+	{
+		return (bool)lhs && rhs;
+	}
+
+	PUGI__FN bool operator||(const xpath_node& lhs, bool rhs)
+	{
+		return (bool)lhs || rhs;
+	}
+#endif
+
+	PUGI__FN void xpath_node_set::_assign(const_iterator begin_, const_iterator end_)
+	{
+		assert(begin_ <= end_);
+
+		size_t size_ = static_cast<size_t>(end_ - begin_);
+
+		if (size_ <= 1)
+		{
+			// deallocate old buffer
+			if (_begin != &_storage) impl::xml_memory::deallocate(_begin);
+
+			// use internal buffer
+			if (begin_ != end_) _storage = *begin_;
+
+			_begin = &_storage;
+			_end = &_storage + size_;
+		}
+		else
+		{
+			// make heap copy
+			xpath_node* storage = static_cast<xpath_node*>(impl::xml_memory::allocate(size_ * sizeof(xpath_node)));
+
+			if (!storage)
+			{
+			#ifdef PUGIXML_NO_EXCEPTIONS
+				return;
+			#else
+				throw std::bad_alloc();
+			#endif
+			}
+
+			memcpy(storage, begin_, size_ * sizeof(xpath_node));
+			
+			// deallocate old buffer
+			if (_begin != &_storage) impl::xml_memory::deallocate(_begin);
+
+			// finalize
+			_begin = storage;
+			_end = storage + size_;
+		}
+	}
+
+	PUGI__FN xpath_node_set::xpath_node_set(): _type(type_unsorted), _begin(&_storage), _end(&_storage)
+	{
+	}
+
+	PUGI__FN xpath_node_set::xpath_node_set(const_iterator begin_, const_iterator end_, type_t type_): _type(type_), _begin(&_storage), _end(&_storage)
+	{
+		_assign(begin_, end_);
+	}
+
+	PUGI__FN xpath_node_set::~xpath_node_set()
+	{
+		if (_begin != &_storage) impl::xml_memory::deallocate(_begin);
+	}
+		
+	PUGI__FN xpath_node_set::xpath_node_set(const xpath_node_set& ns): _type(ns._type), _begin(&_storage), _end(&_storage)
+	{
+		_assign(ns._begin, ns._end);
+	}
+	
+	PUGI__FN xpath_node_set& xpath_node_set::operator=(const xpath_node_set& ns)
+	{
+		if (this == &ns) return *this;
+		
+		_type = ns._type;
+		_assign(ns._begin, ns._end);
+
+		return *this;
+	}
+
+	PUGI__FN xpath_node_set::type_t xpath_node_set::type() const
+	{
+		return _type;
+	}
+		
+	PUGI__FN size_t xpath_node_set::size() const
+	{
+		return _end - _begin;
+	}
+		
+	PUGI__FN bool xpath_node_set::empty() const
+	{
+		return _begin == _end;
+	}
+		
+	PUGI__FN const xpath_node& xpath_node_set::operator[](size_t index) const
+	{
+		assert(index < size());
+		return _begin[index];
+	}
+
+	PUGI__FN xpath_node_set::const_iterator xpath_node_set::begin() const
+	{
+		return _begin;
+	}
+		
+	PUGI__FN xpath_node_set::const_iterator xpath_node_set::end() const
+	{
+		return _end;
+	}
+	
+	PUGI__FN void xpath_node_set::sort(bool reverse)
+	{
+		_type = impl::xpath_sort(_begin, _end, _type, reverse);
+	}
+
+	PUGI__FN xpath_node xpath_node_set::first() const
+	{
+		return impl::xpath_first(_begin, _end, _type);
+	}
+
+	PUGI__FN xpath_parse_result::xpath_parse_result(): error("Internal error"), offset(0)
+	{
+	}
+
+	PUGI__FN xpath_parse_result::operator bool() const
+	{
+		return error == 0;
+	}
+
+	PUGI__FN const char* xpath_parse_result::description() const
+	{
+		return error ? error : "No error";
+	}
+
+	PUGI__FN xpath_variable::xpath_variable(): _type(xpath_type_none), _next(0)
+	{
+	}
+
+	PUGI__FN const char_t* xpath_variable::name() const
+	{
+		switch (_type)
+		{
+		case xpath_type_node_set:
+			return static_cast<const impl::xpath_variable_node_set*>(this)->name;
+
+		case xpath_type_number:
+			return static_cast<const impl::xpath_variable_number*>(this)->name;
+
+		case xpath_type_string:
+			return static_cast<const impl::xpath_variable_string*>(this)->name;
+
+		case xpath_type_boolean:
+			return static_cast<const impl::xpath_variable_boolean*>(this)->name;
+
+		default:
+			assert(!"Invalid variable type");
+			return 0;
+		}
+	}
+
+	PUGI__FN xpath_value_type xpath_variable::type() const
+	{
+		return _type;
+	}
+
+	PUGI__FN bool xpath_variable::get_boolean() const
+	{
+		return (_type == xpath_type_boolean) ? static_cast<const impl::xpath_variable_boolean*>(this)->value : false;
+	}
+
+	PUGI__FN double xpath_variable::get_number() const
+	{
+		return (_type == xpath_type_number) ? static_cast<const impl::xpath_variable_number*>(this)->value : impl::gen_nan();
+	}
+
+	PUGI__FN const char_t* xpath_variable::get_string() const
+	{
+		const char_t* value = (_type == xpath_type_string) ? static_cast<const impl::xpath_variable_string*>(this)->value : 0;
+		return value ? value : PUGIXML_TEXT("");
+	}
+
+	PUGI__FN const xpath_node_set& xpath_variable::get_node_set() const
+	{
+		return (_type == xpath_type_node_set) ? static_cast<const impl::xpath_variable_node_set*>(this)->value : impl::dummy_node_set;
+	}
+
+	PUGI__FN bool xpath_variable::set(bool value)
+	{
+		if (_type != xpath_type_boolean) return false;
+
+		static_cast<impl::xpath_variable_boolean*>(this)->value = value;
+		return true;
+	}
+
+	PUGI__FN bool xpath_variable::set(double value)
+	{
+		if (_type != xpath_type_number) return false;
+
+		static_cast<impl::xpath_variable_number*>(this)->value = value;
+		return true;
+	}
+
+	PUGI__FN bool xpath_variable::set(const char_t* value)
+	{
+		if (_type != xpath_type_string) return false;
+
+		impl::xpath_variable_string* var = static_cast<impl::xpath_variable_string*>(this);
+
+		// duplicate string
+		size_t size = (impl::strlength(value) + 1) * sizeof(char_t);
+
+		char_t* copy = static_cast<char_t*>(impl::xml_memory::allocate(size));
+		if (!copy) return false;
+
+		memcpy(copy, value, size);
+
+		// replace old string
+		if (var->value) impl::xml_memory::deallocate(var->value);
+		var->value = copy;
+
+		return true;
+	}
+
+	PUGI__FN bool xpath_variable::set(const xpath_node_set& value)
+	{
+		if (_type != xpath_type_node_set) return false;
+
+		static_cast<impl::xpath_variable_node_set*>(this)->value = value;
+		return true;
+	}
+
+	PUGI__FN xpath_variable_set::xpath_variable_set()
+	{
+		for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i) _data[i] = 0;
+	}
+
+	PUGI__FN xpath_variable_set::~xpath_variable_set()
+	{
+		for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i)
+		{
+			xpath_variable* var = _data[i];
+
+			while (var)
+			{
+				xpath_variable* next = var->_next;
+
+				impl::delete_xpath_variable(var->_type, var);
+
+				var = next;
+			}
+		}
+	}
+
+	PUGI__FN xpath_variable* xpath_variable_set::find(const char_t* name) const
+	{
+		const size_t hash_size = sizeof(_data) / sizeof(_data[0]);
+		size_t hash = impl::hash_string(name) % hash_size;
+
+		// look for existing variable
+		for (xpath_variable* var = _data[hash]; var; var = var->_next)
+			if (impl::strequal(var->name(), name))
+				return var;
+
+		return 0;
+	}
+
+	PUGI__FN xpath_variable* xpath_variable_set::add(const char_t* name, xpath_value_type type)
+	{
+		const size_t hash_size = sizeof(_data) / sizeof(_data[0]);
+		size_t hash = impl::hash_string(name) % hash_size;
+
+		// look for existing variable
+		for (xpath_variable* var = _data[hash]; var; var = var->_next)
+			if (impl::strequal(var->name(), name))
+				return var->type() == type ? var : 0;
+
+		// add new variable
+		xpath_variable* result = impl::new_xpath_variable(type, name);
+
+		if (result)
+		{
+			result->_type = type;
+			result->_next = _data[hash];
+
+			_data[hash] = result;
+		}
+
+		return result;
+	}
+
+	PUGI__FN bool xpath_variable_set::set(const char_t* name, bool value)
+	{
+		xpath_variable* var = add(name, xpath_type_boolean);
+		return var ? var->set(value) : false;
+	}
+
+	PUGI__FN bool xpath_variable_set::set(const char_t* name, double value)
+	{
+		xpath_variable* var = add(name, xpath_type_number);
+		return var ? var->set(value) : false;
+	}
+
+	PUGI__FN bool xpath_variable_set::set(const char_t* name, const char_t* value)
+	{
+		xpath_variable* var = add(name, xpath_type_string);
+		return var ? var->set(value) : false;
+	}
+
+	PUGI__FN bool xpath_variable_set::set(const char_t* name, const xpath_node_set& value)
+	{
+		xpath_variable* var = add(name, xpath_type_node_set);
+		return var ? var->set(value) : false;
+	}
+
+	PUGI__FN xpath_variable* xpath_variable_set::get(const char_t* name)
+	{
+		return find(name);
+	}
+
+	PUGI__FN const xpath_variable* xpath_variable_set::get(const char_t* name) const
+	{
+		return find(name);
+	}
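+
+	// Illustrative usage sketch (editor's addition, not upstream pugixml code; names are
+	// hypothetical): variables held in an xpath_variable_set can be referenced as $name
+	// inside a query, e.g.
+	//
+	//   pugi::xpath_variable_set vars;
+	//   vars.set(PUGIXML_TEXT("min_year"), 2000.0);
+	//   pugi::xpath_query q(PUGIXML_TEXT("//book[@year >= $min_year]"), &vars);
+	//
+	// add() returns 0 when a variable with the same name but a different type already exists
+	// (see the hash-bucket lookup above), so the set() helpers report failure in that case.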
+
+	PUGI__FN xpath_query::xpath_query(const char_t* query, xpath_variable_set* variables): _impl(0)
+	{
+		impl::xpath_query_impl* qimpl = impl::xpath_query_impl::create();
+
+		if (!qimpl)
+		{
+		#ifdef PUGIXML_NO_EXCEPTIONS
+			_result.error = "Out of memory";
+		#else
+			throw std::bad_alloc();
+		#endif
+		}
+		else
+		{
+			impl::buffer_holder impl_holder(qimpl, impl::xpath_query_impl::destroy);
+
+			qimpl->root = impl::xpath_parser::parse(query, variables, &qimpl->alloc, &_result);
+
+			if (qimpl->root)
+			{
+				_impl = static_cast<impl::xpath_query_impl*>(impl_holder.release());
+				_result.error = 0;
+			}
+		}
+	}
+
+	PUGI__FN xpath_query::~xpath_query()
+	{
+		impl::xpath_query_impl::destroy(_impl);
+	}
+
+	PUGI__FN xpath_value_type xpath_query::return_type() const
+	{
+		if (!_impl) return xpath_type_none;
+
+		return static_cast<impl::xpath_query_impl*>(_impl)->root->rettype();
+	}
+
+	PUGI__FN bool xpath_query::evaluate_boolean(const xpath_node& n) const
+	{
+		if (!_impl) return false;
+		
+		impl::xpath_context c(n, 1, 1);
+		impl::xpath_stack_data sd;
+
+	#ifdef PUGIXML_NO_EXCEPTIONS
+		if (setjmp(sd.error_handler)) return false;
+	#endif
+		
+		return static_cast<impl::xpath_query_impl*>(_impl)->root->eval_boolean(c, sd.stack);
+	}
+	
+	PUGI__FN double xpath_query::evaluate_number(const xpath_node& n) const
+	{
+		if (!_impl) return impl::gen_nan();
+		
+		impl::xpath_context c(n, 1, 1);
+		impl::xpath_stack_data sd;
+
+	#ifdef PUGIXML_NO_EXCEPTIONS
+		if (setjmp(sd.error_handler)) return impl::gen_nan();
+	#endif
+
+		return static_cast<impl::xpath_query_impl*>(_impl)->root->eval_number(c, sd.stack);
+	}
+
+#ifndef PUGIXML_NO_STL
+	PUGI__FN string_t xpath_query::evaluate_string(const xpath_node& n) const
+	{
+		impl::xpath_stack_data sd;
+
+		return impl::evaluate_string_impl(static_cast<impl::xpath_query_impl*>(_impl), n, sd).c_str();
+	}
+#endif
+
+	PUGI__FN size_t xpath_query::evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const
+	{
+		impl::xpath_stack_data sd;
+
+		impl::xpath_string r = impl::evaluate_string_impl(static_cast<impl::xpath_query_impl*>(_impl), n, sd);
+
+		size_t full_size = r.length() + 1;
+		
+		if (capacity > 0)
+		{
+			size_t size = (full_size < capacity) ? full_size : capacity;
+			assert(size > 0);
+
+			memcpy(buffer, r.c_str(), (size - 1) * sizeof(char_t));
+			buffer[size - 1] = 0;
+		}
+		
+		return full_size;
+	}
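+
+	// Illustrative note (editor's addition, not upstream pugixml code): this overload always
+	// returns the full result size including the terminating zero and truncates the output to
+	// 'capacity' characters, so a caller (here a hypothetical query q and node n) can ask for
+	// the required size first:
+	//
+	//   size_t needed = q.evaluate_string(0, 0, n);   // nothing is written when capacity == 0
+	//   std::vector<pugi::char_t> buf(needed);
+	//   q.evaluate_string(&buf[0], buf.size(), n);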
+
+	PUGI__FN xpath_node_set xpath_query::evaluate_node_set(const xpath_node& n) const
+	{
+		if (!_impl) return xpath_node_set();
+
+		impl::xpath_ast_node* root = static_cast<impl::xpath_query_impl*>(_impl)->root;
+
+		if (root->rettype() != xpath_type_node_set)
+		{
+		#ifdef PUGIXML_NO_EXCEPTIONS
+			return xpath_node_set();
+		#else
+			xpath_parse_result res;
+			res.error = "Expression does not evaluate to node set";
+
+			throw xpath_exception(res);
+		#endif
+		}
+		
+		impl::xpath_context c(n, 1, 1);
+		impl::xpath_stack_data sd;
+
+	#ifdef PUGIXML_NO_EXCEPTIONS
+		if (setjmp(sd.error_handler)) return xpath_node_set();
+	#endif
+
+		impl::xpath_node_set_raw r = root->eval_node_set(c, sd.stack);
+
+		return xpath_node_set(r.begin(), r.end(), r.type());
+	}
+
+	PUGI__FN const xpath_parse_result& xpath_query::result() const
+	{
+		return _result;
+	}
+
+	PUGI__FN static void unspecified_bool_xpath_query(xpath_query***)
+	{
+	}
+
+	PUGI__FN xpath_query::operator xpath_query::unspecified_bool_type() const
+	{
+		return _impl ? unspecified_bool_xpath_query : 0;
+	}
+
+	PUGI__FN bool xpath_query::operator!() const
+	{
+		return !_impl;
+	}
+
+	PUGI__FN xpath_node xml_node::select_single_node(const char_t* query, xpath_variable_set* variables) const
+	{
+		xpath_query q(query, variables);
+		return select_single_node(q);
+	}
+
+	PUGI__FN xpath_node xml_node::select_single_node(const xpath_query& query) const
+	{
+		xpath_node_set s = query.evaluate_node_set(*this);
+		return s.empty() ? xpath_node() : s.first();
+	}
+
+	PUGI__FN xpath_node_set xml_node::select_nodes(const char_t* query, xpath_variable_set* variables) const
+	{
+		xpath_query q(query, variables);
+		return select_nodes(q);
+	}
+
+	PUGI__FN xpath_node_set xml_node::select_nodes(const xpath_query& query) const
+	{
+		return query.evaluate_node_set(*this);
+	}
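+
+	// Illustrative usage sketch (editor's addition, not upstream pugixml code; the query text is
+	// hypothetical):
+	//
+	//   pugi::xpath_node first = doc.select_single_node(PUGIXML_TEXT("//gadget[1]"));
+	//   pugi::xpath_node_set all = doc.select_nodes(PUGIXML_TEXT("//gadget"));
+	//
+	// Both convenience overloads compile the expression on every call; reuse an explicit
+	// xpath_query object when the same expression is evaluated repeatedly.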
+}
+
+#endif
+
+#ifdef __BORLANDC__
+#	pragma option pop
+#endif
+
+// Intel C++ does not properly keep warning state for function templates,
+// so popping warning state at the end of translation unit leads to warnings in the middle.
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#	pragma warning(pop)
+#endif
+
+// Undefine all local macros (makes sure we're not leaking macros in header-only mode)
+#undef PUGI__NO_INLINE
+#undef PUGI__STATIC_ASSERT
+#undef PUGI__DMC_VOLATILE
+#undef PUGI__MSVC_CRT_VERSION
+#undef PUGI__NS_BEGIN
+#undef PUGI__NS_END
+#undef PUGI__FN
+#undef PUGI__FN_NO_INLINE
+#undef PUGI__IS_CHARTYPE_IMPL
+#undef PUGI__IS_CHARTYPE
+#undef PUGI__IS_CHARTYPEX
+#undef PUGI__SKIPWS
+#undef PUGI__OPTSET
+#undef PUGI__PUSHNODE
+#undef PUGI__POPNODE
+#undef PUGI__SCANFOR
+#undef PUGI__SCANWHILE
+#undef PUGI__ENDSEG
+#undef PUGI__THROW_ERROR
+#undef PUGI__CHECK_ERROR
+
+#endif
+
+/**
+ * Copyright (c) 2006-2014 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
diff --git a/apps/gadgetron/pugixml.hpp b/apps/gadgetron/pugixml.hpp
new file mode 100644
index 0000000..6fb99be
--- /dev/null
+++ b/apps/gadgetron/pugixml.hpp
@@ -0,0 +1,1332 @@
+/**
+ * pugixml parser - version 1.4
+ * --------------------------------------------------------
+ * Copyright (C) 2006-2014, by Arseny Kapoulkine (arseny.kapoulkine at gmail.com)
+ * Report bugs and download new versions at http://pugixml.org/
+ *
+ * This library is distributed under the MIT License. See notice at the end
+ * of this file.
+ *
+ * This work is based on the pugxml parser, which is:
+ * Copyright (C) 2003, by Kristen Wegner (kristen at tima.net)
+ */
+
+#ifndef PUGIXML_VERSION
+// Define version macro; evaluates to major * 100 + minor so that it's safe to use in less-than comparisons
+#	define PUGIXML_VERSION 140
+#endif
+
+// Include user configuration file (this can define various configuration macros)
+#include "pugiconfig.hpp"
+
+#ifndef HEADER_PUGIXML_HPP
+#define HEADER_PUGIXML_HPP
+
+// Include stddef.h for size_t and ptrdiff_t
+#include <stddef.h>
+
+// Include exception header for XPath
+#if !defined(PUGIXML_NO_XPATH) && !defined(PUGIXML_NO_EXCEPTIONS)
+#	include <exception>
+#endif
+
+// Include STL headers
+#ifndef PUGIXML_NO_STL
+#	include <iterator>
+#	include <iosfwd>
+#	include <string>
+#endif
+
+// Macro for deprecated features
+#ifndef PUGIXML_DEPRECATED
+#	if defined(__GNUC__)
+#		define PUGIXML_DEPRECATED __attribute__((deprecated))
+#	elif defined(_MSC_VER) && _MSC_VER >= 1300
+#		define PUGIXML_DEPRECATED __declspec(deprecated)
+#	else
+#		define PUGIXML_DEPRECATED
+#	endif
+#endif
+
+// If no API is defined, assume default
+#ifndef PUGIXML_API
+#	define PUGIXML_API
+#endif
+
+// If no API for classes is defined, assume default
+#ifndef PUGIXML_CLASS
+#	define PUGIXML_CLASS PUGIXML_API
+#endif
+
+// If no API for functions is defined, assume default
+#ifndef PUGIXML_FUNCTION
+#	define PUGIXML_FUNCTION PUGIXML_API
+#endif
+
+// If the platform is known to have long long support, enable long long functions
+#ifndef PUGIXML_HAS_LONG_LONG
+#	if defined(__cplusplus) && __cplusplus >= 201103
+#		define PUGIXML_HAS_LONG_LONG
+#	elif defined(_MSC_VER) && _MSC_VER >= 1400
+#		define PUGIXML_HAS_LONG_LONG
+#	endif
+#endif
+
+// Character interface macros
+#ifdef PUGIXML_WCHAR_MODE
+#	define PUGIXML_TEXT(t) L ## t
+#	define PUGIXML_CHAR wchar_t
+#else
+#	define PUGIXML_TEXT(t) t
+#	define PUGIXML_CHAR char
+#endif
+
+namespace pugi
+{
+	// Character type used for all internal storage and operations; depends on PUGIXML_WCHAR_MODE
+	typedef PUGIXML_CHAR char_t;
+
+#ifndef PUGIXML_NO_STL
+	// String type used for operations that work with STL string; depends on PUGIXML_WCHAR_MODE
+	typedef std::basic_string<PUGIXML_CHAR, std::char_traits<PUGIXML_CHAR>, std::allocator<PUGIXML_CHAR> > string_t;
+#endif
+}
+
+// The PugiXML namespace
+namespace pugi
+{
+	// Tree node types
+	enum xml_node_type
+	{
+		node_null,			// Empty (null) node handle
+		node_document,		// A document tree's absolute root
+		node_element,		// Element tag, i.e. '<node/>'
+		node_pcdata,		// Plain character data, i.e. 'text'
+		node_cdata,			// Character data, i.e. '<![CDATA[text]]>'
+		node_comment,		// Comment tag, i.e. '<!-- text -->'
+		node_pi,			// Processing instruction, i.e. '<?name?>'
+		node_declaration,	// Document declaration, i.e. '<?xml version="1.0"?>'
+		node_doctype		// Document type declaration, i.e. '<!DOCTYPE doc>'
+	};
+
+	// Parsing options
+
+	// Minimal parsing mode (equivalent to turning all other flags off).
+	// Only elements and PCDATA sections are added to the DOM tree, no text conversions are performed.
+	const unsigned int parse_minimal = 0x0000;
+
+	// This flag determines if processing instructions (node_pi) are added to the DOM tree. This flag is off by default.
+	const unsigned int parse_pi = 0x0001;
+
+	// This flag determines if comments (node_comment) are added to the DOM tree. This flag is off by default.
+	const unsigned int parse_comments = 0x0002;
+
+	// This flag determines if CDATA sections (node_cdata) are added to the DOM tree. This flag is on by default.
+	const unsigned int parse_cdata = 0x0004;
+
+	// This flag determines if plain character data (node_pcdata) that consists only of whitespace is added to the DOM tree.
+	// This flag is off by default; turning it on usually results in slower parsing and more memory consumption.
+	const unsigned int parse_ws_pcdata = 0x0008;
+
+	// This flag determines if character and entity references are expanded during parsing. This flag is on by default.
+	const unsigned int parse_escapes = 0x0010;
+
+	// This flag determines if EOL characters are normalized (converted to #xA) during parsing. This flag is on by default.
+	const unsigned int parse_eol = 0x0020;
+	
+	// This flag determines if attribute values are normalized using CDATA normalization rules during parsing. This flag is on by default.
+	const unsigned int parse_wconv_attribute = 0x0040;
+
+	// This flag determines if attribute values are normalized using NMTOKENS normalization rules during parsing. This flag is off by default.
+	const unsigned int parse_wnorm_attribute = 0x0080;
+	
+	// This flag determines if document declaration (node_declaration) is added to the DOM tree. This flag is off by default.
+	const unsigned int parse_declaration = 0x0100;
+
+	// This flag determines if document type declaration (node_doctype) is added to the DOM tree. This flag is off by default.
+	const unsigned int parse_doctype = 0x0200;
+
+	// This flag determines if plain character data (node_pcdata) that is the only child of the parent node and that consists only
+	// of whitespace is added to the DOM tree.
+	// This flag is off by default; turning it on may result in slower parsing and more memory consumption.
+	const unsigned int parse_ws_pcdata_single = 0x0400;
+
+	// This flag determines if leading and trailing whitespace is to be removed from plain character data. This flag is off by default.
+	const unsigned int parse_trim_pcdata = 0x0800;
+
+	// This flag determines if plain character data that does not have a parent node is added to the DOM tree, and if an empty document
+	// is a valid document. This flag is off by default.
+	const unsigned int parse_fragment = 0x1000;
+
+	// The default parsing mode.
+	// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
+	// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
+	const unsigned int parse_default = parse_cdata | parse_escapes | parse_wconv_attribute | parse_eol;
+
+	// The full parsing mode.
+	// Nodes of all types are added to the DOM tree, character/reference entities are expanded,
+	// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
+	const unsigned int parse_full = parse_default | parse_pi | parse_comments | parse_declaration | parse_doctype;
+
+	// These values specify the encoding of input data for an XML document
+	enum xml_encoding
+	{
+		encoding_auto,		// Auto-detect input encoding using BOM or < / <? detection; use UTF8 if BOM is not found
+		encoding_utf8,		// UTF8 encoding
+		encoding_utf16_le,	// Little-endian UTF16
+		encoding_utf16_be,	// Big-endian UTF16
+		encoding_utf16,		// UTF16 with native endianness
+		encoding_utf32_le,	// Little-endian UTF32
+		encoding_utf32_be,	// Big-endian UTF32
+		encoding_utf32,		// UTF32 with native endianness
+		encoding_wchar,		// The same encoding wchar_t has (either UTF16 or UTF32)
+		encoding_latin1
+	};
+
+	// Formatting flags
+	
+	// Indent the nodes that are written to output stream with as many indentation strings as deep the node is in DOM tree. This flag is on by default.
+	const unsigned int format_indent = 0x01;
+	
+	// Write encoding-specific BOM to the output stream. This flag is off by default.
+	const unsigned int format_write_bom = 0x02;
+
+	// Use raw output mode (no indentation and no line breaks are written). This flag is off by default.
+	const unsigned int format_raw = 0x04;
+	
+	// Omit default XML declaration even if there is no declaration in the document. This flag is off by default.
+	const unsigned int format_no_declaration = 0x08;
+
+	// Don't escape attribute values and PCDATA contents. This flag is off by default.
+	const unsigned int format_no_escapes = 0x10;
+
+	// Open file using text mode in xml_document::save_file. This enables special character (i.e. new-line) conversions on some systems. This flag is off by default.
+	const unsigned int format_save_file_text = 0x20;
+
+	// The default set of formatting flags.
+	// Nodes are indented depending on their depth in DOM tree, a default declaration is output if document has none.
+	const unsigned int format_default = format_indent;
+		
+	// Forward declarations
+	struct xml_attribute_struct;
+	struct xml_node_struct;
+
+	class xml_node_iterator;
+	class xml_attribute_iterator;
+	class xml_named_node_iterator;
+
+	class xml_tree_walker;
+
+	struct xml_parse_result;
+
+	class xml_node;
+
+	class xml_text;
+	
+	#ifndef PUGIXML_NO_XPATH
+	class xpath_node;
+	class xpath_node_set;
+	class xpath_query;
+	class xpath_variable_set;
+	#endif
+
+	// Range-based for loop support
+	template <typename It> class xml_object_range
+	{
+	public:
+		typedef It const_iterator;
+		typedef It iterator;
+
+		xml_object_range(It b, It e): _begin(b), _end(e)
+		{
+		}
+
+		It begin() const { return _begin; }
+		It end() const { return _end; }
+
+	private:
+		It _begin, _end;
+	};
+
+	// Writer interface for node printing (see xml_node::print)
+	class PUGIXML_CLASS xml_writer
+	{
+	public:
+		virtual ~xml_writer() {}
+
+		// Write memory chunk into stream/file/whatever
+		virtual void write(const void* data, size_t size) = 0;
+	};
+
+	// xml_writer implementation for FILE*
+	class PUGIXML_CLASS xml_writer_file: public xml_writer
+	{
+	public:
+		// Construct writer from a FILE* object; void* is used to avoid header dependencies on stdio
+		xml_writer_file(void* file);
+
+		virtual void write(const void* data, size_t size);
+
+	private:
+		void* file;
+	};
+
+	#ifndef PUGIXML_NO_STL
+	// xml_writer implementation for streams
+	class PUGIXML_CLASS xml_writer_stream: public xml_writer
+	{
+	public:
+		// Construct writer from an output stream object
+		xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream);
+		xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream);
+
+		virtual void write(const void* data, size_t size);
+
+	private:
+		std::basic_ostream<char, std::char_traits<char> >* narrow_stream;
+		std::basic_ostream<wchar_t, std::char_traits<wchar_t> >* wide_stream;
+	};
+	#endif
+
+	// A light-weight handle for manipulating attributes in DOM tree
+	class PUGIXML_CLASS xml_attribute
+	{
+		friend class xml_attribute_iterator;
+		friend class xml_node;
+
+	private:
+		xml_attribute_struct* _attr;
+	
+		typedef void (*unspecified_bool_type)(xml_attribute***);
+
+	public:
+		// Default constructor. Constructs an empty attribute.
+		xml_attribute();
+		
+		// Constructs attribute from internal pointer
+		explicit xml_attribute(xml_attribute_struct* attr);
+
+		// Safe bool conversion operator
+		operator unspecified_bool_type() const;
+
+		// Borland C++ workaround
+		bool operator!() const;
+
+		// Comparison operators (compares wrapped attribute pointers)
+		bool operator==(const xml_attribute& r) const;
+		bool operator!=(const xml_attribute& r) const;
+		bool operator<(const xml_attribute& r) const;
+		bool operator>(const xml_attribute& r) const;
+		bool operator<=(const xml_attribute& r) const;
+		bool operator>=(const xml_attribute& r) const;
+
+		// Check if attribute is empty
+		bool empty() const;
+
+		// Get attribute name/value, or "" if attribute is empty
+		const char_t* name() const;
+		const char_t* value() const;
+
+		// Get attribute value, or the default value if attribute is empty
+		const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const;
+
+		// Get attribute value as a number, or the default value if conversion did not succeed or attribute is empty
+		int as_int(int def = 0) const;
+		unsigned int as_uint(unsigned int def = 0) const;
+		double as_double(double def = 0) const;
+		float as_float(float def = 0) const;
+
+	#ifdef PUGIXML_HAS_LONG_LONG
+		long long as_llong(long long def = 0) const;
+		unsigned long long as_ullong(unsigned long long def = 0) const;
+	#endif
+
+		// Get attribute value as bool (returns true if first character is in '1tTyY' set), or the default value if attribute is empty
+		bool as_bool(bool def = false) const;
+
+		// Set attribute name/value (returns false if attribute is empty or there is not enough memory)
+		bool set_name(const char_t* rhs);
+		bool set_value(const char_t* rhs);
+
+		// Set attribute value with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
+		bool set_value(int rhs);
+		bool set_value(unsigned int rhs);
+		bool set_value(double rhs);
+		bool set_value(bool rhs);
+
+	#ifdef PUGIXML_HAS_LONG_LONG
+		bool set_value(long long rhs);
+		bool set_value(unsigned long long rhs);
+	#endif
+
+		// Set attribute value (equivalent to set_value without error checking)
+		xml_attribute& operator=(const char_t* rhs);
+		xml_attribute& operator=(int rhs);
+		xml_attribute& operator=(unsigned int rhs);
+		xml_attribute& operator=(double rhs);
+		xml_attribute& operator=(bool rhs);
+
+	#ifdef PUGIXML_HAS_LONG_LONG
+		xml_attribute& operator=(long long rhs);
+		xml_attribute& operator=(unsigned long long rhs);
+	#endif
+
+		// Get next/previous attribute in the attribute list of the parent node
+		xml_attribute next_attribute() const;
+		xml_attribute previous_attribute() const;
+
+		// Get hash value (unique for handles to the same object)
+		size_t hash_value() const;
+
+		// Get internal pointer
+		xml_attribute_struct* internal_object() const;
+	};
+
+#ifdef __BORLANDC__
+	// Borland C++ workaround
+	bool PUGIXML_FUNCTION operator&&(const xml_attribute& lhs, bool rhs);
+	bool PUGIXML_FUNCTION operator||(const xml_attribute& lhs, bool rhs);
+#endif
+
+	// A light-weight handle for manipulating nodes in DOM tree
+	class PUGIXML_CLASS xml_node
+	{
+		friend class xml_attribute_iterator;
+		friend class xml_node_iterator;
+		friend class xml_named_node_iterator;
+
+	protected:
+		xml_node_struct* _root;
+
+		typedef void (*unspecified_bool_type)(xml_node***);
+
+	public:
+		// Default constructor. Constructs an empty node.
+		xml_node();
+
+		// Constructs node from internal pointer
+		explicit xml_node(xml_node_struct* p);
+
+		// Safe bool conversion operator
+		operator unspecified_bool_type() const;
+
+		// Borland C++ workaround
+		bool operator!() const;
+	
+		// Comparison operators (compares wrapped node pointers)
+		bool operator==(const xml_node& r) const;
+		bool operator!=(const xml_node& r) const;
+		bool operator<(const xml_node& r) const;
+		bool operator>(const xml_node& r) const;
+		bool operator<=(const xml_node& r) const;
+		bool operator>=(const xml_node& r) const;
+
+		// Check if node is empty.
+		bool empty() const;
+
+		// Get node type
+		xml_node_type type() const;
+
+		// Get node name, or "" if node is empty or it has no name
+		const char_t* name() const;
+
+		// Get node value, or "" if node is empty or it has no value
+        // Note: For <node>text</node> node.value() does not return "text"! Use child_value() or text() methods to access text inside nodes.
+		const char_t* value() const;
+	
+		// Get attribute list
+		xml_attribute first_attribute() const;
+		xml_attribute last_attribute() const;
+
+		// Get children list
+		xml_node first_child() const;
+		xml_node last_child() const;
+
+		// Get next/previous sibling in the children list of the parent node
+		xml_node next_sibling() const;
+		xml_node previous_sibling() const;
+		
+		// Get parent node
+		xml_node parent() const;
+
+		// Get root of DOM tree this node belongs to
+		xml_node root() const;
+
+		// Get text object for the current node
+		xml_text text() const;
+
+		// Get child, attribute or next/previous sibling with the specified name
+		xml_node child(const char_t* name) const;
+		xml_attribute attribute(const char_t* name) const;
+		xml_node next_sibling(const char_t* name) const;
+		xml_node previous_sibling(const char_t* name) const;
+
+		// Get child value of current node; that is, value of the first child node of type PCDATA/CDATA
+		const char_t* child_value() const;
+
+		// Get child value of child with specified name. Equivalent to child(name).child_value().
+		const char_t* child_value(const char_t* name) const;
+
+		// Set node name/value (returns false if node is empty, there is not enough memory, or node can not have name/value)
+		bool set_name(const char_t* rhs);
+		bool set_value(const char_t* rhs);
+		
+		// Add attribute with specified name. Returns added attribute, or empty attribute on errors.
+		xml_attribute append_attribute(const char_t* name);
+		xml_attribute prepend_attribute(const char_t* name);
+		xml_attribute insert_attribute_after(const char_t* name, const xml_attribute& attr);
+		xml_attribute insert_attribute_before(const char_t* name, const xml_attribute& attr);
+
+		// Add a copy of the specified attribute. Returns added attribute, or empty attribute on errors.
+		xml_attribute append_copy(const xml_attribute& proto);
+		xml_attribute prepend_copy(const xml_attribute& proto);
+		xml_attribute insert_copy_after(const xml_attribute& proto, const xml_attribute& attr);
+		xml_attribute insert_copy_before(const xml_attribute& proto, const xml_attribute& attr);
+
+		// Add child node with specified type. Returns added node, or empty node on errors.
+		xml_node append_child(xml_node_type type = node_element);
+		xml_node prepend_child(xml_node_type type = node_element);
+		xml_node insert_child_after(xml_node_type type, const xml_node& node);
+		xml_node insert_child_before(xml_node_type type, const xml_node& node);
+
+		// Add child element with specified name. Returns added node, or empty node on errors.
+		xml_node append_child(const char_t* name);
+		xml_node prepend_child(const char_t* name);
+		xml_node insert_child_after(const char_t* name, const xml_node& node);
+		xml_node insert_child_before(const char_t* name, const xml_node& node);
+
+		// Add a copy of the specified node as a child. Returns added node, or empty node on errors.
+		xml_node append_copy(const xml_node& proto);
+		xml_node prepend_copy(const xml_node& proto);
+		xml_node insert_copy_after(const xml_node& proto, const xml_node& node);
+		xml_node insert_copy_before(const xml_node& proto, const xml_node& node);
+
+		// Remove specified attribute
+		bool remove_attribute(const xml_attribute& a);
+		bool remove_attribute(const char_t* name);
+
+		// Remove specified child
+		bool remove_child(const xml_node& n);
+		bool remove_child(const char_t* name);
+
+		// Parses buffer as an XML document fragment and appends all nodes as children of the current node.
+		// Copies/converts the buffer, so it may be deleted or changed after the function returns.
+		// Note: append_buffer allocates memory that has the lifetime of the owning document; removing the appended nodes does not immediately reclaim that memory.
+		xml_parse_result append_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+		// Find attribute using predicate. Returns first attribute for which predicate returned true.
+		template <typename Predicate> xml_attribute find_attribute(Predicate pred) const
+		{
+			if (!_root) return xml_attribute();
+			
+			for (xml_attribute attrib = first_attribute(); attrib; attrib = attrib.next_attribute())
+				if (pred(attrib))
+					return attrib;
+		
+			return xml_attribute();
+		}
+
+		// Find child node using predicate. Returns first child for which predicate returned true.
+		template <typename Predicate> xml_node find_child(Predicate pred) const
+		{
+			if (!_root) return xml_node();
+	
+			for (xml_node node = first_child(); node; node = node.next_sibling())
+				if (pred(node))
+					return node;
+		
+			return xml_node();
+		}
+
+		// Find node from subtree using predicate. Returns first node from subtree (depth-first), for which predicate returned true.
+		template <typename Predicate> xml_node find_node(Predicate pred) const
+		{
+			if (!_root) return xml_node();
+
+			xml_node cur = first_child();
+			
+			while (cur._root && cur._root != _root)
+			{
+				if (pred(cur)) return cur;
+
+				if (cur.first_child()) cur = cur.first_child();
+				else if (cur.next_sibling()) cur = cur.next_sibling();
+				else
+				{
+					while (!cur.next_sibling() && cur._root != _root) cur = cur.parent();
+
+					if (cur._root != _root) cur = cur.next_sibling();
+				}
+			}
+
+			return xml_node();
+		}
+
+		// Find child node by attribute name/value
+		xml_node find_child_by_attribute(const char_t* name, const char_t* attr_name, const char_t* attr_value) const;
+		xml_node find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const;
+
+	#ifndef PUGIXML_NO_STL
+		// Get the absolute node path from root as a text string.
+		string_t path(char_t delimiter = '/') const;
+	#endif
+
+		// Search for a node by path consisting of node names and . or .. elements.
+		xml_node first_element_by_path(const char_t* path, char_t delimiter = '/') const;
+
+		// Recursively traverse subtree with xml_tree_walker
+		bool traverse(xml_tree_walker& walker);
+	
+	#ifndef PUGIXML_NO_XPATH
+		// Select single node by evaluating XPath query. Returns first node from the resulting node set.
+		xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const;
+		xpath_node select_single_node(const xpath_query& query) const;
+
+		// Select node set by evaluating XPath query
+		xpath_node_set select_nodes(const char_t* query, xpath_variable_set* variables = 0) const;
+		xpath_node_set select_nodes(const xpath_query& query) const;
+	#endif
+		
+		// Print subtree using a writer object
+		void print(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
+
+	#ifndef PUGIXML_NO_STL
+		// Print subtree to stream
+		void print(std::basic_ostream<char, std::char_traits<char> >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
+		void print(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, unsigned int depth = 0) const;
+	#endif
+
+		// Child nodes iterators
+		typedef xml_node_iterator iterator;
+
+		iterator begin() const;
+		iterator end() const;
+
+		// Attribute iterators
+		typedef xml_attribute_iterator attribute_iterator;
+
+		attribute_iterator attributes_begin() const;
+		attribute_iterator attributes_end() const;
+
+		// Range-based for support
+		xml_object_range<xml_node_iterator> children() const;
+		xml_object_range<xml_named_node_iterator> children(const char_t* name) const;
+		xml_object_range<xml_attribute_iterator> attributes() const;
+
+		// Get node offset in parsed file/string (in char_t units) for debugging purposes
+		ptrdiff_t offset_debug() const;
+
+		// Get hash value (unique for handles to the same object)
+		size_t hash_value() const;
+
+		// Get internal pointer
+		xml_node_struct* internal_object() const;
+	};
+
+#ifdef __BORLANDC__
+	// Borland C++ workaround
+	bool PUGIXML_FUNCTION operator&&(const xml_node& lhs, bool rhs);
+	bool PUGIXML_FUNCTION operator||(const xml_node& lhs, bool rhs);
+#endif
+
+	// A helper for working with text inside PCDATA nodes
+	class PUGIXML_CLASS xml_text
+	{
+		friend class xml_node;
+
+		xml_node_struct* _root;
+
+		typedef void (*unspecified_bool_type)(xml_text***);
+
+		explicit xml_text(xml_node_struct* root);
+
+		xml_node_struct* _data_new();
+		xml_node_struct* _data() const;
+
+	public:
+		// Default constructor. Constructs an empty object.
+		xml_text();
+
+		// Safe bool conversion operator
+		operator unspecified_bool_type() const;
+
+		// Borland C++ workaround
+		bool operator!() const;
+
+		// Check if text object is empty
+		bool empty() const;
+
+		// Get text, or "" if object is empty
+		const char_t* get() const;
+
+		// Get text, or the default value if object is empty
+		const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const;
+
+		// Get text as a number, or the default value if conversion did not succeed or object is empty
+		int as_int(int def = 0) const;
+		unsigned int as_uint(unsigned int def = 0) const;
+		double as_double(double def = 0) const;
+		float as_float(float def = 0) const;
+
+	#ifdef PUGIXML_HAS_LONG_LONG
+		long long as_llong(long long def = 0) const;
+		unsigned long long as_ullong(unsigned long long def = 0) const;
+	#endif
+
+		// Get text as bool (returns true if first character is in '1tTyY' set), or the default value if object is empty
+		bool as_bool(bool def = false) const;
+
+		// Set text (returns false if object is empty or there is not enough memory)
+		bool set(const char_t* rhs);
+
+		// Set text with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
+		bool set(int rhs);
+		bool set(unsigned int rhs);
+		bool set(double rhs);
+		bool set(bool rhs);
+
+	#ifdef PUGIXML_HAS_LONG_LONG
+		bool set(long long rhs);
+		bool set(unsigned long long rhs);
+	#endif
+
+		// Set text (equivalent to set without error checking)
+		xml_text& operator=(const char_t* rhs);
+		xml_text& operator=(int rhs);
+		xml_text& operator=(unsigned int rhs);
+		xml_text& operator=(double rhs);
+		xml_text& operator=(bool rhs);
+
+	#ifdef PUGIXML_HAS_LONG_LONG
+		xml_text& operator=(long long rhs);
+		xml_text& operator=(unsigned long long rhs);
+	#endif
+
+		// Get the data node (node_pcdata or node_cdata) for this object
+		xml_node data() const;
+	};
+
+#ifdef __BORLANDC__
+	// Borland C++ workaround
+	bool PUGIXML_FUNCTION operator&&(const xml_text& lhs, bool rhs);
+	bool PUGIXML_FUNCTION operator||(const xml_text& lhs, bool rhs);
+#endif
+
+	// Child node iterator (a bidirectional iterator over a collection of xml_node)
+	class PUGIXML_CLASS xml_node_iterator
+	{
+		friend class xml_node;
+
+	private:
+		mutable xml_node _wrap;
+		xml_node _parent;
+
+		xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent);
+
+	public:
+		// Iterator traits
+		typedef ptrdiff_t difference_type;
+		typedef xml_node value_type;
+		typedef xml_node* pointer;
+		typedef xml_node& reference;
+
+	#ifndef PUGIXML_NO_STL
+		typedef std::bidirectional_iterator_tag iterator_category;
+	#endif
+
+		// Default constructor
+		xml_node_iterator();
+
+		// Construct an iterator which points to the specified node
+		xml_node_iterator(const xml_node& node);
+
+		// Iterator operators
+		bool operator==(const xml_node_iterator& rhs) const;
+		bool operator!=(const xml_node_iterator& rhs) const;
+
+		xml_node& operator*() const;
+		xml_node* operator->() const;
+
+		const xml_node_iterator& operator++();
+		xml_node_iterator operator++(int);
+
+		const xml_node_iterator& operator--();
+		xml_node_iterator operator--(int);
+	};
+
+	// Attribute iterator (a bidirectional iterator over a collection of xml_attribute)
+	class PUGIXML_CLASS xml_attribute_iterator
+	{
+		friend class xml_node;
+
+	private:
+		mutable xml_attribute _wrap;
+		xml_node _parent;
+
+		xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent);
+
+	public:
+		// Iterator traits
+		typedef ptrdiff_t difference_type;
+		typedef xml_attribute value_type;
+		typedef xml_attribute* pointer;
+		typedef xml_attribute& reference;
+
+	#ifndef PUGIXML_NO_STL
+		typedef std::bidirectional_iterator_tag iterator_category;
+	#endif
+
+		// Default constructor
+		xml_attribute_iterator();
+
+		// Construct an iterator which points to the specified attribute
+		xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent);
+
+		// Iterator operators
+		bool operator==(const xml_attribute_iterator& rhs) const;
+		bool operator!=(const xml_attribute_iterator& rhs) const;
+
+		xml_attribute& operator*() const;
+		xml_attribute* operator->() const;
+
+		const xml_attribute_iterator& operator++();
+		xml_attribute_iterator operator++(int);
+
+		const xml_attribute_iterator& operator--();
+		xml_attribute_iterator operator--(int);
+	};
+
+	// Named node range helper
+	class PUGIXML_CLASS xml_named_node_iterator
+	{
+		friend class xml_node;
+
+	public:
+		// Iterator traits
+		typedef ptrdiff_t difference_type;
+		typedef xml_node value_type;
+		typedef xml_node* pointer;
+		typedef xml_node& reference;
+
+	#ifndef PUGIXML_NO_STL
+		typedef std::bidirectional_iterator_tag iterator_category;
+	#endif
+
+		// Default constructor
+		xml_named_node_iterator();
+
+		// Construct an iterator which points to the specified node
+		xml_named_node_iterator(const xml_node& node, const char_t* name);
+
+		// Iterator operators
+		bool operator==(const xml_named_node_iterator& rhs) const;
+		bool operator!=(const xml_named_node_iterator& rhs) const;
+
+		xml_node& operator*() const;
+		xml_node* operator->() const;
+
+		const xml_named_node_iterator& operator++();
+		xml_named_node_iterator operator++(int);
+
+		const xml_named_node_iterator& operator--();
+		xml_named_node_iterator operator--(int);
+
+	private:
+		mutable xml_node _wrap;
+		xml_node _parent;
+		const char_t* _name;
+
+		xml_named_node_iterator(xml_node_struct* ref, xml_node_struct* parent, const char_t* name);
+	};
+
+	// Abstract tree walker class (see xml_node::traverse)
+	class PUGIXML_CLASS xml_tree_walker
+	{
+		friend class xml_node;
+
+	private:
+		int _depth;
+	
+	protected:
+		// Get current traversal depth
+		int depth() const;
+	
+	public:
+		xml_tree_walker();
+		virtual ~xml_tree_walker();
+
+		// Callback that is called when traversal begins
+		virtual bool begin(xml_node& node);
+
+		// Callback that is called for each node traversed
+		virtual bool for_each(xml_node& node) = 0;
+
+		// Callback that is called when traversal ends
+		virtual bool end(xml_node& node);
+	};
+
+	// Parsing status, returned as part of xml_parse_result object
+	enum xml_parse_status
+	{
+		status_ok = 0,				// No error
+
+		status_file_not_found,		// File was not found during load_file()
+		status_io_error,			// Error reading from file/stream
+		status_out_of_memory,		// Could not allocate memory
+		status_internal_error,		// Internal error occurred
+
+		status_unrecognized_tag,	// Parser could not determine tag type
+
+		status_bad_pi,				// Parsing error occurred while parsing document declaration/processing instruction
+		status_bad_comment,			// Parsing error occurred while parsing comment
+		status_bad_cdata,			// Parsing error occurred while parsing CDATA section
+		status_bad_doctype,			// Parsing error occurred while parsing document type declaration
+		status_bad_pcdata,			// Parsing error occurred while parsing PCDATA section
+		status_bad_start_element,	// Parsing error occurred while parsing start element tag
+		status_bad_attribute,		// Parsing error occurred while parsing element attribute
+		status_bad_end_element,		// Parsing error occurred while parsing end element tag
+		status_end_element_mismatch,// There was a mismatch of start-end tags (closing tag had incorrect name, some tag was not closed or there was an excessive closing tag)
+
+		status_append_invalid_root,	// Unable to append nodes since root type is not node_element or node_document (exclusive to xml_node::append_buffer)
+
+		status_no_document_element	// Parsing resulted in a document without element nodes
+	};
+
+	// Parsing result
+	struct PUGIXML_CLASS xml_parse_result
+	{
+		// Parsing status (see xml_parse_status)
+		xml_parse_status status;
+
+		// Last parsed offset (in char_t units from start of input data)
+		ptrdiff_t offset;
+
+		// Source document encoding
+		xml_encoding encoding;
+
+		// Default constructor, initializes object to failed state
+		xml_parse_result();
+
+		// Cast to bool operator
+		operator bool() const;
+
+		// Get error description
+		const char* description() const;
+	};
+
+	// Document class (DOM tree root)
+	class PUGIXML_CLASS xml_document: public xml_node
+	{
+	private:
+		char_t* _buffer;
+
+		char _memory[192];
+		
+		// Non-copyable semantics
+		xml_document(const xml_document&);
+		const xml_document& operator=(const xml_document&);
+
+		void create();
+		void destroy();
+
+	public:
+		// Default constructor, makes empty document
+		xml_document();
+
+		// Destructor, invalidates all node/attribute handles to this document
+		~xml_document();
+
+		// Removes all nodes, leaving the empty document
+		void reset();
+
+		// Removes all nodes, then copies the entire contents of the specified document
+		void reset(const xml_document& proto);
+
+	#ifndef PUGIXML_NO_STL
+		// Load document from stream.
+		xml_parse_result load(std::basic_istream<char, std::char_traits<char> >& stream, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+		xml_parse_result load(std::basic_istream<wchar_t, std::char_traits<wchar_t> >& stream, unsigned int options = parse_default);
+	#endif
+
+		// Load document from zero-terminated string. No encoding conversions are applied.
+		xml_parse_result load(const char_t* contents, unsigned int options = parse_default);
+
+		// Load document from file
+		xml_parse_result load_file(const char* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+		xml_parse_result load_file(const wchar_t* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+		// Load document from buffer. Copies/converts the buffer, so it may be deleted or changed after the function returns.
+		xml_parse_result load_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+		// Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data).
+		// You should ensure that buffer data will persist throughout the document's lifetime, and free the buffer memory manually once document is destroyed.
+		xml_parse_result load_buffer_inplace(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+		// Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data).
+		// You should allocate the buffer with pugixml allocation function; document will free the buffer when it is no longer needed (you can't use it anymore).
+		xml_parse_result load_buffer_inplace_own(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+		// Save XML document to writer (semantics is slightly different from xml_node::print, see documentation for details).
+		void save(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+
+	#ifndef PUGIXML_NO_STL
+		// Save XML document to stream (semantics is slightly different from xml_node::print, see documentation for details).
+		void save(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+		void save(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default) const;
+	#endif
+
+		// Save XML to file
+		bool save_file(const char* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+		bool save_file(const wchar_t* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+
+		// Get document element
+		xml_node document_element() const;
+	};
+
+#ifndef PUGIXML_NO_XPATH
+	// XPath query return type
+	enum xpath_value_type
+	{
+		xpath_type_none,	  // Unknown type (query failed to compile)
+		xpath_type_node_set,  // Node set (xpath_node_set)
+		xpath_type_number,	  // Number
+		xpath_type_string,	  // String
+		xpath_type_boolean	  // Boolean
+	};
+
+	// XPath parsing result
+	struct PUGIXML_CLASS xpath_parse_result
+	{
+		// Error message (0 if no error)
+		const char* error;
+
+		// Last parsed offset (in char_t units from string start)
+		ptrdiff_t offset;
+
+		// Default constructor, initializes object to failed state
+		xpath_parse_result();
+
+		// Cast to bool operator
+		operator bool() const;
+
+		// Get error description
+		const char* description() const;
+	};
+
+	// A single XPath variable
+	class PUGIXML_CLASS xpath_variable
+	{
+		friend class xpath_variable_set;
+
+	protected:
+		xpath_value_type _type;
+		xpath_variable* _next;
+
+		xpath_variable();
+
+		// Non-copyable semantics
+		xpath_variable(const xpath_variable&);
+		xpath_variable& operator=(const xpath_variable&);
+		
+	public:
+		// Get variable name
+		const char_t* name() const;
+
+		// Get variable type
+		xpath_value_type type() const;
+
+		// Get variable value; no type conversion is performed, default value (false, NaN, empty string, empty node set) is returned on type mismatch error
+		bool get_boolean() const;
+		double get_number() const;
+		const char_t* get_string() const;
+		const xpath_node_set& get_node_set() const;
+
+		// Set variable value; no type conversion is performed, false is returned on type mismatch error
+		bool set(bool value);
+		bool set(double value);
+		bool set(const char_t* value);
+		bool set(const xpath_node_set& value);
+	};
+
+	// A set of XPath variables
+	class PUGIXML_CLASS xpath_variable_set
+	{
+	private:
+		xpath_variable* _data[64];
+
+		// Non-copyable semantics
+		xpath_variable_set(const xpath_variable_set&);
+		xpath_variable_set& operator=(const xpath_variable_set&);
+
+		xpath_variable* find(const char_t* name) const;
+
+	public:
+		// Default constructor/destructor
+		xpath_variable_set();
+		~xpath_variable_set();
+
+		// Add a new variable or get the existing one, if the types match
+		xpath_variable* add(const char_t* name, xpath_value_type type);
+
+		// Set value of an existing variable; no type conversion is performed, false is returned if there is no such variable or if types mismatch
+		bool set(const char_t* name, bool value);
+		bool set(const char_t* name, double value);
+		bool set(const char_t* name, const char_t* value);
+		bool set(const char_t* name, const xpath_node_set& value);
+
+		// Get existing variable by name
+		xpath_variable* get(const char_t* name);
+		const xpath_variable* get(const char_t* name) const;
+	};
+
+	// A compiled XPath query object
+	class PUGIXML_CLASS xpath_query
+	{
+	private:
+		void* _impl;
+		xpath_parse_result _result;
+
+		typedef void (*unspecified_bool_type)(xpath_query***);
+
+		// Non-copyable semantics
+		xpath_query(const xpath_query&);
+		xpath_query& operator=(const xpath_query&);
+
+	public:
+		// Construct a compiled object from XPath expression.
+		// If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on compilation errors.
+		explicit xpath_query(const char_t* query, xpath_variable_set* variables = 0);
+
+		// Destructor
+		~xpath_query();
+
+		// Get query expression return type
+		xpath_value_type return_type() const;
+		
+		// Evaluate expression as boolean value in the specified context; performs type conversion if necessary.
+		// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+		bool evaluate_boolean(const xpath_node& n) const;
+		
+		// Evaluate expression as double value in the specified context; performs type conversion if necessary.
+		// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+		double evaluate_number(const xpath_node& n) const;
+		
+	#ifndef PUGIXML_NO_STL
+		// Evaluate expression as string value in the specified context; performs type conversion if necessary.
+		// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+		string_t evaluate_string(const xpath_node& n) const;
+	#endif
+		
+		// Evaluate expression as string value in the specified context; performs type conversion if necessary.
+		// At most capacity characters are written to the destination buffer, full result size is returned (includes terminating zero).
+		// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+		// If PUGIXML_NO_EXCEPTIONS is defined, returns an empty string instead.
+		size_t evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const;
+
+		// Evaluate expression as node set in the specified context.
+		// If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors.
+		// If PUGIXML_NO_EXCEPTIONS is defined, returns empty node set instead.
+		xpath_node_set evaluate_node_set(const xpath_node& n) const;
+
+		// Get parsing result (used to get compilation errors in PUGIXML_NO_EXCEPTIONS mode)
+		const xpath_parse_result& result() const;
+
+		// Safe bool conversion operator
+		operator unspecified_bool_type() const;
+
+		// Borland C++ workaround
+		bool operator!() const;
+	};
+	
+	#ifndef PUGIXML_NO_EXCEPTIONS
+	// XPath exception class
+	class PUGIXML_CLASS xpath_exception: public std::exception
+	{
+	private:
+		xpath_parse_result _result;
+
+	public:
+		// Construct exception from parse result
+		explicit xpath_exception(const xpath_parse_result& result);
+
+		// Get error message
+		virtual const char* what() const throw();
+
+		// Get parse result
+		const xpath_parse_result& result() const;
+	};
+	#endif
+	
+	// XPath node class (either xml_node or xml_attribute)
+	class PUGIXML_CLASS xpath_node
+	{
+	private:
+		xml_node _node;
+		xml_attribute _attribute;
+	
+		typedef void (*unspecified_bool_type)(xpath_node***);
+
+	public:
+		// Default constructor; constructs empty XPath node
+		xpath_node();
+		
+		// Construct XPath node from XML node/attribute
+		xpath_node(const xml_node& node);
+		xpath_node(const xml_attribute& attribute, const xml_node& parent);
+
+		// Get node/attribute, if any
+		xml_node node() const;
+		xml_attribute attribute() const;
+		
+		// Get parent of contained node/attribute
+		xml_node parent() const;
+
+		// Safe bool conversion operator
+		operator unspecified_bool_type() const;
+		
+		// Borland C++ workaround
+		bool operator!() const;
+
+		// Comparison operators
+		bool operator==(const xpath_node& n) const;
+		bool operator!=(const xpath_node& n) const;
+	};
+
+#ifdef __BORLANDC__
+	// Borland C++ workaround
+	bool PUGIXML_FUNCTION operator&&(const xpath_node& lhs, bool rhs);
+	bool PUGIXML_FUNCTION operator||(const xpath_node& lhs, bool rhs);
+#endif
+
+	// A fixed-size collection of XPath nodes
+	class PUGIXML_CLASS xpath_node_set
+	{
+	public:
+		// Collection type
+		enum type_t
+		{
+			type_unsorted,			// Not ordered
+			type_sorted,			// Sorted by document order (ascending)
+			type_sorted_reverse		// Sorted by document order (descending)
+		};
+		
+		// Constant iterator type
+		typedef const xpath_node* const_iterator;
+	
+		// Default constructor. Constructs empty set.
+		xpath_node_set();
+
+		// Constructs a set from iterator range; data is not checked for duplicates and is not sorted according to provided type, so be careful
+		xpath_node_set(const_iterator begin, const_iterator end, type_t type = type_unsorted);
+
+		// Destructor
+		~xpath_node_set();
+		
+		// Copy constructor/assignment operator
+		xpath_node_set(const xpath_node_set& ns);
+		xpath_node_set& operator=(const xpath_node_set& ns);
+
+		// Get collection type
+		type_t type() const;
+		
+		// Get collection size
+		size_t size() const;
+
+		// Indexing operator
+		const xpath_node& operator[](size_t index) const;
+		
+		// Collection iterators
+		const_iterator begin() const;
+		const_iterator end() const;
+
+		// Sort the collection in ascending/descending order by document order
+		void sort(bool reverse = false);
+		
+		// Get first node in the collection by document order
+		xpath_node first() const;
+		
+		// Check if collection is empty
+		bool empty() const;
+	
+	private:
+		type_t _type;
+		
+		xpath_node _storage;
+		
+		xpath_node* _begin;
+		xpath_node* _end;
+
+		void _assign(const_iterator begin, const_iterator end);
+	};
+#endif
+
+#ifndef PUGIXML_NO_STL
+	// Convert wide string to UTF8
+	std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const wchar_t* str);
+	std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >& str);
+	
+	// Convert UTF8 to wide string
+	std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const char* str);
+	std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const std::basic_string<char, std::char_traits<char>, std::allocator<char> >& str);
+#endif
+
+	// Memory allocation function interface; returns pointer to allocated memory or NULL on failure
+	typedef void* (*allocation_function)(size_t size);
+	
+	// Memory deallocation function interface
+	typedef void (*deallocation_function)(void* ptr);
+
+	// Override default memory management functions. All subsequent allocations/deallocations will be performed via supplied functions.
+	void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate);
+	
+	// Get current memory management functions
+	allocation_function PUGIXML_FUNCTION get_memory_allocation_function();
+	deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function();
+}
+
+#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC))
+namespace std
+{
+	// Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier)
+	std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_node_iterator&);
+	std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_attribute_iterator&);
+	std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_named_node_iterator&);
+}
+#endif
+
+#if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC)
+namespace std
+{
+	// Workarounds for (non-standard) iterator category detection
+	std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_node_iterator&);
+	std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_attribute_iterator&);
+	std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_named_node_iterator&);
+}
+#endif
+
+#endif
+
+/**
+ * Copyright (c) 2006-2014 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
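A minimal usage sketch of the API declared in pugixml.hpp above, assuming a C++11 compiler for the range-based for loop; the file name "gadgets.xml" and the <gadget> element and "name" attribute are hypothetical, chosen only to exercise load_file(), document_element(), children() and attribute().

#include <iostream>

#include "pugixml.hpp"

int main()
{
    pugi::xml_document doc;

    // load_file() returns an xml_parse_result that converts to bool.
    pugi::xml_parse_result result = doc.load_file("gadgets.xml");
    if (!result)
    {
        std::cerr << "parse error: " << result.description()
                  << " at offset " << result.offset << std::endl;
        return 1;
    }

    // Iterate the <gadget> children of the document element and read one attribute each.
    for (pugi::xml_node gadget : doc.document_element().children("gadget"))
        std::cout << gadget.attribute("name").as_string("unnamed") << std::endl;

    return 0;
}

The same traversal can be written without C++11 by using xml_node::begin()/end() or first_child()/next_sibling() directly.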
diff --git a/apps/gadgetron/schema/gadgetron.xsd b/apps/gadgetron/schema/gadgetron.xsd
new file mode 100644
index 0000000..6cad8c0
--- /dev/null
+++ b/apps/gadgetron/schema/gadgetron.xsd
@@ -0,0 +1,81 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<xs:schema xmlns="http://gadgetron.sf.net/gadgetron" xmlns:xs="http://www.w3.org/2001/XMLSchema" elementFormDefault="qualified" targetNamespace="http://gadgetron.sf.net/gadgetron">
+
+    <!--
+    The Gadgetron configuration xml includes two sections: port and globalGadgetParameters.
+    The 'port' element is the port the gadgetron server listens on. The 'globalGadgetParameters'
+    section lists parameters that are pre-set for all gadgets.
+    For example, if 'workingDirectory' in globalGadgetParameters is set, the dependency measurements
+    are stored in and read from that directory. If it is not set, the default
+    directory is used ('/tmp/gadgetron' on Linux and 'c:/temp/gadgetron' on Windows).
+    -->
+
+    <xs:element name="gadgetronConfiguration">
+        <xs:complexType>
+            <xs:sequence>
+                <xs:element name="port" type="xs:string"/>
+
+                <xs:element maxOccurs="unbounded" minOccurs="0" name="globalGadgetParameter">
+                    <xs:complexType>
+                        <xs:sequence>
+                            <xs:element maxOccurs="1" minOccurs="1" name="name" type="xs:string"/>
+                            <xs:element maxOccurs="1" minOccurs="1" name="value" type="xs:string"/>
+                        </xs:sequence>
+                    </xs:complexType>
+                </xs:element>
+
+		<xs:element maxOccurs="1" minOccurs="0" name="cloudBus">
+		  <xs:complexType>
+		    <xs:sequence>
+		      <xs:element maxOccurs="1" minOccurs="1" name="multiCastAddress" type="xs:string"/>
+		      <xs:element maxOccurs="1" minOccurs="1" name="port" type="xs:unsignedInt"/>
+		    </xs:sequence>
+		  </xs:complexType>
+		</xs:element>
+
+            </xs:sequence>
+        </xs:complexType>
+    </xs:element>
+
+  <xs:element name="gadgetronStreamConfiguration">
+    <xs:complexType>
+      <xs:sequence>
+                <xs:element maxOccurs="unbounded" minOccurs="0" name="reader">
+                    <xs:complexType>
+                          <xs:sequence>
+                              <xs:element name="slot" type="xs:unsignedShort"/>
+                              <xs:element name="dll" type="xs:string"/>
+                              <xs:element name="classname" type="xs:string"/>
+                          </xs:sequence>
+                      </xs:complexType>
+                </xs:element>
+                <xs:element maxOccurs="unbounded" minOccurs="0" name="writer">
+                    <xs:complexType>
+                          <xs:sequence>
+                              <xs:element maxOccurs="1" minOccurs="1" name="slot" type="xs:unsignedShort"/>
+                              <xs:element maxOccurs="1" minOccurs="1"  name="dll" type="xs:string"/>
+                              <xs:element maxOccurs="1" minOccurs="1"  name="classname" type="xs:string"/>
+                          </xs:sequence>
+                      </xs:complexType>
+                </xs:element>
+                <xs:element maxOccurs="unbounded" minOccurs="0" name="gadget">
+                    <xs:complexType>
+                          <xs:sequence>
+                              <xs:element maxOccurs="1" minOccurs="1"  name="name" type="xs:string"/>
+                              <xs:element maxOccurs="1" minOccurs="1"  name="dll" type="xs:string"/>
+                              <xs:element maxOccurs="1" minOccurs="1"  name="classname" type="xs:string"/>
+                              <xs:element maxOccurs="unbounded" minOccurs="0" name="property">
+                                  <xs:complexType>
+                                      <xs:sequence>
+                                          <xs:element maxOccurs="1" minOccurs="1" name="name" type="xs:string"/>
+                                          <xs:element maxOccurs="1" minOccurs="1" name="value" type="xs:string"/>
+                                      </xs:sequence>        
+                                  </xs:complexType>
+                              </xs:element>
+                             </xs:sequence>
+          </xs:complexType>
+        </xs:element>
+      </xs:sequence>
+    </xs:complexType>
+  </xs:element>
+</xs:schema>
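A minimal sketch, assuming the configuration document uses the schema's default (unprefixed) namespace form so that pugixml's literal name matching applies, of reading the port and globalGadgetParameter entries defined above with the bundled pugixml; the path "gadgetron.xml" is hypothetical and this is not the project's own parser (see gadgetron_xml.cpp).

#include <iostream>

#include "pugixml.hpp"

int main()
{
    pugi::xml_document doc;
    if (!doc.load_file("gadgetron.xml"))
    {
        std::cerr << "could not load gadgetron.xml" << std::endl;
        return 1;
    }

    // Root element defined by the schema above.
    pugi::xml_node cfg = doc.child("gadgetronConfiguration");

    // <port> holds the listening port as a string.
    std::cout << "port: " << cfg.child_value("port") << std::endl;

    // Each <globalGadgetParameter> carries a <name>/<value> pair pre-set for all gadgets.
    for (pugi::xml_node param = cfg.child("globalGadgetParameter"); param; param = param.next_sibling("globalGadgetParameter"))
        std::cout << param.child_value("name") << " = " << param.child_value("value") << std::endl;

    return 0;
}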
diff --git a/apps/gadgetron/upstart/gadgetron.conf b/apps/gadgetron/upstart/gadgetron.conf
new file mode 100644
index 0000000..e957c65
--- /dev/null
+++ b/apps/gadgetron/upstart/gadgetron.conf
@@ -0,0 +1,36 @@
+description "Gadgetron Upstart Script - Starts and Stops Gadgetron server"
+version "1.0"
+author "Michael S. Hansen (michael.hansen at nih.gov)"
+
+start on filesystem or runlevel [2345]
+stop on runlevel [!2345]
+
+expect fork
+
+# configuration variables.
+env GADGETRON_HOME=/usr/local/gadgetron
+env GADGETRON_USER=gadgetron
+
+#Log output to log file (/var/log/upstart/gadgetron.log)
+console log
+
+pre-start script
+#We will make the log file world readable to make it easier for users (without sudo privileges) to monitor
+touch /var/log/upstart/gadgetron.log
+chmod o+r /var/log/upstart/gadgetron.log
+end script
+
+script
+
+export LD_LIBRARY_PATH="${GADGETRON_HOME}/lib:/usr/local/ismrmrd/lib:/usr/local/cuda/lib64:/usr/local/cula/lib64"
+export PATH=$PATH:${GADGETRON_HOME}/bin
+
+#Start as GADGETRON_USER
+exec su -s /bin/sh -c ${GADGETRON_HOME}/bin/gadgetron ${GADGETRON_USER} &
+
+# create a custom event in case we want to chain later
+emit gadgetron_running
+end script
+
+#respawn if process dies or is killed
+respawn
diff --git a/apps/gadgetron/webapp/CMakeLists.txt b/apps/gadgetron/webapp/CMakeLists.txt
new file mode 100644
index 0000000..7654088
--- /dev/null
+++ b/apps/gadgetron/webapp/CMakeLists.txt
@@ -0,0 +1,27 @@
+configure_file("gadgetron_web_app.in" ${CMAKE_BINARY_DIR}/gadgetron_web_app.cfg @ONLY)
+configure_file("gadgetron_web.conf.in" ${CMAKE_BINARY_DIR}/gadgetron_web.conf @ONLY)
+configure_file("gadgetron_web_ld.conf.in" ${CMAKE_BINARY_DIR}/gadgetron_web_ld.conf @ONLY)
+
+if (WIN32)
+    install(FILES gadgetron_web_app.py DESTINATION bin)
+    install(FILES ${CMAKE_BINARY_DIR}/gadgetron_web_app.cfg DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH})
+else (WIN32)
+    #install(FILES ${CMAKE_BINARY_DIR}/gadgetron_web.conf DESTINATION /etc/init COMPONENT web)
+    #install(FILES ${CMAKE_BINARY_DIR}/gadgetron_web_ld.conf DESTINATION /etc/ld.so.conf.d COMPONENT web)
+    install(FILES gadgetron_web_app.py DESTINATION bin COMPONENT web)
+    install(FILES ${CMAKE_BINARY_DIR}/gadgetron_web_app.cfg DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH} COMPONENT web)
+endif (WIN32)
+
+message("Add gadgetron_web_info ...")
+add_executable(gadgetron_web_info main.cpp)
+install(TARGETS gadgetron_web_info DESTINATION bin COMPONENT web)
+
+install(FILES
+    ${CMAKE_BINARY_DIR}/gadgetron_web_app.cfg
+    ${CMAKE_BINARY_DIR}/gadgetron_web.conf
+    ${CMAKE_BINARY_DIR}/gadgetron_web_ld.conf
+    ${CMAKE_SOURCE_DIR}/chroot/gadgetron_chroot.conf
+    gadgetron_web_app.py
+    DESTINATION ${GADGETRON_INSTALL_CHROOT_SCRIPTS_PATH}
+    COMPONENT scripts
+    )
diff --git a/apps/gadgetron/webapp/gadgetron_web.conf b/apps/gadgetron/webapp/gadgetron_web.conf
new file mode 100644
index 0000000..14a9e7a
--- /dev/null
+++ b/apps/gadgetron/webapp/gadgetron_web.conf
@@ -0,0 +1,15 @@
+description     "Gadgetron web application daemon"
+author          "Alex Smith"
+
+start on started network
+stop on stopping network
+stop on starting shutdown
+
+console output
+kill signal INT
+
+exec su -c "python /usr/local/gadgetron/bin/gadgetron_web_app.py /usr/local/gadgetron/config/gadgetron_web_app.cfg" hansenms
+
+respawn
diff --git a/apps/gadgetron/webapp/gadgetron_web.conf.in b/apps/gadgetron/webapp/gadgetron_web.conf.in
new file mode 100644
index 0000000..4b210f7
--- /dev/null
+++ b/apps/gadgetron/webapp/gadgetron_web.conf.in
@@ -0,0 +1,16 @@
+description     "Gadgetron web application daemon"
+author          "Alex Smith"
+
+start on started network
+start on started startup
+stop on stopping network
+stop on starting shutdown
+
+console output
+kill signal INT
+
+exec su -c "python @CMAKE_INSTALL_PREFIX@/bin/gadgetron_web_app.py @CMAKE_INSTALL_PREFIX@/config/gadgetron_web_app.cfg" gadgetron
+
+respawn
diff --git a/apps/gadgetron/webapp/gadgetron_web_app.cfg b/apps/gadgetron/webapp/gadgetron_web_app.cfg
new file mode 100644
index 0000000..0eb5f3a
--- /dev/null
+++ b/apps/gadgetron/webapp/gadgetron_web_app.cfg
@@ -0,0 +1,8 @@
+[WEBSERVER]
+port=8090
+
+[GADGETRON]
+port=9002
+GADGETRON_HOME=/usr/local/gadgetron
+ISMRMRD_HOME=/usr/local
+logfile=/tmp/gadgetron.log
\ No newline at end of file
diff --git a/apps/gadgetron/webapp/gadgetron_web_app.in b/apps/gadgetron/webapp/gadgetron_web_app.in
new file mode 100644
index 0000000..ccc0fe5
--- /dev/null
+++ b/apps/gadgetron/webapp/gadgetron_web_app.in
@@ -0,0 +1,8 @@
+[WEBSERVER]
+port=8090
+
+[GADGETRON]
+port=9002
+GADGETRON_HOME=@CMAKE_INSTALL_PREFIX@
+ISMRMRD_HOME=@ISMRMRD_INCLUDE_DIR@/..
+logfile=/tmp/gadgetron.log
\ No newline at end of file
diff --git a/apps/gadgetron/webapp/gadgetron_web_app.py b/apps/gadgetron/webapp/gadgetron_web_app.py
new file mode 100644
index 0000000..be57309
--- /dev/null
+++ b/apps/gadgetron/webapp/gadgetron_web_app.py
@@ -0,0 +1,182 @@
+from twisted.web import server, resource, static
+from twisted.internet import reactor
+
+import subprocess
+import time
+import sys
+import ConfigParser
+import os
+import platform
+import threading 
+import signal
+import psutil
+import inspect
+import socket;
+
+run_gadgetron_check = True
+
+def ctrlcsignal(signal, frame):
+    global reactor
+    global run_gadgetron_check
+    print "Shutting down server (SIGINT)"
+    run_gadgetron_check = False
+    reactor.stop()
+
+def termsignal(signal, frame):
+    global reactor
+    global run_gadgetron_check
+    print "Shutting down server (TERM)"
+    run_gadgetron_check = False
+    reactor.stop()
+
+def isGadgetronAlive(port,environment):
+    try:
+        hostname = socket.gethostbyname(socket.gethostname())
+    except:
+        hostname = "127.0.0.1"
+
+    process = subprocess.Popen(["gt_alive",hostname,str(port)], env=environment)
+    
+    time.sleep(1)
+    ret = process.poll()
+    if ret == None:
+        #Process is hanging
+        process.kill()
+        return -1
+    elif ret != 0:
+        #Failed to connect
+        return -1
+    else:
+        return 0
+
+
+class GadgetronResource(resource.Resource):
+    isLeaf = True
+    numberRequests = 0
+    gadgetron_log_filename = 'gadgetron_log.txt'
+    gadgetron_process = 0
+    environment = 0;
+    gadgetron_port = 9002
+    check_thread = 0
+    run_gadgetron_check = True
+    process_lock = threading.Lock()
+
+    def __init__(self, cfgfilename):
+        config = ConfigParser.RawConfigParser()
+        config.read(cfgfilename)
+        gadgetron_home = config.get('GADGETRON', 'GADGETRON_HOME')
+        ismrmrd_home = config.get('GADGETRON', 'ISMRMRD_HOME')
+        self.gadgetron_log_filename = config.get('GADGETRON','logfile')
+        self.gadgetron_port = config.get('GADGETRON','port')
+        gf = open(self.gadgetron_log_filename,"w")
+        
+        self.environment = dict()
+        self.environment["GADGETRON_HOME"]=gadgetron_home
+        self.environment["PATH"]=self.environment["GADGETRON_HOME"] + "/bin"
+
+        if (platform.system() == 'Linux'):
+            self.environment["LD_LIBRARY_PATH"]="/usr/local/cuda/lib64:/usr/local/cula/lib64:" +  self.environment["GADGETRON_HOME"] + "/lib:" + ismrmrd_home + "/lib"  
+        elif (platform.system() == 'Darwin'):
+            self.environment["DYLD_LIBRARY_PATH"]="/usr/local/cuda/lib64:/usr/local/cula/lib64:" +  self.environment["GADGETRON_HOME"] + "/lib:" + ismrmrd_home + "/lib:/opt/local/lib"  
+
+        #self.process_lock.acquire()
+        self.gadgetron_process = subprocess.Popen(["gadgetron","-p",self.gadgetron_port], env=self.environment,stdout=gf,stderr=gf)
+        #self.process_lock.release()
+        resource.Resource.__init__(self)
+        
+        self.check_thread = threading.Thread(target=self.check_gadgetron)
+        self.check_thread.start()
+
+    def __del__(self):
+        self.run_gadgetron_check = False
+        self.check_thread.join()
+        self.gadgetron_process.terminate()
+
+    def restart_gadgetron(self):
+        self.process_lock.acquire()
+        s = self.gadgetron_process.poll()
+        if (s == None):
+            self.gadgetron_process.kill()
+            time.sleep(2)
+        gf = open(self.gadgetron_log_filename,"w")
+        self.gadgetron_process = subprocess.Popen(["gadgetron","-p",self.gadgetron_port], env=self.environment,stdout=gf,stderr=gf)
+        time.sleep(2)
+        self.process_lock.release()
+
+    def check_gadgetron(self):
+        global run_gadgetron_check
+        while (run_gadgetron_check):
+            self.process_lock.acquire()
+            s = self.gadgetron_process.poll()
+            self.process_lock.release()
+            if (s != None):
+                self.restart_gadgetron()
+            time.sleep(3)
+        
+
+    def render_page(self):
+        doc = "<html>\n<body>\n"
+        doc += "<h1>Gadgetron Monitor</h1>\n"
+
+        alive = (isGadgetronAlive(self.gadgetron_port,self.environment) == 0)
+
+        doc += "<div>Gadgetron Status: "
+
+        if (alive):
+            doc += "<span style=\"color: green;\">[OK]</span></div>"
+        else:
+            doc += "<span style=\"color: red;\">[UNRESPONSIVE]</span></div>"
+            
+        doc += "<div><p><span><form method=\"POST\"><input type=\"submit\" value=\"RESTART\"><input type=\"hidden\" name=\"command\" value=\"restart\"></form></span></div>"
+        doc += "<div><p><span><form method=\"POST\"><input type=\"submit\" value=\"REFRESH\"><input type=\"hidden\" name=\"command\" value=\"refresh\"></form></span></div>"
+        if (alive):
+            p = psutil.Process(self.gadgetron_process.pid)
+            doc += "<div><ul>"
+            doc += "<li>Process ID: " + str(self.gadgetron_process.pid) + "</li>"
+            doc += "<li>CPU Percent: " + str(round(p.get_cpu_percent(),2)) + "</li>"
+            doc += "<li>Memory Percent: " + str(round(p.get_memory_percent(),2)) + "</li>"
+            doc += "</ul></div>"
+
+            doc += "<div><iframe width=\"1024\" height=\"768\" src=\"/log\"></iframe></div>" 
+        
+        doc += "</body>\n</html>"
+        return doc
+
+        
+    def render_GET(self, request):
+        return self.render_page()
+        
+    def render_POST(self, request):
+        if 'command' in request.args:
+            if request.args['command'] == ['restart']:
+                print "Restarting Gadgetron"
+                self.restart_gadgetron()
+
+        return self.render_page()
+
+class GadgetronLogResource(resource.Resource):
+    filename = ""
+
+    def __init__(self, logfilename):
+        self.filename = logfilename
+        resource.Resource.__init__(self)
+
+    def render_GET(self, request):
+        gf = open(self.filename,"r")
+        l = gf.read()
+        return "<html><body><pre style=\"font-size: 8px\">" + l + "</pre></body></html>"
+
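+# Read the configuration file given on the command line; it must provide GADGETRON_HOME,
+# the gadgetron log file location and the web server port.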
+config = ConfigParser.RawConfigParser()
+config.read(sys.argv[1])
+gadgetron_home = config.get('GADGETRON', 'GADGETRON_HOME')
+port = int(config.get('WEBSERVER','port'))
+
+root = resource.Resource()
+root.putChild('gadgetron',GadgetronResource(sys.argv[1]))
+root.putChild('log', GadgetronLogResource(config.get('GADGETRON','logfile')))
+
+signal.signal(signal.SIGINT, ctrlcsignal)
+signal.signal(signal.SIGHUP, termsignal)
+
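+# Serve the monitor site with Twisted's reactor on the configured port.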
+reactor.listenTCP(port, server.Site(root))
+reactor.run()
diff --git a/apps/gadgetron/webapp/gadgetron_web_ld.conf.in b/apps/gadgetron/webapp/gadgetron_web_ld.conf.in
new file mode 100644
index 0000000..26c2dc1
--- /dev/null
+++ b/apps/gadgetron/webapp/gadgetron_web_ld.conf.in
@@ -0,0 +1,2 @@
+@CMAKE_INSTALL_PREFIX@/../lib
+@CMAKE_INSTALL_PREFIX@/lib
\ No newline at end of file
diff --git a/apps/gadgetron/webapp/main.cpp b/apps/gadgetron/webapp/main.cpp
new file mode 100644
index 0000000..f5525c2
--- /dev/null
+++ b/apps/gadgetron/webapp/main.cpp
@@ -0,0 +1,8 @@
+
+#include <iostream>
+#include <string>
+
+int main(int argc, char *argv[])
+{
+    return 0;
+}
diff --git a/apps/matlab/mexGT.h b/apps/matlab/mexGT.h
new file mode 100644
index 0000000..3c61ea7
--- /dev/null
+++ b/apps/matlab/mexGT.h
@@ -0,0 +1,580 @@
+/*
+ * @(#)mex.h    generated by: makeheader 4.21  Fri Apr 23 18:16:45 2004
+ *
+ *		built from:	../../src/include/copyright.h
+ *				../../src/include/pragma_interface.h
+ *				mex_typedefs.h
+ *				./fmexapi.cpp
+ *				./fmexapiv5.cpp
+ *				./globals.cpp
+ *				./mexapi.cpp
+ *				./mexapiv4.cpp
+ *				./mexapiv5.cpp
+ *				./mexcbk.cpp
+ *				./mexdispatch.cpp
+ *				./mexintrf.cpp
+ *				mexdbg.h
+ */
+
+#ifndef mex_h
+#define mex_h
+
+
+/*
+ * Copyright 1984-2003 The MathWorks, Inc.
+ * All Rights Reserved.
+ */
+
+
+
+/*
+ * Prevent g++ from making copies of vtable and typeinfo data
+ * in every compilation unit.  By allowing for only one, we can
+ * save space and prevent some situations where the linker fails
+ * to coalesce them properly into a single entry.
+ *
+ * References:
+ *    http://gcc.gnu.org/onlinedocs/gcc/Vague-Linkage.html#Vague%20Linkage
+ *    http://gcc.gnu.org/onlinedocs/gcc/C---Interface.html
+ */
+
+#ifdef __cplusplus
+#  ifdef GLNX86
+#    pragma interface
+#  endif
+#endif
+
+
+
+/* $Revision: 1.7 $ */
+#ifndef mex_typedefs_h
+#define mex_typedefs_h
+typedef struct impl_info_tag *MEX_impl_info;
+
+#include "matrix.h"
+
+typedef struct mexGlobalTableEntry_Tag
+{
+    const char *name;             /* The name of the global */
+    mxArray    **variable;        /* A pointer to the variable */ 
+} mexGlobalTableEntry, *mexGlobalTable;
+
+#if defined(MSWIND)
+#define cicompare(s1,s2) utStrcmpi((s1),(s2))
+#else
+#define cicompare(s1,s2) strcmp((s1),(s2))
+#endif
+#define cscompare(s1,s2) strcmp((s1),(s2))
+
+typedef struct mexFunctionTableEntry_tag {
+  const char *  name;
+  mxFunctionPtr f;
+  int           nargin;
+  int           nargout;
+  struct _mexLocalFunctionTable *local_function_table;
+} mexFunctionTableEntry, *mexFunctionTable;
+
+typedef struct _mexLocalFunctionTable {
+  size_t           length;
+  mexFunctionTable entries;
+} _mexLocalFunctionTable, *mexLocalFunctionTable;
+
+typedef struct {
+  void (*initialize)(void);
+  void (*terminate)(void);
+} _mexInitTermTableEntry, *mexInitTermTableEntry;
+
+#define MEX_INFORMATION_VERSION 1
+
+typedef struct {
+  int                   version;
+  int                   file_function_table_length;
+  mexFunctionTable      file_function_table;
+  int                   global_variable_table_length;
+  mexGlobalTable        global_variable_table;
+  int                   npaths;
+  const char **         paths;
+  int                   init_term_table_length;
+  mexInitTermTableEntry init_term_table;
+} _mex_information, *mex_information;
+
+typedef mex_information(*fn_mex_file)(void);
+
+typedef void (*fn_clean_up_after_error)(void);
+typedef const char *(*fn_simple_function_to_string)(mxFunctionPtr f);
+
+typedef void (*fn_mex_enter_mex_library)(mex_information x);
+typedef fn_mex_enter_mex_library fn_mex_exit_mex_library;
+
+typedef mexLocalFunctionTable (*fn_mex_get_local_function_table)(void);
+typedef mexLocalFunctionTable (*fn_mex_set_local_function_table)(mexLocalFunctionTable);
+
+#endif
+
+
+/*
+ * This header file "mex.h" declares all the types, macros and
+ * functions necessary to interface mex files with the current
+ * version of MATLAB.  See the release notes for information on 
+ * supporting syntax from earlier versions.
+ */  
+#include "matrix.h"
+
+#include <stdio.h>
+
+
+
+#ifdef __cplusplus
+extern"C"__declspec(dllexport)
+#endif
+
+void mexFunction(
+    int           nlhs,           /* number of expected outputs */
+    mxArray       *plhs[],        /* array of pointers to output arguments */
+    int           nrhs,           /* number of inputs */
+    const mxArray *prhs[]         /* array of pointers to input arguments */
+);
+#ifdef __cplusplus
+#endif
+
+/*#ifdef __cplusplus
+#define _MEXFUNCTION extern"C"__declspec(dllexport)
+#else
+#define _MEXFUNCTION __declspec(dllexport)
+#endif
+
+_MEXFUNCTION void mexFunction(
+    int           nlhs,           
+    mxArray       *plhs[],        
+    int           nrhs,           
+    const mxArray *prhs[]         
+);*/
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Issue error message and return to MATLAB prompt
+ */
+extern void mexErrMsgTxt(
+    const char	*error_msg	/* string with error message */
+    );
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Issue formatted error message with corresponding error identifier and return to MATLAB
+ * prompt.
+ */
+extern void mexErrMsgIdAndTxt(
+    const char * identifier, /* string with error message identifier */
+    const char * err_msg,    /* string with error message printf-style format */
+    ...                      /* any additional arguments */
+    );
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Invoke an unidentified warning. Such warnings can only be affected by the M-code
+ * 'warning * all', since they have no specific identifier. See also mexWarnMsgIdAndTxt.
+ */
+extern void mexWarnMsgTxt(
+    const char	*warn_msg	/* string with warning message */
+    );
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Invoke a warning with message identifier 'identifier' and message derived from 'fmt' and
+ * subsequent arguments. The warning may either get printed as is (if it is set to 'on'), or
+ * not actually get printed (if set to 'off'). See 'help warning' in MATLAB for more
+ * details.
+ */
+extern void mexWarnMsgIdAndTxt(
+    const char * identifier,    /* string with warning message identifier */
+    const char * warn_msg,	/* string with warning message printf-style format */
+    ...                         /* any additional arguments */
+    );
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * mex equivalent to MATLAB's "disp" function
+ */
+extern int mexPrintf(
+    const char	*fmt,	/* printf style format */
+    ...				/* any additional arguments */
+    );
+#ifdef __cplusplus
+}
+#endif
+
+
+#define printf mexPrintf
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Remove all components of an array plus the array header itself
+ * from MATLAB's memory allocation list.  The array will now
+ * persist between calls to the mex function.  To destroy this
+ * array, you will need to explicitly call mxDestroyArray().
+ */
+extern void mexMakeArrayPersistent(
+    mxArray *pa              /* pointer to array */
+    );
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Remove memory previously allocated via mxCalloc from MATLAB's
+ * memory allocation list.  To free this memory, you will need to
+ * explicitly call mxFree().
+ */
+extern void mexMakeMemoryPersistent(void *ptr);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Look up a function and return an opaque handle for use with
+ * mexCallMATLABFunction.
+ */
+extern void mexGetFunctionHandle(void);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Call a function whose handle was determined by mexGetFunctionHandle.
+ */
+extern void mexCallMATLABFunction(void);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Register a function pointer as a MATLAB-callable function.
+ */
+extern void mexRegisterFunction(void);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * mex equivalent to MATLAB's "set" function
+ */
+extern int mexSet(double handle, const char *property, mxArray *value);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* API interface which mimics the "get" function */
+extern const mxArray *mexGet(double handle, const char *property);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * call MATLAB function
+ */
+extern int mexCallMATLAB(
+    int		nlhs,			/* number of expected outputs */
+    mxArray	*plhs[],		/* pointer array to outputs */
+    int		nrhs,			/* number of inputs */
+    mxArray	*prhs[],		/* pointer array to inputs */
+    const char	*fcn_name		/* name of function to execute */
+    );
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * set or clear mexCallMATLAB trap flag (if set, an error in
+ * mexCallMATLAB is caught and mexCallMATLAB will return a status value;
+ * if not set, an error will cause control to revert to MATLAB)
+ */
+extern void mexSetTrapFlag(int flag);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Perform in-place subscript assignment.
+ */
+extern void mexSubsAssign(
+      mxArray *plhs, /* pointer to lhs, to be modified in-place */
+      const mxArray *prhs, /* pointer to rhs */
+      const mxArray *subs[], /* array of subscripts for lhs */
+      int nsubs     /* number of subscripts */
+      );
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Retrieve a specified subset of an array.
+ */
+extern mxArray *mexSubsReference(
+      const mxArray *prhs, /* pointer to rhs */
+      const mxArray *subs[], /* array of subscripts for rhs */
+      int nsubs /* number of subscripts */
+      );
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Print an assertion-style error message and return control to the
+ * MATLAB command line.
+ */ 
+extern void mexPrintAssertion(
+		const char *test, 
+		const char *fname, 
+		int linenum, 
+		const char *message);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Tell whether or not a mxArray is in MATLAB's global workspace.
+ */
+extern bool mexIsGlobal(const mxArray *pA);
+#ifdef __cplusplus
+}
+#endif
+
+
+#define mexGetGlobal()    mexGetGlobal_is_obsolete
+#define mxSetString()     mxSetString_is_obsolete
+#define mxSetDispMode()   mxSetDispMode_is_obsolete
+#define mexGetMatrixPtr() mexGetMatrixPtr_is_obsolete
+#define mexGetMatrix()    mexGetMatrix_is_obsolete
+#define mexPutMatrix()    mexPutMatrix_is_obsolete
+#define mexPutFull()      mexPutFull_is_obsolete
+#define mexGetFull()      mexGetFull_is_obsolete
+#define mexGetEps()       mexGetEps_is_obsolete
+#define mexGetInf()       mexGetInf_is_obsolete
+#define mexGetNaN()       mexGetNaN_is_obsolete
+#define mexIsFinite()     mexIsFinite_is_obsolete
+#define mexIsInf()        mexIsInf_is_obsolete
+#define mexIsNaN()        mexIsNaN_is_obsolete
+
+
+/*
+ * mexAddFlops is no longer allowed.  
+ */
+#define mexAddFlops(x) mexAddFlops_is_obsolete
+
+#if defined(V5_COMPAT)
+#define mexPutArray(parray, workspace) mexPutVariable(workspace, mxGetName(parray), parray)
+#define mexGetArray(name, workspace) mexGetVariable(workspace, name)
+#define mexGetArrayPtr(name, workspace) mexGetVariablePtr(workspace, name)
+#else
+#define mexPutArray() mexPutArray_is_obsolete
+#define mexGetArray() mexGetArray_is_obsolete
+#define mexGetArrayPtr() mexGetArrayPtr_is_obsolete
+#endif /* defined(V5_COMPAT) */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Place a copy of the array value into the specified workspace with the
+ * specified name
+ */
+extern int mexPutVariable(
+    const char *workspace,
+    const char *name,
+    const mxArray *parray		/* matrix to copy */
+    );
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * return a pointer to the array value with the specified variable
+ * name in the specified workspace
+ */
+extern const mxArray *mexGetVariablePtr(
+    const char *workspace,
+    const char *name		/* name of symbol */
+    );
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * return a copy of the array value with the specified variable
+ * name in the specified workspace
+ */
+extern mxArray *mexGetVariable(
+    const char	*workspace,		
+    const char  *name                /* name of variable in question */
+    );
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Lock a MEX-function so that it cannot be cleared from memory.
+ */
+extern void mexLock(void);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Unlock a locked MEX-function so that it can be cleared from memory.
+ */
+extern void mexUnlock(void);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Return true if the MEX-function is currently locked, false otherwise.
+ */
+extern bool mexIsLocked(void);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Return the name of the MEX-function currently executing.
+ */
+extern const char *mexFunctionName(void);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Parse and execute MATLAB syntax in string.  Returns zero if successful,
+ * and a non zero value if an error occurs.
+ */
+extern int mexEvalString(
+   const char *str	   /* matlab command string */
+);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Register Mex-file's At-Exit function (accessed via MEX callback)
+ */
+extern int mexAtExit(
+    void	(*exit_fcn)(void)
+    );
+#ifdef __cplusplus
+}
+#endif
+
+
+#define NEW_DISPATCHER_EVAL_CALLER 1
+
+
+/* $Revision: 1.9 $ */
+#ifdef ARGCHECK
+
+#include "mwdebug.h" /* Prototype _d versions of API functions */
+
+#define mexAtExit(exitfcn) 				mexAtExit_d(exitfcn, __FILE__, __LINE__)
+#define mexCallMATLAB(nlhs, plhs, nrhs, prhs, fcn) mexCallMATLAB_d(nlhs, plhs, nrhs, prhs, fcn, __FILE__, __LINE__)
+#define mexErrMsgTxt(errmsg)			mexErrMsgTxt_d(errmsg, __FILE__, __LINE__)
+#define mexEvalString(str) 				mexEvalString_d(str, __FILE__, __LINE__)
+#define mexGet(handle, property) 		mexGet_d(handle, property, __FILE__, __LINE__)
+#define mexGetVariable(workspace, name) 	mexGetVariable_d(workspace, name, __FILE__, __LINE__)
+#define mexGetVariablePtr(workspace, name)      mexGetVariablePtr_d(workspace, name, __FILE__, __LINE__)
+#define mexIsGlobal(pa)                 mexIsGlobal_d(pa, __FILE__, __LINE__)
+#define mexMakeArrayPersistent(pa) 		mexMakeArrayPersistent_d(pa, __FILE__, __LINE__)              
+#define mexMakeMemoryPersistent(ptr) 	mexMakeMemoryPersistent_d(ptr, __FILE__, __LINE__)
+#define mexPutVariable(workspace, name, pa) 	mexPutVariable_d(workspace, name, pa, __FILE__, __LINE__)
+#define mexSet(handle, property, value) mexSet_d(handle, property, value, __FILE__, __LINE__)
+#define mexSetTrapFlag(value)           mexSetTrapFlag_d(value, __FILE__, __LINE__)
+#define mexSubsAssign(plhs, sub, nsubs, rhs)    mexSubsAssign_d(plhs, sub, nsubs, rhs, __FILE__, __LINE__)
+#define mexSubsReference(prhs, sub, nsubs)    mexSubsReference_d(prhs, sub, nsubs, __FILE__, __LINE__)
+#define mexWarnMsgTxt(str)		 		mexWarnMsgTxt_d(str, __FILE__, __LINE__)
+#endif
+
+#endif /* mex_h */
diff --git a/apps/standalone/CMakeLists.txt b/apps/standalone/CMakeLists.txt
new file mode 100644
index 0000000..62e606b
--- /dev/null
+++ b/apps/standalone/CMakeLists.txt
@@ -0,0 +1,13 @@
+if (MKL_FOUND)
+    INCLUDE_DIRECTORIES( ${MKL_INCLUDE_DIR} )
+    LINK_DIRECTORIES( ${MKL_LIB_DIR} ${MKL_COMPILER_LIB_DIR} )
+    link_libraries(${MKL_LIBRARIES})
+endif (MKL_FOUND)
+
+if (ARMADILLO_FOUND)
+  add_subdirectory(cpu)
+endif (ARMADILLO_FOUND)
+
+if (CUDA_FOUND)
+  add_subdirectory(gpu)
+endif(CUDA_FOUND)
diff --git a/apps/standalone/cpu/CMakeLists.txt b/apps/standalone/cpu/CMakeLists.txt
new file mode 100644
index 0000000..da2fee8
--- /dev/null
+++ b/apps/standalone/cpu/CMakeLists.txt
@@ -0,0 +1,48 @@
+include_directories( 
+    ${ARMADILLO_INCLUDE_DIRS}
+    ${Boost_INCLUDE_DIR}
+    ${FFTW3_INCLUDE_DIR}
+    ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/core
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/algorithm
+    ${CMAKE_SOURCE_DIR}/toolboxes/operators
+    ${CMAKE_SOURCE_DIR}/toolboxes/operators/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+    ${CMAKE_SOURCE_DIR}/toolboxes/solvers/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow
+    ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow
+    ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu/transformation
+    ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu/solver
+    ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu/warper
+    ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu/dissimilarity
+    ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu/register
+    ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu/application
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/util
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/workflow
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/algorithm
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/solver
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/matlab
+    ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools
+    ${CMAKE_SOURCE_DIR}/apps/gadgetron
+    ${CMAKE_SOURCE_DIR}/apps/matlab
+    ${CMAKE_SOURCE_DIR}/gadgets/mri_core 
+    ${CMAKE_SOURCE_DIR}/gadgets/gtPlus 
+)
+
+#add_subdirectory(MRI)
+add_subdirectory(denoising)
+#add_subdirectory(deblurring)
+add_subdirectory(registration)
+
+if(ISMRMRD_FOUND)
+  add_subdirectory(gtplus)
+endif()
diff --git a/apps/standalone/cpu/denoising/2d/CMakeLists.txt b/apps/standalone/cpu/denoising/2d/CMakeLists.txt
new file mode 100644
index 0000000..f7a6baf
--- /dev/null
+++ b/apps/standalone/cpu/denoising/2d/CMakeLists.txt
@@ -0,0 +1,26 @@
+if (WIN32)
+    ADD_DEFINITIONS(-D_USE_MATH_DEFINES)
+endif (WIN32)
+
+include_directories( 
+                    ${CMAKE_SOURCE_DIR}/toolboxes/core 
+                    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu 
+                    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+                    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math 
+                    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+                    ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+                    ${CMAKE_SOURCE_DIR}/toolboxes/solvers/cpu
+                    ${CMAKE_SOURCE_DIR}/toolboxes/operators
+                    ${CMAKE_SOURCE_DIR}/toolboxes/operators/cpu
+                    ${ACE_INCLUDE_DIR}
+                    ${ISMRMRD_INCLUDE_DIR} )
+
+add_executable(cpu_denoise_TV denoise_TV.cpp)
+
+target_link_libraries(cpu_denoise_TV 
+                    gadgetron_toolbox_cpucore 
+                    gadgetron_toolbox_cpucore_math 
+                    gadgetron_toolbox_hostutils
+                    ${ARMADILLO_LIBRARIES} )
+
+install(TARGETS cpu_denoise_TV DESTINATION bin COMPONENT main)
diff --git a/apps/standalone/cpu/denoising/2d/denoise_TV.cpp b/apps/standalone/cpu/denoising/2d/denoise_TV.cpp
new file mode 100644
index 0000000..0a82d3d
--- /dev/null
+++ b/apps/standalone/cpu/denoising/2d/denoise_TV.cpp
@@ -0,0 +1,117 @@
+/*
+  Total variation denoising based on the paper 
+  "The Split Bregman Method for L1-Regularized Problems" by Tom Goldstein and Stanley Osher. 
+  Siam J. Imaging Sciences. Vol. 2, No. 2, pp. 323-343.
+*/
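+
+// Example invocation of the resulting cpu_denoise_TV executable (noisy_image.real is a
+// placeholder input file; the remaining values are the defaults declared below):
+//
+//   cpu_denoise_TV -d noisy_image.real -r denoised_image_TV.real -i 20 -I 1 -O 10 -m 25.0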
+
+// Gadgetron includes
+#include "hoNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "hoSbCgSolver.h"
+#include "hoIdentityOperator.h"
+#include "hoPartialDerivativeOperator.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+
+int main(int argc, char** argv)
+{
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Noisy image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "denoised_image_TV.real" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of cg iterations", true, "20" );
+  parms.add_parameter( 'I', COMMAND_LINE_INT,    1, "Number of sb inner iterations", true, "1" );
+  parms.add_parameter( 'O', COMMAND_LINE_INT,    1, "Number of sb outer iterations", true, "10" );
+  parms.add_parameter( 'm', COMMAND_LINE_FLOAT,  1, "Regularization weight (mu)", true, "25.0" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running denoising with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+    
+  // Load sample data from disk
+  boost::shared_ptr< hoNDArray<_real> > data = 
+    read_nd_array<_real>((char*)parms.get_parameter('d')->get_string_value());
+
+  if( !data.get() ){
+    cout << endl << "Input image not found. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  if( data->get_number_of_dimensions() != 2 ){
+    cout << endl << "Input image is not two-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  _real mu = (_real) parms.get_parameter('m')->get_float_value();
+  _real lambda = (_real)2.0*mu; // This is a good all-round setting according to Goldstein et al.
+
+  if( mu <= (_real) 0.0 ) {
+    cout << endl << "Regularization parameter mu should be strictly positive. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  size_t num_cg_iterations = parms.get_parameter('i')->get_int_value();
+  size_t num_inner_iterations = parms.get_parameter('I')->get_int_value();
+  size_t num_outer_iterations = parms.get_parameter('O')->get_int_value();
+  
+  // Setup regularization operators
+  boost::shared_ptr< hoPartialDerivativeOperator<_real,2> > Rx( new hoPartialDerivativeOperator<_real,2>(0) );
+  Rx->set_weight( lambda );
+  Rx->set_domain_dimensions(data->get_dimensions().get());
+  Rx->set_codomain_dimensions(data->get_dimensions().get());
+  
+  boost::shared_ptr< hoPartialDerivativeOperator<_real,2> > Ry( new hoPartialDerivativeOperator<_real,2>(1) );
+  Ry->set_weight( lambda );
+  Ry->set_domain_dimensions(data->get_dimensions().get());
+  Ry->set_codomain_dimensions(data->get_dimensions().get());
+  
+  // Define encoding operator (identity)
+  boost::shared_ptr< identityOperator<hoNDArray<_real> > > E( new identityOperator<hoNDArray<_real> >() );
+  E->set_weight( mu );
+  E->set_domain_dimensions(data->get_dimensions().get());
+  E->set_codomain_dimensions(data->get_dimensions().get());
+  
+  // Setup split-Bregman solver
+  hoSbCgSolver<_real> sb;
+  sb.set_encoding_operator( E );
+  //sb.add_regularization_operator( Rx ); // Anisotropic denoising
+  //sb.add_regularization_operator( Ry ); // Anisotropic denoising
+  sb.add_regularization_group_operator( Rx ); // Isotropic denoising
+  sb.add_regularization_group_operator( Ry); // Isotropic denoising
+  sb.add_group();
+  sb.set_max_outer_iterations(num_outer_iterations);
+  sb.set_max_inner_iterations(num_inner_iterations);
+  sb.set_output_mode( hoCgSolver<_real>::OUTPUT_VERBOSE );
+  
+  // Setup inner conjugate gradient solver
+  sb.get_inner_solver()->set_max_iterations( num_cg_iterations );
+  sb.get_inner_solver()->set_tc_tolerance( 1e-4 );
+  sb.get_inner_solver()->set_output_mode( hoCgSolver<_real>::OUTPUT_WARNINGS );
+  
+  // Run split-Bregman solver
+  boost::shared_ptr< hoNDArray<_real> > sbresult = sb.solve(data.get());
+  
+  // All done, write out the result
+  write_nd_array<_real>(sbresult.get(), (char*)parms.get_parameter('r')->get_string_value());
+  
+  return 0;
+}
diff --git a/apps/standalone/cpu/denoising/CMakeLists.txt b/apps/standalone/cpu/denoising/CMakeLists.txt
new file mode 100644
index 0000000..5c4cec9
--- /dev/null
+++ b/apps/standalone/cpu/denoising/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(2d)
diff --git a/apps/standalone/cpu/gtplus/CMakeLists.txt b/apps/standalone/cpu/gtplus/CMakeLists.txt
new file mode 100644
index 0000000..2db9ff2
--- /dev/null
+++ b/apps/standalone/cpu/gtplus/CMakeLists.txt
@@ -0,0 +1,62 @@
+
+# matlab wrapper
+if (MATLAB_FOUND)
+
+    message("MATLAB FOUND: matlab wrapper for gtplus toolbox will be compiled.")
+
+    SET(CMAKE_DEBUG_POSTFIX)
+
+    include_directories( 
+        ${CMAKE_SOURCE_DIR}/toolboxes/fft/cpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/mri_core
+        ${ACE_INCLUDE_DIR}
+        ${MATLAB_INCLUDE_DIR}  
+        ${ISMRMRD_INCLUDE_DIR} 
+    )
+
+    link_directories(${Boost_LIBRARY_DIRS})
+    link_libraries(${MATLAB_LIBRARIES} 
+                    optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} 
+                    ${ISMRMRD_LIBRARIES} 
+                    gadgetron_toolbox_gtplus 
+                    gadgetron_toolbox_cpucore 
+                    gadgetron_toolbox_cpucore_math
+                    gadgetron_toolbox_cpureg )
+
+    if (WIN32)
+        if ( HAS_64_BIT )
+            SET(MATLAB_SUFFIX ".mexw64")
+        else ( HAS_64_BIT )
+            SET(MATLAB_SUFFIX ".mexw32")
+        endif ( HAS_64_BIT )
+    endif (WIN32)
+
+    if (UNIX)
+        if ( HAS_64_BIT )
+            SET(MATLAB_SUFFIX ".mexa64")
+        else ( HAS_64_BIT )
+            SET(MATLAB_SUFFIX ".mexglx")
+        endif ( HAS_64_BIT )
+    endif (UNIX)
+
+    if (APPLE)
+        if ( HAS_64_BIT )
+            SET(MATLAB_SUFFIX ".mexmaci64")
+        endif ( HAS_64_BIT )
+    endif(APPLE)
+
+    if ( MKL_FOUND AND FFTW3_FOUND )
+
+        add_library(Matlab_gt_read_analyze SHARED Matlab_gt_read_analyze.cpp)
+        SET_TARGET_PROPERTIES(Matlab_gt_read_analyze PROPERTIES SUFFIX ${MATLAB_SUFFIX})
+        install(TARGETS Matlab_gt_read_analyze DESTINATION ${GADGETRON_INSTALL_MATLAB_PATH} COMPONENT main)
+
+        add_library(Matlab_gt_write_analyze SHARED Matlab_gt_write_analyze.cpp)
+        SET_TARGET_PROPERTIES(Matlab_gt_write_analyze PROPERTIES SUFFIX ${MATLAB_SUFFIX})
+        install(TARGETS Matlab_gt_write_analyze DESTINATION ${GADGETRON_INSTALL_MATLAB_PATH} COMPONENT main)
+
+    endif ( MKL_FOUND AND FFTW3_FOUND )
+
+else(MATLAB_FOUND)
+    message("MATLAB NOT FOUND: matlab wrapper for gtplus toolbox will not be compiled.")
+endif(MATLAB_FOUND)
diff --git a/apps/standalone/cpu/gtplus/Matlab_compute_coil_map_2D.cpp b/apps/standalone/cpu/gtplus/Matlab_compute_coil_map_2D.cpp
new file mode 100644
index 0000000..7b2d8d3
--- /dev/null
+++ b/apps/standalone/cpu/gtplus/Matlab_compute_coil_map_2D.cpp
@@ -0,0 +1,136 @@
+
+#include <matrix.h>
+#include <mat.h>
+#ifdef _WIN32
+    #include <mexGT.h>
+#else
+    #include <mex.h>
+#endif // _WIN32
+
+#include <strstream> // std::ostrstream used by usage()
+
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtMatlabConverter.h"
+#include "gtMatlabConverterComplex.h"
+
+#define MEXPRINTF(name) mexPrintf(#name);
+
+static void usage()
+{
+    using namespace std;
+    std::ostrstream outs;
+
+    outs << "==============================================================================================" << endl;
+    outs << "Usage: compute_coil_map_2D \n";
+    outs << "6 Input paras:" << endl;
+    outs << '\t' << "complexIm  : RO*E1*CHA*N, 2D complex image array, in complex float" << endl;
+    outs << '\t' << "algo       : ISMRMRD_SOUHEIL or ISMRMRD_SOUHEIL_ITER" << endl;
+    outs << '\t' << "ks         : kernel size, used by both methods" << endl;
+    outs << '\t' << "power      : number of times to perform power method, used by ISMRMRD_SOUHEIL" << endl;
+    outs << '\t' << "iterNum    : number of maximal iteration times, used by ISMRMRD_SOUHEIL_ITER" << endl;
+    outs << '\t' << "thres      : threshold of iteration, used by ISMRMRD_SOUHEIL_ITER" << endl;
+
+    outs << "1 Output para:" << endl;
+    outs << '\t' << "coilMap    : RO*E1*CHA*N coil map" << endl;
+    outs << "==============================================================================================" << endl;
+    outs << std::ends; 
+
+    mexPrintf("%s\n", outs.str() );
+}
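+
+// Hypothetical MATLAB call (assumes the mex file is built under its source name; the kernel
+// size, power iterations, iteration count and threshold below are example values only):
+//   coilMap = Matlab_compute_coil_map_2D(complexIm, 'ISMRMRD_SOUHEIL_ITER', 7, 3, 5, 1e-3);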
+
+void mexFunction(int nlhs,mxArray *plhs[],int nrhs,const mxArray *prhs[])
+{
+    try
+    {
+        // ---------------------------------------------------------------
+        // consistency check
+        // ---------------------------------------------------------------    
+        if (nrhs != 6) 
+        {
+            mexWarnMsgTxt("6 input arguments are required ...");
+            usage();
+            return;
+        }
+
+        if (nlhs < 1 )
+        {
+            mexWarnMsgTxt("1 output argument is required ...");
+            usage();
+            return;
+        }
+
+        typedef std::complex<float> ValueType;
+
+        Gadgetron::GadgetronTimer timer("Running coil map estimation");
+
+        Gadgetron::gtMatlabConverter<float> converter;
+        Gadgetron::gtMatlabConverterComplex<ValueType> converterComplex;
+
+        Gadgetron::gtPlus::gtPlusISMRMRDReconUtilComplex<ValueType> gtPlus_util_complex_;
+
+        // ---------------------------------------------------------------
+        // input parameters
+        // ---------------------------------------------------------------    
+        // target
+        if ( !mxIsSingle(prhs[0]) || !mxIsComplex(prhs[0]) )
+        {
+            mexWarnMsgTxt("The first input parameter should be a complex single array ...");
+        }
+
+        mwSize nDim = mxGetNumberOfDimensions(prhs[0]);
+        if ( nDim!=3 && nDim!=4 )
+        {
+            mexWarnMsgTxt("1st array is not a 3D or 4D array");
+            return;
+        }
+
+        const mwSize* dims = mxGetDimensions(prhs[0]);
+
+        // algo
+        Gadgetron::gtPlus::ISMRMRDCOILMAPALGO algo = Gadgetron::gtPlus::ISMRMRD_SOUHEIL_ITER;
+        std::string algoStr;
+        converter.Matlab2Str(prhs[1], algoStr);
+        if ( algoStr == "ISMRMRD_SOUHEIL" )
+        {
+            algo = Gadgetron::gtPlus::ISMRMRD_SOUHEIL;
+        }
+
+        // ks
+        unsigned long long ks = mxGetScalar(prhs[2]);
+
+        // power
+        unsigned long long power = mxGetScalar(prhs[3]);
+
+        // iterNum
+        unsigned long long iterNum = (unsigned long long)(mxGetScalar(prhs[4]));
+
+        // thres
+        float thres = (float)(mxGetScalar(prhs[5]));
+
+        // ---------------------------------------------------------------
+        // perform the computation
+        // ---------------------------------------------------------------
+        Gadgetron::hoNDArray<ValueType> complexIm;
+        converterComplex.Matlab2hoNDArray(prhs[0], complexIm);
+
+        Gadgetron::hoNDArray<ValueType> coilMap;
+
+        if ( !gtPlus_util_complex_.coilMap2DNIH(complexIm, coilMap, algo, ks, power, iterNum, thres, true) )
+        {
+            mexWarnMsgTxt("coilMap2DNIH(...) failed ... ");
+            return;
+        }
+
+        // ---------------------------------------------------------------
+        // output parameter
+        // ---------------------------------------------------------------
+        mxArray* coilMapMx = NULL;
+        converterComplex.hoNDArray2Matlab(coilMap, coilMapMx);
+        plhs[0] = coilMapMx;
+   }
+    catch(...)
+    {
+        mexWarnMsgTxt("Exceptions happened in Matlab compute_coil_map_2D() ...");
+        return;
+    }
+
+    return;
+}
diff --git a/apps/standalone/cpu/gtplus/Matlab_compute_coil_map_3D.cpp b/apps/standalone/cpu/gtplus/Matlab_compute_coil_map_3D.cpp
new file mode 100644
index 0000000..ae6bded
--- /dev/null
+++ b/apps/standalone/cpu/gtplus/Matlab_compute_coil_map_3D.cpp
@@ -0,0 +1,137 @@
+
+#include <matrix.h>
+#include <mat.h>
+
+#ifdef _WIN32
+    #include <mexGT.h>
+#else
+    #include <mex.h>
+#endif // _WIN32
+
+#include <strstream> // std::ostrstream used by usage()
+
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtMatlabConverter.h"
+#include "gtMatlabConverterComplex.h"
+
+#define MEXPRINTF(name) mexPrintf(#name);
+
+static void usage()
+{
+    using namespace std;
+    std::ostrstream outs;
+
+    outs << "==============================================================================================" << endl;
+    outs << "Usage: compute_coil_map_3D \n";
+    outs << "6 Input paras:" << endl;
+    outs << '\t' << "complexIm  : RO*E1*E2*CHA*N, 3D complex image array, in complex float" << endl;
+    outs << '\t' << "algo       : ISMRMRD_SOUHEIL or ISMRMRD_SOUHEIL_ITER" << endl;
+    outs << '\t' << "ks         : kernel size, used by both methods" << endl;
+    outs << '\t' << "power      : number of times to perform power method, used by ISMRMRD_SOUHEIL" << endl;
+    outs << '\t' << "iterNum    : number of maximal iteration times, used by ISMRMRD_SOUHEIL_ITER" << endl;
+    outs << '\t' << "thres      : threshold of iteration, used by ISMRMRD_SOUHEIL_ITER" << endl;
+
+    outs << "1 Output para:" << endl;
+    outs << '\t' << "coilMap    : RO*E1*E2*CHA*N coil map" << endl;
+    outs << "==============================================================================================" << endl;
+    outs << std::ends; 
+
+    mexPrintf("%s\n", outs.str() );
+}
+
+void mexFunction(int nlhs,mxArray *plhs[],int nrhs,const mxArray *prhs[])
+{
+    try
+    {
+        // ---------------------------------------------------------------
+        // consistency check
+        // ---------------------------------------------------------------    
+        if (nrhs != 6) 
+        {
+            mexWarnMsgTxt("6 input arguments are required ...");
+            usage();
+            return;
+        }
+
+        if (nlhs < 1 )
+        {
+            mexWarnMsgTxt("1 output argument is required ...");
+            usage();
+            return;
+        }
+
+        typedef std::complex<float> ValueType;
+
+        Gadgetron::GadgetronTimer timer("Running coil map estimation");
+
+        Gadgetron::gtMatlabConverter<float> converter;
+        Gadgetron::gtMatlabConverterComplex<ValueType> converterComplex;
+
+        Gadgetron::gtPlus::gtPlusISMRMRDReconUtilComplex<ValueType> gtPlus_util_complex_;
+
+        // ---------------------------------------------------------------
+        // input parameters
+        // ---------------------------------------------------------------    
+        // target
+        if ( !mxIsSingle(prhs[0]) || !mxIsComplex(prhs[0]) )
+        {
+            mexWarnMsgTxt("The first input parameter should be a complex single array ...");
+        }
+
+        mwSize nDim = mxGetNumberOfDimensions(prhs[0]);
+        if ( nDim!=4 && nDim!=5 )
+        {
+            mexWarnMsgTxt("1st array is not a 4D or 5D array");
+            return;
+        }
+
+        const mwSize* dims = mxGetDimensions(prhs[0]);
+
+        // algo
+        Gadgetron::gtPlus::ISMRMRDCOILMAPALGO algo = Gadgetron::gtPlus::ISMRMRD_SOUHEIL_ITER;
+        std::string algoStr;
+        converter.Matlab2Str(prhs[1], algoStr);
+        if ( algoStr == "ISMRMRD_SOUHEIL" )
+        {
+            algo = Gadgetron::gtPlus::ISMRMRD_SOUHEIL;
+        }
+
+        // ks
+        unsigned long long ks = mxGetScalar(prhs[2]);
+
+        // power
+        unsigned long long power = mxGetScalar(prhs[3]);
+
+        // iterNum
+        unsigned long long iterNum = (unsigned long long)(mxGetScalar(prhs[4]));
+
+        // thres
+        float thres = (float)(mxGetScalar(prhs[5]));
+
+        // ---------------------------------------------------------------
+        // perform the computation
+        // ---------------------------------------------------------------
+        Gadgetron::hoNDArray<ValueType> complexIm;
+        converterComplex.Matlab2hoNDArray(prhs[0], complexIm);
+
+        Gadgetron::hoNDArray<ValueType> coilMap;
+
+        if ( !gtPlus_util_complex_.coilMap3DNIH(complexIm, coilMap, algo, ks, power, iterNum, thres) )
+        {
+            mexWarnMsgTxt("coilMap3DNIH(...) failed ... ");
+            return;
+        }
+
+        // ---------------------------------------------------------------
+        // output parameter
+        // ---------------------------------------------------------------
+        mxArray* coilMapMx = NULL;
+        converterComplex.hoNDArray2Matlab(coilMap, coilMapMx);
+        plhs[0] = coilMapMx;
+   }
+    catch(...)
+    {
+        mexWarnMsgTxt("Exceptions happened in Matlab compute_coil_map_3D() ...");
+        return;
+    }
+
+    return;
+}
diff --git a/apps/standalone/cpu/gtplus/Matlab_gt_read_analyze.cpp b/apps/standalone/cpu/gtplus/Matlab_gt_read_analyze.cpp
new file mode 100644
index 0000000..aaf4873
--- /dev/null
+++ b/apps/standalone/cpu/gtplus/Matlab_gt_read_analyze.cpp
@@ -0,0 +1,260 @@
+
+#include <matrix.h>
+#include <mat.h>
+#ifdef _WIN32
+    #include <mexGT.h>
+#else
+    #include <mex.h>
+#endif // _WIN32
+
+// Gadgetron includes
+#include "gtMatlab.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "hoNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "hoNDPoint.h"
+#include "hoNDImage.h"
+
+#include "gtMatlabConverter.h"
+#include "gtMatlabConverterComplex.h"
+
+#define MEXPRINTF(name) mexPrintf(#name);
+
+#define NIn 1
+#define NOut 2
+
+static void usage()
+{
+    using namespace std;
+    std::stringstream outs;
+
+    outs << "==============================================================================================" << endl;
+    outs << "Usage: Matlab_gt_read_analyze \n";
+    outs << "Read in the Gadgetron produced Analyze image file" << endl;
+    outs << "Support 2D/3D/4D/5D/6D images with short/float/double data types" << endl;
+    printAuthorInfo(outs);
+    outs << "1 Input paras:" << endl;
+    outs << '\t' << "filename   : file name of the analyze image, no .hdr or .img extension needed" << endl;
+
+    outs << "2 Output para:" << endl;
+    outs << '\t' << "data       : image data" << endl;
+    outs << '\t' << "header     : image header" << endl;
+    outs << "==============================================================================================" << endl;
+    outs << std::ends; 
+
+    std::string msg = outs.str();
+    mexPrintf("%s\n", msg.c_str() );
+}
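+
+// Hypothetical MATLAB call ('my_image' is a placeholder Analyze file name without extension):
+//   [data, header] = Matlab_gt_read_analyze('my_image');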
+
+void mexFunction(int nlhs,mxArray *plhs[],int nrhs,const mxArray *prhs[])
+{
+    try
+    {
+        // ---------------------------------------------------------------
+        // consistency check
+        // ---------------------------------------------------------------    
+        if (nrhs != NIn) 
+        {
+            mexWarnMsgTxt("1 input arguments are required ...");
+            usage();
+            return;
+        }
+
+        if (nlhs < NOut )
+        {
+            mexWarnMsgTxt("2 output argument is required ...");
+            usage();
+            return;
+        }
+
+        using namespace Gadgetron;
+        using namespace Gadgetron::gtPlus;
+
+        Gadgetron::gtMatlabConverter<float> converterFloat;
+        Gadgetron::gtMatlabConverter<double> converterDouble;
+        Gadgetron::gtMatlabConverter<short> converterShort;
+
+        // ---------------------------------------------------------------
+        // input parameters
+        // ---------------------------------------------------------------    
+        // file name
+        std::string filename;
+        converterFloat.Matlab2Str(prhs[0], filename);
+
+        gtPlusIOAnalyze gt_io;
+
+        mxArray* aMx = NULL;
+        mxArray* aHeader = NULL;
+
+        try
+        {
+            hoNDImage<float, 2> data;
+            if ( gt_io.importImage(data, filename) )
+            {
+                converterFloat.hoNDImage2Matlab(data, aMx, aHeader);
+            }
+            else
+            {
+                hoNDImage<double, 2> data;
+                if ( gt_io.importImage(data, filename) )
+                {
+                    converterDouble.hoNDImage2Matlab(data, aMx, aHeader);
+                }
+                else
+                {
+                    hoNDImage<short, 2> data;
+                    if ( gt_io.importImage(data, filename) )
+                    {
+                        converterShort.hoNDImage2Matlab(data, aMx, aHeader);
+                    }
+                    else
+                    {
+                        throw("not 2D ... ");
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            try
+            {
+                hoNDImage<float, 3> data;
+                if ( gt_io.importImage(data, filename) )
+                {
+                    converterFloat.hoNDImage2Matlab(data, aMx, aHeader);
+                }
+                else
+                {
+                    hoNDImage<double, 3> data;
+                    if ( gt_io.importImage(data, filename) )
+                    {
+                        converterDouble.hoNDImage2Matlab(data, aMx, aHeader);
+                    }
+                    else
+                    {
+                        hoNDImage<short, 3> data;
+                        if ( gt_io.importImage(data, filename) )
+                        {
+                            converterShort.hoNDImage2Matlab(data, aMx, aHeader);
+                        }
+                        else
+                        {
+                            throw("not 3D ... ");
+                        }
+                    }
+                }
+            }
+            catch(...)
+            {
+                try
+                {
+                    hoNDImage<float, 4> data;
+                    if ( gt_io.importImage(data, filename) )
+                    {
+                        converterFloat.hoNDImage2Matlab(data, aMx, aHeader);
+                    }
+                    else
+                    {
+                        hoNDImage<double, 4> data;
+                        if ( gt_io.importImage(data, filename) )
+                        {
+                            converterDouble.hoNDImage2Matlab(data, aMx, aHeader);
+                        }
+                        else
+                        {
+                            hoNDImage<short, 4> data;
+                            if ( gt_io.importImage(data, filename) )
+                            {
+                                converterShort.hoNDImage2Matlab(data, aMx, aHeader);
+                            }
+                            else
+                            {
+                                throw("not 4D ... ");
+                            }
+                        }
+                    }
+                }
+                catch(...)
+                {
+                    try
+                    {
+                        hoNDImage<float, 5> data;
+                        if ( gt_io.importImage(data, filename) )
+                        {
+                            converterFloat.hoNDImage2Matlab(data, aMx, aHeader);
+                        }
+                        else
+                        {
+                            hoNDImage<double, 5> data;
+                            if ( gt_io.importImage(data, filename) )
+                            {
+                                converterDouble.hoNDImage2Matlab(data, aMx, aHeader);
+                            }
+                            else
+                            {
+                                hoNDImage<short, 5> data;
+                                if ( gt_io.importImage(data, filename) )
+                                {
+                                    converterShort.hoNDImage2Matlab(data, aMx, aHeader);
+                                }
+                                else
+                                {
+                                    throw("not 5D ... ");
+                                }
+                            }
+                        }
+                    }
+                    catch(...)
+                    {
+                        try
+                        {
+                            hoNDImage<float, 6> data;
+                            if ( gt_io.importImage(data, filename) )
+                            {
+                                converterFloat.hoNDImage2Matlab(data, aMx, aHeader);
+                            }
+                            else
+                            {
+                                hoNDImage<double, 6> data;
+                                if ( gt_io.importImage(data, filename) )
+                                {
+                                    converterDouble.hoNDImage2Matlab(data, aMx, aHeader);
+                                }
+                                else
+                                {
+                                    hoNDImage<short, 6> data;
+                                    if ( gt_io.importImage(data, filename) )
+                                    {
+                                        converterShort.hoNDImage2Matlab(data, aMx, aHeader);
+                                    }
+                                    else
+                                    {
+                                        throw("not 6D ... ");
+                                    }
+                                }
+                            }
+                        }
+                        catch(...)
+                        {
+                            mexWarnMsgTxt("Images must be 2D/3D/4D/5D/6D ...");
+                            return;
+                        }
+                    }
+                }
+            }
+        }
+
+        // ---------------------------------------------------------------
+        // output parameter
+        // ---------------------------------------------------------------
+        plhs[0] = aMx;
+        plhs[1] = aHeader;
+    }
+    catch(...)
+    {
+        mexWarnMsgTxt("Exceptions happened in Matlab_gt_read_analyze(...) ...");
+        return;
+    }
+
+    return;
+}
diff --git a/apps/standalone/cpu/gtplus/Matlab_gt_write_analyze.cpp b/apps/standalone/cpu/gtplus/Matlab_gt_write_analyze.cpp
new file mode 100644
index 0000000..71b5103
--- /dev/null
+++ b/apps/standalone/cpu/gtplus/Matlab_gt_write_analyze.cpp
@@ -0,0 +1,246 @@
+
+#include <matrix.h>
+#include <mat.h>
+#ifdef _WIN32
+    #include <mexGT.h>
+#else
+    #include <mex.h>
+#endif // _WIN32
+
+// Gadgetron includes
+#include "gtMatlab.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "hoNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "hoNDPoint.h"
+#include "hoNDImage.h"
+
+#include "gtMatlabConverter.h"
+#include "gtMatlabConverterComplex.h"
+
+#define MEXPRINTF(name) mexPrintf(#name);
+
+#define NIn 3
+#define NOut 0
+
+static void usage()
+{
+    using namespace std;
+    std::stringstream outs;
+
+    outs << "==============================================================================================" << endl;
+    outs << "Usage: Matlab_gt_write_analyze \n";
+    outs << "Write out the Gadgetron produced Analyze image file" << endl;
+    outs << "Support 2D/3D/4D/5D/6D images with short/float/double data types" << endl;
+    printAuthorInfo(outs);
+    outs << "3 Input paras:" << endl;
+    outs << '\t' << "data       : image data" << endl;
+    outs << '\t' << "header     : image header" << endl;
+    outs << '\t' << "filename   : file name of the analyze image, no .hdr or .img extension needed" << endl;
+
+    outs << "0 Output para" << endl;
+    outs << "==============================================================================================" << endl;
+    outs << std::ends; 
+
+    std::string msg = outs.str();
+    mexPrintf("%s\n", msg.c_str() );
+}
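+
+// Hypothetical MATLAB call, writing data/header back out ('my_image' is a placeholder name):
+//   Matlab_gt_write_analyze(data, header, 'my_image');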
+
+void mexFunction(int nlhs,mxArray *plhs[],int nrhs,const mxArray *prhs[])
+{
+    try
+    {
+        // ---------------------------------------------------------------
+        // consistency check
+        // ---------------------------------------------------------------    
+        if (nrhs != NIn) 
+        {
+            mexWarnMsgTxt("3 input arguments are required ...");
+            usage();
+            return;
+        }
+
+        using namespace Gadgetron;
+        using namespace Gadgetron::gtPlus;
+
+        Gadgetron::gtMatlabConverter<float> converterFloat;
+        Gadgetron::gtMatlabConverter<double> converterDouble;
+        Gadgetron::gtMatlabConverter<short> converterShort;
+
+        // ---------------------------------------------------------------
+        // input parameters
+        // ---------------------------------------------------------------    
+        gtPlusIOAnalyze gt_io;
+
+        const mxArray* aMx = prhs[0];
+        const mxArray* aHeader = prhs[1];
+
+        std::string filename;
+        converterFloat.Matlab2Str(prhs[2], filename);
+
+        try
+        {
+            hoNDImage<float, 2> data;
+            if ( converterFloat.Matlab2hoNDImage(aMx, aHeader, data) )
+            {
+                gt_io.exportImage(data, filename);
+            }
+            else
+            {
+                hoNDImage<double, 2> data;
+                if ( converterDouble.Matlab2hoNDImage(aMx, aHeader, data) )
+                {
+                    gt_io.exportImage(data, filename);
+                }
+                else
+                {
+                    hoNDImage<short, 2> data;
+                    if ( converterShort.Matlab2hoNDImage(aMx, aHeader, data) )
+                    {
+                        gt_io.exportImage(data, filename);
+                    }
+                    else
+                    {
+                        throw("not 2D ... ");
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            try
+            {
+                hoNDImage<float, 3> data;
+                if ( converterFloat.Matlab2hoNDImage(aMx, aHeader, data) )
+                {
+                    gt_io.exportImage(data, filename);
+                }
+                else
+                {
+                    hoNDImage<double, 3> data;
+                    if ( converterDouble.Matlab2hoNDImage(aMx, aHeader, data) )
+                    {
+                        gt_io.exportImage(data, filename);
+                    }
+                    else
+                    {
+                        hoNDImage<short, 3> data;
+                        if ( converterShort.Matlab2hoNDImage(aMx, aHeader, data) )
+                        {
+                            gt_io.exportImage(data, filename);
+                        }
+                        else
+                        {
+                            throw("not 3D ... ");
+                        }
+                    }
+                }
+            }
+            catch(...)
+            {
+                try
+                {
+                    hoNDImage<float, 4> data;
+                    if ( converterFloat.Matlab2hoNDImage(aMx, aHeader, data) )
+                    {
+                        gt_io.exportImage(data, filename);
+                    }
+                    else
+                    {
+                        hoNDImage<double, 4> data;
+                        if ( converterDouble.Matlab2hoNDImage(aMx, aHeader, data) )
+                        {
+                            gt_io.exportImage(data, filename);
+                        }
+                        else
+                        {
+                            hoNDImage<short, 4> data;
+                            if ( converterShort.Matlab2hoNDImage(aMx, aHeader, data) )
+                            {
+                                gt_io.exportImage(data, filename);
+                            }
+                            else
+                            {
+                                throw("not 4D ... ");
+                            }
+                        }
+                    }
+                }
+                catch(...)
+                {
+                    try
+                    {
+                        hoNDImage<float, 5> data;
+                        if ( converterFloat.Matlab2hoNDImage(aMx, aHeader, data) )
+                        {
+                            gt_io.exportImage(data, filename);
+                        }
+                        else
+                        {
+                            hoNDImage<double, 5> data;
+                            if ( converterDouble.Matlab2hoNDImage(aMx, aHeader, data) )
+                            {
+                                gt_io.exportImage(data, filename);
+                            }
+                            else
+                            {
+                                hoNDImage<short, 5> data;
+                                if ( converterShort.Matlab2hoNDImage(aMx, aHeader, data) )
+                                {
+                                    gt_io.exportImage(data, filename);
+                                }
+                                else
+                                {
+                                    throw("not 5D ... ");
+                                }
+                            }
+                        }
+                    }
+                    catch(...)
+                    {
+                        try
+                        {
+                            hoNDImage<float, 6> data;
+                            if ( converterFloat.Matlab2hoNDImage(aMx, aHeader, data) )
+                            {
+                                gt_io.exportImage(data, filename);
+                            }
+                            else
+                            {
+                                hoNDImage<double, 6> data;
+                                if ( converterDouble.Matlab2hoNDImage(aMx, aHeader, data) )
+                                {
+                                    gt_io.exportImage(data, filename);
+                                }
+                                else
+                                {
+                                    hoNDImage<short, 6> data;
+                                    if ( converterShort.Matlab2hoNDImage(aMx, aHeader, data) )
+                                    {
+                                        gt_io.exportImage(data, filename);
+                                    }
+                                    else
+                                    {
+                                        throw("not 6D ... ");
+                                    }
+                                }
+                            }
+                        }
+                        catch(...)
+                        {
+                            mexWarnMsgTxt("Images must be 2D/3D/4D/5D/6D ...");
+                            return;
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        mexWarnMsgTxt("Exceptions happened in Matlab_gt_write_analyze(...) ...");
+        return;
+    }
+
+    return;
+}
diff --git a/apps/standalone/cpu/registration/2d/CMakeLists.txt b/apps/standalone/cpu/registration/2d/CMakeLists.txt
new file mode 100644
index 0000000..f385828
--- /dev/null
+++ b/apps/standalone/cpu/registration/2d/CMakeLists.txt
@@ -0,0 +1,46 @@
+include_directories( 
+        ${CMAKE_SOURCE_DIR}/toolboxes/fft/cpu
+        ${ACE_INCLUDE_DIR}
+        ${ISMRMRD_INCLUDE_DIR}
+    )
+
+add_executable(register_HS_2d_cpu register_HS_2d.cpp)
+add_executable(register_CK_2d_cpu register_CK_2d.cpp)
+
+target_link_libraries(register_HS_2d_cpu 
+  gadgetron_toolbox_hostutils 
+  gadgetron_toolbox_cpureg 
+  gadgetron_toolbox_cpucore 
+  gadgetron_toolbox_cpucore_math
+  ${ARMADILLO_LIBRARIES}
+  )
+
+target_link_libraries(register_CK_2d_cpu 
+  gadgetron_toolbox_hostutils 
+  gadgetron_toolbox_cpureg 
+  gadgetron_toolbox_cpucore
+  gadgetron_toolbox_cpucore_math
+  ${ARMADILLO_LIBRARIES}
+  )
+
+install(TARGETS 
+  register_HS_2d_cpu
+  register_CK_2d_cpu 
+  DESTINATION bin COMPONENT main)
+
+# matlab wrapper
+if (MATLAB_FOUND)
+  message("Matlab found> ${MATLAB_INCLUDE_DIR}. Matlab registration wrapper is being compiled.")
+  SET(CMAKE_DEBUG_POSTFIX)
+  if (WIN32)
+    include_directories( ${MATLAB_INCLUDE_DIR} )
+    add_library(Matlab_register_CK_2d_cpu SHARED Matlab_register_CK_2d.cpp)
+    target_link_libraries(Matlab_register_CK_2d_cpu ${MATLAB_LIBRARIES} gadgetron_toolbox_hostutils gadgetron_toolbox_cpureg gadgetron_toolbox_cpucore gadgetron_toolbox_cpucore_math)
+    if ( HAS_64_BIT )				
+      SET_TARGET_PROPERTIES(Matlab_register_CK_2d_cpu PROPERTIES SUFFIX .mexw64)
+    endif ( HAS_64_BIT )    
+    install(TARGETS Matlab_register_CK_2d_cpu DESTINATION ${GADGETRON_INSTALL_MATLAB_PATH} COMPONENT main)
+  endif (WIN32)
+else(MATLAB_FOUND)
+  message("Matlab not found. Matlab wrapper for registration toolbox will not be compiled.")
+endif(MATLAB_FOUND)
diff --git a/apps/standalone/cpu/registration/2d/Matlab_register_CK_2d.cpp b/apps/standalone/cpu/registration/2d/Matlab_register_CK_2d.cpp
new file mode 100644
index 0000000..65e1f93
--- /dev/null
+++ b/apps/standalone/cpu/registration/2d/Matlab_register_CK_2d.cpp
@@ -0,0 +1,197 @@
+
+#include <matrix.h>
+#include <mat.h>
+#include <mexGT.h>
+#include <cmath>
+#include <vector>
+#include <iostream>
+#include <sstream>
+
+// Gadgetron includes
+#include "hoCKOpticalFlowSolver.h"
+#include "hoLinearResampleOperator.h"
+#include "hoNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "GadgetronTimer.h"
+#include "parameterparser.h"
+
+#define MEXPRINTF(name) mexPrintf(#name);
+
+static void usage()
+{
+    using namespace std;
+    std::ostringstream outs;
+
+    outs << "==============================================================================================" << endl;
+    outs << "Usage: register_CK_2d \n";
+    outs << "5 Input paras:" << endl;
+    outs << '\t' << "target     : Nfe*Npe, 2D array, target (fixed) image, in double" << endl;
+    outs << '\t' << "source     : Nfe*Npe, 2D array, source (moving) image, in double" << endl;
+    outs << '\t' << "alpha      : regularization parameter, alpha" << endl;
+    outs << '\t' << "beta       : regularization parameter, beta" << endl;
+    outs << '\t' << "level      : number of resolution levels" << endl;
+
+    outs << "2 Output para:" << endl;
+    outs << '\t' << "dx         : deformation field, along 1st dimension" << endl;
+    outs << '\t' << "dy         : deformation field, along 2nd dimension" << endl;
+    outs << "==============================================================================================" << endl;
+
+    mexPrintf("%s\n", outs.str().c_str() );
+}
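+
+// Illustrative MATLAB call of this MEX function (input values are examples only;
+// the MEX target is named Matlab_register_CK_2d_cpu in the accompanying CMakeLists.txt):
+//   [dx, dy] = Matlab_register_CK_2d_cpu(double(target), double(source), 0.05, 1.0, 4);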
+
+void mexFunction(int nlhs,mxArray *plhs[],int nrhs,const mxArray *prhs[])
+{
+    try
+    {
+        // ---------------------------------------------------------------
+        // consistency check
+        // ---------------------------------------------------------------    
+        if (nrhs != 5) 
+        {
+            mexWarnMsgTxt("5 input arguments are required ...");
+            usage();
+            return;
+        }
+
+        if (nlhs < 2 )
+        {
+            mexWarnMsgTxt("2 output argument is required ...");
+            usage();
+            return;
+        }
+
+        Gadgetron::GadgetronTimer timer("Running registration");
+
+        // ---------------------------------------------------------------
+        // input parameters
+        // ---------------------------------------------------------------    
+        // target
+        if ( !mxIsDouble(prhs[0]) )
+        {
+            mexWarnMsgTxt("The first input parameter should be a double array ...");
+            return;
+        }
+
+        if ( !mxIsDouble(prhs[1]) )
+        {
+            mexWarnMsgTxt("The second input parameter should be a double array ...");
+            return;
+        }
+
+        // for the image
+        mwSize nDim = mxGetNumberOfDimensions(prhs[0]);
+        if ( nDim!=2 )
+        {
+            mexWarnMsgTxt("1st array is not a 2D array");
+            return;
+        }
+
+        nDim = mxGetNumberOfDimensions(prhs[1]);
+        if ( nDim!=2 )
+        {
+            mexWarnMsgTxt("2nd array is not a 2D array");
+            return;
+        }
+
+        const mwSize* dims = mxGetDimensions(prhs[0]);
+        int numOfPixels = dims[0]*dims[1];
+
+        const mwSize* dims2 = mxGetDimensions(prhs[1]);
+        if ( dims[0]!=dims2[0] || dims[1]!=dims2[1] )
+        {
+            mexWarnMsgTxt("Input arrays have different size ... ");
+            return;
+        }
+
+        double* ptrTarget = static_cast<double*>(mxGetData(prhs[0]));
+        double* ptrSource = static_cast<double*>(mxGetData(prhs[1]));
+
+        // alpha
+        double alpha = mxGetScalar(prhs[2]);
+
+        // beta
+        double beta = mxGetScalar(prhs[3]);
+
+        // level
+        int level = (int)(mxGetScalar(prhs[4]));
+
+        // ---------------------------------------------------------------
+        // perform the registration
+        // ---------------------------------------------------------------
+        // allocate the results
+        mxArray* Dx = mxCreateNumericArray(nDim, dims, mxDOUBLE_CLASS, mxREAL);
+        if ( Dx == NULL )
+        {
+            mexWarnMsgTxt("Dx == NULL");
+            return;
+        }
+
+        mxArray* Dy = mxCreateNumericArray(nDim, dims, mxDOUBLE_CLASS, mxREAL);
+        if ( Dy == NULL )
+        {
+            mexWarnMsgTxt("Dy == NULL");
+            return;
+        }
+
+        double* ptrDx = static_cast<double*>(mxGetData(Dx));
+        double* ptrDy = static_cast<double*>(mxGetData(Dy));
+        memset(ptrDx, 0, sizeof(double)*numOfPixels);
+        memset(ptrDy, 0, sizeof(double)*numOfPixels);
+
+        // allocate the target and source images
+        typedef double _real;
+        using namespace Gadgetron;
+
+        std::vector<size_t> dim_array(2);
+        dim_array[0] = dims[0];
+        dim_array[1] = dims[1];
+
+        boost::shared_ptr< hoNDArray<_real> > fixed_image(new hoNDArray<_real>(&dim_array));
+        memcpy(fixed_image->begin(), ptrTarget, sizeof(_real)*numOfPixels);
+
+        boost::shared_ptr< hoNDArray<_real> > moving_image(new hoNDArray<_real>(&dim_array));
+        memcpy(moving_image->begin(), ptrSource, sizeof(_real)*numOfPixels);
+
+        boost::shared_ptr< hoLinearResampleOperator<_real,2> > R( new hoLinearResampleOperator<_real,2>() );
+
+        // Setup solver
+        hoCKOpticalFlowSolver<_real,2> CK;
+        CK.set_interpolator( R );
+        CK.set_output_mode( hoCKOpticalFlowSolver<_real,2>::OUTPUT_VERBOSE );
+        CK.set_num_multires_levels( level );
+        CK.set_max_num_iterations_per_level( 500 );
+        CK.set_alpha(alpha);
+        CK.set_beta(beta);
+        CK.set_limit(0.01f);
+  
+        // Run registration
+        //
+        boost::shared_ptr< hoNDArray<_real> > result;
+
+        {
+            Gadgetron::GadgetronTimer timer("Running registration - solve");
+            result = CK.solve( fixed_image.get(), moving_image.get() );
+        }
+
+        if( !result.get() )
+        {
+            mexWarnMsgTxt("Registration solver failed. Quitting!");
+            return;
+        }
+
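+        // The solver returns both displacement components in a single array:
+        // the first numOfPixels values are dx, the next numOfPixels are dy.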
+        memcpy(ptrDx, result->begin(), sizeof(_real)*numOfPixels);
+        memcpy(ptrDy, result->begin()+numOfPixels, sizeof(_real)*numOfPixels);
+
+        // ---------------------------------------------------------------
+        // output parameter
+        // ---------------------------------------------------------------
+        plhs[0] = Dx;
+        plhs[1] = Dy;
+   }
+    catch(...)
+    {
+        mexWarnMsgTxt("Exceptions happened in Matlab register_CK_2d() ...");
+        return;
+    }
+
+    return;
+}
diff --git a/apps/standalone/cpu/registration/2d/register_CK_2d.cpp b/apps/standalone/cpu/registration/2d/register_CK_2d.cpp
new file mode 100644
index 0000000..ea129a9
--- /dev/null
+++ b/apps/standalone/cpu/registration/2d/register_CK_2d.cpp
@@ -0,0 +1,121 @@
+/*
+  An example of how to register two 2d images using Cornelius-Kanade optical flow
+*/
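+
+// Illustrative invocation (file names are placeholders):
+//   register_CK_2d_cpu -f fixed.real -m moving.real -r displacement_field.real -a 0.05 -b 1.0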
+
+// Gadgetron includes
+#include "hoCKOpticalFlowSolver.h"
+#include "hoLinearResampleOperator.h"
+#include "hoNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "GadgetronTimer.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace Gadgetron;
+using namespace std;
+
+// Define desired precision
+typedef float _real; 
+
+int main(int argc, char** argv)
+{
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'f', COMMAND_LINE_STRING, 1, "Fixed image file name (.real)", true );
+  parms.add_parameter( 'm', COMMAND_LINE_STRING, 1, "Moving image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "displacement_field.real" );
+  parms.add_parameter( 'a', COMMAND_LINE_FLOAT,  1, "Regularization weight (alpha)", true, "0.05" );
+  parms.add_parameter( 'b', COMMAND_LINE_FLOAT,  1, "Regularization weight (beta)", true, "1.0" );
+  
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running registration with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  // Load sample data from disk
+  //
+  
+  boost::shared_ptr< hoNDArray<_real> > fixed_image = 
+    read_nd_array<_real>((char*)parms.get_parameter('f')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > moving_image = 
+    read_nd_array<_real>((char*)parms.get_parameter('m')->get_string_value());
+  
+  if( !fixed_image.get() || !moving_image.get() ){
+    cout << endl << "One of the input images is not found. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  size_t num_fixed_dims = fixed_image->get_number_of_dimensions();
+  size_t num_moving_dims = moving_image->get_number_of_dimensions();
+
+  if( !(num_fixed_dims == 2 || num_fixed_dims == 3)  ){
+    cout << endl << "The fixed image is not two- or three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  if( !(num_moving_dims == 2 || num_moving_dims == 3)  ){
+    cout << endl << "The moving image is not two- or three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  _real alpha = (_real) parms.get_parameter('a')->get_float_value();
+  _real beta = (_real) parms.get_parameter('b')->get_float_value();
+
+  // Use bilinear interpolation for resampling
+  //
+
+  boost::shared_ptr< hoLinearResampleOperator<_real,2> > R( new hoLinearResampleOperator<_real,2>() );
+
+  // Setup solver
+  //
+  
+  hoCKOpticalFlowSolver<_real,2> CK;
+  CK.set_interpolator( R );
+  CK.set_output_mode( hoCKOpticalFlowSolver<_real,2>::OUTPUT_VERBOSE );  
+  CK.set_num_multires_levels( 4 );
+  CK.set_max_num_iterations_per_level( 500 );
+  CK.set_alpha(alpha);
+  CK.set_beta(beta);
+  CK.set_limit(0.01f);
+  
+  // Run registration
+  //
+
+  boost::shared_ptr< hoNDArray<_real> > result;
+  {
+    GadgetronTimer timer("Running registration");
+    result = CK.solve( fixed_image.get(), moving_image.get() );
+  }
+
+  if( !result.get() ){
+    cout << endl << "Registration solver failed. Quitting!\n" << endl;
+    return 1;
+  }
+
+  boost::shared_ptr< hoNDArray<_real> > deformed_moving;
+  {
+    GadgetronTimer timer("Applying deformation");
+    deformed_moving = CK.deform( moving_image.get(), result );
+  }
+  
+  // All done, write out the result
+  //
+
+  write_nd_array<_real>(result.get(), (char*)parms.get_parameter('r')->get_string_value());
+  write_nd_array<_real>(deformed_moving.get(), "def_moving.real" );
+  
+  return 0;
+}
diff --git a/apps/standalone/cpu/registration/2d/register_HS_2d.cpp b/apps/standalone/cpu/registration/2d/register_HS_2d.cpp
new file mode 100644
index 0000000..057bbbc
--- /dev/null
+++ b/apps/standalone/cpu/registration/2d/register_HS_2d.cpp
@@ -0,0 +1,110 @@
+/*
+  An example of how to register two 2d images using Horn-Schunck optical flow
+*/
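+
+// Illustrative invocation (file names are placeholders):
+//   register_HS_2d_cpu -f fixed.real -m moving.real -r displacement_field.real -a 0.1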
+
+// Gadgetron includes
+#include "hoHSOpticalFlowSolver.h"
+#include "hoLinearResampleOperator.h"
+#include "hoNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace Gadgetron;
+using namespace std;
+
+// Define desired precision
+typedef float _real; 
+
+int main(int argc, char** argv)
+{
+
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'f', COMMAND_LINE_STRING, 1, "Fixed image file name (.real)", true );
+  parms.add_parameter( 'm', COMMAND_LINE_STRING, 1, "Moving image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "displacement_field.real" );
+  parms.add_parameter( 'a', COMMAND_LINE_FLOAT,  1, "Regularization weight (alpha)", true, "0.1" );
+  
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running registration with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  // Load sample data from disk
+  //
+  
+  boost::shared_ptr< hoNDArray<_real> > fixed_image = 
+    read_nd_array<_real>((char*)parms.get_parameter('f')->get_string_value());
+  
+  boost::shared_ptr< hoNDArray<_real> > moving_image = 
+    read_nd_array<_real>((char*)parms.get_parameter('m')->get_string_value());
+  
+  if( !moving_image.get() || !fixed_image.get() ){
+    cout << endl << "One of the input images is not found. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  size_t num_fixed_dims = fixed_image->get_number_of_dimensions();
+  size_t num_moving_dims = moving_image->get_number_of_dimensions();
+
+  if( !(num_fixed_dims == 2 || num_fixed_dims == 3)  ){
+    cout << endl << "The fixed image is not two- or three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  if( !(num_moving_dims == 2 || num_moving_dims == 3)  ){
+    cout << endl << "The moving image is not two- or three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+    
+  _real alpha = (_real) parms.get_parameter('a')->get_float_value();
+
+  // Use bilinear interpolation for resampling
+  //
+
+  boost::shared_ptr< hoLinearResampleOperator<_real,2> > R( new hoLinearResampleOperator<_real,2>() );
+
+  // Setup solver
+  //
+  
+  hoHSOpticalFlowSolver<_real,2> HS;
+  HS.set_interpolator( R );
+  HS.set_output_mode( hoHSOpticalFlowSolver<_real,2>::OUTPUT_VERBOSE );  
+  HS.set_num_multires_levels( 4 );
+  HS.set_max_num_iterations_per_level( 500 );
+  HS.set_alpha(alpha);
+  HS.set_limit(0.01f);
+  
+  // Run registration
+  //
+
+  boost::shared_ptr< hoNDArray<_real> > result = HS.solve( fixed_image.get(), moving_image.get() );
+
+  if( !result.get() ){
+    cout << endl << "Registration solver failed. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  boost::shared_ptr< hoNDArray<_real> > deformed_moving = HS.deform( moving_image.get(), result );
+  
+  // All done, write out the result
+  //
+
+  write_nd_array<_real>(result.get(), (char*)parms.get_parameter('r')->get_string_value());
+  write_nd_array<_real>(deformed_moving.get(), "def_moving.real" );
+  
+  return 0;
+}
diff --git a/apps/standalone/cpu/registration/3d/CMakeLists.txt b/apps/standalone/cpu/registration/3d/CMakeLists.txt
new file mode 100644
index 0000000..fe3e261
--- /dev/null
+++ b/apps/standalone/cpu/registration/3d/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_executable(register_CK_3d_cpu register_CK_3d.cpp)
+
+target_link_libraries(register_CK_3d_cpu
+  gadgetron_toolbox_hostutils 
+  gadgetron_toolbox_cpureg 
+  gadgetron_toolbox_cpucore 
+  gadgetron_toolbox_cpucore_math
+  ${ARMADILLO_LIBRARIES}
+  )
+
+install(TARGETS register_CK_3d_cpu DESTINATION bin COMPONENT main)
diff --git a/apps/standalone/cpu/registration/3d/register_CK_3d.cpp b/apps/standalone/cpu/registration/3d/register_CK_3d.cpp
new file mode 100644
index 0000000..a9bd14a
--- /dev/null
+++ b/apps/standalone/cpu/registration/3d/register_CK_3d.cpp
@@ -0,0 +1,115 @@
+/*
+  An example of how to register two 3d volumes using Cornelius-Kanade optical flow
+*/
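+
+// Illustrative invocation (file names are placeholders):
+//   register_CK_3d_cpu -f fixed.real -m moving.real -r displacement_field.real -a 0.05 -b 1.0 -l 3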
+
+// Gadgetron includes
+#include "hoCKOpticalFlowSolver.h"
+#include "hoLinearResampleOperator.h"
+#include "hoNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace Gadgetron;
+using namespace std;
+
+// Define desired precision
+typedef float _real; 
+
+int main(int argc, char** argv)
+{
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'f', COMMAND_LINE_STRING, 1, "Fixed image file name (.real)", true );
+  parms.add_parameter( 'm', COMMAND_LINE_STRING, 1, "Moving image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "displacement_field.real" );
+  parms.add_parameter( 'a', COMMAND_LINE_FLOAT,  1, "Regularization weight (alpha)", true, "0.05" );
+  parms.add_parameter( 'b', COMMAND_LINE_FLOAT,  1, "Regularization weight (beta)", true, "1.0" );
+  parms.add_parameter( 'l', COMMAND_LINE_INT,    1, "Number of multiresolution levels", true, "3" );
+  
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running registration with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  // Load sample data from disk
+  //
+  
+  boost::shared_ptr< hoNDArray<_real> > fixed_image = 
+    read_nd_array<_real>((char*)parms.get_parameter('f')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > moving_image = 
+    read_nd_array<_real>((char*)parms.get_parameter('m')->get_string_value());
+  
+  if( !fixed_image.get() || !moving_image.get() ){
+    cout << endl << "One of the input images is not found. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  size_t num_fixed_dims = fixed_image->get_number_of_dimensions();
+  size_t num_moving_dims = moving_image->get_number_of_dimensions();
+
+  if( !(num_fixed_dims == 3 || num_fixed_dims == 4)  ){
+    cout << endl << "The fixed image is not three- or four-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  if( !(num_moving_dims == 3 || num_moving_dims == 4)  ){
+    cout << endl << "The moving image is not three- or four-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  _real alpha = (_real) parms.get_parameter('a')->get_float_value();
+  _real beta = (_real) parms.get_parameter('b')->get_float_value();
+
+  unsigned int multires_levels = parms.get_parameter('l')->get_int_value();
+
+  // Use trilinear interpolation for resampling
+  //
+
+  boost::shared_ptr< hoLinearResampleOperator<_real,3> > R( new hoLinearResampleOperator<_real,3>() );
+
+  // Setup solver
+  //
+  
+  hoCKOpticalFlowSolver<_real,3> CK;
+  CK.set_interpolator( R );
+  CK.set_output_mode( hoCKOpticalFlowSolver<_real,3>::OUTPUT_VERBOSE );  
+  CK.set_max_num_iterations_per_level( 500 );
+  CK.set_num_multires_levels( multires_levels );
+  CK.set_alpha(alpha);
+  CK.set_beta(beta);
+  CK.set_limit(0.01f);
+  
+  // Run registration
+  //
+
+  boost::shared_ptr< hoNDArray<_real> > result = CK.solve( fixed_image.get(), moving_image.get() );
+
+  if( !result.get() ){
+    cout << endl << "Registration solver failed. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  boost::shared_ptr< hoNDArray<_real> > deformed_moving = CK.deform( moving_image.get(), result );
+  
+  // All done, write out the result
+  //
+
+  write_nd_array<_real>(result.get(), (char*)parms.get_parameter('r')->get_string_value());
+  write_nd_array<_real>(deformed_moving.get(), "def_moving.real" );
+  
+  return 0;
+}
diff --git a/apps/standalone/cpu/registration/CMakeLists.txt b/apps/standalone/cpu/registration/CMakeLists.txt
new file mode 100644
index 0000000..89f0e6e
--- /dev/null
+++ b/apps/standalone/cpu/registration/CMakeLists.txt
@@ -0,0 +1,12 @@
+include_directories(
+  ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow
+  ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/fft/cpu
+  ${ACE_INCLUDE_DIR}
+  ${ISMRMRD_INCLUDE_DIR}
+  )
+
+if(${ARMADILLO_VERSION_STRING} VERSION_GREATER "3.819" )
+add_subdirectory(2d)
+add_subdirectory(3d)
+endif(${ARMADILLO_VERSION_STRING} VERSION_GREATER "3.819" )
diff --git a/apps/standalone/gpu/CMakeLists.txt b/apps/standalone/gpu/CMakeLists.txt
new file mode 100644
index 0000000..6cb2cf3
--- /dev/null
+++ b/apps/standalone/gpu/CMakeLists.txt
@@ -0,0 +1,22 @@
+include_directories( 
+	${CMAKE_SOURCE_DIR}/toolboxes/core 
+	${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+	${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+	${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+	${CMAKE_SOURCE_DIR}/toolboxes/core/gpu 
+    ${CMAKE_SOURCE_DIR}/toolboxes/fft/gpu
+	${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+	${CMAKE_SOURCE_DIR}/toolboxes/operators
+	${CMAKE_SOURCE_DIR}/toolboxes/operators/gpu
+	${CMAKE_SOURCE_DIR}/toolboxes/solvers
+	${CMAKE_SOURCE_DIR}/toolboxes/solvers/gpu
+	${CUDA_INCLUDE_DIRS}
+	${Boost_INCLUDE_DIR} 
+	${ISMRMRD_INCLUDE_DIR} 
+	)
+
+add_subdirectory(mri)
+add_subdirectory(ct)
+add_subdirectory(denoising)
+add_subdirectory(deblurring)
+add_subdirectory(registration)
diff --git a/apps/standalone/gpu/ct/CMakeLists.txt b/apps/standalone/gpu/ct/CMakeLists.txt
new file mode 100644
index 0000000..77738f0
--- /dev/null
+++ b/apps/standalone/gpu/ct/CMakeLists.txt
@@ -0,0 +1,3 @@
+if(HDF5_FOUND)
+  add_subdirectory(xray)
+endif()
diff --git a/apps/standalone/gpu/ct/xray/CBCT_forwards_projection.cpp b/apps/standalone/gpu/ct/xray/CBCT_forwards_projection.cpp
new file mode 100644
index 0000000..80abb17
--- /dev/null
+++ b/apps/standalone/gpu/ct/xray/CBCT_forwards_projection.cpp
@@ -0,0 +1,252 @@
+#include "parameterparser.h"
+#include "CBCT_acquisition.h"
+#include "CBCT_binning.h"
+#include "hoCuConebeamProjectionOperator.h"
+#include "hoNDArray_fileio.h"
+#include "vector_td_utilities.h"
+#include "GPUTimer.h"
+#include "setup_grid.h"
+#include "cuNDArray_utils.h"
+
+#include <iostream>
+#include <algorithm>
+#include <sstream>
+
+using namespace Gadgetron;
+using namespace std;
+
+// Utility to load offsets - if file is provided - from an HDF5 file
+//
+
+std::vector<floatd2> 
+get_offsets( std::string filename )
+{  
+  hsize_t dim;
+  
+  hid_t file_id = H5Fopen (filename.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT);  
+  
+  herr_t errCode = H5LTget_dataset_info(file_id,"/offsetx",&dim,NULL,NULL);
+  if (errCode < 0) 
+    throw std::runtime_error("Error getting /offsetx dataset info from file.");
+
+  std::vector<float> offsets_x = std::vector<float>(dim,0.0f);
+  errCode=H5LTread_dataset (file_id, "/offsetx", H5T_NATIVE_FLOAT, &offsets_x[0]);
+  if (errCode < 0)
+    throw std::runtime_error("Error reading /offsetx from file.");
+  
+  errCode=H5LTget_dataset_info(file_id,"/offsety",&dim,NULL,NULL);
+  if (errCode < 0)
+    throw std::runtime_error("Error getting /offsety dataset info from file.");
+
+  std::vector<float> offsets_y = std::vector<float>(dim,0.0f);
+
+  errCode = H5LTread_dataset (file_id, "/offsety", H5T_NATIVE_FLOAT, &offsets_y[0]);
+  if (errCode < 0)
+    throw std::runtime_error("Error reading /offsety from file.");
+  
+  if( offsets_x.size() != offsets_y.size() ){
+    throw std::runtime_error("CBCT_geometry::load : x/y offset arrays has different lengths");
+  }
+
+  std::vector<floatd2> res;
+  for( unsigned int i=0; i<offsets_x.size(); i++ )
+    res.push_back(floatd2( offsets_x[i], offsets_y[i]));
+  
+  return res;
+}
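+
+// The offsets file is expected to hold two float datasets, /offsetx and /offsety,
+// of equal length (one x/y detector offset per projection).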
+
+int main(int argc, char** argv) 
+{ 
+  // Parse command line
+  //
+
+  ParameterParser parms(1024);
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Input volume filename (.real)", true );
+  parms.add_parameter( 'b', COMMAND_LINE_STRING, 1, "Binning filename (.h5)", false );
+  parms.add_parameter( 'o', COMMAND_LINE_STRING, 1, "Offsets filename (.h5)", false );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Output projections filename (.real)", true, "projections_simulated.real" );
+  parms.add_parameter( 'h', COMMAND_LINE_STRING, 1, "Output acquisition filename (.h5)", true, "acquisition_simulated.h5" );
+  parms.add_parameter( 'f', COMMAND_LINE_FLOAT, 3, "Input volume FOV in mm (3d)", true, "448, 448, 252" );
+  parms.add_parameter( 'p', COMMAND_LINE_FLOAT, 2, "Projection plate size in pixels (2d)", true, "512, 256" );
+  parms.add_parameter( 'q', COMMAND_LINE_FLOAT, 2, "Projection plate FOV in mm (2d)", true, "800.0, 400.0" );
+  parms.add_parameter( 'a', COMMAND_LINE_FLOAT, 1, "SAD", true, "1000.0" );
+  parms.add_parameter( 's', COMMAND_LINE_FLOAT, 1, "SDD", true, "1500.0" );
+  parms.add_parameter( 'u', COMMAND_LINE_FLOAT, 1, "Initial angle (degrees)", true, "0.0" );
+  parms.add_parameter( 'v', COMMAND_LINE_FLOAT, 1, "Angular spacing (degrees)", true, "0.5" );
+  parms.add_parameter( 'w', COMMAND_LINE_INT, 1, "Number of projections", true, "720" );
+  parms.add_parameter( 'P', COMMAND_LINE_INT, 1, "Projections per batch", false );
+  parms.add_parameter( 'S', COMMAND_LINE_FLOAT, 1, "Samples per pixel (float) in integral", false );
+  
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ) {
+    parms.print_parameter_list();
+  }
+  else{
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  std::string image_filename = (char*)parms.get_parameter('d')->get_string_value();
+  std::string projections_filename = (char*)parms.get_parameter('r')->get_string_value();
+  std::string acquisition_filename = (char*)parms.get_parameter('h')->get_string_value();
+  
+  // Load volume
+  //
+  
+  boost::shared_ptr< hoCuNDArray<float> > image(new hoCuNDArray<float>(*read_nd_array<float>( image_filename.c_str() )));
+  
+  if( image->get_number_of_dimensions() < 3 ){
+    std::cout << "Input image volume should have at least three dimensions" << std::endl;
+    exit(1);
+  }
+
+  // Add default temporal dimension of 1 since the operator only takes four-dimensional images
+  //
+
+  if( image->get_number_of_dimensions() == 3 ){
+    std::vector<size_t> dims = *image->get_dimensions();
+    dims.push_back(1);
+    image->reshape(&dims);
+  }
+
+  // Configuring...
+  //
+
+  uintd2 ps_dims_in_pixels( parms.get_parameter('p')->get_float_value(0), 
+			    parms.get_parameter('p')->get_float_value(1) );
+  
+  floatd2 ps_dims_in_mm( parms.get_parameter('q')->get_float_value(0),
+			 parms.get_parameter('q')->get_float_value(1) );
+
+  float SAD = parms.get_parameter('a')->get_float_value();
+  float SDD = parms.get_parameter('s')->get_float_value();
+
+  uintd3 is_dims_in_pixels ( image->get_size(0),
+			     image->get_size(1),
+			     image->get_size(2) );
+  
+  floatd3 is_dims_in_mm( parms.get_parameter('f')->get_float_value(0), 
+			 parms.get_parameter('f')->get_float_value(1), 
+			 parms.get_parameter('f')->get_float_value(2) );
+
+  float start_angle = parms.get_parameter('u')->get_float_value();
+  float angular_spacing = parms.get_parameter('v')->get_float_value();
+
+  unsigned int number_of_projections = parms.get_parameter('w')->get_int_value();
+
+  // Load or generate binning data
+  //
+  
+  boost::shared_ptr<CBCT_binning> binning( new CBCT_binning() );
+
+  if (parms.get_parameter('b')->get_is_set()){
+    std::string binningdata_filename = (char*)parms.get_parameter('b')->get_string_value();
+    std::cout << "Using binning data file: " << binningdata_filename << std::endl;
+    binning->load(binningdata_filename);
+
+    if( binning->get_maximum_projection_index() >= number_of_projections ) {
+      std::cout << "Maximum projection index in binning file (" << 
+	binning->get_maximum_projection_index() << 
+	") exceeds the number of projections requested at the command line (" << 
+	number_of_projections <<
+	")" << std::endl;
+      exit(1);
+    }
+  } 
+  else 
+    binning->set_as_default_3d_bin(number_of_projections);
+
+  binning->print();
+  
+  // Create projection angles array
+  //
+  
+  std::vector<float> angles;
+
+  for( unsigned int i=0; i<number_of_projections; i++ ){
+    float angle = start_angle + i*angular_spacing;
+    angles.push_back(angle);
+  }
+  
+  // Create projection offsets array
+  //
+
+  std::vector<floatd2> offsets;
+
+  if (parms.get_parameter('o')->get_is_set()){
+    std::string offsets_filename = (char*)parms.get_parameter('o')->get_string_value();
+    std::cout << "Using offsets file: " << offsets_filename << std::endl;
+    offsets = get_offsets(offsets_filename);
+  } 
+  else{
+    for( unsigned int i=0; i<number_of_projections; i++ ){
+      offsets.push_back(floatd2(0.0f));
+    } 
+  }   
+  
+  // Allocate and clear array to hold the result
+  //
+  
+  std::vector<size_t> ps_dims;
+  ps_dims.push_back(ps_dims_in_pixels[0]);
+  ps_dims.push_back(ps_dims_in_pixels[1]);
+  ps_dims.push_back(number_of_projections);
+
+  boost::shared_ptr< hoCuNDArray<float> > projections( new hoCuNDArray<float>(&ps_dims) );
+  clear(projections.get()); // Since the binning might not write to all projections
+
+  // Create geometry setup
+  //
+
+  boost::shared_ptr<CBCT_geometry> geometry( new CBCT_geometry() );
+  geometry->set_SAD(SAD);
+  geometry->set_SDD(SDD);
+  geometry->set_FOV(ps_dims_in_mm);
+  geometry->set_angles(angles);
+  geometry->set_offsets(offsets);
+
+  // Create acquisition setup
+  //
+
+  boost::shared_ptr<CBCT_acquisition> acquisition( new CBCT_acquisition() );
+  acquisition->set_geometry(geometry);
+  acquisition->set_projections(projections);
+
+  // Define conebeam projection operator
+  // - and configure based on input parameters
+  //
+
+  boost::shared_ptr< hoCuConebeamProjectionOperator > E( new hoCuConebeamProjectionOperator() );
+  
+  CommandLineParameter *parm = parms.get_parameter('P');
+  if( parm && parm->get_is_set() )
+    E->set_num_projections_per_batch( parm->get_int_value() );
+  
+  parm = parms.get_parameter('S');  
+  if( parm && parm->get_is_set() ) 
+    E->set_num_samples_per_pixel( parm->get_float_value() );
+  
+  E->setup( acquisition, binning, is_dims_in_mm );
+
+  // Initialize the device
+  // - just to report more accurate timings
+  //
+
+  cudaThreadSynchronize();
+
+  //
+  // Forwards projection (X-ray image simulation)
+  //
+  
+  {
+    GPUTimer timer("Running CBCT forwards projection");
+    E->mult_M( image.get(), projections.get() );
+    cudaThreadSynchronize();
+  }
+
+  write_nd_array<float>( projections.get(), projections_filename.c_str() );
+  acquisition->save( acquisition_filename );
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/ct/xray/CBCT_reconstruct_CG.cpp b/apps/standalone/gpu/ct/xray/CBCT_reconstruct_CG.cpp
new file mode 100644
index 0000000..b789945
--- /dev/null
+++ b/apps/standalone/gpu/ct/xray/CBCT_reconstruct_CG.cpp
@@ -0,0 +1,204 @@
+#include "hoCuNDArray_utils.h"
+#include "radial_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDArray.h"
+#include "imageOperator.h"
+#include "identityOperator.h"
+#include "hoPartialDerivativeOperator.h"
+#include "hoCuConebeamProjectionOperator.h"
+#include "cuConvolutionOperator.h"
+#include "hoCuIdentityOperator.h"
+#include "hoCuNDArray_math.h"
+#include "hoCuNDArray_blas.h"
+#include "hoCuCgSolver.h"
+#include "CBCT_acquisition.h"
+#include "complext.h"
+#include "encodingOperatorContainer.h"
+#include "vector_td_io.h"
+#include "hoCuPartialDerivativeOperator.h"
+#include "GPUTimer.h"
+
+#include <iostream>
+#include <algorithm>
+#include <sstream>
+#include <math_constants.h>
+#include <boost/program_options.hpp>
+
+using namespace std;
+using namespace Gadgetron;
+
+namespace po = boost::program_options;
+
+int 
+main(int argc, char** argv)
+{
+  string acquisition_filename;
+  string outputFile;
+  uintd3 imageSize;
+  floatd3 voxelSize;
+  float reg_weight;
+  int device;
+  unsigned int dump;
+  unsigned int downsamples;
+  unsigned int iterations;
+
+  po::options_description desc("Allowed options");
+  desc.add_options()
+    ("help", "produce help message")
+    ("acquisition,a", po::value<string>(&acquisition_filename)->default_value("acquisition.hdf5"), "Acquisition data")
+    ("samples,n",po::value<unsigned int>(),"Number of samples per ray")
+    ("output,f", po::value<string>(&outputFile)->default_value("reconstruction.real"), "Output filename")
+    ("size,s",po::value<uintd3>(&imageSize)->default_value(uintd3(512,512,1)),"Image size in pixels")
+    ("binning,b",po::value<string>(),"Binning file for 4d reconstruction")
+    ("SAG","Use exact SAG correction if present")
+    ("voxelSize,v",po::value<floatd3>(&voxelSize)->default_value(floatd3(0.488f,0.488f,1.0f)),"Voxel size in mm")
+    ("dimensions,d",po::value<floatd3>(),"Image dimensions in mm. Overwrites voxelSize.")
+    ("iterations,i",po::value<unsigned int>(&iterations)->default_value(10),"Number of iterations")
+    ("weight,w",po::value<float>(&reg_weight)->default_value(float(0.0f)),"Regularization weight")
+    ("device",po::value<int>(&device)->default_value(0),"Number of the device to use (0 indexed)")
+    ("downsample,D",po::value<unsigned int>(&downsamples)->default_value(0),"Downsample projections this factor")
+    ;
+  
+  po::variables_map vm;
+  po::store(po::parse_command_line(argc, argv, desc), vm);
+  po::notify(vm);
+
+  if (vm.count("help")) {
+    cout << desc << "\n";
+    return 1;
+  }
+  std::cout << "Command line options:" << std::endl;
+  for (po::variables_map::iterator it = vm.begin(); it != vm.end(); ++it){
+    boost::any a = it->second.value();
+    std::cout << it->first << ": ";
+    if (a.type() == typeid(std::string)) std::cout << it->second.as<std::string>();
+    else if (a.type() == typeid(int)) std::cout << it->second.as<int>();
+    else if (a.type() == typeid(unsigned int)) std::cout << it->second.as<unsigned int>();
+    else if (a.type() == typeid(float)) std::cout << it->second.as<float>();
+    else if (a.type() == typeid(vector_td<float,3>)) std::cout << it->second.as<vector_td<float,3> >();
+    else if (a.type() == typeid(vector_td<int,3>)) std::cout << it->second.as<vector_td<int,3> >();
+    else if (a.type() == typeid(vector_td<unsigned int,3>)) std::cout << it->second.as<vector_td<unsigned int,3> >();
+    else std::cout << "Unknown type" << std::endl;
+    std::cout << std::endl;
+  }
+  cudaSetDevice(device);
+  cudaDeviceReset();
+
+  // Workaround: locking and unlocking a handle forces device initialization; the underlying issue has not been tracked down.
+  cudaDeviceManager::Instance()->lockHandle();
+  cudaDeviceManager::Instance()->unlockHandle();
+
+  boost::shared_ptr<CBCT_acquisition> ps(new CBCT_acquisition());
+  ps->load(acquisition_filename);
+  ps->get_geometry()->print(std::cout);
+  ps->downsample(downsamples);
+
+  float SDD = ps->get_geometry()->get_SDD();
+  float SAD = ps->get_geometry()->get_SAD();
+
+  boost::shared_ptr<CBCT_binning> binning(new CBCT_binning());
+  if (vm.count("binning")){
+    std::cout << "Loading binning data" << std::endl;
+    binning->load(vm["binning"].as<string>());
+  } else 
+    binning->set_as_default_3d_bin(ps->get_projections()->get_size(2));
+
+  binning->print(std::cout);
+
+  floatd3 imageDimensions;
+  if (vm.count("dimensions")){
+    imageDimensions = vm["dimensions"].as<floatd3>();
+    voxelSize = imageDimensions/imageSize;
+  }
+  else imageDimensions = voxelSize*imageSize;
+
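+  // Ray sampling: unless overridden via --samples, each ray through the volume is
+  // sampled at roughly numSamplesPerPixel points per smallest voxel dimension.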
+  float lengthOfRay_in_mm = norm(imageDimensions);
+  unsigned int numSamplesPerPixel = 3;
+  float minSpacing = min(voxelSize)/numSamplesPerPixel;
+
+  unsigned int numSamplesPerRay;
+  if (vm.count("samples")) numSamplesPerRay = vm["samples"].as<unsigned int>();
+  else numSamplesPerRay = ceil( lengthOfRay_in_mm / minSpacing );
+
+  float step_size_in_mm = lengthOfRay_in_mm / numSamplesPerRay;
+  size_t numProjs = ps->get_projections()->get_size(2);
+  size_t needed_bytes = 2 * prod(imageSize) * sizeof(float);
+
+  std::vector<size_t> is_dims = to_std_vector((uint64d3)imageSize);
+  std::cout << "IS dimensions " << is_dims[0] << " " << is_dims[1] << " " << is_dims[2] << std::endl;
+  std::cout << "Image size " << imageDimensions << std::endl;
+
+  is_dims.push_back(binning->get_number_of_bins());
+
+  hoCuNDArray<float> projections(*ps->get_projections());
+
+  // Define encoding operator
+  boost::shared_ptr< hoCuConebeamProjectionOperator >
+    E( new hoCuConebeamProjectionOperator() );
+
+  E->setup(ps,binning,imageDimensions);
+  E->set_domain_dimensions(&is_dims);
+  E->set_codomain_dimensions(ps->get_projections()->get_dimensions().get());
+
+  if (E->get_use_offset_correction())
+    	E->offset_correct(&projections);
+
+  // Define regularization operator
+  boost::shared_ptr< hoCuIdentityOperator<float> >
+    I( new hoCuIdentityOperator<float>() );
+  
+  I->set_weight(reg_weight);
+
+  hoCuCgSolver<float> solver;
+
+  solver.set_encoding_operator(E);
+
+  if( reg_weight>0.0f ) {
+    std::cout << "Adding identity operator with weight " << reg_weight << std::endl;
+    solver.add_regularization_operator(I);
+  }
+
+  solver.set_max_iterations(iterations);
+  solver.set_tc_tolerance(1e-8);
+  solver.set_output_mode(hoCuCgSolver<float>::OUTPUT_VERBOSE);
+
+  /*  if (vm.count("TV")){
+    boost::shared_ptr<hoCuPartialDerivativeOperator<float,4> > dx (new hoCuPartialDerivativeOperator<float,4>(0) );
+    boost::shared_ptr<hoCuPartialDerivativeOperator<float,4> > dy (new hoCuPartialDerivativeOperator<float,4>(1) );
+    boost::shared_ptr<hoCuPartialDerivativeOperator<float,4> > dz (new hoCuPartialDerivativeOperator<float,4>(2) );
+    boost::shared_ptr<hoCuPartialDerivativeOperator<float,4> > dt (new hoCuPartialDerivativeOperator<float,4>(3) );
+
+    dx->set_codomain_dimensions(&is_dims);
+    dy->set_codomain_dimensions(&is_dims);
+    dz->set_codomain_dimensions(&is_dims);
+    dt->set_codomain_dimensions(&is_dims);
+
+    dx->set_domain_dimensions(&is_dims);
+    dy->set_domain_dimensions(&is_dims);
+    dz->set_domain_dimensions(&is_dims);
+    dt->set_domain_dimensions(&is_dims);
+
+    dx->set_weight(vm["TV"].as<float>());
+    dy->set_weight(vm["TV"].as<float>());
+    dz->set_weight(vm["TV"].as<float>());
+    dt->set_weight(vm["TV"].as<float>());
+
+    solver.add_regularization_group_operator(dx);
+    solver.add_regularization_group_operator(dy);
+    solver.add_regularization_group_operator(dz);
+    solver.add_regularization_group_operator(dt);
+    solver.add_group(1);
+    }*/
+
+  // Run solver
+  //
+
+  boost::shared_ptr< hoCuNDArray<float> > result;
+
+  {
+    GPUTimer timer("\nRunning conjugate gradient solver");
+    result = solver.solve(&projections);
+  }
+
+  write_nd_array<float>( result.get(), outputFile.c_str());
+}
diff --git a/apps/standalone/gpu/ct/xray/CBCT_reconstruct_FDK_3d.cpp b/apps/standalone/gpu/ct/xray/CBCT_reconstruct_FDK_3d.cpp
new file mode 100644
index 0000000..d5c9c3f
--- /dev/null
+++ b/apps/standalone/gpu/ct/xray/CBCT_reconstruct_FDK_3d.cpp
@@ -0,0 +1,143 @@
+#include "parameterparser.h"
+#include "CBCT_acquisition.h"
+#include "CBCT_binning.h"
+#include "hoCuConebeamProjectionOperator.h"
+#include "hoNDArray_fileio.h"
+#include "hoCuNDArray_math.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_utils.h"
+#include "GPUTimer.h"
+
+#include <iostream>
+#include <algorithm>
+#include <sstream>
+
+using namespace Gadgetron;
+using namespace std;
+
+int main(int argc, char** argv) 
+{ 
+  // Parse command line
+  //
+
+  ParameterParser parms(1024);
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Input acquisition filename (.hdf5)", true );
+  parms.add_parameter( 'b', COMMAND_LINE_STRING, 1, "Binning filename (.hdf5)", false );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Output image filename (.real)", true, "reconstruction_FDK.real" );
+  parms.add_parameter( 'm', COMMAND_LINE_INT, 3, "Matrix size (3d)", true, "256, 256, 144" );
+  parms.add_parameter( 'f', COMMAND_LINE_FLOAT, 3, "FOV in mm (3d)", true, "448, 448, 252" );
+  parms.add_parameter( 'F', COMMAND_LINE_INT, 1, "Use filtered backprojection (fbp)", true, "1" );
+  parms.add_parameter( 'P', COMMAND_LINE_INT, 1, "Projections per batch", false );
+  parms.add_parameter( 'D', COMMAND_LINE_INT, 1, "Number of downsamples of projection plate", true, "0" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ) {
+    parms.print_parameter_list();
+  }
+  else{
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  std::string acquisition_filename = (char*)parms.get_parameter('d')->get_string_value();
+  std::string image_filename = (char*)parms.get_parameter('r')->get_string_value();
+
+  // Load acquisition data
+  //
+
+  boost::shared_ptr<CBCT_acquisition> acquisition( new CBCT_acquisition() );
+
+  {
+    GPUTimer timer("Loading projections");
+    acquisition->load(acquisition_filename);
+  }
+
+  // Downsample projections if requested
+  //
+
+  {
+    GPUTimer timer("Downsampling projections");
+    unsigned int num_downsamples = parms.get_parameter('D')->get_int_value();
+    acquisition->downsample(num_downsamples);
+  }
+  
+  // Load or generate binning data
+  //
+  
+  boost::shared_ptr<CBCT_binning> binning( new CBCT_binning() );
+
+  if (parms.get_parameter('b')->get_is_set()){
+    std::string binningdata_filename = (char*)parms.get_parameter('b')->get_string_value();
+    std::cout << "Using binning data file: " << binningdata_filename << std::endl;
+    binning->load(binningdata_filename);
+    binning = boost::shared_ptr<CBCT_binning>(new CBCT_binning(binning->get_3d_binning()));
+  } 
+  else 
+    binning->set_as_default_3d_bin(acquisition->get_projections()->get_size(2));
+
+  // Configuring...
+  //
+
+  uintd2 ps_dims_in_pixels( acquisition->get_projections()->get_size(0),
+			    acquisition->get_projections()->get_size(1) );
+  
+  floatd2 ps_dims_in_mm( acquisition->get_geometry()->get_FOV()[0],
+			 acquisition->get_geometry()->get_FOV()[1] );
+
+  float SDD = acquisition->get_geometry()->get_SDD();
+  float SAD = acquisition->get_geometry()->get_SAD();
+
+  uintd3 is_dims_in_pixels( parms.get_parameter('m')->get_int_value(0),
+			    parms.get_parameter('m')->get_int_value(1),
+			    parms.get_parameter('m')->get_int_value(2) );
+  
+  floatd3 is_dims_in_mm( parms.get_parameter('f')->get_float_value(0), 
+			 parms.get_parameter('f')->get_float_value(1), 
+			 parms.get_parameter('f')->get_float_value(2) );
+  
+  bool use_fbp = parms.get_parameter('F')->get_int_value();
+
+  // Allocate array to hold the result
+  //
+  
+  std::vector<size_t> is_dims;
+  is_dims.push_back(is_dims_in_pixels[0]);
+  is_dims.push_back(is_dims_in_pixels[1]);
+  is_dims.push_back(is_dims_in_pixels[2]);
+  
+  hoCuNDArray<float> fdk_3d(&is_dims);
+  hoCuNDArray<float> projections(*acquisition->get_projections());  
+
+  // Define conebeam projection operator
+  // - and configure based on input parameters
+  //
+  
+  boost::shared_ptr< hoCuConebeamProjectionOperator > E( new hoCuConebeamProjectionOperator() );
+
+  E->setup( acquisition, binning, is_dims_in_mm );
+  E->set_use_filtered_backprojection(use_fbp);
+
+  CommandLineParameter *parm = parms.get_parameter('P');
+  if( parm && parm->get_is_set() )
+    E->set_num_projections_per_batch( parm->get_int_value() );
+  
+  // Initialize the device
+  // - just to report more accurate timings
+  //
+
+  cudaThreadSynchronize();
+
+  //
+  // Standard 3D FDK reconstruction
+  //
+
+  {
+    GPUTimer timer("Running 3D FDK reconstruction");
+    E->mult_MH( &projections, &fdk_3d );
+    cudaThreadSynchronize();
+  }
+
+  write_nd_array<float>( &fdk_3d, image_filename.c_str() );
+  return 0;
+}
diff --git a/apps/standalone/gpu/ct/xray/CBCT_reconstruct_FDK_4d.cpp b/apps/standalone/gpu/ct/xray/CBCT_reconstruct_FDK_4d.cpp
new file mode 100644
index 0000000..b2e50bd
--- /dev/null
+++ b/apps/standalone/gpu/ct/xray/CBCT_reconstruct_FDK_4d.cpp
@@ -0,0 +1,157 @@
+#include "parameterparser.h"
+#include "CBCT_acquisition.h"
+#include "CBCT_binning.h"
+#include "hoCuConebeamProjectionOperator.h"
+#include "hoNDArray_fileio.h"
+#include "hoCuNDArray_math.h"
+#include "vector_td_utilities.h"
+#include "GPUTimer.h"
+
+#include <iostream>
+#include <algorithm>
+#include <sstream>
+
+using namespace Gadgetron;
+using namespace std;
+
+int main(int argc, char** argv) 
+{ 
+	// Parse command line
+	//
+
+	ParameterParser parms(1024);
+	parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Input acquisition filename (.hdf5)", true );
+	parms.add_parameter( 'b', COMMAND_LINE_STRING, 1, "Binning filename (.hdf5) - 4D FDK only", false );
+	parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Output image filename (.real)", true, "reconstruction_FDK.real" );
+	parms.add_parameter( 'm', COMMAND_LINE_INT, 3, "Matrix size (3d)", true, "256, 256, 144" );
+	parms.add_parameter( 'f', COMMAND_LINE_FLOAT, 3, "FOV in mm (3d)", true, "448, 448, 252" );
+	parms.add_parameter( 'F', COMMAND_LINE_INT, 1, "Use filtered backprojection (fbp)", true, "1" );
+	parms.add_parameter( 'O', COMMAND_LINE_INT, 1, "Use oversampling in fbp", true, "0" );
+	parms.add_parameter( 'H', COMMAND_LINE_FLOAT, 1, "Half-scan mode maximum angle", true, "0" );
+	parms.add_parameter( 'P', COMMAND_LINE_INT, 1, "Projections per batch", true, "50" );
+  parms.add_parameter( 'D', COMMAND_LINE_INT, 1, "Number of downsamples of projection plate", true, "0" );
+
+	parms.parse_parameter_list(argc, argv);
+	if( parms.all_required_parameters_set() ) {
+		parms.print_parameter_list();
+	}
+	else{
+		parms.print_parameter_list();
+		parms.print_usage();
+		return 1;
+	}
+
+	std::string acquisition_filename = (char*)parms.get_parameter('d')->get_string_value();
+	std::string binning_filename = (char*)parms.get_parameter('b')->get_string_value();
+	std::string image_filename = (char*)parms.get_parameter('r')->get_string_value();
+
+	// Load acquisition data
+	//
+
+	boost::shared_ptr<CBCT_acquisition> acquisition( new CBCT_acquisition() );
+	acquisition->load(acquisition_filename);
+
+	// Downsample projections if requested
+	//
+
+	{
+		GPUTimer timer("Downsampling projections");
+		unsigned int num_downsamples = parms.get_parameter('D')->get_int_value();    
+		acquisition->downsample(num_downsamples);
+	}
+
+	// Configuring...
+	//
+
+	uintd2 ps_dims_in_pixels( acquisition->get_projections()->get_size(0),
+			acquisition->get_projections()->get_size(1) );
+
+	floatd2 ps_dims_in_mm( acquisition->get_geometry()->get_FOV()[0],
+			acquisition->get_geometry()->get_FOV()[1] );
+
+	float SDD = acquisition->get_geometry()->get_SDD();
+	float SAD = acquisition->get_geometry()->get_SAD();
+
+	uintd3 is_dims_in_pixels( parms.get_parameter('m')->get_int_value(0),
+			parms.get_parameter('m')->get_int_value(1),
+			parms.get_parameter('m')->get_int_value(2) );
+
+	floatd3 is_dims_in_mm( parms.get_parameter('f')->get_float_value(0),
+			parms.get_parameter('f')->get_float_value(1),
+			parms.get_parameter('f')->get_float_value(2) );
+
+	bool use_fbp = parms.get_parameter('F')->get_int_value();
+	bool use_fbp_os = parms.get_parameter('O')->get_int_value();
+	float half_scan_max_angle = parms.get_parameter('H')->get_float_value();
+	unsigned int projections_per_batch = parms.get_parameter('P')->get_int_value();
+	boost::shared_ptr<CBCT_binning> ps_bd4d(  new CBCT_binning());
+
+	std::cout << "binning data file: " << binning_filename << std::endl;
+	ps_bd4d->load(binning_filename);
+	ps_bd4d->print(std::cout);
+
+	// Collapse the loaded 4D binning into a single 3D binning for the initial FDK reconstruction
+	//
+
+	boost::shared_ptr<CBCT_binning> binning( new CBCT_binning(ps_bd4d->get_3d_binning()) );
+
+	// Allocate array to hold the result
+	//
+
+	std::vector<size_t> is_dims;
+	is_dims.push_back(is_dims_in_pixels[0]);
+	is_dims.push_back(is_dims_in_pixels[1]);
+	is_dims.push_back(is_dims_in_pixels[2]);
+
+
+	hoCuNDArray<float> fdk_3d(&is_dims);
+
+	//
+	// Standard 3D FDK reconstruction
+	//
+
+	boost::shared_ptr< hoCuConebeamProjectionOperator > E( new hoCuConebeamProjectionOperator() );
+
+	E->setup( acquisition, binning, is_dims_in_mm );
+	E->set_use_filtered_backprojection(true);
+
+	hoCuNDArray<float> projections(*acquisition->get_projections());
+
+	{
+		GPUTimer timer("Running 3D FDK reconstruction");
+		E->mult_MH( &projections, &fdk_3d );
+	}
+
+	write_nd_array<float>( &fdk_3d, "fdk.real" );
+
+	/*4D FDK-MB algorithm starts here. McKinnon GC, RHT Bates,
+	 *
+	 *"Towards Imaging the Beating Heart Usefully with a Conventional CT Scanner,"
+	 *" Biomedical Engineering, IEEE Transactions on , vol.BME-28, no.2, pp.123,127, Feb. 1981
+	 * doi: 10.1109/TBME.1981.324785
+	 */
+
+
+
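+	// McKinnon-Bates correction as implemented below: forward-project the 3D FDK volume
+	// into every bin, subtract the result from the measured projections, backproject the
+	// residual per bin, and add the 3D FDK volume back as the common baseline.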
+	size_t numBins = ps_bd4d->get_number_of_bins();
+	is_dims.push_back(numBins);
+	boost::shared_ptr< hoCuConebeamProjectionOperator >
+	E4D( new hoCuConebeamProjectionOperator() );
+	E4D->setup(acquisition,ps_bd4d,is_dims_in_mm);
+	E4D->set_use_filtered_backprojection(true);
+	E4D->set_domain_dimensions(&is_dims);
+
+	hoCuNDArray<float> fdk(*expand(&fdk_3d,numBins));
+	hoCuNDArray<float> diff_proj(projections.get_dimensions());
+
+	E4D->mult_M(&fdk,&diff_proj);
+	projections -= diff_proj;
+
+	hoCuNDArray<float> result(&is_dims);
+	E4D->mult_MH(&projections,&result);
+
+	result += fdk;
+
+	write_nd_array<float>( &result, image_filename.c_str() );
+	return 0;
+}
diff --git a/apps/standalone/gpu/ct/xray/CBCT_reconstruct_NLCG.cpp b/apps/standalone/gpu/ct/xray/CBCT_reconstruct_NLCG.cpp
new file mode 100644
index 0000000..d49758d
--- /dev/null
+++ b/apps/standalone/gpu/ct/xray/CBCT_reconstruct_NLCG.cpp
@@ -0,0 +1,194 @@
+#include "hoCuNDArray_utils.h"
+#include "radial_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDArray.h"
+#include "imageOperator.h"
+#include "identityOperator.h"
+#include "hoPartialDerivativeOperator.h"
+#include "hoCuConebeamProjectionOperator.h"
+#include "cuConvolutionOperator.h"
+#include "hoCuNDArray_math.h"
+#include "hoCuNDArray_blas.h"
+#include "cgSolver.h"
+#include "CBCT_acquisition.h"
+#include "complext.h"
+#include "encodingOperatorContainer.h"
+#include "vector_td_io.h"
+#include "hoPartialDerivativeOperator.h"
+#include "hoCuTvOperator.h"
+#include "hoCuTvPicsOperator.h"
+#include "hoCuNlcgSolver.h"
+#include "hoCuPartialDerivativeOperator.h"
+
+#include <iostream>
+#include <algorithm>
+#include <sstream>
+#include <math_constants.h>
+#include <boost/program_options.hpp>
+
+using namespace std;
+using namespace Gadgetron;
+
+namespace po = boost::program_options;
+
+int main(int argc, char** argv) 
+{  
+  string acquisition_filename;
+  string outputFile;  
+  uintd3 imageSize;
+  floatd3 voxelSize;
+  int device;
+  unsigned int downsamples;
+  unsigned int iterations;
+  float rho;
+  po::options_description desc("Allowed options");
+
+  desc.add_options()
+    ("help", "produce help message")
+    ("acquisition,a", po::value<string>(&acquisition_filename)->default_value("acquisition.hdf5"), "Acquisition data")
+    ("samples,n",po::value<unsigned int>(),"Number of samples per ray")
+    ("output,f", po::value<string>(&outputFile)->default_value("reconstruction.real"), "Output filename")
+    ("size,s",po::value<uintd3>(&imageSize)->default_value(uintd3(512,512,1)),"Image size in pixels")
+    ("binning,b",po::value<string>(),"Binning file for 4d reconstruction")
+    ("SAG","Use exact SAG correction if present")
+    ("voxelSize,v",po::value<floatd3>(&voxelSize)->default_value(floatd3(0.488f,0.488f,1.0f)),"Voxel size in mm")
+    ("dimensions,d",po::value<floatd3>(),"Image dimensions in mm. Overwrites voxelSize.")
+    ("iterations,i",po::value<unsigned int>(&iterations)->default_value(10),"Number of iterations")
+    ("TV,T",po::value<float>(),"TV Weight ")
+    ("PICS",po::value<float>(),"TV Weight of the prior image (Prior image compressed sensing)")
+    ("device",po::value<int>(&device)->default_value(0),"Number of the device to use (0 indexed)")
+    ("downsample,D",po::value<unsigned int>(&downsamples)->default_value(0),"Downsample projections this factor")
+    ("rho",po::value<float>(&rho)->default_value(0.9f),"Rho-value for line search. Must be between 0 and 1. Smaller value means faster runtime, but less stable algorithm.")
+    ;
+
+  po::variables_map vm;
+  po::store(po::parse_command_line(argc, argv, desc), vm);
+  po::notify(vm);
+
+  if (vm.count("help")) {
+    cout << desc << "\n";
+    return 1;
+  }
+
+  std::cout << "Command line options:" << std::endl;
+  for (po::variables_map::iterator it = vm.begin(); it != vm.end(); ++it){
+    boost::any a = it->second.value();
+    std::cout << it->first << ": ";
+    if (a.type() == typeid(std::string)) std::cout << it->second.as<std::string>();
+    else if (a.type() == typeid(int)) std::cout << it->second.as<int>();
+    else if (a.type() == typeid(unsigned int)) std::cout << it->second.as<unsigned int>();
+    else if (a.type() == typeid(float)) std::cout << it->second.as<float>();
+    else if (a.type() == typeid(vector_td<float,3>)) std::cout << it->second.as<vector_td<float,3> >();
+    else if (a.type() == typeid(vector_td<int,3>)) std::cout << it->second.as<vector_td<int,3> >();
+    else if (a.type() == typeid(vector_td<unsigned int,3>)) std::cout << it->second.as<vector_td<unsigned int,3> >();
+    else std::cout << "Unknown type" << std::endl;
+    std::cout << std::endl;
+  }
+
+  cudaSetDevice(device);
+  cudaDeviceReset();
+
+  // Workaround: locking and unlocking a handle forces device initialization; the underlying issue has not been tracked down.
+  cudaDeviceManager::Instance()->lockHandle();
+  cudaDeviceManager::Instance()->unlockHandle();
+       
+  boost::shared_ptr<CBCT_acquisition> ps(new CBCT_acquisition());
+  ps->load(acquisition_filename);
+  ps->get_geometry()->print(std::cout);
+  ps->downsample(downsamples);
+
+  float SDD = ps->get_geometry()->get_SDD();
+  float SAD = ps->get_geometry()->get_SAD();
+
+  boost::shared_ptr<CBCT_binning> binning(new CBCT_binning());
+  if (vm.count("binning")){
+    std::cout << "Loading binning data" << std::endl;
+    binning->load(vm["binning"].as<string>());	  
+  } else binning->set_as_default_3d_bin(ps->get_projections()->get_size(2));
+  binning->print(std::cout);
+
+  floatd3 imageDimensions;
+  if (vm.count("dimensions")){
+    imageDimensions = vm["dimensions"].as<floatd3>();
+    voxelSize = imageDimensions/imageSize;
+  }
+  else imageDimensions = voxelSize*imageSize;
+
+  float lengthOfRay_in_mm = norm(imageDimensions);
+  unsigned int numSamplesPerPixel = 3;
+  float minSpacing = min(voxelSize)/numSamplesPerPixel;
+
+  unsigned int numSamplesPerRay;
+  if (vm.count("samples")) numSamplesPerRay = vm["samples"].as<unsigned int>();
+  else numSamplesPerRay = ceil( lengthOfRay_in_mm / minSpacing );
+
+  float step_size_in_mm = lengthOfRay_in_mm / numSamplesPerRay;
+  size_t numProjs = ps->get_projections()->get_size(2);
+  size_t needed_bytes = 2 * prod(imageSize) * sizeof(float);
+  std::vector<size_t> is_dims = to_std_vector((uint64d3)imageSize);
+
+  std::cout << "IS dimensions " << is_dims[0] << " " << is_dims[1] << " " << is_dims[2] << std::endl;
+  std::cout << "Image size " << imageDimensions << std::endl;
+
+  is_dims.push_back(binning->get_number_of_bins());
+
+  // Define encoding matrix
+  boost::shared_ptr< hoCuConebeamProjectionOperator >
+    E( new hoCuConebeamProjectionOperator() );
+
+  E->setup(ps,binning,imageDimensions);
+  E->set_domain_dimensions(&is_dims);
+  E->set_codomain_dimensions(ps->get_projections()->get_dimensions().get());
+
+  hoCuNlcgSolver<float> solver;
+
+  solver.set_encoding_operator(E);
+  solver.set_domain_dimensions(&is_dims);
+  solver.set_max_iterations(iterations);
+  solver.set_output_mode(hoCuNlcgSolver<float>::OUTPUT_VERBOSE);
+  solver.set_non_negativity_constraint(true);
+  solver.set_rho(rho);
+
+  hoCuNDArray<float> projections = *ps->get_projections();
+  
+  if (E->get_use_offset_correction())
+    	E->offset_correct(&projections);
+
+
+  if (vm.count("TV")){
+    std::cout << "Total variation regularization in use" << std::endl;
+    boost::shared_ptr<hoCuTvOperator<float,4> > tv(new hoCuTvOperator<float,4>);
+    tv->set_weight(vm["TV"].as<float>());
+    solver.add_nonlinear_operator(tv);
+  }
+
+  if (vm.count("PICS")){
+    std::cout << "PICS in use" << std::endl;
+    boost::shared_ptr<CBCT_binning> binning_pics( new CBCT_binning() );
+    binning_pics->set_as_default_3d_bin(ps->get_projections()->get_size(2));
+    std::vector<size_t> is_dims3d = to_std_vector((uint64d3)imageSize);
+    boost::shared_ptr< hoCuConebeamProjectionOperator >
+      Ep( new hoCuConebeamProjectionOperator() );
+    Ep->setup(ps,binning_pics,imageDimensions);
+    Ep->set_codomain_dimensions(ps->get_projections()->get_dimensions().get());
+    Ep->set_domain_dimensions(&is_dims3d);
+
+    boost::shared_ptr<hoCuNDArray<float> > prior3d(new hoCuNDArray<float>(&is_dims3d));
+    Ep->mult_MH(&projections,prior3d.get());
+
+    hoCuNDArray<float> tmp_proj(*ps->get_projections());
+    Ep->mult_M(prior3d.get(),&tmp_proj);
+    float s = dot(ps->get_projections().get(),&tmp_proj)/dot(&tmp_proj,&tmp_proj);
+    *prior3d *= s;
+    boost::shared_ptr<hoCuNDArray<float> > prior(new hoCuNDArray<float>(*expand( prior3d.get(), is_dims.back() )));
+    boost::shared_ptr<hoCuTvPicsOperator<float,3> > pics (new hoCuTvPicsOperator<float,3>);
+    pics->set_prior(prior);
+    pics->set_weight(vm["PICS"].as<float>());
+    solver.add_nonlinear_operator(pics);
+    solver.set_x0(prior);
+  }
+
+  boost::shared_ptr< hoCuNDArray<float> > result = solver.solve(&projections);
+
+  write_nd_array<float>( result.get(), outputFile.c_str());
+}
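A note on the prior scaling in the PICS branch above: the prior is obtained by backprojecting the measured projections (mult_MH), forward projecting the result again (mult_M), and rescaling it by s = <p, Ep x> / <Ep x, Ep x>. That scalar is the least-squares fit of a single intensity scale, i.e. it minimises ||p - s * Ep x||_2 before the scaled volume is expanded into the 4D prior. A minimal sketch in plain C++ (standard containers only; the function name is illustrative and not part of the Gadgetron API):

    #include <numeric>
    #include <vector>

    // s = <b, Ax> / <Ax, Ax> minimises ||b - s*Ax||_2 over the scalar s.
    float least_squares_scale(const std::vector<float>& b, const std::vector<float>& Ax)
    {
      float num = std::inner_product(b.begin(), b.end(), Ax.begin(), 0.0f);
      float den = std::inner_product(Ax.begin(), Ax.end(), Ax.begin(), 0.0f);
      return num / den;
    }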
diff --git a/apps/standalone/gpu/ct/xray/CBCT_reconstruct_SB.cpp b/apps/standalone/gpu/ct/xray/CBCT_reconstruct_SB.cpp
new file mode 100644
index 0000000..27b29cc
--- /dev/null
+++ b/apps/standalone/gpu/ct/xray/CBCT_reconstruct_SB.cpp
@@ -0,0 +1,281 @@
+#include "hoCuNDArray_utils.h"
+#include "radial_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDArray.h"
+#include "imageOperator.h"
+#include "identityOperator.h"
+#include "hoPartialDerivativeOperator.h"
+#include "hoCuConebeamProjectionOperator.h"
+#include "cuConvolutionOperator.h"
+#include "hoCuNDArray_math.h"
+#include "cgSolver.h"
+#include "CBCT_acquisition.h"
+#include "complext.h"
+#include "encodingOperatorContainer.h"
+#include "vector_td_io.h"
+#include "hoCuPartialDerivativeOperator.h"
+#include "hoCuTvOperator.h"
+#include "hoCuTvPicsOperator.h"
+#include "hoCuSbcCgSolver.h"
+#include "GPUTimer.h"
+
+#include <iostream>
+#include <algorithm>
+#include <sstream>
+#include <math_constants.h>
+#include <boost/program_options.hpp>
+
+using namespace std;
+using namespace Gadgetron;
+
+namespace po = boost::program_options;
+
+class mySbcCgSolver : public hoCuSbcCgSolver<float> 
+{
+public:
+    mySbcCgSolver() : hoCuSbcCgSolver<float>() {
+        this->dumpFreq_ = 5;
+        this->counter_ = 1;
+    }
+    ~mySbcCgSolver() {}
+  
+  virtual bool post_linear_solver_callback( hoCuNDArray<float> *u ) {
+    printf("Iteration: %d\n", counter_);
+    if( (counter_ % dumpFreq_) == 0 ){
+      printf("Dumping frame\n");
+      char filename[32];
+      snprintf(filename, sizeof(filename), "img-itr-%04i.real", counter_);
+      write_nd_array<float>(u, filename);
+    }
+    counter_++;
+    return true;
+  }
+  void set_dump_frequency(unsigned int dumpFreq) {
+    if( dumpFreq == 0 )
+      this->dumpFreq_ = 9999999; // Avoid modulo by zero (undefined behaviour); use a value that is never reached.
+    else
+      this->dumpFreq_ = dumpFreq;
+  }
+protected:
+  unsigned int counter_;
+  unsigned int dumpFreq_;
+};
+
+
+int main(int argc, char** argv)
+{
+	string acquisition_filename;
+	string outputFile;
+	uintd3 imageSize;
+	floatd3 voxelSize;
+	int device;
+  unsigned int dump;
+	unsigned int downsamples;
+	unsigned int iterations;
+	unsigned int inner_iterations;
+	float non_negativity_weight;
+
+	po::options_description desc("Allowed options");
+	desc.add_options()
+    ("help", "produce help message")
+    ("acquisition,a", po::value<string>(&acquisition_filename)->default_value("acquisition.hdf5"), "Acquisition data")
+    ("samples,n",po::value<unsigned int>(),"Number of samples per ray")
+    ("output,f", po::value<string>(&outputFile)->default_value("reconstruction.real"), "Output filename")
+    ("size,s",po::value<uintd3>(&imageSize)->default_value(uintd3(512,512,1)),"Image size in pixels")
+    ("binning,b",po::value<string>(),"Binning file for 4d reconstruction")
+    ("SAG","Use exact SAG correction if present")
+    ("voxelSize,v",po::value<floatd3>(&voxelSize)->default_value(floatd3(0.488f,0.488f,1.0f)),"Voxel size in mm")
+    ("dimensions,d",po::value<floatd3>(),"Image dimensions in mm. Overwrites voxelSize.")
+    ("iterations,i",po::value<unsigned int>(&iterations)->default_value(10),"Number of iterations")
+    ("inner-iterations",po::value<unsigned int>(&inner_iterations)->default_value(5),"Number of iterations in the inner solver")
+    ("TV,T",po::value<float>(),"TV Weight ")
+    ("non-negativity,N",po::value<float>(&non_negativity_weight)->default_value(1.0f),"Weight for the non-negativity (soft) constraint ")
+    ("prior", po::value<std::string>(),"Prior image filename")
+    ("PICCS",po::value<float>(),"TV Weight of the prior image (Prior image constrained compressed sensing)")
+    ("device",po::value<int>(&device)->default_value(0),"Number of the device to use (0 indexed)")
+    ("dump",po::value<unsigned int>(&dump)->default_value(0),"Dump image every N iterations")    
+    ("downsample,D",po::value<unsigned int>(&downsamples)->default_value(0),"Downsample projections by this factor")
+    ;
+  
+	po::variables_map vm;
+	po::store(po::parse_command_line(argc, argv, desc), vm);
+	po::notify(vm);
+
+	if (vm.count("help")) {
+		cout << desc << "\n";
+		return 1;
+	}
+	std::cout << "Command line options:" << std::endl;
+	for (po::variables_map::iterator it = vm.begin(); it != vm.end(); ++it){
+		boost::any a = it->second.value();
+		std::cout << it->first << ": ";
+		if (a.type() == typeid(std::string)) std::cout << it->second.as<std::string>();
+		else if (a.type() == typeid(int)) std::cout << it->second.as<int>();
+		else if (a.type() == typeid(unsigned int)) std::cout << it->second.as<unsigned int>();
+		else if (a.type() == typeid(float)) std::cout << it->second.as<float>();
+		else if (a.type() == typeid(vector_td<float,3>)) std::cout << it->second.as<vector_td<float,3> >();
+		else if (a.type() == typeid(vector_td<int,3>)) std::cout << it->second.as<vector_td<int,3> >();
+		else if (a.type() == typeid(vector_td<unsigned int,3>)) std::cout << it->second.as<vector_td<unsigned int,3> >();
+		else std::cout << "Unknown type" << std::endl;
+		std::cout << std::endl;
+	}
+	cudaSetDevice(device);
+	cudaDeviceReset();
+
+	// Workaround: locking and unlocking the handle seems to be required to initialize the device; the underlying bug should still be tracked down.
+	cudaDeviceManager::Instance()->lockHandle();
+	cudaDeviceManager::Instance()->unlockHandle();
+
+	boost::shared_ptr<CBCT_acquisition> ps(new CBCT_acquisition());
+	ps->load(acquisition_filename);
+	ps->get_geometry()->print(std::cout);
+	ps->downsample(downsamples);
+
+	float SDD = ps->get_geometry()->get_SDD();
+	float SAD = ps->get_geometry()->get_SAD();
+
+	boost::shared_ptr<CBCT_binning> binning(new CBCT_binning());
+	if (vm.count("binning")){
+		std::cout << "Loading binning data" << std::endl;
+		binning->load(vm["binning"].as<string>());
+	} else
+		binning->set_as_default_3d_bin(ps->get_projections()->get_size(2));
+
+	binning->print(std::cout);
+
+	floatd3 imageDimensions;
+	if (vm.count("dimensions")){
+		imageDimensions = vm["dimensions"].as<floatd3>();
+		voxelSize = imageDimensions/imageSize;
+	}
+	else imageDimensions = voxelSize*imageSize;
+
+	float lengthOfRay_in_mm = norm(imageDimensions);
+	unsigned int numSamplesPerPixel = 3;
+	float minSpacing = min(voxelSize)/numSamplesPerPixel;
+
+	unsigned int numSamplesPerRay;
+	if (vm.count("samples")) numSamplesPerRay = vm["samples"].as<unsigned int>();
+	else numSamplesPerRay = ceil( lengthOfRay_in_mm / minSpacing );
+
+	float step_size_in_mm = lengthOfRay_in_mm / numSamplesPerRay;
+	size_t numProjs = ps->get_projections()->get_size(2);
+	size_t needed_bytes = 2 * prod(imageSize) * sizeof(float);
+
+	std::vector<size_t> is_dims = to_std_vector((uint64d3)imageSize);
+	std::cout << "IS dimensions " << is_dims[0] << " " << is_dims[1] << " " << is_dims[2] << std::endl;
+	std::cout << "Image size " << imageDimensions << std::endl;
+
+	is_dims.push_back(binning->get_number_of_bins());
+
+	hoCuNDArray<float> projections(*ps->get_projections());
+
+	// Define encoding matrix
+	boost::shared_ptr< hoCuConebeamProjectionOperator >
+    E( new hoCuConebeamProjectionOperator() );
+
+	E->setup(ps,binning,imageDimensions);
+	E->set_domain_dimensions(&is_dims);
+	E->set_codomain_dimensions(ps->get_projections()->get_dimensions().get());
+
+	mySbcCgSolver solver;
+
+	solver.set_encoding_operator(E);
+	solver.set_max_outer_iterations(iterations);
+	solver.get_inner_solver()->set_max_iterations(inner_iterations);
+	solver.get_inner_solver()->set_tc_tolerance(1e-6);
+  solver.get_inner_solver()->set_output_mode(hoCuCgSolver<float>::OUTPUT_VERBOSE);
+	solver.set_non_negativity_filter(non_negativity_weight);
+	solver.set_output_mode(hoCuSbcCgSolver<float>::OUTPUT_VERBOSE);
+  solver.set_dump_frequency(dump);
+
+	if (vm.count("TV")){
+		boost::shared_ptr<hoCuPartialDerivativeOperator<float,4> > dx (new hoCuPartialDerivativeOperator<float,4>(0) );
+		boost::shared_ptr<hoCuPartialDerivativeOperator<float,4> > dy (new hoCuPartialDerivativeOperator<float,4>(1) );
+		boost::shared_ptr<hoCuPartialDerivativeOperator<float,4> > dz (new hoCuPartialDerivativeOperator<float,4>(2) );
+		boost::shared_ptr<hoCuPartialDerivativeOperator<float,4> > dt (new hoCuPartialDerivativeOperator<float,4>(3) );
+
+		dx->set_codomain_dimensions(&is_dims);
+		dy->set_codomain_dimensions(&is_dims);
+		dz->set_codomain_dimensions(&is_dims);
+		dt->set_codomain_dimensions(&is_dims);
+
+		dx->set_domain_dimensions(&is_dims);
+		dy->set_domain_dimensions(&is_dims);
+		dz->set_domain_dimensions(&is_dims);
+		dt->set_domain_dimensions(&is_dims);
+
+		dx->set_weight(vm["TV"].as<float>());
+		dy->set_weight(vm["TV"].as<float>());
+		dz->set_weight(vm["TV"].as<float>());
+		dt->set_weight(vm["TV"].as<float>());
+
+		solver.add_regularization_group_operator(dx);
+		solver.add_regularization_group_operator(dy);
+		solver.add_regularization_group_operator(dz);
+		solver.add_regularization_group_operator(dt);
+		solver.add_group(1);
+	}
+
+	if (vm.count("PICCS")){
+		std::cout << "PICCS in use" << std::endl;
+		boost::shared_ptr<CBCT_binning> binning_pics( new CBCT_binning() );
+		binning_pics->set_as_default_3d_bin(ps->get_projections()->get_size(2));
+		std::vector<size_t> is_dims3d = to_std_vector((uint64d3)imageSize);
+		boost::shared_ptr< hoCuConebeamProjectionOperator >
+		Ep( new hoCuConebeamProjectionOperator() );
+		Ep->setup(ps,binning_pics,imageDimensions);
+		Ep->set_use_filtered_backprojection(true);
+		Ep->set_codomain_dimensions(ps->get_projections()->get_dimensions().get());
+		Ep->set_domain_dimensions(&is_dims3d);
+
+		boost::shared_ptr<hoCuNDArray<float> > prior3d(new hoCuNDArray<float>(&is_dims3d));
+		Ep->mult_MH(&projections,prior3d.get());
+
+		hoCuNDArray<float> tmp_proj(projections);
+		Ep->mult_M(prior3d.get(),&tmp_proj);
+
+		float s = dot(&projections,&tmp_proj)/dot(&tmp_proj,&tmp_proj);
+		*prior3d *= s;
+		boost::shared_ptr<hoCuNDArray<float> > prior(new hoCuNDArray<float>(*expand( prior3d.get(), is_dims.back() )));
+		boost::shared_ptr<hoCuPartialDerivativeOperator<float,4> > dx (new hoCuPartialDerivativeOperator<float,4>(0) );
+		boost::shared_ptr<hoCuPartialDerivativeOperator<float,4> > dy (new hoCuPartialDerivativeOperator<float,4>(1) );
+		boost::shared_ptr<hoCuPartialDerivativeOperator<float,4> > dz (new hoCuPartialDerivativeOperator<float,4>(2) );
+		boost::shared_ptr<hoCuPartialDerivativeOperator<float,4> > dt (new hoCuPartialDerivativeOperator<float,4>(3) );
+
+		dx->set_weight(vm["PICCS"].as<float>());
+		dy->set_weight(vm["PICCS"].as<float>());
+		dz->set_weight(vm["PICCS"].as<float>());
+		dt->set_weight(vm["PICCS"].as<float>());
+
+		dx->set_codomain_dimensions(&is_dims);
+		dy->set_codomain_dimensions(&is_dims);
+		dz->set_codomain_dimensions(&is_dims);
+		dt->set_codomain_dimensions(&is_dims);
+    
+		dx->set_domain_dimensions(&is_dims);
+		dy->set_domain_dimensions(&is_dims);
+		dz->set_domain_dimensions(&is_dims);
+		dt->set_domain_dimensions(&is_dims);
+
+		solver.add_regularization_group_operator(dx);
+		solver.add_regularization_group_operator(dy);
+		solver.add_regularization_group_operator(dz);
+		solver.add_regularization_group_operator(dt);
+		solver.add_group(prior,1);
+
+	}
+
+	// Run solver
+	//
+
+	boost::shared_ptr< hoCuNDArray<float> > result;
+
+	{
+		GPUTimer timer("\nRunning Split Bregman solver");
+		result = solver.solve(&projections);
+	}
+
+	write_nd_array<float>( result.get(), outputFile.c_str());
+}
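In the reconstruction above the four partial-derivative operators dx..dt are registered as one regularization group (add_regularization_group_operator followed by add_group), which corresponds to an isotropic total-variation penalty across x, y, z and time; registering them individually with add_regularization_operator penalises each direction on its own (anisotropic TV), as the comments in denoise_TV.cpp later in this patch also spell out. A plain-C++ sketch of the two shrinkage rules usually associated with those two choices (illustrative only; the exact update inside hoCuSbcCgSolver is not part of this patch):

    #include <algorithm>
    #include <cmath>

    // Grouped (isotropic) shrink: threshold the joint gradient magnitude,
    // then rescale every component by the same factor.
    void shrink_isotropic(float& gx, float& gy, float t)
    {
      float mag = std::sqrt(gx * gx + gy * gy);
      float scale = (mag > t) ? (mag - t) / mag : 0.0f;
      gx *= scale;
      gy *= scale;
    }

    // Independent (anisotropic) shrink: soft-threshold each component separately.
    float shrink_component(float g, float t)
    {
      return std::copysign(std::max(std::abs(g) - t, 0.0f), g);
    }

A hypothetical invocation using the options declared above (file names and weights are placeholders): CBCT_reconstruct_SB -a acquisition.hdf5 -b binning.hdf5 --TV 0.02 -i 20.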
diff --git a/apps/standalone/gpu/ct/xray/CMakeLists.txt b/apps/standalone/gpu/ct/xray/CMakeLists.txt
new file mode 100644
index 0000000..897d32d
--- /dev/null
+++ b/apps/standalone/gpu/ct/xray/CMakeLists.txt
@@ -0,0 +1,64 @@
+find_package(HDF5 REQUIRED HL)
+
+  if(${CUDA_VERSION} VERSION_GREATER "4.99")
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS2} ${CUDA_NVCC_FLAGS3} ${CUDA_NVCC_FLAGS4})
+  else(${CUDA_VERSION} VERSION_GREATER "4.99")    
+
+    if(${CUDA_VERSION} VERSION_GREATER "4.1")
+      set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS2} ${CUDA_NVCC_FLAGS3})
+    else(${CUDA_VERSION} VERSION_GREATER "4.1")
+
+      set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS2})
+
+    endif(${CUDA_VERSION} VERSION_GREATER "4.1")
+  endif(${CUDA_VERSION} VERSION_GREATER "4.99")
+
+include_directories(
+    ${CMAKE_SOURCE_DIR}/toolboxes/core
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+    ${CMAKE_SOURCE_DIR}/toolboxes/solvers/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/operators
+    ${CMAKE_SOURCE_DIR}/toolboxes/operators/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/ct/xray/gpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu
+    ${CUDA_INCLUDE_DIRS}
+    ${Boost_INCLUDE_DIR}
+    ${ARMADILLO_INCLUDE_DIRS}
+    ${HDF5_INCLUDE_DIR}
+    ${HDF5_INCLUDE_DIR}/cpp
+    ${ISMRMRD_INCLUDE_DIR}
+)
+
+link_directories(${Boost_LIBRARY_DIR})
+link_libraries(${HDF5_LIBRARIES})
+
+add_executable(CBCT_reconstruct_FDK_3d CBCT_reconstruct_FDK_3d.cpp)
+target_link_libraries(CBCT_reconstruct_FDK_3d gadgetron_toolbox_gpuxray gadgetron_toolbox_cpucore gadgetron_toolbox_cpucore_math gadgetron_toolbox_gpucore gadgetron_toolbox_hostutils ${CUDA_LIBRARIES} ${Boost_LIBRARIES})
+
+add_executable(CBCT_reconstruct_FDK_4d CBCT_reconstruct_FDK_4d.cpp)
+target_link_libraries(CBCT_reconstruct_FDK_4d gadgetron_toolbox_gpuxray gadgetron_toolbox_cpucore gadgetron_toolbox_cpucore_math gadgetron_toolbox_gpucore gadgetron_toolbox_hostutils ${CUDA_LIBRARIES} ${Boost_LIBRARIES})
+
+add_executable(CBCT_reconstruct_CG CBCT_reconstruct_CG.cpp)
+target_link_libraries(CBCT_reconstruct_CG gadgetron_toolbox_gpuxray gadgetron_toolbox_cpucore gadgetron_toolbox_cpucore_math gadgetron_toolbox_gpucore gadgetron_toolbox_gpuoperators gadgetron_toolbox_hostutils ${CUDA_LIBRARIES} ${Boost_LIBRARIES})
+
+add_executable(CBCT_reconstruct_NLCG CBCT_reconstruct_NLCG.cpp)
+target_link_libraries(CBCT_reconstruct_NLCG gadgetron_toolbox_gpuxray gadgetron_toolbox_cpucore gadgetron_toolbox_cpucore_math gadgetron_toolbox_gpucore gadgetron_toolbox_gpuoperators gadgetron_toolbox_hostutils ${CUDA_LIBRARIES} ${Boost_LIBRARIES})
+
+add_executable(CBCT_reconstruct_SB CBCT_reconstruct_SB.cpp)
+target_link_libraries(CBCT_reconstruct_SB gadgetron_toolbox_gpuxray gadgetron_toolbox_cpucore gadgetron_toolbox_cpucore_math gadgetron_toolbox_gpucore gadgetron_toolbox_gpuoperators gadgetron_toolbox_hostutils ${CUDA_LIBRARIES} ${Boost_LIBRARIES})
+
+add_executable(CBCT_forwards_projection CBCT_forwards_projection.cpp)
+target_link_libraries(CBCT_forwards_projection gadgetron_toolbox_gpuxray gadgetron_toolbox_cpucore gadgetron_toolbox_cpucore_math gadgetron_toolbox_gpucore gadgetron_toolbox_hostutils ${CUDA_LIBRARIES} ${Boost_LIBRARIES})
+
+install(TARGETS 
+  CBCT_reconstruct_FDK_3d
+  CBCT_reconstruct_FDK_4d
+  CBCT_reconstruct_CG
+  CBCT_reconstruct_NLCG
+  CBCT_reconstruct_SB
+  CBCT_forwards_projection
+  DESTINATION bin COMPONENT main)
diff --git a/apps/standalone/gpu/deblurring/2d/CMakeLists.txt b/apps/standalone/gpu/deblurring/2d/CMakeLists.txt
new file mode 100644
index 0000000..435425e
--- /dev/null
+++ b/apps/standalone/gpu/deblurring/2d/CMakeLists.txt
@@ -0,0 +1,14 @@
+if (WIN32)
+ADD_DEFINITIONS(-D_USE_MATH_DEFINES)
+endif (WIN32)
+
+add_executable(blur_2d blur_2d.cpp)
+add_executable(deblur_2d_cg deblur_2d_cg.cpp)
+add_executable(deblur_2d_sb deblur_2d_sb.cpp)
+
+target_link_libraries(deblur_2d_cg gadgetron_toolbox_gpucore gadgetron_toolbox_hostutils gadgetron_toolbox_gpuoperators gadgetron_toolbox_gpusolvers ${CUDA_LIBRARIES})
+target_link_libraries(deblur_2d_sb gadgetron_toolbox_gpucore gadgetron_toolbox_hostutils gadgetron_toolbox_gpuoperators gadgetron_toolbox_gpusolvers ${CUDA_LIBRARIES})
+target_link_libraries(blur_2d gadgetron_toolbox_gpucore gadgetron_toolbox_hostutils gadgetron_toolbox_gpuoperators gadgetron_toolbox_gpusolvers ${CUDA_LIBRARIES})
+
+install(TARGETS blur_2d deblur_2d_cg deblur_2d_sb DESTINATION bin COMPONENT main)
+
diff --git a/apps/standalone/gpu/deblurring/2d/blur_2d.cpp b/apps/standalone/gpu/deblurring/2d/blur_2d.cpp
new file mode 100644
index 0000000..d8a8009
--- /dev/null
+++ b/apps/standalone/gpu/deblurring/2d/blur_2d.cpp
@@ -0,0 +1,111 @@
+/*
+  Example code to blur an image and generate input data for the deblurring apps.
+*/
+
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "parameterparser.h"
+#include "cuConvolutionOperator.h"
+
+#include <iostream>
+#include <math.h>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+
+int main( int argc, char** argv) 
+{
+
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Input image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Output blurred image file name (.cplx)", true, "blurred_image.cplx" );
+  parms.add_parameter( 'k', COMMAND_LINE_STRING, 1, "Output kernel image file name (.cplx)", true, "kernel_image.cplx" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  // Load image from disk (single precision assumed)
+  boost::shared_ptr< hoNDArray<float> > _host_image = 
+    read_nd_array<float>((char*)parms.get_parameter('d')->get_string_value());
+
+  if( !(_host_image->get_number_of_dimensions() == 2) ){
+    cout << endl << "Input image is not two-dimensional. Quitting.\n" << endl;
+    return 1;
+  }
+
+  // Convert to _real
+  hoNDArray<_real> host_image; host_image.create(_host_image->get_dimensions().get()); 
+  for( unsigned int i=0; i<host_image.get_number_of_elements(); i++ )
+    host_image.get_data_ptr()[i] = (_real) _host_image->get_data_ptr()[i];
+    
+  // Upload host image to device, normalize, and convert to complex type
+  cuNDArray<_real> _image(&host_image);
+  normalize( &_image, _real(1) );
+  boost::shared_ptr< cuNDArray<_complext> > image = real_to_complex<_complext>( &_image );
+  
+  // Setup resulting blurred image
+  cuNDArray<_complext> blurred_image; 
+  blurred_image.create(image->get_dimensions().get());
+  
+  // Generate convolution kernel (just do this on the host for now)
+  _real sigma = 2.5;
+  hoNDArray<_real> host_kernel;
+  host_kernel.create(image->get_dimensions().get());
+  for( unsigned int y=0; y<image->get_size(1); y++ ){
+    for( unsigned int x=0; x<image->get_size(0); x++ ){
+      _real biasx = (_real)(image->get_size(0)>>1);
+      _real biasy = (_real)(image->get_size(1)>>1);
+      _real cx = (_real)x-biasx;
+      _real cy = (_real)y-biasy;
+      host_kernel.get_data_ptr()[y*image->get_size(0)+x] = 1.0/(2.0*M_PI*sigma*sigma)*exp(-1.0*((cx*cx)/(2.0*sigma*sigma)+(cy*cy)/(2.0*sigma*sigma)));
+    }
+  }
+
+  cuNDArray<_real> _kernel(&host_kernel);
+  boost::shared_ptr< cuNDArray<_complext> > kernel = real_to_complex<_complext>( &_kernel );
+
+  // Normalize kernel
+  _real scale = asum(kernel.get());
+  *kernel /= scale;
+
+  // Create convolution operator and assign kernel
+  cuConvolutionOperator<_real,2> conv;
+  conv.set_kernel( kernel.get() );  
+
+  // Convolve
+  conv.mult_M( image.get(), &blurred_image );
+
+  //
+  // Output result
+  //
+  
+  boost::shared_ptr< hoNDArray<_complext> > blurred_image_host = blurred_image.to_host();
+  write_nd_array<_complext>( blurred_image_host.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(&blurred_image)->to_host();
+  write_nd_array<_real>( host_norm.get(), "blurred_image.real" );
+
+  boost::shared_ptr< hoNDArray<_complext> > kernel_image_host = kernel->to_host();
+  write_nd_array<_complext>( kernel_image_host.get(), (char*)parms.get_parameter('k')->get_string_value());
+
+  return 0;
+}
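The kernel built above is a Gaussian centred at the image midpoint, and dividing by asum(kernel) makes this nonnegative kernel sum to one, so the convolution preserves the image mean. The same construction stripped of the Gadgetron container types (plain C++, row-major storage; the helper name is illustrative):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    std::vector<float> gaussian_kernel(unsigned int nx, unsigned int ny, float sigma)
    {
      std::vector<float> k(static_cast<std::size_t>(nx) * ny);
      float sum = 0.0f;
      for (unsigned int y = 0; y < ny; y++) {
        for (unsigned int x = 0; x < nx; x++) {
          float cx = static_cast<float>(x) - static_cast<float>(nx >> 1);
          float cy = static_cast<float>(y) - static_cast<float>(ny >> 1);
          float v = std::exp(-(cx * cx + cy * cy) / (2.0f * sigma * sigma));
          k[static_cast<std::size_t>(y) * nx + x] = v;
          sum += v;
        }
      }
      for (float& v : k) v /= sum;  // unit-sum kernel preserves the image mean
      return k;
    }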
diff --git a/apps/standalone/gpu/deblurring/2d/deblur_2d_cg.cpp b/apps/standalone/gpu/deblurring/2d/deblur_2d_cg.cpp
new file mode 100644
index 0000000..cc236ca
--- /dev/null
+++ b/apps/standalone/gpu/deblurring/2d/deblur_2d_cg.cpp
@@ -0,0 +1,109 @@
+/*
+  Deblurring using conjugate gradient solver.
+*/
+
+// Gadgetron includes
+#include "cuNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "cuCgSolver.h"
+#include "cuPartialDerivativeOperator.h"
+#include "cuConvolutionOperator.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+
+int main(int argc, char** argv)
+{
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Blurred image file name (.cplx)", true, "blurred_image.cplx" );
+  parms.add_parameter( 'k', COMMAND_LINE_STRING, 1, "Kernel image file name (.cplx)", true, "kernel_image.cplx" );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "cg_deblurred_image.cplx" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of iterations", true, "25" );
+  parms.add_parameter( 'K', COMMAND_LINE_FLOAT,  1, "Regularization weight", true, "0.1" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running deblurring with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+    
+  // Load sample data from disk
+  boost::shared_ptr< hoNDArray<_complext> > host_data = 
+    read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_complext> > host_kernel = 
+    read_nd_array<_complext>((char*)parms.get_parameter('k')->get_string_value());
+   
+  if( !(host_data->get_number_of_dimensions() == 2) || !(host_kernel->get_number_of_dimensions() == 2) ){
+    cout << endl << "Input data (image/kernel) is not two-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Upload host data to device
+  cuNDArray<_complext> data(host_data.get());
+  cuNDArray<_complext> kernel(host_kernel.get());
+  
+  _real kappa = (_real) parms.get_parameter('K')->get_float_value();
+  unsigned int num_iterations = parms.get_parameter('i')->get_int_value();
+  
+  // Setup regularization operators
+  //
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,2> > Rx( new cuPartialDerivativeOperator<_complext,2>(0) );
+  Rx->set_weight( kappa );
+
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,2> > Ry( new cuPartialDerivativeOperator<_complext,2>(1) );
+  Ry->set_weight( kappa );
+     
+  //
+  // Setup conjugate gradients solver
+  //
+
+  // Define encoding matrix
+  boost::shared_ptr< cuConvolutionOperator<_real,2> > E( new cuConvolutionOperator<_real,2>() );
+  E->set_kernel( &kernel );
+  E->set_domain_dimensions(data.get_dimensions().get());
+
+  // Setup conjugate gradient solver
+  cuCgSolver< _complext> cg;
+  cg.set_encoding_operator( E );                         // encoding matrix
+  if( kappa>0.0 ) cg.add_regularization_operator( Rx );  // regularization matrix
+  if( kappa>0.0 ) cg.add_regularization_operator( Ry );  // regularization matrix
+  cg.set_max_iterations( num_iterations );
+  cg.set_tc_tolerance( 1e-12 );
+  cg.set_output_mode( cuCgSolver< _complext>::OUTPUT_VERBOSE );
+                  
+  //
+  // Conjugate gradient solver
+  //
+  
+  boost::shared_ptr< cuNDArray<_complext> > cgresult = cg.solve( &data );
+
+  // All done, write out the result
+  
+  boost::shared_ptr< hoNDArray<_complext> > host_result = cgresult->to_host();
+  write_nd_array<_complext>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+    
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(cgresult.get())->to_host();
+  write_nd_array<_real>( host_norm.get(), "cg_deblurred_image.real" );  
+
+  return 0;
+}
+
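With encoding operator E and the two partial-derivative regularizers weighted by kappa, a conjugate-gradient deblurring of this kind solves a regularized least-squares problem whose normal equations are, up to the solver's internal weighting convention, (E^H E + kappa (Rx^H Rx + Ry^H Ry)) x = E^H b. A matrix-free sketch of applying such a normal operator (plain C++ callbacks; the cuCgSolver internals are not part of this patch and the names below are illustrative):

    #include <cstddef>
    #include <functional>
    #include <vector>

    using Vec = std::vector<float>;
    using Apply = std::function<void(const Vec&, Vec&)>;  // y = Op(x)

    // y = (E^H E + kappa * sum_i R_i^H R_i) x
    void apply_normal_operator(const Apply& EhE, const std::vector<Apply>& RhR,
                               float kappa, const Vec& x, Vec& y)
    {
      EhE(x, y);
      Vec tmp(x.size());
      for (const Apply& op : RhR) {
        op(x, tmp);
        for (std::size_t i = 0; i < y.size(); ++i) y[i] += kappa * tmp[i];
      }
    }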
diff --git a/apps/standalone/gpu/deblurring/2d/deblur_2d_sb.cpp b/apps/standalone/gpu/deblurring/2d/deblur_2d_sb.cpp
new file mode 100644
index 0000000..8a1824e
--- /dev/null
+++ b/apps/standalone/gpu/deblurring/2d/deblur_2d_sb.cpp
@@ -0,0 +1,129 @@
+/*
+  Deblurring using the split-Bregman solver (with an inner conjugate gradient solver).
+*/
+
+// Gadgetron includes
+#include "cuNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "cuSbcCgSolver.h"
+#include "cuCgSolver.h"
+#include "cuPartialDerivativeOperator.h"
+#include "cuConvolutionOperator.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+
+int main(int argc, char** argv)
+{
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Blurred image file name (.cplx)", true, "blurred_image.cplx" );
+  parms.add_parameter( 'k', COMMAND_LINE_STRING, 1, "Kernel image file name (.cplx)", true, "kernel_image.cplx" );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "sb_deblurred_image.cplx" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of cg iterations", true, "20" );
+  parms.add_parameter( 'I', COMMAND_LINE_INT,    1, "Number of sb inner iterations", true, "1" );
+  parms.add_parameter( 'O', COMMAND_LINE_INT,    1, "Number of sb outer iterations", true, "50" );
+  parms.add_parameter( 'M', COMMAND_LINE_FLOAT,  1, "Mu", true, "100.0" );
+  parms.add_parameter( 'L', COMMAND_LINE_FLOAT,  1, "Lambda", true, "100.0" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running deblurring with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+    
+  // Load sample data from disk
+  boost::shared_ptr< hoNDArray<_complext> > host_data = 
+    read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_complext> > host_kernel = 
+    read_nd_array<_complext>((char*)parms.get_parameter('k')->get_string_value());
+   
+  if( !(host_data->get_number_of_dimensions() == 2) || !(host_kernel->get_number_of_dimensions() == 2) ){
+    cout << endl << "Input data (image/kernel) is not two-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Upload host data to device
+  cuNDArray<_complext> data(host_data.get());
+  cuNDArray<_complext> kernel(host_kernel.get());
+  
+  unsigned int num_cg_iterations = parms.get_parameter('i')->get_int_value();
+  unsigned int num_inner_iterations = parms.get_parameter('I')->get_int_value();
+  unsigned int num_outer_iterations = parms.get_parameter('O')->get_int_value();
+  
+  // Setup regularization operators
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,2> > Rx( new cuPartialDerivativeOperator<_complext,2>(0) );
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,2> > Ry( new cuPartialDerivativeOperator<_complext,2>(1) );
+  
+  _real mu = (_real) parms.get_parameter('M')->get_float_value();
+  _real lambda = (_real) parms.get_parameter('L')->get_float_value();
+
+  if( mu <= (_real) 0.0 ) {
+    cout << endl << "Regularization parameter mu should be strictly positive. Quitting!\n" << endl;
+    return 1;
+  }
+
+  Rx->set_weight( lambda );
+  Rx->set_domain_dimensions(data.get_dimensions().get());
+  Rx->set_codomain_dimensions(data.get_dimensions().get());
+
+  Ry->set_weight( lambda );
+  Ry->set_domain_dimensions(data.get_dimensions().get());
+  Ry->set_codomain_dimensions(data.get_dimensions().get());
+
+  //
+  // Setup conjugate gradients solver
+  //
+
+  // Define encoding matrix
+  boost::shared_ptr< cuConvolutionOperator<_real,2> > E( new cuConvolutionOperator<_real,2>() );  
+  E->set_kernel( &kernel );
+  E->set_weight( mu );
+  E->set_domain_dimensions(data.get_dimensions().get());
+  E->set_codomain_dimensions(data.get_dimensions().get());
+
+  // Setup split-Bregman solver
+  cuSbcCgSolver<_complext> sb;
+  sb.set_encoding_operator( E );
+  sb.add_regularization_group_operator( Rx ); 
+  sb.add_regularization_group_operator( Ry ); 
+  sb.add_group();
+  sb.set_max_outer_iterations(num_outer_iterations);
+  sb.set_max_inner_iterations(num_inner_iterations);
+  sb.set_output_mode( cuSbcCgSolver<_complext>::OUTPUT_VERBOSE );
+
+  sb.get_inner_solver()->set_max_iterations( num_cg_iterations );
+  sb.get_inner_solver()->set_tc_tolerance( 1e-4 );
+  sb.get_inner_solver()->set_output_mode( cuCgSolver<_complext>::OUTPUT_WARNINGS );
+
+  // Run split-Bregman solver
+  boost::shared_ptr< cuNDArray<_complext> > sbresult = sb.solve(&data);
+
+  // All done, write out the result
+  
+  boost::shared_ptr< hoNDArray<_complext> > host_result = sbresult->to_host();
+  write_nd_array<_complext>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+    
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(sbresult.get())->to_host();
+  write_nd_array<_real>( host_norm.get(), "sb_deblurred_image.real" );  
+
+  return 0;
+}
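The difference from the conjugate-gradient variant above lies in the regularizer: the CG app applies a quadratic (Tikhonov-style) penalty on the image gradient, whereas this split-Bregman app drives an L1 penalty on it, i.e. total variation, which is what preserves edges while removing blur and noise. A small sketch of the two penalties for a real-valued nx-by-ny image (plain C++, forward differences with zero boundary, illustrative only):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    void gradient_penalties(const std::vector<float>& u, std::size_t nx, std::size_t ny,
                            float& tikhonov, float& tv_aniso)
    {
      tikhonov = 0.0f;
      tv_aniso = 0.0f;
      for (std::size_t y = 0; y < ny; ++y) {
        for (std::size_t x = 0; x < nx; ++x) {
          float gx = (x + 1 < nx) ? u[y * nx + x + 1] - u[y * nx + x] : 0.0f;
          float gy = (y + 1 < ny) ? u[(y + 1) * nx + x] - u[y * nx + x] : 0.0f;
          tikhonov += gx * gx + gy * gy;              // quadratic penalty (CG app)
          tv_aniso += std::fabs(gx) + std::fabs(gy);  // L1 penalty (this app)
        }
      }
    }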
diff --git a/apps/standalone/gpu/deblurring/3d/CMakeLists.txt b/apps/standalone/gpu/deblurring/3d/CMakeLists.txt
new file mode 100644
index 0000000..01b3825
--- /dev/null
+++ b/apps/standalone/gpu/deblurring/3d/CMakeLists.txt
@@ -0,0 +1,13 @@
+if (WIN32)
+ADD_DEFINITIONS(-D_USE_MATH_DEFINES)
+endif (WIN32)
+
+add_executable(blur_3d blur_3d.cpp)
+add_executable(deblur_3d_cg deblur_3d_cg.cpp)
+add_executable(deblur_3d_sb deblur_3d_sb.cpp)
+
+target_link_libraries(deblur_3d_cg gadgetron_toolbox_gpucore gadgetron_toolbox_hostutils gadgetron_toolbox_gpuoperators gadgetron_toolbox_gpusolvers gadgetron_toolbox_gpunfft ${CUDA_LIBRARIES})
+target_link_libraries(deblur_3d_sb gadgetron_toolbox_gpucore gadgetron_toolbox_hostutils gadgetron_toolbox_gpuoperators gadgetron_toolbox_gpusolvers gadgetron_toolbox_gpunfft ${CUDA_LIBRARIES})
+target_link_libraries(blur_3d gadgetron_toolbox_gpucore gadgetron_toolbox_hostutils gadgetron_toolbox_gpuoperators gadgetron_toolbox_gpusolvers gadgetron_toolbox_gpunfft ${CUDA_LIBRARIES})
+
+install(TARGETS blur_3d deblur_3d_cg deblur_3d_sb DESTINATION bin COMPONENT main)
diff --git a/apps/standalone/gpu/deblurring/3d/blur_3d.cpp b/apps/standalone/gpu/deblurring/3d/blur_3d.cpp
new file mode 100644
index 0000000..6942c76
--- /dev/null
+++ b/apps/standalone/gpu/deblurring/3d/blur_3d.cpp
@@ -0,0 +1,113 @@
+/*
+  Example code to blur an image and generate input data for the deblurring apps.
+*/
+
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_blas.h"
+#include "cuNDArray_elemwise.h"
+#include "parameterparser.h"
+#include "cuConvolutionOperator.h"
+#include <iostream>
+#include <math.h>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+
+int main( int argc, char** argv) 
+{
+
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Input image file name (.real)", true );
+  parms.add_parameter( 'k', COMMAND_LINE_STRING, 1, "Input kernel image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Output image file name (.cplx)", true, "blurred_image.cplx" );
+  parms.add_parameter( 'K', COMMAND_LINE_STRING, 1, "Output kernel file name (.cplx)", true, "kernel_image.cplx" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  // Load image and kernel from disk (single precision assumed)
+  //
+  boost::shared_ptr< hoNDArray<float> > _host_image = 
+    read_nd_array<float>((char*)parms.get_parameter('d')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<float> > _host_kernel = 
+    read_nd_array<float>((char*)parms.get_parameter('k')->get_string_value());
+
+  if( !(_host_image->get_number_of_dimensions() == 3) ){
+    cout << endl << "Input image is not three-dimensional. Quitting.\n" << endl;
+    return 1;
+  }
+
+  if( !(_host_kernel->get_number_of_dimensions() == 3) ){
+    cout << endl << "Input kernel is not three-dimensional. Quitting.\n" << endl;
+    return 1;
+  }
+
+  // Convert image and kernel to _real
+  //
+  hoNDArray<_real> host_image; host_image.create(_host_image->get_dimensions().get()); 
+  for( unsigned int i=0; i<host_image.get_number_of_elements(); i++ )
+    host_image.get_data_ptr()[i] = (_real) _host_image->get_data_ptr()[i];
+    
+  hoNDArray<_real> host_kernel; host_kernel.create(_host_kernel->get_dimensions().get()); 
+  for( unsigned int i=0; i<host_kernel.get_number_of_elements(); i++ )
+    host_kernel.get_data_ptr()[i] = (_real) _host_kernel->get_data_ptr()[i];
+
+  // Upload host image/kernel and convert to complex type
+  //
+  cuNDArray<_real> _image(&host_image);
+  boost::shared_ptr< cuNDArray<_complext> > image = real_to_complex<_complext>( &_image );
+  
+  cuNDArray<_real> _kernel(&host_kernel);
+  boost::shared_ptr< cuNDArray<_complext> > kernel = real_to_complex<_complext>( &_kernel );
+
+  // Normalize kernel
+  _real scale = asum(kernel.get());
+  *kernel /= scale;
+
+  // Setup resulting blurred image
+  cuNDArray<_complext> blurred_image;
+  blurred_image.create(image->get_dimensions().get());
+  
+  // Create convolution operator and assign kernel
+  cuConvolutionOperator<_real,3> conv;
+  conv.set_kernel( kernel.get() );  
+
+  // Convolve
+  conv.mult_M( image.get(), &blurred_image );
+
+  //
+  // Output result
+  //
+  
+  boost::shared_ptr< hoNDArray<_complext> > blurred_image_host = blurred_image.to_host();
+  write_nd_array<_complext>( blurred_image_host.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(&blurred_image)->to_host();
+  write_nd_array<_real>( host_norm.get(), "blurred_image.real" );
+
+  boost::shared_ptr< hoNDArray<_complext> > kernel_image_host = kernel->to_host();
+  write_nd_array<_complext>( kernel_image_host.get(), (char*)parms.get_parameter('K')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_norm_kernel = abs(kernel.get())->to_host();
+  write_nd_array<_real>( host_norm_kernel.get(), "kernel_image.real" );
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/deblurring/3d/deblur_3d_cg.cpp b/apps/standalone/gpu/deblurring/3d/deblur_3d_cg.cpp
new file mode 100644
index 0000000..d75fe64
--- /dev/null
+++ b/apps/standalone/gpu/deblurring/3d/deblur_3d_cg.cpp
@@ -0,0 +1,114 @@
+/*
+  Deblurring using conjugate gradient solver.
+*/
+
+// Gadgetron includes
+#include "cuNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "cuCgSolver.h"
+#include "cuPartialDerivativeOperator.h"
+#include "cuConvolutionOperator.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+
+int main(int argc, char** argv)
+{
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Blurred image file name (.cplx)", true, "blurred_image.cplx" );
+  parms.add_parameter( 'k', COMMAND_LINE_STRING, 1, "Kernel image file name (.cplx)", true, "kernel_image.cplx" );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "cg_deblurred_image.cplx" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of iterations", true, "25" );
+  parms.add_parameter( 'K', COMMAND_LINE_FLOAT,  1, "Regularization weight", true, "0.1" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running deblurring with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+    
+  // Load sample data from disk
+  boost::shared_ptr< hoNDArray<_complext> > host_data = 
+    read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_complext> > host_kernel = 
+    read_nd_array<_complext>((char*)parms.get_parameter('k')->get_string_value());
+   
+  if( !(host_data->get_number_of_dimensions() == 3) || !(host_kernel->get_number_of_dimensions() == 3) ){
+    cout << endl << "Input data (image/kernel) is not three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Upload host data to device
+  cuNDArray<_complext> data(host_data.get());
+  cuNDArray<_complext> kernel(host_kernel.get());
+  
+  _real kappa = (_real) parms.get_parameter('K')->get_float_value();
+  unsigned int num_iterations = parms.get_parameter('i')->get_int_value();
+  
+  // Setup regularization operators
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> > Rx( new cuPartialDerivativeOperator<_complext,3>(0) );
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> > Ry( new cuPartialDerivativeOperator<_complext,3>(1) );
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> > Rz( new cuPartialDerivativeOperator<_complext,3>(2) );
+
+  Rx->set_weight( kappa );
+  Ry->set_weight( kappa );
+  Rz->set_weight( kappa );
+     
+  //
+  // Setup conjugate gradients solver
+  //
+
+  // Define encoding matrix
+  boost::shared_ptr< cuConvolutionOperator<_real,3> > E( new cuConvolutionOperator<_real,3>() );
+  E->set_kernel( &kernel );
+  E->set_domain_dimensions(data.get_dimensions().get());
+    
+  // Setup conjugate gradient solver
+  cuCgSolver<_complext> cg;
+  cg.set_encoding_operator( E );                         // encoding matrix
+  if( kappa>0.0 ) cg.add_regularization_operator( Rx );  // regularization matrix
+  if( kappa>0.0 ) cg.add_regularization_operator( Ry );  // regularization matrix
+  if( kappa>0.0 ) cg.add_regularization_operator( Rz );  // regularization matrix
+  cg.set_max_iterations( num_iterations );
+  cg.set_tc_tolerance( 1e-12 );
+  cg.set_output_mode( cuCgSolver<_complext>::OUTPUT_VERBOSE );
+                
+  // Form right hand side
+  cuNDArray<_complext> rhs; rhs.create(data.get_dimensions().get());
+  E->mult_MH( &data, &rhs );
+  
+  //
+  // Conjugate gradient solver
+  //
+  
+  boost::shared_ptr< cuNDArray<_complext> > cgresult = cg.solve_from_rhs(&rhs);
+
+  // All done, write out the result
+  
+  boost::shared_ptr< hoNDArray<_complext> > host_result = cgresult->to_host();
+  write_nd_array<_complext>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+    
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(cgresult.get())->to_host();
+  write_nd_array<_real>( host_norm.get(), "cg_deblurred_image.real" );  
+
+  return 0;
+}
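Unlike the 2d CG app, which hands the measured data straight to cg.solve(), this file forms the right-hand side E^H b itself via mult_MH and calls solve_from_rhs. Assuming solve() applies the adjoint internally before iterating (consistent with how the two apps are written; the solver internals are not shown in this patch), both entry points target the same normal equations. A matrix-free sketch of that relationship (plain C++, illustrative names):

    #include <cstddef>
    #include <functional>
    #include <vector>

    using Vec = std::vector<float>;
    using ApplyAdjoint = std::function<void(const Vec&, Vec&)>;  // rhs = E^H b
    using SolveFromRhs = std::function<Vec(const Vec&)>;

    // "solve(b)" expressed in terms of "solve_from_rhs(E^H b)".
    Vec solve_via_rhs(const ApplyAdjoint& adjointE, std::size_t image_size,
                      const SolveFromRhs& solve_from_rhs, const Vec& b)
    {
      Vec rhs(image_size);  // the right-hand side lives in image space
      adjointE(b, rhs);
      return solve_from_rhs(rhs);
    }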
diff --git a/apps/standalone/gpu/deblurring/3d/deblur_3d_sb.cpp b/apps/standalone/gpu/deblurring/3d/deblur_3d_sb.cpp
new file mode 100644
index 0000000..348e640
--- /dev/null
+++ b/apps/standalone/gpu/deblurring/3d/deblur_3d_sb.cpp
@@ -0,0 +1,135 @@
+/*
+  Deblurring using the split-Bregman solver (with an inner conjugate gradient solver).
+*/
+
+// Gadgetron includes
+#include "cuNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "cuSbcCgSolver.h"
+#include "cuCgSolver.h"
+#include "cuPartialDerivativeOperator.h"
+#include "cuConvolutionOperator.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+
+int main(int argc, char** argv)
+{
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Blurred image file name (.cplx)", true, "blurred_image.cplx" );
+  parms.add_parameter( 'k', COMMAND_LINE_STRING, 1, "Kernel image file name (.cplx)", true, "kernel_image.cplx" );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "sb_deblurred_image.cplx" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of cg iterations", true, "20" );
+  parms.add_parameter( 'I', COMMAND_LINE_INT,    1, "Number of sb inner iterations", true, "1" );
+  parms.add_parameter( 'O', COMMAND_LINE_INT,    1, "Number of sb outer iterations", true, "50" );
+  parms.add_parameter( 'M', COMMAND_LINE_FLOAT,  1, "Mu", true, "1.0" );
+  parms.add_parameter( 'L', COMMAND_LINE_FLOAT,  1, "Lambda", true, "1.0" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running deblurring with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+    
+  // Load sample data from disk
+  boost::shared_ptr< hoNDArray<_complext> > host_data = 
+    read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_complext> > host_kernel = 
+    read_nd_array<_complext>((char*)parms.get_parameter('k')->get_string_value());
+   
+  if( !(host_data->get_number_of_dimensions() == 3) || !(host_kernel->get_number_of_dimensions() == 3) ){
+    cout << endl << "Input data (image/kernel) is not three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Upload host data to device
+  cuNDArray<_complext> data(host_data.get());
+  cuNDArray<_complext> kernel(host_kernel.get());
+  
+  unsigned int num_cg_iterations = parms.get_parameter('i')->get_int_value();
+  unsigned int num_inner_iterations = parms.get_parameter('I')->get_int_value();
+  unsigned int num_outer_iterations = parms.get_parameter('O')->get_int_value();
+  
+  // Setup regularization operators
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> > Rx( new cuPartialDerivativeOperator<_complext,3>(0) );
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> > Ry( new cuPartialDerivativeOperator<_complext,3>(1) );
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> > Rz( new cuPartialDerivativeOperator<_complext,3>(2) );
+  
+  _real mu = (_real) parms.get_parameter('M')->get_float_value();
+  _real lambda = (_real) parms.get_parameter('L')->get_float_value();
+
+  if( mu <= (_real) 0.0 ) {
+    cout << endl << "Regularization parameter mu should be strictly positive. Quitting!\n" << endl;
+    return 1;
+  }
+
+  Rx->set_weight( lambda );
+  Rx->set_domain_dimensions(data.get_dimensions().get());
+  Rx->set_codomain_dimensions(data.get_dimensions().get());
+
+  Ry->set_weight( lambda );
+  Ry->set_domain_dimensions(data.get_dimensions().get());
+  Ry->set_codomain_dimensions(data.get_dimensions().get());
+
+  Rz->set_weight( lambda );
+  Rz->set_domain_dimensions(data.get_dimensions().get());
+  Rz->set_codomain_dimensions(data.get_dimensions().get());
+
+  //
+  // Setup conjugate gradients solver
+  //
+
+  // Define encoding matrix
+  boost::shared_ptr< cuConvolutionOperator<_real,3> > E( new cuConvolutionOperator<_real,3>() );  
+  E->set_kernel( &kernel );
+  E->set_weight( mu );
+  E->set_domain_dimensions(data.get_dimensions().get());
+  E->set_codomain_dimensions(data.get_dimensions().get());
+  
+  // Setup split-Bregman solver
+  cuSbcCgSolver<_complext> sb;
+  sb.set_encoding_operator( E );
+  sb.add_regularization_group_operator( Rx ); 
+  sb.add_regularization_group_operator( Ry ); 
+  sb.add_group();
+  sb.add_regularization_operator( Rz ); 
+  sb.set_max_outer_iterations(num_outer_iterations);
+  sb.set_max_inner_iterations(num_inner_iterations);
+  sb.set_output_mode( cuSbcCgSolver< _complext>::OUTPUT_VERBOSE );
+
+  sb.get_inner_solver()->set_max_iterations( num_cg_iterations );
+  sb.get_inner_solver()->set_tc_tolerance( 1e-8 );
+  sb.get_inner_solver()->set_output_mode( cuCgSolver<_complext>::OUTPUT_WARNINGS );
+
+  // Run split-Bregman solver
+  boost::shared_ptr< cuNDArray<_complext> > sbresult = sb.solve(&data);
+
+  // All done, write out the result
+  
+  boost::shared_ptr< hoNDArray<_complext> > host_result = sbresult->to_host();
+  write_nd_array<_complext>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+    
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(sbresult.get())->to_host();
+  write_nd_array<_real>( host_norm.get(), "sb_deblurred_image.real" );  
+
+  return 0;
+}
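Note the mixed configuration here: Rx and Ry form a regularization group (joint, in-plane isotropic TV) while Rz is added afterwards as a stand-alone operator (independent, through-plane anisotropic TV). Written out as a penalty on a row-major nx*ny*nz volume, this is roughly lambda times the sum over voxels of sqrt(|dx u|^2 + |dy u|^2) + |dz u|. A small evaluation sketch (plain C++, forward differences with zero boundary; this illustrates the penalty, not the solver's internal formulation):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    float mixed_tv_penalty(const std::vector<float>& u,
                           std::size_t nx, std::size_t ny, std::size_t nz, float lambda)
    {
      float p = 0.0f;
      for (std::size_t z = 0; z < nz; ++z)
        for (std::size_t y = 0; y < ny; ++y)
          for (std::size_t x = 0; x < nx; ++x) {
            std::size_t i = (z * ny + y) * nx + x;
            float gx = (x + 1 < nx) ? u[i + 1] - u[i] : 0.0f;
            float gy = (y + 1 < ny) ? u[i + nx] - u[i] : 0.0f;
            float gz = (z + 1 < nz) ? u[i + nx * ny] - u[i] : 0.0f;
            p += std::sqrt(gx * gx + gy * gy) + std::fabs(gz);  // grouped x/y, separate z
          }
      return lambda * p;
    }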
diff --git a/apps/standalone/gpu/deblurring/CMakeLists.txt b/apps/standalone/gpu/deblurring/CMakeLists.txt
new file mode 100644
index 0000000..5550044
--- /dev/null
+++ b/apps/standalone/gpu/deblurring/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(2d)
+add_subdirectory(3d)
diff --git a/apps/standalone/gpu/denoising/2d/CMakeLists.txt b/apps/standalone/gpu/denoising/2d/CMakeLists.txt
new file mode 100644
index 0000000..a9d8501
--- /dev/null
+++ b/apps/standalone/gpu/denoising/2d/CMakeLists.txt
@@ -0,0 +1,10 @@
+if (WIN32)
+ADD_DEFINITIONS(-D_USE_MATH_DEFINES)
+endif (WIN32)
+
+include_directories(${CMAKE_SOURCE_DIR}/toolboxes/dwt/gpu)
+add_executable(denoise_TV denoise_TV.cpp)
+
+target_link_libraries(denoise_TV gadgetron_toolbox_gpudwt gadgetron_toolbox_gpucore gadgetron_toolbox_hostutils gadgetron_toolbox_gpusolvers gadgetron_toolbox_gpuoperators ${CUDA_LIBRARIES})
+
+install(TARGETS denoise_TV DESTINATION bin COMPONENT main)
diff --git a/apps/standalone/gpu/denoising/2d/denoise_TV.cpp b/apps/standalone/gpu/denoising/2d/denoise_TV.cpp
new file mode 100644
index 0000000..51ea43e
--- /dev/null
+++ b/apps/standalone/gpu/denoising/2d/denoise_TV.cpp
@@ -0,0 +1,154 @@
+/*
+  Total variation denoising based on the paper 
+  "The Split Bregman Method for L1-Regularized Problems" by Tom Goldstein and Stanley Osher. 
+  Siam J. Imaging Sciences. Vol. 2, No. 2, pp. 323-343.
+*/
+
+// Gadgetron includes
+#include "cuNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "cuSbCgSolver.h"
+#include "cuCgSolver.h"
+#include "identityOperator.h"
+#include "cuPartialDerivativeOperator.h"
+#include "parameterparser.h"
+#include "cuNDDWT.h"
+#include "cuDWTOperator.h"
+#include <boost/make_shared.hpp>
+// Std includes
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+
+int main(int argc, char** argv)
+{
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Noisy image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "denoised_image_TV.real" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of cg iterations", true, "20" );
+  parms.add_parameter( 'I', COMMAND_LINE_INT,    1, "Number of sb inner iterations", true, "1" );
+  parms.add_parameter( 'O', COMMAND_LINE_INT,    1, "Number of sb outer iterations", true, "10" );
+  parms.add_parameter( 'l', COMMAND_LINE_FLOAT,  1, "Total variation weight (lambda)", true, "50.0" );
+  parms.add_parameter( 'm', COMMAND_LINE_FLOAT,  1, "Regularization weight (mu)", true, "25.0" );
+  parms.add_parameter( 'w', COMMAND_LINE_FLOAT,  1, "Wavelet weight", true, "0" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running denoising with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+    
+  // Load sample data from disk
+  boost::shared_ptr< hoNDArray<_real> > host_data = 
+    read_nd_array<_real>((char*)parms.get_parameter('d')->get_string_value());
+
+  if( !host_data.get() ){
+    cout << endl << "Input image not found. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  if( host_data->get_number_of_dimensions() != 2 ){
+    cout << endl << "Input image is not two-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  // Upload host data to device
+  cuNDArray<_real> data(host_data.get());
+  
+  _real mu = (_real) parms.get_parameter('m')->get_float_value();
+  _real lambda = (_real)parms.get_parameter('l')->get_float_value();
+
+  if( mu <= (_real) 0.0 ) {
+    cout << endl << "Regularization parameter mu should be strictly positive. Quitting!\n" << endl;
+    return 1;
+  }
+
+  unsigned int num_cg_iterations = parms.get_parameter('i')->get_int_value();
+  unsigned int num_inner_iterations = parms.get_parameter('I')->get_int_value();
+  unsigned int num_outer_iterations = parms.get_parameter('O')->get_int_value();
+  
+ // Define encoding operator (identity)
+  boost::shared_ptr< identityOperator<cuNDArray<_real> > > E( new identityOperator<cuNDArray<_real> >() );
+  E->set_weight( mu );
+  E->set_domain_dimensions(data.get_dimensions().get());
+  E->set_codomain_dimensions(data.get_dimensions().get());
+
+  // Setup split-Bregman solver
+  cuSbCgSolver<_real> sb;
+  sb.set_encoding_operator( E );
+  sb.set_max_outer_iterations(num_outer_iterations);
+  sb.set_max_inner_iterations(num_inner_iterations);
+  sb.set_output_mode( cuCgSolver<_real>::OUTPUT_VERBOSE );
+  // Setup regularization operators
+
+  if (lambda > 0){
+    boost::shared_ptr< cuPartialDerivativeOperator<_real,2> > Rx( new cuPartialDerivativeOperator<_real,2>(0) );
+    Rx->set_weight( lambda );
+    Rx->set_domain_dimensions(data.get_dimensions().get());
+    Rx->set_codomain_dimensions(data.get_dimensions().get());
+
+    boost::shared_ptr< cuPartialDerivativeOperator<_real,2> > Ry( new cuPartialDerivativeOperator<_real,2>(1) );
+    Ry->set_weight( lambda );
+    Ry->set_domain_dimensions(data.get_dimensions().get());
+    Ry->set_codomain_dimensions(data.get_dimensions().get());
+    //sb.add_regularization_operator( Rx ); // Anisotropic denoising
+    //sb.add_regularization_operator( Ry ); // Anisotropic denoising
+    sb.add_regularization_group_operator( Rx ); // Isotropic denoising
+    sb.add_regularization_group_operator( Ry ); // Isotropic denoising
+    sb.add_group();
+  }
+  
+  _real wavelet = parms.get_parameter('w')->get_float_value();
+  if (wavelet > 0){
+	  auto dwt = boost::make_shared<cuDWTOperator<_real,2>>();
+	  dwt->set_levels(3);
+	  dwt->set_weight(wavelet);
+	  sb.add_regularization_operator(dwt);
+	  dwt->set_domain_dimensions(data.get_dimensions().get());
+	  dwt->set_codomain_dimensions(data.get_dimensions().get());
+	  dwt->use_random(true);
+  }
+
+  // Setup inner conjugate gradient solver
+  sb.get_inner_solver()->set_max_iterations( num_cg_iterations );
+  sb.get_inner_solver()->set_tc_tolerance( 1e-4 );
+  sb.get_inner_solver()->set_output_mode( cuCgSolver<_real>::OUTPUT_WARNINGS );
+
+  // Run split-Bregman solver
+  boost::shared_ptr< cuNDArray<_real> > sbresult = sb.solve(&data);
+
+  /*
+  boost::shared_ptr< cuNDArray<_real> > sbresult(new cuNDArray<_real>(data.get_dimensions()));
+  clear(sbresult.get());
+
+  vector_td<float,4> daubechies4({0.6830127f,1.1830127f,0.3169873f,-0.1830127f});
+  vector_td<float,2> haahr(1.0f,1.0f);
+  vector_td<float,6> daubechies6{0.47046721f,1.14111692f,0.650365f,-0.19093442f, -0.12083221f,0.0498175f};
+
+  cuDWTOperator<float,2> dwt;
+  dwt.set_levels(3);
+  dwt.mult_M(&data,sbresult.get());
+  //data = *sbresult;
+  shrink1(sbresult.get(),30.0f,&data);
+  dwt.mult_MH(&data,sbresult.get());*/
+  //clear(sbresult.get());
+ // All done, write out the result
+  boost::shared_ptr< hoNDArray<_real> > host_result = sbresult->to_host();
+  write_nd_array<_real>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+  
+  return 0;
+}
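Because the encoding operator is the identity, the data term compares the reconstruction directly with the noisy input, and the problem being minimised is, up to the solver's internal weighting conventions, mu/2 ||u - f||^2 + lambda TV(u) + w ||W u||_1, with the wavelet term only present when the 'w' parameter is positive (treating use_random(true) as randomising the DWT shift between iterations is an assumption; that behaviour is not shown in this patch). A toy 1D evaluation of the objective, using a single-level Haar transform as a stand-in for the DWT operator (plain C++, illustrative only):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    float denoise_objective(const std::vector<float>& u, const std::vector<float>& f,
                            float mu, float lambda, float w)
    {
      float data = 0.0f, tv = 0.0f, wav = 0.0f;
      for (std::size_t i = 0; i < u.size(); ++i) {
        float d = u[i] - f[i];
        data += d * d;                                           // ||u - f||^2
        if (i + 1 < u.size()) tv += std::fabs(u[i + 1] - u[i]);  // 1D total variation
      }
      // Single-level Haar transform as a stand-in for W (odd trailing sample ignored).
      for (std::size_t i = 0; i + 1 < u.size(); i += 2) {
        float a = (u[i] + u[i + 1]) / std::sqrt(2.0f);
        float b = (u[i] - u[i + 1]) / std::sqrt(2.0f);
        wav += std::fabs(a) + std::fabs(b);
      }
      return 0.5f * mu * data + lambda * tv + w * wav;
    }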
diff --git a/apps/standalone/gpu/denoising/CMakeLists.txt b/apps/standalone/gpu/denoising/CMakeLists.txt
new file mode 100644
index 0000000..5c4cec9
--- /dev/null
+++ b/apps/standalone/gpu/denoising/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(2d)
diff --git a/apps/standalone/gpu/mri/CMakeLists.txt b/apps/standalone/gpu/mri/CMakeLists.txt
new file mode 100644
index 0000000..866ed62
--- /dev/null
+++ b/apps/standalone/gpu/mri/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(nfft)
+add_subdirectory(sense)
diff --git a/apps/standalone/gpu/mri/nfft/2d/CMakeLists.txt b/apps/standalone/gpu/mri/nfft/2d/CMakeLists.txt
new file mode 100644
index 0000000..36fd0a5
--- /dev/null
+++ b/apps/standalone/gpu/mri/nfft/2d/CMakeLists.txt
@@ -0,0 +1,17 @@
+include_directories(
+  ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+)
+
+add_executable(nfft_2d_radial main_nfft.cpp)
+add_executable(nffth_2d_radial main_nffth.cpp)
+#add_executable(moco moco.cpp)
+add_executable(nffth_cg_2d_radial main_cg.cpp)
+add_executable(nffth_sb_2d_radial main_sb.cpp)
+
+target_link_libraries(nfft_2d_radial gadgetron_toolbox_gpucore gadgetron_toolbox_gpuoperators gadgetron_toolbox_gpunfft gadgetron_toolbox_hostutils ${CUDA_LIBRARIES})
+target_link_libraries(nffth_2d_radial gadgetron_toolbox_gpucore gadgetron_toolbox_gpuoperators gadgetron_toolbox_gpunfft gadgetron_toolbox_hostutils ${CUDA_LIBRARIES})
+#target_link_libraries(moco gpusolvers gpureg gpucore gpuparallelmri gpuoperators gpunfft hostutils ${CUDA_LIBRARIES})
+target_link_libraries(nffth_cg_2d_radial gadgetron_toolbox_gpusolvers gadgetron_toolbox_gpuoperators gadgetron_toolbox_gpucore gadgetron_toolbox_gpunfft gadgetron_toolbox_hostutils ${CUDA_LIBRARIES})
+target_link_libraries(nffth_sb_2d_radial gadgetron_toolbox_gpusolvers gadgetron_toolbox_gpuoperators gadgetron_toolbox_gpucore gadgetron_toolbox_gpunfft gadgetron_toolbox_hostutils ${CUDA_LIBRARIES})
+
+install(TARGETS nfft_2d_radial nffth_2d_radial nffth_cg_2d_radial nffth_sb_2d_radial DESTINATION bin COMPONENT main)
diff --git a/apps/standalone/gpu/mri/nfft/2d/main_cg.cpp b/apps/standalone/gpu/mri/nfft/2d/main_cg.cpp
new file mode 100644
index 0000000..a6040fc
--- /dev/null
+++ b/apps/standalone/gpu/mri/nfft/2d/main_cg.cpp
@@ -0,0 +1,141 @@
+/*
+  
+  Sample application of the NFFT toolbox: using the NFFT matrix operator in a conjugate gradient solver
+  
+*/
+
+#include "cuNFFT.h"
+#include "radial_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDArray.h"
+#include "vector_td_utilities.h"
+#include "parameterparser.h"
+#include "cuNFFTOperator.h"
+#include "cuCgSolver.h"
+#include "GPUTimer.h"
+
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+
+int main( int argc, char** argv) 
+{
+
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Input samples file name (.cplx)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Output image file name (.cplx)", true, "result.cplx" );
+  parms.add_parameter( 'm', COMMAND_LINE_INT,    1, "Matrix size", true );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of iterations", true, "10" );
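+
+  // Example invocation (editor's note; file names and parameter values are hypothetical):
+  //   nffth_cg_2d_radial -d samples.cplx -m 192 -o 256 -i 10 -r result.cplx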
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running reconstruction with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  GPUTimer *timer;
+  
+  // Load sample data from disk
+  timer = new GPUTimer("Loading samples from disk");
+  boost::shared_ptr< hoNDArray<_complext> > host_samples = read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+  delete timer;
+   
+  if( !(host_samples->get_number_of_dimensions() == 2) ){
+    cout << endl << "Samples ndarray is not two-dimensional (samples/profile x #profiles). Quitting.\n" << endl;
+    return 1;
+  }
+  
+  // Configuration from the command line
+  uint64d2 matrix_size = uint64d2(parms.get_parameter('m')->get_int_value(), parms.get_parameter('m')->get_int_value());
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+  unsigned int num_iterations = parms.get_parameter('i')->get_int_value();
+
+  unsigned int num_profiles = host_samples->get_size(1);
+  unsigned int samples_per_profile = host_samples->get_size(0);  
+  _real alpha = (_real)matrix_size_os.vec[0]/(_real)matrix_size.vec[0];
+
+  // Upload host data to device
+  timer = new GPUTimer("Uploading samples to device");
+  cuNDArray<_complext> samples(host_samples.get());
+  delete timer;
+  
+  // Compute trajectories
+  timer = new GPUTimer("Computing golden ratio radial trajectories");
+  boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>( samples_per_profile, num_profiles,  1 );
+  delete timer;
+
+  // Compute density compensation weights
+  timer = new GPUTimer("Computing density compensation weights");
+  boost::shared_ptr< cuNDArray<_real> > dcw = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, num_profiles, alpha, _real(1)/((_real)samples_per_profile/(_real)matrix_size.vec[0]) );
+  sqrt_inplace(dcw.get());
+  delete timer;
+
+  samples *= *dcw;
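+  // Editor's note: the weights were square-rooted above so that the density compensation is
+  // effectively split between the measured data (scaled here) and the NFFT encoding operator,
+  // which receives the same square-rooted weights via set_dcw() below.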
+
+  // Define and setup NFFT encoding operator
+  boost::shared_ptr< cuNFFTOperator<_real,2> > E( new cuNFFTOperator<_real,2>() );
+  
+  E->setup( matrix_size, matrix_size_os, kernel_width );
+
+  // Notify encoding operator of dcw
+  E->set_dcw(dcw);
+  
+  // Set image dimensions
+  vector<size_t> image_dims = to_std_vector(matrix_size);
+  E->set_domain_dimensions(&image_dims);
+  
+  // Preprocess
+  timer = new GPUTimer("NFFT preprocessing");
+  E->preprocess( traj.get() );
+  delete timer;
+
+  // Setup conjugate gradient solver
+  cuCgSolver< _complext> cg;
+  cg.set_max_iterations( num_iterations );
+  cg.set_tc_tolerance( 1e-6 );
+  cg.set_output_mode( cuCgSolver<_complext>::OUTPUT_VERBOSE );
+  cg.set_encoding_operator( E); 
+
+  // Solve
+  boost::shared_ptr< cuNDArray<_complext> > cgresult;
+  {
+    GPUTimer timer("GPU Conjugate Gradient solve");
+    cgresult = cg.solve(&samples);
+  }
+  
+  //
+  // Output result
+  //
+  
+  timer = new GPUTimer("Output result to disk");
+
+  boost::shared_ptr< hoNDArray<_complext> > host_image = cgresult->to_host();
+  write_nd_array<_complext>( host_image.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(cgresult.get())->to_host();
+  write_nd_array<_real>( host_norm.get(), "result.real" );
+
+  delete timer;
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/mri/nfft/2d/main_nfft.cpp b/apps/standalone/gpu/mri/nfft/2d/main_nfft.cpp
new file mode 100644
index 0000000..8f29a26
--- /dev/null
+++ b/apps/standalone/gpu/mri/nfft/2d/main_nfft.cpp
@@ -0,0 +1,142 @@
+/*
+
+  Sample application of the NFFT toolbox: standalone "inverse gridding" example.
+
+  -----------
+
+  The nfft is written generically and templatized to
+
+  - transform arbitrary trajectories
+  - transform an "arbitrary" number of dimensions (currently instantiated for 1d/2d/3d/4d)
+  - support both single and double precision
+
+  General principles of the implementation can be found in:
+
+  Accelerating the Non-equispaced Fast Fourier Transform on Commodity Graphics Hardware.
+  T.S. Sørensen, T. Schaeffter, K.Ø. Noe, M.S. Hansen. 
+  IEEE Transactions on Medical Imaging 2008; 27(4):538-547.
+
+  Real-time Reconstruction of Sensitivity Encoded Radial Magnetic Resonance Imaging Using a Graphics Processing Unit.
+  T.S. Sørensen, D. Atkinson, T. Schaeffter, M.S. Hansen.
+  IEEE Transactions on Medical Imaging 2009; 28(12):1974-1985. 
+
+  This example programme of the nfft utilizes golden-ratio-based radial trajectories 
+  and, from a single precision input image, outputs ndarrays of the corresponding samples, trajectory, and density compensation weights.
+
+*/
+
+#include "cuNFFT.h"
+#include "radial_utilities.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_elemwise.h"
+#include "GPUTimer.h"
+#include "parameterparser.h"
+#include "complext.h"
+
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+typedef cuNFFT_plan<_real,2> plan_type;
+
+int main( int argc, char** argv) 
+{
+
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Input image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name (.cplx)", true, "samples.cplx" );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true );
+  parms.add_parameter( 'p', COMMAND_LINE_INT,    1, "Number of profiles", true );
+  parms.add_parameter( 's', COMMAND_LINE_INT,    1, "Samples per profile", true );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
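+
+  // Example invocation (editor's note; file names and parameter values are hypothetical):
+  //   nfft_2d_radial -d image.real -o 256 -p 128 -s 256 -r samples.cplx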
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running reconstruction with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  GPUTimer *timer;
+  
+  // Load image from disk
+  timer = new GPUTimer("Loading image from disk");
+  boost::shared_ptr< hoNDArray<_real> > host_image = read_nd_array<_real>((char*)parms.get_parameter('d')->get_string_value());
+  delete timer;
+   
+  if( !(host_image->get_number_of_dimensions() == 2) ){
+    cout << endl << "Input image is not two-dimensional. Quitting.\n" << endl;
+    return 1;
+  }
+  
+  // Configuration from the command line
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  unsigned int num_profiles = parms.get_parameter('p')->get_int_value();
+  unsigned int samples_per_profile = parms.get_parameter('s')->get_int_value();  
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+
+  uint64d2 matrix_size = from_std_vector<size_t,2>(*(host_image->get_dimensions().get()));
+  _real alpha = (_real)matrix_size_os.vec[0]/(_real)matrix_size.vec[0];
+
+  if( matrix_size.vec[0] != matrix_size.vec[1] ){
+    cout << endl << "For this samples application we only allow square input images. "
+	 << endl << "The only reason being that only one oversampled matrix size is specified and the oversampling ratio must be consistent." << endl;
+  }
+    
+  // Upload host image to device, normalize, and convert to complex type
+  timer = new GPUTimer("Uploading, normalizing and converting to complex");
+  cuNDArray<_real> _image(host_image.get());
+  normalize( &_image, 1.0f );
+  boost::shared_ptr< cuNDArray<_complext> > image = real_to_complex<_complext>( &_image );
+  delete timer;
+  
+  // Setup resulting samples array
+  vector<size_t> samples_dims; samples_dims.push_back( samples_per_profile ); samples_dims.push_back( num_profiles );
+  cuNDArray<_complext> samples(&samples_dims);
+  
+  // Initialize plan
+  timer = new GPUTimer("Initializing plan");
+  plan_type plan( matrix_size, matrix_size_os, kernel_width );
+  delete timer;
+
+  // Compute trajectories
+  timer = new GPUTimer("Computing golden ratio radial trajectories");
+  boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>( samples_per_profile, num_profiles,  1 );
+  delete timer;
+  
+  // Preprocess
+  timer = new GPUTimer("NFFT preprocessing");
+  plan.preprocess( traj.get(), plan_type::NFFT_PREP_C2NC );
+  delete timer;
+
+  // Gridder
+  timer = new GPUTimer("Computing nfft");
+  plan.compute( image.get(), &samples, 0, plan_type::NFFT_FORWARDS_C2NC );
+  delete timer;
+
+  //
+  // Output result
+  //
+  
+  timer = new GPUTimer("Output result to disk");
+  boost::shared_ptr< hoNDArray<_complext> > host_samples = samples.to_host();
+  write_nd_array<_complext>( host_samples.get(), (char*)parms.get_parameter('r')->get_string_value());
+  delete timer;
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/mri/nfft/2d/main_nffth.cpp b/apps/standalone/gpu/mri/nfft/2d/main_nffth.cpp
new file mode 100644
index 0000000..611db0d
--- /dev/null
+++ b/apps/standalone/gpu/mri/nfft/2d/main_nffth.cpp
@@ -0,0 +1,145 @@
+/*
+
+  Sample application of the NFFT toolbox: standalone "gridding" example.
+
+  -----------
+
+  The nfft is written generically and templatized to
+  - transform arbitrary trajectories
+  - transform an arbitrary number of dimensions (currently instantiated for 1d/2d/3d/4d)
+  - support both single and double precision
+
+  General principles of the implementation can be found in:
+
+  Accelerating the Non-equispaced Fast Fourier Transform on Commodity Graphics Hardware.
+  T.S. Sørensen, T. Schaeffter, K.Ø. Noe, M.S. Hansen. 
+  IEEE Transactions on Medical Imaging 2008; 27(4):538-547.
+
+  Real-time Reconstruction of Sensitivity Encoded Radial Magnetic Resonance Imaging Using a Graphics Processing Unit.
+  T.S. Sørensen, D. Atkinson, T. Schaeffter, M.S. Hansen.
+  IEEE Transactions on Medical Imaging 2009; 28(12):1974-1985. 
+
+  This example programme of the nfft utilizes golden-ratio-based radial trajectories 
+  and outputs a gridded image from an input ndarray of radial samples; the corresponding trajectory and density compensation weights are computed internally.
+
+*/
+
+#include "cuNFFT.h"
+#include "radial_utilities.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_elemwise.h"
+#include "GPUTimer.h"
+#include "parameterparser.h"
+#include "complext.h"
+
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+typedef cuNFFT_plan<_real,2> plan_type;
+
+int main( int argc, char** argv) 
+{
+
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Input samples file name (.cplx)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Output image file name (.cplx)", true, "result.cplx" );
+  parms.add_parameter( 'm', COMMAND_LINE_INT,    1, "Matrix size", true );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running reconstruction with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  GPUTimer *timer;
+  
+  // Load sample data from disk
+  timer = new GPUTimer("Loading samples from disk");
+  boost::shared_ptr< hoNDArray<_complext> > host_samples = read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+  delete timer;
+   
+  if( !(host_samples->get_number_of_dimensions() == 2) ){
+    cout << endl << "Samples ndarray is not two-dimensional (samples/profile x #profiles). Quitting.\n" << endl;
+    return 1;
+  }
+  
+  // Configuration from the command line
+  uint64d2 matrix_size = uint64d2(parms.get_parameter('m')->get_int_value(), parms.get_parameter('m')->get_int_value());
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+
+  unsigned int num_profiles = host_samples->get_size(1);
+  unsigned int samples_per_profile = host_samples->get_size(0);  
+  _real alpha = (_real)matrix_size_os.vec[0]/(_real)matrix_size.vec[0];
+
+  // Upload host data to device
+  timer = new GPUTimer("Uploading samples to device");
+  cuNDArray<_complext> samples(host_samples.get());
+  delete timer;
+  
+  // Setup resulting image array
+  vector<size_t> image_dims = to_std_vector(matrix_size);
+  cuNDArray<_complext> image(&image_dims);
+  
+  // Initialize plan
+  timer = new GPUTimer("Initializing plan");
+  plan_type plan( matrix_size, matrix_size_os, kernel_width );
+  delete timer;
+
+  // Compute trajectories
+  timer = new GPUTimer("Computing golden ratio radial trajectories");
+  boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>( samples_per_profile, num_profiles,  1 );
+  delete timer;
+  
+  // Preprocess
+  timer = new GPUTimer("NFFT preprocessing");
+  plan.preprocess( traj.get(), plan_type::NFFT_PREP_NC2C );
+  delete timer;
+
+  // Compute density compensation weights
+  timer = new GPUTimer("Computing density compensation weights");
+  boost::shared_ptr< cuNDArray<_real> > dcw = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, num_profiles, alpha, _real(1)/((_real)samples_per_profile/(_real)matrix_size.vec[0]) );
+  delete timer;
+
+  // Gridder
+  timer = new GPUTimer("Computing adjoint nfft (gridding)");
+  plan.compute( &samples, &image, dcw.get(), plan_type::NFFT_BACKWARDS_NC2C );
+  delete timer;
+
+  //
+  // Output result
+  //
+  
+  timer = new GPUTimer("Output result to disk");
+
+  boost::shared_ptr< hoNDArray<_complext> > host_image = image.to_host();
+  write_nd_array<_complext>( host_image.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(&image)->to_host();
+  write_nd_array<_real>( host_norm.get(), "result.real" );
+
+  delete timer;
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/mri/nfft/2d/main_sb.cpp b/apps/standalone/gpu/mri/nfft/2d/main_sb.cpp
new file mode 100644
index 0000000..cab25fc
--- /dev/null
+++ b/apps/standalone/gpu/mri/nfft/2d/main_sb.cpp
@@ -0,0 +1,175 @@
+/*
+  
+  Sample application of the NFFT toolbox: using the NFFT matrix operator in a Split Bregman solver
+  
+*/
+
+#include "cuNFFT.h"
+#include "radial_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDArray.h"
+#include "parameterparser.h"
+#include "cuNFFTOperator.h"
+#include "cuSbcCgSolver.h"
+#include "vector_td_utilities.h"
+#include "cuPartialDerivativeOperator.h"
+#include "GPUTimer.h"
+
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+
+int main( int argc, char** argv) 
+{
+
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Input samples file name (.cplx)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Output image file name (.cplx)", true, "result.cplx" );
+  parms.add_parameter( 'm', COMMAND_LINE_INT,    1, "Matrix size", true );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of inner iterations", true, "10" );
+  parms.add_parameter( 'I', COMMAND_LINE_INT,    1, "Number of outer iterations", true, "10" );
+  parms.add_parameter( 'l', COMMAND_LINE_FLOAT,  1, "Regularization weight (lambda)", true, "1.0" );
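+
+  // Example invocation (editor's note; file names and parameter values are hypothetical):
+  //   nffth_sb_2d_radial -d samples.cplx -m 192 -o 256 -i 10 -I 10 -l 2.0 -r result.cplx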
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running reconstruction with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  GPUTimer *timer;
+  
+  // Load sample data from disk
+  timer = new GPUTimer("Loading samples from disk");
+  boost::shared_ptr< hoNDArray<_complext> > host_samples = read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+  delete timer;
+   
+  if( !(host_samples->get_number_of_dimensions() == 2) ){
+    cout << endl << "Samples ndarray is not two-dimensional (samples/profile x #profiles). Quitting.\n" << endl;
+    return 1;
+  }
+  
+  // Configuration from the command line
+  uint64d2 matrix_size = uint64d2(parms.get_parameter('m')->get_int_value(), parms.get_parameter('m')->get_int_value());
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+  unsigned int num_cg_iterations = parms.get_parameter('i')->get_int_value();
+  unsigned int num_sb_iterations = parms.get_parameter('I')->get_int_value();
+
+  unsigned int num_profiles = host_samples->get_size(1);
+  unsigned int samples_per_profile = host_samples->get_size(0);  
+  _real alpha = (_real)matrix_size_os.vec[0]/(_real)matrix_size.vec[0];
+  _real lambda = (_real)parms.get_parameter('l')->get_float_value();
+  
+  // Upload host data to device
+  timer = new GPUTimer("Uploading samples to device");
+  cuNDArray<_complext> samples(host_samples.get());
+  delete timer;
+
+  // Collect the sample (codomain) dimensions as a one-dimensional array (we have no batch dimension)
+  std::vector<size_t> sample_dims;
+  sample_dims.push_back(samples.get_number_of_elements());
+  
+  // Compute trajectories
+  timer = new GPUTimer("Computing golden ratio radial trajectories");
+  boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>( samples_per_profile, num_profiles,  1 );
+  delete timer;
+
+  // Compute density compensation weights
+  timer = new GPUTimer("Computing density compensation weights");
+  boost::shared_ptr< cuNDArray<_real> > dcw = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, num_profiles, alpha, _real(1)/((_real)samples_per_profile/(_real)matrix_size.vec[0]) );
+  sqrt_inplace(dcw.get());
+
+  samples *= *dcw;
+
+  delete timer;
+
+
+  // Define and setup NFFT encoding operator
+  boost::shared_ptr< cuNFFTOperator<_real,2> > E( new cuNFFTOperator<_real,2>() );
+  E->set_weight(lambda);
+
+  E->setup( matrix_size, matrix_size_os, kernel_width );
+
+  // Notify encoding operator of dcw
+  E->set_dcw(dcw);
+  
+  // Set image dimensions
+  vector<size_t> image_dims = to_std_vector(matrix_size);
+  E->set_domain_dimensions(&image_dims);
+  E->set_codomain_dimensions(&sample_dims);
+
+  // Setup regularization operators
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,2> >
+    Rx( new cuPartialDerivativeOperator<_complext,2>(0) );
+  Rx->set_weight( lambda );
+  Rx->set_domain_dimensions(&image_dims);
+  Rx->set_codomain_dimensions(&image_dims);
+
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,2> >
+    Ry( new cuPartialDerivativeOperator<_complext,2>(1) );
+  Ry->set_weight( lambda );
+  Ry->set_domain_dimensions(&image_dims);
+  Ry->set_codomain_dimensions(&image_dims);
+  
+  // Preprocess
+  timer = new GPUTimer("NFFT preprocessing");
+  E->preprocess( traj.get() );
+  delete timer;
+
+  // Setup split bregman solver
+  cuSbcCgSolver<_complext> sb;
+  sb.set_max_outer_iterations( num_sb_iterations );
+  sb.set_max_inner_iterations( 1 );
+  sb.set_output_mode( cuCgSolver< _complext>::OUTPUT_VERBOSE );
+
+  sb.set_encoding_operator( E); 
+  sb.add_regularization_group_operator( Rx ); 
+  sb.add_regularization_group_operator( Ry ); 
+  sb.add_group();
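+  // Editor's note: grouping the x- and y-derivative operators lets the Split Bregman solver
+  // shrink their outputs jointly, which corresponds to an isotropic (total-variation-like)
+  // penalty on the image gradient.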
+
+  // Setup inner conjugate gradient solver
+  sb.get_inner_solver()->set_output_mode( cuCgSolver<_complext>::OUTPUT_WARNINGS );
+  sb.get_inner_solver()->set_max_iterations( num_cg_iterations );
+  
+  // Solve
+  boost::shared_ptr< cuNDArray<_complext> > cgresult;
+  {
+    GPUTimer timer("GPU Conjugate Gradient solve");
+    cgresult = sb.solve(&samples);
+  }
+  
+  //
+  // Output result
+  //
+  
+  timer = new GPUTimer("Output result to disk");
+
+  boost::shared_ptr< hoNDArray<_complext> > host_image = cgresult->to_host();
+  write_nd_array<_complext>( host_image.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(cgresult.get())->to_host();
+  write_nd_array<_real>( host_norm.get(), "result.real" );
+
+  delete timer;
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/mri/nfft/CMakeLists.txt b/apps/standalone/gpu/mri/nfft/CMakeLists.txt
new file mode 100644
index 0000000..68dd4c7
--- /dev/null
+++ b/apps/standalone/gpu/mri/nfft/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(2d)
+add_subdirectory(ms2d)
diff --git a/apps/standalone/gpu/mri/nfft/ms2d/CMakeLists.txt b/apps/standalone/gpu/mri/nfft/ms2d/CMakeLists.txt
new file mode 100644
index 0000000..a2600e8
--- /dev/null
+++ b/apps/standalone/gpu/mri/nfft/ms2d/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_executable(nfft_2d_ms_radial nfft_main.cpp)
+add_executable(nffth_2d_ms_radial nffth_main.cpp)
+add_executable(nffth_2d_ms_generic nffth_generic.cpp)
+
+target_link_libraries(nfft_2d_ms_radial gadgetron_toolbox_gpucore gadgetron_toolbox_gpunfft gadgetron_toolbox_hostutils ${CUDA_LIBRARIES})
+target_link_libraries(nffth_2d_ms_radial gadgetron_toolbox_gpucore gadgetron_toolbox_gpunfft gadgetron_toolbox_hostutils ${CUDA_LIBRARIES})
+target_link_libraries(nffth_2d_ms_generic gadgetron_toolbox_gpucore gadgetron_toolbox_gpunfft gadgetron_toolbox_hostutils ${CUDA_LIBRARIES})
+
+install(TARGETS nfft_2d_ms_radial nffth_2d_ms_radial nffth_2d_ms_generic DESTINATION bin COMPONENT main)
diff --git a/apps/standalone/gpu/mri/nfft/ms2d/nfft_main.cpp b/apps/standalone/gpu/mri/nfft/ms2d/nfft_main.cpp
new file mode 100644
index 0000000..a5660cb
--- /dev/null
+++ b/apps/standalone/gpu/mri/nfft/ms2d/nfft_main.cpp
@@ -0,0 +1,148 @@
+/*
+  Sample application of the NFFT toolbox: standalone "inverse gridding" example.
+
+  -----------
+
+  The nfft is written generically and templatized to
+
+  - transform arbitrary trajectories
+  - transform an arbitrary number of dimensions (currently instantiated for 1d/2d/3d/4d)
+  - support both single and double precision
+
+  General principles of the implementation can be found in:
+
+  Accelerating the Non-equispaced Fast Fourier Transform on Commodity Graphics Hardware.
+  T.S. Sørensen, T. Schaeffter, K.Ø. Noe, M.S. Hansen. 
+  IEEE Transactions on Medical Imaging 2008; 27(4):538-547.
+
+  Real-time Reconstruction of Sensitivity Encoded Radial Magnetic Resonance Imaging Using a Graphics Processing Unit.
+  T.S. Sørensen, D. Atkinson, T. Schaeffter, M.S. Hansen.
+  IEEE Transactions on Medical Imaging 2009; 28(12):1974-1985. 
+
+  This example programme of the nfft utilizes golden-ratio-based radial trajectories 
+  and, from a single precision multislice input image, outputs ndarrays of the corresponding samples, trajectory, and density compensation weights.
+
+*/
+
+#include "cuNFFT.h"
+#include "radial_utilities.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_elemwise.h"
+#include "GPUTimer.h"
+#include "parameterparser.h"
+#include "complext.h"
+
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+typedef cuNFFT_plan<_real,2> plan_type;
+
+int main( int argc, char** argv) 
+{
+
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Input image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Output image file name (.cplx)", true, "samples.cplx" );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true );
+  parms.add_parameter( 'p', COMMAND_LINE_INT,    1, "#profiles/frame", true );
+  parms.add_parameter( 's', COMMAND_LINE_INT,    1, "#samples/profile", true );
+  parms.add_parameter( 'f', COMMAND_LINE_INT,    1, "#frames/reconstruction (a negative value means all)", true, "-1" );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running reconstruction with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  GPUTimer *timer;
+  
+  // Load image from disk
+  timer = new GPUTimer("Loading image from disk");
+  boost::shared_ptr< hoNDArray<_real> > host_image = read_nd_array<_real>((char*)parms.get_parameter('d')->get_string_value());
+  delete timer;
+   
+  if( !(host_image->get_number_of_dimensions() == 3) ){
+    cout << endl << "Input image is not three-dimensional (2d multislice). Quitting.\n" << endl;
+    return 1;
+  }
+  
+  // Configuration from the command line
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  unsigned int profiles_per_frame = parms.get_parameter('p')->get_int_value();
+  unsigned int samples_per_profile = parms.get_parameter('s')->get_int_value();  
+  int frames_per_reconstruction = parms.get_parameter('f')->get_int_value();  
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+
+  uint64d2 matrix_size = from_std_vector<size_t,2>(*(host_image->get_dimensions().get()));
+  unsigned int num_frames = host_image->get_size(2);
+  _real alpha = (_real)matrix_size_os.vec[0]/(_real)matrix_size.vec[0];
+
+  if( matrix_size.vec[0] != matrix_size.vec[1] ){
+    cout << endl << "For this samples application we only allow square input images. "
+	 << endl << "The only reason being that only one oversampled matrix size is specified and the oversampling ratio must be consistent." << endl;
+  }
+
+  if( frames_per_reconstruction < 0 ) frames_per_reconstruction = num_frames;
+  if( (unsigned int)frames_per_reconstruction > num_frames ) frames_per_reconstruction = num_frames;
+  
+  // Upload host image to device, normalize, and convert to complex type
+  timer = new GPUTimer("Uploading, normalizing and converting to complex");
+  cuNDArray<_real> _image(host_image.get());
+  normalize( &_image, 1.0f );
+  boost::shared_ptr< cuNDArray<_complext> > image = real_to_complex<_complext>( &_image );
+  delete timer;
+  
+  // Setup resulting samples array
+  vector<size_t> samples_dims; 
+  samples_dims.push_back( samples_per_profile ); samples_dims.push_back( profiles_per_frame ); samples_dims.push_back(frames_per_reconstruction);
+  cuNDArray<_complext> samples(&samples_dims);
+  
+  // Initialize plan
+  timer = new GPUTimer("Initializing plan");
+  plan_type plan( matrix_size, matrix_size_os, kernel_width );
+  delete timer;
+
+  // Compute trajectories
+  timer = new GPUTimer("Computing golden ratio radial trajectories");
+  boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>( samples_per_profile, profiles_per_frame, frames_per_reconstruction );
+  delete timer;
+  
+  // Preprocess
+  timer = new GPUTimer("NFFT preprocessing");
+  plan.preprocess( traj.get(), plan_type::NFFT_PREP_C2NC );
+  delete timer;
+
+  // Gridder
+  timer = new GPUTimer("Computing nfft");
+  plan.compute( image.get(), &samples, 0x0, plan_type::NFFT_FORWARDS_C2NC );
+  delete timer;
+
+  //
+  // Output result
+  //
+  
+  timer = new GPUTimer("Output result to disk");
+  boost::shared_ptr< hoNDArray<_complext> > host_samples = samples.to_host();
+  write_nd_array<_complext>( host_samples.get(), (char*)parms.get_parameter('r')->get_string_value() );
+  delete timer;
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/mri/nfft/ms2d/nffth_generic.cpp b/apps/standalone/gpu/mri/nfft/ms2d/nffth_generic.cpp
new file mode 100644
index 0000000..bb4080f
--- /dev/null
+++ b/apps/standalone/gpu/mri/nfft/ms2d/nffth_generic.cpp
@@ -0,0 +1,161 @@
+/*
+
+  Sample application of the NFFT toolbox: standalone "gridding" example.
+
+  -----------
+
+  The nfft is written generically and templatized to
+  - transform arbitrary trajectories
+  - transform an arbitrary number of dimensions (currently instantiated for 1d/2d/3d/4d)
+  - support both single and double precision
+
+  General principles of the implementation can be found in:
+
+  Accelerating the Non-equispaced Fast Fourier Transform on Commodity Graphics Hardware.
+  T.S. Sørensen, T. Schaeffter, K.Ø. Noe, M.S. Hansen. 
+  IEEE Transactions on Medical Imaging 2008; 27(4):538-547.
+
+  Real-time Reconstruction of Sensitivity Encoded Radial Magnetic Resonance Imaging Using a Graphics Processing Unit.
+  T.S. Sørensen, D. Atkinson, T. Schaeffter, M.S. Hansen.
+  IEEE Transactions on Medical Imaging 2009; 28(12):1974-1985. 
+
+  This example programme of the nfft works with arbitrary (generic) trajectories 
+  and outputs gridded images from 2D multislice input ndarrays of the corresponding samples, trajectory, and density compensation weights.
+
+*/
+
+#include "cuNFFT.h"
+#include "radial_utilities.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_elemwise.h"
+#include "GPUTimer.h"
+#include "parameterparser.h"
+#include "complext.h"
+
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+typedef cuNFFT_plan<_real,2> plan_type;
+
+int main( int argc, char** argv) 
+{
+
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Input samples file name (.cplx)", true );
+  parms.add_parameter( 't', COMMAND_LINE_STRING, 1, "Input trajectories file name (.real)", true );
+  parms.add_parameter( 'w', COMMAND_LINE_STRING, 1, "Input density compensation weights file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Output image file name (.cplx)", true, "result.cplx" );
+  parms.add_parameter( 'm', COMMAND_LINE_INT,    1, "Matrix size", true );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true );
+  parms.add_parameter( 'f', COMMAND_LINE_INT,    1, "#frames/reconstruction (a negative value means all)", true, "-1" );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
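+
+  // Example invocation (editor's note; file names and parameter values are hypothetical):
+  //   nffth_2d_ms_generic -d samples.cplx -t trajectory.real -w dcw.real -m 192 -o 256 -r result.cplx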
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running reconstruction with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  GPUTimer *timer;
+  
+  // Load data from disk
+  timer = new GPUTimer("Loading data from disk");
+  boost::shared_ptr< hoNDArray<_complext> > host_samples = read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+  boost::shared_ptr< hoNDArray<_reald2> >   host_traj    = read_nd_array<_reald2>  ((char*)parms.get_parameter('t')->get_string_value());
+  boost::shared_ptr< hoNDArray<_real> >     host_dcw     = read_nd_array<_real>    ((char*)parms.get_parameter('w')->get_string_value());
+  delete timer;
+
+  /* {
+    std::vector<size_t> dims;
+    dims.push_back(host_traj->get_size(0));
+    dims.push_back(host_samples->get_number_of_elements()/dims[0]);
+    host_samples->reshape(&dims);
+    } */
+  
+  if( !(host_samples->get_number_of_dimensions() == 2 && host_traj->get_number_of_dimensions() == 2) ){
+    cout << endl << "Samples/trajectory arrays must be two-dimensional: (dim 0: samples/profile x #profiles/frame; dim 1: #frames). Quitting.\n" << endl;
+    return 1;
+  }
+
+  // Configuration from the command line
+  uint64d2 matrix_size = uint64d2(parms.get_parameter('m')->get_int_value(), parms.get_parameter('m')->get_int_value());
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  int frames_per_reconstruction = parms.get_parameter('f')->get_int_value();  
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+  _real alpha = (_real)matrix_size_os.vec[0]/(_real)matrix_size.vec[0];
+  
+  unsigned int num_frames = host_traj->get_size(1);  
+
+  if( frames_per_reconstruction < 0 ) frames_per_reconstruction = num_frames;
+  if( (unsigned int)frames_per_reconstruction > num_frames ) frames_per_reconstruction = num_frames;
+  
+  // Setup resulting image array
+  vector<size_t> image_dims = to_std_vector(matrix_size); 
+  image_dims.push_back((num_frames/frames_per_reconstruction)*frames_per_reconstruction);
+  cuNDArray<_complext> image(&image_dims);
+  clear(&image);
+  
+  // Initialize plan
+  timer = new GPUTimer("Initializing plan");
+  plan_type plan( matrix_size, matrix_size_os, kernel_width );
+  delete timer;
+
+  // Upload arrays to device
+  cuNDArray<_complext> _samples(host_samples.get());
+  cuNDArray<_reald2> _trajectory(host_traj.get());
+  cuNDArray<_real> dcw(host_dcw.get());
+
+  std::vector<size_t> dims_recon;
+  dims_recon.push_back(host_samples->get_size(0));
+  dims_recon.push_back(frames_per_reconstruction);
+
+  for( unsigned int iteration = 0; iteration < num_frames/frames_per_reconstruction; iteration++ ) {
+    
+    // Set samples/trajectory for sub-frames
+    cuNDArray<_complext> samples( dims_recon, _samples.get_data_ptr()+iteration*dims_recon[0]*dims_recon[1] );
+    cuNDArray<_reald2> trajectory( dims_recon, _trajectory.get_data_ptr()+iteration*dims_recon[0]*dims_recon[1] );
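+    // Editor's note: these cuNDArrays wrap pointer offsets into the full device arrays,
+    // so they act as views of the current chunk of frames rather than copies.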
+
+    // Preprocess
+    timer = new GPUTimer("NFFT preprocessing");
+    plan.preprocess( &trajectory, plan_type::NFFT_PREP_NC2C );
+    delete timer;
+    
+    std::vector<size_t> image_dims = to_std_vector(matrix_size); 
+    image_dims.push_back(frames_per_reconstruction);
+    cuNDArray<_complext> tmp_image(&image_dims, image.get_data_ptr()+iteration*prod(matrix_size)*frames_per_reconstruction);
+
+    // Gridder
+    timer = new GPUTimer("Computing adjoint nfft (gridding)");
+    plan.compute( &samples, &tmp_image, &dcw, plan_type::NFFT_BACKWARDS_NC2C );
+    delete timer;
+  }
+  
+  //
+  // Output result
+  //
+  
+  timer = new GPUTimer("Output result to disk");
+  boost::shared_ptr< hoNDArray<_complext> > host_image = image.to_host();
+  write_nd_array<_complext>( host_image.get(), (char*)parms.get_parameter('r')->get_string_value() );
+  write_nd_array<_real>( abs(&image)->to_host().get(), "result.real" );
+  delete timer;
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/mri/nfft/ms2d/nffth_main.cpp b/apps/standalone/gpu/mri/nfft/ms2d/nffth_main.cpp
new file mode 100644
index 0000000..044d234
--- /dev/null
+++ b/apps/standalone/gpu/mri/nfft/ms2d/nffth_main.cpp
@@ -0,0 +1,173 @@
+/*
+
+  Sample application of the NFFT toolbox: standalone "gridding" example.
+
+  -----------
+
+  The nfft is written generically and templatized to
+  - transform arbitrary trajectories
+  - transform an arbitrary number of dimensions (currently instantiated for 1d/2d/3d/4d)
+  - support both single and double precision
+
+  General principles of the implementation can be found in:
+
+  Accelerating the Non-equispaced Fast Fourier Transform on Commodity Graphics Hardware.
+  T.S. Sørensen, T. Schaeffter, K.Ø. Noe, M.S. Hansen. 
+  IEEE Transactions on Medical Imaging 2008; 27(4):538-547.
+
+  Real-time Reconstruction of Sensitivity Encoded Radial Magnetic Resonance Imaging Using a Graphics Processing Unit.
+  T.S. Sørensen, D. Atkinson, T. Schaeffter, M.S. Hansen.
+  IEEE Transactions on Medical Imaging 2009; 28(12):1974-1985. 
+
+  This example programme of the nfft utilizes golden-ratio-based radial trajectories 
+  and outputs gridded images from a 2D multislice input ndarray of radial samples; the corresponding trajectory and density compensation weights are computed internally.
+
+*/
+
+#include "cuNFFT.h"
+#include "radial_utilities.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_elemwise.h"
+#include "GPUTimer.h"
+#include "parameterparser.h"
+#include "complext.h"
+
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+typedef cuNFFT_plan<_real,2> plan_type;
+
+// Upload samples for one reconstruction from host to device
+boost::shared_ptr< cuNDArray<_complext> > 
+upload_data( unsigned int reconstruction, unsigned int samples_per_reconstruction,
+	     hoNDArray<_complext> *host_data )
+{
+  vector<size_t> dims; dims.push_back(samples_per_reconstruction);
+  cuNDArray<_complext> *data = new cuNDArray<_complext>( &dims );
+  cudaMemcpy( data->get_data_ptr(), 
+	      host_data->get_data_ptr()+reconstruction*samples_per_reconstruction, 
+	      samples_per_reconstruction*sizeof(_complext), cudaMemcpyHostToDevice );
+  
+  return boost::shared_ptr< cuNDArray<_complext> >(data);
+}
+
+int main( int argc, char** argv) 
+{
+
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Input samples file name (.cplx)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Output image file name (.cplx)", true, "result.cplx" );
+  parms.add_parameter( 'm', COMMAND_LINE_INT,    1, "Matrix size", true );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true );
+  parms.add_parameter( 'f', COMMAND_LINE_INT,    1, "#frames/reconstruction (a negative value means all)", true, "-1" );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running reconstruction with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  GPUTimer *timer;
+  
+  // Load sample data from disk
+  timer = new GPUTimer("Loading samples from disk");
+  boost::shared_ptr< hoNDArray<_complext> > host_samples = read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+  delete timer;
+   
+  if( !(host_samples->get_number_of_dimensions() == 3) ){
+    cout << endl << "Samples ndarray is not three-dimensional (samples/profile x #profiles/frame x #frames). Quitting.\n" << endl;
+    return 1;
+  }
+  
+  // Configuration from the command line
+  uint64d2 matrix_size = uint64d2(parms.get_parameter('m')->get_int_value(), parms.get_parameter('m')->get_int_value());
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  int frames_per_reconstruction = parms.get_parameter('f')->get_int_value();  
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+  
+  unsigned int samples_per_profile = host_samples->get_size(0);  
+  unsigned int profiles_per_frame = host_samples->get_size(1);
+  unsigned int num_frames = host_samples->get_size(2);  
+  
+  _real alpha = (_real)matrix_size_os.vec[0]/(_real)matrix_size.vec[0];
+
+  // Clamp the number of frames per reconstruction before deriving the per-reconstruction sizes from it
+  if( frames_per_reconstruction < 0 ) frames_per_reconstruction = num_frames;
+  if( (unsigned int)frames_per_reconstruction > num_frames ) frames_per_reconstruction = num_frames;
+
+  unsigned int profiles_per_reconstruction = profiles_per_frame*frames_per_reconstruction;
+  unsigned int samples_per_reconstruction = profiles_per_reconstruction*samples_per_profile;
+  
+  // Setup resulting image array
+  vector<size_t> image_dims = to_std_vector(matrix_size); 
+  image_dims.push_back((num_frames/frames_per_reconstruction)*frames_per_reconstruction);
+  cuNDArray<_complext> image(&image_dims);
+  clear(&image);
+  
+  // Initialize plan
+  timer = new GPUTimer("Initializing plan");
+  plan_type plan( matrix_size, matrix_size_os, kernel_width );
+  delete timer;
+
+  // Compute density compensation weights
+  timer = new GPUTimer("Computing density compensation weights");
+  boost::shared_ptr< cuNDArray<_real> > dcw = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, profiles_per_frame, alpha, _real(1)/((_real)samples_per_profile/(_real)matrix_size.vec[0]) );
+  delete timer;
+
+  for( unsigned int iteration = 0; iteration < num_frames/frames_per_reconstruction; iteration++ ) {
+    
+    // Compute trajectories
+    timer = new GPUTimer("Computing golden ratio radial trajectories");
+    boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+      ( samples_per_profile, profiles_per_frame, frames_per_reconstruction, iteration*profiles_per_reconstruction );
+    delete timer;
+    
+    // Preprocess
+    timer = new GPUTimer("NFFT preprocessing");
+    plan.preprocess( traj.get(), plan_type::NFFT_PREP_NC2C );
+    delete timer;
+    
+    // Upload data
+    timer = new GPUTimer("Upload data");
+    boost::shared_ptr< cuNDArray<_complext> > data = upload_data
+      ( iteration, samples_per_reconstruction, host_samples.get() );
+    
+    vector<size_t> image_dims = to_std_vector(matrix_size); 
+    image_dims.push_back(frames_per_reconstruction);
+    cuNDArray<_complext> tmp_image(&image_dims, image.get_data_ptr()+iteration*prod(matrix_size)*frames_per_reconstruction);
+
+    // Gridder
+    timer = new GPUTimer("Computing adjoint nfft (gridding)");
+    plan.compute( data.get(), &tmp_image, dcw.get(), plan_type::NFFT_BACKWARDS_NC2C );
+    delete timer;
+  }
+  
+  //
+  // Output result
+  //
+  
+  timer = new GPUTimer("Output result to disk");
+  boost::shared_ptr< hoNDArray<_complext> > host_image = image.to_host();
+  write_nd_array<_complext>( host_image.get(), (char*)parms.get_parameter('r')->get_string_value() );
+  write_nd_array<_real>( abs(&image)->to_host().get(), "result.real" );
+  delete timer;
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/mri/sense/CMakeLists.txt b/apps/standalone/gpu/mri/sense/CMakeLists.txt
new file mode 100644
index 0000000..11e472d
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/CMakeLists.txt
@@ -0,0 +1,2 @@
+#add_subdirectory(cartesian)
+add_subdirectory(noncartesian)
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/CMakeLists.txt b/apps/standalone/gpu/mri/sense/noncartesian/CMakeLists.txt
new file mode 100644
index 0000000..c319e2b
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/CMakeLists.txt
@@ -0,0 +1,11 @@
+include_directories( 
+  ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+  )
+
+add_executable(sense_cg_generic_2d generic_cg.cpp)
+
+target_link_libraries(sense_cg_generic_2d gadgetron_toolbox_gpuoperators gadgetron_toolbox_cpucore gadgetron_toolbox_gpucore gadgetron_toolbox_gpuparallelmri gadgetron_toolbox_gpunfft gadgetron_toolbox_hostutils gadgetron_toolbox_gpusolvers ${CUDA_LIBRARIES})
+
+install(TARGETS sense_cg_generic_2d DESTINATION bin COMPONENT main)
+
+add_subdirectory(radial)
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/generic_cg.cpp b/apps/standalone/gpu/mri/sense/noncartesian/generic_cg.cpp
new file mode 100644
index 0000000..5ca90d0
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/generic_cg.cpp
@@ -0,0 +1,200 @@
+/*
+
+  Sample application of the NFFT toolbox: non-Cartesian SENSE reconstruction using a conjugate gradient solver.
+
+  -----------
+
+  The nfft is written generically and templatized to
+  - transform arbitrary trajectories
+  - transform an arbitrary number of dimensions (currently instantiated for 1d/2d/3d/4d)
+  - support both single and double precision
+
+  General principles of the implementation can be found in:
+
+  Accelerating the Non-equispaced Fast Fourier Transform on Commodity Graphics Hardware.
+  T.S. Sørensen, T. Schaeffter, K.Ø. Noe, M.S. Hansen. 
+  IEEE Transactions on Medical Imaging 2008; 27(4):538-547.
+
+  Real-time Reconstruction of Sensitivity Encoded Radial Magnetic Resonance Imaging Using a Graphics Processing Unit.
+  T.S. Sørensen, D. Atkinson, T. Schaeffter, M.S. Hansen.
+  IEEE Transactions on Medical Imaging 2009; 28(12):1974-1985. 
+
+  This example programme performs a regularized conjugate gradient non-Cartesian SENSE reconstruction 
+  from input ndarrays of the samples, trajectory, density compensation weights, coil sensitivity maps, and a regularization image.
+
+*/
+
+#include "cuNFFT.h"
+#include "radial_utilities.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuCgPreconditioner.h"
+#include "cuImageOperator.h"
+#include "cuCgSolver.h"
+#include "GPUTimer.h"
+#include "parameterparser.h"
+#include "complext.h"
+
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+typedef cuNFFT_plan<_real,2> plan_type;
+
+int main( int argc, char** argv) 
+{
+
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Input samples file name (.cplx)", true );
+  parms.add_parameter( 't', COMMAND_LINE_STRING, 1, "Input trajectories file name (.real)", true );
+  parms.add_parameter( 'w', COMMAND_LINE_STRING, 1, "Input density compensation weights file name (.real)", true );
+  parms.add_parameter( 'c', COMMAND_LINE_STRING, 1, "Input coil sensitivity maps file name (.cplx)", true );
+  parms.add_parameter( 'g', COMMAND_LINE_STRING, 1, "Input regularization image file name (.cplx)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Output image file name (.cplx)", true, "result.cplx" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of iterations", true, "10" );
+  parms.add_parameter( 'l', COMMAND_LINE_FLOAT,  1, "Regularization weight", true, "0.3" );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+  parms.add_parameter( 'a', COMMAND_LINE_FLOAT,  1, "Oversampling factor", true, "2.0" );
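+
+  // Example invocation (editor's note; file names and parameter values are hypothetical):
+  //   sense_cg_generic_2d -d samples.cplx -t trajectory.real -w dcw.real -c csm.cplx -g reg.cplx -i 10 -l 0.3 -r result.cplx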
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running reconstruction with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  GPUTimer *timer;
+  
+  // Load data from disk
+  timer = new GPUTimer("Loading data from disk");
+  boost::shared_ptr< hoNDArray<_complext> > host_samples = read_nd_array<_complext> ((char*)parms.get_parameter('d')->get_string_value());
+  boost::shared_ptr< hoNDArray<_reald2> >   host_traj    = read_nd_array<_reald2>   ((char*)parms.get_parameter('t')->get_string_value());
+  boost::shared_ptr< hoNDArray<_real> >     host_dcw     = read_nd_array<_real>     ((char*)parms.get_parameter('w')->get_string_value());
+  boost::shared_ptr< hoNDArray<_complext> > host_csm     = read_nd_array<_complext> ((char*)parms.get_parameter('c')->get_string_value());
+  boost::shared_ptr< hoNDArray<_complext> > host_reg     = read_nd_array<_complext> ((char*)parms.get_parameter('g')->get_string_value());
+  delete timer;
+   
+  /* {
+    std::vector<size_t> dims;
+    dims.push_back(host_traj->get_size(0));
+    dims.push_back(host_samples->get_number_of_elements()/dims[0]);
+    host_samples->reshape(&dims);
+    } */
+
+  if( !(host_samples->get_number_of_dimensions() == 2 && host_traj->get_number_of_dimensions() == 2) ){
+    cout << endl << "Samples/trajectory arrays must be two-dimensional: (dim 0: samples/profile x #profiles/frame; dim 1: #frames). Quitting.\n" << endl;
+    return 1;
+  }
+
+  if( !(host_csm->get_number_of_dimensions() == 3 )){
+    cout << endl << "Coil sensitivity maps must be three-dimensional. Quitting.\n" << endl;
+    return 1;
+  }
+
+  if( !(host_reg->get_number_of_dimensions() == 2 )){
+    cout << endl << "Regularization image must be two-dimensional. Quitting.\n" << endl;
+    return 1;
+  }
+
+  // Configuration from the command line
+  uint64d2 matrix_size = uint64d2(host_csm->get_size(0), host_csm->get_size(0));
+  size_t _matrix_size_os = size_t((float)matrix_size[0]*parms.get_parameter('a')->get_float_value());
+  uint64d2 matrix_size_os = uint64d2(_matrix_size_os, _matrix_size_os);
+  int num_iterations = parms.get_parameter('i')->get_int_value();
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+  _real alpha = parms.get_parameter('a')->get_float_value();
+  _real kappa = parms.get_parameter('l')->get_float_value();
+  
+  unsigned int num_frames = host_traj->get_size(1);  
+  unsigned int num_coils = host_csm->get_size(2);
+
+  std::vector<size_t> recon_dims = to_std_vector(matrix_size);
+  recon_dims.push_back(num_frames);
+
+  // Upload arrays to device
+  cuNDArray<_complext> samples(host_samples.get());
+  cuNDArray<_reald2> trajectory(host_traj.get());
+  boost::shared_ptr< cuNDArray<_complext> > csm( new cuNDArray<_complext>(host_csm.get()));
+  boost::shared_ptr< cuNDArray<_complext> > reg_image( new cuNDArray<_complext>(host_reg.get()));
+  boost::shared_ptr< cuNDArray<_real> > dcw( new cuNDArray<_real>(host_dcw.get()));
+
+  // Define encoding matrix for non-Cartesian SENSE
+  boost::shared_ptr< cuNonCartesianSenseOperator<_real,2> > E( new cuNonCartesianSenseOperator<_real,2>() );  
+  E->setup( matrix_size, matrix_size_os, kernel_width );
+  E->set_dcw(dcw) ;
+  E->set_csm(csm);
+  E->set_domain_dimensions(&recon_dims);
+  E->set_codomain_dimensions(samples.get_dimensions().get());
+  E->preprocess(&trajectory);
+  
+  // Define regularization operator
+  boost::shared_ptr< cuImageOperator<_complext> > R( new cuImageOperator<_complext>() );
+  R->set_weight( kappa );
+  R->compute( reg_image.get() );
+
+  boost::shared_ptr< cuNDArray<_real> > _precon_weights = sum(abs_square(csm.get()).get(),2);
+  boost::shared_ptr< cuNDArray<_real> > R_diag = R->get();
+  *R_diag *= kappa;
+  *_precon_weights += *R_diag;
+  R_diag.reset();
+  reciprocal_sqrt_inplace(_precon_weights.get());
+  boost::shared_ptr< cuNDArray<_complext> > precon_weights = real_to_complex<_complext>( _precon_weights.get() );
+  _precon_weights.reset();
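+  // Editor's note: the weights computed above are approximately 1/sqrt( sum_c |csm_c|^2 + kappa * R_diag ),
+  // i.e. (roughly) the inverse square root of the diagonal of the regularized normal equations, used below
+  // as a diagonal preconditioner for the conjugate gradient solver.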
+
+  // Define preconditioning matrix
+  boost::shared_ptr< cuCgPreconditioner<_complext> > D( new cuCgPreconditioner<_complext>() );
+  D->set_weights( precon_weights );
+  precon_weights.reset();
+  csm.reset();
+
+  // Setup conjugate gradient solver
+  cuCgSolver<_complext> cg;
+  cg.set_preconditioner ( D );           // preconditioning matrix
+  cg.set_max_iterations( num_iterations );
+  cg.set_tc_tolerance( 1e-6 );
+  cg.set_output_mode( cuCgSolver< _complext>::OUTPUT_VERBOSE );
+  cg.set_encoding_operator( E );        // encoding matrix
+  cg.add_regularization_operator( R );  // regularization matrix
+
+  //
+  // Invoke conjugate gradient solver
+  //
+  
+  boost::shared_ptr< cuNDArray<_complext> > cgresult;
+  {
+    GPUTimer timer("GPU Conjugate Gradient solve");
+    cgresult = cg.solve(&samples);
+  }
+  
+  //
+  // Output result
+  //
+  
+  timer = new GPUTimer("Output result to disk");
+  boost::shared_ptr< hoNDArray<_complext> > host_image = cgresult->to_host();
+  write_nd_array<_complext>( host_image.get(), (char*)parms.get_parameter('r')->get_string_value() );
+  write_nd_array<_real>( abs(cgresult.get())->to_host().get(), "result.real" );
+  delete timer;
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/.gitignore b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/.gitignore
new file mode 100644
index 0000000..7e4edfd
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/.gitignore
@@ -0,0 +1 @@
+radial_sense
\ No newline at end of file
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/CMakeLists.txt b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/CMakeLists.txt
new file mode 100644
index 0000000..0b5b049
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_executable(sense_cg_radial_2d main_cg.cpp)
+add_executable(sense_sbc_radial_2d main_sbc.cpp)
+add_executable(sense_gpbb_radial_2d main_gpbb.cpp)
+add_executable(sense_nlcg_radial_2d main_nlcg.cpp)
+
+target_link_libraries(sense_cg_radial_2d gadgetron_toolbox_gpuoperators gadgetron_toolbox_cpucore gadgetron_toolbox_gpucore gadgetron_toolbox_gpuparallelmri gadgetron_toolbox_gpunfft gadgetron_toolbox_hostutils gadgetron_toolbox_gpusolvers ${CUDA_LIBRARIES})
+target_link_libraries(sense_sbc_radial_2d gadgetron_toolbox_gpuoperators gadgetron_toolbox_cpucore gadgetron_toolbox_gpucore gadgetron_toolbox_gpuparallelmri gadgetron_toolbox_gpunfft gadgetron_toolbox_hostutils gadgetron_toolbox_gpusolvers ${CUDA_LIBRARIES})
+target_link_libraries(sense_gpbb_radial_2d gadgetron_toolbox_gpuoperators gadgetron_toolbox_cpucore gadgetron_toolbox_gpucore gadgetron_toolbox_gpuparallelmri gadgetron_toolbox_gpunfft gadgetron_toolbox_hostutils gadgetron_toolbox_gpusolvers ${CUDA_LIBRARIES})
+target_link_libraries(sense_nlcg_radial_2d gadgetron_toolbox_gpuoperators gadgetron_toolbox_cpucore gadgetron_toolbox_gpucore gadgetron_toolbox_gpuparallelmri gadgetron_toolbox_gpunfft gadgetron_toolbox_hostutils gadgetron_toolbox_gpusolvers ${CUDA_LIBRARIES})
+
+install(TARGETS sense_cg_radial_2d sense_sbc_radial_2d sense_gpbb_radial_2d sense_nlcg_radial_2d DESTINATION bin COMPONENT main)
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/main_cg.cpp b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/main_cg.cpp
new file mode 100644
index 0000000..462375a
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/main_cg.cpp
@@ -0,0 +1,291 @@
+// Gadgetron includes
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "hoNDArray_fileio.h"
+#include "vector_td_utilities.h"
+#include "cuImageOperator.h"
+#include "radial_utilities.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuSenseBuffer.h"
+#include "cuCgPreconditioner.h"
+#include "cuCgSolver.h"
+#include "b1_map.h"
+#include "parameterparser.h"
+#include "GPUTimer.h"
+
+// Std includes
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+
+const bool use_atomics = false;
+
+// Upload samples for one reconstruction from host to device
+boost::shared_ptr< cuNDArray<_complext> > 
+upload_data( unsigned int reconstruction, unsigned int samples_per_reconstruction, unsigned int total_samples_per_coil, unsigned int num_coils, hoNDArray<_complext> *host_data )
+{
+  vector<size_t> dims; dims.push_back(samples_per_reconstruction); dims.push_back(num_coils);
+  cuNDArray<_complext> *data = new cuNDArray<_complext>(); data->create( &dims );
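+  // Editor's note: host_data is laid out as [#samples/profile x #profiles x #coils]; for each coil the
+  // samples of the requested reconstruction window are copied into one contiguous device array below.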
+  for( unsigned int i=0; i<num_coils; i++ )
+    cudaMemcpy( data->get_data_ptr()+i*samples_per_reconstruction, 
+		host_data->get_data_ptr()+i*total_samples_per_coil+reconstruction*samples_per_reconstruction, 
+		samples_per_reconstruction*sizeof(_complext), cudaMemcpyHostToDevice );
+
+  return boost::shared_ptr< cuNDArray<_complext> >(data);
+}
+
+int main(int argc, char** argv)
+{
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Sample data file name", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "result.cplx" );
+  parms.add_parameter( 'm', COMMAND_LINE_INT,    1, "Matrix size", true );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true );
+  parms.add_parameter( 'p', COMMAND_LINE_INT,    1, "Profiles per frame", true );
+  parms.add_parameter( 'f', COMMAND_LINE_INT,    1, "Frames per reconstruction (negative meaning all)", true, "-1" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of iterations", true, "10" );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+  parms.add_parameter( 'K', COMMAND_LINE_FLOAT,  1, "Kappa", true, "0.3" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running reconstruction with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  GPUTimer *timer;
+  
+  // Load sample data from disk
+  timer = new GPUTimer("\nLoading data");
+  boost::shared_ptr< hoNDArray<_complext> > host_data = read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+  delete timer;
+   
+  if( !(host_data->get_number_of_dimensions() == 3) ){
+    cout << endl << "Input data is not three-dimensional (#samples/profile x #profiles x #coils). Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Configuration from the host data
+  unsigned int samples_per_profile = host_data->get_size(0);
+  unsigned int num_profiles = host_data->get_size(1);
+  unsigned int num_coils = host_data->get_size(2);
+  
+  // Configuration from the command line
+  uint64d2 matrix_size = uint64d2(parms.get_parameter('m')->get_int_value(), parms.get_parameter('m')->get_int_value());
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+  _real kappa = parms.get_parameter('K')->get_float_value();
+  unsigned int num_iterations = parms.get_parameter('i')->get_int_value();
+  unsigned int profiles_per_frame = parms.get_parameter('p')->get_int_value();
+  unsigned int frames_per_reconstruction = parms.get_parameter('f')->get_int_value();
+
+  // Silent correction of invalid command line parameters (clamp to valid range)
+  if( profiles_per_frame > num_profiles ) profiles_per_frame = num_profiles;
+  if( (int)frames_per_reconstruction < 0 ) frames_per_reconstruction = num_profiles / profiles_per_frame; // cast: 'f' is parsed as a signed int
+  if( frames_per_reconstruction*profiles_per_frame > num_profiles ) frames_per_reconstruction = num_profiles / profiles_per_frame;
+  
+  unsigned int profiles_per_reconstruction = frames_per_reconstruction*profiles_per_frame;
+  unsigned int samples_per_frame = profiles_per_frame*samples_per_profile;
+  unsigned int samples_per_reconstruction = profiles_per_reconstruction*samples_per_profile;
+
+  cout << endl << "#samples/profile: " << samples_per_profile;
+  cout << endl << "#profiles/frame: " << profiles_per_frame;
+  cout << endl << "#profiles: " << num_profiles;
+  cout << endl << "#coils: " << num_coils;
+  cout << endl << "#frames/reconstruction: " << frames_per_reconstruction;
+  cout << endl << "#profiles/reconstruction: " << profiles_per_reconstruction;
+  cout << endl << "#samples/reconstruction: " << samples_per_reconstruction << endl << endl;
+
+  // Set density compensation weights
+  boost::shared_ptr< cuNDArray<_real> > dcw = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, profiles_per_frame, (_real)matrix_size_os[0]/(_real)matrix_size[0], 
+      _real(1)/((_real)samples_per_profile/(_real)max(matrix_size[0],matrix_size[1])) );
+
+  // Define encoding matrix for non-Cartesian SENSE
+  boost::shared_ptr< cuNonCartesianSenseOperator<_real,2,use_atomics> > E
+    ( new cuNonCartesianSenseOperator<_real,2,use_atomics>() );  
+
+  E->setup( matrix_size, matrix_size_os, kernel_width );
+
+
+
+  // Define rhs buffer
+  //
+
+  boost::shared_ptr< cuSenseBuffer<_real,2,use_atomics> > rhs_buffer
+    ( new cuSenseBuffer<_real,2,use_atomics>() );
+
+  rhs_buffer->setup( matrix_size, matrix_size_os, kernel_width, num_coils, 8, 16 );
+  rhs_buffer->set_dcw(dcw);
+
+  // Fill rhs buffer
+  //
+
+  timer = new GPUTimer("Filling rhs buffer");
+    
+  // Go through all the data...
+  for( unsigned int iteration = 0; iteration < num_profiles/profiles_per_frame; iteration++ ) {
+
+    // Define trajectories
+    boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+      ( samples_per_profile, profiles_per_frame, 1, iteration*profiles_per_frame );
+    
+    // Upload data
+    boost::shared_ptr< cuNDArray<_complext> > csm_data = upload_data
+      ( iteration, samples_per_frame, num_profiles*samples_per_profile, num_coils, host_data.get() );
+    
+    // Add frame to rhs buffer
+    rhs_buffer->add_frame_data( csm_data.get(), traj.get() );
+  }
+
+  delete timer;
+
+
+  // Estimate CSM
+  //
+
+  timer = new GPUTimer("Estimating csm");
+
+  boost::shared_ptr< cuNDArray<_complext> > acc_images = rhs_buffer->get_accumulated_coil_images();
+  boost::shared_ptr< cuNDArray<_complext> > csm = estimate_b1_map<_real,2>( acc_images.get() );  
+  E->set_csm(csm);
+
+  delete timer;
+  
+
+  // Define regularization image operator 
+  //
+
+  timer = new GPUTimer("Computing regularization");
+
+  std::vector<size_t> image_dims = to_std_vector(matrix_size);
+  cuNDArray<_complext> reg_image = cuNDArray<_complext>(&image_dims);
+
+  E->mult_csm_conj_sum( acc_images.get(), &reg_image );
+  acc_images.reset();
+
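+  // R is an image-based regularization operator built from the coil-combined,
+  // time-averaged image computed above; kappa sets its weight in the solve below.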
+  boost::shared_ptr< cuImageOperator<_complext> > R( new cuImageOperator<_complext>() );
+  R->set_weight( kappa );
+  R->compute( &reg_image );
+
+  delete timer;
+
+  // Define preconditioning weights
+  //
+
+  timer = new GPUTimer("Computing preconditioning weights");
+
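+  // Diagonal (Jacobi-type) preconditioner: reciprocal square root of the summed
+  // coil sensitivity magnitudes plus the kappa-scaled regularization weights.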
+  boost::shared_ptr< cuNDArray<_real> > _precon_weights = sum(abs_square(csm.get()).get(),2);
+  boost::shared_ptr< cuNDArray<_real> > R_diag = R->get();
+  *R_diag *= kappa;
+  *_precon_weights += *R_diag;
+  R_diag.reset();
+  reciprocal_sqrt_inplace(_precon_weights.get());
+  boost::shared_ptr< cuNDArray<_complext> > precon_weights = real_to_complex<_complext>( _precon_weights.get() );
+  _precon_weights.reset();
+
+  // Define preconditioning matrix
+  boost::shared_ptr< cuCgPreconditioner<_complext> > D( new cuCgPreconditioner<_complext>() );
+  D->set_weights( precon_weights );
+  precon_weights.reset();
+  csm.reset();
+
+  delete timer;
+  
+  //
+  // Setup radial SENSE reconstructions
+  //
+
+  // Notify encoding operator of dcw
+  sqrt_inplace(dcw.get());
+  E->set_dcw(dcw);
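+  // dcw now holds the square root of the density compensation weights: one factor is
+  // handed to the encoding operator via set_dcw, and the matching factor is applied to
+  // the raw data before each solve (see '*data *= *dcw' in the reconstruction loop).
+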
+  // Setup conjugate gradient solver
+  cuCgSolver<_complext> cg;
+  cg.set_preconditioner ( D );  // preconditioning matrix
+  cg.set_max_iterations( num_iterations );
+  cg.set_tc_tolerance( 1e-6 );
+  cg.set_output_mode( cuCgSolver< _complext>::OUTPUT_VERBOSE );
+  cg.set_encoding_operator( E );        // encoding matrix
+  cg.add_regularization_operator( R );  // regularization matrix
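+  // Loosely speaking, cg then minimizes ||E m - d||^2 plus the kappa-weighted image
+  // regularization term over the image series m, via the regularized normal equations.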
+  
+  // Reconstruct all SENSE frames iteratively
+  unsigned int num_reconstructions = num_profiles / profiles_per_reconstruction;
+  
+  // Allocate space for result
+  image_dims.push_back(frames_per_reconstruction*num_reconstructions); 
+  cuNDArray<_complext> result = cuNDArray<_complext>(&image_dims);
+  
+  timer = new GPUTimer("Full SENSE reconstruction.");
+  
+  // Define image dimensions
+  image_dims = to_std_vector(matrix_size); 
+  image_dims.push_back(frames_per_reconstruction);
+  
+  for( unsigned int reconstruction = 0; reconstruction<num_reconstructions; reconstruction++ ){
+
+    // Determine trajectories
+    boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+      ( samples_per_profile, profiles_per_frame, frames_per_reconstruction, reconstruction*profiles_per_reconstruction );
+    
+    // Upload data
+    boost::shared_ptr< cuNDArray<_complext> > data = upload_data
+      ( reconstruction, samples_per_reconstruction, num_profiles*samples_per_profile, num_coils, host_data.get() );
+    
+    // Pass image dimensions to encoding operator
+    E->set_domain_dimensions(&image_dims);
+    E->set_codomain_dimensions(data->get_dimensions().get());
+  
+    // Set current trajectory and trigger NFFT preprocessing
+    E->preprocess(traj.get());
+    
+    *data *= *dcw;
+    //
+    // Invoke conjugate gradient solver
+    //
+
+    boost::shared_ptr< cuNDArray<_complext> > cgresult;
+    {
+      GPUTimer timer("GPU Conjugate Gradient solve");
+      cgresult = cg.solve(data.get());
+    }
+
+    if( !cgresult.get() )
+      return 1;
+
+    // Copy cgresult to overall result
+    cuNDArray<_complext> out(&image_dims, result.get_data_ptr()+reconstruction*prod(matrix_size)*frames_per_reconstruction );    
+    out = *(cgresult.get());
+  }
+  
+  delete timer;
+
+  // All done, write out the result
+
+  timer = new GPUTimer("Writing out result");
+  
+  boost::shared_ptr< hoNDArray<_complext> > host_result = result.to_host();
+  write_nd_array<_complext>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+    
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(&result)->to_host();
+  write_nd_array<_real>( host_norm.get(), "result.real" );
+  
+  delete timer;
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/main_gpbb.cpp b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/main_gpbb.cpp
new file mode 100644
index 0000000..0f2ecc5
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/main_gpbb.cpp
@@ -0,0 +1,286 @@
+// Gadgetron includes
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "radial_utilities.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuSenseBuffer.h"
+#include "cuCgPreconditioner.h"
+#include "cuPartialDerivativeOperator.h"
+#include "cuGpBbSolver.h"
+#include "cuTvOperator.h"
+#include "cuTvPicsOperator.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real;
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+
+// Upload samples for one reconstruction from host to device
+boost::shared_ptr< cuNDArray<_complext> >
+upload_data( unsigned int reconstruction, unsigned int samples_per_reconstruction, unsigned int total_samples_per_coil, unsigned int num_coils, hoNDArray<_complext> *host_data )
+{
+  vector<size_t> dims; dims.push_back(samples_per_reconstruction); dims.push_back(num_coils);
+  cuNDArray<_complext> *data = new cuNDArray<_complext>(); data->create( &dims );
+  for( unsigned int i=0; i<num_coils; i++ )
+    cudaMemcpy( data->get_data_ptr()+i*samples_per_reconstruction,
+		host_data->get_data_ptr()+i*total_samples_per_coil+reconstruction*samples_per_reconstruction,
+		samples_per_reconstruction*sizeof(_complext), cudaMemcpyHostToDevice );
+
+  return boost::shared_ptr< cuNDArray<_complext> >(data);
+}
+
+int main(int argc, char** argv)
+{
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Sample data file name", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "result.cplx" );
+  parms.add_parameter( 'm', COMMAND_LINE_INT,    1, "Matrix size", true );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true );
+  parms.add_parameter( 'p', COMMAND_LINE_INT,    1, "Profiles per frame", true );
+  parms.add_parameter( 'f', COMMAND_LINE_INT,    1, "Frames per reconstruction (negative meaning all)", true, "-1" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of iterations", true, "10" );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+  parms.add_parameter( 'L', COMMAND_LINE_FLOAT,  1, "Lambda", true, "2e-7" );
+  parms.add_parameter( 'A', COMMAND_LINE_FLOAT,  1, "Alpha in [0;1] (for PICS)", true, "0.5" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running reconstruction with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+
+  GPUTimer *timer;
+
+  // Load sample data from disk
+  timer = new GPUTimer("\nLoading data");
+  boost::shared_ptr< hoNDArray<_complext> > host_data = read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+  delete timer;
+
+  if( !(host_data->get_number_of_dimensions() == 3) ){
+    cout << endl << "Input data is not three-dimensional (#samples/profile x #profiles x #coils). Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Configuration from the host data
+  unsigned int samples_per_profile = host_data->get_size(0);
+  unsigned int num_profiles = host_data->get_size(1);
+  unsigned int num_coils = host_data->get_size(2);
+
+  // Configuration from the command line
+  uint64d2 matrix_size = uint64d2(parms.get_parameter('m')->get_int_value(), parms.get_parameter('m')->get_int_value());
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+  unsigned int num_iterations = parms.get_parameter('i')->get_int_value();
+
+  unsigned int profiles_per_frame = parms.get_parameter('p')->get_int_value();
+  unsigned int frames_per_reconstruction = parms.get_parameter('f')->get_int_value();
+
+  _real lambda = (_real) parms.get_parameter('L')->get_float_value();
+  _real alpha = (_real) parms.get_parameter('A')->get_float_value();
+
+  if( alpha>1 ) alpha = 1;
+  if( alpha<0 ) alpha = 0;
+
+  // Silent correction of invalid command line parameters (clamp to valid range)
+  if( profiles_per_frame > num_profiles ) profiles_per_frame = num_profiles;
+  if( (int)frames_per_reconstruction < 0 ) frames_per_reconstruction = num_profiles / profiles_per_frame; // cast: 'f' is parsed as a signed int
+  if( frames_per_reconstruction*profiles_per_frame > num_profiles ) frames_per_reconstruction = num_profiles / profiles_per_frame;
+
+  unsigned int profiles_per_reconstruction = frames_per_reconstruction*profiles_per_frame;
+  unsigned int samples_per_frame = profiles_per_frame*samples_per_profile;
+  unsigned int samples_per_reconstruction = profiles_per_reconstruction*samples_per_profile;
+
+  cout << endl << "#samples/profile: " << samples_per_profile;
+  cout << endl << "#profiles/frame: " << profiles_per_frame;
+  cout << endl << "#profiles: " << num_profiles;
+  cout << endl << "#coils: " << num_coils;
+  cout << endl << "#frames/reconstruction " << frames_per_reconstruction;
+  cout << endl << "#profiles/reconstruction " << profiles_per_reconstruction;
+  cout << endl << "#samples/reconstruction " << samples_per_reconstruction << endl << endl;
+
+  // Density compensation weights are constant throughout all reconstructions
+  boost::shared_ptr< cuNDArray<_real> > dcw = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, profiles_per_frame, (_real)matrix_size_os[0]/(_real)matrix_size[0],
+      _real(1)/((_real)samples_per_profile/(_real)max(matrix_size[0],matrix_size[1])) );
+
+  // Define encoding matrix for non-Cartesian SENSE
+  boost::shared_ptr< cuNonCartesianSenseOperator<_real,2> > E( new cuNonCartesianSenseOperator<_real,2>() );
+  E->setup( matrix_size, matrix_size_os, kernel_width );
+
+
+  // Define rhs buffer
+  //
+
+  boost::shared_ptr< cuSenseBuffer<_real,2> > rhs_buffer( new cuSenseBuffer<_real,2>() );
+
+  rhs_buffer->setup( matrix_size, matrix_size_os, kernel_width, num_coils, 8, 16 );
+  rhs_buffer->set_dcw(dcw);
+
+  //
+  // Compute CSM using accumulation in the rhs buffer
+  //
+
+  timer = new GPUTimer("CSM and regularization estimation");
+
+  // Go through all the data...
+  for( unsigned int iteration = 0; iteration < num_profiles/profiles_per_frame; iteration++ ) {
+
+    // Define trajectories
+    boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+      ( samples_per_profile, profiles_per_frame, 1, iteration*profiles_per_frame );
+
+    // Upload data
+    boost::shared_ptr< cuNDArray<_complext> > csm_data = upload_data
+      ( iteration, samples_per_frame, num_profiles*samples_per_profile, num_coils, host_data.get() );
+
+    // Add frame to rhs buffer
+    rhs_buffer->add_frame_data( csm_data.get(), traj.get() );
+  }
+
+  // Estimate csm
+  boost::shared_ptr< cuNDArray<_complext> > acc_images = rhs_buffer->get_accumulated_coil_images();
+  *acc_images *= rhs_buffer->get_normalization_factor();
+  boost::shared_ptr< cuNDArray<_complext> > csm = estimate_b1_map<_real,2>( acc_images.get() );
+  E->set_csm(csm);
+
+  std::vector<size_t> reg_dims = to_std_vector(matrix_size);
+  cuNDArray<_complext> _reg_image = cuNDArray<_complext>(&reg_dims);
+  E->mult_csm_conj_sum( acc_images.get(), &_reg_image );
+
+  // Duplicate the regularization image to 'frames_per_reconstruction' frames
+  boost::shared_ptr<cuNDArray<_complext> > reg_image = expand( &_reg_image, frames_per_reconstruction );
+
+  acc_images.reset();
+
+  // Define preconditioning weights
+  //
+
+  boost::shared_ptr< cuNDArray<_real> > _precon_weights = sum(abs_square(csm.get()).get(),2);
+  reciprocal_sqrt_inplace(_precon_weights.get());
+  boost::shared_ptr< cuNDArray<_complext> > precon_weights = real_to_complex<_complext>( _precon_weights.get() );
+  _precon_weights.reset();
+
+  // Define preconditioning matrix
+  boost::shared_ptr< cuCgPreconditioner<_complext> > D( new cuCgPreconditioner<_complext>() );
+  D->set_weights( precon_weights );
+  precon_weights.reset();
+  csm.reset();
+
+  boost::shared_ptr< std::vector<size_t> > recon_dims( new std::vector<size_t> );
+  *recon_dims = to_std_vector(matrix_size); recon_dims->push_back(frames_per_reconstruction);
+
+  delete timer;
+
+  //
+  // Setup radial SENSE reconstructions
+  //
+
+  vector<size_t> data_dims;
+  data_dims.push_back(samples_per_reconstruction); data_dims.push_back(num_coils);
+
+  sqrt_inplace(dcw.get());
+  E->set_dcw(dcw);
+  E->set_domain_dimensions(recon_dims.get());
+  E->set_codomain_dimensions(&data_dims);
+
+  // Setup gradient projection Barzilai-Borwein (GPBB) solver
+  cuGpBbSolver<_complext> solver;
+
+  // Add "TV" regularization
+  if( (alpha<1.0f) && (lambda>0.0f)){
+    boost::shared_ptr<cuTvOperator<_complext,3> > TV(new cuTvOperator<_complext,3>);
+    TV->set_weight(lambda*(1.0f-alpha));
+    solver.add_nonlinear_operator(TV);
+  }
+
+  // Add "PICS" regularization
+  boost::shared_ptr<cuTvPicsOperator<_complext,3> > PICS;
+  if( (alpha>0.0f) && (lambda>0.0f)){
+    PICS = boost::shared_ptr<cuTvPicsOperator<_complext,3> >(new cuTvPicsOperator<_complext,3>);
+    PICS->set_weight(lambda*alpha);
+    PICS->set_prior(reg_image);
+    solver.add_nonlinear_operator(PICS);
+  }
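+
+  // With lambda > 0, alpha balances the two penalties above: alpha = 0 gives pure TV,
+  // alpha = 1 pure PICS, and intermediate values enable both.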
+
+  solver.set_encoding_operator( E );
+  solver.set_preconditioner ( D );
+  solver.set_max_iterations( num_iterations );
+  solver.set_output_mode( cuGpBbSolver<_complext>::OUTPUT_VERBOSE );
+//  solver.set_x0( reg_image );
+
+  unsigned int num_reconstructions = num_profiles / profiles_per_reconstruction;
+
+  // Allocate space for result
+  std::vector<size_t> res_dims = to_std_vector(matrix_size);
+  res_dims.push_back(frames_per_reconstruction*num_reconstructions);
+  cuNDArray<_complext> result = cuNDArray<_complext>(&res_dims);
+
+  timer = new GPUTimer("Full SENSE reconstruction with TV regularization.");
+
+  for( unsigned int reconstruction = 0; reconstruction<num_reconstructions; reconstruction++ ){
+
+    // Determine trajectories
+    boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+      ( samples_per_profile, profiles_per_frame, frames_per_reconstruction, reconstruction*profiles_per_reconstruction );
+
+    // Upload data
+    boost::shared_ptr< cuNDArray<_complext> > data = upload_data
+      ( reconstruction, samples_per_reconstruction, num_profiles*samples_per_profile, num_coils, host_data.get() );
+
+    // Set current trajectory and trigger NFFT preprocessing
+    E->preprocess(traj.get());
+
+    //
+    // Invoke the GPBB solver
+    //
+    *data *= *dcw;
+    boost::shared_ptr< cuNDArray<_complext> > solve_result;
+    {
+      GPUTimer timer("GPU GPBB solve");
+      solve_result = solver.solve(data.get());
+    }
+
+    vector<size_t> tmp_dims = to_std_vector(matrix_size); tmp_dims.push_back(frames_per_reconstruction);
+    cuNDArray<_complext> tmp(&tmp_dims, result.get_data_ptr()+reconstruction*prod(matrix_size)*frames_per_reconstruction );
+
+    // Copy solve_result to result (pointed to by tmp)
+    tmp = *solve_result;
+  }
+
+  delete timer;
+
+  // All done, write out the result
+
+  timer = new GPUTimer("Writing out result");
+
+  boost::shared_ptr< hoNDArray<_complext> > host_result = result.to_host();
+  write_nd_array<_complext>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(&result)->to_host();
+  write_nd_array<_real>( host_norm.get(), "result.real" );
+
+  delete timer;
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/main_nlcg.cpp b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/main_nlcg.cpp
new file mode 100644
index 0000000..cc0d9dd
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/main_nlcg.cpp
@@ -0,0 +1,341 @@
+// Gadgetron includes
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "radial_utilities.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuSenseBuffer.h"
+#include "cuCgPreconditioner.h"
+#include "cuPartialDerivativeOperator.h"
+#include "cuNlcgSolver.h"
+#include "cuTvOperator.h"
+#include "cuTvPicsOperator.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real;
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+
+// Upload samples for one reconstruction from host to device
+boost::shared_ptr< cuNDArray<_complext> >
+upload_data( unsigned int reconstruction, unsigned int samples_per_reconstruction, unsigned int total_samples_per_coil, unsigned int num_coils, hoNDArray<_complext> *host_data )
+{
+	vector<size_t> dims; dims.push_back(samples_per_reconstruction); dims.push_back(num_coils);
+	cuNDArray<_complext> *data = new cuNDArray<_complext>(); data->create( &dims );
+	for( unsigned int i=0; i<num_coils; i++ )
+		cudaMemcpy( data->get_data_ptr()+i*samples_per_reconstruction,
+				host_data->get_data_ptr()+i*total_samples_per_coil+reconstruction*samples_per_reconstruction,
+				samples_per_reconstruction*sizeof(_complext), cudaMemcpyHostToDevice );
+
+	return boost::shared_ptr< cuNDArray<_complext> >(data);
+}
+
+int main(int argc, char** argv)
+{
+	//
+	// Parse command line
+	//
+
+	ParameterParser parms;
+	parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Sample data file name", true );
+	parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "result.cplx" );
+	parms.add_parameter( 'm', COMMAND_LINE_INT,    1, "Matrix size", true );
+	parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true );
+	parms.add_parameter( 'p', COMMAND_LINE_INT,    1, "Profiles per frame", true );
+	parms.add_parameter( 'f', COMMAND_LINE_INT,    1, "Frames per reconstruction (negative meaning all)", true, "-1" );
+	parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of iterations", true, "10" );
+	parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+	parms.add_parameter( 'L', COMMAND_LINE_FLOAT,  1, "Lambda", true, "2e-7" );
+	parms.add_parameter( 'A', COMMAND_LINE_FLOAT,  1, "Alpha in [0;1] (for PICS)", true, "0.5" );
+
+	parms.parse_parameter_list(argc, argv);
+	if( parms.all_required_parameters_set() ){
+		cout << " Running reconstruction with the following parameters: " << endl;
+		parms.print_parameter_list();
+	}
+	else{
+		cout << " Some required parameters are missing: " << endl;
+		parms.print_parameter_list();
+		parms.print_usage();
+		return 1;
+	}
+
+	GPUTimer *timer;
+
+	// Load sample data from disk
+	timer = new GPUTimer("\nLoading data");
+	boost::shared_ptr< hoNDArray<_complext> > host_data = read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+	delete timer;
+
+	if( !(host_data->get_number_of_dimensions() == 3) ){
+		cout << endl << "Input data is not three-dimensional (#samples/profile x #profiles x #coils). Quitting!\n" << endl;
+		return 1;
+	}
+
+	// Configuration from the host data
+	unsigned int samples_per_profile = host_data->get_size(0);
+	unsigned int num_profiles = host_data->get_size(1);
+	unsigned int num_coils = host_data->get_size(2);
+
+	// Configuration from the command line
+	uint64d2 matrix_size = uint64d2(parms.get_parameter('m')->get_int_value(), parms.get_parameter('m')->get_int_value());
+	uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+	_real kernel_width = parms.get_parameter('k')->get_float_value();
+	unsigned int num_iterations = parms.get_parameter('i')->get_int_value();
+
+	unsigned int profiles_per_frame = parms.get_parameter('p')->get_int_value();
+	unsigned int frames_per_reconstruction = parms.get_parameter('f')->get_int_value();
+
+	_real lambda = (_real) parms.get_parameter('L')->get_float_value();
+	_real alpha = (_real) parms.get_parameter('A')->get_float_value();
+
+	if( alpha>1 ) alpha = 1;
+	if( alpha<0 ) alpha = 0;
+
+	// Silent correction of invalid command line parameters (clamp to valid range)
+	if( profiles_per_frame > num_profiles ) profiles_per_frame = num_profiles;
+	if( (int)frames_per_reconstruction < 0 ) frames_per_reconstruction = num_profiles / profiles_per_frame; // cast: 'f' is parsed as a signed int
+	if( frames_per_reconstruction*profiles_per_frame > num_profiles ) frames_per_reconstruction = num_profiles / profiles_per_frame;
+
+	unsigned int profiles_per_reconstruction = frames_per_reconstruction*profiles_per_frame;
+	unsigned int samples_per_frame = profiles_per_frame*samples_per_profile;
+	unsigned int samples_per_reconstruction = profiles_per_reconstruction*samples_per_profile;
+
+	cout << endl << "#samples/profile: " << samples_per_profile;
+	cout << endl << "#profiles/frame: " << profiles_per_frame;
+	cout << endl << "#profiles: " << num_profiles;
+	cout << endl << "#coils: " << num_coils;
+	cout << endl << "#frames/reconstruction: " << frames_per_reconstruction;
+	cout << endl << "#profiles/reconstruction: " << profiles_per_reconstruction;
+	cout << endl << "#samples/reconstruction: " << samples_per_reconstruction << endl << endl;
+
+	// Density compensation weights are constant throughout all reconstructions
+	boost::shared_ptr< cuNDArray<_real> > dcw = compute_radial_dcw_golden_ratio_2d
+			( samples_per_profile, profiles_per_frame, (_real)matrix_size_os[0]/(_real)matrix_size[0],
+					_real(1)/((_real)samples_per_profile/(_real)max(matrix_size[0],matrix_size[1])) );
+	// Define rhs buffer
+	//
+
+	boost::shared_ptr< cuSenseBuffer<_real,2> > rhs_buffer( new cuSenseBuffer<_real,2>() );
+
+	rhs_buffer->setup( matrix_size, matrix_size_os, kernel_width, num_coils, 8, 16 );
+	rhs_buffer->set_dcw(dcw);
+
+	//
+	// Compute CSM using accumulation in the rhs buffer
+	//
+
+	timer = new GPUTimer("CSM and regularization estimation");
+
+	// Go through all the data...
+	for( unsigned int iteration = 0; iteration < num_profiles/profiles_per_frame; iteration++ ) {
+
+		// Define trajectories
+		boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+		( samples_per_profile, profiles_per_frame, 1, iteration*profiles_per_frame );
+
+		// Upload data
+		boost::shared_ptr< cuNDArray<_complext> > csm_data = upload_data
+				( iteration, samples_per_frame, num_profiles*samples_per_profile, num_coils, host_data.get() );
+
+		// Add frame to rhs buffer
+		rhs_buffer->add_frame_data( csm_data.get(), traj.get() );
+	}
+
+	// Estimate csm
+	boost::shared_ptr< cuNDArray<_complext> > acc_images = rhs_buffer->get_accumulated_coil_images();
+	*acc_images *= rhs_buffer->get_normalization_factor();
+	boost::shared_ptr< cuNDArray<_complext> > csm = estimate_b1_map<_real,2>( acc_images.get() );
+
+
+	// Define encoding matrix for non-Cartesian SENSE
+	boost::shared_ptr< cuNonCartesianSenseOperator<_real,2> > E( new cuNonCartesianSenseOperator<_real,2>() );
+	E->setup( matrix_size, matrix_size_os, kernel_width );
+
+
+
+	E->set_csm(csm);
+
+	std::vector<size_t> reg_dims = to_std_vector(matrix_size);
+	cuNDArray<_complext> _reg_image = cuNDArray<_complext>(&reg_dims);
+	E->mult_csm_conj_sum( acc_images.get(), &_reg_image );
+
+	// Duplicate the regularization image to 'frames_per_reconstruction' frames
+	boost::shared_ptr<cuNDArray<_complext> > reg_image = expand( &_reg_image, frames_per_reconstruction );
+
+	acc_images.reset();
+
+	// Define preconditioning weights
+	//
+
+	boost::shared_ptr< cuNDArray<_real> > _precon_weights = sum(abs_square(csm.get()).get(),2);
+	reciprocal_sqrt_inplace(_precon_weights.get());
+	boost::shared_ptr< cuNDArray<_complext> > precon_weights = real_to_complex<_complext>( _precon_weights.get() );
+	_precon_weights.reset();
+
+	// Define preconditioning matrix
+	boost::shared_ptr< cuCgPreconditioner<_complext> > D( new cuCgPreconditioner<_complext>() );
+	D->set_weights( precon_weights );
+	//precon_weights.reset();
+	csm.reset();
+
+	boost::shared_ptr< std::vector<size_t> > recon_dims( new std::vector<size_t> );
+	*recon_dims = to_std_vector(matrix_size); recon_dims->push_back(frames_per_reconstruction);
+
+	delete timer;
+
+	//
+	// Setup radial SENSE reconstructions
+	//
+
+	vector<size_t> data_dims;
+	data_dims.push_back(samples_per_reconstruction); data_dims.push_back(num_coils);
+
+	E->set_domain_dimensions(recon_dims.get());
+	E->set_codomain_dimensions(&data_dims);
+
+	// Setup nonlinear conjugate gradient (NLCG) solver
+	cuNlcgSolver<_complext> solver;
+
+	// Define regularization operators
+	// We need "a pair" for PICCS
+	//
+
+	boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> >
+	Rx( new cuPartialDerivativeOperator<_complext,3>(0) );
+	Rx->set_weight( (1.0f-alpha)*lambda );
+	Rx->set_domain_dimensions(recon_dims.get());
+	Rx->set_codomain_dimensions(recon_dims.get());
+
+	boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> >
+	Ry( new cuPartialDerivativeOperator<_complext,3>(1) );
+	Ry->set_weight( (1.0f-alpha)*lambda );
+	Ry->set_domain_dimensions(recon_dims.get());
+	Ry->set_codomain_dimensions(recon_dims.get());
+
+	boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> >
+	Rz( new cuPartialDerivativeOperator<_complext,3>(2) );
+	Rz->set_weight( (1.0f-alpha)*lambda );
+	Rz->set_domain_dimensions(recon_dims.get());
+	Rz->set_codomain_dimensions(recon_dims.get());
+
+	boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> >
+	Rx2( new cuPartialDerivativeOperator<_complext,3>(0) );
+	Rx2->set_weight( alpha*lambda );
+	Rx2->set_domain_dimensions(recon_dims.get());
+	Rx2->set_codomain_dimensions(recon_dims.get());
+
+	boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> >
+	Ry2( new cuPartialDerivativeOperator<_complext,3>(1) );
+	Ry2->set_weight( alpha*lambda );
+	Ry2->set_domain_dimensions(recon_dims.get());
+	Ry2->set_codomain_dimensions(recon_dims.get());
+
+	boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> >
+	Rz2( new cuPartialDerivativeOperator<_complext,3>(2) );
+	Rz2->set_weight( alpha*lambda );
+	Rz2->set_domain_dimensions(recon_dims.get());
+	Rz2->set_codomain_dimensions(recon_dims.get());
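+	// Note: Rx..Rz and Rx2..Rz2 are only referenced by the commented-out
+	// regularization-group formulation below; the active path uses
+	// cuTvOperator / cuTvPicsOperator instead.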
+
+
+	// Add "TV" regularization
+	if( (alpha<1.0f) && (lambda>0.0f)){
+		boost::shared_ptr<cuTvOperator<_complext,3> > TV(new cuTvOperator<_complext,3>);
+		TV->set_weight(lambda*(1.0f-alpha));
+		solver.add_nonlinear_operator(TV);
+		/*solver.add_regularization_group_operator(Rx);
+		solver.add_regularization_group_operator(Ry);
+		solver.add_regularization_group_operator(Rz);
+		solver.add_group(1);*/
+		GDEBUG_STREAM("Total variation in use " << std::endl);
+	}
+
+	// Add "PICS" regularization
+	boost::shared_ptr<cuTvPicsOperator<_complext,3> > PICS;
+	if( (alpha>0.0f) && (lambda>0.0f)){
+		PICS = boost::shared_ptr<cuTvPicsOperator<_complext,3> >(new cuTvPicsOperator<_complext,3>);
+		PICS->set_weight(lambda*alpha);
+		PICS->set_prior(reg_image);
+		solver.add_nonlinear_operator(PICS);
+		/*
+		solver.add_regularization_group_operator(Rx2);
+		solver.add_regularization_group_operator(Ry2);
+		solver.add_regularization_group_operator(Rz2);
+		solver.add_group(reg_image,1);*/
+		GDEBUG_STREAM("PICS in use " << std::endl);
+	}
+
+	sqrt_inplace(dcw.get());
+	E->set_dcw(dcw);
+	solver.set_encoding_operator( E );
+	solver.set_preconditioner ( D );
+	solver.set_max_iterations( num_iterations );
+	solver.set_output_mode( cuNlcgSolver<_complext>::OUTPUT_VERBOSE );
+	//solver.set_x0( reg_image );
+
+	unsigned int num_reconstructions = num_profiles / profiles_per_reconstruction;
+
+	// Allocate space for result
+	std::vector<size_t> res_dims = to_std_vector(matrix_size);
+	res_dims.push_back(frames_per_reconstruction*num_reconstructions);
+	cuNDArray<_complext> result = cuNDArray<_complext>(&res_dims);
+
+	timer = new GPUTimer("Full SENSE reconstruction with TV regularization.");
+
+	for( unsigned int reconstruction = 0; reconstruction<num_reconstructions; reconstruction++ ){
+
+		// Determine trajectories
+		boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+		( samples_per_profile, profiles_per_frame, frames_per_reconstruction, reconstruction*profiles_per_reconstruction );
+
+		// Upload data
+		boost::shared_ptr< cuNDArray<_complext> > data = upload_data
+				( reconstruction, samples_per_reconstruction, num_profiles*samples_per_profile, num_coils, host_data.get() );
+
+		*data *= *dcw;
+		// Set current trajectory and trigger NFFT preprocessing
+		E->preprocess(traj.get());
+
+		//
+		// Invoke the NLCG solver
+		//
+
+		boost::shared_ptr< cuNDArray<_complext> > solve_result;
+		{
+			GPUTimer timer("GPU NLCG solve");
+			solve_result = solver.solve(data.get());
+		}
+
+		vector<size_t> tmp_dims = to_std_vector(matrix_size); tmp_dims.push_back(frames_per_reconstruction);
+		cuNDArray<_complext> tmp(&tmp_dims, result.get_data_ptr()+reconstruction*prod(matrix_size)*frames_per_reconstruction );
+
+		// Copy solve_result to result (pointed to by tmp)
+		tmp = *solve_result;
+	}
+
+	delete timer;
+
+	// All done, write out the result
+
+	timer = new GPUTimer("Writing out result");
+
+	boost::shared_ptr< hoNDArray<_complext> > host_result = result.to_host();
+	write_nd_array<_complext>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+	boost::shared_ptr< hoNDArray<_real> > host_norm = abs(&result)->to_host();
+	write_nd_array<_real>( host_norm.get(), "result.real" );
+
+	delete timer;
+
+	return 0;
+}
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/main_sbc.cpp b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/main_sbc.cpp
new file mode 100644
index 0000000..21fec1f
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/main_sbc.cpp
@@ -0,0 +1,335 @@
+// Gadgetron includes
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "radial_utilities.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuSenseBuffer.h"
+#include "cuCgPreconditioner.h"
+#include "cuPartialDerivativeOperator.h"
+#include "cuCgSolver.h"
+#include "cuSbcCgSolver.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+
+// Upload samples for one reconstruction from host to device
+boost::shared_ptr< cuNDArray<_complext> > 
+upload_data( unsigned int reconstruction, unsigned int samples_per_reconstruction, unsigned int total_samples_per_coil, unsigned int num_coils, hoNDArray<_complext> *host_data )
+{
+  vector<size_t> dims; dims.push_back(samples_per_reconstruction); dims.push_back(num_coils);
+  cuNDArray<_complext> *data = new cuNDArray<_complext>(); data->create( &dims );
+  for( unsigned int i=0; i<num_coils; i++ )
+    cudaMemcpy( data->get_data_ptr()+i*samples_per_reconstruction, 
+		host_data->get_data_ptr()+i*total_samples_per_coil+reconstruction*samples_per_reconstruction, 
+		samples_per_reconstruction*sizeof(_complext), cudaMemcpyHostToDevice );
+
+  return boost::shared_ptr< cuNDArray<_complext> >(data);
+}
+
+int main(int argc, char** argv)
+{
+  //
+  // Parse command line
+  //
+ 
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Sample data file name", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "result.cplx" );
+  parms.add_parameter( 'm', COMMAND_LINE_INT,    1, "Matrix size", true );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true );
+  parms.add_parameter( 'p', COMMAND_LINE_INT,    1, "Profiles per frame", true );
+  parms.add_parameter( 'f', COMMAND_LINE_INT,    1, "Frames per reconstruction (negative meaning all)", true, "-1" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of cg iterations", true, "10" );
+  parms.add_parameter( 'I', COMMAND_LINE_INT,    1, "Number of sb inner iterations", true, "1" );
+  parms.add_parameter( 'O', COMMAND_LINE_INT,    1, "Number of sb outer iterations", true, "10" );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+  parms.add_parameter( 'M', COMMAND_LINE_FLOAT,  1, "Mu", true, "1.0" );
+  parms.add_parameter( 'L', COMMAND_LINE_FLOAT,  1, "Lambda", true, "2.0" );
+  parms.add_parameter( 'A', COMMAND_LINE_FLOAT,  1, "Alpha in [0;1] (for PICCS)", true, "0.5" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running reconstruction with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  GPUTimer *timer;
+  
+  // Load sample data from disk
+  timer = new GPUTimer("\nLoading data");
+  boost::shared_ptr< hoNDArray<_complext> > host_data = read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+  delete timer;
+   
+  if( !(host_data->get_number_of_dimensions() == 3) ){
+    cout << endl << "Input data is not three-dimensional (#samples/profile x #profiles x #coils). Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Configuration from the host data
+  unsigned int samples_per_profile = host_data->get_size(0);
+  unsigned int num_profiles = host_data->get_size(1);
+  unsigned int num_coils = host_data->get_size(2);
+  
+  // Configuration from the command line
+  uint64d2 matrix_size = uint64d2(parms.get_parameter('m')->get_int_value(), parms.get_parameter('m')->get_int_value());
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+  unsigned int num_cg_iterations = parms.get_parameter('i')->get_int_value();
+  unsigned int num_sb_inner_iterations = parms.get_parameter('I')->get_int_value();
+  unsigned int num_sb_outer_iterations = parms.get_parameter('O')->get_int_value();
+  unsigned int profiles_per_frame = parms.get_parameter('p')->get_int_value();
+  unsigned int frames_per_reconstruction = parms.get_parameter('f')->get_int_value();
+
+  _real mu = (_real) parms.get_parameter('M')->get_float_value();
+  _real lambda = (_real) parms.get_parameter('L')->get_float_value();
+  _real alpha = (_real) parms.get_parameter('A')->get_float_value();
+
+  if( alpha>1 ) alpha = 1;
+  if( alpha<0 ) alpha = 0;
+
+  // Silent correction of invalid command line parameters (clamp to valid range)
+  if( profiles_per_frame > num_profiles ) profiles_per_frame = num_profiles;
+  if( (int)frames_per_reconstruction < 0 ) frames_per_reconstruction = num_profiles / profiles_per_frame; // cast: 'f' is parsed as a signed int
+  if( frames_per_reconstruction*profiles_per_frame > num_profiles ) frames_per_reconstruction = num_profiles / profiles_per_frame;
+  
+  unsigned int profiles_per_reconstruction = frames_per_reconstruction*profiles_per_frame;
+  unsigned int samples_per_frame = profiles_per_frame*samples_per_profile;
+  unsigned int samples_per_reconstruction = profiles_per_reconstruction*samples_per_profile;
+
+  cout << endl << "#samples/profile: " << samples_per_profile;
+  cout << endl << "#profiles/frame: " << profiles_per_frame;
+  cout << endl << "#profiles: " << num_profiles;
+  cout << endl << "#coils: " << num_coils;
+  cout << endl << "#frames/reconstruction " << frames_per_reconstruction;
+  cout << endl << "#profiles/reconstruction " << profiles_per_reconstruction;
+  cout << endl << "#samples/reconstruction " << samples_per_reconstruction << endl << endl;
+
+  // Density compensation weights are constant throughout all reconstructions
+  boost::shared_ptr< cuNDArray<_real> > dcw = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, profiles_per_frame, (_real)matrix_size_os[0]/(_real)matrix_size[0], 
+      _real(1)/((_real)samples_per_profile/(_real)max(matrix_size[0],matrix_size[1])) );
+  
+  // Define encoding matrix for non-Cartesian SENSE
+  boost::shared_ptr< cuNonCartesianSenseOperator<_real,2> > E( new cuNonCartesianSenseOperator<_real,2>() );  
+  E->set_weight( mu );
+  E->setup( matrix_size, matrix_size_os, kernel_width );
+
+
+  // Define rhs buffer
+  //
+
+  boost::shared_ptr< cuSenseBuffer<_real,2> > rhs_buffer( new cuSenseBuffer<_real,2>() );
+
+  rhs_buffer->setup( matrix_size, matrix_size_os, kernel_width, num_coils, 8, 16 );
+  rhs_buffer->set_dcw(dcw);
+
+  //
+  // Compute CSM using accumulation in the rhs buffer
+  // 
+ 
+  timer = new GPUTimer("CSM and regularization estimation");
+    
+  // Go through all the data...
+  for( unsigned int iteration = 0; iteration < num_profiles/profiles_per_frame; iteration++ ) {
+
+    // Define trajectories
+    boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+      ( samples_per_profile, profiles_per_frame, 1, iteration*profiles_per_frame );
+    
+    // Upload data
+    boost::shared_ptr< cuNDArray<_complext> > csm_data = upload_data
+      ( iteration, samples_per_frame, num_profiles*samples_per_profile, num_coils, host_data.get() );
+    
+    // Add frame to rhs buffer
+    rhs_buffer->add_frame_data( csm_data.get(), traj.get() );
+  }
+    
+  // Estimate csm
+  boost::shared_ptr< cuNDArray<_complext> > acc_images = rhs_buffer->get_accumulated_coil_images();
+  *acc_images *= rhs_buffer->get_normalization_factor();
+  boost::shared_ptr< cuNDArray<_complext> > csm = estimate_b1_map<_real,2>( acc_images.get() );
+  E->set_csm(csm);
+
+  std::vector<size_t> reg_dims = to_std_vector(matrix_size);
+  cuNDArray<_complext> _reg_image = cuNDArray<_complext>(&reg_dims);
+  E->mult_csm_conj_sum( acc_images.get(), &_reg_image );
+
+  // Duplicate the regularization image to 'frames_per_reconstruction' frames
+  boost::shared_ptr<cuNDArray<_complext> > reg_image = expand( &_reg_image, frames_per_reconstruction );
+
+  acc_images.reset();
+
+  // Define preconditioning weights
+  //
+
+  boost::shared_ptr< cuNDArray<_real> > _precon_weights = sum(abs_square(csm.get()).get(),2);
+  reciprocal_sqrt_inplace(_precon_weights.get());
+  boost::shared_ptr< cuNDArray<_complext> > precon_weights = real_to_complex<_complext>( _precon_weights.get() );
+  _precon_weights.reset();
+
+  // Define preconditioning matrix
+  boost::shared_ptr< cuCgPreconditioner<_complext> > D( new cuCgPreconditioner<_complext>() );
+  D->set_weights( precon_weights );
+  precon_weights.reset();
+  csm.reset();
+
+  boost::shared_ptr< std::vector<size_t> > recon_dims( new std::vector<size_t> );
+  *recon_dims = to_std_vector(matrix_size); recon_dims->push_back(frames_per_reconstruction); 
+
+  // Define regularization operators 
+  // We need "a pair" for PICCS
+  //
+
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> >
+    Rx( new cuPartialDerivativeOperator<_complext,3>(0) );
+  Rx->set_weight( (1.0f-alpha)*lambda );
+  Rx->set_domain_dimensions(recon_dims.get());
+  Rx->set_codomain_dimensions(recon_dims.get());
+
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> >
+    Ry( new cuPartialDerivativeOperator<_complext,3>(1) );
+  Ry->set_weight( (1.0f-alpha)*lambda );
+  Ry->set_domain_dimensions(recon_dims.get());
+  Ry->set_codomain_dimensions(recon_dims.get());
+ 
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> >
+    Rz( new cuPartialDerivativeOperator<_complext,3>(2) );
+  Rz->set_weight( (1.0f-alpha)*lambda );
+  Rz->set_domain_dimensions(recon_dims.get());
+  Rz->set_codomain_dimensions(recon_dims.get());
+
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> >
+    Rx2( new cuPartialDerivativeOperator<_complext,3>(0) );
+  Rx2->set_weight( alpha*lambda );
+  Rx2->set_domain_dimensions(recon_dims.get());
+  Rx2->set_codomain_dimensions(recon_dims.get());
+
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> >
+    Ry2( new cuPartialDerivativeOperator<_complext,3>(1) );
+  Ry2->set_weight( alpha*lambda );
+  Ry2->set_domain_dimensions(recon_dims.get());
+  Ry2->set_codomain_dimensions(recon_dims.get());
+ 
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> >
+    Rz2( new cuPartialDerivativeOperator<_complext,3>(2) );
+  Rz2->set_weight( alpha*lambda );
+  Rz2->set_domain_dimensions(recon_dims.get());
+  Rz2->set_codomain_dimensions(recon_dims.get());
+
+  delete timer;
+    
+  // 
+  // Setup radial SENSE reconstructions
+  //
+
+  vector<size_t> data_dims; 
+  data_dims.push_back(samples_per_reconstruction); data_dims.push_back(num_coils);
+
+  E->set_domain_dimensions(recon_dims.get());
+  E->set_codomain_dimensions(&data_dims);
+
+  sqrt_inplace(dcw.get());
+  E->set_dcw(dcw);
+  // Setup split-Bregman solver
+  cuSbcCgSolver<_complext> sb;
+  sb.set_encoding_operator( E );
+  
+  // Add "TV" regularization
+  if( alpha<1.0 ){
+    sb.add_regularization_group_operator( Rx ); 
+    sb.add_regularization_group_operator( Ry ); 
+    sb.add_regularization_group_operator( Rz ); 
+    sb.add_group();
+  }
+  
+  // Add "PICCS" regularization
+  if( alpha > 0.0 ){
+    sb.add_regularization_group_operator( Rx2 ); 
+    sb.add_regularization_group_operator( Ry2 ); 
+    sb.add_regularization_group_operator( Rz2 ); 
+    sb.add_group(reg_image);
+  }
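+  // First group: spatio-temporal TV with weight (1-alpha)*lambda; second group: the same
+  // derivative operators with weight alpha*lambda, anchored to reg_image (the PICCS prior).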
+  
+  sb.set_max_outer_iterations(num_sb_outer_iterations);
+  sb.set_max_inner_iterations(num_sb_inner_iterations);
+  sb.set_output_mode( cuSbcCgSolver<_complext>::OUTPUT_VERBOSE );
+
+  sb.get_inner_solver()->set_preconditioner ( D );
+  sb.get_inner_solver()->set_max_iterations( num_cg_iterations );
+  sb.get_inner_solver()->set_tc_tolerance( 1e-4 );
+  sb.get_inner_solver()->set_output_mode( cuCgSolver<_complext>::OUTPUT_WARNINGS );
+  
+  unsigned int num_reconstructions = num_profiles / profiles_per_reconstruction;
+
+  // Allocate space for result
+  std::vector<size_t> res_dims = to_std_vector(matrix_size); 
+  res_dims.push_back(frames_per_reconstruction*num_reconstructions); 
+  cuNDArray<_complext> result = cuNDArray<_complext>(&res_dims);
+
+  timer = new GPUTimer("Full SENSE reconstruction with TV regularization.");
+
+  for( unsigned int reconstruction = 0; reconstruction<num_reconstructions; reconstruction++ ){
+
+    // Determine trajectories
+    boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+      ( samples_per_profile, profiles_per_frame, frames_per_reconstruction, reconstruction*profiles_per_reconstruction );
+    
+    // Upload data
+    boost::shared_ptr< cuNDArray<_complext> > data = upload_data
+      ( reconstruction, samples_per_reconstruction, num_profiles*samples_per_profile, num_coils, host_data.get() );
+    
+    // Set current trajectory and trigger NFFT preprocessing
+    E->preprocess(traj.get());
+        
+    *data *= *dcw;
+    //
+    // Split-Bregman solver
+    //
+
+    boost::shared_ptr< cuNDArray<_complext> > sbresult;
+    {
+      GPUTimer timer("GPU constrained Split Bregman solve");
+      sbresult = sb.solve(data.get());
+    }
+
+    vector<size_t> tmp_dims = to_std_vector(matrix_size); tmp_dims.push_back(frames_per_reconstruction);
+    cuNDArray<_complext> tmp(&tmp_dims, result.get_data_ptr()+reconstruction*prod(matrix_size)*frames_per_reconstruction );
+
+    // Copy sbresult to result (pointed to by tmp)
+    tmp = *sbresult;
+  }
+  
+  delete timer;
+
+  // All done, write out the result
+
+  timer = new GPUTimer("Writing out result");
+
+  boost::shared_ptr< hoNDArray<_complext> > host_result = result.to_host();
+  write_nd_array<_complext>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+    
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(&result)->to_host();
+  write_nd_array<_real>( host_norm.get(), "result.real" );
+  
+  delete timer;
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/CMakeLists.txt b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/CMakeLists.txt
new file mode 100644
index 0000000..afb4616
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/CMakeLists.txt
@@ -0,0 +1,31 @@
+include( ${QT_USE_FILE} )
+
+# We need both the binary and source dirs here because of the header files generated during the build
+include_directories(
+  ${CMAKE_CURRENT_BINARY_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR}
+  ${OPENGL_INCLUDE_DIR}
+  ${GLUT_INCLUDE_DIR}
+  ${GLEW_INCLUDE_DIR}
+  ${CUDA_INCLUDE_DIRS}
+  ${Boost_INCLUDE_DIR} 
+)
+
+set(UI_UIFILES reconBaseWidget.ui radialSenseAppBaseMainWidget.ui)
+qt4_wrap_ui( UI_HEADERS ${UI_UIFILES} )
+ 
+set(UI_MOC_HEADERS radialSenseAppMainWidget.h reconWidget.h GLReconWidget.h)
+qt4_wrap_cpp (UI_MOC_OUTFILES ${UI_MOC_HEADERS})
+
+add_executable(sense_2d_golden_radial_gui main.cpp ${UI_MOC_OUTFILES}
+radialSenseAppMainWidget.cpp reconWidget.cpp GLReconWidget.cpp ${UI_HEADERS} )
+
+target_link_libraries(sense_2d_golden_radial_gui gadgetron_toolbox_gpucore gadgetron_toolbox_gpuparallelmri
+gadgetron_toolbox_gpunfft gadgetron_toolbox_hostutils gadgetron_toolbox_gpusolvers gadgetron_toolbox_gpuoperators ${CUDA_LIBRARIES} ${QT_QTGUI_LIBRARY} ${GLEW_LIBRARY}
+${QT_QTCORE_LIBRARY} ${QT_QTOPENGL_LIBRARY} ${OPENGL_gl_LIBRARY} )
+
+if (WIN32)
+set_target_properties( sense_2d_golden_radial_gui PROPERTIES LINK_FLAGS "/FORCE:MULTIPLE") 
+endif (WIN32)
+
+install(TARGETS sense_2d_golden_radial_gui DESTINATION bin COMPONENT main)
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/GLReconWidget.cpp b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/GLReconWidget.cpp
new file mode 100644
index 0000000..e195790
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/GLReconWidget.cpp
@@ -0,0 +1,222 @@
+#include <GL/glew.h>
+
+#include "GLReconWidget.h"
+#include "UIconstants.h"
+
+#include <cuda_runtime_api.h>
+#include <cuda_gl_interop.h>
+
+#include <stdio.h>
+
+// MSH: Ripped from cutil.h to remove the dependency; TODO: replace with proper error handling
+#  define CUDA_SAFE_CALL_NO_SYNC( call) {                                    \
+    cudaError err = call;                                                    \
+    if( cudaSuccess != err) {                                                \
+        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
+                __FILE__, __LINE__, cudaGetErrorString( err) );              \
+        exit(EXIT_FAILURE);                                                  \
+    } }
+
+#  define CUDA_SAFE_CALL( call)     CUDA_SAFE_CALL_NO_SYNC(call);
+
+
+GLReconWidget::GLReconWidget(QWidget *parent) : QGLWidget(parent)
+{
+  cudaWidget = new cuGLReconWidget( MATRIX_SIZE_INITIAL_VALUE, MATRIX_SIZE_INITIAL_VALUE );
+}
+
+void GLReconWidget::setMatrixSize( unsigned int width, unsigned int height )
+{
+  cudaWidget->width = width;
+  cudaWidget->height = height;
+  
+  cudaWidget->initializePBO();
+}
+
+void GLReconWidget::initializeGL()
+{
+  glewInit();
+  
+  if (!glewIsSupported("GL_VERSION_2_0 GL_VERSION_1_5 GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) {
+    fprintf(stderr, "Required OpenGL extensions missing.");
+    exit(1);
+  }
+  
+  cudaWidget->initializePBO();
+}
+
+void GLReconWidget::paintGL()
+{
+  cudaWidget->display();
+}
+
+void GLReconWidget::resizeGL( int w, int h )
+{
+  glViewport(0, 0, w, h);
+  
+  glMatrixMode(GL_MODELVIEW);
+  glLoadIdentity();
+  
+  glMatrixMode(GL_PROJECTION);
+  glLoadIdentity();
+  glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); 
+}
+
+void GLReconWidget::mapPBO()
+{
+  cudaWidget->mapPBO();
+}
+
+void GLReconWidget::unmapPBO()
+{
+  cudaWidget->unmapPBO();
+}
+
+float* GLReconWidget::getDevPtr()
+{
+  return cudaWidget->getDevPtr();
+}
+
+// shader for displaying floating-point texture
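+// It samples texture unit 0 at the interpolated texcoord and writes the texel directly to the fragment color.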
+static const char *shader_code = 
+  "!!ARBfp1.0\n"
+  "TEX result.color, fragment.texcoord, texture[0], 2D; \n"
+  "END";
+
+cuGLReconWidget::cuGLReconWidget( unsigned int width, unsigned int height )
+{
+  this->width = width;
+  this->height = height;
+  imageDevPtr = 0x0;
+  pbo = texid = shader = 0;
+}
+
+GLuint cuGLReconWidget::compileASMShader(GLenum program_type, const char *code)
+{
+  GLuint program_id;
+  glGenProgramsARB(1, &program_id);
+  glBindProgramARB(program_type, program_id);
+  glProgramStringARB(program_type, GL_PROGRAM_FORMAT_ASCII_ARB, (GLsizei) strlen(code), (GLubyte *) code);
+
+  GLint error_pos;
+  glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &error_pos);
+  if (error_pos != -1) {
+    const GLubyte *error_string;
+    error_string = glGetString(GL_PROGRAM_ERROR_STRING_ARB);
+    fprintf(stderr, "Program error at position: %d\n%s\n", (int)error_pos, error_string);
+    return 0;
+  }
+  return program_id;
+}
+
+void cuGLReconWidget::initializePBO()
+{
+  while( glGetError() != GL_NO_ERROR ){
+    printf("\nWARNING: glError detected prior to initializePBO");
+    fflush(stdout);
+  }
+  
+  // Create pixel buffer object (PBO) to "render Cuda memory" through a texture
+  glGenBuffersARB(1, &pbo);
+  glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
+
+  // Initialize PBO with zero image
+  float *tmp = (float*) calloc( width*height, sizeof(float) );
+  glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, width*height*sizeof(float), tmp, GL_STREAM_DRAW_ARB);
+
+  glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
+  CUDA_SAFE_CALL(cudaGLRegisterBufferObject(pbo));
+
+  // Create texture for display
+  glGenTextures(1, &texid);
+  glBindTexture(GL_TEXTURE_2D, texid);
+  glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE32F_ARB, width, height, 0, GL_LUMINANCE, GL_FLOAT, NULL);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP);
+    
+  glBindTexture(GL_TEXTURE_2D, 0);
+
+  // Load shader program
+  shader = compileASMShader(GL_FRAGMENT_PROGRAM_ARB, shader_code);
+
+  while( glGetError() != GL_NO_ERROR ){
+    printf("\nWARNING: glError detected during initializePBO");
+    fflush(stdout);
+  }
+
+  free(tmp);
+}
+
+void cuGLReconWidget::mapPBO()
+{
+  if( width==0 || height == 0 ){
+    printf("\nWARNING: pbo buffer size is 0! Has initialiseOpenGL() been called?\n");
+  }
+
+  imageDevPtr = 0x0;
+  
+  // Map the PBO used for rendering to Cuda device memory
+  CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&imageDevPtr, pbo));
+  
+  if( !imageDevPtr ){
+    printf("\nWARNING: no pbo allocated for reconstruction result!\n");
+  }
+  
+  // Error check
+  cudaError_t err = cudaGetLastError();
+  if( err != cudaSuccess ){
+    printf("\nCuda error detected: %s\n", cudaGetErrorString(err) ); fflush(stdout);
+    exit(1);
+  }
+}
+
+void cuGLReconWidget::unmapPBO()
+{
+  // Unmap Cuda <-> PBO relation
+  CUDA_SAFE_CALL(cudaGLUnmapBufferObject(pbo));
+
+  // Error check
+  cudaError_t err = cudaGetLastError();
+  if( err != cudaSuccess ){
+    printf("\nCuda error detected: %s\n", cudaGetErrorString(err) ); fflush(stdout);
+    exit(1);
+  }
+}
+
+float* cuGLReconWidget::getDevPtr()
+{
+  return imageDevPtr;     
+}
+
+void cuGLReconWidget::display()
+{
+  // Clear window
+  glClear(GL_COLOR_BUFFER_BIT);
+
+  // Load texture from PBO
+  glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
+  glBindTexture(GL_TEXTURE_2D, texid);
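+  // Note: with a PBO bound to GL_PIXEL_UNPACK_BUFFER the last argument below is a byte offset into the PBO (0), not a host pointer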
+  glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_LUMINANCE, GL_FLOAT, 0);
+  glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
+
+  // Use simple fragment program to display the floating point texture
+  glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader);
+  glEnable(GL_FRAGMENT_PROGRAM_ARB);
+  glDisable(GL_DEPTH_TEST);
+
+  // Render quad; the texture coordinate is set before the vertex it applies to
+  glBegin(GL_QUADS);
+  {
+    glTexCoord2f(0, 1); glVertex2f(0, 1);
+    glTexCoord2f(0, 0); glVertex2f(0, 0);
+    glTexCoord2f(1, 0); glVertex2f(1, 0);
+    glTexCoord2f(1, 1); glVertex2f(1, 1);
+  }
+  glEnd();
+
+  // Restore original state
+  glBindTexture(GL_TEXTURE_2D, 0);
+  glDisable(GL_FRAGMENT_PROGRAM_ARB);
+}
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/GLReconWidget.h b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/GLReconWidget.h
new file mode 100644
index 0000000..4c579d3
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/GLReconWidget.h
@@ -0,0 +1,56 @@
+#pragma once
+
+#include "cuNDArray.h"
+
+#if defined (WIN32)
+#include <Windows.h>
+#endif
+
+#ifdef __MACH__
+#import <OpenGL/gl.h>
+#else
+#include <GL/gl.h>
+#endif //__MACH__
+
+#include <QtOpenGL/QGLWidget>
+
+class cuGLReconWidget
+{
+public:
+  cuGLReconWidget( unsigned int width, unsigned int height );
+  
+  void initializePBO();
+  void mapPBO();
+  void unmapPBO();
+  float* getDevPtr();
+  void display();
+  
+  GLuint compileASMShader(GLenum program_type, const char *code);
+  
+  unsigned int width;
+  unsigned int height;
+  GLuint pbo;            // OpenGL pixel buffer object (map between Cuda and OpenGL)
+  GLuint texid;          // Texture (display pbo)
+  GLuint shader;         // Pixel shader for rendering of texture
+  float *imageDevPtr;    // This is the "exchange buffer" between Cuda and OpenGL
+};
+
+class GLReconWidget : public QGLWidget
+{
+  Q_OBJECT
+  
+  public:
+  GLReconWidget(QWidget* parent = 0);
+  void setMatrixSize( unsigned int width, unsigned int height );
+  void mapPBO();
+  void unmapPBO();
+  float* getDevPtr();
+
+protected:
+  void initializeGL();
+  void paintGL();
+  void resizeGL(int w, int h);
+    
+private:
+  cuGLReconWidget *cudaWidget;
+};
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/UIconstants.h b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/UIconstants.h
new file mode 100644
index 0000000..e9a0c9f
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/UIconstants.h
@@ -0,0 +1,10 @@
+#ifndef _UI_CONSTANTS
+#define _UI_CONSTANTS
+
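+// Default start-up values for the GUI: reconstruction matrix size, oversampled (gridding) matrix
+// size, number of CG iterations, regularization weight (kappa), gridding kernel width, and the
+// number of frames accumulated when estimating the coil sensitivity maps.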
+const unsigned int MATRIX_SIZE_INITIAL_VALUE = 192;
+const unsigned int MATRIX_SIZE_OS_INITIAL_VALUE = 256;
+const unsigned int NUM_ITERATIONS_INITIAL_VALUE = 15;
+const double REG_WEIGHT_INITIAL_VALUE = 0.01;
+const double KERNEL_SIZE_INITIAL_VALUE = 5.5;
+const unsigned int NUM_FRAMES_PER_CSM_RECON = 8;
+#endif
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/main.cpp b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/main.cpp
new file mode 100644
index 0000000..1cf22eb
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/main.cpp
@@ -0,0 +1,19 @@
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+
+#include "radialSenseAppMainWidget.h"
+
+#include <stdlib.h>
+
+#include <QtGui/QApplication>
+
+int
+main( int argc, char** argv) 
+{
+  QApplication app(argc, argv);
+  radialSenseAppMainWindow window;
+  window.show();
+  
+  return app.exec();
+}
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/radialSenseAppBaseMainWidget.ui b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/radialSenseAppBaseMainWidget.ui
new file mode 100644
index 0000000..49a69e3
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/radialSenseAppBaseMainWidget.ui
@@ -0,0 +1,572 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ui version="4.0">
+ <class>radialSenseAppBaseMainWindow</class>
+ <widget class="QMainWindow" name="radialSenseAppBaseMainWindow">
+  <property name="geometry">
+   <rect>
+    <x>0</x>
+    <y>0</y>
+    <width>476</width>
+    <height>539</height>
+   </rect>
+  </property>
+  <property name="sizePolicy">
+   <sizepolicy hsizetype="MinimumExpanding" vsizetype="MinimumExpanding">
+    <horstretch>0</horstretch>
+    <verstretch>0</verstretch>
+   </sizepolicy>
+  </property>
+  <property name="windowTitle">
+   <string>Radial Sense GPU Reconstructor</string>
+  </property>
+  <property name="locale">
+   <locale language="English" country="UnitedKingdom"/>
+  </property>
+  <widget class="QWidget" name="centralwidget">
+   <widget class="ReconWidget" name="reconWidget" native="true">
+    <property name="geometry">
+     <rect>
+      <x>20</x>
+      <y>30</y>
+      <width>272</width>
+      <height>414</height>
+     </rect>
+    </property>
+    <property name="sizePolicy">
+     <sizepolicy hsizetype="MinimumExpanding" vsizetype="MinimumExpanding">
+      <horstretch>0</horstretch>
+      <verstretch>0</verstretch>
+     </sizepolicy>
+    </property>
+    <property name="minimumSize">
+     <size>
+      <width>272</width>
+      <height>414</height>
+     </size>
+    </property>
+   </widget>
+   <widget class="QLabel" name="label">
+    <property name="geometry">
+     <rect>
+      <x>340</x>
+      <y>10</y>
+      <width>81</width>
+      <height>16</height>
+     </rect>
+    </property>
+    <property name="font">
+     <font>
+      <weight>75</weight>
+      <bold>true</bold>
+      <underline>true</underline>
+     </font>
+    </property>
+    <property name="text">
+     <string>Matrix sizes</string>
+    </property>
+   </widget>
+   <widget class="QLabel" name="label_3">
+    <property name="geometry">
+     <rect>
+      <x>310</x>
+      <y>30</y>
+      <width>51</width>
+      <height>16</height>
+     </rect>
+    </property>
+    <property name="font">
+     <font>
+      <pointsize>12</pointsize>
+     </font>
+    </property>
+    <property name="text">
+     <string>Target</string>
+    </property>
+   </widget>
+   <widget class="QSpinBox" name="matrixSizeSpinBox">
+    <property name="enabled">
+     <bool>false</bool>
+    </property>
+    <property name="geometry">
+     <rect>
+      <x>310</x>
+      <y>50</y>
+      <width>71</width>
+      <height>22</height>
+     </rect>
+    </property>
+    <property name="minimum">
+     <number>16</number>
+    </property>
+    <property name="maximum">
+     <number>512</number>
+    </property>
+    <property name="singleStep">
+     <number>2</number>
+    </property>
+    <property name="value">
+     <number>16</number>
+    </property>
+   </widget>
+   <widget class="QSpinBox" name="oversampledMatrixSizeSpinBox">
+    <property name="enabled">
+     <bool>false</bool>
+    </property>
+    <property name="geometry">
+     <rect>
+      <x>390</x>
+      <y>50</y>
+      <width>61</width>
+      <height>22</height>
+     </rect>
+    </property>
+    <property name="minimum">
+     <number>16</number>
+    </property>
+    <property name="maximum">
+     <number>512</number>
+    </property>
+    <property name="singleStep">
+     <number>0</number>
+    </property>
+    <property name="value">
+     <number>16</number>
+    </property>
+   </widget>
+   <widget class="QLabel" name="label_6">
+    <property name="geometry">
+     <rect>
+      <x>310</x>
+      <y>210</y>
+      <width>141</width>
+      <height>16</height>
+     </rect>
+    </property>
+    <property name="font">
+     <font>
+      <pointsize>12</pointsize>
+     </font>
+    </property>
+    <property name="text">
+     <string>Regularization weight</string>
+    </property>
+   </widget>
+   <widget class="QDoubleSpinBox" name="regularizationWeightSpinBox">
+    <property name="enabled">
+     <bool>true</bool>
+    </property>
+    <property name="geometry">
+     <rect>
+      <x>310</x>
+      <y>230</y>
+      <width>71</width>
+      <height>22</height>
+     </rect>
+    </property>
+    <property name="decimals">
+     <number>3</number>
+    </property>
+    <property name="maximum">
+     <double>50.000000000000000</double>
+    </property>
+    <property name="singleStep">
+     <double>0.010000000000000</double>
+    </property>
+    <property name="value">
+     <double>0.000000000000000</double>
+    </property>
+   </widget>
+   <widget class="QLabel" name="label_4">
+    <property name="geometry">
+     <rect>
+      <x>380</x>
+      <y>30</y>
+      <width>81</width>
+      <height>16</height>
+     </rect>
+    </property>
+    <property name="font">
+     <font>
+      <pointsize>12</pointsize>
+     </font>
+    </property>
+    <property name="text">
+     <string>Oversampled</string>
+    </property>
+   </widget>
+   <widget class="QLabel" name="label_10">
+    <property name="geometry">
+     <rect>
+      <x>310</x>
+      <y>80</y>
+      <width>81</width>
+      <height>16</height>
+     </rect>
+    </property>
+    <property name="font">
+     <font>
+      <pointsize>12</pointsize>
+     </font>
+    </property>
+    <property name="text">
+     <string>Kernel width</string>
+    </property>
+   </widget>
+   <widget class="QDoubleSpinBox" name="kernelSizeSpinBox">
+    <property name="enabled">
+     <bool>true</bool>
+    </property>
+    <property name="geometry">
+     <rect>
+      <x>310</x>
+      <y>100</y>
+      <width>51</width>
+      <height>22</height>
+     </rect>
+    </property>
+    <property name="decimals">
+     <number>2</number>
+    </property>
+    <property name="minimum">
+     <double>1.000000000000000</double>
+    </property>
+    <property name="maximum">
+     <double>15.000000000000000</double>
+    </property>
+    <property name="singleStep">
+     <double>0.500000000000000</double>
+    </property>
+    <property name="value">
+     <double>1.000000000000000</double>
+    </property>
+   </widget>
+   <widget class="Line" name="line_2">
+    <property name="geometry">
+     <rect>
+      <x>310</x>
+      <y>140</y>
+      <width>118</width>
+      <height>3</height>
+     </rect>
+    </property>
+    <property name="orientation">
+     <enum>Qt::Horizontal</enum>
+    </property>
+   </widget>
+   <widget class="QLabel" name="label_11">
+    <property name="geometry">
+     <rect>
+      <x>310</x>
+      <y>150</y>
+      <width>131</width>
+      <height>16</height>
+     </rect>
+    </property>
+    <property name="font">
+     <font>
+      <pointsize>12</pointsize>
+     </font>
+    </property>
+    <property name="text">
+     <string>Number of iterations</string>
+    </property>
+   </widget>
+   <widget class="QSpinBox" name="numIterationsSpinBox">
+    <property name="geometry">
+     <rect>
+      <x>310</x>
+      <y>170</y>
+      <width>51</width>
+      <height>22</height>
+     </rect>
+    </property>
+    <property name="minimum">
+     <number>1</number>
+    </property>
+    <property name="maximum">
+     <number>99</number>
+    </property>
+    <property name="singleStep">
+     <number>1</number>
+    </property>
+    <property name="value">
+     <number>1</number>
+    </property>
+   </widget>
+   <widget class="QLabel" name="label_12">
+    <property name="geometry">
+     <rect>
+      <x>310</x>
+      <y>270</y>
+      <width>131</width>
+      <height>16</height>
+     </rect>
+    </property>
+    <property name="text">
+     <string>Window scale</string>
+    </property>
+   </widget>
+   <widget class="QDoubleSpinBox" name="windowScaleSpinBox">
+    <property name="enabled">
+     <bool>true</bool>
+    </property>
+    <property name="geometry">
+     <rect>
+      <x>310</x>
+      <y>290</y>
+      <width>62</width>
+      <height>22</height>
+     </rect>
+    </property>
+    <property name="decimals">
+     <number>1</number>
+    </property>
+    <property name="minimum">
+     <double>1.000000000000000</double>
+    </property>
+    <property name="maximum">
+     <double>10.000000000000000</double>
+    </property>
+    <property name="singleStep">
+     <double>0.250000000000000</double>
+    </property>
+    <property name="value">
+     <double>2.000000000000000</double>
+    </property>
+   </widget>
+  </widget>
+  <widget class="QMenuBar" name="menubar">
+   <property name="geometry">
+    <rect>
+     <x>0</x>
+     <y>0</y>
+     <width>476</width>
+     <height>22</height>
+    </rect>
+   </property>
+   <widget class="QMenu" name="menuFile">
+    <property name="title">
+     <string>File</string>
+    </property>
+    <addaction name="actionOpen_cplx_file"/>
+    <addaction name="separator"/>
+    <addaction name="actionSave_image"/>
+    <addaction name="separator"/>
+    <addaction name="actionClose"/>
+    <addaction name="separator"/>
+    <addaction name="actionExit"/>
+   </widget>
+   <widget class="QMenu" name="menuHelp">
+    <property name="title">
+     <string>Help</string>
+    </property>
+    <addaction name="separator"/>
+   </widget>
+   <addaction name="menuFile"/>
+   <addaction name="menuHelp"/>
+  </widget>
+  <widget class="QStatusBar" name="statusbar"/>
+  <widget class="QToolBar" name="toolBar">
+   <property name="windowTitle">
+    <string>toolBar</string>
+   </property>
+   <attribute name="toolBarArea">
+    <enum>TopToolBarArea</enum>
+   </attribute>
+   <attribute name="toolBarBreak">
+    <bool>false</bool>
+   </attribute>
+  </widget>
+  <widget class="QToolBar" name="toolBar_2">
+   <property name="windowTitle">
+    <string>toolBar_2</string>
+   </property>
+   <attribute name="toolBarArea">
+    <enum>TopToolBarArea</enum>
+   </attribute>
+   <attribute name="toolBarBreak">
+    <bool>false</bool>
+   </attribute>
+  </widget>
+  <widget class="QToolBar" name="toolBar_3">
+   <property name="windowTitle">
+    <string>toolBar_3</string>
+   </property>
+   <attribute name="toolBarArea">
+    <enum>TopToolBarArea</enum>
+   </attribute>
+   <attribute name="toolBarBreak">
+    <bool>false</bool>
+   </attribute>
+  </widget>
+  <widget class="QToolBar" name="toolBar_4">
+   <property name="windowTitle">
+    <string>toolBar_4</string>
+   </property>
+   <attribute name="toolBarArea">
+    <enum>TopToolBarArea</enum>
+   </attribute>
+   <attribute name="toolBarBreak">
+    <bool>false</bool>
+   </attribute>
+  </widget>
+  <widget class="QToolBar" name="toolBar_5">
+   <property name="windowTitle">
+    <string>toolBar_5</string>
+   </property>
+   <attribute name="toolBarArea">
+    <enum>TopToolBarArea</enum>
+   </attribute>
+   <attribute name="toolBarBreak">
+    <bool>false</bool>
+   </attribute>
+  </widget>
+  <action name="actionOpen_cplx_file">
+   <property name="text">
+    <string>Open .cplx file</string>
+   </property>
+   <property name="shortcut">
+    <string>Ctrl+O</string>
+   </property>
+  </action>
+  <action name="actionExit">
+   <property name="text">
+    <string>Quit</string>
+   </property>
+   <property name="shortcut">
+    <string>Ctrl+Q</string>
+   </property>
+  </action>
+  <action name="actionClose">
+   <property name="text">
+    <string>Close</string>
+   </property>
+   <property name="shortcut">
+    <string>Ctrl+W</string>
+   </property>
+  </action>
+  <action name="actionSave_image">
+   <property name="text">
+    <string>Save image</string>
+   </property>
+   <property name="shortcut">
+    <string>Ctrl+S</string>
+   </property>
+  </action>
+ </widget>
+ <customwidgets>
+  <customwidget>
+   <class>ReconWidget</class>
+   <extends>QWidget</extends>
+   <header>reconWidget.h</header>
+   <container>1</container>
+  </customwidget>
+ </customwidgets>
+ <resources/>
+ <connections>
+  <connection>
+   <sender>kernelSizeSpinBox</sender>
+   <signal>editingFinished()</signal>
+   <receiver>radialSenseAppBaseMainWindow</receiver>
+   <slot>kernelWidthChanged()</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>330</x>
+     <y>260</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>442</x>
+     <y>277</y>
+    </hint>
+   </hints>
+  </connection>
+  <connection>
+   <sender>matrixSizeSpinBox</sender>
+   <signal>editingFinished()</signal>
+   <receiver>radialSenseAppBaseMainWindow</receiver>
+   <slot>matrixSizeChanged()</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>321</x>
+     <y>211</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>441</x>
+     <y>249</y>
+    </hint>
+   </hints>
+  </connection>
+  <connection>
+   <sender>oversampledMatrixSizeSpinBox</sender>
+   <signal>editingFinished()</signal>
+   <receiver>radialSenseAppBaseMainWindow</receiver>
+   <slot>matrixSizeOSChanged()</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>379</x>
+     <y>212</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>448</x>
+     <y>224</y>
+    </hint>
+   </hints>
+  </connection>
+  <connection>
+   <sender>numIterationsSpinBox</sender>
+   <signal>valueChanged(int)</signal>
+   <receiver>radialSenseAppBaseMainWindow</receiver>
+   <slot>numIterationsChanged()</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>320</x>
+     <y>382</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>441</x>
+     <y>402</y>
+    </hint>
+   </hints>
+  </connection>
+  <connection>
+   <sender>regularizationWeightSpinBox</sender>
+   <signal>valueChanged(double)</signal>
+   <receiver>radialSenseAppBaseMainWindow</receiver>
+   <slot>regularizationWeightChanged()</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>332</x>
+     <y>433</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>446</x>
+     <y>453</y>
+    </hint>
+   </hints>
+  </connection>
+  <connection>
+   <sender>windowScaleSpinBox</sender>
+   <signal>valueChanged(double)</signal>
+   <receiver>radialSenseAppBaseMainWindow</receiver>
+   <slot>windowScaleChanged(double)</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>339</x>
+     <y>484</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>444</x>
+     <y>501</y>
+    </hint>
+   </hints>
+  </connection>
+ </connections>
+ <slots>
+  <slot>matrixSizeChanged()</slot>
+  <slot>matrixSizeOSChanged()</slot>
+  <slot>regularizationWeightChanged()</slot>
+  <slot>numIterationsChanged()</slot>
+  <slot>kernelWidthChanged()</slot>
+  <slot>windowScaleChanged(double)</slot>
+ </slots>
+</ui>
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/radialSenseAppMainWidget.cpp b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/radialSenseAppMainWidget.cpp
new file mode 100644
index 0000000..205214f
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/radialSenseAppMainWidget.cpp
@@ -0,0 +1,690 @@
+#include "radialSenseAppMainWidget.h"
+
+#include "hoNDArray_fileio.h"
+#include "cuNFFT.h"
+#include "NFFT_utils.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+#include "vector_td_operators.h"
+#include "vector_td_utilities.h"
+#include "radial_utilities.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuCgSolver.h"
+#include "b1_map.h"
+
+#include "UIconstants.h"
+#include "GLReconWidget.h"
+
+#include <QtGui/QFileDialog>
+#include <QtGui/QProgressDialog>
+#include <QtGui/QMessageBox>
+#include <QtCore/QSignalMapper>
+
+#include <assert.h>
+
+using namespace std;
+using namespace Gadgetron;
+
+void radialSenseAppMainWindow::resetPrivateData()
+{
+  if( statusLabel ) delete statusLabel;
+  statusLabel = 0x0;
+  ready = false;
+}
+
+radialSenseAppMainWindow::radialSenseAppMainWindow(QWidget *parent) : QMainWindow(parent)
+{
+  statusLabel = 0x0;
+
+  setupUi(this);
+  retranslateUi(this);
+
+  resetPrivateData();
+
+  matrixSizeSpinBox->setValue(MATRIX_SIZE_INITIAL_VALUE);
+  oversampledMatrixSizeSpinBox->setValue(MATRIX_SIZE_OS_INITIAL_VALUE);
+  numIterationsSpinBox->setValue(NUM_ITERATIONS_INITIAL_VALUE);
+  regularizationWeightSpinBox->setValue(REG_WEIGHT_INITIAL_VALUE);
+  kernelSizeSpinBox->setValue(KERNEL_SIZE_INITIAL_VALUE);
+
+  // Menu actions
+  connect(actionOpen_cplx_file, SIGNAL(triggered()), this, SLOT(open()));
+  connect(actionSave_image, SIGNAL(triggered()), this, SLOT(saveImage()));
+  connect(actionClose, SIGNAL(triggered()), this, SLOT(close()));
+  connect(actionExit, SIGNAL(triggered()), qApp, SLOT(quit()));
+
+  // Originally, the intention was to support multiple ReconWidgets in the app.
+  // This is why a QSignalMapper is used rather than connecting the slider signals directly.
+
+  // Connect to the reconWidgets' frameChanged slots
+  QSignalMapper *signalMapper1 = new QSignalMapper(this);
+  connect(reconWidget->projectionSelectionScrollBar, SIGNAL(valueChanged(int)), signalMapper1, SLOT(map()));
+  signalMapper1->setMapping(reconWidget->projectionSelectionScrollBar, 1 );
+  connect(signalMapper1, SIGNAL(mapped(int)), this, SLOT(centralProjectionChanged(int)));
+
+  // Connect to the reconWidgets' projectionsPerFrameChanged slots
+  QSignalMapper *signalMapper2 = new QSignalMapper(this);
+  connect(reconWidget->numProjectionsScrollBar, SIGNAL(valueChanged(int)), signalMapper2, SLOT(map()));
+  signalMapper2->setMapping(reconWidget->numProjectionsScrollBar, 1 );
+  connect(signalMapper2, SIGNAL(mapped(int)), this, SLOT(projectionsPerFrameChanged(int)));
+
+  // Allocate encoding operator for non-Cartesian Sense
+  E = boost::shared_ptr< cuNonCartesianSenseOperator<float,2> >( new cuNonCartesianSenseOperator<float,2>() );  
+
+  // Allocate preconditioner
+  D = boost::shared_ptr< cuCgPreconditioner<float_complext> >( new cuCgPreconditioner<float_complext>() );
+
+  // Allocate regularization image operator
+  R = boost::shared_ptr< cuImageOperator<float_complext> >( new cuImageOperator<float_complext>() );
+  R->set_weight( 1.0f );
+
+  // Setup solver
+  cg.set_encoding_operator( E );        // encoding matrix
+  cg.add_regularization_operator( R );  // regularization matrix
+  cg.set_preconditioner ( D );          // preconditioning matrix
+  cg.set_max_iterations( get_num_iterations() );
+  cg.set_tc_tolerance( 1e-6 );
+  cg.set_output_mode( cuCgSolver<float_complext>::OUTPUT_SILENT );
+}
+
+/*
+  Slots
+*/
+
+void radialSenseAppMainWindow::open()
+{
+  // Open dialog box
+  QString filename = QFileDialog::getOpenFileName( this, tr("Open File"), "./", tr("Raw data (*.cplx)"));
+
+  if( filename.size() == 0 )
+    return; // Cancel
+
+  // Close current file
+  close();
+
+  // Update status bar
+  statusLabel = new QLabel(filename);	
+  statusBar()->addWidget(statusLabel);
+
+  // Read samples from disk
+  host_samples = read_nd_array<float_complext>(filename.toLatin1().constData());
+  cout << endl << "loaded dataset with " << host_samples->get_number_of_elements() << " samples." << endl;
+
+  // The matrix-size spin boxes are kept disabled until data has been loaded; enable them now
+  matrixSizeSpinBox->setEnabled(true); 
+  oversampledMatrixSizeSpinBox->setEnabled(true);
+  
+  // Choose startup frame
+  reconWidget->projectionSelectionScrollBar->setValue(get_matrix_size().vec[0]>>2);
+  reconWidget->numProjectionsScrollBar->setValue(34);
+
+  replan();
+
+}
+
+void radialSenseAppMainWindow::saveImage()
+{ /*
+  // Open dialog box
+  QString filename = QFileDialog::getSaveFileName( this, tr("Save image to file"), "./", tr("Raw float data (*.raw)"));
+
+  if( filename.size() == 0 )
+  return; // Cancel
+
+  // This code is copied from 'reconstruct' and slightly modified...
+
+  <cut..>
+
+  LOOP:
+
+  // Save file
+  cudaMemcpy( tmp, devPtr, prod(get_matrix_size())*sizeof(float), cudaMemcpyDeviceToHost );
+  fwrite( tmp, prod(get_matrix_size()), sizeof(float), fout );
+
+  // Report any errors not already caught...
+  err = cudaGetLastError();
+  if( err != cudaSuccess ){
+  QMessageBox::critical( this, tr("Cuda error"), tr(cudaGetErrorString(err)) );
+  actionExit->trigger();
+  }
+	
+  END LOOP:
+
+  reconWidget->projectionNumberSpinBox->setValue(reconWidget->projectionNumberSpinBox->value()+20);
+  }
+
+  fclose(fout);
+  cudaFree(devPtr);
+  */
+}
+
+void radialSenseAppMainWindow::close()
+{	
+  resetPrivateData();
+}
+
+void radialSenseAppMainWindow::replan()
+{
+  QProgressDialog progress("Calibrating", "", 0, 5, this);
+  progress.setWindowModality(Qt::WindowModal);
+  progress.setValue(0);
+  progress.show();
+
+  // Set GUI elements before the plan is created to avoid triggering unnecessary reconstructions
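+  // Cap the projections-per-frame range at four times the matrix size or at half of the available profiles per coil, whichever is smaller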
+  unsigned int maxProjections = min(get_matrix_size().vec[0]<<2, (get_num_points_per_array_coil()/get_num_samples_per_projection())>>1);
+  reconWidget->numProjectionsScrollBar->setMaximum(maxProjections);
+  reconWidget->numProjectionsSpinBox->setMaximum(maxProjections);
+  unsigned int maxCentralProjection = get_maximum_central_projection();
+  reconWidget->projectionSelectionScrollBar->setMaximum(maxCentralProjection);
+  reconWidget->projectionNumberSpinBox->setMaximum(maxCentralProjection);
+  unsigned int minCentralProjection = get_num_projections_per_frame()>>1;
+  reconWidget->projectionSelectionScrollBar->setMinimum(minCentralProjection);
+  reconWidget->projectionNumberSpinBox->setMinimum(minCentralProjection);
+									    
+  progress.setValue(1);
+
+  // Passing the matrix size on to the GLReconWidget is currently disabled:
+  //	reconWidget->openglCanvas->setMatrixSize( get_matrix_size().vec[0], get_matrix_size().vec[1] );
+
+  progress.setValue(2);
+
+  const unsigned int samples_per_profile = get_num_samples_per_projection();
+  const unsigned int num_profiles = get_num_points_per_array_coil() / samples_per_profile;
+  const unsigned int profiles_per_frame = get_num_projections_per_frame();
+  const unsigned int frames_per_reconstruction = NUM_FRAMES_PER_CSM_RECON;
+  const unsigned int profiles_per_reconstruction = get_num_projections_per_frame()*frames_per_reconstruction;
+  const unsigned int samples_per_reconstruction = profiles_per_reconstruction*samples_per_profile;
+
+  // Density compensation weights are constant throughout all reconstructions
+  dcw  = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, profiles_per_frame, (float)get_matrix_size_os().vec[0]/(float)get_matrix_size().vec[0], 
+      float(1)/((float)samples_per_profile/(float)max(get_matrix_size().vec[0],get_matrix_size().vec[1])) );
+  
+  progress.setValue(3);
+
+  // Setup plan for convolution
+  plan.setup( get_matrix_size(), get_matrix_size_os(), get_kernel_width() );
+  
+  // Temporary oversampled image buffer
+  vector<unsigned int> image_os_dims = uint64d_to_vector<2>(get_matrix_size_os()); 
+  image_os_dims.push_back(frames_per_reconstruction); image_os_dims.push_back(get_num_coils());    
+  cuNDArray<float_complext> *image_os = new cuNDArray<float_complext>();
+  image_os->create(&image_os_dims);
+
+  // Extract coil sensitivity maps and training data using all the data
+  for( unsigned int iteration = 0; iteration < num_profiles/profiles_per_reconstruction; iteration++ ) {
+    
+    // Define trajectories
+    boost::shared_ptr< cuNDArray<floatd2> > traj = compute_radial_trajectory_golden_ratio_2d<float>
+      ( samples_per_profile, profiles_per_frame, frames_per_reconstruction, iteration*profiles_per_reconstruction );
+    
+    // Preprocess
+    plan.preprocess( traj.get(), cuNFFT_plan<float,2>::NFFT_PREP_NC2C );
+    traj.reset();
+    
+    // Upload data
+    boost::shared_ptr< cuNDArray<float_complext> > csm_data =
+      upload_data( iteration*profiles_per_reconstruction, samples_per_profile, samples_per_reconstruction,
+		   num_profiles*samples_per_profile, get_num_coils(), host_samples.get() );
+    
+    // Accumulate k-space for CSM estimation
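+    // The last argument enables accumulation into 'image_os' on all but the first pass, so that every profile contributes to the CSM estimate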
+    plan.convolve( csm_data.get(), image_os, dcw.get(), cuNFFT_plan<float,2>::NFFT_CONV_NC2C, iteration > 0 );
+    csm_data.reset();
+  }
+  
+  // We now have 'frames_per_reconstruction' k-space images of each coil. Add these up.
+  boost::shared_ptr< cuNDArray<float_complext> > acc_image_os = sum<float_complext>( image_os, 2 );
+  delete image_os; image_os = 0x0;
+  
+  // Complete gridding of k-space CSM image
+  plan.fft( acc_image_os.get(), cuNFFT_plan<float,2>::NFFT_BACKWARDS );
+  plan.deapodize( acc_image_os.get() );
+  
+  // Remove oversampling
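+  // Crop the central region of matrix_size out of the oversampled image; the corner offset is half the size difference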
+  vector<unsigned int> image_dims = uint64d_to_vector<2>(get_matrix_size()); image_dims.push_back(get_num_coils());
+  cuNDArray<float_complext> *image = new cuNDArray<float_complext>();
+  image->create(&image_dims);
+  crop<float_complext,2>( (get_matrix_size_os()-get_matrix_size())>>1, acc_image_os.get(), image );
+  acc_image_os.reset();
+  
+  // Estimate CSM
+  csm = estimate_b1_map<float,2>( image );
+
+  progress.setValue(4);
+
+  E->setup( get_matrix_size(), get_matrix_size_os(), get_kernel_width() ); 
+  E->set_csm(csm);
+
+  // Setup regularization operator
+  image_dims = uint64d_to_vector<2>(get_matrix_size());
+  cuNDArray<float_complext> *reg_image = new cuNDArray<float_complext>();
+  reg_image->create( &image_dims );
+
+  E->mult_csm_conj_sum( image, reg_image );
+  R->compute( reg_image );
+
+  delete image; image = 0x0; 
+  delete reg_image; reg_image = 0x0; 
+
+  // Define preconditioning weights
+  update_preconditioning_weights();
+    
+  progress.setValue(5);
+
+  ready = true;
+
+  // Trigger the #projections slot
+  reconWidget->numProjectionsScrollBar->setValue(reconWidget->numProjectionsScrollBar->value()+1);
+
+  // Perform reconstruction
+  reconstruct();
+}
+
+void radialSenseAppMainWindow::update_preconditioning_weights()
+{
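+  // Diagonal preconditioner: D_ii = 1/sqrt( sum_c |csm_c(i)|^2 + kappa*r_i ),
+  // where r_i is the (real) image returned by the regularization image operator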
+  boost::shared_ptr< cuNDArray<float> > _precon_weights = sum(abs_square(csm.get()).get(),2);
+  boost::shared_ptr< cuNDArray<float> > R_diag = R->get();
+  *R_diag *= get_kappa();
+  *_precon_weights += *R_diag;
+  reciprocal_sqrt_inplace(_precon_weights.get());
+  boost::shared_ptr< cuNDArray<float_complext> > precon_weights = real_to_complex<float_complext>( _precon_weights.get() );
+  D->set_weights( precon_weights );
+}
+
+void radialSenseAppMainWindow::projectionsPerFrameChanged(int)
+{
+  // The integer argument is the QSignalMapper 'id', not the slider value
+
+  unsigned int value = get_num_projections_per_frame();
+
+  // Enforce even values
+  if( value%2 ){
+    value--;
+    reconWidget->numProjectionsScrollBar->setValue(value);
+    return;
+  }
+
+  if(!ready) return;
+
+  // Remove the Qt lag of the slider rendering
+  QApplication::processEvents();
+
+  // The range of the frames slider/spinbox has changed
+  unsigned int maxCentralProjection = get_maximum_central_projection();
+  reconWidget->projectionSelectionScrollBar->setMaximum(maxCentralProjection);
+  reconWidget->projectionNumberSpinBox->setMaximum(maxCentralProjection);
+  reconWidget->projectionSelectionScrollBar->setSingleStep(value>>2);
+  reconWidget->projectionNumberSpinBox->setSingleStep(value>>2);
+
+  const unsigned int samples_per_profile = get_num_samples_per_projection();
+  const unsigned int profiles_per_frame = get_num_projections_per_frame();
+  
+  // Density compensation weights are constant throughout all reconstructions
+  dcw  = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, profiles_per_frame, (float)get_matrix_size_os().vec[0]/(float)get_matrix_size().vec[0], 
+      float(1)/((float)samples_per_profile/(float)max(get_matrix_size().vec[0],get_matrix_size().vec[1])) );
+  
+  // Set density compensation weights
+  E->set_dcw(dcw);
+
+  // Reconstruct
+  reconstruct();
+}
+
+void radialSenseAppMainWindow::centralProjectionChanged(int id)
+{
+  // The integer argument is the QSignalMapper 'id', not the slider value
+
+  // Enforce even values
+  unsigned int value = get_central_projection();
+  if( value%2 ){
+    value--;
+    reconWidget->projectionSelectionScrollBar->setValue(value);
+    return;
+  }
+
+  if(!ready) return;
+
+  // Remove the lag of the slider rendering
+  QApplication::processEvents();
+
+  // Perform reconstruction
+  reconstruct();
+}
+
+void radialSenseAppMainWindow::matrixSizeChanged()
+{
+  static unsigned int lastValue = MATRIX_SIZE_INITIAL_VALUE;
+
+  unsigned int value = matrixSizeSpinBox->value();
+  unsigned int value_os = oversampledMatrixSizeSpinBox->value();
+
+  if( value == lastValue )
+    return;
+  else 
+    lastValue = value;
+
+  if(!ready) return;
+
+  // Pass matrix size to GLReconWidget
+  reconWidget->openglCanvas->setMatrixSize( value, value );
+	
+  if( value_os < value ){
+    oversampledMatrixSizeSpinBox->setValue(value);
+  }
+  
+  // and encoding matrix
+  E->setup( get_matrix_size(), get_matrix_size_os(), get_kernel_width() );  
+  
+  replan();
+}
+
+void radialSenseAppMainWindow::matrixSizeOSChanged()
+{
+  static unsigned int lastValue = MATRIX_SIZE_OS_INITIAL_VALUE;
+
+  unsigned int value = matrixSizeSpinBox->value();
+  unsigned int value_os = oversampledMatrixSizeSpinBox->value();
+
+  if( value_os == lastValue )
+    return;
+  else 
+    lastValue = value_os;
+
+  if( value_os < value ){
+    oversampledMatrixSizeSpinBox->setValue(value);
+    return;
+  }
+	
+  if( value_os%2 ){
+    value_os++;
+    oversampledMatrixSizeSpinBox->setValue(value_os);
+    return;
+  }
+
+  if(!ready) return;
+
+  E->setup( get_matrix_size(), get_matrix_size_os(), get_kernel_width() );  
+
+  reconstruct();
+}
+
+void radialSenseAppMainWindow::kernelWidthChanged()
+{
+  static double lastValue = KERNEL_SIZE_INITIAL_VALUE;
+
+  double value = kernelSizeSpinBox->value();
+
+  if( value == lastValue )
+    return;
+  else 
+    lastValue = value;
+
+  if(!ready) return;
+
+  E->setup( get_matrix_size(), get_matrix_size_os(), get_kernel_width() );  
+  
+  reconstruct();
+}
+
+void radialSenseAppMainWindow::numIterationsChanged()
+{
+  static unsigned int lastValue = NUM_ITERATIONS_INITIAL_VALUE;
+
+  unsigned int value = numIterationsSpinBox->value();
+
+  if( value == lastValue )
+    return;
+  else 
+    lastValue = value;
+
+  cg.set_max_iterations( get_num_iterations() );
+
+  if(!ready) return;
+
+  reconstruct();
+}
+
+void radialSenseAppMainWindow::regularizationWeightChanged()
+{
+  static double lastValue = REG_WEIGHT_INITIAL_VALUE;
+
+  double value = regularizationWeightSpinBox->value();
+
+  if( value == lastValue )
+    return;
+  else 
+    lastValue = value;
+
+  // Update D
+  update_preconditioning_weights();
+  
+  // Update operator R 
+  R->set_weight( get_kappa() );
+
+  if(!ready) return;
+
+  reconstruct();
+}
+
+void radialSenseAppMainWindow::windowScaleChanged(double)
+{
+  if(!ready) return;
+  reconstruct();
+}
+
+/*
+  Reconstruct frame
+*/
+
+void radialSenseAppMainWindow::reconstruct()
+{
+  if(!ready) return;
+  
+  // Check if any data has been loaded
+  if( host_samples->get_number_of_elements() == 0 )
+    return;
+  
+  // See if there is any uncaught errors before starting
+  cudaError_t err;
+  err = cudaGetLastError();
+  if( err != cudaSuccess ){
+    QMessageBox::critical( this, tr("Cuda error"), tr(cudaGetErrorString(err)) );
+    actionExit->trigger();
+  }
+
+  // Map result to OpenGL
+  reconWidget->openglCanvas->mapPBO();
+
+  // Be optimistic...
+  bool success = true;
+
+  const unsigned int samples_per_profile = get_num_samples_per_projection();
+  const unsigned int num_profiles = get_num_points_per_array_coil() / samples_per_profile;
+  const unsigned int profiles_per_frame = get_num_projections_per_frame();
+  const unsigned int frames_per_reconstruction = 1; 
+  const unsigned int profiles_per_reconstruction = get_num_projections_per_frame()*frames_per_reconstruction;
+  const uint64d2 matrix_size = get_matrix_size();
+  const uint64d2 matrix_size_os = get_matrix_size_os();
+  const unsigned int num_coils = get_num_coils();
+  const unsigned int samples_per_reconstruction = profiles_per_reconstruction*samples_per_profile;
+
+  // Determine trajectories
+  boost::shared_ptr< cuNDArray<floatd2> > traj = compute_radial_trajectory_golden_ratio_2d<float>
+    ( samples_per_profile, profiles_per_frame, frames_per_reconstruction,  get_first_projection() );
+  
+  // Upload data
+  boost::shared_ptr< cuNDArray<float_complext> > data =
+    upload_data( get_first_projection(), samples_per_profile, samples_per_reconstruction,
+		 num_profiles*samples_per_profile, num_coils, host_samples.get() );
+    
+  // Set current trajectory and trigger NFFT preprocessing
+  E->preprocess(traj.get());
+  
+  // Form rhs (use result array to save memory)
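+  // rhs = E^H d; the solver below iterates from this right-hand side using E, R and the preconditioner D configured in the constructor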
+  vector<unsigned int> rhs_dims = uint64d_to_vector<2>(matrix_size); rhs_dims.push_back(frames_per_reconstruction);
+  cuNDArray<float_complext> rhs; rhs.create(&rhs_dims);
+  E->mult_MH( data.get(), &rhs );
+  
+  //
+  // Conjugate gradient solver
+  //
+  
+  boost::shared_ptr< cuNDArray<float_complext> > cgresult = cg.solve_from_rhs(&rhs);
+  
+  // Magnitudes image for visualization
+  boost::shared_ptr< cuNDArray<float> > tmp_res = abs<float_complext>(cgresult.get());
+  normalize( tmp_res.get(), get_window_scale() );
+  
+  // Copy to OpenGL/pbo
+  cudaMemcpy( reconWidget->openglCanvas->getDevPtr(),
+	      tmp_res->get_data_ptr(),
+	      prod(matrix_size)*sizeof(float), cudaMemcpyDeviceToDevice );
+  
+  // Report any errors not already caught...
+  err = cudaGetLastError();
+  if( err != cudaSuccess ){
+    QMessageBox::critical( this, tr("Cuda error"), tr(cudaGetErrorString(err)) );
+    actionExit->trigger();
+  }
+  
+  reconWidget->openglCanvas->unmapPBO();
+  
+  if( !success ){
+    QMessageBox::critical( this, tr("Reconstruction error"), tr("Check console. Quitting.") );
+    actionExit->trigger();
+    exit(EXIT_FAILURE);
+  }
+    
+  reconWidget->openglCanvas->updateGL();
+}
+
+/*
+  "Gets..."
+*/
+
+uint64d2 radialSenseAppMainWindow::get_matrix_size()
+{
+  int value = matrixSizeSpinBox->value();
+  return uint64d2( value, value );
+}
+
+uint64d2 radialSenseAppMainWindow::get_matrix_size_os()
+{
+  int value = oversampledMatrixSizeSpinBox->value();
+  return uint64d2( value, value );
+}
+
+float radialSenseAppMainWindow::get_kernel_width()
+{
+  double value = kernelSizeSpinBox->value();
+  return (float) value;	
+}
+
+float radialSenseAppMainWindow::get_window_scale()
+{
+  double value = windowScaleSpinBox->value();
+  return (float) value;	
+}
+
+unsigned int radialSenseAppMainWindow::get_num_samples_per_projection()
+{
+  if( host_samples->get_number_of_dimensions() > 0 )
+    return host_samples->get_size(0);
+  else return 0;
+}
+
+unsigned int radialSenseAppMainWindow::get_first_projection()
+{
+  int value = reconWidget->projectionNumberSpinBox->value();
+  value -= get_num_projections_per_frame()>>1;
+  if( value<0 )
+    value = 0;
+  return value;
+}
+
+unsigned int radialSenseAppMainWindow::get_central_projection()
+{
+  int value = reconWidget->projectionSelectionScrollBar->value();
+  return value;
+}
+
+unsigned int radialSenseAppMainWindow::get_maximum_central_projection()
+{
+  if( get_num_samples_per_projection() == 0 )
+    return 0;
+	
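+  // Largest admissible central projection: total profiles per coil minus ceil(projections_per_frame/2),
+  // so that the trailing half of a frame still lies within the acquired data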
+  unsigned int maxCentralProjection =
+    get_num_points_per_array_coil()/get_num_samples_per_projection()
+    - get_num_projections_per_frame()/2 - get_num_projections_per_frame()%2;
+  return maxCentralProjection;
+}
+
+unsigned int radialSenseAppMainWindow::get_num_projections_per_frame()
+{
+  int value = reconWidget->numProjectionsSpinBox->value();
+  return value;
+}
+
+unsigned int radialSenseAppMainWindow::get_num_coils()
+{
+  if( host_samples->get_number_of_dimensions() < 3 )
+    return 0;
+
+  unsigned int val;
+  if( host_samples->get_number_of_dimensions() == 3 )
+    val = host_samples->get_size(2);
+  else{
+    printf("\nUnknown number of dimensions in dataset. Quitting.\n");
+    exit(1);
+  }
+  
+  return val;
+}
+
+unsigned int radialSenseAppMainWindow::get_num_points_per_reconstruction()
+{
+  unsigned int val = get_num_samples_per_projection()*get_num_projections_per_frame();
+  return val;
+}
+
+hoNDArray<complext<float> >* radialSenseAppMainWindow::get_sample_values_array()
+{
+  return host_samples.get();
+}
+
+unsigned int radialSenseAppMainWindow::get_num_points_per_array_coil()
+{
+  if(host_samples->get_number_of_dimensions()<2)
+    return 0;
+
+  unsigned int val = host_samples->get_size(0)*host_samples->get_size(1);
+  return val;
+}
+
+unsigned int radialSenseAppMainWindow::get_num_iterations()
+{
+  int value = numIterationsSpinBox->value();
+  return value;
+}
+
+inline float radialSenseAppMainWindow::get_kappa()
+{
+  double value = regularizationWeightSpinBox->value();
+  return (float)value;
+}
+
+// Upload samples for one reconstruction from host to device
+boost::shared_ptr< cuNDArray<float_complext> >
+radialSenseAppMainWindow::upload_data( unsigned int profile_offset, unsigned int samples_per_profile, unsigned int samples_per_reconstruction, 
+				       unsigned int total_samples_per_coil, unsigned int num_coils,
+				       hoNDArray<float_complext> *host_data )
+{
+  vector<unsigned int> dims; dims.push_back(samples_per_reconstruction); dims.push_back(num_coils);
+  cuNDArray<float_complext> *data = new cuNDArray<float_complext>();
+  data->create( &dims );
+  
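+  // The host array is laid out as (samples x profiles x coils); per coil, copy the contiguous block of samples belonging to this reconstruction window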
+  for( unsigned int i=0; i<num_coils; i++ )
+    cudaMemcpy( data->get_data_ptr()+i*samples_per_reconstruction, 
+		host_data->get_data_ptr()+i*total_samples_per_coil+profile_offset*samples_per_profile, 
+		samples_per_reconstruction*sizeof(float_complext), cudaMemcpyHostToDevice );
+  
+  return boost::shared_ptr< cuNDArray<float_complext> >(data);
+}
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/radialSenseAppMainWidget.h b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/radialSenseAppMainWidget.h
new file mode 100644
index 0000000..cff0229
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/radialSenseAppMainWidget.h
@@ -0,0 +1,134 @@
+#pragma once
+
+// Gadgetron includes
+#include "vector_td.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray.h"
+#include "cuNDArray.h"
+#include "cuNFFT.h"
+#include "cuCgSolver.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuImageOperator.h"
+#include "cuCgPreconditioner.h"
+#include "complext.h"
+
+#include <boost/shared_ptr.hpp>
+
+// Autogenerated header by uic
+#include "ui_radialSenseAppBaseMainWidget.h" 
+
+using namespace Gadgetron; // only because this header file is not distributed...
+
+class radialSenseAppMainWindow : public QMainWindow, public Ui::radialSenseAppBaseMainWindow
+{
+  // Macro for the Qt gui
+  Q_OBJECT
+ 
+  public:
+
+  // Constructor
+  radialSenseAppMainWindow(QWidget *parent = 0);
+
+  // Reconstruct frame
+  void reconstruct();
+
+  // Get matrix size
+  inline uint64d2 get_matrix_size();
+
+  // Get oversampled matrix size
+  inline uint64d2 get_matrix_size_os();
+
+  // Get number of coils
+  inline unsigned int get_num_coils();
+
+  // Get kernel width
+  inline float get_kernel_width();
+
+  // Get kappa (regularization weight)
+  inline float get_kappa();
+
+  // Get first projection
+  inline unsigned int get_first_projection();
+
+  // Get central projection
+  inline unsigned int get_central_projection();
+
+  // Get maximum central projection
+  inline unsigned int get_maximum_central_projection();
+
+  // Get number of projections per frame
+  inline unsigned int get_num_projections_per_frame();
+
+  // Number of samples per projection
+  inline unsigned int get_num_samples_per_projection();
+
+  // Number of points per reconstruction
+  inline unsigned int get_num_points_per_reconstruction();
+
+  // Get host side sample data array
+  inline hoNDArray<complext<float> >* get_sample_values_array();
+
+  // Get number of points per coil in data array
+  unsigned int get_num_points_per_array_coil();
+
+  // Get number of iterations
+  unsigned int get_num_iterations();
+
+  // Get window scale
+  float get_window_scale();
+  
+  boost::shared_ptr< cuNDArray<float_complext> >
+  upload_data( unsigned int profile_offset, unsigned int samples_per_profile, unsigned int samples_per_reconstruction, 
+	       unsigned int total_samples_per_coil, unsigned int num_coils, hoNDArray<float_complext> *host_data );
+
+private:
+  void resetPrivateData();
+  void replan();
+  void update_preconditioning_weights();	       
+
+private slots:
+  void open();
+  void close();
+  void saveImage();
+  void matrixSizeChanged();
+  void matrixSizeOSChanged();
+  void regularizationWeightChanged();
+  void projectionsPerFrameChanged(int);
+  void centralProjectionChanged(int);
+  void numIterationsChanged();
+  void kernelWidthChanged();
+  void windowScaleChanged(double);
+
+private:
+	
+  // Reconstruction plan
+  cuNFFT_plan<float,2> plan;
+
+  // Define conjugate gradient solver
+  cuCgSolver<float_complext> cg;
+
+  // Define non-Cartesian Sense solver
+  boost::shared_ptr< cuNonCartesianSenseOperator<float,2> > E;
+
+  // Define preconditioner
+  boost::shared_ptr< cuCgPreconditioner<float_complext> > D;
+  
+  // Define regularization image operator
+  boost::shared_ptr< cuImageOperator<float_complext> > R;
+  
+  // CSM
+  boost::shared_ptr< cuNDArray<float_complext> > csm;
+
+  // Density compensation weights
+  boost::shared_ptr< cuNDArray<float> > dcw;	
+
+  // Host data array
+  boost::shared_ptr< hoNDArray<float_complext> > host_samples;
+  
+  // Label for the status bar
+  QLabel *statusLabel;
+
+  // Are we set up for reconstruction?
+  bool ready;
+};
+
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/reconBaseWidget.ui b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/reconBaseWidget.ui
new file mode 100644
index 0000000..ad03226
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/reconBaseWidget.ui
@@ -0,0 +1,303 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ui version="4.0">
+ <class>ReconBaseWidget</class>
+ <widget class="QWidget" name="ReconBaseWidget">
+  <property name="geometry">
+   <rect>
+    <x>0</x>
+    <y>0</y>
+    <width>272</width>
+    <height>414</height>
+   </rect>
+  </property>
+  <property name="sizePolicy">
+   <sizepolicy hsizetype="MinimumExpanding" vsizetype="MinimumExpanding">
+    <horstretch>0</horstretch>
+    <verstretch>0</verstretch>
+   </sizepolicy>
+  </property>
+  <property name="minimumSize">
+   <size>
+    <width>272</width>
+    <height>414</height>
+   </size>
+  </property>
+  <property name="windowTitle">
+   <string>ReconBaseWidget</string>
+  </property>
+  <property name="locale">
+   <locale language="English" country="UnitedKingdom"/>
+  </property>
+  <widget class="QFrame" name="decorationFrame">
+   <property name="geometry">
+    <rect>
+     <x>0</x>
+     <y>0</y>
+     <width>272</width>
+     <height>381</height>
+    </rect>
+   </property>
+   <property name="frameShape">
+    <enum>QFrame::Box</enum>
+   </property>
+   <property name="frameShadow">
+    <enum>QFrame::Plain</enum>
+   </property>
+   <widget class="GLReconWidget" name="openglCanvas" native="true">
+    <property name="geometry">
+     <rect>
+      <x>8</x>
+      <y>8</y>
+      <width>256</width>
+      <height>256</height>
+     </rect>
+    </property>
+    <property name="sizePolicy">
+     <sizepolicy hsizetype="MinimumExpanding" vsizetype="MinimumExpanding">
+      <horstretch>0</horstretch>
+      <verstretch>0</verstretch>
+     </sizepolicy>
+    </property>
+    <property name="minimumSize">
+     <size>
+      <width>256</width>
+      <height>256</height>
+     </size>
+    </property>
+    <property name="locale">
+     <locale language="English" country="UnitedKingdom"/>
+    </property>
+   </widget>
+   <widget class="QScrollBar" name="projectionSelectionScrollBar">
+    <property name="geometry">
+     <rect>
+      <x>8</x>
+      <y>290</y>
+      <width>176</width>
+      <height>16</height>
+     </rect>
+    </property>
+    <property name="sizePolicy">
+     <sizepolicy hsizetype="MinimumExpanding" vsizetype="Fixed">
+      <horstretch>0</horstretch>
+      <verstretch>0</verstretch>
+     </sizepolicy>
+    </property>
+    <property name="maximum">
+     <number>100</number>
+    </property>
+    <property name="singleStep">
+     <number>2</number>
+    </property>
+    <property name="orientation">
+     <enum>Qt::Horizontal</enum>
+    </property>
+   </widget>
+   <widget class="QLabel" name="label_1">
+    <property name="geometry">
+     <rect>
+      <x>8</x>
+      <y>270</y>
+      <width>140</width>
+      <height>16</height>
+     </rect>
+    </property>
+    <property name="sizePolicy">
+     <sizepolicy hsizetype="Fixed" vsizetype="Fixed">
+      <horstretch>0</horstretch>
+      <verstretch>0</verstretch>
+     </sizepolicy>
+    </property>
+    <property name="text">
+     <string>Central projection</string>
+    </property>
+   </widget>
+   <widget class="QLabel" name="label_2">
+    <property name="geometry">
+     <rect>
+      <x>8</x>
+      <y>320</y>
+      <width>160</width>
+      <height>16</height>
+     </rect>
+    </property>
+    <property name="sizePolicy">
+     <sizepolicy hsizetype="Fixed" vsizetype="Fixed">
+      <horstretch>0</horstretch>
+      <verstretch>0</verstretch>
+     </sizepolicy>
+    </property>
+    <property name="text">
+     <string>Number of projections per frame</string>
+    </property>
+   </widget>
+   <widget class="QScrollBar" name="numProjectionsScrollBar">
+    <property name="geometry">
+     <rect>
+      <x>8</x>
+      <y>340</y>
+      <width>186</width>
+      <height>16</height>
+     </rect>
+    </property>
+    <property name="sizePolicy">
+     <sizepolicy hsizetype="MinimumExpanding" vsizetype="Fixed">
+      <horstretch>0</horstretch>
+      <verstretch>0</verstretch>
+     </sizepolicy>
+    </property>
+    <property name="minimum">
+     <number>4</number>
+    </property>
+    <property name="maximum">
+     <number>100</number>
+    </property>
+    <property name="singleStep">
+     <number>2</number>
+    </property>
+    <property name="value">
+     <number>32</number>
+    </property>
+    <property name="orientation">
+     <enum>Qt::Horizontal</enum>
+    </property>
+   </widget>
+   <widget class="QSpinBox" name="projectionNumberSpinBox">
+    <property name="geometry">
+     <rect>
+      <x>191</x>
+      <y>287</y>
+      <width>71</width>
+      <height>22</height>
+     </rect>
+    </property>
+    <property name="sizePolicy">
+     <sizepolicy hsizetype="Fixed" vsizetype="Fixed">
+      <horstretch>0</horstretch>
+      <verstretch>0</verstretch>
+     </sizepolicy>
+    </property>
+    <property name="alignment">
+     <set>Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter</set>
+    </property>
+    <property name="maximum">
+     <number>100</number>
+    </property>
+    <property name="singleStep">
+     <number>2</number>
+    </property>
+   </widget>
+   <widget class="QSpinBox" name="numProjectionsSpinBox">
+    <property name="geometry">
+     <rect>
+      <x>201</x>
+      <y>337</y>
+      <width>61</width>
+      <height>22</height>
+     </rect>
+    </property>
+    <property name="sizePolicy">
+     <sizepolicy hsizetype="Fixed" vsizetype="Fixed">
+      <horstretch>0</horstretch>
+      <verstretch>0</verstretch>
+     </sizepolicy>
+    </property>
+    <property name="alignment">
+     <set>Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter</set>
+    </property>
+    <property name="minimum">
+     <number>4</number>
+    </property>
+    <property name="maximum">
+     <number>100</number>
+    </property>
+    <property name="singleStep">
+     <number>2</number>
+    </property>
+    <property name="value">
+     <number>32</number>
+    </property>
+   </widget>
+  </widget>
+ </widget>
+ <customwidgets>
+  <customwidget>
+   <class>GLReconWidget</class>
+   <extends>QWidget</extends>
+   <header>GLReconWidget.h</header>
+   <container>1</container>
+  </customwidget>
+ </customwidgets>
+ <resources/>
+ <connections>
+  <connection>
+   <sender>projectionNumberSpinBox</sender>
+   <signal>valueChanged(int)</signal>
+   <receiver>projectionSelectionScrollBar</receiver>
+   <slot>setValue(int)</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>235</x>
+     <y>298</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>162</x>
+     <y>298</y>
+    </hint>
+   </hints>
+  </connection>
+  <connection>
+   <sender>numProjectionsSpinBox</sender>
+   <signal>valueChanged(int)</signal>
+   <receiver>numProjectionsScrollBar</receiver>
+   <slot>setValue(int)</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>231</x>
+     <y>351</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>180</x>
+     <y>353</y>
+    </hint>
+   </hints>
+  </connection>
+  <connection>
+   <sender>projectionSelectionScrollBar</sender>
+   <signal>valueChanged(int)</signal>
+   <receiver>projectionNumberSpinBox</receiver>
+   <slot>setValue(int)</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>124</x>
+     <y>297</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>234</x>
+     <y>294</y>
+    </hint>
+   </hints>
+  </connection>
+  <connection>
+   <sender>numProjectionsScrollBar</sender>
+   <signal>valueChanged(int)</signal>
+   <receiver>numProjectionsSpinBox</receiver>
+   <slot>setValue(int)</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>102</x>
+     <y>340</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>253</x>
+     <y>348</y>
+    </hint>
+   </hints>
+  </connection>
+ </connections>
+ <slots>
+  <slot>animationSpeedChanged(int)</slot>
+  <slot>projectionsPerFrameChanged(int)</slot>
+  <slot>frameChanged(int)</slot>
+ </slots>
+</ui>
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/reconWidget.cpp b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/reconWidget.cpp
new file mode 100644
index 0000000..e6ea6cc
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/reconWidget.cpp
@@ -0,0 +1,7 @@
+#include "reconWidget.h"
+
+ReconWidget::ReconWidget(QWidget *parent) : QWidget(parent)
+{
+  setupUi(this);
+  retranslateUi(this);
+}
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/reconWidget.h b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/reconWidget.h
new file mode 100644
index 0000000..585c954
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/reconWidget.h
@@ -0,0 +1,13 @@
+#pragma once
+
+// Autogenerated header by uic
+#include "ui_reconBaseWidget.h" 
+
+class ReconWidget : public QWidget, public Ui::ReconBaseWidget
+{
+  Q_OBJECT
+  
+  public:
+  // Constructor
+  ReconWidget(QWidget* parent = 0);
+};
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_kt/CMakeLists.txt b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_kt/CMakeLists.txt
new file mode 100644
index 0000000..5826cb5
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_kt/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_executable(ktsense_radial_2d  main.cpp)
+
+target_link_libraries(ktsense_radial_2d gadgetron_toolbox_gpucore gadgetron_toolbox_gpuparallelmri gadgetron_toolbox_gpuoperators gadgetron_toolbox_gpunfft gadgetron_toolbox_hostutils gadgetron_toolbox_gpusolvers ${CUDA_LIBRARIES})
+
+install(TARGETS ktsense_radial_2d DESTINATION bin COMPONENT main)
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_kt/main.cpp b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_kt/main.cpp
new file mode 100644
index 0000000..3a12cdb
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_kt/main.cpp
@@ -0,0 +1,314 @@
+// Gadgetron includes
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "vector_td_utilities.h"
+#include "radial_utilities.h"
+#include "cuNonCartesianKtSenseOperator.h"
+#include "cuSenseBuffer.h"
+#include "cuImageOperator.h"
+#include "cuCgPreconditioner.h"
+#include "cuCgSolver.h"
+#include "cuNDFFT.h"
+#include "b1_map.h"
+#include "parameterparser.h"
+#include "GPUTimer.h"
+
+// Std includes
+#include <iostream>
+#include <math.h>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+
+// Upload samples for one reconstruction from host to device
+boost::shared_ptr< cuNDArray<_complext> > 
+upload_data( unsigned int reconstruction, unsigned int samples_per_reconstruction, unsigned int total_samples_per_coil, unsigned int num_coils,
+	     hoNDArray<_complext> *host_data )
+{
+  vector<size_t> dims; dims.push_back(samples_per_reconstruction); dims.push_back(num_coils);
+  cuNDArray<_complext> *data = new cuNDArray<_complext>(); data->create( &dims );
+  for( unsigned int i=0; i<num_coils; i++ )
+    cudaMemcpy( data->get_data_ptr()+i*samples_per_reconstruction, 
+		host_data->get_data_ptr()+i*total_samples_per_coil+reconstruction*samples_per_reconstruction, 
+		samples_per_reconstruction*sizeof(_complext), cudaMemcpyHostToDevice );
+
+  return boost::shared_ptr< cuNDArray<_complext> >(data);
+}
+
+int main(int argc, char** argv)
+{
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Sample data file name", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "result.cplx" );
+  parms.add_parameter( 'm', COMMAND_LINE_INT,    1, "Matrix size", true );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true );
+  parms.add_parameter( 'p', COMMAND_LINE_INT,    1, "Profiles per frame", true );
+  parms.add_parameter( 'f', COMMAND_LINE_INT,    1, "Frames per reconstruction", true, "32" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of iterations", true, "25" );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+  parms.add_parameter( 'K', COMMAND_LINE_FLOAT,  1, "Kappa", true, "0.25" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running reconstruction with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  GPUTimer *timer;
+  
+  // Load sample data from disk
+  timer = new GPUTimer("\nLoading data");
+  boost::shared_ptr< hoNDArray<_complext> > host_data = read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+  delete timer;
+   
+  if( !(host_data->get_number_of_dimensions() == 3) ){
+    cout << endl << "Input data is not three-dimensional (#samples/profile x #profiles x #coils). Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Configuration from the host data
+  unsigned int samples_per_profile = host_data->get_size(0);
+  unsigned int num_profiles = host_data->get_size(1);
+  unsigned int num_coils = host_data->get_size(2);
+  
+  // Configuration from the command line
+  uint64d2 matrix_size = uint64d2(parms.get_parameter('m')->get_int_value(), parms.get_parameter('m')->get_int_value());
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+  _real kappa = parms.get_parameter('K')->get_float_value();
+  unsigned int num_iterations = parms.get_parameter('i')->get_int_value();
+  unsigned int profiles_per_frame = parms.get_parameter('p')->get_int_value();
+  unsigned int frames_per_reconstruction = parms.get_parameter('f')->get_int_value();
+
+  // Silent correction of invalid command line parameters (clamp to valid range)
+  if( profiles_per_frame > num_profiles ) profiles_per_frame = num_profiles;
+  if( frames_per_reconstruction < 0 ) frames_per_reconstruction = num_profiles / profiles_per_frame;
+  if( frames_per_reconstruction*profiles_per_frame > num_profiles ) frames_per_reconstruction = num_profiles / profiles_per_frame;
+  
+  unsigned int profiles_per_reconstruction = frames_per_reconstruction*profiles_per_frame;
+  unsigned int samples_per_frame = profiles_per_frame*samples_per_profile;
+  unsigned int samples_per_reconstruction = profiles_per_reconstruction*samples_per_profile;
+
+  cout << endl << "#samples/profile: " << samples_per_profile;
+  cout << endl << "#profiles/frame: " << profiles_per_frame;
+  cout << endl << "#profiles: " << num_profiles;
+  cout << endl << "#coils: " << num_coils;
+  cout << endl << "#frames/reconstruction " << frames_per_reconstruction;
+  cout << endl << "#profiles/reconstruction " << profiles_per_reconstruction;
+  cout << endl << "#samples/reconstruction " << samples_per_reconstruction << endl << endl;
+
+  // Density compensation weights are constant throughout all reconstructions
+  boost::shared_ptr< cuNDArray<_real> > dcw = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, profiles_per_frame, (_real)matrix_size_os[0]/(_real)matrix_size[0], 
+      _real(1)/((_real)samples_per_profile/(_real)max(matrix_size[0],matrix_size[1])) );
+  
+  // Define encoding matrix for non-Cartesian kt-SENSE
+  boost::shared_ptr< cuNonCartesianKtSenseOperator<_real,2> > E( new cuNonCartesianKtSenseOperator<_real,2>() );
+  E->setup( matrix_size, matrix_size_os, kernel_width );
+
+  // Notify encoding operator of dcw
+  E->set_dcw(dcw);
+
+  // Use a rhs buffer to estimate the csm -- from all the data
+  //
+
+  unsigned int profiles_per_subcycle = matrix_size_os[0]<<1; // causes no aliasing
+  unsigned int num_subcycles = profiles_per_subcycle / profiles_per_frame;
+  unsigned int num_cycles = num_profiles / profiles_per_subcycle;
+
+  GDEBUG_STREAM("Buffer cycles/subcycles: " << num_cycles << " / " << num_subcycles << std::endl);
+
+  boost::shared_ptr< cuSenseBuffer<_real,2> > rhs_buffer( new cuSenseBuffer<_real,2>() );
+
+  // The first acquired profiles are often undesired. Skip the first two cycles...
+  rhs_buffer->setup( matrix_size, matrix_size_os, kernel_width, num_coils, num_cycles-1, num_subcycles );
+  rhs_buffer->set_dcw(dcw);
+   
+  // Fill rhs buffer
+  //
+ 
+  timer = new GPUTimer("CSM estimation");
+    
+  // Go through all the data...
+  for( unsigned int iteration = 0; iteration < num_profiles/profiles_per_frame; iteration++ ) {
+
+    // Define trajectories
+    boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+      ( samples_per_profile, profiles_per_frame, 1, iteration*profiles_per_reconstruction );
+    
+    // Upload data
+    boost::shared_ptr< cuNDArray<_complext> > csm_data = upload_data
+      ( iteration, samples_per_frame, num_profiles*samples_per_profile, num_coils, host_data.get() );
+        
+    // Add frame to rhs buffer
+    rhs_buffer->add_frame_data( csm_data.get(), traj.get() );
+  }
+  
+  boost::shared_ptr< cuNDArray<_complext> > acc_images = rhs_buffer->get_accumulated_coil_images();
+  boost::shared_ptr< cuNDArray<_complext> > csm = estimate_b1_map<_real,2>( acc_images.get() );
+
+  E->set_csm(csm);
+
+  acc_images.reset();
+  rhs_buffer.reset();
+ 
+  delete timer;
+
+  // 
+  // Setup radial kt-SENSE reconstructions
+  //
+    
+  // Define regularization image operator
+  boost::shared_ptr< cuImageOperator<_complext> > R( new cuImageOperator<_complext>() );
+  R->set_weight( kappa );
+
+  // Define preconditioning operator
+  boost::shared_ptr< cuCgPreconditioner<_complext> > D( new cuCgPreconditioner<_complext>() );
+  boost::shared_ptr< cuNDArray<_real> > ___precon_weights = sum(abs_square(csm.get()).get(),2);
+  boost::shared_ptr< cuNDArray<_real> > __precon_weights = expand<_real>( ___precon_weights.get(), frames_per_reconstruction );
+  ___precon_weights.reset();
+
+  // Setup conjugate gradient solver
+  cuCgSolver< _complext> cg;
+  cg.set_encoding_operator( E );        // encoding matrix
+  cg.add_regularization_operator( R );  // regularization matrix
+  cg.set_preconditioner ( D );          // preconditioning matrix
+  cg.set_max_iterations( num_iterations );
+  cg.set_tc_tolerance( 1e-6 );
+  cg.set_output_mode( cuCgSolver< _complext>::OUTPUT_VERBOSE );
+      
+  // Reconstruct all SENSE frames iteratively
+  unsigned int num_reconstructions = num_profiles / profiles_per_reconstruction;
+
+  // Allocate space for result
+  vector<size_t> image_dims = to_std_vector(matrix_size); 
+  image_dims.push_back(frames_per_reconstruction*num_reconstructions); 
+
+  cuNDArray<_complext> result = cuNDArray<_complext>(&image_dims);
+  
+  // Define a circular k-space shutter for the training data
+  // (only the central part of the radial k-space is fully sampled, i.e. alias-free)
+  _real shutter_radius = ((_real)matrix_size_os[0]/(_real)matrix_size[0])*(_real)profiles_per_frame/(_real)M_PI;
+  shutter_radius /= _real(2);
+  GDEBUG_STREAM("Shutter radius: " << shutter_radius << std::endl);
+
+  vector<size_t> image_os_dims = to_std_vector(matrix_size_os); 
+  image_os_dims.push_back(frames_per_reconstruction); image_os_dims.push_back(num_coils);    
+  cuNDArray<_complext> *image_os = new cuNDArray<_complext>(&image_os_dims);
+
+  timer = new GPUTimer("Full SENSE reconstruction.");
+  
+  for( unsigned int reconstruction = 0; reconstruction<num_reconstructions; reconstruction++ ){
+
+    // 
+    // Estimate training data
+    // 
+
+    // Define trajectories
+    boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+      ( samples_per_profile, profiles_per_frame, frames_per_reconstruction, reconstruction*profiles_per_reconstruction );
+    
+    // Preprocess
+    image_dims.pop_back(); image_dims.push_back(frames_per_reconstruction); 
+    E->set_domain_dimensions(&image_dims);
+    E->preprocess( traj.get() );
+    
+    // Upload data
+    boost::shared_ptr< cuNDArray<_complext> > data = upload_data
+      ( reconstruction, samples_per_reconstruction, num_profiles*samples_per_profile, num_coils, host_data.get() );
+
+    E->set_codomain_dimensions(data->get_dimensions().get());    
+
+    // Convolve to Cartesian k-space
+    E->get_plan()->convolve( data.get(), image_os, dcw.get(), cuNFFT_plan<_real,2>::NFFT_CONV_NC2C );
+
+    // Apply shutter
+    fill_border<_complext,2>( shutter_radius, image_os );
+
+    E->get_plan()->fft( image_os, cuNFFT_plan<_real,2>::NFFT_BACKWARDS );
+    E->get_plan()->deapodize( image_os );
+
+    // Remove oversampling
+    image_dims.push_back(num_coils);
+    cuNDArray<_complext> *image = new cuNDArray<_complext>(&image_dims);
+    crop<_complext,2>( (matrix_size_os-matrix_size)>>1, image_os, image );
+    image_dims.pop_back();
+
+    // Compute regularization image
+    cuNDArray<_complext> *reg_image = new cuNDArray<_complext>(&image_dims);
+
+    E->mult_csm_conj_sum( image, reg_image );
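+    // Transform the coil-combined image series along the temporal dimension
+    // (x-t to x-f); the kt-SENSE regularization image is formed in x-f space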
+    cuNDFFT<_real>::instance()->ifft( reg_image, 2, true );
+
+
+    R->compute( reg_image );
+
+    delete reg_image; reg_image = 0x0;
+    delete image; image = 0x0;
+    
+    // Define preconditioning weights
+    boost::shared_ptr< cuNDArray<_real> > _precon_weights( new cuNDArray<_real>(*__precon_weights.get()));
+    boost::shared_ptr< cuNDArray<_real> > R_diag = R->get();
+    *R_diag *= kappa;
+    *_precon_weights += *R_diag;
+    R_diag.reset();
+    reciprocal_sqrt_inplace(_precon_weights.get());
+    boost::shared_ptr< cuNDArray<_complext> > precon_weights = real_to_complex<_complext>( _precon_weights.get() );
+    _precon_weights.reset();
+    
+    // Define preconditioning matrix
+    D->set_weights( precon_weights );
+    precon_weights.reset();
+      
+    //
+    // Conjugate gradient solver
+    //
+
+    boost::shared_ptr< cuNDArray<_complext> > cgresult;
+    {
+      GPUTimer timer("GPU Conjugate Gradient solve");
+      cgresult = cg.solve(data.get());
+    }
+
+    // Go from x-f to x-t space
+    cuNDFFT<_real>::instance()->fft( cgresult.get(), 2, true );
+    
+    // Copy cgresult to result
+    cuNDArray<_complext> tmp(&image_dims, result.get_data_ptr()+reconstruction*prod(matrix_size)*frames_per_reconstruction);    
+    tmp = *(cgresult.get());  
+  }
+  
+  delete timer;
+  delete image_os; image_os = 0x0;
+  csm.reset();
+
+  // All done, write out the result
+
+  timer = new GPUTimer("Writing out result");
+  
+  boost::shared_ptr< hoNDArray<_complext> > host_result = result.to_host();
+  write_nd_array<_complext>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+    
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(&result)->to_host();
+  write_nd_array<_real>( host_norm.get(), "result.real" );
+  
+  delete timer;
+  
+  return 0;
+}
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/CMakeLists.txt b/apps/standalone/gpu/mri/sense/noncartesian/radial/CMakeLists.txt
new file mode 100644
index 0000000..c5de847
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/CMakeLists.txt
@@ -0,0 +1,10 @@
+include_directories(
+  ${CMAKE_SOURCE_DIR}/toolboxes/pmri/gpu
+)
+
+add_subdirectory(2d_golden_ratio)
+add_subdirectory(2d_golden_ratio_kt)
+
+#if (QT4_FOUND AND GLEW_FOUND AND GLUT_FOUND AND OPENGL_FOUND)
+#  add_subdirectory(2d_golden_ratio_gui)
+#endif (QT4_FOUND AND GLEW_FOUND AND GLUT_FOUND AND OPENGL_FOUND)
diff --git a/apps/standalone/gpu/registration/2d/CMakeLists.txt b/apps/standalone/gpu/registration/2d/CMakeLists.txt
new file mode 100644
index 0000000..9d33eb7
--- /dev/null
+++ b/apps/standalone/gpu/registration/2d/CMakeLists.txt
@@ -0,0 +1,52 @@
+add_executable(register_HS_2d_gpu register_HS_2d.cpp)
+add_executable(register_CGHS_2d_gpu register_CGHS_2d.cpp)
+add_executable(register_CK_2d_gpu register_CK_2d.cpp)
+#add_executable(test_reg_sense_recon test_reg_sense_recon.cpp)
+
+include_directories(
+  ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+)
+
+target_link_libraries(register_HS_2d_gpu 
+  gadgetron_toolbox_hostutils 
+  gadgetron_toolbox_gpureg 
+  gadgetron_toolbox_gpucore 
+  gadgetron_toolbox_gpuoperators
+  gadgetron_toolbox_gpusolvers 
+  ${CUDA_LIBRARIES}
+  )
+
+target_link_libraries(register_CK_2d_gpu
+  gadgetron_toolbox_hostutils 
+  gadgetron_toolbox_gpureg 
+  gadgetron_toolbox_gpucore 
+  gadgetron_toolbox_gpuoperators
+  gadgetron_toolbox_gpusolvers 
+  ${CUDA_LIBRARIES}
+  )
+
+#target_link_libraries(test_reg_sense_recon 
+#  hostutils 
+#  gpureg 
+#  gpucore 
+#  gpuoperators
+#  gpusolvers 
+#  gpunfft 
+#  gpuparallelmri 
+#  ${CUDA_LIBRARIES}
+#  )
+  
+target_link_libraries(register_CGHS_2d_gpu 
+  gadgetron_toolbox_hostutils 
+  gadgetron_toolbox_gpureg 
+  gadgetron_toolbox_gpucore 
+  gadgetron_toolbox_gpuoperators
+  gadgetron_toolbox_gpusolvers 
+  ${CUDA_LIBRARIES}
+  )
+
+install(TARGETS 
+  register_HS_2d_gpu
+  register_CGHS_2d_gpu  
+  register_CK_2d_gpu 
+  DESTINATION bin COMPONENT main)
diff --git a/apps/standalone/gpu/registration/2d/register_CGHS_2d.cpp b/apps/standalone/gpu/registration/2d/register_CGHS_2d.cpp
new file mode 100644
index 0000000..f6bf8a8
--- /dev/null
+++ b/apps/standalone/gpu/registration/2d/register_CGHS_2d.cpp
@@ -0,0 +1,134 @@
+/*
+  An example of how to register two 2d images using Horn-Schunck optical flow
+*/
+
+// Gadgetron includes
+#include "cuHSOpticalFlowSolver.h"
+#include "cuLinearResampleOperator.h"
+#include "cuNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "hoNDArray_utils.h"
+#include "parameterparser.h"
+#include "cuCGHSOFSolver.h"
+// Std includes
+#include <iostream>
+
+using namespace Gadgetron;
+using namespace std;
+
+// Define desired precision
+typedef float _real; 
+
+int main(int argc, char** argv)
+{
+
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'f', COMMAND_LINE_STRING, 1, "Fixed image file name (.real)", true );
+  parms.add_parameter( 'm', COMMAND_LINE_STRING, 1, "Moving image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "displacement_field.real" );
+  parms.add_parameter( 'a', COMMAND_LINE_FLOAT,  1, "Regularization weight (alpha)", true, "0.1" );
+  parms.add_parameter( 'l', COMMAND_LINE_INT,    1, "Number of multiresolution levels", true, "3" );
+  
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running registration with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  // Load sample data from disk
+  //
+  
+  boost::shared_ptr< hoNDArray<_real> > host_fixed = 
+    read_nd_array<_real>((char*)parms.get_parameter('f')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_moving = 
+    read_nd_array<_real>((char*)parms.get_parameter('m')->get_string_value());
+  
+  if( !host_fixed.get() || !host_moving.get() ){
+    cout << endl << "One of the input images is not found. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  unsigned int num_fixed_dims = host_fixed->get_number_of_dimensions();
+  unsigned int num_moving_dims = host_moving->get_number_of_dimensions();
+
+  if( !(num_fixed_dims == 2 || num_fixed_dims == 3)  ){
+    cout << endl << "The fixed image is not two- or three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  if( !(num_moving_dims == 2 || num_moving_dims == 3)  ){
+    cout << endl << "The moving image is not two- or three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  if( num_fixed_dims < num_moving_dims  ){
+    host_fixed = expand( host_fixed.get(), host_moving->get_size(2) );
+    num_fixed_dims = host_fixed->get_number_of_dimensions();
+  }
+
+  if( num_moving_dims < num_fixed_dims  ){
+    host_moving = expand( host_moving.get(), host_fixed->get_size(2) );
+    num_moving_dims = host_moving->get_number_of_dimensions();
+  }
+
+  // Upload host data to device
+  //
+
+  cuNDArray<_real> fixed_image(host_fixed.get());
+  cuNDArray<_real> moving_image(host_moving.get());
+  
+  _real alpha = (_real) parms.get_parameter('a')->get_float_value();
+
+  unsigned int multires_levels = parms.get_parameter('l')->get_int_value();
+
+  // Use bilinear interpolation for resampling
+  //
+
+  boost::shared_ptr< cuLinearResampleOperator<_real,2> > R( new cuLinearResampleOperator<_real,2>() );
+
+  // Setup solver
+  //
+  
+  cuCGHSOFSolver<_real,2> HS;
+  HS.set_interpolator( R );
+  HS.set_output_mode( cuCGHSOFSolver<_real,2>::OUTPUT_VERBOSE );
+  HS.get_solver()->set_max_iterations( 100 );
+  HS.get_solver()->set_output_mode(cuCgSolver<_real>::OUTPUT_VERBOSE);
+  HS.set_num_multires_levels( multires_levels );
+  HS.set_alpha(alpha);
+  
+
+  // Run registration
+  //
+
+  boost::shared_ptr< cuNDArray<_real> > result = HS.solve( &fixed_image, &moving_image );
+
+  if( !result.get() ){
+    cout << endl << "Registration solver failed. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  boost::shared_ptr< cuNDArray<_real> > deformed_moving = HS.deform( &moving_image, result );
+  
+  // All done, write out the result
+  //
+
+  boost::shared_ptr< hoNDArray<_real> > host_result = result->to_host();
+  write_nd_array<_real>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  host_result = deformed_moving->to_host();
+  write_nd_array<_real>(host_result.get(), "def_moving.real" );
+  
+  return 0;
+}
diff --git a/apps/standalone/gpu/registration/2d/register_CK_2d.cpp b/apps/standalone/gpu/registration/2d/register_CK_2d.cpp
new file mode 100644
index 0000000..5baaa81
--- /dev/null
+++ b/apps/standalone/gpu/registration/2d/register_CK_2d.cpp
@@ -0,0 +1,129 @@
+/*
+  An example of how to register two 2d images using Cornelius-Kanade optical flow
+*/
+
+// Gadgetron includes
+#include "cuCKOpticalFlowSolver.h"
+#include "cuLinearResampleOperator.h"
+#include "cuNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "parameterparser.h"
+#include "GadgetronTimer.h"
+
+// Std includes
+#include <iostream>
+
+using namespace Gadgetron;
+using namespace std;
+
+// Define desired precision
+typedef float _real; 
+
+int main(int argc, char** argv)
+{
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'f', COMMAND_LINE_STRING, 1, "Fixed image file name (.real)", true );
+  parms.add_parameter( 'm', COMMAND_LINE_STRING, 1, "Moving image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "displacement_field.real" );
+  parms.add_parameter( 'a', COMMAND_LINE_FLOAT,  1, "Regularization weight (alpha)", true, "0.05" );
+  parms.add_parameter( 'b', COMMAND_LINE_FLOAT,  1, "Regularization weight (beta)", true, "1.0" );
+  parms.add_parameter( 'l', COMMAND_LINE_INT,    1, "Number of multiresolution levels", true, "3" );
+  
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running registration with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  // Load sample data from disk
+  //
+  
+  boost::shared_ptr< hoNDArray<_real> > host_fixed = 
+    read_nd_array<_real>((char*)parms.get_parameter('f')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_moving = 
+    read_nd_array<_real>((char*)parms.get_parameter('m')->get_string_value());
+  
+  if( !host_fixed.get() || !host_moving.get() ){
+    cout << endl << "One of the input images is not found. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  unsigned int num_fixed_dims = host_fixed->get_number_of_dimensions();
+  unsigned int num_moving_dims = host_moving->get_number_of_dimensions();
+
+  if( !(num_fixed_dims == 2 || num_fixed_dims == 3)  ){
+    cout << endl << "The fixed image is not two- or three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  if( !(num_moving_dims == 2 || num_moving_dims == 3)  ){
+    cout << endl << "The moving image is not two- or three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Upload host data to device
+  //
+
+  cuNDArray<_real> fixed_image(host_fixed.get());
+  cuNDArray<_real> moving_image(host_moving.get());
+  
+  _real alpha = (_real) parms.get_parameter('a')->get_float_value();
+  _real beta = (_real) parms.get_parameter('b')->get_float_value();
+  
+  unsigned int multires_levels = parms.get_parameter('l')->get_int_value();
+
+  // Use bilinear interpolation for resampling
+  //
+
+  boost::shared_ptr< cuLinearResampleOperator<_real,2> > R( new cuLinearResampleOperator<_real,2>() );
+
+  // Setup solver
+  //
+  
+  cuCKOpticalFlowSolver<_real,2> CK;
+  CK.set_interpolator( R );
+  CK.set_output_mode( cuCKOpticalFlowSolver<_real,2>::OUTPUT_VERBOSE );  
+  CK.set_max_num_iterations_per_level( 500 );
+  CK.set_num_multires_levels( multires_levels );
+  CK.set_alpha(alpha);
+  CK.set_beta(beta);
+  CK.set_limit(0.01f);
+  
+  // Run registration
+  //
+
+  boost::shared_ptr< cuNDArray<_real> > result;
+  {
+      GadgetronTimer timer("Running registration");
+      result = CK.solve( &fixed_image, &moving_image );
+  }
+
+  if( !result.get() ){
+    cout << endl << "Registration solver failed. Quitting!\n" << endl;
+    return 1;
+  }
+
+  boost::shared_ptr< cuNDArray<_real> > deformed_moving = CK.deform( &moving_image, result );
+  
+  // All done, write out the result
+  //
+
+  boost::shared_ptr< hoNDArray<_real> > host_result = result->to_host();
+  write_nd_array<_real>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  host_result = deformed_moving->to_host();
+  write_nd_array<_real>(host_result.get(), "def_moving.real" );
+  
+  return 0;
+}
diff --git a/apps/standalone/gpu/registration/2d/register_HS_2d.cpp b/apps/standalone/gpu/registration/2d/register_HS_2d.cpp
new file mode 100644
index 0000000..fbf881f
--- /dev/null
+++ b/apps/standalone/gpu/registration/2d/register_HS_2d.cpp
@@ -0,0 +1,122 @@
+/*
+  An example of how to register two 2d images using Horn-Schunck optical flow
+*/
+
+// Gadgetron includes
+#include "cuHSOpticalFlowSolver.h"
+#include "cuLinearResampleOperator.h"
+#include "cuNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace Gadgetron;
+using namespace std;
+
+// Define desired precision
+typedef float _real; 
+
+int main(int argc, char** argv)
+{
+
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'f', COMMAND_LINE_STRING, 1, "Fixed image file name (.real)", true );
+  parms.add_parameter( 'm', COMMAND_LINE_STRING, 1, "Moving image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "displacement_field.real" );
+  parms.add_parameter( 'a', COMMAND_LINE_FLOAT,  1, "Regularization weight (alpha)", true, "0.1" );
+  parms.add_parameter( 'l', COMMAND_LINE_INT,    1, "Number of multiresolution levels", true, "3" );
+  
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running registration with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  // Load sample data from disk
+  //
+  
+  boost::shared_ptr< hoNDArray<_real> > host_fixed = 
+    read_nd_array<_real>((char*)parms.get_parameter('f')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_moving = 
+    read_nd_array<_real>((char*)parms.get_parameter('m')->get_string_value());
+  
+  if( !host_fixed.get() || !host_moving.get() ){
+    cout << endl << "One of the input images is not found. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  unsigned int num_fixed_dims = host_fixed->get_number_of_dimensions();
+  unsigned int num_moving_dims = host_moving->get_number_of_dimensions();
+
+  if( !(num_fixed_dims == 2 || num_fixed_dims == 3)  ){
+    cout << endl << "The fixed image is not two- or three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  if( !(num_moving_dims == 2 || num_moving_dims == 3)  ){
+    cout << endl << "The moving image is not two- or three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  // Upload host data to device
+  //
+
+  cuNDArray<_real> fixed_image(host_fixed.get());
+  cuNDArray<_real> moving_image(host_moving.get());
+  
+  _real alpha = (_real) parms.get_parameter('a')->get_float_value();
+
+  unsigned int multires_levels = parms.get_parameter('l')->get_int_value();
+
+  // Use bilinear interpolation for resampling
+  //
+
+  boost::shared_ptr< cuLinearResampleOperator<_real,2> > R( new cuLinearResampleOperator<_real,2>() );
+
+  // Setup solver
+  //
+  
+  cuHSOpticalFlowSolver<_real,2> HS;
+  HS.set_interpolator( R );
+  HS.set_output_mode( cuHSOpticalFlowSolver<_real,2>::OUTPUT_VERBOSE );  
+  HS.set_max_num_iterations_per_level( 500 );
+  HS.set_num_multires_levels( multires_levels );
+  HS.set_alpha(alpha);
+  HS.set_limit(0.01f);
+  
+  // Run registration
+  //
+
+  boost::shared_ptr< cuNDArray<_real> > result = HS.solve( &fixed_image, &moving_image );
+
+  if( !result.get() ){
+    cout << endl << "Registration solver failed. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  boost::shared_ptr< cuNDArray<_real> > deformed_moving = HS.deform( &moving_image, result );
+  
+  // All done, write out the result
+  //
+
+  boost::shared_ptr< hoNDArray<_real> > host_result = result->to_host();
+  write_nd_array<_real>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  host_result = deformed_moving->to_host();
+  write_nd_array<_real>(host_result.get(), "def_moving.real" );
+  
+  return 0;
+}
diff --git a/apps/standalone/gpu/registration/2d/test_reg_sense_recon.cpp b/apps/standalone/gpu/registration/2d/test_reg_sense_recon.cpp
new file mode 100644
index 0000000..790b00f
--- /dev/null
+++ b/apps/standalone/gpu/registration/2d/test_reg_sense_recon.cpp
@@ -0,0 +1,568 @@
+#define PAD_Z
+
+/*
+
+  This is an example of how to use optical flow image registration 
+  and the image resampling operator for image reconstruction.
+  
+  This example uses golden ratio Sense MRI for demonstration. 
+  It was tested with a free-breathing cardiac acquisition.
+
+  !!! Note !!!
+  ------------
+  No cardiac phase binning is performed, and since the registration
+  has trouble handling large, non-rigid deformations such as the
+  cardiac contraction, this example serves demonstration purposes only.
+
+  An actual application should bin the cardiac phases and use the 
+  registration to correct for respiratory motion only.
+*/
+
+#include "cuCKOpticalFlowSolver.h"
+#include "cuLinearResampleOperator.h"
+#include "cuNDArray.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "hoNDArray_fileio.h"
+#include "parameterparser.h"
+#include "radial_utilities.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuSenseBuffer.h"
+#include "cuImageOperator.h"
+#include "cuCgPreconditioner.h"
+#include "cuCgSolver.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+
+#include <iostream>
+
+using namespace Gadgetron;
+using namespace std;
+
+// Define desired precision
+//
+
+typedef float _real; 
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+
+//
+// Define matrix operator for "registration reconstruction" using non-Cartesian Sense
+// For simplicity we assume that the respective operators have been set up from outside
+//
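+// The composite forward model implemented below is y = E(R(x)): the resampling
+// operator R deforms a single frame into multiple frames via the displacement
+// field, and E applies the non-Cartesian Sense encoding. Accordingly, mult_MH
+// applies R^H E^H and mult_MH_M applies R^H E^H E R.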
+
+template<class REAL, unsigned int D> class registrationReconOperator
+  : public linearOperator< cuNDArray< complext<REAL> > >
+{
+public:
+  
+  registrationReconOperator() : linearOperator< cuNDArray< complext< REAL> > >() {}
+  virtual ~registrationReconOperator() {}
+  
+  inline void set_encoding_operator( boost::shared_ptr< cuNonCartesianSenseOperator<REAL,D> > E ){
+    E_ = E;
+  }
+  
+  inline void set_resampling_operator( boost::shared_ptr< cuLinearResampleOperator<complext<REAL>,D> > R ){
+    R_ = R;
+  }
+  
+  virtual void mult_M( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate = false)
+  {
+    if( !in || !out || !R_->get_displacement_field() ){
+      throw cuda_error("registrationReconOperator::mult_M failed (1)");
+    }
+    
+    // Allocate intermediate image
+    std::vector<size_t> tmp_dims = *R_->get_displacement_field()->get_dimensions(); tmp_dims.pop_back();
+    cuNDArray< complext<REAL> > tmp_in_out;
+
+    tmp_in_out.create(&tmp_dims);
+    
+    // Deform the input image into multiple frames by applying the registration vector field
+    R_->mult_M( in, &tmp_in_out );
+
+    // Apply non-Cartesian Sense encoding
+    E_->mult_M( &tmp_in_out, out );
+  }
+  
+  virtual void mult_MH( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate = false )
+  {
+    if( !in || !out || !R_->is_preprocessed() ){
+      throw cuda_error("registrationReconOperator::mult_MH failed (1)");
+    }
+    
+    // Allocate intermediate image
+    std::vector<size_t> tmp_dims = *R_->get_displacement_field()->get_dimensions().get(); tmp_dims.pop_back();
+    cuNDArray< complext<REAL> > tmp_in_out(&tmp_dims); 
+
+    // Apply adjoint non-Cartesian Sense encoding
+    E_->mult_MH( in, &tmp_in_out);
+  
+    // Apply adjoint registration
+    R_->mult_MH( &tmp_in_out, out );
+  }
+  
+  virtual void mult_MH_M( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate = false )
+  {
+    if( !in || !out || !R_->get_displacement_field() ){
+      throw cuda_error("registrationReconOperator::mult_MH_M failed (1)");
+    }
+
+    // Allocate intermediate image
+    std::vector<size_t> tmp_dims = *R_->get_displacement_field()->get_dimensions().get(); tmp_dims.pop_back();
+    cuNDArray< complext<REAL> > tmp_in_out1(&tmp_dims), tmp_in_out2(&tmp_dims); 
+    
+    // Deform the input image into multiple frames by applying the registration vector field
+    R_->mult_M( in, &tmp_in_out1 );
+
+    // Apply the non-Cartesian Sense normal operator (E^H E)
+    E_->mult_MH_M( &tmp_in_out1, &tmp_in_out2 );
+    
+    // Apply adjoint registration
+    R_->mult_MH( &tmp_in_out2, out );
+  }
+  
+  virtual boost::shared_ptr< linearOperator< cuNDArray< complext<REAL> > > > clone() {
+    return linearOperator< cuNDArray<complext< REAL > > >::clone(this);
+  }
+  
+private:
+  boost::shared_ptr< cuNonCartesianSenseOperator<REAL,D> > E_;
+  boost::shared_ptr< cuLinearResampleOperator<complext<REAL>,D> > R_;
+};
+
+
+//
+// Utility to upload samples for one reconstruction from host to device
+//
+
+boost::shared_ptr< cuNDArray<_complext> > 
+upload_data( unsigned int reconstruction, unsigned int samples_per_reconstruction, unsigned int total_samples_per_coil, unsigned int num_coils, hoNDArray<_complext> *host_data, unsigned int offset = 0 )
+{
+  vector<size_t> dims; dims.push_back(samples_per_reconstruction); dims.push_back(num_coils);
+  cuNDArray<_complext> *data = new cuNDArray<_complext>(); data->create( &dims );
+  for( unsigned int i=0; i<num_coils; i++ )
+    cudaMemcpy( data->get_data_ptr()+i*samples_per_reconstruction, 
+		host_data->get_data_ptr()+i*total_samples_per_coil+reconstruction*samples_per_reconstruction+offset, 
+		samples_per_reconstruction*sizeof(_complext), cudaMemcpyHostToDevice );
+
+  return boost::shared_ptr< cuNDArray<_complext> >(data);
+}
+
+int main(int argc, char** argv)
+{
+
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "MRI sample data file name", true, "fb_data.cplx" );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Reconstruction result file name", true, "result.real" );
+
+  // Parameters for the initial Sense reconstruction
+  //
+
+  parms.add_parameter( 'm', COMMAND_LINE_INT,    1, "Matrix size", true, "256" );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true, "384" );
+  parms.add_parameter( 'p', COMMAND_LINE_INT,    1, "Profiles per frame", true, "16" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of iterations", true, "15" );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+  parms.add_parameter( 'K', COMMAND_LINE_FLOAT,  1, "Kappa", true, "0.1" );
+
+  // Parameters for the registration
+  //
+
+  parms.add_parameter( 'a', COMMAND_LINE_FLOAT,  1, "Registration regularization weight (alpha)", true, "0.05" );
+  parms.add_parameter( 'b', COMMAND_LINE_FLOAT,  1, "Registration regularization weight (beta)", true, "1.0" );
+  
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running registration with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  GPUTimer *timer = new GPUTimer("\nPerforming Sense reconstruction");
+
+  //
+  // First perform the Sense reconstruction, 
+  // presumably resulting in aliased images...
+  //
+  
+  // Load sample data from disk
+  //
+  
+  boost::shared_ptr< hoNDArray<_complext> > host_data = read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+   
+  if( !(host_data->get_number_of_dimensions() == 3) ){
+    cout << endl << "Input data is not three-dimensional (#samples/profile x #profiles x #coils). Quitting!" << endl;
+    return 1;
+  }
+  
+  // Configuration from the host data
+  //
+
+  unsigned int samples_per_profile = host_data->get_size(0);
+  unsigned int num_profiles = host_data->get_size(1);
+  unsigned int num_coils = host_data->get_size(2);
+  
+  // Configuration from the command line
+  //
+
+  uint64d2 matrix_size = uint64d2(parms.get_parameter('m')->get_int_value(), parms.get_parameter('m')->get_int_value());
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+  _real kappa = parms.get_parameter('K')->get_float_value();
+  unsigned int num_iterations = parms.get_parameter('i')->get_int_value();
+  unsigned int profiles_per_frame = parms.get_parameter('p')->get_int_value();
+  unsigned int frames_per_reconstruction = 1;
+
+  // Silent correction of invalid command line parameters (clamp to valid range)
+  //
+
+  if( profiles_per_frame > num_profiles ) profiles_per_frame = num_profiles;
+  if( frames_per_reconstruction < 0 ) frames_per_reconstruction = num_profiles / profiles_per_frame;
+  if( frames_per_reconstruction*profiles_per_frame > num_profiles ) frames_per_reconstruction = num_profiles / profiles_per_frame;
+  
+  unsigned int profiles_per_reconstruction = frames_per_reconstruction*profiles_per_frame;
+  unsigned int samples_per_frame = profiles_per_frame*samples_per_profile;
+  unsigned int samples_per_reconstruction = profiles_per_reconstruction*samples_per_profile;
+
+  // Set density compensation weights
+  //
+
+  boost::shared_ptr< cuNDArray<_real> > dcw = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, profiles_per_frame, (_real)matrix_size_os.vec[0]/(_real)matrix_size.vec[0], 
+      _real(1)/((_real)samples_per_profile/(_real)max(matrix_size.vec[0],matrix_size.vec[1])) );
+
+  // Define encoding matrix for non-Cartesian SENSE
+  //
+
+  boost::shared_ptr< cuNonCartesianSenseOperator<_real,2> > E( new cuNonCartesianSenseOperator<_real,2>() );  
+  E->setup( matrix_size, matrix_size_os, kernel_width );
+  
+  std::vector<size_t> tmp_vec = to_std_vector(matrix_size);
+  tmp_vec.push_back(frames_per_reconstruction);
+  E->set_domain_dimensions( &tmp_vec );
+
+  // Notify encoding operator of dcw
+  //
+  
+  E->set_dcw(dcw);
+  
+  // Define rhs buffer
+  //
+
+  boost::shared_ptr< cuSenseBuffer<_real,2> > rhs_buffer( new cuSenseBuffer<_real,2>() );
+  rhs_buffer->setup( matrix_size, matrix_size_os, kernel_width, num_coils, 8, 16 );
+  rhs_buffer->set_dcw(dcw);
+  
+  // Fill rhs buffer (go through all the data...)
+  //
+    
+  for( unsigned int iteration = 0; iteration < num_profiles/profiles_per_frame; iteration++ ) {
+
+    // Define trajectories
+    boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+      ( samples_per_profile, profiles_per_frame, 1, iteration*profiles_per_frame );
+    
+    // Upload data
+    boost::shared_ptr< cuNDArray<_complext> > csm_data = upload_data
+      ( iteration, samples_per_frame, num_profiles*samples_per_profile, num_coils, host_data.get() );
+    
+    // Add frame to rhs buffer
+    rhs_buffer->add_frame_data( csm_data.get(), traj.get() );
+  }
+
+  // Estimate CSM
+  //
+
+  boost::shared_ptr< cuNDArray<_complext> > acc_images = rhs_buffer->get_accumulated_coil_images();
+  boost::shared_ptr< cuNDArray<_complext> > csm = estimate_b1_map<_real,2>( acc_images.get() );
+
+  E->set_csm(csm);
+
+  // Define regularization image operator 
+  //
+
+  std::vector<size_t> image_dims = to_std_vector(matrix_size);
+  cuNDArray<_complext> *regul_image = new cuNDArray<_complext>(&image_dims);
+  
+  E->mult_csm_conj_sum( acc_images.get(), regul_image );
+  acc_images.reset();
+
+  boost::shared_ptr< cuImageOperator<_complext> > R( new cuImageOperator<_complext>() ); 
+  R->set_weight( kappa );
+  R->compute( regul_image ); 
+  delete regul_image; regul_image = 0x0;
+
+  // Define preconditioning weights
+  //
+
+  boost::shared_ptr< cuNDArray<_real> > _precon_weights = sum(abs_square(csm.get()).get(),2);
+  boost::shared_ptr< cuNDArray<_real> > R_diag = R->get();
+  *R_diag *= kappa;
+  *_precon_weights += *R_diag;
+  R_diag.reset();
+  reciprocal_sqrt_inplace(_precon_weights.get());
+  boost::shared_ptr< cuNDArray<_complext> > precon_weights = real_to_complex<_complext>( _precon_weights.get() );
+  _precon_weights.reset();
+
+  // Define preconditioning matrix
+  //
+
+  boost::shared_ptr< cuCgPreconditioner<_complext> > D( new cuCgPreconditioner<_complext>() );
+  D->set_weights( precon_weights );
+  precon_weights.reset();
+  csm.reset();
+  
+  // Setup radial SENSE reconstructions (conjugate gradient solver)
+  //
+      
+  cuCgSolver<_complext> *cg = new cuCgSolver<_complext>;
+  cg->set_encoding_operator( E );  // encoding matrix
+  cg->add_regularization_operator( R );  // regularization matrix
+  cg->set_preconditioner ( D );  // preconditioning matrix
+  cg->set_max_iterations( num_iterations );
+  cg->set_tc_tolerance( 1e-6 );
+  cg->set_output_mode( cuCgSolver<_complext>::OUTPUT_VERBOSE );
+
+  // To save memory we allow only a certain number of frames
+  unsigned int max_num_frames = 25;
+  unsigned int reconstruction_offset = 100; // To find some respiratory movement in the test dataset
+  unsigned int num_reconstructions = num_profiles / profiles_per_reconstruction;
+  if( num_reconstructions<(max_num_frames+reconstruction_offset) ) reconstruction_offset = 0;
+  if( num_reconstructions > max_num_frames ) num_reconstructions = max_num_frames;
+  
+  // Allocate space for aliased reconstruction
+  //
+  
+  image_dims = to_std_vector(matrix_size); 
+  image_dims.push_back(frames_per_reconstruction*num_reconstructions); 
+  cuNDArray<_complext> *sense_result_cplx = new cuNDArray<_complext>; 
+  GDEBUG_STREAM(std::endl << matrix_size[0] << " " << matrix_size[1] << " " << frames_per_reconstruction << " " << num_reconstructions);
+
+  sense_result_cplx->create(&image_dims);
+  
+  // Loop and reconstruct 
+  // 
+
+  for( unsigned int reconstruction = 0; reconstruction<num_reconstructions; reconstruction++ ){
+    
+    // Determine trajectories
+    //
+
+    boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+      ( samples_per_profile, profiles_per_frame, frames_per_reconstruction, (reconstruction+reconstruction_offset)*profiles_per_reconstruction );
+    
+    // Upload data
+    //
+
+    boost::shared_ptr< cuNDArray<_complext> > data = upload_data
+      ( reconstruction+reconstruction_offset, samples_per_reconstruction, num_profiles*samples_per_profile, num_coils, host_data.get() );
+    
+    // Set current trajectory and trigger NFFT preprocessing
+    //
+    
+    E->preprocess(traj.get());
+    
+    // Form rhs (use sense_result_cplx array to save memory)
+    //
+    
+    vector<size_t> rhs_dims = to_std_vector(matrix_size); 
+    rhs_dims.push_back(frames_per_reconstruction);
+    cuNDArray<_complext> rhs; 
+
+    rhs.create( &rhs_dims, sense_result_cplx->get_data_ptr()+
+		reconstruction*prod(matrix_size)*frames_per_reconstruction );
+
+    E->mult_MH( data.get(), &rhs );
+    
+    // Conjugate gradient solver
+    //
+
+    boost::shared_ptr< cuNDArray<_complext> > cgresult = cg->solve(data.get());
+    rhs = *(cgresult.get());
+  }
+  
+  boost::shared_ptr< cuNDArray<_real> > sense_result = abs(sense_result_cplx);
+  write_nd_array<_complext>(sense_result_cplx->to_host().get(), "_images_all.cplx");
+
+  // We need all our device memory for the registration. Clean up after Sense.
+  // E.reset(); D.reset(); R.reset();   -- we will reuse these below 
+
+  rhs_buffer.reset();
+  delete cg; delete sense_result_cplx;
+  delete timer;
+
+  // Determine fixed/moving image dimensions and create arrays
+  //
+
+#ifdef PAD_Z
+  std::vector<size_t> _3d_dims = *(sense_result->get_dimensions());
+  unsigned int last_dim = _3d_dims.back();
+  _3d_dims.pop_back(); _3d_dims.push_back(1); _3d_dims.push_back(last_dim);
+  sense_result->reshape( &_3d_dims );
+#endif
+  
+  vector<size_t> multi_dims = *sense_result->get_dimensions();
+  multi_dims.pop_back();
+#ifdef PAD_Z
+  multi_dims.push_back(sense_result->get_size(3)-1);
+#else
+  multi_dims.push_back(sense_result->get_size(2)-1);
+#endif
+  vector<size_t> single_dims = *sense_result->get_dimensions();
+  single_dims.pop_back();
+  
+  cuNDArray<_real> 
+    *multi_image = new cuNDArray<_real>, 
+    *single_image = new cuNDArray<_real>;
+  
+  single_image->create( &single_dims, sense_result->get_data_ptr());
+  multi_image->create( &multi_dims, sense_result->get_data_ptr()+prod(matrix_size));
+  
+  write_nd_array<_real>(multi_image->to_host().get(), "_images_multi.real");
+  write_nd_array<_real>(single_image->to_host().get(), "_image_single.real");
+
+  // Setup registration solver
+  //
+#ifdef PAD_Z
+  cuCKOpticalFlowSolver<_real,3> *CK = new cuCKOpticalFlowSolver<_real,3>;
+#else
+  cuCKOpticalFlowSolver<_real,2> *CK = new cuCKOpticalFlowSolver<_real,2>;
+#endif
+
+  //CK->set_output_mode( cuCKOpticalFlowSolver<_real,2>::OUTPUT_VERBOSE );  
+  CK->set_num_multires_levels( 1 );
+  CK->set_max_num_iterations_per_level( 500 );
+  CK->set_alpha((_real) parms.get_parameter('a')->get_float_value());
+  CK->set_beta((_real) parms.get_parameter('b')->get_float_value());
+  CK->set_limit(0.01f);
+  
+  // 
+  // Perform "averaging by registration" type reconstruction
+  //
+
+  timer = new GPUTimer("\nReconstruction by optical flow averaging");
+
+  // Run registration:
+  // - multi_image -> single_image (many to one registration)
+  // 
+
+  // All to one
+  boost::shared_ptr< cuNDArray<_real> > reg_result = CK->solve( single_image, multi_image );
+  
+  write_nd_array<_real>(reg_result->to_host().get(), "_reg1.real");
+
+  // Deform the multi_image according to the deformation field and average
+  //
+
+  boost::shared_ptr< cuNDArray<_real> > regis_image = CK->deform( multi_image, reg_result );
+#ifdef PAD_Z
+  boost::shared_ptr< cuNDArray<_real> > regis_image_avg = sum<_real>( regis_image.get(), 3); 
+#else
+  boost::shared_ptr< cuNDArray<_real> > regis_image_avg = sum<_real>( regis_image.get(), 2); 
+#endif
+  write_nd_array<_real>(regis_image->to_host().get(), "_reg_avg.real");
+  write_nd_array<_real>(regis_image_avg->to_host().get(), "_avg_recon.real");
+
+  regis_image.reset(); regis_image_avg.reset(); reg_result.reset();
+
+  delete timer;
+
+  //
+  // Perform "registration in cost function" type reconstruction
+  //
+
+  timer = new GPUTimer("\nRunning registration recon");
+
+  // One to all
+  reg_result = CK->solve( multi_image, single_image );
+  
+  write_nd_array<_real>(reg_result->to_host().get(), "_reg2.real");
+
+  regis_image = CK->deform( single_image, reg_result );
+  write_nd_array<_real>(regis_image->to_host().get(), "_multi_def.real");
+  regis_image.reset(); 
+
+  // Test iteration
+  cuNDArray<_real> out; out.create(multi_image->get_dimensions().get());
+  cuNDArray<_real> in; in.create(single_image->get_dimensions().get());
+  
+  // Release memory
+  delete CK;
+  exit(1);
+  // Setup solver
+  //
+
+  // The non-Cartesian Sense operator is already setup, 
+  // but the trajectories must be recomputed and preprocessed
+
+  boost::shared_ptr< cuNDArray<_reald2> >traj = compute_radial_trajectory_golden_ratio_2d<_real>
+    ( samples_per_profile, profiles_per_frame, frames_per_reconstruction*(num_reconstructions-1), 
+      (1+reconstruction_offset)*profiles_per_reconstruction );
+  
+  E->preprocess(traj.get());
+
+  // Define and preprocess resampling operator
+  
+  boost::shared_ptr< cuLinearResampleOperator<_complext,2> > resampler
+    ( new cuLinearResampleOperator<_complext,2> );
+
+  resampler->set_displacement_field(reg_result);
+  resampler->mult_MH_preprocess();
+
+  // Define registrationReconstruction encoding operator
+
+  boost::shared_ptr< registrationReconOperator<_real,2> > 
+    RR( new registrationReconOperator<_real,2>() );  
+
+  std::vector<size_t> rhs_dims = to_std_vector(matrix_size); 
+  RR->set_domain_dimensions( &rhs_dims );
+
+  RR->set_encoding_operator( E );
+  RR->set_resampling_operator( resampler );
+
+  cg = new cuCgSolver<_complext>;
+  cg->set_encoding_operator( RR );
+  cg->add_regularization_operator( R );
+  cg->set_preconditioner ( D ); 
+  cg->set_max_iterations( num_iterations );
+  cg->set_tc_tolerance( 1e-6 );
+  cg->set_output_mode( cuCgSolver<_complext>::OUTPUT_VERBOSE );
+
+  // Form rhs
+  
+  boost::shared_ptr< cuNDArray<_complext> > data = upload_data
+    ( 0, samples_per_reconstruction*(num_reconstructions-1), 
+      num_profiles*samples_per_profile, num_coils, host_data.get(), 
+      (reconstruction_offset+1)*samples_per_reconstruction );
+  
+  cuNDArray<_complext> rhs(&rhs_dims); 
+  RR->mult_MH( data.get(), &rhs );
+  
+  write_nd_array<_complext>(rhs.to_host().get(), "_rhs.cplx" );
+  write_nd_array<_real>(abs(&rhs)->to_host().get(), "_rhs.real" );
+ 
+  // Conjugate gradient solver
+  //
+  
+  boost::shared_ptr< cuNDArray<_complext> > cgresult = cg->solve(data.get());
+
+  boost::shared_ptr< hoNDArray<_real> > host_image = abs(cgresult.get())->to_host();
+  write_nd_array<_real>(host_image.get(), "_reg_frame.real" );
+  
+  delete timer;
+   
+  return 0;
+}
diff --git a/apps/standalone/gpu/registration/3d/CMakeLists.txt b/apps/standalone/gpu/registration/3d/CMakeLists.txt
new file mode 100644
index 0000000..3980478
--- /dev/null
+++ b/apps/standalone/gpu/registration/3d/CMakeLists.txt
@@ -0,0 +1,12 @@
+add_executable(register_CK_3d_gpu register_CK_3d.cpp)
+
+target_link_libraries(register_CK_3d_gpu
+  gadgetron_toolbox_hostutils 
+  gadgetron_toolbox_gpureg 
+  gadgetron_toolbox_gpucore 
+  gadgetron_toolbox_gpuoperators
+  gadgetron_toolbox_gpusolvers 
+  ${CUDA_LIBRARIES}
+  )
+
+install(TARGETS register_CK_3d_gpu DESTINATION bin COMPONENT main)
diff --git a/apps/standalone/gpu/registration/3d/register_CK_3d.cpp b/apps/standalone/gpu/registration/3d/register_CK_3d.cpp
new file mode 100644
index 0000000..6700724
--- /dev/null
+++ b/apps/standalone/gpu/registration/3d/register_CK_3d.cpp
@@ -0,0 +1,124 @@
+/*
+  An example of how to register two 3d volumes using Cornelius-Kanade optical flow
+*/
+
+// Gadgetron includes
+#include "cuCKOpticalFlowSolver.h"
+#include "cuLinearResampleOperator.h"
+#include "cuNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace Gadgetron;
+using namespace std;
+
+// Define desired precision
+typedef float _real; 
+
+int main(int argc, char** argv)
+{
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'f', COMMAND_LINE_STRING, 1, "Fixed image file name (.real)", true );
+  parms.add_parameter( 'm', COMMAND_LINE_STRING, 1, "Moving image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "displacement_field.real" );
+  parms.add_parameter( 'a', COMMAND_LINE_FLOAT,  1, "Regularization weight (alpha)", true, "0.05" );
+  parms.add_parameter( 'b', COMMAND_LINE_FLOAT,  1, "Regularization weight (beta)", true, "1.0" );
+  parms.add_parameter( 'l', COMMAND_LINE_INT,    1, "Number of multiresolution levels", true, "3" );
+  
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running registration with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  // Load sample data from disk
+  //
+  
+  boost::shared_ptr< hoNDArray<_real> > host_fixed = 
+    read_nd_array<_real>((char*)parms.get_parameter('f')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_moving = 
+    read_nd_array<_real>((char*)parms.get_parameter('m')->get_string_value());
+  
+  if( !host_fixed.get() || !host_moving.get() ){
+    cout << endl << "One of the input images is not found. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  unsigned int num_fixed_dims = host_fixed->get_number_of_dimensions();
+  unsigned int num_moving_dims = host_moving->get_number_of_dimensions();
+
+  if( !(num_fixed_dims == 3 || num_fixed_dims == 4)  ){
+    cout << endl << "The fixed image is not three- or four-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  if( !(num_moving_dims == 3 || num_moving_dims == 4)  ){
+    cout << endl << "The moving image is not three- or four-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Upload host data to device
+  //
+
+  cuNDArray<_real> fixed_image(host_fixed.get());
+  cuNDArray<_real> moving_image(host_moving.get());
+  
+  _real alpha = (_real) parms.get_parameter('a')->get_float_value();
+  _real beta = (_real) parms.get_parameter('b')->get_float_value();
+
+  unsigned int multires_levels = parms.get_parameter('l')->get_int_value();
+
+  // Use trilinear interpolation for resampling
+  //
+
+  boost::shared_ptr< cuLinearResampleOperator<_real,3> > R( new cuLinearResampleOperator<_real,3>() );
+
+  // Setup solver
+  //
+  
+  cuCKOpticalFlowSolver<_real,3> CK;
+  CK.set_interpolator( R );
+  CK.set_output_mode( cuCKOpticalFlowSolver<_real,3>::OUTPUT_VERBOSE );  
+  CK.set_max_num_iterations_per_level( 500 );
+  CK.set_num_multires_levels( multires_levels );
+  CK.set_alpha(alpha);
+  CK.set_beta(beta);
+  CK.set_limit(0.01f);
+  
+  // Run registration
+  //
+
+  boost::shared_ptr< cuNDArray<_real> > result = CK.solve( &fixed_image, &moving_image/*, true*/ );
+
+  if( !result.get() ){
+    cout << endl << "Registration solver failed. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  boost::shared_ptr< cuNDArray<_real> > deformed_moving = CK.deform( &moving_image, result );
+  
+  // All done, write out the result
+  //
+
+  boost::shared_ptr< hoNDArray<_real> > host_result = result->to_host();
+  write_nd_array<_real>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  host_result = deformed_moving->to_host();
+  write_nd_array<_real>(host_result.get(), "def_moving.real" );
+  
+  return 0;
+}
diff --git a/apps/standalone/gpu/registration/CMakeLists.txt b/apps/standalone/gpu/registration/CMakeLists.txt
new file mode 100644
index 0000000..4b458f1
--- /dev/null
+++ b/apps/standalone/gpu/registration/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories(
+  ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow
+  ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/gpu
+  )
+
+add_subdirectory(2d)
+add_subdirectory(3d)
diff --git a/chroot/CMakeLists.txt b/chroot/CMakeLists.txt
new file mode 100644
index 0000000..3724de5
--- /dev/null
+++ b/chroot/CMakeLists.txt
@@ -0,0 +1,120 @@
+configure_file("start-gadgetron.sh.in" ${CMAKE_CURRENT_BINARY_DIR}/start-gadgetron.sh @ONLY)
+configure_file("enter-chroot-env.sh.in" ${CMAKE_CURRENT_BINARY_DIR}/enter-chroot-env.sh @ONLY)
+configure_file("gadgetron-dependency-query.sh.in" ${CMAKE_CURRENT_BINARY_DIR}/gadgetron-dependency-query.sh @ONLY)
+configure_file("siemens_to_ismrmrd.sh.in" ${CMAKE_CURRENT_BINARY_DIR}/siemens_to_ismrmrd.sh @ONLY)
+configure_file("gadgetron_ismrmrd_client.sh.in" ${CMAKE_CURRENT_BINARY_DIR}/gadgetron_ismrmrd_client.sh @ONLY)
+configure_file("gt_alive.sh.in" ${CMAKE_CURRENT_BINARY_DIR}/gt_alive.sh @ONLY)
+configure_file("${CMAKE_SOURCE_DIR}/apps/gadgetron/webapp/gadgetron_web_app.in" ${CMAKE_CURRENT_BINARY_DIR}/gadgetron_web_app.cfg @ONLY)
+configure_file("${CMAKE_SOURCE_DIR}/apps/gadgetron/webapp/gadgetron_web.conf.in" ${CMAKE_CURRENT_BINARY_DIR}/gadgetron_web.conf @ONLY)
+configure_file("${CMAKE_SOURCE_DIR}/apps/gadgetron/webapp/gadgetron_web_ld.conf.in" ${CMAKE_CURRENT_BINARY_DIR}/gadgetron_web_ld.conf @ONLY)
+configure_file("run-webapp.sh.in" ${CMAKE_CURRENT_BINARY_DIR}/run-webapp.sh @ONLY)
+configure_file("copy-cuda-lib.sh.in" ${CMAKE_CURRENT_BINARY_DIR}/copy-cuda-lib.sh @ONLY)
+configure_file("start-env.sh.in" ${CMAKE_CURRENT_BINARY_DIR}/start-env.sh @ONLY)
+configure_file("start.sh.in" ${CMAKE_CURRENT_BINARY_DIR}/start.sh @ONLY)
+configure_file("start-webapp.sh.in" ${CMAKE_CURRENT_BINARY_DIR}/start-webapp.sh @ONLY)
+configure_file("run-gadgetron-dependency-query.sh.in" ${CMAKE_CURRENT_BINARY_DIR}/run-gadgetron-dependency-query.sh @ONLY)
+configure_file("run-gadgetron_ismrmrd_client.sh.in" ${CMAKE_CURRENT_BINARY_DIR}/run-gadgetron_ismrmrd_client.sh @ONLY)
+configure_file("run-gt_alive.sh.in" ${CMAKE_CURRENT_BINARY_DIR}/run-gt_alive.sh @ONLY)
+configure_file("run-siemens_to_ismrmrd.sh.in" ${CMAKE_CURRENT_BINARY_DIR}/run-siemens_to_ismrmrd.sh @ONLY)
+configure_file("mount_image.sh.in" ${CMAKE_CURRENT_BINARY_DIR}/mount_image.sh @ONLY)
+configure_file("start-gadgetron-from-image.sh.in" ${CMAKE_CURRENT_BINARY_DIR}/start-gadgetron-from-image.sh @ONLY)
+
+set (LIBRARY_PATHS ${ISMRMRD_LIB_DIR})
+set (LIBRARY_PATHS ${LIBRARY_PATHS}:${MKL_LIB_DIR})
+
+FIND_LIBRARY( CUDA_LIBRARY NAMES libcuda.so )
+
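+# CUDA_LIBRARIES may list libcudart.so and libcuda.so in either order (or only
+# libcudart.so); the block below extracts the directory containing libcudart.so
+# so it can be appended to LIBRARY_PATHS.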
+string(FIND "${CUDA_LIBRARIES}" "libcudart.so" LIBCUDART_LOC)
+string(FIND "${CUDA_LIBRARIES}" "libcuda.so" LIBCUDA_LOC)
+
+if (LIBCUDA_LOC EQUAL -1)
+  get_filename_component(CUDART_LIB_DIR ${CUDA_LIBRARIES} PATH)
+else (LIBCUDA_LOC EQUAL -1)
+  if(LIBCUDART_LOC LESS LIBCUDA_LOC)
+    string(SUBSTRING "${CUDA_LIBRARIES}" 0 ${LIBCUDART_LOC} CUDART_LIB_DIR)
+  else(LIBCUDART_LOC LESS LIBCUDA_LOC)
+    string(SUBSTRING "${CUDA_LIBRARIES}" ${LIBCUDA_LOC} -1 CUDART_LIB_TEMP)
+    string(SUBSTRING "${CUDART_LIB_TEMP}" 10 -1 CUDART_LIB_TEMP2)
+    get_filename_component(CUDART_LIB_DIR ${CUDART_LIB_TEMP2} PATH)
+  endif(LIBCUDART_LOC LESS LIBCUDA_LOC)
+endif (LIBCUDA_LOC EQUAL -1)
+
+get_filename_component(CUDA_LIB_DIR ${CUDA_LIBRARY} PATH)
+get_filename_component(CUDAFFT_LIB_DIR ${CUDA_CUFFT_LIBRARIES} PATH)
+get_filename_component(CUDABLAS_LIB_DIR ${CUDA_CUBLAS_LIBRARIES} PATH)
+
+set (LIBRARY_PATHS ${LIBRARY_PATHS}:${CUDART_LIB_DIR}:${CUDA_LIB_DIR}:${CUDAFFT_LIB_DIR}:${CUDABLAS_LIB_DIR})
+
+configure_file("copy-cuda-lib.sh.in" ${CMAKE_CURRENT_BINARY_DIR}/copy-cuda-lib.sh @ONLY)
+
+find_program(SIEMENS_TO_ISMRMRD_EXE siemens_to_ismrmrd)
+
+add_custom_target(chroot_base)
+add_custom_target(chroot)
+
+add_custom_command(
+    TARGET chroot_base
+    POST_BUILD
+    VERBATIM
+    COMMENT "Creating chroot base tar file"
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+    COMMAND sudo bash create_chroot_base.sh 
+    ${CMAKE_BINARY_DIR}
+    )
+
+if (SIEMENS_TO_ISMRMRD_EXE)
+  add_custom_command(
+    TARGET chroot
+    POST_BUILD
+    VERBATIM
+    COMMENT "Creating chroot tar and img file"
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+    COMMAND sudo bash create_chroot.sh 
+    ${CMAKE_INSTALL_PREFIX}
+    ${CMAKE_BINARY_DIR}
+    ${LIBRARY_PATHS}
+    ${CUDA_LIBRARY}
+    ${CMAKE_SOURCE_DIR}
+    ${SIEMENS_TO_ISMRMRD_EXE}
+    )
+else(SIEMENS_TO_ISMRMRD_EXE)
+  add_custom_command(
+    TARGET chroot
+    POST_BUILD
+    VERBATIM
+    COMMENT "Creating chroot tar and img file"
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+    COMMAND sudo bash create_chroot.sh
+    ${CMAKE_INSTALL_PREFIX}
+    ${CMAKE_BINARY_DIR}
+    ${LIBRARY_PATHS}
+    ${CUDA_LIBRARY}
+    ${CMAKE_SOURCE_DIR}
+    )
+endif(SIEMENS_TO_ISMRMRD_EXE)
+
+install(PROGRAMS 
+  ${CMAKE_CURRENT_BINARY_DIR}/copy-cuda-lib.sh 
+  ${CMAKE_CURRENT_BINARY_DIR}/start-gadgetron.sh
+  ${CMAKE_CURRENT_BINARY_DIR}/enter-chroot-env.sh
+  ${CMAKE_CURRENT_BINARY_DIR}/gadgetron-dependency-query.sh
+  ${CMAKE_CURRENT_BINARY_DIR}/siemens_to_ismrmrd.sh
+  ${CMAKE_CURRENT_BINARY_DIR}/gadgetron_ismrmrd_client.sh
+  ${CMAKE_CURRENT_BINARY_DIR}/gt_alive.sh
+  ${CMAKE_CURRENT_BINARY_DIR}/run-webapp.sh
+  ${CMAKE_CURRENT_BINARY_DIR}/run-gadgetron-dependency-query.sh
+  ${CMAKE_CURRENT_BINARY_DIR}/run-gadgetron_ismrmrd_client.sh
+  ${CMAKE_CURRENT_BINARY_DIR}/run-gt_alive.sh
+  ${CMAKE_CURRENT_BINARY_DIR}/run-siemens_to_ismrmrd.sh
+  ${CMAKE_CURRENT_BINARY_DIR}/start-env.sh
+  ${CMAKE_CURRENT_BINARY_DIR}/start.sh
+  ${CMAKE_CURRENT_BINARY_DIR}/start-webapp.sh
+  ${CMAKE_CURRENT_BINARY_DIR}/mount_image.sh
+  ${CMAKE_CURRENT_BINARY_DIR}/start-gadgetron-from-image.sh
+  ${CMAKE_SOURCE_DIR}/chroot/mount.sh
+  ${CMAKE_SOURCE_DIR}/chroot/stop.sh
+  ${CMAKE_SOURCE_DIR}/chroot/umount_image.sh
+  ${CMAKE_SOURCE_DIR}/chroot/install_chroot_image.sh
+  DESTINATION ${GADGETRON_INSTALL_CHROOT_SCRIPTS_PATH}
+  COMPONENT scripts
+  )
diff --git a/chroot/README.rst b/chroot/README.rst
new file mode 100644
index 0000000..2053222
--- /dev/null
+++ b/chroot/README.rst
@@ -0,0 +1,35 @@
+Running Gadgetron in chroot
+============================
+
+
+Introduction
+------------
+
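+The scripts in this directory package an installed Gadgetron, together with its
+runtime library dependencies, into a self-contained root file system that can
+be archived, turned into a mountable image and started with chroot.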
+
+Creating a chroot environment
+-----------------------------
+
+First we need to install the tools required to create chroot environments::
+
+  sudo apt-get install dchroot debootstrap
+
+Next we need to add an appropriate configuration to ``/etc/schroot/schroot.conf``::
+  
+  [trusty]
+  description=trusty
+  location=/var/chroot/trusty
+  priority=3
+  users=doko
+  groups=sbuild
+  root-groups=root
+
+Create the folder where we will be making the root file system::
+  
+  sudo mkdir -p /var/chroot/trusty
+
+
+Now generate a basic root file system::
+
+  sudo debootstrap --variant=buildd --arch amd64 trusty /var/chroot/trusty http://archive.ubuntu.com/ubuntu/
+
+
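+Once the base system is in place you can enter it, for example with schroot
+(this assumes the ``[trusty]`` entry added to ``schroot.conf`` above)::
+
+  sudo schroot -c trusty
+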
diff --git a/chroot/chroot-manual.txt b/chroot/chroot-manual.txt
new file mode 100644
index 0000000..3aa7b67
--- /dev/null
+++ b/chroot/chroot-manual.txt
@@ -0,0 +1,156 @@
+---INSTALLING GADGETRON ON Ubuntu 14.04---
+
+
+
+*** Dependencies ***
+
+sudo apt-get install build-essential git-core cmake gcc-multilib libace-dev libarmadillo-dev libboost-all-dev libfftw3-dev libgtest-dev libxslt-dev xsltproc libhdf5-serial-dev h5utils hdf5-tools libxml2-dev python-dev python-numpy python-libxml2 python-psutil python-h5py python-scipy python-twisted python-matplotlib dcmtk git doxygen libqt4-dev libglew1.6-dev docbook5-xml docbook-xsl-doc-pdf docbook-xsl-doc-html docbook-xsl-ns fop freeglut3-dev libxi-dev liblapack-dev
+
+
+
+*** Steps to install CUDA 6.5 ***
+
+1. sudo apt-get install build-essential
+
+2. In order to be able to install the NVIDIA driver (see http://stackoverflow.com/questions/25463952/drm-ko-missing-for-cuda-6-5-ubuntu-14-04-aws-ec2-gpu-instance-g2-2xlarge), install:
+sudo apt-get install linux-image-extra-virtual
+
+3. Follow the instructions from http://docs.nvidia.com/cuda/cuda-getting-started-guide-for-linux/index.html#ubuntu-installation or apply steps 1-11 below:
+
+    1. cd && wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_6.5-14_amd64.deb
+    2. sudo dpkg -i cuda-repo-ubuntu1404_6.5-14_amd64.deb
+    3. rm cuda-repo-ubuntu1404_6.5-14_amd64.deb
+    4. sudo apt-get update
+    5. sudo apt-get -y install cuda
+    6. echo "" >> ~/.bashrc
+    7. echo "export PATH=/usr/local/cuda-6.5/bin:$PATH" >> ~/.bashrc
+    8. echo "export LD_LIBRARY_PATH=/usr/local/cuda-6.5/lib64:$LD_LIBRARY_PATH" >> ~/.bashrc
+ 
+    ## You should probably restart the system here
+    9. sudo shutdown -r now
+ 
+    ## Let's build the deviceQuery sample. This is used to verify that CUDA works
+    10. cd /usr/local/cuda/samples/1_Utilities/deviceQuery && sudo make
+ 
+    # If you see all of your cards listed, and the last line says "Result = PASS" you're good to go!
+    11. /usr/local/cuda/samples/1_Utilities/deviceQuery/deviceQuery
+
+4. Check driver details: nvidia-smi
+
+
+
+*** Steps to install MKL ***
+
+If you would like to use MKL (Intel Math Kernel Library), please download the installation file from Intel and install it.
+Here is what we did with MKL version 11.0.5.192:
+
+tar -xzvf l_mkl_11.0.5.192_intel64.tgz 
+cd l_mkl_11.0.5.192_intel64/
+sudo ./install.sh
+
+Follow the instructions and add the following paths to your ~/.bashrc
+echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/mkl/lib/intel64:/opt/intel/lib/intel64" >> ~/.bashrc
+
+You will also need to install Armadillo from source; skip the libarmadillo-dev package above.
+Download the latest Armadillo from http://arma.sourceforge.net, then unpack, build and install it.
+If you have MKL installed, this will pick it up and build the armadillo libs against it.
+
+cd armadillo-3.900.6/
+cmake .
+make; sudo make install
+
+
+
+*** Extras ***
+
+Useful links (but not 100% accurate):
+http://docs.nvidia.com/cuda/cuda-getting-started-guide-for-linux/index.html#ubuntu-installation
+https://gist.github.com/zcshiner/4b32980792d367222304
+http://www.r-tutor.com/gpu-computing/cuda-installation/cuda6.5-ubuntu
+http://askubuntu.com/questions/451672/installing-and-testing-cuda-in-ubuntu-14-04
+
+MH useful links:
+https://sourceforge.net/p/gadgetron/discussion/general/thread/d0ee5b27/
+https://sourceforge.net/p/gadgetron/home/Linux%20Installation/
+
+
+------------------------------------------------------------------------------------------
+------------------------------------------------------------------------------------------
+------------------------------------------------------------------------------------------
+
+
+---CHROOT---
+
+
+1. apt-get update
+
+2. apt-get install debootstrap -y
+
+3. sudo debootstrap --variant=buildd --arch amd64 trusty /opt/chroot/trusty http://gb.archive.ubuntu.com/ubuntu/
+
+4. Place the following script at /opt/chroot/chrootmounter.sh:
+------------------------------------------------------------------------------------------
+#!/bin/bash
+  
+if [ $# -eq 2 ]; then
+  
+ if [ "$1" == "mount" ]; then
+  sudo mount --bind /dev "${2}/dev"
+  sudo mount --bind /sys "${2}/sys"
+  sudo mount --bind /proc "${2}/proc"
+  
+  exit 0
+ fi
+  
+ if [ "$1" == "umount" ]; then
+  sudo umount "${2}/dev"
+  sudo umount "${2}/sys"
+  sudo umount "${2}/proc"
+  
+  exit 0
+ fi
+  
+ echo -e "\nUsage:  $0 (mount or umount) (chrootdir)\n"
+ exit 1
+  
+else
+ echo -e "\nUsage:  $0 (mount or umount) (chrootdir)\n"
+ exit 1
+fi
+------------------------------------------------------------------------------------------
+
+5. chmod +x /opt/chroot/chrootmounter.sh
+
+6. Run the script: ./chrootmounter.sh mount /opt/chroot/trusty (check if it worked with mount command)
+
+*7. Make sure that /etc/resolv.conf is the same on the 'real' computer and in the 'chroot' one (if not, copy the file from the real one)
+
+*8. Copy/extend your /etc/apt/sources.list (make sure it points at repos for the correct Ubuntu release):
+  sudo cp /etc/apt/sources.list /opt/chroot/trusty/etc/apt/sources.list
+ 
+9. Run: /home/ubuntu/perl_scripts/generate_gadgetron_root <Arg 1> <Arg 2>
+  Arg 1: Gadgetron root: /usr/local/gadgetron
+  Arg 2: New root: /opt/chroot/trusty
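+  For example, using the paths from the steps above:
+  /home/ubuntu/perl_scripts/generate_gadgetron_root /usr/local/gadgetron /opt/chroot/trusty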
+
+10. Enter the new environment: sudo chroot /opt/chroot/trusty
+
+*11. Install additional software. For example:
+  apt-get update
+  apt-get dist-upgrade -y
+  apt-get install nano vim htop less dialog -y
+
+12. Set the PATH and LD_LIBRARY_PATH variables:
+  export PATH=$PATH:/usr/local/gadgetron/bin
+  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/gadgetron/lib:/usr/local/lib:/usr/local/cuda-6.5/lib64:/opt/intel/mkl/lib/intel64:/opt/intel/lib/intel64
+
+
+*** Extras ***
+
+Useful links:
+https://help.ubuntu.com/community/BasicChroot
+http://ocsovszki-dorian.blogspot.com/2014/06/building-chroot-environment-ubuntu-1404.html
+
+
+
+
+gtplus_FetalHASTE.cfg
diff --git a/chroot/copy-cuda-lib.sh.in b/chroot/copy-cuda-lib.sh.in
new file mode 100644
index 0000000..0efafd4
--- /dev/null
+++ b/chroot/copy-cuda-lib.sh.in
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+if [ $(id -u) -ne 0 ]; then
+ echo -e "\nPlease start the script as a root or sudo!\n"
+ exit 1
+else
+ if [ $# -eq 1 ]; then
+ BASEDIR=$(dirname $0)
+ CHROOT_DIR=${1}
+
+ # Absolute path this script is in
+ SCRIPTPATH=$(dirname "$0")
+
+ # find the lib(s)
+ CANDIDATES=$(ldconfig -p | grep "libcuda.so\s")
+
+ # find the one that is 64-bit
+ for CANDIDATE in $CANDIDATES 
+ do
+  var=$(file -L $CANDIDATE | grep '64-bit')
+  if [ -n "$var" ]; then
+   NEW_CUDA_LIB=$CANDIDATE
+  fi 
+ done
+
+ # copy it to the right location (overwrite the previous one)
+ yes | cp $NEW_CUDA_LIB $CHROOT_DIR@CMAKE_INSTALL_PREFIX@/lib/
+ exit 0
+
+ else
+  echo -e "\nUsage: $0 (chroot_dir)\n"
+  exit 1
+ fi
+fi
diff --git a/chroot/copy_file_and_dependencies b/chroot/copy_file_and_dependencies
new file mode 100755
index 0000000..138e499
--- /dev/null
+++ b/chroot/copy_file_and_dependencies
@@ -0,0 +1,25 @@
+#!/usr/bin/perl
+
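+# Copy a binary or shared library into <new_root>/<gadgetron_root>/lib and then
+# recursively copy any shared-library dependencies (as reported by
+# get_dependencies_for_binary) that are not yet present under <new_root>.
+#
+# Usage: copy_file_and_dependencies <file> <new_root> <gadgetron_root>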
+
+use FindBin;
+use lib $FindBin::Bin;
+
+my $file = $ARGV[0];
+
+my $new_root = $ARGV[1];
+
+my $gadgetron_root = $ARGV[2];
+
+my $deps = `$FindBin::Bin/get_dependencies_for_binary $file`;
+
+my @lines = split('\n', $deps);
+
+system("sudo cp -n $file $new_root/$gadgetron_root/lib/");
+
+foreach my $val (@lines) {
+    chomp($val);
+    if (not -e "$new_root/$val")
+    {
+        system("$FindBin::Bin/copy_file_and_dependencies $val $new_root $gadgetron_root");
+    }
+}
diff --git a/chroot/create_chroot.sh b/chroot/create_chroot.sh
new file mode 100755
index 0000000..1e3a49a
--- /dev/null
+++ b/chroot/create_chroot.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+
+if [ $(id -u) -ne 0 ]; then
+  echo -e "\nPlease start the script as a root or sudo!\n"
+  exit 1
+else
+  if [ $# -ge 5 ]; then
+
+    # --ARGUMENTS-- (example)
+
+    # CHROOT_GADGETRON_INSTALL_PREFIX:    /usr/local/gadgetron
+    # CHROOT_GADGETRON_BINARY_DIR:        /home/ubuntu/gadgetron/build
+    # CHROOT_LIBRARY_PATHS:               /usr/local/lib:/usr/lib/x86_64-linux-gnu
+    # CHROOT_CUDA_LIBRARY:                
+    # CHROOT_GADGETRON_SOURCE_DIR:        /home/ubuntu/gadgetron
+
+    CHROOT_GADGETRON_INSTALL_PREFIX=${1}
+    echo CHROOT_GADGETRON_INSTALL_PREFIX: ${CHROOT_GADGETRON_INSTALL_PREFIX}
+    CHROOT_GADGETRON_BINARY_DIR=${2}
+    echo CHROOT_GADGETRON_BINARY_DIR: ${CHROOT_GADGETRON_BINARY_DIR}
+    CHROOT_LIBRARY_PATHS=${3}
+    echo CHROOT_LIBRARY_PATHS: ${CHROOT_LIBRARY_PATHS}
+    CHROOT_CUDA_LIBRARY=${4}
+    echo CHROOT_CUDA_LIBRARY: ${CHROOT_CUDA_LIBRARY}
+    CHROOT_GADGETRON_SOURCE_DIR=${5}
+    echo CHROOT_GADGETRON_SOURCE_DIR: ${CHROOT_GADGETRON_SOURCE_DIR}
+
+    CHROOT_IMAGE_SIZE=1536
+    echo CHROOT_IMAGE_SIZE: ${CHROOT_IMAGE_SIZE} MB
+
+    if [ $# -ge 6 ]; then
+        CHROOT_SIEMENS_TO_ISMRMRD_EXE=${6} 
+        echo CHROOT_SIEMENS_TO_ISMRMRD_EXE: ${CHROOT_SIEMENS_TO_ISMRMRD_EXE}
+    else
+      echo "SIEMENS_TO_ISMRMRD_EXE not set"
+    fi
+
+    # ----------------------------------------------------------------------------------------
+
+    # Add LIBRARY_PATHS to LD_LIBRARY_PATH
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${CHROOT_LIBRARY_PATHS}
+    export LC_ALL=C
+    echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}"
+
+    # create folders and manifest file
+    rm -rf ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root
+    mkdir -p ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root
+    touch ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root/source-manifest.txt
+
+    # get gadgetron SHA1 key
+    GADGETRON_INFO=${CHROOT_GADGETRON_INSTALL_PREFIX}/bin/gadgetron_info
+    if [ -f ${GADGETRON_INFO} ]; then
+      res=$(${GADGETRON_INFO})
+      re=".*-- Git SHA1           : ([0-9a-z]+).*"
+      if [[ $res =~ $re ]]; then 
+        CHROOT_GIT_SHA1_HASH=${BASH_REMATCH[1]}
+      fi
+    fi
+
+    echo "gadgetron    ${CHROOT_GIT_SHA1_HASH}" > ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root/source-manifest.txt
+    mkdir -p ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root/gadgetron
+    mkdir -p ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-backups
+
+    # try to find chroot base package
+    CHROOT_BASE=`ls -t ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-backups/gadgetron-base*.tar.gz | head -1`
+
+    if [[ -f "${CHROOT_BASE}" ]]; then
+        echo "Found gadgetron chroot base package : ${CHROOT_BASE}"
+    else       
+        echo "Cannot find gadgetron chroot base package"
+        echo "Creating chroot base package ... "
+        ${CHROOT_GADGETRON_SOURCE_DIR}/chroot/create_chroot_base.sh ${CHROOT_GADGETRON_BINARY_DIR}
+        CHROOT_BASE=`ls -t ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-backups/gadgetron-base*.tar.gz | head -1`
+    fi
+
+    # create chroot from base
+    echo "Creating chroot package from base ${CHROOT_BASE}"
+    ${CHROOT_GADGETRON_SOURCE_DIR}/chroot/create_chroot_from_base.sh ${CHROOT_GADGETRON_INSTALL_PREFIX} ${CHROOT_GADGETRON_BINARY_DIR} ${CHROOT_LIBRARY_PATHS} ${CHROOT_CUDA_LIBRARY} ${CHROOT_GADGETRON_SOURCE_DIR} ${CHROOT_BASE} ${CHROOT_IMAGE_SIZE} ${CHROOT_SIEMENS_TO_ISMRMRD_EXE}
+
+    exit 0
+  else
+    echo -e "\nUsage:  $0 (gadgetron install prefix) (gadgetron binary dir) (LIBRARY_PATHS) (CHROOT_CUDA_LIBRARY) (gadgetron source dir) (SIEMENS_TO_ISMRMRD_EXE)\n"
+    exit 1
+  fi
+fi
diff --git a/chroot/create_chroot_base.sh b/chroot/create_chroot_base.sh
new file mode 100755
index 0000000..d8faaaa
--- /dev/null
+++ b/chroot/create_chroot_base.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+if [ $(id -u) -ne 0 ]; then
+    echo -e "\nPlease start the script as a root or sudo!\n"
+    exit 1
+else
+    if [ $# -ge 1 ]; then
+
+        # --ARGUMENTS-- (example)
+
+        # CHROOT_GADGETRON_BINARY_DIR:        /home/ubuntu/gadgetron/build
+
+        # -----------------------------------------------------------------------------------
+        # input parameters
+        # -----------------------------------------------------------------------------------
+
+        CHROOT_GADGETRON_BINARY_DIR=${1}
+        echo CHROOT_GADGETRON_BINARY_DIR: ${CHROOT_GADGETRON_BINARY_DIR}
+
+        rm -rf ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root
+        mkdir -p ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root
+        mkdir -p ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-backups
+
+        # install debian bootstrap
+        apt-get install debootstrap -y
+        debootstrap --variant=buildd --arch amd64 trusty ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root/gadgetron http://gb.archive.ubuntu.com/ubuntu/
+
+        # install python libraries
+        chroot ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root/gadgetron apt-get install software-properties-common python-dev python-twisted python-psutil python-numpy python-libxml2 -y
+        chroot ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root/gadgetron add-apt-repository "deb http://us.archive.ubuntu.com/ubuntu/ trusty main restricted multiverse universe"  
+        chroot ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root/gadgetron apt-get update  
+        chroot ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root/gadgetron apt-get install python-h5py libhdf5-serial-dev hdf5-tools python-pip -y 
+
+        TAR_FILE_NAME=gadgetron-base-`date '+%Y%m%d-%H%M'`
+
+        tar -zcf "${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-backups/${TAR_FILE_NAME}.tar.gz" --directory "${CHROOT_GADGETRON_BINARY_DIR}/chroot" --exclude=./chroot-root/gadgetron/var --exclude=./chroot-root/gadgetron/dev --exclude=./chroot-root/gadgetron/sys --exclude=./chroot-root/gadgetron/proc --exclude=./chroot-root/gadgetron/root ./chroot-root
+
+        rm -rf "${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root"
+
+        chmod 666 "${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-backups/${TAR_FILE_NAME}.tar.gz"
+        exit 0
+    else
+        echo -e "\nUsage:  $0 (gadgetron binary dir)\n"
+        exit 1
+    fi
+fi
diff --git a/chroot/create_chroot_from_base.sh b/chroot/create_chroot_from_base.sh
new file mode 100755
index 0000000..76da60a
--- /dev/null
+++ b/chroot/create_chroot_from_base.sh
@@ -0,0 +1,117 @@
+#!/bin/bash
+
+if [ $(id -u) -ne 0 ]; then
+    echo -e "\nPlease start the script as a root or sudo!\n"
+    exit 1
+else
+    if [ $# -ge 6 ]; then
+
+        # --ARGUMENTS-- (example)
+
+        # CHROOT_GADGETRON_INSTALL_PREFIX:    /usr/local
+        # CHROOT_GADGETRON_BINARY_DIR:        /home/ubuntu/gadgetron/build
+        # CHROOT_LIBRARY_PATHS:               /usr/local/lib:/usr/lib/x86_64-linux-gnu
+        # CHROOT_CUDA_LIBRARY:                
+        # CHROOT_GADGETRON_SOURCE_DIR:        /home/ubuntu/gadgetron
+        # CHROOT_GADGETRON_SOURCE_DIR:        gadgetron-base.tar.gz
+        # CHROOT_IMAGE_SIZE:                  1536
+
+        CHROOT_GADGETRON_INSTALL_PREFIX=${1}
+        echo CHROOT_GADGETRON_INSTALL_PREFIX: ${CHROOT_GADGETRON_INSTALL_PREFIX}
+
+        CHROOT_GADGETRON_BINARY_DIR=${2}
+        echo CHROOT_GADGETRON_BINARY_DIR: ${CHROOT_GADGETRON_BINARY_DIR}
+
+        CHROOT_LIBRARY_PATHS=${3}
+        echo CHROOT_LIBRARY_PATHS: ${CHROOT_LIBRARY_PATHS}
+
+        CHROOT_CUDA_LIBRARY=${4}
+        echo CHROOT_CUDA_LIBRARY: ${CHROOT_CUDA_LIBRARY}
+
+        CHROOT_GADGETRON_SOURCE_DIR=${5}
+        echo CHROOT_GADGETRON_SOURCE_DIR: ${CHROOT_GADGETRON_SOURCE_DIR}
+
+        CHROOT_BASE_NAME=${6}
+        echo CHROOT_BASE_NAME: ${CHROOT_BASE_NAME}
+
+        if [ $# -ge 7 ]; then
+            CHROOT_IMAGE_SIZE=${7}
+        else
+            CHROOT_IMAGE_SIZE=1536
+        fi
+        echo CHROOT_IMAGE_SIZE: ${CHROOT_IMAGE_SIZE}
+
+        # --------------------------------------------------------------------------------
+
+        # Add LIBRARY_PATHS to LD_LIBRARY_PATH
+        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${CHROOT_LIBRARY_PATHS}
+        export LC_ALL=C
+        echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}"
+
+        # untar the chroot base
+        rm -rf ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root
+        tar -xzf ${CHROOT_BASE_NAME} -C ${CHROOT_GADGETRON_BINARY_DIR}/chroot
+        sleep 3
+
+        touch ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root/source-manifest.txt
+
+        GADGETRON_INFO=${CHROOT_GADGETRON_INSTALL_PREFIX}/bin/gadgetron_info
+        if [ -f ${GADGETRON_INFO} ]; then
+          res=$(${GADGETRON_INFO})
+          re=".*-- Git SHA1           : ([0-9a-z]+).*"
+          if [[ $res =~ $re ]]; then 
+            CHROOT_GIT_SHA1_HASH=${BASH_REMATCH[1]}
+          fi
+        fi
+
+        echo "gadgetron    ${CHROOT_GIT_SHA1_HASH}" > ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root/source-manifest.txt
+        mkdir -p ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root/gadgetron
+        mkdir -p ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-backups
+
+        cd ${CHROOT_GADGETRON_BINARY_DIR}
+        make install DESTDIR="${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root/gadgetron" -j8
+
+        #This copies the SIEMENS_TO_ISMRMRD executable if it is installed
+        if [ $# -ge 8 ]; then
+          CHROOT_SIEMENS_TO_ISMRMRD_EXE=${8} 
+          echo CHROOT_SIEMENS_TO_ISMRMRD_EXE: ${CHROOT_SIEMENS_TO_ISMRMRD_EXE}
+          cp ${CHROOT_SIEMENS_TO_ISMRMRD_EXE} "${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root/gadgetron/${CHROOT_GADGETRON_INSTALL_PREFIX}/bin/"
+        else
+          echo "SIEMENS_TO_ISMRMRD_EXE not set"
+        fi
+
+        ${CHROOT_GADGETRON_SOURCE_DIR}/chroot/generate_gadgetron_root ${CHROOT_GADGETRON_INSTALL_PREFIX} ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root/gadgetron
+
+        cp ${CHROOT_CUDA_LIBRARY} ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root/gadgetron/${CHROOT_GADGETRON_INSTALL_PREFIX}/lib  
+        cp -n ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root/gadgetron${CHROOT_GADGETRON_INSTALL_PREFIX}/share/gadgetron/config/gadgetron.xml.example ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root/gadgetron${CHROOT_GADGETRON_INSTALL_PREFIX}/share/gadgetron/config/gadgetron.xml
+
+        ISMRMRD_PYTHON_FOLDER=${CHROOT_GADGETRON_INSTALL_PREFIX}/share/gadgetron/python/ismrmrd
+        if [ -d ${ISMRMRD_PYTHON_FOLDER} ]; then
+          chroot ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root/gadgetron pip install cython h5py pyxb
+          chroot ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root/gadgetron pip install --upgrade h5py
+          cp -rf ${ISMRMRD_PYTHON_FOLDER} "${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root/gadgetron/${CHROOT_GADGETRON_INSTALL_PREFIX}/share/gadgetron/python"
+        fi
+
+        TAR_FILE_NAME=gadgetron-`date '+%Y%m%d-%H%M'`-${CHROOT_GIT_SHA1_HASH:0:8}
+        IMAGE_FILE_NAME=${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-backups/${TAR_FILE_NAME}.img
+
+        tar -zcf "${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-backups/${TAR_FILE_NAME}.tar.gz" --directory "${CHROOT_GADGETRON_BINARY_DIR}/chroot" --exclude=./chroot-root/gadgetron/var --exclude=./chroot-root/gadgetron/dev --exclude=./chroot-root/gadgetron/sys --exclude=./chroot-root/gadgetron/proc --exclude=./chroot-root/gadgetron/root ./chroot-root
+
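+        # Create a sparse ext3 image of CHROOT_IMAGE_SIZE MB (bs * seek, count=0)
+        # and unpack the chroot tar file into it via a temporary loop mount.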
+        dd if=/dev/zero of=${IMAGE_FILE_NAME} bs=${CHROOT_IMAGE_SIZE}k seek=1024 count=0
+        mke2fs -F -t ext3 ${IMAGE_FILE_NAME}
+        mkdir ${CHROOT_GADGETRON_BINARY_DIR}/chroot/gadgetron_root
+        mount -o loop ${IMAGE_FILE_NAME} ${CHROOT_GADGETRON_BINARY_DIR}/chroot/gadgetron_root
+        tar -xzf ${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-backups/${TAR_FILE_NAME}.tar.gz -C ${CHROOT_GADGETRON_BINARY_DIR}/chroot/gadgetron_root/
+        sleep 3
+        umount ${CHROOT_GADGETRON_BINARY_DIR}/chroot/gadgetron_root
+        rmdir ${CHROOT_GADGETRON_BINARY_DIR}/chroot/gadgetron_root
+        rm -rf "${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-root"
+
+        chmod 666 "${CHROOT_GADGETRON_BINARY_DIR}/chroot/chroot-backups/${TAR_FILE_NAME}.tar.gz"
+        chmod 666 "${IMAGE_FILE_NAME}"
+        exit 0
+    else
+        echo -e "\nUsage:  $0 (gadgetron install prefix) (gadgetron binary dir) (LIBRARY_PATHS) (CHROOT_CUDA_LIBRARY) (gadgetron source dir) (chroot base name) (optional, image size) (optional, SIEMENS_TO_ISMRMRD_EXE)\n"
+        exit 1
+    fi
+fi
diff --git a/chroot/enter-chroot-env.sh.in b/chroot/enter-chroot-env.sh.in
new file mode 100644
index 0000000..5f79889
--- /dev/null
+++ b/chroot/enter-chroot-env.sh.in
@@ -0,0 +1,3 @@
+#!/bin/bash                                                                                                                                                 
+
+PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:@CMAKE_INSTALL_PREFIX@/bin LD_LIBRARY_PATH=@CMAKE_INSTALL_PREFIX@/lib:/usr/local/lib:/opt/intel/mkl/lib/intel64:/opt/intel/lib/intel64 chroot ./../
diff --git a/chroot/gadgetron-dependency-query.sh.in b/chroot/gadgetron-dependency-query.sh.in
new file mode 100644
index 0000000..2804020
--- /dev/null
+++ b/chroot/gadgetron-dependency-query.sh.in
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+if [ $# -eq 3 ]; then
+
+    GT_HOST=${1}
+    GT_PORT=${2}
+    QUERY_OUT=${3}
+
+    PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:@CMAKE_INSTALL_PREFIX@/bin LD_LIBRARY_PATH=@CMAKE_INSTALL_PREFIX@/lib:/usr/local/lib:/opt/intel/mkl/lib/intel64:/opt/intel/lib/intel64 @CMAKE_INSTALL_PREFIX@/bin/gtdependencyquery -h $GT_HOST -p $GT_PORT -o $QUERY_OUT
+    exit $?
+else
+    echo -e "\nUsage: $0 <Host> <port> <query out file>\n"
+    exit 1
+fi
+
+exit 0
diff --git a/chroot/gadgetron_chroot.conf b/chroot/gadgetron_chroot.conf
new file mode 100644
index 0000000..c8c8eec
--- /dev/null
+++ b/chroot/gadgetron_chroot.conf
@@ -0,0 +1,26 @@
+description "Webapp Upstart Script"
+version "1.0"
+author "Dusan Puletic (dusan.puletic at nih.gov)"
+
+start on (started network-interface and started networking) and filesystem and net-device-up IFACE!=lo
+stop on runlevel [!2345]
+
+expect fork
+
+# Log output to log file (/var/log/upstart/gadgetron_chroot.log)
+console log
+
+kill signal INT
+
+# Call a script that will mount the proc and start the webapp inside the chroot env
+script
+    exec su -c "/home/gadgetron_chroot/current/chroot-root/gadgetron/usr/local/share/gadgetron/chroot/start-webapp.sh /home/gadgetron_chroot/current/chroot-root/gadgetron" root &
+end script
+
+# Call a script that will unmount the proc before the service stops
+pre-stop script
+    exec /home/gadgetron_chroot/current/chroot-root/gadgetron/usr/local/share/gadgetron/chroot/stop.sh &
+end script
+
+# Respawn if process dies or is killed
+respawn
diff --git a/chroot/gadgetron_ismrmrd_client.sh.in b/chroot/gadgetron_ismrmrd_client.sh.in
new file mode 100644
index 0000000..a3cb4ce
--- /dev/null
+++ b/chroot/gadgetron_ismrmrd_client.sh.in
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+if [ $# -eq 4 ]; then
+
+    ISMRMRD_FILENAME=${1}
+    CONFIG_XML=${2}
+    GT_HOST=${3}
+    GT_PORT=${4}
+
+    PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:@CMAKE_INSTALL_PREFIX@/bin LD_LIBRARY_PATH=@CMAKE_INSTALL_PREFIX@/lib:/usr/local/lib:/opt/intel/mkl/lib/intel64:/opt/intel/lib/intel64 @CMAKE_INSTALL_PREFIX@/bin/gadgetron_ismrmrd_client -f $ISMRMRD_FILENAME -c $CONFIG_XML -a $GT_HOST -p $GT_PORT
+    exit $?
+else
+    echo -e "\nUsage: $0 <ismrmrd filename> <config filename> <host> <port>\n"
+    exit 1
+fi
+
+exit 0
diff --git a/chroot/generate_gadgetron_root b/chroot/generate_gadgetron_root
new file mode 100755
index 0000000..fb7b7de
--- /dev/null
+++ b/chroot/generate_gadgetron_root
@@ -0,0 +1,45 @@
+#!/usr/bin/perl                                                                                                                                              
+use FindBin;
+use lib $FindBin::Bin;
+
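+# For every binary in <gadgetron_root>/bin and every shared library in
+# <gadgetron_root>/lib under <new_root>, copy in any shared-library dependency
+# that is not yet present under <new_root>.
+#
+# Usage: generate_gadgetron_root <gadgetron_root> <new_root>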
+my $gadgetron_root = $ARGV[0];
+my $new_root = $ARGV[1];
+
+my $directory = $gadgetron_root . "/bin";
+
+opendir (DIR, $new_root . $directory) or die $!;
+while (my $file = readdir(DIR)) {
+    next if ($file =~ m/^\./);
+    my $deps = `$FindBin::Bin/get_dependencies_for_binary $new_root/$directory/$file`;
+    my @lines = split('\n', $deps);
+
+    foreach my $val (@lines) {
+	chomp($val);
+        if (not -e "$new_root/$val")
+        {
+            system("$FindBin::Bin/copy_file_and_dependencies $val $new_root $gadgetron_root");
+        }
+    }
+}
+closedir(DIR);
+
+
+$directory = $gadgetron_root . "/lib";
+
+opendir (DIR, $new_root . $directory) or die $!;
+while (my $file = readdir(DIR)) {
+    next unless ($file =~ m/\.so$/);
+
+    my $deps = `$FindBin::Bin/get_dependencies_for_binary $new_root/$directory/$file`;
+    my @lines = split('\n', $deps);
+
+    foreach my $val (@lines) 
+    {
+	chomp($val);
+        if (not -e "$new_root/$val")
+        {
+            system("$FindBin::Bin/copy_file_and_dependencies $val $new_root $gadgetron_root");
+        }
+    }
+}
+closedir(DIR);
diff --git a/chroot/get_dependencies_for_binary b/chroot/get_dependencies_for_binary
new file mode 100755
index 0000000..a956564
--- /dev/null
+++ b/chroot/get_dependencies_for_binary
@@ -0,0 +1,12 @@
+#!/usr/bin/perl
+
+use strict;
+
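+# Print the absolute path of every shared library the given binary or library
+# links against, one per line, as reported by ldd.
+#
+# Usage: get_dependencies_for_binary <binary or shared library>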
+my $o = `ldd $ARGV[0]`;
+my @lines = split('\n', $o);
+
+foreach my $val (@lines) {
+    if ($val =~ m/=> \/([^\s]*)/) {
+	print "/$1\n";
+    }
+}
diff --git a/chroot/gt_alive.sh.in b/chroot/gt_alive.sh.in
new file mode 100644
index 0000000..0753200
--- /dev/null
+++ b/chroot/gt_alive.sh.in
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+if [ $# -eq 2 ]; then
+
+    GT_HOST=${1}
+    GT_PORT=${2}
+
+    PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:@CMAKE_INSTALL_PREFIX@/bin LD_LIBRARY_PATH=@CMAKE_INSTALL_PREFIX@/lib:/usr/local/lib:/opt/intel/mkl/lib/intel64:/opt/intel/lib/intel64 @CMAKE_INSTALL_PREFIX@/bin/gt_alive $GT_HOST $GT_PORT
+    exit $?
+else
+    echo -e "\nUsage: $0 <host> <port>\n"
+    exit 1
+fi
+
+exit 0
diff --git a/chroot/install_chroot_image.sh b/chroot/install_chroot_image.sh
new file mode 100755
index 0000000..8cefb16
--- /dev/null
+++ b/chroot/install_chroot_image.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+
+if [ $(id -u) -ne 0 ]; then 
+ 	echo -e "\nPlease start the script as a root or sudo!\n"
+ 	exit 1
+
+else
+
+	BASEDIR=$(dirname $0)
+
+	install_img=0
+
+	if [ $# -eq 1 ]; then
+		CHROOT_IMAGE_FILENAME=${1}
+		CHROOT_INSTALL_PATH=/home/gadgetron_chroot
+	else
+		if [ $# -eq 2 ]; then
+			CHROOT_IMAGE_FILENAME=${1}
+			CHROOT_INSTALL_PATH=${2}
+		else
+			if [ $# -eq 3 ]; then
+				if [ ${2} == "latest" ]; then
+					TAR_NAME=`find ${1} -type f -name 'gadgetron-*.tar.gz' |sort -r |head -n1`
+					CHROOT_IMAGE_FILENAME=${TAR_NAME}
+				else
+					CHROOT_IMAGE_FILENAME=${1}/${2}			
+				fi
+				CHROOT_INSTALL_PATH=${3}
+			else
+				if [ $# -eq 4 ]; then
+					if [ ${2} == "latest" ]; then
+						TAR_NAME=`find ${1} -type f -name 'gadgetron-*.tar.gz' |sort -r |head -n1`
+						CHROOT_IMAGE_FILENAME=${TAR_NAME}
+					else
+						CHROOT_IMAGE_FILENAME=${1}/${2}			
+					fi
+					CHROOT_INSTALL_PATH=${3}
+
+					if [ ${4} -eq 1 ]; then
+						install_img=1
+						IMG_NAME=`find ${1} -type f -name 'gadgetron-*.img' |sort -r |head -n1`
+						CHROOT_IMAGE_IMG_FILENAME=${IMG_NAME}
+					fi
+				else
+					echo -e "\nUsage 1, install chroot image to /home/gadgetron_chroot: $0 chroot_image_file"
+				  	echo -e "\nUsage 2, install chroot image to selected install path: $0 chroot_image_file chroot_install_path"
+				  	echo -e "\nUsage 3: $0 chroot_image_path chroot_image_name chroot_install_path"
+				  	echo -e "\n           install chroot image to the selected install path; if chroot_image_name=latest, the newest chroot image in the folder will be installed"
+				  	echo -e "\nUsage 4: $0 chroot_image_path chroot_image_name chroot_install_path install_img"
+				  	echo -e "\n           like Usage 3; if install_img=1, the corresponding .img package will also be copied to chroot_install_path"
+				  	exit 1
+				fi  
+			fi  
+		fi  
+	fi
+
+  	service gadgetron_chroot stop
+
+	echo CHROOT_IMAGE_FILENAME=${CHROOT_IMAGE_FILENAME}
+	echo CHROOT_INSTALL_PATH=${CHROOT_INSTALL_PATH}
+
+	mkdir -p ${CHROOT_INSTALL_PATH}
+
+	cp -rf ${CHROOT_IMAGE_FILENAME} ${CHROOT_INSTALL_PATH}/
+
+	FILENAME_WITH_EXTENSION=${CHROOT_IMAGE_FILENAME##*/}
+	FILENAME=${FILENAME_WITH_EXTENSION%.*}
+	FILENAME=${FILENAME%.*}
+	echo ${FILENAME}
+
+	mkdir ${CHROOT_INSTALL_PATH}/${FILENAME}
+
+	echo untar ${CHROOT_INSTALL_PATH}/${FILENAME_WITH_EXTENSION} ... 
+
+	tar -xzf ${CHROOT_INSTALL_PATH}/${FILENAME_WITH_EXTENSION} --directory="${CHROOT_INSTALL_PATH}/${FILENAME}" .
+
+	rm -f ${CHROOT_INSTALL_PATH}/current
+
+	ln -s ${CHROOT_INSTALL_PATH}/${FILENAME} ${CHROOT_INSTALL_PATH}/current
+
+	cp -f ${CHROOT_INSTALL_PATH}/current/chroot-root/gadgetron/usr/local/share/gadgetron/chroot/gadgetron_chroot.conf /etc/init/
+
+	if [ ${install_img} -eq 1 ]; then
+                echo "copy image file : ${CHROOT_IMAGE_IMG_FILENAME} ... "		
+		cp -f ${CHROOT_IMAGE_IMG_FILENAME} ${CHROOT_INSTALL_PATH}/
+	fi
+
+	service gadgetron_chroot start
+
+	exit 0
+fi
diff --git a/chroot/make_list_of_dependencies b/chroot/make_list_of_dependencies
new file mode 100755
index 0000000..ed8f3bd
--- /dev/null
+++ b/chroot/make_list_of_dependencies
@@ -0,0 +1,25 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+use FindBin;
+use lib $FindBin::Bin;
+
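+# Print the shared-library dependencies of every binary in <gadgetron_root>/bin
+# and every .so in <gadgetron_root>/lib, using get_dependencies_for_binary.
+#
+# Usage: make_list_of_dependencies <gadgetron_root>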
+my $directory = $ARGV[0] . "/bin";
+
+opendir (DIR, $directory) or die $!;
+while (my $file = readdir(DIR)) {
+    next if ($file =~ m/^\./);
+    system("$FindBin::Bin/get_dependencies_for_binary $ARGV[0]/bin/$file\n");
+}
+closedir(DIR);
+
+
+$directory = $ARGV[0] . "/lib";
+
+opendir (DIR, $directory) or die $!;
+while (my $file = readdir(DIR)) {
+    next unless ($file =~ m/\.so$/);
+    system("$FindBin::Bin/get_dependencies_for_binary $ARGV[0]/lib/$file\n");
+}
+closedir(DIR);
diff --git a/chroot/mount.sh b/chroot/mount.sh
new file mode 100755
index 0000000..9559948
--- /dev/null
+++ b/chroot/mount.sh
@@ -0,0 +1,64 @@
+#!/bin/bash                                                                                                                                    
+
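+# mount_safe <mount point> <source dir> [sentinel file]
+# Bind-mounts <source dir> at <mount point> (creating the mount point if
+# needed) and polls for up to ~20 seconds (100 tries x 0.2 s) until the mount
+# is active and, if given, the sentinel file is visible inside it.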
+function mount_safe {
+  MOUNT_POINT=$1
+  MOUNT_DIR=$2
+  mkdir -p $MOUNT_POINT
+  if find $MOUNT_POINT -maxdepth 0 -empty | read v; then
+    mount --bind $MOUNT_DIR $MOUNT_POINT
+    MOUNT_READY=0
+    MOUNT_TRY=0
+    MAX_MOUNT_TRY=100
+
+    if [ $# -eq 3 ]; then
+      MOUNT_FILE=$3
+      while [ ${MOUNT_READY} -eq 0 ]; do
+        if mountpoint -q ${MOUNT_POINT} && [ -e ${MOUNT_POINT}/${MOUNT_FILE} ]; then
+          MOUNT_READY=1
+        else
+          sleep 0.2
+          let MOUNT_TRY++
+          if [ $MOUNT_TRY -eq $MAX_MOUNT_TRY ]; then
+	          MOUNT_READY=1
+          fi  
+        fi
+      done
+    else
+      while [ ${MOUNT_READY} -eq 0 ]; do
+        if mountpoint -q ${MOUNT_POINT}; then
+          MOUNT_READY=1
+        else
+          sleep 0.2
+          let MOUNT_TRY++
+          if [ $MOUNT_TRY -eq $MAX_MOUNT_TRY ]; then
+	          MOUNT_READY=1
+          fi
+        fi
+      done  
+    fi
+  fi
+}
+
+if [ $(id -u) -ne 0 ]; then
+ echo -e "\nPlease start the script as a root or sudo!\n"
+ exit 1
+
+else
+ BASEDIR=$(dirname $0)
+
+ if [ $# -eq 1 ]; then
+
+  CHROOT_DIR=${1}
+
+  mount_safe "${CHROOT_DIR}/proc" /proc self/exe
+  mount_safe "${CHROOT_DIR}/dev" /dev
+  mount_safe "${CHROOT_DIR}/sys" /sys
+
+  exit 0
+
+ else
+  echo -e "\nUsage: $0 (chrootdir)\n"
+  exit 1
+ fi
+
+fi
diff --git a/chroot/mount_image.sh b/chroot/mount_image.sh
new file mode 100755
index 0000000..b4d938d
--- /dev/null
+++ b/chroot/mount_image.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+
+function mount_safe {
+  MOUNT_POINT=$1
+  MOUNT_DIR=$2
+
+  echo  $MOUNT_POINT
+  mkdir -p $MOUNT_POINT
+  if find $MOUNT_POINT -maxdepth 0 -empty | read v; then
+    mount --bind $MOUNT_DIR $MOUNT_POINT
+    MOUNT_READY=0
+    MOUNT_TRY=0
+    MAX_MOUNT_TRY=100
+
+    if [ $# -eq 3 ]; then
+      MOUNT_FILE=$3
+      while [ ${MOUNT_READY} -eq 0 ]; do
+        if mountpoint -q ${MOUNT_POINT} && [ -e ${MOUNT_POINT}/${MOUNT_FILE} ]; then
+          MOUNT_READY=1
+        else
+          sleep 0.2
+          let MOUNT_TRY++
+          if [ $MOUNT_TRY -eq $MAX_MOUNT_TRY ]; then
+	          MOUNT_READY=1
+          fi  
+        fi
+      done
+    else
+      while [ ${MOUNT_READY} -eq 0 ]; do
+        if mountpoint -q ${MOUNT_POINT}; then
+          MOUNT_READY=1
+        else
+          sleep 0.2
+          let MOUNT_TRY++
+          if [ $MOUNT_TRY -eq $MAX_MOUNT_TRY ]; then
+	          MOUNT_READY=1
+          fi
+        fi
+      done  
+    fi
+  fi
+}
+
+if [ $(id -u) -ne 0 ]; then
+  echo -e "\nPlease start the script as a root or sudo!\n"
+  exit 1
+else
+  if [ $# -eq 2 ]; then
+
+    FULL_PATH_TO_IMG_FILE=${1}
+    GLOBAL_MOUNT_POINT=${2}
+
+    mkdir -p ${GLOBAL_MOUNT_POINT}
+    mount -o loop ${FULL_PATH_TO_IMG_FILE} ${GLOBAL_MOUNT_POINT}
+    sleep 0.2
+
+    MOUNT_READY=0
+    MOUNT_TRY=0
+    MAX_MOUNT_TRY=100
+    while [ ${MOUNT_READY} -eq 0 ]; do
+      if mountpoint -q ${GLOBAL_MOUNT_POINT} && [ -e ${GLOBAL_MOUNT_POINT}/chroot-root/gadgetron/usr/local/share/gadgetron/chroot/start.sh ]; then
+          MOUNT_READY=1
+      else
+          sleep 0.2
+          let MOUNT_TRY++
+          if [ $MOUNT_TRY -eq $MAX_MOUNT_TRY ]; then
+		          MOUNT_READY=1
+			    exit 1
+	      fi
+      fi
+    done
+
+    if mountpoint -q ${GLOBAL_MOUNT_POINT}; then
+        mount_safe "${GLOBAL_MOUNT_POINT}/chroot-root/gadgetron/proc" /proc self/exe
+	      mount_safe "${GLOBAL_MOUNT_POINT}/chroot-root/gadgetron/dev" /dev null
+        mount_safe "${GLOBAL_MOUNT_POINT}/chroot-root/gadgetron/sys" /sys kernel
+    else
+      exit 1
+    fi
+    exit 0
+  else
+    echo -e "\nUsage: $0 <full path to img file> <mount point>\n"
+    exit 1
+  fi
+fi
diff --git a/chroot/mount_image.sh.in b/chroot/mount_image.sh.in
new file mode 100644
index 0000000..896c722
--- /dev/null
+++ b/chroot/mount_image.sh.in
@@ -0,0 +1,85 @@
+#!/bin/bash
+
+function mount_safe {
+  MOUNT_POINT=$1
+  MOUNT_DIR=$2
+
+  echo  $MOUNT_POINT
+  mkdir -p $MOUNT_POINT
+  if find $MOUNT_POINT -maxdepth 0 -empty | read v; then
+    mount --bind $MOUNT_DIR $MOUNT_POINT
+    MOUNT_READY=0
+    MOUNT_TRY=0
+    MAX_MOUNT_TRY=100
+
+    if [ $# -eq 3 ]; then
+      MOUNT_FILE=$3
+      while [ ${MOUNT_READY} -eq 0 ]; do
+        if mountpoint -q ${MOUNT_POINT} && [ -e ${MOUNT_POINT}/${MOUNT_FILE} ]; then
+          MOUNT_READY=1
+        else
+          sleep 0.2
+          let MOUNT_TRY++
+          if [ $MOUNT_TRY -eq $MAX_MOUNT_TRY ]; then
+	          MOUNT_READY=1
+          fi  
+        fi
+      done
+    else
+      while [ ${MOUNT_READY} -eq 0 ]; do
+        if mountpoint -q ${MOUNT_POINT}; then
+          MOUNT_READY=1
+        else
+          sleep 0.2
+          let MOUNT_TRY++
+          if [ $MOUNT_TRY -eq $MAX_MOUNT_TRY ]; then
+	          MOUNT_READY=1
+          fi
+        fi
+      done  
+    fi
+  fi
+}
+
+if [ $(id -u) -ne 0 ]; then
+  echo -e "\nPlease start the script as a root or sudo!\n"
+  exit 1
+else
+  if [ $# -eq 2 ]; then
+
+    FULL_PATH_TO_IMG_FILE=${1}
+    GLOBAL_MOUNT_POINT=${2}
+
+    mkdir -p ${GLOBAL_MOUNT_POINT}
+    mount -o loop ${FULL_PATH_TO_IMG_FILE} ${GLOBAL_MOUNT_POINT}
+    sleep 0.2
+
+    MOUNT_READY=0
+    MOUNT_TRY=0
+    MAX_MOUNT_TRY=100
+    while [ ${MOUNT_READY} -eq 0 ]; do
+      if mountpoint -q ${GLOBAL_MOUNT_POINT} && [ -e ${GLOBAL_MOUNT_POINT}/chroot-root/gadgetron@CMAKE_INSTALL_PREFIX@/@GADGETRON_INSTALL_CHROOT_SCRIPTS_PATH@/start.sh ]; then
+          MOUNT_READY=1
+      else
+          sleep 0.2
+          let MOUNT_TRY++
+          if [ $MOUNT_TRY -eq $MAX_MOUNT_TRY ]; then
+		          MOUNT_READY=1
+			    exit 1
+	      fi
+      fi
+    done
+
+    if mountpoint -q ${GLOBAL_MOUNT_POINT}; then
+        mount_safe "${GLOBAL_MOUNT_POINT}/chroot-root/gadgetron/proc" /proc self/exe
+	mount_safe "${GLOBAL_MOUNT_POINT}/chroot-root/gadgetron/dev" /dev null
+        mount_safe "${GLOBAL_MOUNT_POINT}/chroot-root/gadgetron/sys" /sys kernel
+    else
+      exit 1
+    fi
+    exit 0
+  else
+    echo -e "\nUsage: $0 <full path to img file> <mount point>\n"
+    exit 1
+  fi
+fi
diff --git a/chroot/run-gadgetron-dependency-query.sh b/chroot/run-gadgetron-dependency-query.sh
new file mode 100755
index 0000000..5909a7f
--- /dev/null
+++ b/chroot/run-gadgetron-dependency-query.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+BASEDIR=$(dirname $0)
+
+if [ $(id -u) -ne 0 ]; then
+  echo -e "\nPlease start the script as a root or sudo!\n"
+  exit 1
+else
+  if [ $# -ge 4 ]; then
+
+    MOUNT_POINT=${1}
+    GT_HOST=${2}
+    GT_PORT=${3}
+    QUERY_OUT=${4}
+
+    if [ $# -eq 5 ]; then
+        FULL_PATH_TO_IMG_FILE=${5}
+
+        if find "${MOUNT_POINT}/chroot-root/gadgetron" -maxdepth 0 -empty | read v; then
+            mkdir -p ${MOUNT_POINT}
+            mount -o loop ${FULL_PATH_TO_IMG_FILE} ${MOUNT_POINT}
+        fi
+    fi
+
+    chroot ${MOUNT_POINT}/chroot-root/gadgetron /gadgetron-dependency-query.sh $GT_HOST $GT_PORT $QUERY_OUT
+    exit $?
+  else
+    echo -e "\nUsage: $0 <mount point> <Host> <port> <query out file> <optional: full path to img file>\n"
+    exit 1
+  fi
+fi
diff --git a/chroot/run-gadgetron-dependency-query.sh.in b/chroot/run-gadgetron-dependency-query.sh.in
new file mode 100644
index 0000000..a9ed81b
--- /dev/null
+++ b/chroot/run-gadgetron-dependency-query.sh.in
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+BASEDIR=$(dirname $0)
+
+if [ $(id -u) -ne 0 ]; then
+  echo -e "\nPlease start the script as a root or sudo!\n"
+  exit 1
+else
+  if [ $# -ge 4 ]; then
+
+    MOUNT_POINT=${1}
+    GT_HOST=${2}
+    GT_PORT=${3}
+    QUERY_OUT=${4}
+
+    if [ $# -eq 5 ]; then
+        FULL_PATH_TO_IMG_FILE=${5}
+
+        if find "${MOUNT_POINT}/chroot-root/gadgetron" -maxdepth 0 -empty | read v; then
+            mkdir -p ${MOUNT_POINT}
+            mount -o loop ${FULL_PATH_TO_IMG_FILE} ${MOUNT_POINT}
+        fi
+    fi
+
+    chroot ${MOUNT_POINT}/chroot-root/gadgetron @CMAKE_INSTALL_PREFIX@/@GADGETRON_INSTALL_CHROOT_SCRIPTS_PATH@/gadgetron-dependency-query.sh $GT_HOST $GT_PORT $QUERY_OUT
+    exit $?
+  else
+    echo -e "\nUsage: $0 <mount point> <Host> <port> <query out file> <optional: full path to img file>\n"
+    exit 1
+  fi
+fi
diff --git a/chroot/run-gadgetron_ismrmrd_client.sh b/chroot/run-gadgetron_ismrmrd_client.sh
new file mode 100755
index 0000000..eece810
--- /dev/null
+++ b/chroot/run-gadgetron_ismrmrd_client.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+BASEDIR=$(dirname $0)
+
+if [ $(id -u) -ne 0 ]; then
+  echo -e "\nPlease start the script as a root or sudo!\n"
+  exit 1
+else
+  if [ $# -ge 5 ]; then
+
+    MOUNT_POINT=${1}
+    ISMRMRD_FILENAME=${2}
+    CONFIG_XML=${3}
+    GT_HOST=${4}
+    GT_PORT=${5}
+
+    if [ $# -eq 6 ]; then
+        FULL_PATH_TO_IMG_FILE=${6}
+
+        if find "${MOUNT_POINT}/chroot-root/gadgetron" -maxdepth 0 -empty | read v; then
+            mkdir -p ${MOUNT_POINT}
+            mount -o loop ${FULL_PATH_TO_IMG_FILE} ${MOUNT_POINT}
+        fi
+    fi
+
+    chroot ${MOUNT_POINT}/chroot-root/gadgetron /gadgetron_ismrmrd_client.sh $ISMRMRD_FILENAME $CONFIG_XML $GT_HOST $GT_PORT
+    exit $?
+  else
+    echo -e "\nUsage: $0 <mount point> <ismrmrd filename> <config filename> <host> <port> <optional: full path to img file>\n"
+    exit 1
+  fi
+fi
diff --git a/chroot/run-gadgetron_ismrmrd_client.sh.in b/chroot/run-gadgetron_ismrmrd_client.sh.in
new file mode 100644
index 0000000..154e85f
--- /dev/null
+++ b/chroot/run-gadgetron_ismrmrd_client.sh.in
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+BASEDIR=$(dirname $0)
+
+if [ $(id -u) -ne 0 ]; then
+  echo -e "\nPlease start the script as a root or sudo!\n"
+  exit 1
+else
+  if [ $# -ge 5 ]; then
+
+    MOUNT_POINT=${1}
+    ISMRMRD_FILENAME=${2}
+    CONFIG_XML=${3}
+    GT_HOST=${4}
+    GT_PORT=${5}
+
+    if [ $# -eq 6 ]; then
+        FULL_PATH_TO_IMG_FILE=${6}
+
+        if find "${MOUNT_POINT}/chroot-root/gadgetron" -maxdepth 0 -empty | read v; then
+            mkdir -p ${MOUNT_POINT}
+            mount -o loop ${FULL_PATH_TO_IMG_FILE} ${MOUNT_POINT}
+        fi
+    fi
+
+    chroot ${MOUNT_POINT}/chroot-root/gadgetron @CMAKE_INSTALL_PREFIX@/@GADGETRON_INSTALL_CHROOT_SCRIPTS_PATH@/gadgetron_ismrmrd_client.sh $ISMRMRD_FILENAME $CONFIG_XML $GT_HOST $GT_PORT
+    exit $?
+  else
+    echo -e "\nUsage: $0 <mount point> <ismrmrd filename> <config filename> <host> <port> <optional: full path to img file>\n"
+    exit 1
+  fi
+fi
diff --git a/chroot/run-gt_alive.sh b/chroot/run-gt_alive.sh
new file mode 100755
index 0000000..934b8d1
--- /dev/null
+++ b/chroot/run-gt_alive.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+start_gadgetron_image_job=0
+BASEDIR=$(dirname $0)
+
+trap '(($start_gadgetron_image_job == 0)) || ((`kill -0 $start_gadgetron_image_job`)) || kill $start_gadgetron_image_job & while kill -0 $start_gadgetron_image_job 2>/dev/null; do sleep 1; done' HUP TERM INT
+
+if [ $(id -u) -ne 0 ]; then
+  echo -e "\nPlease start the script as a root or sudo!\n"
+  exit 1
+else
+  if [ $# -ge 3 ]; then
+
+    MOUNT_POINT=${1}
+    HOSTNAME=${2}
+    PORT=${3}
+
+    if [ $# -eq 4 ]; then
+        FULL_PATH_TO_IMG_FILE=${4}
+
+        if find "${MOUNT_POINT}/chroot-root/gadgetron" -maxdepth 0 -empty | read v; then
+            mkdir -p ${MOUNT_POINT}
+            mount -o loop ${FULL_PATH_TO_IMG_FILE} ${MOUNT_POINT}
+        fi
+    fi
+
+    chroot ${MOUNT_POINT}/chroot-root/gadgetron /gt_alive.sh $HOSTNAME $PORT &
+	PID=$!
+	(sleep 2 && kill -9 $PID) &
+	waiter=$!
+	wait $PID
+	status=$?
+	echo "gt_alive exit code : $status"
+	kill -9 $waiter 2>/dev/null
+	completeJob=$?
+	
+	if [ $completeJob -eq 0 ]; then
+	    echo "gt_alive is completed properly"
+	    exit $status
+	else
+	    exit 1
+    fi
+	
+    exit $?
+  else
+    echo -e "\nUsage: $0 <mount point> <host> <port> <optional: full path to img file>\n"
+    exit 1
+  fi
+fi
diff --git a/chroot/run-gt_alive.sh.in b/chroot/run-gt_alive.sh.in
new file mode 100644
index 0000000..b48b387
--- /dev/null
+++ b/chroot/run-gt_alive.sh.in
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+start_gadgetron_image_job=0
+BASEDIR=$(dirname $0)
+
+trap '(($start_gadgetron_image_job == 0)) || ((`kill -0 $start_gadgetron_image_job`)) || kill $start_gadgetron_image_job & while kill -0 $start_gadgetron_image_job 2>/dev/null; do sleep 1; done' HUP TERM INT
+
+if [ $(id -u) -ne 0 ]; then
+  echo -e "\nPlease start the script as a root or sudo!\n"
+  exit 1
+else
+  if [ $# -ge 3 ]; then
+
+    MOUNT_POINT=${1}
+    HOSTNAME=${2}
+    PORT=${3}
+
+    if [ $# -eq 4 ]; then
+        FULL_PATH_TO_IMG_FILE=${4}
+
+        if find "${MOUNT_POINT}/chroot-root/gadgetron" -maxdepth 0 -empty | read v; then
+            mkdir -p ${MOUNT_POINT}
+            mount -o loop ${FULL_PATH_TO_IMG_FILE} ${MOUNT_POINT}
+        fi
+    fi
+
+    chroot ${MOUNT_POINT}/chroot-root/gadgetron @CMAKE_INSTALL_PREFIX@/@GADGETRON_INSTALL_CHROOT_SCRIPTS_PATH@/gt_alive.sh $HOSTNAME $PORT &
+	PID=$!
+	(sleep 2 && kill -9 $PID) &
+	waiter=$!
+	wait $PID
+	status=$?
+	echo "gt_alive exit code : $status"
+	kill -9 $waiter 2>/dev/null
+	completeJob=$?
+	
+	if [ $completeJob -eq 0 ]; then
+	    echo "gt_alive is completed properly"
+	    exit $status
+	else
+	    exit 1
+    fi
+	
+    exit $?
+  else
+    echo -e "\nUsage: $0 <mount point> <host> <port> <optional: full path to img file>\n"
+    exit 1
+  fi
+fi
diff --git a/chroot/run-siemens_to_ismrmrd.sh b/chroot/run-siemens_to_ismrmrd.sh
new file mode 100755
index 0000000..326bccd
--- /dev/null
+++ b/chroot/run-siemens_to_ismrmrd.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+BASEDIR=$(dirname $0)
+
+if [ $(id -u) -ne 0 ]; then
+  echo -e "\nPlease start the script as a root or sudo!\n"
+  exit 1
+else
+  if [ $# -ge 4 ]; then
+    
+    MOUNT_POINT=${1}
+    DAT_FILENAME=${2}
+    ISMRMRD_FILENAME=${3}
+    SCAN_NO=${4}
+
+    if [ $# -eq 5 ]; then
+        FULL_PATH_TO_IMG_FILE=${5}
+
+        if find "${MOUNT_POINT}/chroot-root/gadgetron" -maxdepth 0 -empty | read v; then
+            mkdir -p ${MOUNT_POINT}
+            mount -o loop ${FULL_PATH_TO_IMG_FILE} ${MOUNT_POINT}
+        fi
+    fi
+
+    chroot ${MOUNT_POINT}/chroot-root/gadgetron /siemens_to_ismrmrd.sh $DAT_FILENAME $ISMRMRD_FILENAME $SCAN_NO
+    exit $?
+  else
+    echo -e "\nUsage: $0 <mount point> <dat filename> <ismrmrd filename> <scan number> <optional: full path to img file>\n"
+    exit 1
+  fi
+fi
diff --git a/chroot/run-siemens_to_ismrmrd.sh.in b/chroot/run-siemens_to_ismrmrd.sh.in
new file mode 100644
index 0000000..cb66514
--- /dev/null
+++ b/chroot/run-siemens_to_ismrmrd.sh.in
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+BASEDIR=$(dirname $0)
+
+if [ $(id -u) -ne 0 ]; then
+  echo -e "\nPlease start the script as a root or sudo!\n"
+  exit 1
+else
+  if [ $# -ge 4 ]; then
+    
+    MOUNT_POINT=${1}
+    DAT_FILENAME=${2}
+    ISMRMRD_FILENAME=${3}
+    SCAN_NO=${4}
+
+    if [ $# -eq 5 ]; then
+        FULL_PATH_TO_IMG_FILE=${5}
+
+        if find "${MOUNT_POINT}/chroot-root/gadgetron" -maxdepth 0 -empty | read v; then
+            mkdir -p ${MOUNT_POINT}
+            mount -o loop ${FULL_PATH_TO_IMG_FILE} ${MOUNT_POINT}
+        fi
+    fi
+
+    chroot ${MOUNT_POINT}/chroot-root/gadgetron @CMAKE_INSTALL_PREFIX@/@GADGETRON_INSTALL_CHROOT_SCRIPTS_PATH@/siemens_to_ismrmrd.sh $DAT_FILENAME $ISMRMRD_FILENAME $SCAN_NO
+    exit $?
+  else
+    echo -e "\nUsage: $0 <mount point> <dat filename> <ismrmrd filename> <scan number> <optional: full path to img file>\n"
+    exit 1
+  fi
+fi
diff --git a/chroot/run-webapp.sh.in b/chroot/run-webapp.sh.in
new file mode 100644
index 0000000..160cac8
--- /dev/null
+++ b/chroot/run-webapp.sh.in
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+start_gadgetron_job=0
+trap '(($start_gadgetron_job == 0)) || ((`kill -0 $start_gadgetron_job`)) || kill -SIGINT $start_gadgetron_job & while kill -0 $start_gadgetron_job 2>/dev/null; do sleep 1; done' HUP TERM INT
+
+PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:@CMAKE_INSTALL_PREFIX@/bin LD_LIBRARY_PATH=@CMAKE_INSTALL_PREFIX@/lib:/usr/local/lib:/opt/intel/mkl/lib/intel64:/opt/intel/lib/intel64 python @CMAKE_INSTALL_PREFIX@/bin/gadgetron_web_app.py @CMAKE_INSTALL_PREFIX@/share/gadgetron/config/gadgetron_web_app.cfg &
+start_gadgetron_job=$!
+wait $!
+exit 0
diff --git a/chroot/siemens_to_ismrmrd.sh.in b/chroot/siemens_to_ismrmrd.sh.in
new file mode 100644
index 0000000..0edd4f2
--- /dev/null
+++ b/chroot/siemens_to_ismrmrd.sh.in
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+if [ $# -eq 3 ]; then
+
+    DAT_FILENAME=${1}
+    ISMRMRD_FILENAME=${2}
+    SCAN_NO=${3}
+
+    PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:@CMAKE_INSTALL_PREFIX@/bin LD_LIBRARY_PATH=@CMAKE_INSTALL_PREFIX@/lib:/usr/local/lib:/opt/intel/mkl/lib/intel64:/opt/intel/lib/intel64 /usr/local/bin/siemens_to_ismrmrd -f $DAT_FILENAME -o $ISMRMRD_FILENAME -z $SCAN_NO
+    exit $?
+else
+    echo -e "\nUsage: $0 <dat filename> <ismrmrd filename> <scan number>\n"
+    exit 1
+fi
+
+exit 0
diff --git a/chroot/start-env.sh b/chroot/start-env.sh
new file mode 100755
index 0000000..4b93919
--- /dev/null
+++ b/chroot/start-env.sh
@@ -0,0 +1,18 @@
+#!/bin/bash                                                                                                                                                  
+if [ $(id -u) -ne 0 ]; then
+ echo -e "\nPlease start the script as a root or sudo!\n"
+ exit 1
+
+else
+ BASEDIR=$(dirname $0)
+
+ if [ $# -eq 0 ]; then
+  $BASEDIR/mount.sh $BASEDIR
+  chroot $BASEDIR/gadgetron /enter-chroot-env.sh
+  exit 0
+
+ else
+  echo -e "\nUsage: $0\n"
+  exit 1
+ fi
+fi
diff --git a/chroot/start-env.sh.in b/chroot/start-env.sh.in
new file mode 100644
index 0000000..2f04a2c
--- /dev/null
+++ b/chroot/start-env.sh.in
@@ -0,0 +1,21 @@
+#!/bin/bash                                                                                                                                                  
+if [ $(id -u) -ne 0 ]; then
+ echo -e "\nPlease start the script as a root or sudo!\n"
+ exit 1
+
+else
+ BASEDIR=$(dirname $0)
+
+ if [ $# -eq 1 ]; then
+
+  CHROOT_DIR=${1}
+  
+  $BASEDIR/mount.sh $CHROOT_DIR
+  chroot $CHROOT_DIR/ @CMAKE_INSTALL_PREFIX@/@GADGETRON_INSTALL_CHROOT_SCRIPTS_PATH@/enter-chroot-env.sh
+  exit 0
+
+ else
+  echo -e "\nUsage: $0 (chroot dir)\n"
+  exit 1
+ fi
+fi
diff --git a/chroot/start-gadgetron-from-image.sh b/chroot/start-gadgetron-from-image.sh
new file mode 100755
index 0000000..e4d9c3e
--- /dev/null
+++ b/chroot/start-gadgetron-from-image.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+start_gadgetron_image_job=0
+BASEDIR=$(dirname $0)
+
+trap '(($start_gadgetron_image_job == 0)) || ((`kill -0 $start_gadgetron_image_job`)) || kill $start_gadgetron_image_job & while kill -0 $start_gadgetron_image_job 2>/dev/null; do sleep 1; done' HUP TERM INT
+
+if [ $(id -u) -ne 0 ]; then
+  echo -e "\nPlease start the script as a root or sudo!\n"
+  exit 1
+else
+  if [ $# -eq 2 ]; then
+
+    FULL_PATH_TO_IMG_FILE=${1}
+    MOUNT_POINT=${2}
+
+    mkdir -p ${MOUNT_POINT}
+    mount -o loop ${FULL_PATH_TO_IMG_FILE} ${MOUNT_POINT}
+    sleep 1
+
+    MOUNT_READY=0
+    MOUNT_TRY=0
+    MAX_MOUNT_TRY=100
+    while [ ${MOUNT_READY} -eq 0 ]; do
+      if mountpoint -q ${MOUNT_POINT}; then
+          MOUNT_READY=1
+      else
+          sleep 0.2
+          let MOUNT_TRY++
+          if [ $MOUNT_TRY -eq $MAX_MOUNT_TRY ]; then
+		      MOUNT_READY=1
+	      fi
+      fi
+    done
+
+    if mountpoint -q ${MOUNT_POINT}; then
+	  ${MOUNT_POINT}/chroot-root/start.sh &
+	  start_gadgetron_image_job=($!)
+	  wait $!
+	  $BASEDIR/umount_image.sh ${MOUNT_POINT}
+    else
+      exit 1
+    fi
+    exit 0
+  else
+    echo -e "\nUsage: $0 <full path to img file> <mount point>\n"
+    exit 1
+  fi
+fi
diff --git a/chroot/start-gadgetron-from-image.sh.in b/chroot/start-gadgetron-from-image.sh.in
new file mode 100755
index 0000000..0395d1d
--- /dev/null
+++ b/chroot/start-gadgetron-from-image.sh.in
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+start_gadgetron_image_job=0
+BASEDIR=$(dirname $0)
+
+trap '(($start_gadgetron_image_job == 0)) || ((`kill -0 $start_gadgetron_image_job`)) || kill $start_gadgetron_image_job & while kill -0 $start_gadgetron_image_job 2>/dev/null; do sleep 1; done' HUP TERM INT
+
+if [ $(id -u) -ne 0 ]; then
+  echo -e "\nPlease start the script as a root or sudo!\n"
+  exit 1
+else
+  if [ $# -eq 2 ]; then
+
+    FULL_PATH_TO_IMG_FILE=${1}
+    MOUNT_POINT=${2}
+
+    mkdir -p ${MOUNT_POINT}
+    mount -o loop ${FULL_PATH_TO_IMG_FILE} ${MOUNT_POINT}
+    sleep 1
+
+    MOUNT_READY=0
+    MOUNT_TRY=0
+    MAX_MOUNT_TRY=100
+    while [ ${MOUNT_READY} -eq 0 ]; do
+      if mountpoint -q ${MOUNT_POINT}; then
+          MOUNT_READY=1
+      else
+          sleep 0.2
+          let MOUNT_TRY++
+          if [ $MOUNT_TRY -eq $MAX_MOUNT_TRY ]; then
+		      MOUNT_READY=1
+	      fi
+      fi
+    done
+
+    if mountpoint -q ${MOUNT_POINT}; then
+	  ${MOUNT_POINT}/chroot-root/gadgetron@CMAKE_INSTALL_PREFIX@/@GADGETRON_INSTALL_CHROOT_SCRIPTS_PATH@/start.sh ${MOUNT_POINT}/chroot-root/gadgetron &
+	  start_gadgetron_image_job=($!)
+	  wait $!
+	  $BASEDIR/umount_image.sh ${MOUNT_POINT}
+    else
+      exit 1
+    fi
+    exit 0
+  else
+    echo -e "\nUsage: $0 <full path to img file> <mount point>\n"
+    exit 1
+  fi
+fi
diff --git a/chroot/start-gadgetron.sh.in b/chroot/start-gadgetron.sh.in
new file mode 100644
index 0000000..7ea9f90
--- /dev/null
+++ b/chroot/start-gadgetron.sh.in
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+gadgetron_job=0
+trap '(($gadgetron_job == 0)) || ((`kill -0 $gadgetron_job`))|| kill $gadgetron_job' HUP TERM INT
+
+PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:@CMAKE_INSTALL_PREFIX@/bin LD_LIBRARY_PATH=@CMAKE_INSTALL_PREFIX@/lib:/usr/local/lib:/opt/intel/mkl/lib/intel64:/opt/intel/lib/intel64 @CMAKE_INSTALL_PREFIX@/bin/gadgetron &
+
+gadgetron_job=($!)
+wait $!
+exit 0
diff --git a/chroot/start-webapp.sh b/chroot/start-webapp.sh
new file mode 100755
index 0000000..b93da2a
--- /dev/null
+++ b/chroot/start-webapp.sh
@@ -0,0 +1,18 @@
+#!/bin/bash                 
+
+start_gadgetron_job=0
+BASEDIR=$(dirname $0)
+
+trap '(($start_gadgetron_job == 0)) || ((`kill -0 $start_gadgetron_job`)) || kill $start_gadgetron_job & while kill -0 $start_gadgetron_job 2>/dev/null; do sleep 1; done' HUP TERM INT
+
+if [ $(id -u) -ne 0 ]; then 
+    echo -e "\nPlease run this script as root or with sudo!\n"
+    exit 1
+else
+    $BASEDIR/mount.sh $BASEDIR
+    chroot $BASEDIR/gadgetron /run-webapp.sh &
+    start_gadgetron_job=($!)
+    wait $!
+    $BASEDIR/stop.sh
+    exit 0
+fi
diff --git a/chroot/start-webapp.sh.in b/chroot/start-webapp.sh.in
new file mode 100644
index 0000000..9d40291
--- /dev/null
+++ b/chroot/start-webapp.sh.in
@@ -0,0 +1,24 @@
+#!/bin/bash                 
+
+start_gadgetron_job=0
+BASEDIR=$(dirname $0)
+
+trap '(($start_gadgetron_job == 0)) || ((`kill -0 $start_gadgetron_job`)) || kill $start_gadgetron_job & while kill -0 $start_gadgetron_job 2>/dev/null; do sleep 1; done' HUP TERM INT
+
+if [ $(id -u) -ne 0 ]; then 
+    echo -e "\nPlease run this script as root or with sudo!\n"
+    exit 1
+else
+    if [ $# -eq 1 ]; then
+        CHROOT_DIR=${1}
+        $BASEDIR/mount.sh $CHROOT_DIR
+        chroot $CHROOT_DIR @CMAKE_INSTALL_PREFIX@/@GADGETRON_INSTALL_CHROOT_SCRIPTS_PATH@/run-webapp.sh &
+        start_gadgetron_job=($!)
+        wait $!
+        $BASEDIR/stop.sh $CHROOT_DIR
+        exit 0
+    else
+        echo -e "\nUsage: $0 (chroot_dir)\n"
+        exit 1
+    fi
+fi
diff --git a/chroot/start.sh b/chroot/start.sh
new file mode 100755
index 0000000..e4addd2
--- /dev/null
+++ b/chroot/start.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+start_gadgetron_job=0
+BASEDIR=$(dirname $0)
+
+trap '(($start_gadgetron_job == 0)) || ((`kill -0 $start_gadgetron_job`)) || kill $start_gadgetron_job & while kill -0 $start_gadgetron_job 2>/dev/null; do sleep 1; done & $BASEDIR/stop.sh' HUP TERM INT
+
+if [ $(id -u) -ne 0 ]; then
+ echo -e "\nPlease run this script as root or with sudo!\n"
+ exit 1
+else
+ if [ $# -eq 0 ]; then
+  $BASEDIR/mount.sh $BASEDIR
+  chroot $BASEDIR/gadgetron /start-gadgetron.sh &
+  start_gadgetron_job=($!)
+  wait $!
+  $BASEDIR/stop.sh
+  exit 0
+ else
+  echo -e "\nUsage: $0\n"
+  exit 1
+ fi
+fi
diff --git a/chroot/start.sh.in b/chroot/start.sh.in
new file mode 100644
index 0000000..1540742
--- /dev/null
+++ b/chroot/start.sh.in
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+start_gadgetron_job=0
+BASEDIR=$(dirname $0)
+
+trap '(($start_gadgetron_job == 0)) || ((`kill -0 $start_gadgetron_job`)) || kill $start_gadgetron_job & while kill -0 $start_gadgetron_job 2>/dev/null; do sleep 1; done & $BASEDIR/stop.sh $CHROOT_DIR' HUP TERM INT
+
+if [ $(id -u) -ne 0 ]; then
+ echo -e "\nPlease run this script as root or with sudo!\n"
+ exit 1
+else
+ if [ $# -eq 1 ]; then
+
+  CHROOT_DIR=${1}
+
+  $BASEDIR/mount.sh $CHROOT_DIR
+  chroot $CHROOT_DIR @CMAKE_INSTALL_PREFIX@/@GADGETRON_INSTALL_CHROOT_SCRIPTS_PATH@/start-gadgetron.sh &
+  start_gadgetron_job=($!)
+  wait $!
+  $BASEDIR/stop.sh $CHROOT_DIR
+  exit 0
+ else
+  echo -e "\nUsage: $0 (chroot_dir)\n"
+  exit 1
+ fi
+fi
diff --git a/chroot/stop.sh b/chroot/stop.sh
new file mode 100755
index 0000000..79adf35
--- /dev/null
+++ b/chroot/stop.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+function umount_check {
+    MAX_TRY=100
+    MOUNT_DIR=$1
+    UMOUNT_READY=0
+    UMOUNT_TRY=0
+    while [ ${UMOUNT_READY} -eq 0 ]; do
+        if mountpoint -q ${MOUNT_DIR}; then
+            let UMOUNT_TRY++
+            if [ $UMOUNT_TRY -eq $MAX_TRY ]; then
+                UMOUNT_READY=1
+                umount $MOUNT_DIR
+            else
+                sleep 0.2
+            fi
+        else
+            UMOUNT_READY=1
+        fi
+    done
+}
+
+if [ $(id -u) -ne 0 ]; then
+ echo -e "\nPlease run this script as root or with sudo!\n"
+ exit 1
+
+else
+ BASEDIR=$(dirname $0)
+
+ if [ $# -eq 1 ]; then
+  CHROOT_DIR=${1} 
+
+  if mountpoint -q $CHROOT_DIR/proc; then
+   umount $CHROOT_DIR/proc
+   umount_check $CHROOT_DIR/proc
+  fi
+  if mountpoint -q $CHROOT_DIR/sys; then
+    umount $CHROOT_DIR/sys
+    umount_check $CHROOT_DIR/sys
+  fi
+  if mountpoint -q $CHROOT_DIR/dev; then
+    umount $CHROOT_DIR/dev
+    umount_check $CHROOT_DIR/dev
+  fi
+  exit 0
+
+ else
+  echo -e "\nUsage: $0 (chroot_dir)\n"
+  exit 1
+ fi
+fi
diff --git a/chroot/umount_image.sh b/chroot/umount_image.sh
new file mode 100755
index 0000000..e746fd3
--- /dev/null
+++ b/chroot/umount_image.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+function umount_check {
+    MAX_TRY=100
+    MOUNT_DIR=$1
+    UMOUNT_READY=0
+    UMOUNT_TRY=0
+    while [ ${UMOUNT_READY} -eq 0 ]; do
+        if mountpoint -q ${MOUNT_DIR}; then
+            let UMOUNT_TRY++
+            if [ $UMOUNT_TRY -eq $MAX_TRY ]; then
+                UMOUNT_READY=1
+            else
+                sleep 0.2
+            fi
+        else
+            UMOUNT_READY=1
+        fi
+    done
+}
+
+if [ $(id -u) -ne 0 ]; then
+  echo -e "\nPlease run this script as root or with sudo!\n"
+  exit 1
+else
+  if [ $# -eq 1 ]; then
+
+    MOUNT_POINT=${1}
+
+    if mountpoint -q ${MOUNT_POINT}; then
+      if mountpoint -q ${MOUNT_POINT}/chroot-root/gadgetron/proc; then
+        umount ${MOUNT_POINT}/chroot-root/gadgetron/proc
+        umount_check ${MOUNT_POINT}/chroot-root/gadgetron/proc
+      fi
+      if mountpoint -q ${MOUNT_POINT}/chroot-root/gadgetron/dev; then
+        umount ${MOUNT_POINT}/chroot-root/gadgetron/dev
+        umount_check ${MOUNT_POINT}/chroot-root/gadgetron/dev
+      fi
+      if mountpoint -q ${MOUNT_POINT}/chroot-root/gadgetron/sys; then
+        umount ${MOUNT_POINT}/chroot-root/gadgetron/sys
+        umount_check ${MOUNT_POINT}/chroot-root/gadgetron/sys
+      fi
+      umount ${MOUNT_POINT}
+      umount_check ${MOUNT_POINT}
+      exit 0
+    fi
+  else
+    echo -e "\nUsage: $0 <mount point>\n"
+    exit 1
+  fi
+fi
diff --git a/chroot/unique_lines_in_file b/chroot/unique_lines_in_file
new file mode 100755
index 0000000..870039d
--- /dev/null
+++ b/chroot/unique_lines_in_file
@@ -0,0 +1,14 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+my $file = $ARGV[0];
+
+my %seen = ();
+{
+    while(<>){
+	$seen{$_}++;
+	next if $seen{$_} > 1;
+	print;
+    }
+}
diff --git a/chroot/upstart-instructions.txt b/chroot/upstart-instructions.txt
new file mode 100644
index 0000000..3c1e357
--- /dev/null
+++ b/chroot/upstart-instructions.txt
@@ -0,0 +1,10 @@
+*** Instructions for webapp.conf upstart script ***
+
+
+  - Edit the script and set the correct paths to start-webapp.sh and stop.sh
+
+  - Copy the script to /etc/init/
+
+  - Run 'sudo start webapp' or restart the computer
+
+  - The log file is /var/log/upstart/webapp.log
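
For reference, a minimal webapp.conf along these lines might look as follows. This is only a sketch: the file name, chroot location and script paths below are placeholders and must be replaced with the actual install paths, and the argument passed to the scripts depends on which variant of start-webapp.sh/stop.sh is installed.

    # /etc/init/webapp.conf -- example sketch only; all paths are placeholders
    description "Gadgetron web app (chroot)"
    start on runlevel [2345]
    stop on runlevel [016]
    # start-webapp.sh mounts the chroot and blocks until the web app exits
    exec /path/to/chroot-scripts/start-webapp.sh /path/to/chroot-root/gadgetron
    # release the chroot bind mounts when the job is stopped
    pre-stop exec /path/to/chroot-scripts/stop.sh /path/to/chroot-root/gadgetron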
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
new file mode 100644
index 0000000..13627f7
--- /dev/null
+++ b/cmake/CMakeLists.txt
@@ -0,0 +1,22 @@
+install(FILES 	
+FindACE.cmake
+FindFFTW3.cmake
+FindGLEW.cmake
+FindNumPy.cmake
+FindGadgetron.cmake
+FindIsmrmrd.cmake
+FindGMatlab.cmake
+FindDCMTK.cmake
+FindMKL.cmake
+DESTINATION ${GADGETRON_INSTALL_CMAKE_PATH} COMPONENT main)
+
+if (WIN32)
+    install(FILES InstallWinGadgetron.bat DESTINATION ${GADGETRON_INSTALL_CMAKE_PATH} COMPONENT main)
+endif (WIN32)
+
+#if (UNIX)
+#    if (NOT APPLE)
+#        install(FILES ./debian/postinst DESTINATION ${GADGETRON_INSTALL_CMAKE_PATH} COMPONENT main)
+#        install(FILES ./debian/postinst_web DESTINATION ${GADGETRON_INSTALL_CMAKE_PATH} COMPONENT web)
+#    endif (NOT APPLE)
+#endif (UNIX)
diff --git a/cmake/FindACE.cmake b/cmake/FindACE.cmake
new file mode 100644
index 0000000..9d556ce
--- /dev/null
+++ b/cmake/FindACE.cmake
@@ -0,0 +1,90 @@
+#
+# Find the ACE client includes and library
+#
+
+# This module defines
+# ACE_INCLUDE_DIR, where to find ace.h
+# ACE_LIBRARIES, the libraries to link against to use ACE
+# ACE_FOUND, if false, you cannot build anything that requires ACE
+
+# This is the new header...
+
+######################################################################## 
+## check pkg-config for ace information, if available 
+ 
+SET(ACE_INCLUDE_DIR_GUESS) 
+SET(ACE_LIBRARY_DIR_GUESS) 
+SET(ACE_LINK_FLAGS) 
+IF(PKGCONFIG_EXECUTABLE) 
+	PKGCONFIG(ace ACE_INCLUDE_DIR_GUESS ACE_LIBRARY_DIR_GUESS ACE_LINK_FLAGS ACE_C_FLAGS) 
+	IF (NOT ACE_LINK_FLAGS) 
+		PKGCONFIG(ACE ACE_INCLUDE_DIR_GUESS ACE_LIBRARY_DIR_GUESS ACE_LINK_FLAGS ACE_C_FLAGS) 
+	ENDIF (NOT ACE_LINK_FLAGS) 
+	ADD_DEFINITIONS(${ACE_C_FLAGS}) 
+ENDIF(PKGCONFIG_EXECUTABLE) 
+ 
+SET(ACE_LINK_FLAGS "${ACE_LINK_FLAGS}" CACHE INTERNAL "ace link flags") 
+ 
+######################################################################## 
+##  general find 
+ 
+FIND_PATH(ACE_INCLUDE_DIR ace/ACE.h ${CMAKE_SOURCE_DIR}/../ACE_wrappers/ /usr/include /usr/local/include $ENV{ACE_ROOT} $ENV{ACE_ROOT}/include DOC "directory containing ace/*.h for ACE library") 
+ 
+# in YARP1, config was in another directory 
+SET(ACE_INCLUDE_CONFIG_DIR "" CACHE STRING "location of ace/config.h") 
+MARK_AS_ADVANCED(ACE_INCLUDE_CONFIG_DIR) 
+ 
+FIND_LIBRARY(ACE_LIBRARY NAMES ACE ace PATHS ${CMAKE_SOURCE_DIR}/../ACE_wrappers/lib/ /usr/lib /usr/local/lib $ENV{ACE_ROOT}/lib $ENV{ACE_ROOT} DOC "ACE library file") 
+ 
+IF (WIN32 AND NOT CYGWIN) 
+	SET(CMAKE_DEBUG_POSTFIX "d") 
+	FIND_LIBRARY(ACE_DEBUG_LIBRARY NAMES ACE${CMAKE_DEBUG_POSTFIX} ace${CMAKE_DEBUG_POSTFIX} PATHS ${CMAKE_SOURCE_DIR}/../ACE_wrappers/lib/ /usr/lib /usr/local/lib $ENV{ACE_ROOT}/lib $ENV{ACE_ROOT} DOC "ACE library file (debug version)") 
+ENDIF (WIN32 AND NOT CYGWIN) 
+ 
+ 
+######################################################################## 
+## OS-specific extra linkage 
+ 
+# Solaris needs some extra libraries that may not have been found already 
+IF(CMAKE_SYSTEM_NAME STREQUAL "SunOS") 
+  #MESSAGE(STATUS "need to link solaris-specific libraries") 
+  #  LINK_LIBRARIES(socket rt) 
+  SET(ACE_LIBRARY ${ACE_LIBRARY} socket rt nsl) 
+ENDIF(CMAKE_SYSTEM_NAME STREQUAL "SunOS") 
+ 
+# Windows needs some extra libraries 
+IF (WIN32 AND NOT CYGWIN) 
+  #MESSAGE(STATUS "need to link windows-specific libraries") 
+  #LINK_LIBRARIES(winmm) 
+  SET(ACE_LIBRARY ${ACE_LIBRARY} winmm) 
+ENDIF (WIN32 AND NOT CYGWIN) 
+ 
+ 
+######################################################################## 
+## finished - now just set up flags and complain to user if necessary 
+ 
+IF (ACE_INCLUDE_DIR AND ACE_LIBRARY) 
+	SET(ACE_FOUND TRUE) 
+ELSE (ACE_INCLUDE_DIR AND ACE_LIBRARY) 
+	SET(ACE_FOUND FALSE) 
+ENDIF (ACE_INCLUDE_DIR AND ACE_LIBRARY) 
+ 
+IF (ACE_DEBUG_LIBRARY) 
+	SET(ACE_DEBUG_FOUND TRUE) 
+ELSE (ACE_DEBUG_LIBRARY)
+  SET(ACE_DEBUG_LIBRARY ${ACE_LIBRARY})
+ENDIF (ACE_DEBUG_LIBRARY) 
+ 
+IF (ACE_FOUND) 
+	IF (NOT Ace_FIND_QUIETLY) 
+		MESSAGE(STATUS "Found ACE library: ${ACE_LIBRARY}") 
+		MESSAGE(STATUS "Found ACE include: ${ACE_INCLUDE_DIR}") 
+	ENDIF (NOT Ace_FIND_QUIETLY) 
+ELSE (ACE_FOUND) 
+	IF (Ace_FIND_REQUIRED) 
+		MESSAGE(FATAL_ERROR "Could not find ACE") 
+	ENDIF (Ace_FIND_REQUIRED) 
+ENDIF (ACE_FOUND) 
+
+# TSS: backwards compatibility
+SET(ACE_LIBRARIES ${ACE_LIBRARY}) 
diff --git a/cmake/FindArmadillo.cmake b/cmake/FindArmadillo.cmake
new file mode 100644
index 0000000..9fabca8
--- /dev/null
+++ b/cmake/FindArmadillo.cmake
@@ -0,0 +1,100 @@
+# - Find Armadillo
+# Find the Armadillo C++ library
+#
+# Using Armadillo:
+#  find_package(Armadillo REQUIRED)
+#  include_directories(${ARMADILLO_INCLUDE_DIRS})
+#  add_executable(foo foo.cc)
+#  target_link_libraries(foo ${ARMADILLO_LIBRARIES})
+# This module sets the following variables:
+#  ARMADILLO_FOUND - set to true if the library is found
+#  ARMADILLO_INCLUDE_DIRS - list of required include directories
+#  ARMADILLO_LIBRARIES - list of libraries to be linked
+#  ARMADILLO_VERSION_MAJOR - major version number
+#  ARMADILLO_VERSION_MINOR - minor version number
+#  ARMADILLO_VERSION_PATCH - patch version number
+#  ARMADILLO_VERSION_STRING - version number as a string (ex: "1.0.4")
+#  ARMADILLO_VERSION_NAME - name of the version (ex: "Antipodean Antileech")
+
+#=============================================================================
+# Copyright 2011 Clement Creusot <creusot at cs.york.ac.uk>
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+
+
+# UNIX paths are standard, no need to write.
+find_library(ARMADILLO_LIBRARY
+  NAMES armadillo
+  HINTS $ENV{ARMA_HOME}
+  PATHS "$ENV{ProgramFiles}/Armadillo/lib"  "$ENV{ProgramFiles}/Armadillo/lib64" "$ENV{ProgramFiles}/Armadillo"
+  PATH_SUFFIXES "lib"
+  )
+find_path(ARMADILLO_INCLUDE_DIR
+  NAMES armadillo
+  HINTS $ENV{ARMA_HOME}
+  PATHS "$ENV{ProgramFiles}/Armadillo/include"
+  PATH_SUFFIXES "include"
+  )
+
+
+if(ARMADILLO_INCLUDE_DIR)
+
+  # ------------------------------------------------------------------------
+  #  Extract version information from <armadillo>
+  # ------------------------------------------------------------------------
+
+  # WARNING: Early releases of Armadillo didn't have the arma_version.hpp file.
+  # (e.g. v.0.9.8-1 in ubuntu maverick packages (2001-03-15))
+  # If the file is missing, set all values to 0
+  set(ARMADILLO_VERSION_MAJOR 0)
+  set(ARMADILLO_VERSION_MINOR 0)
+  set(ARMADILLO_VERSION_PATCH 0)
+  set(ARMADILLO_VERSION_NAME "EARLY RELEASE")
+
+  if(EXISTS "${ARMADILLO_INCLUDE_DIR}/armadillo_bits/arma_version.hpp")
+
+    # Read and parse the armadillo version header file for the version number
+    file(STRINGS "${ARMADILLO_INCLUDE_DIR}/armadillo_bits/arma_version.hpp" _armadillo_HEADER_CONTENTS REGEX "#define ARMA_VERSION_[A-Z]+ ")
+    string(REGEX REPLACE ".*#define ARMA_VERSION_MAJOR ([0-9]+).*" "\\1" ARMADILLO_VERSION_MAJOR "${_armadillo_HEADER_CONTENTS}")
+    string(REGEX REPLACE ".*#define ARMA_VERSION_MINOR ([0-9]+).*" "\\1" ARMADILLO_VERSION_MINOR "${_armadillo_HEADER_CONTENTS}")
+    string(REGEX REPLACE ".*#define ARMA_VERSION_PATCH ([0-9]+).*" "\\1" ARMADILLO_VERSION_PATCH "${_armadillo_HEADER_CONTENTS}")
+
+    # NOTE: there may be more than one space before the version name.
+    string(REGEX REPLACE ".*#define ARMA_VERSION_NAME +\"([0-9a-zA-Z _-]+)\".*" "\\1" ARMADILLO_VERSION_NAME "${_armadillo_HEADER_CONTENTS}")
+
+    unset(_armadillo_HEADER_CONTENTS)
+  endif()
+
+  set(ARMADILLO_VERSION_STRING "${ARMADILLO_VERSION_MAJOR}.${ARMADILLO_VERSION_MINOR}.${ARMADILLO_VERSION_PATCH}")
+endif ()
+
+#======================
+
+
+# Checks 'REQUIRED', 'QUIET' and versions.
+include("FindPackageHandleStandardArgs")
+find_package_handle_standard_args(Armadillo
+  REQUIRED_VARS ARMADILLO_LIBRARY ARMADILLO_INCLUDE_DIR
+  VERSION_VAR ARMADILLO_VERSION_STRING)
+# version_var fails with cmake < 2.8.4.
+
+if (ARMADILLO_FOUND)
+  set(ARMADILLO_INCLUDE_DIRS ${ARMADILLO_INCLUDE_DIR})
+  set(ARMADILLO_LIBRARIES ${ARMADILLO_LIBRARY})
+endif ()
+
+
+# Hide internal variables
+mark_as_advanced(
+  ARMADILLO_INCLUDE_DIR
+  ARMADILLO_LIBRARY)
+
+#======================
diff --git a/cmake/FindCUDA/cuda_compute_capability.c b/cmake/FindCUDA/cuda_compute_capability.c
new file mode 100644
index 0000000..a69edee
--- /dev/null
+++ b/cmake/FindCUDA/cuda_compute_capability.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2011 Florian Rathgeber, florian.rathgeber at gmail.com
+ *
+ * This code is licensed under the MIT License.  See the FindCUDA.cmake script
+ * for the text of the license.
+ *
+ * Based on code by Christopher Bruns published on Stack Overflow (CC-BY):
+ * http://stackoverflow.com/questions/2285185
+ */
+
+#include <stdio.h>
+#include <cuda_runtime.h>
+
+int main() {
+  int deviceCount, device, major = 9999, minor = 9999;
+  int gpuDeviceCount = 0;
+  struct cudaDeviceProp properties;
+
+  if (cudaGetDeviceCount(&deviceCount) != cudaSuccess)
+    return 1;
+  /* machines with no GPUs can still report one emulation device */
+  for (device = 0; device < deviceCount; ++device) {
+    cudaGetDeviceProperties(&properties, device);
+    if (properties.major != 9999 && properties.major > 1) {/* 9999 means emulation only and we do not support compute model 1.x*/
+      ++gpuDeviceCount;
+      if (gpuDeviceCount > 1)
+      	printf(";");
+      if (properties.major == 2) //Need a special case for Fermi. Compute capability 2.1 exists, but compute model 2.1 does not.
+      	printf("%d%d",properties.major, 0);
+      else
+      	printf("%d%d",properties.major, properties.minor);
+      /*  get minimum compute capability of all devices */
+    }
+  }
+  /* don't just return the number of gpus, because other runtime cuda
+     errors can also yield non-zero return values */
+  if (gpuDeviceCount > 0) {
+    /* this output will be parsed by FindCUDA.cmake */
+    return 0; /* success */
+  }
+  return 1; /* failure */
+}
diff --git a/cmake/FindCUDA_advanced.cmake b/cmake/FindCUDA_advanced.cmake
new file mode 100644
index 0000000..bea8878
--- /dev/null
+++ b/cmake/FindCUDA_advanced.cmake
@@ -0,0 +1,40 @@
+find_package(CUDA 5.5)
+
+# Check for GPUs present and their compute capability
+# based on http://stackoverflow.com/questions/2285185/easiest-way-to-test-for-existence-of-cuda-capable-gpu-from-cmake/2297877#2297877 (Christopher Bruns)
+if(CUDA_FOUND)
+    set(CUDA_NVCC_FLAGS2 "-gencode arch=compute_20,code=sm_20")
+    set(CUDA_NVCC_FLAGS3 "-gencode arch=compute_30,code=sm_30") 
+    set(CUDA_NVCC_FLAGS4 "-gencode arch=compute_35,code=sm_35")   
+  cuda_find_helper_file(cuda_compute_capability c)
+  try_run(RUN_RESULT_VAR COMPILE_RESULT_VAR
+    ${CMAKE_BINARY_DIR} 
+    ${CUDA_cuda_compute_capability}
+    CMAKE_FLAGS 
+    -DINCLUDE_DIRECTORIES:STRING=${CUDA_TOOLKIT_INCLUDE}
+    -DLINK_LIBRARIES:STRING=${CUDA_CUDART_LIBRARY}
+    COMPILE_OUTPUT_VARIABLE COMPILE_OUTPUT_VAR
+    RUN_OUTPUT_VARIABLE RUN_OUTPUT_VAR)
+  # COMPILE_RESULT_VAR is TRUE when compile succeeds
+  # RUN_RESULT_VAR is zero when a GPU is found
+  if(COMPILE_RESULT_VAR AND NOT RUN_RESULT_VAR)
+    set(CUDA_HAVE_GPU TRUE CACHE BOOL "Whether CUDA-capable GPU is present")
+    set(CUDA_COMPUTE_CAPABILITY ${RUN_OUTPUT_VAR} CACHE STRING "Compute capability of CUDA-capable GPU present. Separate multiple values with ;. For all known, use ALL")
+  else()
+    
+    set(CUDA_HAVE_GPU FALSE CACHE BOOL "Whether CUDA-capable GPU is present")
+    set(CUDA_COMPUTE_CAPABILITY ALL CACHE STRING "Compute capability of CUDA-capable GPU present. Separate multiple values with ;. For all known, use ALL")
+  endif()
+
+find_cuda_helper_libs(cusparse)
+set(CUDA_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY})
+if( "${CUDA_COMPUTE_CAPABILITY}" MATCHES ALL)
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${CUDA_NVCC_FLAGS2} ${CUDA_NVCC_FLAGS3} ${CUDA_NVCC_FLAGS4} ${CUDA_NVCC_FLAGS5})
+else()
+	foreach(code ${CUDA_COMPUTE_CAPABILITY})
+	   set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_${code},code=sm_${code} ")
+	endforeach()
+endif()
+
+endif(CUDA_FOUND)
+
diff --git a/cmake/FindDCMTK.cmake b/cmake/FindDCMTK.cmake
new file mode 100644
index 0000000..f8e4aff
--- /dev/null
+++ b/cmake/FindDCMTK.cmake
@@ -0,0 +1,175 @@
+# - find DCMTK libraries and applications
+#
+
+#  DCMTK_INCLUDE_DIRS   - Directories to include to use DCMTK
+#  DCMTK_LIBRARIES     - Files to link against to use DCMTK
+#  DCMTK_FOUND         - If false, don't try to use DCMTK
+#  DCMTK_DIR           - (optional) Source directory for DCMTK
+#  DCMTK_HOME          - install path for dcmtk binaries/headers/libs
+#
+# DCMTK_DIR can be used to make it simpler to find the various include
+# directories and compiled libraries if you've just compiled it in the
+# source tree. Just set it to the root of the tree where you extracted
+# the source (defaults to /usr/include/dcmtk/)
+
+#=============================================================================
+# Copyright 2004-2009 Kitware, Inc.
+# Copyright 2009-2010 Mathieu Malaterre <mathieu.malaterre at gmail.com>
+# Copyright 2010 Thomas Sondergaard <ts at medical-insight.com>
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+
+#
+# Written for VXL by Amitha Perera.
+# Upgraded for GDCM by Mathieu Malaterre.
+# Modified for EasyViz by Thomas Sondergaard.
+# Updated to work with non-standard install paths by Joseph Naegele
+#
+
+
+# Allow for non-standard dcmtk installation locations
+# Set the DCMTK_HOME environment variable to make life easier!!
+find_path(DCMTK_HOME include/dcmtk
+    HINTS $ENV{DCMTK_HOME} $ENV{DCMTK_DIR}
+    PATHS /usr /usr/local
+    PATH_SUFFIXES dcmtk)
+mark_as_advanced(DCMTK_HOME)
+
+if(NOT DCMTK_FOUND AND NOT DCMTK_DIR)
+    set(DCMTK_DIR ${DCMTK_HOME})
+    mark_as_advanced(DCMTK_DIR)
+endif()
+
+foreach(lib
+    dcmdata
+    dcmimage
+    dcmimgle
+    dcmjpeg
+    dcmnet
+    dcmpstat
+    dcmqrdb
+    dcmsign
+    dcmsr
+    dcmtls
+    ijg12
+    ijg16
+    ijg8
+    oflog
+    ofstd)
+
+    find_library(DCMTK_${lib}_LIBRARY
+        ${lib}
+        HINTS
+        ${DCMTK_DIR}/lib
+        ${DCMTK_DIR}/${lib}/libsrc
+        ${DCMTK_DIR}/${lib}/libsrc/Release
+        ${DCMTK_DIR}/${lib}/libsrc/Debug
+        ${DCMTK_DIR}/${lib}/Release
+        ${DCMTK_DIR}/${lib}/Debug)
+
+    mark_as_advanced(DCMTK_${lib}_LIBRARY)
+
+    if(DCMTK_${lib}_LIBRARY)
+        list(APPEND DCMTK_LIBRARIES ${DCMTK_${lib}_LIBRARY})
+    endif()
+
+endforeach()
+
+
+set(DCMTK_config_TEST_HEADER osconfig.h)
+set(DCMTK_dcmdata_TEST_HEADER dctypes.h)
+set(DCMTK_dcmimage_TEST_HEADER dicoimg.h)
+set(DCMTK_dcmimgle_TEST_HEADER dcmimage.h)
+set(DCMTK_dcmjpeg_TEST_HEADER djdecode.h)
+set(DCMTK_dcmnet_TEST_HEADER assoc.h)
+set(DCMTK_dcmpstat_TEST_HEADER dcmpstat.h)
+set(DCMTK_dcmqrdb_TEST_HEADER dcmqrdba.h)
+set(DCMTK_dcmsign_TEST_HEADER sicert.h)
+set(DCMTK_dcmsr_TEST_HEADER dsrtree.h)
+set(DCMTK_dcmtls_TEST_HEADER tlslayer.h)
+set(DCMTK_oflog_TEST_HEADER oflog.h)
+set(DCMTK_ofstd_TEST_HEADER ofstdinc.h)
+
+foreach(dir
+    config
+    dcmdata
+    dcmimage
+    dcmimgle
+    dcmjpeg
+    dcmnet
+    dcmpstat
+    dcmqrdb
+    dcmsign
+    dcmsr
+    dcmtls
+    oflog
+    ofstd)
+
+    find_path(DCMTK_${dir}_INCLUDE_DIR
+        ${DCMTK_${dir}_TEST_HEADER}
+        HINTS
+        ${DCMTK_DIR}/include/dcmtk/${dir}
+        ${DCMTK_DIR}/${dir}/include
+        ${DCMTK_DIR}/${dir}
+        ${DCMTK_DIR}/include/${dir}
+        ${DCMTK_DIR}/${dir}/include/dcmtk/${dir}
+    )
+    mark_as_advanced(DCMTK_${dir}_INCLUDE_DIR)
+
+    if(DCMTK_${dir}_INCLUDE_DIR)
+        list(APPEND DCMTK_INCLUDE_DIRS ${DCMTK_${dir}_INCLUDE_DIR})
+    endif()
+endforeach()
+
+if(WIN32)
+    list(APPEND DCMTK_LIBRARIES netapi32 wsock32)
+endif()
+
+if(DCMTK_ofstd_INCLUDE_DIR)
+    get_filename_component(
+        DCMTK_dcmtk_INCLUDE_DIR
+        ${DCMTK_ofstd_INCLUDE_DIR}
+        PATH
+        CACHE)
+    list(APPEND DCMTK_INCLUDE_DIRS ${DCMTK_dcmtk_INCLUDE_DIR})
+    mark_as_advanced(DCMTK_dcmtk_INCLUDE_DIR)
+endif()
+
+if(DCMTK_dcmtk_INCLUDE_DIR)
+    get_filename_component(
+        DCMTK_root_INCLUDE_DIR
+        ${DCMTK_dcmtk_INCLUDE_DIR}
+        PATH
+        CACHE)
+    list(APPEND DCMTK_INCLUDE_DIRS ${DCMTK_root_INCLUDE_DIR})
+    mark_as_advanced(DCMTK_root_INCLUDE_DIR)
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(DCMTK DEFAULT_MSG
+  DCMTK_config_INCLUDE_DIR
+  DCMTK_ofstd_INCLUDE_DIR
+  DCMTK_ofstd_LIBRARY
+  DCMTK_oflog_INCLUDE_DIR
+  DCMTK_oflog_LIBRARY
+  DCMTK_dcmdata_INCLUDE_DIR
+  DCMTK_dcmdata_LIBRARY
+  DCMTK_dcmimgle_INCLUDE_DIR
+  DCMTK_dcmimgle_LIBRARY)
+
+# Compatibility: This variable is deprecated
+set(DCMTK_INCLUDE_DIR ${DCMTK_INCLUDE_DIRS})
+
+foreach(executable dcmdump dcmdjpeg dcmdrle)
+  string(TOUPPER ${executable} EXECUTABLE)
+  find_program(DCMTK_${EXECUTABLE}_EXECUTABLE ${executable} ${DCMTK_DIR}/bin)
+  mark_as_advanced(DCMTK_${EXECUTABLE}_EXECUTABLE)
+endforeach()
diff --git a/cmake/FindFFTW3.cmake b/cmake/FindFFTW3.cmake
new file mode 100644
index 0000000..b2b9f3f
--- /dev/null
+++ b/cmake/FindFFTW3.cmake
@@ -0,0 +1,114 @@
+# - Try to find FFTW3.
+# Usage: find_package(FFTW3 [COMPONENTS [single double long-double threads]])
+#
+# Variables used by this module:
+#  FFTW3_ROOT_DIR             - FFTW3 root directory
+# Variables defined by this module:
+#  FFTW3_FOUND                - system has FFTW3
+#  FFTW3_INCLUDE_DIR          - the FFTW3 include directory (cached)
+#  FFTW3_INCLUDE_DIRS         - the FFTW3 include directories
+#                               (identical to FFTW3_INCLUDE_DIR)
+#  FFTW3[FL]?_LIBRARY         - the FFTW3 library - double, single(F), 
+#                               long-double(L) precision (cached)
+#  FFTW3[FL]?_THREADS_LIBRARY - the threaded FFTW3 library - double, single(F), 
+#                               long-double(L) precision (cached)
+#  FFTW3_LIBRARIES            - list of all FFTW3 libraries found
+
+# Copyright (C) 2009-2010
+# ASTRON (Netherlands Institute for Radio Astronomy)
+# P.O.Box 2, 7990 AA Dwingeloo, The Netherlands
+#
+# This file is part of the LOFAR software suite.
+# The LOFAR software suite is free software: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# The LOFAR software suite is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with the LOFAR software suite. If not, see <http://www.gnu.org/licenses/>.
+#
+# $Id: FindFFTW3.cmake 15918 2010-06-25 11:12:42Z loose $
+
+# Use double precision by default.
+if(FFTW3_FIND_COMPONENTS MATCHES "^$")
+  set(_components double)
+else()
+  set(_components ${FFTW3_FIND_COMPONENTS})
+endif()
+
+# Loop over each component.
+set(_libraries)
+foreach(_comp ${_components})
+  if(_comp STREQUAL "single")
+    list(APPEND _libraries fftw3f)
+  elseif(_comp STREQUAL "double")
+    list(APPEND _libraries fftw3)
+  elseif(_comp STREQUAL "long-double")
+    list(APPEND _libraries fftw3l)
+  elseif(_comp STREQUAL "threads")
+    set(_use_threads ON)
+  else(_comp STREQUAL "single")
+    message(FATAL_ERROR "FindFFTW3: unknown component `${_comp}' specified. "
+      "Valid components are `single', `double', `long-double', and `threads'.")
+  endif(_comp STREQUAL "single")
+endforeach(_comp ${_components})
+
+# If using threads, we need to link against threaded libraries as well.
+if(_use_threads)
+  set(_thread_libs)
+  foreach(_lib ${_libraries})
+    list(APPEND _thread_libs ${_lib}_threads)
+  endforeach(_lib ${_libraries})
+  set(_libraries ${_thread_libs} ${_libraries})
+endif(_use_threads)
+
+# Keep a list of variable names that we need to pass on to
+# find_package_handle_standard_args().
+set(_check_list)
+
+# Search for all requested libraries.
+if (WIN32)
+
+    foreach(_lib ${_libraries})
+
+      string(TOUPPER ${_lib} _LIB)
+
+      find_library(${_LIB}_LIBRARY lib${_lib}-3
+        HINTS $ENV{FFTW3_ROOT_DIR} PATH_SUFFIXES lib)
+      mark_as_advanced(${_LIB}_LIBRARY)
+      list(APPEND FFTW3_LIBRARIES ${${_LIB}_LIBRARY})
+      list(APPEND _check_list ${_LIB}_LIBRARY)
+    endforeach(_lib ${_libraries})
+
+    message("FFTW3 WINDOWS libraries: " ${FFTW3_LIBRARIES})
+
+else (WIN32)
+    foreach(_lib ${_libraries})
+
+      string(TOUPPER ${_lib} _LIB)
+
+      find_library(${_LIB}_LIBRARY ${_lib}
+        HINTS $ENV{FFTW3_ROOT_DIR} PATH_SUFFIXES lib)
+      mark_as_advanced(${_LIB}_LIBRARY)
+      list(APPEND FFTW3_LIBRARIES ${${_LIB}_LIBRARY})
+      list(APPEND _check_list ${_LIB}_LIBRARY)
+    endforeach(_lib ${_libraries})
+
+    message("FFTW3 UNIX libraries: " ${FFTW3_LIBRARIES})
+endif (WIN32)
+
+# Search for the header file.
+find_path(FFTW3_INCLUDE_DIR fftw3.h 
+  HINTS $ENV{FFTW3_ROOT_DIR} PATH_SUFFIXES include)
+mark_as_advanced(FFTW3_INCLUDE_DIR)
+list(APPEND _check_list FFTW3_INCLUDE_DIR)
+
+# Handle the QUIETLY and REQUIRED arguments and set FFTW3_FOUND to TRUE if
+# all listed variables are TRUE
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(FFTW3 DEFAULT_MSG ${_check_list})
diff --git a/cmake/FindGLEW.cmake b/cmake/FindGLEW.cmake
new file mode 100644
index 0000000..d9b35e0
--- /dev/null
+++ b/cmake/FindGLEW.cmake
@@ -0,0 +1,53 @@
+# - Try to find GLEW
+# Once done this will define
+#  
+#  GLEW_FOUND        - system has GLEW
+#  GLEW_INCLUDE_DIR  - the GLEW include directory
+#  GLEW_LIBRARY_DIR  - where the libraries are
+#  GLEW_LIBRARY      - Link these to use GLEW
+#   
+
+IF (GLEW_INCLUDE_DIR)
+  # Already in cache, be silent
+  SET(GLEW_FIND_QUIETLY TRUE)
+ENDIF (GLEW_INCLUDE_DIR)
+
+if( WIN32 )
+   if( MSVC80 )
+       set( COMPILER_PATH "C:/Program\ Files/Microsoft\ Visual\ Studio\ 8/VC" )
+   endif( MSVC80 )
+   if( MSVC71 )
+       set( COMPILER_PATH "C:/Program\ Files/Microsoft\ Visual\ Studio\ .NET\ 2003/Vc7" )
+   endif( MSVC71 )
+   FIND_PATH( GLEW_INCLUDE_DIR gl/glew.h gl/wglew.h
+              PATHS c:/glew/include ${COMPILER_PATH}/PlatformSDK/Include )
+   SET( GLEW_NAMES glew32 )
+   FIND_LIBRARY( GLEW_LIBRARY
+                 NAMES ${GLEW_NAMES}
+                 PATHS c:/glew/lib ${COMPILER_PATH}/PlatformSDK/Lib )
+else( WIN32 )
+   FIND_PATH( GLEW_INCLUDE_DIR glew.h wglew.h
+              PATHS /usr/local/include /usr/include
+              PATH_SUFFIXES gl/ GL/ )
+   SET( GLEW_NAMES glew GLEW )
+   FIND_LIBRARY( GLEW_LIBRARY
+                 NAMES ${GLEW_NAMES}
+                 PATHS /usr/lib /usr/local/lib )
+endif( WIN32 )
+
+GET_FILENAME_COMPONENT( GLEW_LIBRARY_DIR ${GLEW_LIBRARY} PATH )
+
+IF (GLEW_INCLUDE_DIR AND GLEW_LIBRARY)
+   SET(GLEW_FOUND TRUE)
+    SET( GLEW_LIBRARY_DIR ${GLEW_LIBRARY} )
+    MESSAGE("GLEW FOUND")
+ELSE (GLEW_INCLUDE_DIR AND GLEW_LIBRARY)
+   SET( GLEW_FOUND FALSE )
+   SET( GLEW_LIBRARY_DIR )
+    MESSAGE("GLEW NOT FOUND")
+ENDIF (GLEW_INCLUDE_DIR AND GLEW_LIBRARY)
+
+MARK_AS_ADVANCED(
+  GLEW_LIBRARY
+  GLEW_INCLUDE_DIR
+)
diff --git a/cmake/FindGMatlab.cmake b/cmake/FindGMatlab.cmake
new file mode 100644
index 0000000..5e8a8f5
--- /dev/null
+++ b/cmake/FindGMatlab.cmake
@@ -0,0 +1,116 @@
+# - this module looks for Matlab
+# Defines:
+#  MATLAB_INCLUDE_DIR:  include path for mex.h, engine.h
+#  MATLAB_LIBRARIES:    required libraries: libmex, etc
+#  MATLAB_JARS:         optional java jars: jmi.jar, util.jar, etc
+#  MATLAB_MEX_LIBRARY:  path to libmex.lib
+#  MATLAB_MX_LIBRARY:   path to libmx.lib
+#  MATLAB_ENG_LIBRARY:  path to libeng.lib
+
+#=============================================================================
+# Copyright 2005-2009 Kitware, Inc.
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+#
+#  Modified April 2013 - Joseph Naegele
+#   - Updated to work on OS X 10._
+#   - Added Matlab's Java Jars as MATLAB_JARS
+
+set(MATLAB_FOUND 0)
+
+if(WIN32)
+    set(MATLAB_ROOT "[HKEY_LOCAL_MACHINE\\SOFTWARE\\MathWorks\\MATLAB\\7.0;MATLABROOT]")
+    if(${CMAKE_GENERATOR} MATCHES "Visual Studio 6")
+        set(MATLAB_LIB_ROOT "${MATLAB_ROOT}/extern/lib/win32/microsoft/msvc60")
+    else()
+        if(${CMAKE_GENERATOR} MATCHES "Visual Studio 7")
+            # Assume people are generally using 7.1,
+            # if using 7.0 need to link to: ../extern/lib/win32/microsoft/msvc70
+            set(MATLAB_LIB_ROOT "${MATLAB_ROOT}/extern/lib/win32/microsoft/msvc71")
+        else()
+            if(${CMAKE_GENERATOR} MATCHES "Borland")
+                # Same here, there are also: bcc50 and bcc51 directories
+                set(MATLAB_LIB_ROOT "${MATLAB_ROOT}/extern/lib/win32/microsoft/bcc54")
+            else()
+                if(MATLAB_FIND_REQUIRED)
+                    message(FATAL_ERROR "Generator not compatible: ${CMAKE_GENERATOR}")
+                endif()
+            endif()
+        endif()
+    endif()
+    find_path(
+        MATLAB_INCLUDE_DIR
+        "mex.h"
+        HINTS ${MATLAB_ROOT}
+        PATH_SUFFIXES extern/include
+    )
+else(WIN32)
+    find_path(
+        MATLAB_ROOT extern/include/mex.h
+        HINTS $ENV{MATLAB_HOME} $ENV{MATLAB_ROOT}
+        PATHS /usr /usr/local /opt
+        PATH_SUFFIXES MATLAB
+    )
+    find_path(
+        MATLAB_INCLUDE_DIR
+        "mex.h"
+        HINTS ${MATLAB_ROOT}
+        PATH_SUFFIXES extern/include
+    )
+endif(WIN32)
+
+# find each library
+# give it its own cmake variable
+# add it to the list of libraries
+foreach(lib mat ut mex mx eng)
+    string(TOUPPER ${lib} LIB)
+    find_library(
+        MATLAB_${LIB}_LIBRARY
+        ${lib}
+        HINTS ${MATLAB_ROOT} ${MATLAB_LIB_ROOT}
+        PATH_SUFFIXES lib bin bin/maci64 bin/glnxa64 bin/glnx86
+        NO_CMAKE_SYSTEM_PATH    # don't pick up libmx on OS X > 10.9
+    )
+    if(MATLAB_${LIB}_LIBRARY)
+        list(APPEND MATLAB_LIBRARIES "${MATLAB_${LIB}_LIBRARY}")
+    endif()
+    mark_as_advanced(MATLAB_${LIB}_LIBRARY)
+endforeach()
+
+foreach(jar jmi util)
+    string(TOUPPER ${jar} LIB)
+    find_file(
+        MATLAB_${LIB}_JAR
+        "${jar}.jar"
+        HINTS ${MATLAB_ROOT}
+        PATH_SUFFIXES java jar java/jar
+    )
+    if(MATLAB_${LIB}_JAR)
+        list(APPEND MATLAB_JARS "${MATLAB_${LIB}_JAR}")
+    endif()
+    mark_as_advanced(MATLAB_${LIB}_JAR)
+endforeach()
+
+if(MATLAB_INCLUDE_DIR AND MATLAB_LIBRARIES)
+    set(MATLAB_FOUND 1)
+endif()
+
+include("FindPackageHandleStandardArgs")
+FIND_PACKAGE_HANDLE_STANDARD_ARGS("Matlab" DEFAULT_MSG MATLAB_ROOT MATLAB_INCLUDE_DIR MATLAB_LIBRARIES)
+
+mark_as_advanced(
+    MATLAB_JARS
+    MATLAB_LIBRARIES
+    MATLAB_INCLUDE_DIR
+    MATLAB_FOUND
+    MATLAB_ROOT
+)
+
diff --git a/cmake/FindGadgetron.cmake b/cmake/FindGadgetron.cmake
new file mode 100644
index 0000000..38f8639
--- /dev/null
+++ b/cmake/FindGadgetron.cmake
@@ -0,0 +1,40 @@
+#
+# Find the Gadgetron Installation
+#
+
+# This module defines
+# GADGETRON_INCLUDE_DIR, where to find Gadget.h
+# GADGETRON_HOME, Gadgetron Root Dir
+# GADGETRON_LIB_DIR, This is where all the installed gadgetron libraries live
+# GADGETRON_FOUND, if false, you cannot build anything that requires Gadgetron
+
+# Keep a list of variable names that we need to pass on to
+# find_package_handle_standard_args().
+set(_check_list)
+
+# Search for the header file.
+find_path(GADGETRON_HOME include/gadgetron/Gadget.h
+    HINTS $ENV{GADGETRON_HOME} /usr/local/gadgetron /usr/gadgetron)
+mark_as_advanced(GADGETRON_HOME)
+list(APPEND _check_list GADGETRON_HOME)
+
+SET(GADGETRON_INCLUDE_DIR ${GADGETRON_HOME}/include/gadgetron)
+mark_as_advanced(GADGETRON_INCLUDE_DIR)
+list(APPEND _check_list GADGETRON_INCLUDE_DIR)
+
+SET(GADGETRON_LIB_DIR ${GADGETRON_HOME}/lib)
+mark_as_advanced(GADGETRON_LIB_DIR)
+list(APPEND _check_list GADGETRON_LIB_DIR)
+
+# Handle the QUIETLY and REQUIRED arguments and set GADGETRON_FOUND to TRUE if
+# all listed variables are TRUE
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Gadgetron DEFAULT_MSG ${_check_list})
+
+# If Cuda is detected on the system some header files will be needed
+# -- whether Cuda is used or not --
+
+find_package(CUDA)
+if (CUDA_FOUND)
+  include_directories( ${CUDA_INCLUDE_DIRS} )
+endif (CUDA_FOUND)
diff --git a/cmake/FindIsmrmrd.cmake b/cmake/FindIsmrmrd.cmake
new file mode 100644
index 0000000..f09b5a8
--- /dev/null
+++ b/cmake/FindIsmrmrd.cmake
@@ -0,0 +1,28 @@
+# - Find ISMRMRD
+#   ISMRMRD_FOUND            - true if an ISMRMRD installation is found.
+#   ISMRMRD_INCLUDE_DIR      - where to find ismrmrd.h, etc.
+#   ISMRMRD_LIBRARIES        - libismrmrd.so and libismrmrd_xml.so
+#   ISMRMRD_SCHEMA_DIR       - where to find ismrmrd.xsd       
+
+FIND_PATH( ISMRMRD_INCLUDE_DIR ismrmrd/ismrmrd.h 
+HINTS $ENV{ISMRMRD_HOME} PATHS /usr/local /usr PATH_SUFFIXES include)
+
+FIND_PATH( ISMRMRD_SCHEMA_DIR ismrmrd.xsd 
+HINTS $ENV{ISMRMRD_HOME} PATHS /usr/local /usr PATH_SUFFIXES share/ismrmrd/schema)
+
+FIND_LIBRARY( ISMRMRD_LIBRARY NAMES ismrmrd
+HINTS $ENV{ISMRMRD_HOME} /usr/local /usr PATH_SUFFIXES lib)
+
+FIND_PATH( ISMRMRD_LIB_DIR libismrmrd.so
+HINTS $ENV{ISMRMRD_HOME} /usr/local /usr PATH_SUFFIXES lib)
+
+SET(ISMRMRD_LIBRARIES ${ISMRMRD_LIBRARY})
+
+INCLUDE( "FindPackageHandleStandardArgs" )
+FIND_PACKAGE_HANDLE_STANDARD_ARGS( "Ismrmrd" DEFAULT_MSG ISMRMRD_INCLUDE_DIR ISMRMRD_LIBRARIES ISMRMRD_SCHEMA_DIR)
+
+MARK_AS_ADVANCED( ISMRMRD_INCLUDE_DIR ISMRMRD_LIBRARIES ISMRMRD_SCHEMA_DIR)
+
+#if(ISMRMRD_FOUND)
+#  message("ISMRMRD found ${ISMRMRD_LIBRARIES}")
+#endif(ISMRMRD_FOUND)
diff --git a/cmake/FindMKL.cmake b/cmake/FindMKL.cmake
new file mode 100644
index 0000000..4a5f25d
--- /dev/null
+++ b/cmake/FindMKL.cmake
@@ -0,0 +1,128 @@
+# - Find the MKL libraries
+# Modified from Armadillo's ARMA_FindMKL.cmake
+# This module defines
+#  MKL_INCLUDE_DIR, the directory for the MKL headers
+#  MKL_LIB_DIR, the directory for the MKL library files
+#  MKL_COMPILER_LIB_DIR, the directory for the MKL compiler library files
+#  MKL_LIBRARIES, the libraries needed to use Intel's implementation of BLAS & LAPACK.
+#  MKL_FOUND, If false, do not try to use MKL; if true, the macro definition USE_MKL is added.
+
+# Set the include path
+# TODO: what if MKL is not installed in /opt/intel/mkl?
+# try to find at /opt/intel/mkl
+# in windows, try to find MKL at C:/Program Files (x86)/Intel/Composer XE/mkl
+
+if ( WIN32 )
+  if(NOT DEFINED ENV{MKLROOT_PATH})
+    set(MKLROOT_PATH "C:/Program Files (x86)/Intel/Composer XE" CACHE PATH "Where the MKL are stored")
+  endif(NOT DEFINED ENV{MKLROOT_PATH}) 
+else ( WIN32 )
+    set(MKLROOT_PATH "/opt/intel" CACHE PATH "Where the MKL are stored")
+endif ( WIN32 )
+
+if (EXISTS ${MKLROOT_PATH}/mkl)
+    SET(MKL_FOUND TRUE)
+    message("MKL is found at ${MKLROOT_PATH}/mkl")
+    IF(CMAKE_SIZEOF_VOID_P EQUAL 8)
+        set( USE_MKL_64BIT On )
+        if ( ARMADILLO_FOUND )
+            if ( ARMADILLO_BLAS_LONG_LONG )
+                set( USE_MKL_64BIT_LIB On )
+                ADD_DEFINITIONS(-DMKL_ILP64)
+                message("MKL is linked against ILP64 interface ... ")
+            endif ( ARMADILLO_BLAS_LONG_LONG )
+        endif ( ARMADILLO_FOUND )
+    ELSE(CMAKE_SIZEOF_VOID_P EQUAL 8)
+        set( USE_MKL_64BIT Off )
+    ENDIF(CMAKE_SIZEOF_VOID_P EQUAL 8)
+else (EXISTS ${MKLROOT_PATH}/mkl)
+    SET(MKL_FOUND FALSE)
+    message("MKL is NOT found ... ")
+endif (EXISTS ${MKLROOT_PATH}/mkl)
+
+if (MKL_FOUND)
+    set(MKL_INCLUDE_DIR "${MKLROOT_PATH}/mkl/include")
+    ADD_DEFINITIONS(-DUSE_MKL)
+    if ( USE_MKL_64BIT )
+        set(MKL_LIB_DIR "${MKLROOT_PATH}/mkl/lib/intel64")
+        set(MKL_COMPILER_LIB_DIR "${MKLROOT_PATH}/compiler/lib/intel64")
+        set(MKL_COMPILER_LIB_DIR ${MKL_COMPILER_LIB_DIR} "${MKLROOT_PATH}/lib/intel64")
+        if ( USE_MKL_64BIT_LIB )
+                if (WIN32)
+                    set(MKL_LIBRARIES ${MKL_LIBRARIES} mkl_intel_ilp64)
+                else (WIN32)
+                    set(MKL_LIBRARIES ${MKL_LIBRARIES} mkl_intel_ilp64)
+                endif (WIN32)
+        else ( USE_MKL_64BIT_LIB )
+                if (WIN32)
+                    set(MKL_LIBRARIES ${MKL_LIBRARIES} mkl_intel_lp64)
+                else (WIN32)
+                    set(MKL_LIBRARIES ${MKL_LIBRARIES} mkl_intel_lp64)
+                endif (WIN32)
+        endif ( USE_MKL_64BIT_LIB )
+    else ( USE_MKL_64BIT )
+        set(MKL_LIB_DIR "${MKLROOT_PATH}/mkl/lib/ia32")
+        set(MKL_COMPILER_LIB_DIR "${MKLROOT_PATH}/compiler/lib/ia32")
+        set(MKL_COMPILER_LIB_DIR ${MKL_COMPILER_LIB_DIR} "${MKLROOT_PATH}/lib/ia32")
+        if ( WIN32 )
+            set(MKL_LIBRARIES ${MKL_LIBRARIES} mkl_intel_c)
+        else ( WIN32 )
+            set(MKL_LIBRARIES ${MKL_LIBRARIES} mkl_intel)
+        endif ( WIN32 )
+    endif ( USE_MKL_64BIT )
+
+    if (WIN32)
+        SET(MKL_LIBRARIES ${MKL_LIBRARIES} mkl_intel_thread)
+        SET(MKL_LIBRARIES ${MKL_LIBRARIES} mkl_core)
+        SET(MKL_LIBRARIES ${MKL_LIBRARIES} libiomp5md)
+    else (WIN32)
+        SET(MKL_LIBRARIES ${MKL_LIBRARIES} mkl_gnu_thread)
+        SET(MKL_LIBRARIES ${MKL_LIBRARIES} mkl_core)
+    endif (WIN32) 
+endif (MKL_FOUND)
+
+IF (MKL_FOUND)
+    IF (NOT MKL_FIND_QUIETLY)
+        MESSAGE(STATUS "Found MKL libraries: ${MKL_LIBRARIES}")
+        MESSAGE(STATUS "MKL_INCLUDE_DIR: ${MKL_INCLUDE_DIR}")
+        MESSAGE(STATUS "MKL_LIB_DIR: ${MKL_LIB_DIR}")
+        MESSAGE(STATUS "MKL_COMPILER_LIB_DIR: ${MKL_COMPILER_LIB_DIR}")
+    ENDIF (NOT MKL_FIND_QUIETLY)
+
+    # ------------------------------------------------------------------------
+    #  Extract version information from <mkl.h>
+    # ------------------------------------------------------------------------
+
+    set(INTEL_MKL_VERSION_MAJOR 0)
+    set(INTEL_MKL_VERSION_MINOR 0)
+    set(INTEL_MKL_VERSION_UPDATE 0)
+
+    if(EXISTS "${MKL_INCLUDE_DIR}/mkl.h")
+
+        # Read and parse header file for version number
+        file(STRINGS "${MKL_INCLUDE_DIR}/mkl.h" _mkl_HEADER_CONTENTS REGEX "#define __INTEL_MKL__ ")
+        string(REGEX REPLACE ".*#define __INTEL_MKL__ ([0-9]+).*" "\\1" INTEL_MKL_VERSION_MAJOR "${_mkl_HEADER_CONTENTS}")
+        unset(_mkl_HEADER_CONTENTS)
+        
+        file(STRINGS "${MKL_INCLUDE_DIR}/mkl.h" _mkl_HEADER_CONTENTS REGEX "#define __INTEL_MKL_MINOR__ ")
+        string(REGEX REPLACE ".*#define __INTEL_MKL_MINOR__ ([0-9]+).*" "\\1" INTEL_MKL_VERSION_MINOR "${_mkl_HEADER_CONTENTS}")
+        unset(_mkl_HEADER_CONTENTS)
+        
+        file(STRINGS "${MKL_INCLUDE_DIR}/mkl.h" _mkl_HEADER_CONTENTS REGEX "#define __INTEL_MKL_UPDATE__ ")
+        string(REGEX REPLACE ".*#define __INTEL_MKL_UPDATE__ ([0-9]+).*" "\\1" INTEL_MKL_VERSION_UPDATE "${_mkl_HEADER_CONTENTS}")
+
+        unset(_mkl_HEADER_CONTENTS)
+    endif()
+
+    set(MKL_VERSION_STRING "${INTEL_MKL_VERSION_MAJOR}.${INTEL_MKL_VERSION_MINOR}.${INTEL_MKL_VERSION_UPDATE}")
+    message("find MKL version : ${MKL_VERSION_STRING}")
+
+    INCLUDE_DIRECTORIES( ${MKL_INCLUDE_DIR} )
+    LINK_DIRECTORIES( ${MKL_LIB_DIR} ${MKL_COMPILER_LIB_DIR} )
+ELSE (MKL_FOUND)
+    IF (MKL_FIND_REQUIRED)
+        MESSAGE(FATAL_ERROR "Could not find MKL libraries")
+    ENDIF (MKL_FIND_REQUIRED)
+ENDIF (MKL_FOUND)
+
+# MARK_AS_ADVANCED(MKL_LIBRARY)
diff --git a/cmake/FindNumPy.cmake b/cmake/FindNumPy.cmake
new file mode 100644
index 0000000..e2aa5c4
--- /dev/null
+++ b/cmake/FindNumPy.cmake
@@ -0,0 +1,102 @@
+# - Find the NumPy libraries
+# This module finds if NumPy is installed, and sets the following variables
+# indicating where it is.
+#
+# TODO: Update to provide the libraries and paths for linking npymath lib.
+#
+#  NUMPY_FOUND               - was NumPy found
+#  NUMPY_VERSION             - the version of NumPy found as a string
+#  NUMPY_VERSION_MAJOR       - the major version number of NumPy
+#  NUMPY_VERSION_MINOR       - the minor version number of NumPy
+#  NUMPY_VERSION_PATCH       - the patch version number of NumPy
+#  NUMPY_VERSION_DECIMAL     - e.g. version 1.6.1 is 10601
+#  NUMPY_INCLUDE_DIRS        - path to the NumPy include files
+
+
+#============================================================================
+# Copyright 2012 Continuum Analytics, Inc.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files
+# (the "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to permit
+# persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+#============================================================================
+
+# Finding NumPy involves calling the Python interpreter
+if(NumPy_FIND_REQUIRED)
+    find_package(PythonInterp REQUIRED)
+else()
+    find_package(PythonInterp)
+endif()
+
+if(NOT PYTHONINTERP_FOUND)
+    set(NUMPY_FOUND FALSE)
+    return()
+endif()
+
+execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
+    "import numpy as n; print(n.__version__); print(n.get_include());"
+    RESULT_VARIABLE _NUMPY_SEARCH_SUCCESS
+    OUTPUT_VARIABLE _NUMPY_VALUES_OUTPUT
+    ERROR_VARIABLE _NUMPY_ERROR_VALUE
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+if(NOT _NUMPY_SEARCH_SUCCESS MATCHES 0)
+    if(NumPy_FIND_REQUIRED)
+        message(FATAL_ERROR
+            "NumPy import failure:\n${_NUMPY_ERROR_VALUE}")
+    endif()
+    set(NUMPY_FOUND FALSE)
+    return()
+endif()
+
+# Convert the process output into a list
+string(REGEX REPLACE ";" "\\\\;" _NUMPY_VALUES ${_NUMPY_VALUES_OUTPUT})
+string(REGEX REPLACE "\n" ";" _NUMPY_VALUES ${_NUMPY_VALUES})
+list(GET _NUMPY_VALUES 0 NUMPY_VERSION)
+list(GET _NUMPY_VALUES 1 NUMPY_INCLUDE_DIRS)
+
+string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" _VER_CHECK "${NUMPY_VERSION}")
+if("${_VER_CHECK}" STREQUAL "")
+    # The output from Python was unexpected. Raise an error always
+    # here, because we found NumPy, but it appears to be corrupted somehow.
+    message(FATAL_ERROR
+        "Requested version and include path from NumPy, got instead:\n${_NUMPY_VALUES_OUTPUT}\n")
+    return()
+endif()
+
+# Make sure all directory separators are '/'
+string(REGEX REPLACE "\\\\" "/" NUMPY_INCLUDE_DIRS ${NUMPY_INCLUDE_DIRS})
+
+# Get the major and minor version numbers
+string(REGEX REPLACE "\\." ";" _NUMPY_VERSION_LIST ${NUMPY_VERSION})
+list(GET _NUMPY_VERSION_LIST 0 NUMPY_VERSION_MAJOR)
+list(GET _NUMPY_VERSION_LIST 1 NUMPY_VERSION_MINOR)
+list(GET _NUMPY_VERSION_LIST 2 NUMPY_VERSION_PATCH)
+string(REGEX MATCH "[0-9]*" NUMPY_VERSION_PATCH ${NUMPY_VERSION_PATCH})
+math(EXPR NUMPY_VERSION_DECIMAL
+    "(${NUMPY_VERSION_MAJOR} * 10000) + (${NUMPY_VERSION_MINOR} * 100) + ${NUMPY_VERSION_PATCH}")
+
+find_package_message(NUMPY
+    "Found NumPy: version \"${NUMPY_VERSION}\" ${NUMPY_INCLUDE_DIRS}"
+    "${NUMPY_INCLUDE_DIRS}${NUMPY_VERSION}")
+
+set(NUMPY_FOUND TRUE)
+
diff --git a/cmake/FindOctave.cmake b/cmake/FindOctave.cmake
new file mode 100644
index 0000000..52beab9
--- /dev/null
+++ b/cmake/FindOctave.cmake
@@ -0,0 +1,84 @@
+# Try to find the build flags to compile octave shared objects (oct and mex files)
+# Once done this will define
+#
+# OCTAVE_FOUND - if OCTAVE is found
+# OCTAVE_CXXFLAGS - extra flags
+# OCTAVE_INCLUDE_DIRS - include directories
+# OCTAVE_LINK_DIRS - link directories
+# OCTAVE_LIBRARY_RELEASE - the release version
+# OCTAVE_LIBRARY_DEBUG - the debug version
+# OCTAVE_LIBRARY - a default library, preferring the debug version
+
+# use mkoctfile
+set(MKOCTFILE_EXECUTABLE MKOCTFILE_EXECUTABLE-NOTFOUND)
+find_program(MKOCTFILE_EXECUTABLE NAME mkoctfile PATHS)
+mark_as_advanced(MKOCTFILE_EXECUTABLE)
+
+if(MKOCTFILE_EXECUTABLE)
+  set(OCTAVE_FOUND 1)
+
+  execute_process(
+    COMMAND ${MKOCTFILE_EXECUTABLE} -p ALL_CXXFLAGS
+    OUTPUT_VARIABLE _mkoctfile_cppflags
+    RESULT_VARIABLE _mkoctfile_failed)
+  string(REGEX REPLACE "[\r\n]" " " _mkoctfile_cppflags "${_mkoctfile_cppflags}")
+  execute_process(
+    COMMAND ${MKOCTFILE_EXECUTABLE} -p INCFLAGS
+    OUTPUT_VARIABLE _mkoctfile_includedir
+    RESULT_VARIABLE _mkoctfile_failed)
+  string(REGEX REPLACE "[\r\n]" " " _mkoctfile_includedir "${_mkoctfile_includedir}")
+  execute_process(
+    COMMAND ${MKOCTFILE_EXECUTABLE} -p ALL_LDFLAGS
+    OUTPUT_VARIABLE _mkoctfile_ldflags
+    RESULT_VARIABLE _mkoctfile_failed)
+  string(REGEX REPLACE "[\r\n]" " " _mkoctfile_ldflags "${_mkoctfile_ldflags}")
+  execute_process(
+    COMMAND ${MKOCTFILE_EXECUTABLE} -p LFLAGS
+    OUTPUT_VARIABLE _mkoctfile_lflags
+    RESULT_VARIABLE _mkoctfile_failed)
+  string(REGEX REPLACE "[\r\n]" " " _mkoctfile_lflags "${_mkoctfile_lflags}")
+  execute_process(
+    COMMAND ${MKOCTFILE_EXECUTABLE} -p LIBS
+    OUTPUT_VARIABLE _mkoctfile_libs
+    RESULT_VARIABLE _mkoctfile_failed)
+  string(REGEX REPLACE "[\r\n]" " " _mkoctfile_libs "${_mkoctfile_libs}")
+  execute_process(
+    COMMAND ${MKOCTFILE_EXECUTABLE} -p OCTAVE_LIBS
+    OUTPUT_VARIABLE _mkoctfile_octlibs
+    RESULT_VARIABLE _mkoctfile_failed)
+  string(REGEX REPLACE "[\r\n]" " " _mkoctfile_octlibs "${_mkoctfile_octlibs}")
+  set(_mkoctfile_libs "${_mkoctfile_libs} ${_mkoctfile_octlibs}")
+
+  string(REGEX MATCHALL "(^| )-l([./+-_\\a-zA-Z]*)" _mkoctfile_libs "${_mkoctfile_libs}")
+  string(REGEX REPLACE "(^| )-l" "" _mkoctfile_libs "${_mkoctfile_libs}")
+
+  string(REGEX MATCHALL "(^| )-L([./+-_\\a-zA-Z]*)" _mkoctfile_ldirs "${_mkoctfile_lflags}")
+  string(REGEX REPLACE "(^| )-L" "" _mkoctfile_ldirs "${_mkoctfile_ldirs}")
+
+  string(REGEX MATCHALL "(^| )-I([./+-_\\a-zA-Z]*)" _mkoctfile_includedir "${_mkoctfile_includedir}")
+  string(REGEX REPLACE "(^| )-I" "" _mkoctfile_includedir "${_mkoctfile_includedir}")
+
+  string(REGEX REPLACE "(^| )-l([./+-_\\a-zA-Z]*)" " " _mkoctfile_ldflags "${_mkoctfile_ldflags}")
+  string(REGEX REPLACE "(^| )-L([./+-_\\a-zA-Z]*)" " " _mkoctfile_ldflags "${_mkoctfile_ldflags}")
+
+  separate_arguments(_mkoctfile_includedir)
+
+  set( OCTAVE_CXXFLAGS "${_mkoctfile_cppflags}" )
+  set( OCTAVE_LINK_FLAGS "${_mkoctfile_ldflags}" )
+  set( OCTAVE_INCLUDE_DIRS ${_mkoctfile_includedir})
+  set( OCTAVE_LINK_DIRS ${_mkoctfile_ldirs})
+  set( OCTAVE_LIBRARY ${_mkoctfile_libs})
+  set( OCTAVE_LIBRARY_RELEASE ${OCTAVE_LIBRARY})
+  set( OCTAVE_LIBRARY_DEBUG ${OCTAVE_LIBRARY})
+endif(MKOCTFILE_EXECUTABLE)
+
+MARK_AS_ADVANCED(
+    OCTAVE_LIBRARY_FOUND
+    OCTAVE_CXXFLAGS
+    OCTAVE_LINK_FLAGS
+    OCTAVE_INCLUDE_DIRS
+    OCTAVE_LINK_DIRS
+    OCTAVE_LIBRARY
+    OCTAVE_LIBRARY_RELEASE
+    OCTAVE_LIBRARY_DEBUG
+)
diff --git a/cmake/InstallLinuxDependencies.cmake b/cmake/InstallLinuxDependencies.cmake
new file mode 100644
index 0000000..beffdaa
--- /dev/null
+++ b/cmake/InstallLinuxDependencies.cmake
@@ -0,0 +1,24 @@
+
+#install dependencies
+if (MKL_FOUND)
+    if (HAS_64_BIT)
+        set(MKL_REDIST_DIR ${MKLROOT_PATH}/mkl/lib/intel64)
+        set(MKL_COMPILER_REDIST_DIR ${MKLROOT_PATH}/lib/intel64)
+    else (HAS_64_BIT)
+        set(MKL_REDIST_DIR ${MKLROOT_PATH}/mkl/lib/ia32)
+        set(MKL_COMPILER_REDIST_DIR ${MKLROOT_PATH}/lib/ia32)
+    endif (HAS_64_BIT)
+
+    message("Install mkl libraries from ${MKL_REDIST_DIR} ")
+    FILE(GLOB MKL_DLL ${MKL_REDIST_DIR}/*.so)
+    foreach(fileName ${MKL_DLL})
+        message("Install ${fileName} ")
+        install( FILES ${fileName} DESTINATION lib COMPONENT main)
+    endforeach(fileName)
+
+    FILE(GLOB MKL_COMPILER_DLL ${MKL_COMPILER_REDIST_DIR}/libiomp5*.so)
+    foreach(fileName ${MKL_COMPILER_DLL})
+        message("Install ${fileName} ")
+        install( FILES ${fileName} DESTINATION lib COMPONENT main)
+    endforeach(fileName)
+endif (MKL_FOUND)
diff --git a/cmake/InstallWinDependencies.cmake b/cmake/InstallWinDependencies.cmake
new file mode 100644
index 0000000..fc16558
--- /dev/null
+++ b/cmake/InstallWinDependencies.cmake
@@ -0,0 +1,137 @@
+
+#install dependencies
+if (HDF5_FOUND)
+    if(DEFINED ENV{HDF5_ROOT})
+        set(HDF5_BIN_DIR $ENV{HDF5_ROOT}/bin)
+    else (DEFINED ENV{HDF5_ROOT})
+        set(HDF5_BIN_DIR ${HDF5_C_INCLUDE_DIR}/../bin)
+    endif (DEFINED ENV{HDF5_ROOT})
+    message("Install hdf5 libraries from ${HDF5_BIN_DIR} ")
+
+    FILE(GLOB HDF5_DLL ${HDF5_BIN_DIR}/*.dll)
+    foreach(fileName ${HDF5_DLL})
+        message("Install ${fileName} ")
+        install( FILES ${fileName} DESTINATION lib COMPONENT main)
+    endforeach(fileName)
+endif (HDF5_FOUND)
+
+if (MKL_FOUND)
+    if (HAS_64_BIT)
+        set(MKL_REDIST_DIR ${MKLROOT_PATH}/redist/intel64)
+    else (HAS_64_BIT)
+        set(MKL_REDIST_DIR ${MKLROOT_PATH}/redist/ia32)
+    endif (HAS_64_BIT)
+
+    message("Install mkl libraries from ${MKL_REDIST_DIR}/compiler ")
+    FILE(GLOB MKL_DLL ${MKL_REDIST_DIR}/compiler/*iomp5md*.dll)
+    foreach(fileName ${MKL_DLL})
+        message("Install ${fileName} ")
+        install( FILES ${fileName} DESTINATION lib COMPONENT main)
+    endforeach(fileName)
+endif (MKL_FOUND)
+
+if (ACE_FOUND)
+    message("Install ACE libraries from ${ACE_INCLUDE_DIR}/lib ")
+    FILE(GLOB ACE_DLL ${ACE_INCLUDE_DIR}/lib/ACE.dll)
+    foreach(fileName ${ACE_DLL})
+        message("Install ${ACE_DLL} ")
+        install( FILES ${ACE_DLL} DESTINATION lib COMPONENT main)
+    endforeach(fileName)
+endif (ACE_FOUND)
+
+if (ARMADILLO_FOUND)
+    message("Install ARMADILLO libraries from ${ARMADILLO_INCLUDE_DIRS}/../lib ")
+    FILE(GLOB ARMADILLO_DLL ${ARMADILLO_INCLUDE_DIRS}/../lib/*.dll)
+    foreach(fileName ${ARMADILLO_DLL})
+        message("Install ${fileName} ")
+        install( FILES ${fileName} DESTINATION lib COMPONENT main)
+    endforeach(fileName)
+endif (ARMADILLO_FOUND)
+
+if (FFTW3_FOUND)
+    message("Install FFTW3 libraries from ${FFTW3_INCLUDE_DIR} ")
+    FILE(GLOB FFTW3_DLL ${FFTW3_INCLUDE_DIR}/*.dll)
+    foreach(fileName ${FFTW3_DLL})
+        message("Install ${fileName} ")
+        install( FILES ${fileName} DESTINATION lib COMPONENT main)
+    endforeach(fileName)
+endif (FFTW3_FOUND)
+
+if (ISMRMRD_FOUND)
+    message("Install ISMRMRD libraries from ${ISMRMRD_INCLUDE_DIR}/../lib ")
+    FILE(GLOB ISMRMRD_DLL ${ISMRMRD_INCLUDE_DIR}/../lib/*.dll)
+    foreach(fileName ${ISMRMRD_DLL})
+        message("Install ${fileName} ")
+        install( FILES ${fileName} DESTINATION lib COMPONENT main)
+    endforeach(fileName)
+endif (ISMRMRD_FOUND)
+
+if (Boost_FOUND)
+    message("Install BOOST libraries from ${Boost_LIBRARY_DIR} ")
+    FILE(GLOB BOOST_CHRONO_DLL ${Boost_LIBRARY_DIR}/boost_chrono*.dll)
+    foreach(fileName ${BOOST_CHRONO_DLL})
+        message("Install ${fileName} ")
+        install( FILES ${fileName} DESTINATION lib COMPONENT main)
+    endforeach(fileName)
+
+    FILE(GLOB BOOST_DATE_TIME_DLL ${Boost_LIBRARY_DIR}/boost_date_time*.dll)
+    foreach(fileName ${BOOST_DATE_TIME_DLL})
+        message("Install ${fileName} ")
+        install( FILES ${fileName} DESTINATION lib COMPONENT main)
+    endforeach(fileName)
+
+    FILE(GLOB BOOST_PROGRAM_OPTIONS_DLL ${Boost_LIBRARY_DIR}/boost_program_options*.dll)
+    foreach(fileName ${BOOST_PROGRAM_OPTIONS_DLL})
+        message("Install ${fileName} ")
+        install( FILES ${fileName} DESTINATION lib COMPONENT main)
+    endforeach(fileName)
+
+    if (Boost_PYTHON_FOUND AND PYTHONLIBS_FOUND AND NUMPY_FOUND)
+        FILE(GLOB BOOST_PYTHON_DLL ${Boost_LIBRARY_DIR}/boost_python*.dll)
+        foreach(fileName ${BOOST_PYTHON_DLL})
+            message("Install ${fileName} ")
+            install( FILES ${fileName} DESTINATION lib COMPONENT main)
+        endforeach(fileName)
+    endif (Boost_PYTHON_FOUND AND PYTHONLIBS_FOUND AND NUMPY_FOUND)
+
+    FILE(GLOB BOOST_SYSTEM_DLL ${Boost_LIBRARY_DIR}/boost_system*.dll)
+    foreach(fileName ${BOOST_SYSTEM_DLL})
+        message("Install ${fileName} ")
+        install( FILES ${fileName} DESTINATION lib COMPONENT main)
+    endforeach(fileName)
+
+    FILE(GLOB BOOST_THREAD_DLL ${Boost_LIBRARY_DIR}/boost_thread*.dll)
+    foreach(fileName ${BOOST_THREAD_DLL})
+        message("Install ${fileName} ")
+        install( FILES ${fileName} DESTINATION lib COMPONENT main)
+    endforeach(fileName)
+
+    FILE(GLOB BOOST_FILESYSTEM_DLL ${Boost_LIBRARY_DIR}/boost_filesystem*.dll)
+    foreach(fileName ${BOOST_FILESYSTEM_DLL})
+        message("Install ${fileName} ")
+        install( FILES ${fileName} DESTINATION lib COMPONENT main)
+    endforeach(fileName)
+endif (Boost_FOUND)
+
+if (DCMTK_FOUND)
+    message("Install DCMTK libraries from ${DCMTK_DIR}/lib ")
+    FILE(GLOB DCMTK_DLL ${DCMTK_DIR}/lib/*.dll)
+    foreach(fileName ${DCMTK_DLL})
+        message("Install ${fileName} ")
+        install( FILES ${fileName} DESTINATION lib COMPONENT main)
+    endforeach(fileName)
+endif (DCMTK_FOUND)
+
+if (GTEST_FOUND)
+    message("Install GTEST libraries from ${DCMTK_DIR}/lib ")
+
+    get_filename_component(GTEST_LIB_NAME ${GTEST_LIBRARY} NAME_WE)
+    get_filename_component(GTEST_DLL ${GTEST_LIBRARY} DIRECTORY)
+    message("Install ${GTEST_DLL}/${GTEST_LIB_NAME}.dll ")
+    install( FILES ${GTEST_DLL}/${GTEST_LIB_NAME}.dll DESTINATION lib COMPONENT main)
+
+    get_filename_component(GTEST_MAIN_NAME ${GTEST_MAIN_LIBRARY} NAME_WE)
+    get_filename_component(GTEST_MAIN_DLL ${GTEST_MAIN_LIBRARY} DIRECTORY)
+    message("Install ${GTEST_MAIN_DLL}/${GTEST_MAIN_NAME}.dll ")
+    install( FILES ${GTEST_MAIN_DLL}/${GTEST_MAIN_NAME}.dll DESTINATION lib COMPONENT main)
+endif (GTEST_FOUND)
diff --git a/cmake/InstallWinGadgetron.bat b/cmake/InstallWinGadgetron.bat
new file mode 100644
index 0000000..2a34033
--- /dev/null
+++ b/cmake/InstallWinGadgetron.bat
@@ -0,0 +1,6 @@
+set INSTALL_DIR=%~dp0
+
+:: set the path of gadgetron
+setx PATH "%PATH%;%INSTALL_DIR%\..\lib;%INSTALL_DIR%\..\bin"
+:: copy the gadgetron.xml file
+copy /Y %INSTALL_DIR%\..\config\gadgetron.xml.example %INSTALL_DIR%\..\config\gadgetron.xml
\ No newline at end of file
diff --git a/cmake/cpack_options.cmake.in b/cmake/cpack_options.cmake.in
new file mode 100644
index 0000000..ea38887
--- /dev/null
+++ b/cmake/cpack_options.cmake.in
@@ -0,0 +1,34 @@
+################################################################################
+# Metadata for package generators
+################################################################################
+
+# Common options
+set(CPACK_PACKAGE_VERSION "@GADGETRON_VERSION_STRING@")
+set(CPACK_PACKAGE_VERSION_MAJOR "@GADGETRON_VERSION_MAJOR@")
+set(CPACK_PACKAGE_VERSION_MINOR "@GADGETRON_VERSION_MINOR@")
+set(CPACK_PACKAGE_VERSION_PATCH "@GADGETRON_VERSION_PATCH@")
+set(CPACK_PACKAGE_NAME "@PROJECT_NAME@")
+set(CPACK_PACKAGE_VENDOR "https://gadgetron.github.io/")
+set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Gadgetron framwork")
+set(CPACK_PACKAGE_INSTALL_DIRECTORY "@PROJECT_NAME_LOWER@")
+set(CPACK_RESOURCE_FILE_LICENSE "@CMAKE_CURRENT_SOURCE_DIR@/LICENSE")
+set(CPACK_PACKAGE_MAINTAINER "Michael S. Hansen <michael.hansen at nih.gov>")
+set(CPACK_PACKAGE_CONTACT "Michael S. Hansen <michael.hansen at nih.gov>")
+
+# DEB specific
+set(CPACK_DEBIAN_PACKAGE_SECTION "devel")
+set(CPACK_DEBIAN_PACKAGE_PRIORITY "optional")
+set(CPACK_DEBIAN_PACKAGE_DESCRIPTION "Implementation of the Gadgetron.")
+set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "@CMAKE_SOURCE_DIR@/cmake/debian/postinst;@CMAKE_SOURCE_DIR@/cmake/debian/prerm;" )
+
+# NSIS specific
+set(CPACK_NSIS_HELP_LINK "https://github.com/gadgetron/gadgetron")
+set(CPACK_NSIS_URL_INFO_ABOUT "https://github.com/gadgetron/gadgetron")
+set(CPACK_NSIS_MODIFY_PATH ON)
+set(CPACK_NSIS_DISPLAY_NAME "gadgetron")
+
+set(CPACK_NSIS_EXTRA_INSTALL_COMMANDS "ExecWait '$INSTDIR/cmake/InstallWinGadgetron.bat'")
+
+# Output filename of the generated tarball / package
+set(CPACK_PACKAGE_FILE_NAME "@PROJECT_NAME_LOWER at -@GADGETRON_VERSION_STRING@")
+set(CPACK_SOURCE_PACKAGE_FILE_NAME "@PROJECT_NAME_LOWER at -@GADGETRON_VERSION_STRING@")
diff --git a/cmake/cpack_options_dependency.cmake.in b/cmake/cpack_options_dependency.cmake.in
new file mode 100644
index 0000000..8f3ed87
--- /dev/null
+++ b/cmake/cpack_options_dependency.cmake.in
@@ -0,0 +1,37 @@
+################################################################################
+# Metadata for package generators
+################################################################################
+
+# Common options
+set(CPACK_PACKAGE_VERSION "@GADGETRON_VERSION_STRING@")
+set(CPACK_PACKAGE_VERSION_MAJOR "@GADGETRON_VERSION_MAJOR@")
+set(CPACK_PACKAGE_VERSION_MINOR "@GADGETRON_VERSION_MINOR@")
+set(CPACK_PACKAGE_VERSION_PATCH "@GADGETRON_VERSION_PATCH@")
+set(CPACK_PACKAGE_NAME "@PROJECT_NAME@")
+set(CPACK_PACKAGE_VENDOR "http://gadgetron.sourceforge.net/")
+set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Gadgetron framwork")
+set(CPACK_PACKAGE_INSTALL_DIRECTORY "@PROJECT_NAME_LOWER@")
+set(CPACK_RESOURCE_FILE_LICENSE "@CMAKE_CURRENT_SOURCE_DIR@/LICENSE")
+set(CPACK_PACKAGE_MAINTAINER "Michael S. Hansen <michael.hansen at nih.gov>")
+set(CPACK_PACKAGE_CONTACT "Michael S. Hansen <michael.hansen at nih.gov>")
+
+set(CPACK_ARCHIVE_COMPONENT_INSTALL ON)
+SET(CPACK_COMPONENTS_ALL_IN_ONE_PACKAGE ON)
+
+# DEB specific
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "@DEBIAN_PACKAGE_DEPENDS@")
+set(CPACK_DEBIAN_PACKAGE_SECTION "devel")
+set(CPACK_DEBIAN_PACKAGE_PRIORITY "optional")
+set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON)
+set(CPACK_DEBIAN_PACKAGE_DESCRIPTION "Dependencies of the Gadgetron.")
+set(CPACK_DEB_COMPONENT_INSTALL ON)
+
+# NSIS specific
+set(CPACK_NSIS_HELP_LINK "http:\\\\\\\\gadgetron.sourceforge.net")
+set(CPACK_NSIS_URL_INFO_ABOUT "http:\\\\\\\\gadgetron.sourceforge.net")
+set(CPACK_NSIS_MODIFY_PATH ON)
+set(CPACK_NSIS_DISPLAY_NAME "gadgetron")
+
+# Output filename of the generated tarball / package
+set(CPACK_PACKAGE_FILE_NAME "@PROJECT_NAME_LOWER@-@GADGETRON_VERSION_STRING@")
+set(CPACK_SOURCE_PACKAGE_FILE_NAME "@PROJECT_NAME_LOWER@-@GADGETRON_VERSION_STRING@")
diff --git a/cmake/cpack_options_web.cmake.in b/cmake/cpack_options_web.cmake.in
new file mode 100644
index 0000000..345c3b2
--- /dev/null
+++ b/cmake/cpack_options_web.cmake.in
@@ -0,0 +1,38 @@
+################################################################################
+# Metadata for package generators
+################################################################################
+
+# Common options
+set(CPACK_PACKAGE_VERSION "@GADGETRON_VERSION_STRING@")
+set(CPACK_PACKAGE_VERSION_MAJOR "@GADGETRON_VERSION_MAJOR@")
+set(CPACK_PACKAGE_VERSION_MINOR "@GADGETRON_VERSION_MINOR@")
+set(CPACK_PACKAGE_VERSION_PATCH "@GADGETRON_VERSION_PATCH@")
+set(CPACK_PACKAGE_NAME "@PROJECT_NAME@")
+set(CPACK_PACKAGE_VENDOR "http://gadgetron.sourceforge.net/")
+set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Gadgetron web service")
+set(CPACK_PACKAGE_INSTALL_DIRECTORY "@PROJECT_NAME_LOWER@")
+set(CPACK_RESOURCE_FILE_LICENSE "@CMAKE_CURRENT_SOURCE_DIR@/LICENSE")
+set(CPACK_PACKAGE_MAINTAINER "Michael S. Hansen <michael.hansen at nih.gov>")
+set(CPACK_PACKAGE_CONTACT "Michael S. Hansen <michael.hansen at nih.gov>")
+
+set(CPACK_ARCHIVE_COMPONENT_INSTALL ON)
+SET(CPACK_COMPONENTS_ALL_IN_ONE_PACKAGE ON)
+
+# DEB specific
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "@DEBIAN_PACKAGE_DEPENDS@")
+set(CPACK_DEBIAN_PACKAGE_SECTION "devel")
+set(CPACK_DEBIAN_PACKAGE_PRIORITY "optional")
+set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON)
+set(CPACK_DEBIAN_PACKAGE_DESCRIPTION "Implementation of the Gadgetron web server.")
+set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "@CMAKE_SOURCE_DIR@/cmake/debian_web/postinst;@CMAKE_SOURCE_DIR@/cmake/debian_web/prerm;")
+set(CPACK_DEB_COMPONENT_INSTALL ON)
+
+# NSIS specific
+set(CPACK_NSIS_HELP_LINK "http:\\\\\\\\gadgetron.sourceforge.net")
+set(CPACK_NSIS_URL_INFO_ABOUT "http:\\\\\\\\gadgetron.sourceforge.net")
+set(CPACK_NSIS_MODIFY_PATH ON)
+set(CPACK_NSIS_DISPLAY_NAME "gadgetron_web")
+
+# Output filename of the generated tarball / package
+set(CPACK_PACKAGE_FILE_NAME "@PROJECT_NAME_LOWER@-@GADGETRON_VERSION_STRING@")
+set(CPACK_SOURCE_PACKAGE_FILE_NAME "@PROJECT_NAME_LOWER@-@GADGETRON_VERSION_STRING@")
diff --git a/cmake/debian/postinst b/cmake/debian/postinst
new file mode 100644
index 0000000..5f80f31
--- /dev/null
+++ b/cmake/debian/postinst
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+# copy the gadgetron.xml
+sudo cp -i /usr/local/gadgetron/config/gadgetron.xml.example /usr/local/gadgetron/config/gadgetron.xml
+
+# create the symbolic link for gadgetron
+sudo rm -f /usr/local/bin/gadgetron
+sudo ln -s /usr/local/gadgetron/bin/gadgetron /usr/local/bin/gadgetron
+
+sudo rm -f /usr/local/bin/gadgetron_ismrmrd_client
+sudo ln -s /usr/local/gadgetron/bin/gadgetron_ismrmrd_client /usr/local/bin/gadgetron_ismrmrd_client
+
+sudo rm -f /usr/local/bin/gt_alive
+sudo ln -s /usr/local/gadgetron/bin/gt_alive /usr/local/bin/gt_alive
+
+# load library path
+sudo ldconfig
+
diff --git a/cmake/debian/prerm b/cmake/debian/prerm
new file mode 100644
index 0000000..eda091b
--- /dev/null
+++ b/cmake/debian/prerm
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# remove the gadgetron.xml
+sudo rm -f /usr/local/gadgetron/config/gadgetron.xml
+
+# remove the symbolic link for gadgetron
+sudo rm -f /usr/local/bin/gadgetron
+sudo rm -f /usr/local/bin/gadgetron_ismrmrd_client
+sudo rm -f /usr/local/bin/gt_alive
+
+# update library path
+sudo ldconfig
+
diff --git a/cmake/debian_web/postinst b/cmake/debian_web/postinst
new file mode 100644
index 0000000..6457e3d
--- /dev/null
+++ b/cmake/debian_web/postinst
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# add the user 'gadgetron'
+sudo adduser --no-create-home --disabled-password --gecos "" gadgetron
+
+# start the gadgetron service
+sudo service gadgetron_web start
diff --git a/cmake/debian_web/prerm b/cmake/debian_web/prerm
new file mode 100644
index 0000000..83cc215
--- /dev/null
+++ b/cmake/debian_web/prerm
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# stop the gadgetron service
+sudo service gadgetron_web stop
+
+# remove the user gadgetron
+sudo deluser gadgetron
diff --git a/cmake/gadgetron_cpack.cmake b/cmake/gadgetron_cpack.cmake
new file mode 100644
index 0000000..3ef303c
--- /dev/null
+++ b/cmake/gadgetron_cpack.cmake
@@ -0,0 +1,40 @@
+################################################################################
+# Find available package generators
+################################################################################
+
+if(UNIX)
+  # DEB
+  find_program(DPKG_PROGRAM dpkg)
+  if(EXISTS ${DPKG_PROGRAM})
+    list(APPEND CPACK_GENERATOR "DEB")
+  endif(EXISTS ${DPKG_PROGRAM})
+endif(UNIX)
+
+# Enable/Disable automatic search for dependencies:
+set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON)
+
+# Enable/Disable component install for CPack generator DEB
+set(CPACK_DEB_COMPONENT_INSTALL OFF)
+set(CPACK_DEB_PACKAGE_COMPONENT OFF)
+
+# Set dependencies explicitly
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "build-essential, ismrmrd, libfftw3-dev, python-dev, python-numpy, python-psutil, liblapack-dev, libxml2-dev, libxslt-dev, libarmadillo-dev, libace-dev, python-matplotlib, python-libxml2, python-h5py, libboost-all-dev, libhdf5-serial-dev, h5utils, hdf5-tools, libgtest-dev")
+
+# Where the package metadata are
+set(GADGETRON_CPACK_CFG_FILE "${PROJECT_BINARY_DIR}/cpack_options.cmake")
+
+# Where the package is to be installed
+set(CPACK_PACKAGING_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX})
+message("CPACK_PACKAGING_INSTALL_PREFIX: " ${CPACK_PACKAGING_INSTALL_PREFIX})
+
+if(WIN32)
+  # NSIS
+  list(APPEND CPACK_GENERATOR "NSIS")    
+endif(WIN32)
+
+list(APPEND CPACK_SOURCE_GENERATOR "TGZ")
+list(APPEND CPACK_SOURCE_GENERATOR "ZIP")
+list(APPEND CPACK_SOURCE_IGNORE_FILES ";.git;.gitignore;todo.txt;_clang-format;build/")
+
+# Install required system runtime libraries
+include(InstallRequiredSystemLibraries)
diff --git a/cmake/gadgetron_web_cpack.cmake b/cmake/gadgetron_web_cpack.cmake
new file mode 100644
index 0000000..63a7141
--- /dev/null
+++ b/cmake/gadgetron_web_cpack.cmake
@@ -0,0 +1,32 @@
+################################################################################
+# Find available package generators
+################################################################################
+
+if(UNIX)
+  # DEB
+  find_program(DPKG_PROGRAM dpkg)
+  if(EXISTS ${DPKG_PROGRAM})
+    list(APPEND CPACK_GENERATOR "DEB")
+  endif(EXISTS ${DPKG_PROGRAM})
+endif(UNIX)
+
+if(WIN32)
+    # NSIS
+    list(APPEND CPACK_GENERATOR "NSIS")    
+endif(WIN32)
+
+list(APPEND CPACK_SOURCE_GENERATOR "TGZ")
+list(APPEND CPACK_SOURCE_GENERATOR "ZIP")
+list(APPEND CPACK_SOURCE_IGNORE_FILES ";.git;.gitignore;todo.txt;_clang-format;build/")
+
+# set dependencies explicitly
+set(DEBIAN_PACKAGE_DEPENDS "gadgetron, python-psutil, python-twisted")
+
+# where the package metadata are
+set(GADGETRON_WEB_CPACK_CFG_FILE "${PROJECT_BINARY_DIR}/cpack_options_web.cmake")
+
+# where the package is to be installed
+# set(CPACK_PACKAGE_INSTALL_DIRECTORY ${CMAKE_INSTALL_PREFIX})
+if (NOT WIN32)
+    set(CPACK_PACKAGING_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX})
+endif (NOT WIN32)
diff --git a/doc/.gitignore b/doc/.gitignore
new file mode 100644
index 0000000..c8c6066
--- /dev/null
+++ b/doc/.gitignore
@@ -0,0 +1,7 @@
+death_row/figures/*.pdf
+*.gz
+*.log
+*.toc
+*.bbl
+*.blg
+*.aux
\ No newline at end of file
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
new file mode 100644
index 0000000..68949ec
--- /dev/null
+++ b/doc/CMakeLists.txt
@@ -0,0 +1,6 @@
+find_package(Doxygen)
+if(DOXYGEN_FOUND)
+	add_subdirectory(doxygen)
+else(DOXYGEN_FOUND)
+	MESSAGE("Doxygen not found. Will not be able to build documentation")
+endif(DOXYGEN_FOUND)
diff --git a/doc/doxygen/CMakeLists.txt b/doc/doxygen/CMakeLists.txt
new file mode 100644
index 0000000..caa1691
--- /dev/null
+++ b/doc/doxygen/CMakeLists.txt
@@ -0,0 +1,8 @@
+if(DOXYGEN_FOUND)
+	configure_file(${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile @ONLY)
+	add_custom_target(apidoc ${DOXYGEN_EXECUTABLE}
+	${CMAKE_CURRENT_BINARY_DIR}/Doxyfile WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+		COMMENT "Generating API documentation with Doxygen" VERBATIM)
+else(DOXYGEN_FOUND)
+	MESSAGE("Doxygen not found. Will not be able to build documentation")
+endif(DOXYGEN_FOUND)
\ No newline at end of file
diff --git a/doc/doxygen/Doxyfile.in b/doc/doxygen/Doxyfile.in
new file mode 100644
index 0000000..36f7a05
--- /dev/null
+++ b/doc/doxygen/Doxyfile.in
@@ -0,0 +1,1757 @@
+# Doxyfile 1.7.5.1
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a hash (#) is considered a comment and will be ignored.
+# The format is:
+#       TAG = value [value, ...]
+# For lists items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or sequence of words) that should
+# identify the project. Note that if you do not use Doxywizard you need
+# to put quotes around the project name if it contains spaces.
+
+PROJECT_NAME           = Gadgetron
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER         =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give the viewer
+# a quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          = "A Streaming Framework for Medical Image Reconstruction"
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is
+# included in the documentation. The maximum height of the logo should not
+# exceed 55 pixels and the maximum width should not exceed 200 pixels.
+# Doxygen will copy the logo to the output directory.
+
+PROJECT_LOGO           =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = doc
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF       =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES        = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE               = 8
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES                =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given extension.
+# Doxygen has a built-in mapping, but you can override or extend it using this
+# tag. The format is ext=language, where ext is a file extension, and language
+# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C,
+# C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, C++. For instance to make
+# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
+# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions
+# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen.
+
+EXTENSION_MAPPING      =
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); v.s.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and
+# unions are shown inside the group in which they are included (e.g. using
+# @ingroup) instead of on a separate page (for HTML and Man pages) or
+# section (for LaTeX and RTF).
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and
+# unions with only public data fields will be shown inline in the documentation
+# of the scope in which they are defined (i.e. file, namespace, or group
+# documentation), provided this scope is documented. If set to NO (the default),
+# structs, classes, and unions are shown on a separate page (for HTML and Man
+# pages) or section (for LaTeX and RTF).
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols
+
+SYMBOL_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC         = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES       = NO
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
+# will list include files with double quotes in the documentation
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
+# will sort the (brief and detailed) documentation of class members so that
+# constructors and destructors are listed first. If set to NO (the default)
+# the constructors will appear in the respective orders defined by
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to
+# do proper type resolution of all parameters of a function it will reject a
+# match between the prototype and the implementation of a member function even
+# if there is only one candidate or it is obvious which candidate to choose
+# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen
+# will still accept a match between prototype and implementation in such cases.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or macro consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and macros in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES        = YES
+
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES       = NO
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE            =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files
+# containing the references data. This must be a list of .bib files. The
+# .bib extension is automatically appended if omitted. Using this command
+# requires the bibtex tool to be installed. See also
+# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style
+# of the bibliography can be controlled using LATEX_BIB_STYLE.
+
+CITE_BIB_FILES         =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS               = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR      = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT                  = @CMAKE_CURRENT_SOURCE_DIR@/../../apps/ @CMAKE_CURRENT_SOURCE_DIR@/../../toolboxes/ @CMAKE_CURRENT_SOURCE_DIR@/../../gadgets/
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.for *.vhd *.vhdl
+
+FILE_PATTERNS          =
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE              = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# be excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+# Note that relative paths are relative to directory from which doxygen is run.
+
+EXCLUDE                =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH           =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS       =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH             =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+# If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.
+# Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.
+# The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty or if
+# none of the patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any)
+# and it is also possible to disable source filtering for a specific pattern
+# using *.ext= (so without naming a filter). This option only has effect when
+# FILTER_SOURCE_FILES is enabled.
+
+FILTER_SOURCE_PATTERNS =
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.
+# Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX     = YES
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header. Note that when using a custom header you are responsible
+#  for the proper inclusion of any scripts and style sheets that doxygen
+# needs, which is dependent on the configuration options used.
+# It is advised to generate a default header using "doxygen -w html
+# header.html footer.html stylesheet.css YourConfigFile" and then modify
+# that header. Note that the header is subject to change so you typically
+# have to redo this when upgrading to a newer version of doxygen or when
+# changing the value of configuration settings such as GENERATE_TREEVIEW!
+
+HTML_HEADER            =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER            =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET        =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that
+# the files will be copied as-is; there are no commands or markers available.
+
+HTML_EXTRA_FILES       =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output.
+# Doxygen will adjust the colors in the stylesheet and background images
+# according to this color. Hue is specified as an angle on a colorwheel,
+# see http://en.wikipedia.org/wiki/Hue for more information.
+# For instance the value 0 represents red, 60 is yellow, 120 is green,
+# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
+# The allowed range is 0 to 359.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
+# the colors in the HTML output. For a value of 0 the output will use
+# grayscales only. A value of 255 will produce the most vivid colors.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
+# the luminance component of the colors in the HTML output. Values below
+# 100 gradually make the output lighter, whereas values above 100 make
+# the output darker. The value divided by 100 is the actual gamma applied,
+# so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2,
+# and 100 does not change the gamma.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP         = YES
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS     = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox
+# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+
+GENERATE_DOCSET        = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP      = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE               =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION           =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI           = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING     =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated
+# that can be used as input for Qt's qhelpgenerator to generate a
+# Qt Compressed Help (.qch) of the generated HTML documentation.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
+# add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">
+# Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches.
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">
+# Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+# will be generated which, together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
+# top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it.
+
+DISABLE_INDEX          = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values
+# (range [0,1..20]) that doxygen will group on one line in the generated HTML
+# documentation. Note that a value of 0 will completely suppress the enum
+# values from appearing in the overview section.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+
+GENERATE_TREEVIEW      = NO
+
+# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories,
+# and Class Hierarchy pages using a tree view instead of an ordered list.
+
+USE_INLINE_TREES       = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH         = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open
+# links to external symbols imported via tag files in a separate window.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are
+# not supported properly for IE 6.0, but are supported on all modern browsers.
+# Note that when changing this option you need to delete any form_*.png files
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT    = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
+# (see http://www.mathjax.org) which uses client side Javascript for the
+# rendering instead of using prerendered bitmaps. Use this if you do not
+# have LaTeX installed or if you want the formulas to look prettier in the HTML
+# output. When enabled you also need to install MathJax separately and
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX            = NO
+
+# When MathJax is enabled you need to specify the location relative to the
+# HTML output directory using the MATHJAX_RELPATH option. The destination
+# directory should contain the MathJax.js script. For instance, if the mathjax
+# directory is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the
+# mathjax.org site, so you can quickly see the result without installing
+# MathJax, but it is strongly recommended to install a local copy of MathJax
+# before deployment.
+
+MATHJAX_RELPATH        = http://www.mathjax.org/mathjax
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax extension
+# names that should be enabled during MathJax rendering.
+
+MATHJAX_EXTENSIONS     =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the JavaScript-based search engine
+# can be slow; in that case, enabling SERVER_BASED_SEARCH may provide a better solution.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full-text search. The disadvantages are that it is more difficult to set up
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX         = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+# Note that when enabling USE_PDFLATEX this option is only used for
+# generating bitmaps for formulas in the HTML output, but not in the
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE             = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
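+# Illustrative example only (kept commented out and not enabled for this
+# project); amsmath is simply a common package name used to show the format:
+# EXTRA_PACKAGES       = amsmath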
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER           =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for
+# the generated latex document. The footer should contain everything after
+# the last chapter. If it is left blank doxygen will generate a
+# standard footer. Notice: only use this tag if you know what you are doing!
+
+LATEX_FOOTER           =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE        = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES     = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE      = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See
+# http://en.wikipedia.org/wiki/BibTeX for more info.
+
+LATEX_BIB_STYLE        = plain
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE    =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES Doxygen will
+# generate man pages.
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION          = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA             =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD                =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader. This is useful
+# if you want to understand what is going on. On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION        = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files
+# pointed to by INCLUDE_PATH will be searched when a #include is found.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
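+# Illustrative example only (kept commented out and not enabled for this
+# project), showing the name=definition form with one of the export macros
+# defined elsewhere in this source tree:
+# PREDEFINED           = EXPORTGADGETS_CARTESIAN=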
+
+PREDEFINED             =
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition that
+# overrules the definition found in the source code.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all references to function-like macros
+# that are alone on a line, have an all uppercase name, and do not end with a
+# semicolon, because these will confuse the parser if not removed.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+#
+# TAGFILES = file1 file2 ...
+# Adding a location for the tag files is done as follows:
+#
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path).
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS        = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option also works with HAVE_DOT disabled, but it is recommended to
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS         = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH            =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT               = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is
+# allowed to run in parallel. When set to 0 (the default) doxygen will
+# base this on the number of processors available in the system. You can set it
+# explicitly to a value larger than 0 to get control over the balance
+# between CPU load and processing speed.
+
+DOT_NUM_THREADS        = 0
+
+# By default doxygen will use the Helvetica font for all dot files that
+# doxygen generates. When you want a differently looking font you can specify
+# the font name using DOT_FONTNAME. You need to make sure dot is able to find
+# the font, which can be done by putting it in a standard location or by setting
+# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the
+# directory containing the font.
+
+DOT_FONTNAME           = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the Helvetica font.
+# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to
+# set the path where dot can find it.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class reference variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct group dependencies.
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK               = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH          = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are svg, png, jpg, or gif.
+# If left blank png will be used. If you choose svg you need to set
+# HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible in IE 9+ (other browsers do not have this requirement).
+
+DOT_IMAGE_FORMAT       = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+# Note that this requires a modern browser other than Internet Explorer.
+# Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you
+# need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible. Older versions of IE do not have SVG support.
+
+INTERACTIVE_SVG        = NO
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS           =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the
+# \mscfile command).
+
+MSCFILE_DIRS           =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES, then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lie further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP            = YES
diff --git a/doc/website/Gadgetron.png b/doc/website/Gadgetron.png
new file mode 100644
index 0000000..f39b052
Binary files /dev/null and b/doc/website/Gadgetron.png differ
diff --git a/doc/website/index.html b/doc/website/index.html
new file mode 100644
index 0000000..69ff5df
--- /dev/null
+++ b/doc/website/index.html
@@ -0,0 +1,146 @@
+<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
+<html>
+<head>
+<style type="text/css">
+div.main
+{
+width: 98%;
+margin: 1%;
+text-align: center;
+}
+
+table.maintable
+{
+margin-left:auto; 
+margin-right:auto;
+width: 600px;
+}
+
+td.logo
+{
+text-align: center;
+width: 100%;
+}
+
+body
+{
+font-family: Sans-serif;
+}
+
+div.maintext
+{
+  margin-top: 20px;
+  margin-bottom: 20px;
+}
+
+table.resources
+{
+margin-left:auto; 
+margin-right:auto;
+width: 400px;
+text-align: center;
+border: 1px solid black;
+}
+
+div.resources
+{
+  margin-top: 20px;
+  margin-bottom: 20px;
+  width: 100%;
+  text-align: center;
+}
+</style>
+
+<title>Gadgetron</title>
+</head>
+
+<body>
+<div class="main">
+<table class="maintable">
+<tr>
+<td class="logo"><img class="logoimg" src="Gadgetron.png"></td>
+</tr>
+
+<tr><td><hr></td></tr>
+<tr>
+  <td>
+    <div class="maintext">
+    The Gadgetron is an Open Source framework for medical image
+      reconstruction. It has been developed at the National Heart,
+      Lung, and Blood Institute, NIH, Bethesda, MD, USA
+      and at the Department of Computer Science
+      and Department of Clinical Medicine, Aarhus University, Denmark.
+      It is made freely available to the medical image
+      reconstruction community.
+    </div>
+
+    <div class="maintext">
+      <p>
+      The Magnetic Resonance in Medicine (MRM) <a href="http://onlinelibrary.wiley.com/doi/10.1002/mrm.24389/abstract">paper</a> on the Gadgetron
+      is now published. If you use the Gadgetron in a scientific publication, please cite:</p>
+      <p style="font-style: italic;">Hansen MS, Sørensen TS. Gadgetron: An Open Source Framework for Medical Image Reconstruction. Magn Reson Med. 2012.</p>
+    </div>
+
+    <div class="maintext">
+      Example demonstrations of the framework can be found on our
+      <a href="demo/index.html">demo page</a>. 
+    </div>
+      
+    <div class="maintext">
+      The main portal for access to source code,
+      documentation, discussion groups, etc. is the
+      Sourceforge.net website: <a href="http://sourceforge.net/p/gadgetron/">http://sourceforge.net/p/gadgetron/</a>.
+    </div>
+
+    <div class="maintext">
+      Source code can be found at: <a
+      href="http://sourceforge.net/projects/gadgetron/files/">http://sourceforge.net/projects/gadgetron/files/</a>.
+    </div>
+
+    <div class="maintext">
+    You can also check out the code from the git archive with:
+    <p style="font-family: monospace;">git clone git://git.code.sf.net/p/gadgetron/gadgetron</p>
+    </div>
+
+    <div class="resources">
+      <table class="resources">
+	<tr><td>Manual</td><td>API Documentation</td></tr>
+	<tr><td><a
+	  href="http://gadgetron.sourceforge.net/1.1alpha1/manual/gadgetron_manual.html">[v1.1alpha1]</a></td>
+	  <td><a
+	  href="http://gadgetron.sourceforge.net/1.1alpha1/api">[v1.1alpha1]</a></td></tr>
+	<tr><td><a
+	  href="http://gadgetron.sourceforge.net/1.0/manual/gadgetron_manual.html">[v1.0]</a></td>
+	  <td><a
+	  href="http://gadgetron.sourceforge.net/1.0/api">[v1.0]</a></td></tr>
+	<tr><td><a
+	  href="http://gadgetron.sourceforge.net/1.0alpha/manual/gadgetron_manual.html">[v1.0alpha]</a></td>
+	  <td><a
+	  href="http://gadgetron.sourceforge.net/1.0alpha/api">[v1.0alpha]</a></td></tr>	
+     </table>
+   </div>
+
+    <div class="maintext">
+      Questions and comments, please contact the authors:
+    </div>
+    <div class="maintext">  
+      Michael Schacht Hansen <a
+      href="mailto:michael.hansen at nih.gov">michael.hansen at nih.gov</a>
+      <br>
+      Thomas Sangild Sørensen
+      <a
+      href="mailto:sangild at cs.au.dk">sangild at cs.au.dk</a>
+    </div>
+    <div class="maintext">
+      Follow us on twitter <a href="http://www.twitter.com/ReconstructThis">@ReconstructThis</a>
+    </div>
+ </td>
+</tr>
+  
+<tr><td><hr></td></tr>
+
+</table>
+</div>
+
+</body>
+</html>
diff --git a/doc/windows_installation/GadgetronWindowsInstallation.ps1 b/doc/windows_installation/GadgetronWindowsInstallation.ps1
new file mode 100644
index 0000000..db1f308
Binary files /dev/null and b/doc/windows_installation/GadgetronWindowsInstallation.ps1 differ
diff --git a/gadgets/.gitignore b/gadgets/.gitignore
new file mode 100644
index 0000000..afaf431
--- /dev/null
+++ b/gadgets/.gitignore
@@ -0,0 +1 @@
+gputest/test
\ No newline at end of file
diff --git a/gadgets/CMakeLists.txt b/gadgets/CMakeLists.txt
new file mode 100644
index 0000000..9e21c9c
--- /dev/null
+++ b/gadgets/CMakeLists.txt
@@ -0,0 +1,91 @@
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETS__)
+  link_directories(${Boost_LIBRARY_DIRS})
+endif (WIN32)
+
+# These dependencies should have been found already to even get in here, but we play safe
+find_package(ACE)
+find_package(FFTW3 COMPONENTS single double)
+find_package(Ismrmrd)
+
+include_directories(
+  ${CMAKE_BINARY_DIR}/apps/gadgetron
+  ${ACE_INCLUDE_DIR} 
+  ${Boost_INCLUDE_DIR}
+  ${FFTW3_INCLUDE_DIR}
+  ${ISMRMRD_INCLUDE_DIR}
+  ${CMAKE_SOURCE_DIR}/apps/gadgetron 
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/algorithm
+  ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools
+  )
+
+add_subdirectory(mri_core)
+add_subdirectory(interventional_mri)
+add_subdirectory(util)
+
+if (ARMADILLO_FOUND)
+    add_subdirectory(epi)
+else (ARMADILLO_FOUND)
+    MESSAGE("Armadillo not found, NOT compiling EPI Gadgets")
+endif (ARMADILLO_FOUND)
+
+if (MKL_FOUND OR ARMADILLO_FOUND)
+    add_subdirectory(gtPlus)
+endif (MKL_FOUND OR ARMADILLO_FOUND)
+
+if (CUDA_FOUND)
+  message("Cuda found, compiling gpu accelerated gadgets")
+  add_subdirectory(pmri)
+  add_subdirectory(radial)
+  add_subdirectory(spiral)
+  add_subdirectory(hyper)
+  add_subdirectory(gpu)
+else (CUDA_FOUND)
+  message("Cuda NOT found, NOT compiling gpu accelerated gadgets")
+endif(CUDA_FOUND)
+
+add_subdirectory(grappa)
+
+if (MATLAB_FOUND)
+  message("Compiling MATLAB gadgets")
+  add_subdirectory(matlab)
+endif(MATLAB_FOUND)
+
+# nest the find_package checks to provide more useful error messages
+find_package(Boost COMPONENTS python system thread REQUIRED)
+
+if (Boost_PYTHON_FOUND AND PYTHONLIBS_FOUND AND NUMPY_FOUND)
+    MESSAGE("PYTHON_INCLUDE_DIRS: ${PYTHON_INCLUDE_DIRS}")
+    MESSAGE("PYTHON_LIBRARIES: ${PYTHON_LIBRARIES}")
+    MESSAGE("NUMPY_INCLUDE_DIRS: ${NUMPY_INCLUDE_DIRS}")
+    MESSAGE("Compiling Python Gadgets")
+    add_subdirectory(python)
+else (Boost_PYTHON_FOUND AND PYTHONLIBS_FOUND AND NUMPY_FOUND)
+    if(NOT PYTHONLIBS_FOUND)
+        MESSAGE("Python Libraries/Headers NOT found, NOT compiling Python Gadgets")
+    endif(NOT PYTHONLIBS_FOUND)
+    if(NOT NUMPY_FOUND)
+        MESSAGE("NumPy NOT found, NOT compiling Python Gadgets")
+    endif(NOT NUMPY_FOUND)
+    if(NOT Boost_PYTHON_FOUND)
+        MESSAGE("Boost Python NOT found, NOT compiling Python Gadgets")
+    endif(NOT Boost_PYTHON_FOUND)
+endif (Boost_PYTHON_FOUND AND PYTHONLIBS_FOUND AND NUMPY_FOUND)
+
+find_package(DCMTK)
+if(DCMTK_FOUND)
+  message("Compiling DICOM gadget")
+  add_subdirectory(dicom)
+else(DCMTK_FOUND)
+  message("DCMTK NOT found, not compiling DICOM gadget")
+endif(DCMTK_FOUND)
+
+add_subdirectory(cartesian)
+
+if(ARMADILLO_FOUND)
+    add_subdirectory(moco)
+endif(ARMADILLO_FOUND)
diff --git a/gadgets/cartesian/CMakeLists.txt b/gadgets/cartesian/CMakeLists.txt
new file mode 100644
index 0000000..6f896a9
--- /dev/null
+++ b/gadgets/cartesian/CMakeLists.txt
@@ -0,0 +1,34 @@
+IF (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_CARTESIAN__)
+ENDIF (WIN32)
+
+find_package(Ismrmrd REQUIRED)
+
+include_directories(
+  ${CMAKE_SOURCE_DIR}/gadgets/mri_core
+  )
+
+add_library(gadgetron_cartesian SHARED 
+  gadgetron_cartesian_export.h
+  CartesianToGenericGadget.h
+  CartesianToGenericGadget.cpp
+)
+
+set_target_properties(gadgetron_cartesian PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(gadgetron_cartesian gadgetron_toolbox_cpucore 
+  gadgetron_gadgetbase
+  gadgetron_toolbox_log
+  ${ISMRMRD_LIBRARIES}
+  optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY}
+  )
+
+if(ARMADILLO_FOUND)
+    target_link_libraries(gadgetron_cartesian gadgetron_toolbox_cpucore_math )
+endif(ARMADILLO_FOUND)
+
+install (FILES  gadgetron_cartesian_export.h
+                CartesianToGenericGadget.h
+                DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
+install (TARGETS gadgetron_cartesian DESTINATION lib COMPONENT main)
diff --git a/gadgets/cartesian/CartesianToGenericGadget.cpp b/gadgets/cartesian/CartesianToGenericGadget.cpp
new file mode 100644
index 0000000..86dcf9e
--- /dev/null
+++ b/gadgets/cartesian/CartesianToGenericGadget.cpp
@@ -0,0 +1,94 @@
+#include "CartesianToGenericGadget.h"
+#include "ismrmrd/xml.h"
+
+namespace Gadgetron{
+
+  CartesianToGenericGadget::CartesianToGenericGadget() 
+  {
+  }
+
+  CartesianToGenericGadget::~CartesianToGenericGadget() {}
+  
+  int CartesianToGenericGadget::process_config(ACE_Message_Block* mb)
+  {
+    // Get the Ismrmrd header
+    //
+    ISMRMRD::IsmrmrdHeader h;
+    ISMRMRD::deserialize(mb->rd_ptr(),h);
+    
+    
+    if (h.encoding.size() != 1) {
+      GDEBUG("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+    
+    // Get the encoding space and trajectory description
+    ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+    ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+    ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+
+    // Enforcement of the matrix size being a multiple of the "warp size"
+    warp_size_ = matrix_size_as_multiple_of.value();
+
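+    // Round each matrix dimension up to the nearest multiple of warp_size_
+    // (integer ceiling), as required by the gpu nfft mentioned in the header.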
+    matrix_size_.push_back( (e_space.matrixSize.x+warp_size_-1)/warp_size_*warp_size_);
+    matrix_size_.push_back( (e_space.matrixSize.y+warp_size_-1)/warp_size_*warp_size_);
+
+    center_phase_ = e_limits.kspace_encoding_step_1 ? e_limits.kspace_encoding_step_1->center : 0;
+
+    return GADGET_OK;
+  }
+
+  int CartesianToGenericGadget::
+  process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *m1,
+	  GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2)
+  {
+    // Noise should have been consumed by the noise adjust, but just in case...
+    //
+
+    bool is_noise = m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_IS_NOISE_MEASUREMENT);
+    if (is_noise) {
+      m1->release();
+      return GADGET_OK;
+    }
+
+    // Make a new array as continuation of m1, and pass along
+    //
+
+    size_t samples_per_readout = m1->getObjectPtr()->number_of_samples;
+    size_t center_sample = m1->getObjectPtr()->center_sample;
+    size_t offset_readout = (matrix_size_[0]>>1)-center_sample; // In case of partial Fourier
+    size_t offset_phase = (matrix_size_[1]>>1)-center_phase_; // In case of partial Fourier
+    size_t phase_encode_step = m1->getObjectPtr()->idx.kspace_encode_step_1;
+
+    std::vector<size_t> trajectory_dimensions;
+    trajectory_dimensions.push_back(3);
+    trajectory_dimensions.push_back(samples_per_readout);
+    
+    GadgetContainerMessage< hoNDArray<float> > *cont = new GadgetContainerMessage< hoNDArray<float> >();
+    cont->getObjectPtr()->create(&trajectory_dimensions);
+    m2->cont(cont);
+
+    float *traj_ptr = cont->getObjectPtr()->get_data_ptr();
+
+    for( size_t sample=0; sample<samples_per_readout; sample++ ){
+
+      // trajectory x (normalized to [-0.5;0.5])
+      traj_ptr[sample*3+0] = float(sample+offset_readout)/float(matrix_size_[0])-0.5f;
+
+      // trajectory y (normalized to [-0.5;0.5])
+      traj_ptr[sample*3+1] = float(phase_encode_step+offset_phase)/float(matrix_size_[1])-0.5f;
+
+      // dcw
+      traj_ptr[sample*3+2] = 1.0f;
+    }
+        
+    if (this->next()->putq(m1) < 0) {
+      GDEBUG("Failed to put job on queue.\n");
+      return GADGET_FAIL;
+    }
+    
+    return GADGET_OK;
+  }
+  
+  GADGET_FACTORY_DECLARE(CartesianToGenericGadget)
+}
diff --git a/gadgets/cartesian/CartesianToGenericGadget.h b/gadgets/cartesian/CartesianToGenericGadget.h
new file mode 100644
index 0000000..a3c3815
--- /dev/null
+++ b/gadgets/cartesian/CartesianToGenericGadget.h
@@ -0,0 +1,42 @@
+#ifndef CartesianToGenericGadget_H
+#define CartesianToGenericGadget_H
+#pragma once
+
+#include "gadgetron_cartesian_export.h"
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <vector>
+#include <complex>
+#include <boost/shared_ptr.hpp>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_CARTESIAN CartesianToGenericGadget :
+    public Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+  {
+
+  public:
+    GADGET_DECLARE(CartesianToGenericGadget);
+    CartesianToGenericGadget();
+    virtual ~CartesianToGenericGadget();
+
+  protected:
+    GADGET_PROPERTY(matrix_size_as_multiple_of, int, "Force the matrix size to be a multiple of", 1);
+
+    virtual int process_config(ACE_Message_Block* mb);    
+    virtual int process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1,
+			GadgetContainerMessage< hoNDArray< std::complex<float> > > * m2);
+    
+  private:
+    std::vector<unsigned int> matrix_size_;
+    unsigned short center_phase_;
+
+    // We can enforce the encoding space dimension 
+    // to be a multiple of the "warp size" (required for the gpu nfft)
+    unsigned int warp_size_; 
+  };
+}
+#endif //CartesianToGenericGadget_H
diff --git a/gadgets/cartesian/gadgetron_cartesian_export.h b/gadgets/cartesian/gadgetron_cartesian_export.h
new file mode 100644
index 0000000..9e4dce8
--- /dev/null
+++ b/gadgets/cartesian/gadgetron_cartesian_export.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#if defined (WIN32)
+#ifdef __BUILD_GADGETRON_CARTESIAN__
+#define EXPORTGADGETS_CARTESIAN __declspec(dllexport)
+#else
+#define EXPORTGADGETS_CARTESIAN __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETS_CARTESIAN
+#endif
diff --git a/gadgets/dicom/CMakeLists.txt b/gadgets/dicom/CMakeLists.txt
new file mode 100644
index 0000000..8bda3a2
--- /dev/null
+++ b/gadgets/dicom/CMakeLists.txt
@@ -0,0 +1,105 @@
+# DCMTK-necessary preprocessor flags
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_CONFIG_H -D_REENTRANT -D_OSF_SOURCE")
+
+set(Boost_NO_BOOST_CMAKE ON)
+find_package(Boost COMPONENTS date_time REQUIRED)
+
+if(WIN32)
+  link_directories(${Boost_LIBRARY_DIRS})
+endif(WIN32)
+
+if (WIN32)
+    set(GT_DICOM_LIBRARIES
+        #z
+        ${DCMTK_dcmdata_LIBRARY}
+        ${DCMTK_oflog_LIBRARY}
+        ${DCMTK_ofstd_LIBRARY}
+        #m
+        #rt
+        #nsl
+        #pthread
+        )
+else (WIN32)
+    set(GT_DICOM_LIBRARIES
+        z
+        ${DCMTK_dcmdata_LIBRARY}
+        ${DCMTK_oflog_LIBRARY}
+        ${DCMTK_ofstd_LIBRARY}
+        m
+        #rt
+        #nsl
+        pthread
+        )
+endif (WIN32)
+
+# sanity check:
+#message("DCMTK ${DCMTK_HOME}")
+#message("Include: ${DCMTK_INCLUDE_DIRS}")
+#message("Libraries: ${GT_DICOM_LIBRARIES}")
+
+include_directories(
+        ${CMAKE_SOURCE_DIR}/gadgets/mri_core    # for GadgetIsmrmrdReadWrite.h
+        ${DCMTK_INCLUDE_DIRS}
+        ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/core
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/algorithm
+        ${CMAKE_SOURCE_DIR}/toolboxes/operators
+        ${CMAKE_SOURCE_DIR}/toolboxes/operators/cpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+        ${CMAKE_SOURCE_DIR}/toolboxes/solvers/cpu
+        ${CMAKE_SOURCE_DIR}/gadgets/core
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+        ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow
+        ${CMAKE_SOURCE_DIR}/toolboxes/mri_core
+        ${CMAKE_SOURCE_DIR}/toolboxes/gtplus
+        ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/util
+        ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/workflow
+        ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/algorithm
+        ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/solver
+        ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools
+        ${CMAKE_SOURCE_DIR}/apps/gadgetron
+        ${CMAKE_SOURCE_DIR}/apps/matlab
+        ${CMAKE_SOURCE_DIR}/gadgets/mri_core 
+        ${CMAKE_SOURCE_DIR}/gadgets/gtPlus
+        ${ARMADILLO_INCLUDE_DIRS}
+        )
+
+add_library(gadgetron_dicom SHARED
+    DicomFinishGadget.h DicomFinishGadget.cpp
+    DicomImageWriter.h DicomImageWriter.cpp)
+
+set_target_properties(gadgetron_dicom PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(
+    gadgetron_dicom
+    gadgetron_gadgetbase
+    gadgetron_toolbox_log
+    gadgetron_toolbox_cpucore 
+    ${ISMRMRD_LIBRARIES}
+    optimized ${ACE_LIBRARIES}
+    debug ${ACE_DEBUG_LIBRARY}
+    ${Boost_LIBRARIES}
+    ${GT_DICOM_LIBRARIES} )
+
+if(ARMADILLO_FOUND)
+    target_link_libraries(gadgetron_dicom gadgetron_toolbox_cpucore_math )
+endif(ARMADILLO_FOUND)
+
+if(MKL_FOUND)
+    target_link_libraries(gadgetron_dicom gadgetron_toolbox_gtplus )
+endif(MKL_FOUND)
+
+install(
+    FILES DicomFinishGadget.h DicomImageWriter.h gadgetron_dicom_export.h
+    DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
+install(TARGETS gadgetron_dicom DESTINATION lib COMPONENT main)
+
+install(FILES dicom.xml DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH} COMPONENT main)
diff --git a/gadgets/dicom/DicomFinishGadget.cpp b/gadgets/dicom/DicomFinishGadget.cpp
new file mode 100644
index 0000000..f7e2a36
--- /dev/null
+++ b/gadgets/dicom/DicomFinishGadget.cpp
@@ -0,0 +1,701 @@
+#include <vector>
+#include "boost/date_time/gregorian/gregorian.hpp"
+
+#include "DicomFinishGadget.h"
+#include "ismrmrd/xml.h"
+
+namespace Gadgetron {
+
+    int DicomFinishGadget::process_config(ACE_Message_Block* mb)
+    {
+        OFCondition status;
+        DcmTagKey key;
+        long BUFSIZE = 1024;
+        char *buf = new char[BUFSIZE];  // used for writing numbers as strings in DCMTK
+
+        ISMRMRD::IsmrmrdHeader h;
+        deserialize(mb->rd_ptr(), h);
+
+        // Ensure DICOM dictionary is loaded
+        if (!dcmDataDict.isDictionaryLoaded()) {
+            GDEBUG("Dictionary not loaded!  Set DCMDICTPATH\n");
+            return GADGET_FAIL;
+        }
+
+        ISMRMRD::ExperimentalConditions exp_cond = h.experimentalConditions;
+
+        ISMRMRD::SubjectInformation patient_info = *h.subjectInformation;
+        if (!h.subjectInformation) {
+            GWARN("Header missing SubjectInformation parameters\n");
+
+            patient_info.patientName.set("XXXXXXXX");
+            patient_info.patientWeight_kg.set(1000);
+            patient_info.patientID.set("XXXXXXXX");
+            patient_info.patientBirthdate.set("19000101");
+            patient_info.patientGender.set("o");
+        }
+
+        ISMRMRD::StudyInformation study_info = *h.studyInformation;
+        if (!h.studyInformation) {
+            GWARN("Header missing StudyInformation parameters\n");
+
+            study_info.studyDate.set("19000101");
+            study_info.studyTime.set("121212");
+            study_info.studyID.set("XXXXXXXX");
+            study_info.accessionNumber.set(0);
+            study_info.referringPhysicianName.set("XXXXXXXX");
+            study_info.studyDescription.set("XXXXXXXX");
+            study_info.studyInstanceUID.set("XXXXXXXX");
+        }
+
+        if (!h.measurementInformation) {
+            GDEBUG("Header missing MeasurementInformation parameters\n");
+            return GADGET_FAIL;
+        }
+
+        ISMRMRD::MeasurementInformation meas_info = *h.measurementInformation;
+
+        if (!h.acquisitionSystemInformation) {
+            GDEBUG("Header missing AcquisitionSystemInformation parameters\n");
+            return GADGET_FAIL;
+        }
+
+        ISMRMRD::AcquisitionSystemInformation sys_info = *h.acquisitionSystemInformation;
+
+        if (!h.sequenceParameters) {
+            GDEBUG("Header missing SequenceTiming parameters\n");
+            return GADGET_FAIL;
+        }
+
+        ISMRMRD::SequenceParameters seq_info = *h.sequenceParameters;
+
+        if (h.encoding.size() == 0) {
+            GDEBUG("Number of encoding spaces: %d\n", h.encoding.size());
+            GDEBUG("This Gadget needs an encoding description\n");
+            return GADGET_FAIL;
+        }
+
+
+        ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+        ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+        ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+
+        DcmDataset *dataset = dcmFile.getDataset();
+        DcmMetaInfo *metainfo = dcmFile.getMetaInfo();
+
+        // Store initial Series Number for later
+        if (meas_info.initialSeriesNumber) {
+            this->initialSeriesNumber = (long)*meas_info.initialSeriesNumber;
+        }
+        else {
+            this->initialSeriesNumber = 0;
+        }
+
+
+        // Set the Application Entity Title in the DICOM Meta Info section
+        // The rest of the Meta Info will be automatically populated by DCMTK
+        if (sys_info.stationName) {
+            status = metainfo->putAndInsertString(DcmTagKey(0x0002, 0x0016),
+                sys_info.stationName->c_str());
+            if (!status.good()) {
+                GDEBUG("Failed to set AET in MetaInfo\n");
+                return GADGET_FAIL;
+            }
+        }
+        else {
+            status = metainfo->putAndInsertString(DcmTagKey(0x0002, 0x0016), "none");
+            if (!status.good()) {
+                GDEBUG("Failed to set AET in MetaInfo\n");
+                return GADGET_FAIL;
+            }
+        }
+
+        // Group Length
+        key.set(0x0008, 0x0000);
+        status = dataset->insertEmptyElement(key);
+        if (status.bad()) {
+            GDEBUG("Failed to write 0x0008 Group Length\n");
+            return GADGET_FAIL;
+        }
+
+        // Specific Character Set
+        key.set(0x0008, 0x0005);
+        WRITE_DCM_STRING(key, "ISO_IR 100");
+
+        // Image Type
+        // ORIGINAL or DERIVED describes origin of pixel data
+        // PRIMARY or SECONDARY describes image creation time (during or after exam)
+        // OTHER, etc. are implementation-specific
+        key.set(0x0008, 0x0008);
+        WRITE_DCM_STRING(key, "ORIGINAL\\PRIMARY\\OTHER");
+
+        // SOPClassUID
+        key.set(0x0008, 0x0016);
+        WRITE_DCM_STRING(key, UID_MRImageStorage);
+
+        // Study Date
+        if (study_info.studyDate) {
+            key.set(0x0008, 0x0020);
+            std::string d(study_info.studyDate.get());
+            d.erase(std::remove(d.begin(), d.end(), '-'), d.end());     // erase all occurrences of '-'
+            WRITE_DCM_STRING(key, d.c_str());
+        }
+
+        // Series, Acquisition, Content Date
+        if (meas_info.seriesDate) {
+            key.set(0x0008, 0x0021);
+            std::string d(meas_info.seriesDate.get());
+            d.erase(std::remove(d.begin(), d.end(), '-'), d.end());
+            WRITE_DCM_STRING(key, d.c_str());
+
+            key.set(0x0008, 0x0022);
+            WRITE_DCM_STRING(key, d.c_str());
+
+            key.set(0x0008, 0x0023);
+            WRITE_DCM_STRING(key, d.c_str());
+        }
+
+        // Study Time
+        if (study_info.studyTime) {
+            key.set(0x0008, 0x0030);
+            std::string t(study_info.studyTime.get());
+            t.erase(std::remove(t.begin(), t.end(), ':'), t.end());
+            WRITE_DCM_STRING(key, t.c_str());
+        }
+
+        // Series, Acquisition, Content Time
+        if (meas_info.seriesTime) {
+            key.set(0x0008, 0x0031);
+            std::string t(meas_info.seriesTime.get());
+            t.erase(std::remove(t.begin(), t.end(), ':'), t.end());
+            WRITE_DCM_STRING(key, t.c_str());
+
+            key.set(0x0008, 0x0032);
+            WRITE_DCM_STRING(key, t.c_str());
+
+            key.set(0x0008, 0x0033);
+            WRITE_DCM_STRING(key, t.c_str());
+        }
+
+        // Accession Number
+        key.set(0x0008, 0x0050);
+        if (study_info.accessionNumber) {
+            ACE_OS::snprintf(buf, BUFSIZE, "%ld", *study_info.accessionNumber);
+            WRITE_DCM_STRING(key, buf);
+        }
+        else {
+            WRITE_DCM_STRING(key, "0");     // placeholder string rather than a null pointer
+        }
+
+        // Modality
+        // TODO: this is hardcoded!!
+        key.set(0x0008, 0x0060);
+        WRITE_DCM_STRING(key, "MR");
+
+        // Manufacturer
+        key.set(0x0008, 0x0070);
+        if (sys_info.systemVendor) {
+            WRITE_DCM_STRING(key, sys_info.systemVendor->c_str());
+        }
+        else {
+            WRITE_DCM_STRING(key, "UNKNOWN");
+        }
+
+        // Institution Name
+        key.set(0x0008, 0x0080);
+        if (sys_info.institutionName) {
+            WRITE_DCM_STRING(key, sys_info.institutionName->c_str());
+        }
+        else {
+            WRITE_DCM_STRING(key, "UNKNOWN");
+        }
+
+        // Referring Physician's Name
+        key.set(0x0008, 0x0090);
+        if (study_info.referringPhysicianName) {
+            WRITE_DCM_STRING(key, study_info.referringPhysicianName->c_str());
+        }
+        else {
+            WRITE_DCM_STRING(key, "");
+        }
+
+        // Station Name
+        key.set(0x0008, 0x1010);
+        if (sys_info.stationName) {
+            WRITE_DCM_STRING(key, sys_info.stationName->c_str());
+        }
+        else {
+            WRITE_DCM_STRING(key, "");
+        }
+
+        // Study Description
+        key.set(0x0008, 0x1030);
+        if (study_info.studyDescription) {
+            WRITE_DCM_STRING(key, study_info.studyDescription->c_str());
+        }
+        else {
+            WRITE_DCM_STRING(key, "");
+        }
+
+        // Series Description
+        key.set(0x0008, 0x103E);
+        if (meas_info.seriesDescription) {
+            WRITE_DCM_STRING(key, meas_info.seriesDescription->c_str());
+        }
+        else {
+            WRITE_DCM_STRING(key, "");
+        }
+
+        // Manufacturer's Model Name
+        key.set(0x0008, 0x1090);
+        if (sys_info.systemModel) {
+            WRITE_DCM_STRING(key, sys_info.systemModel->c_str());
+        }
+        else {
+            WRITE_DCM_STRING(key, "");
+        }
+
+        // Referenced SOP Instance UIDs
+        std::vector<ISMRMRD::ReferencedImageSequence> refs(meas_info.referencedImageSequence);
+        if (refs.size() > 0) {
+            // Referenced Image Sequence (assumed intended tag; without this, 'key' would still
+            // point at the last element written above)
+            key.set(0x0008, 0x1140);
+            DcmItem *ref_sequence;
+            std::vector<ISMRMRD::ReferencedImageSequence>::iterator it;
+            for (it = refs.begin(); it != refs.end(); ++it) {
+                std::string ref_uid(it->referencedSOPInstanceUID);
+                if (ref_uid.length() > 0) {   // Only write non-empty strings
+                    if (dataset->findOrCreateSequenceItem(key, ref_sequence, -2).good()) {
+                        // Write the Referenced SOPClassUID (MRImageStorage)
+                        key.set(0x0008, 0x1150);
+                        ((DcmDataset *)ref_sequence)->putAndInsertString(key, UID_MRImageStorage);
+                        // Write the Referenced SOPInstanceUID
+                        key.set(0x0008, 0x1155);
+                        ((DcmDataset *)ref_sequence)->putAndInsertString(key, ref_uid.c_str());
+                    }
+                }
+            }
+        }
+
+        // Group Length
+        key.set(0x0010, 0x0000);
+        status = dataset->insertEmptyElement(key);
+        if (!status.good()) {
+            GDEBUG("Failed to write 0x0010 Group Length\n");
+            return GADGET_FAIL;
+        }
+
+        // Patient Name
+        key.set(0x0010, 0x0010);
+        if (patient_info.patientName) {
+            WRITE_DCM_STRING(key, patient_info.patientName->c_str());
+        }
+        else {
+            WRITE_DCM_STRING(key, "None");
+        }
+
+        // Patient ID
+        key.set(0x0010, 0x0020);
+        if (patient_info.patientID) {
+            WRITE_DCM_STRING(key, patient_info.patientID->c_str());
+        }
+        else {
+            WRITE_DCM_STRING(key, "0");
+        }
+
+        // Patient Birthdate
+        key.set(0x0010, 0x0030);
+        if (patient_info.patientBirthdate) {
+            std::string d(patient_info.patientBirthdate.get());
+            d.erase(std::remove(d.begin(), d.end(), '-'), d.end());
+            WRITE_DCM_STRING(key, d.c_str());
+        }
+        else {
+            status = dataset->insertEmptyElement(key);
+        }
+
+        // Patient Sex
+        key.set(0x0010, 0x0040);
+        if (patient_info.patientGender) {
+            if (*patient_info.patientGender == "O") {
+                status = dataset->insertEmptyElement(key);
+            }
+            else {
+                WRITE_DCM_STRING(key, patient_info.patientGender->c_str());
+            }
+        }
+        else {
+            WRITE_DCM_STRING(key, "");
+        }
+
+        // Patient Age
+        key.set(0x0010, 0x1010);
+        if (patient_info.patientBirthdate && meas_info.seriesDate) {
+            boost::gregorian::date bday(boost::gregorian::from_simple_string(patient_info.patientBirthdate.get()));
+            boost::gregorian::date seriesDate(boost::gregorian::from_simple_string(meas_info.seriesDate.get()));
+
+            boost::gregorian::days age = seriesDate - bday;
+
+            long age_in_years = age.days() / 365;
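+            // Integer division by 365 yields the age in whole years (leap days ignored).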
+
+            ACE_OS::snprintf(buf, BUFSIZE, "%03ldY", age_in_years);
+            WRITE_DCM_STRING(key, buf);
+        }
+        else {
+            WRITE_DCM_STRING(key, "000Y");
+        }
+
+        // Patient Weight
+        key.set(0x0010, 0x1030);
+        if (patient_info.patientWeight_kg) {
+            ACE_OS::snprintf(buf, BUFSIZE, "%f", *patient_info.patientWeight_kg);
+            WRITE_DCM_STRING(key, buf);
+        }
+        else {
+            WRITE_DCM_STRING(key, "0.0");
+        }
+
+        // Group Length
+        key.set(0x0018, 0x0000);
+        status = dataset->insertEmptyElement(key);
+        if (!status.good()) {
+            GDEBUG("Failed to write 0x0018 Group Length\n");
+            return GADGET_FAIL;
+        }
+
+        // Scanning Sequence, Sequence Variant, Scan Options, Acquisition Type
+        std::string scanningSequence("RM");
+        std::string sequenceVariant("NONE");
+        std::string scanOptions("NONE");
+        std::string mrAcquisitionType("2D");
+        if (h.userParameters) {
+            ISMRMRD::UserParameters user_params = h.userParameters.get();
+            std::vector<ISMRMRD::UserParameterString> strings = user_params.userParameterString;
+            std::vector<ISMRMRD::UserParameterString>::iterator it;
+
+            for (it = strings.begin(); it != strings.end(); ++it) {
+                if (it->name == "scanningSequence") {
+                    scanningSequence = it->value;
+                }
+                else if (it->name == "sequenceVariant") {
+                    sequenceVariant = it->value;
+                }
+                else if (it->name == "scanOptions") {
+                    scanOptions = it->value;
+                }
+                else if (it->name == "mrAcquisitionType") {
+                    mrAcquisitionType = it->value;
+                }
+            }
+        }
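+        // These four values may be supplied through the ISMRMRD user parameters, e.g. (illustrative only):
+        //   <userParameters>
+        //     <userParameterString><name>scanningSequence</name><value>GR</value></userParameterString>
+        //     <userParameterString><name>mrAcquisitionType</name><value>3D</value></userParameterString>
+        //   </userParameters>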
+        key.set(0x0018, 0x0020);
+        WRITE_DCM_STRING(key, scanningSequence.c_str());
+        key.set(0x0018, 0x0021);
+        WRITE_DCM_STRING(key, sequenceVariant.c_str());
+        key.set(0x0018, 0x0022);
+        WRITE_DCM_STRING(key, scanOptions.c_str());
+        key.set(0x0018, 0x0023);
+        WRITE_DCM_STRING(key, mrAcquisitionType.c_str());
+
+        // Angio Flag
+        // TODO: hardcoded
+        key.set(0x0018, 0x0025);
+        WRITE_DCM_STRING(key, "N");
+
+        // Slice Thickness
+        // This will need to be updated if the "reconSpace.fieldOfView_mm.z" field
+        // is changed in the ISMRMRD populating code (client)
+        key.set(0x0018, 0x0050);
+        ACE_OS::snprintf(buf, BUFSIZE, "%f", r_space.fieldOfView_mm.z / std::max(r_space.matrixSize.z, (unsigned short)1));
+        WRITE_DCM_STRING(key, buf);
+
+        // Spacing Between Slices
+        key.set(0x0018, 0x0088);
+        ACE_OS::snprintf(buf, BUFSIZE, "%f", r_space.fieldOfView_mm.z);
+        WRITE_DCM_STRING(key, buf);
+
+        // Repetition Time
+        key.set(0x0018, 0x0080);
+        ACE_OS::snprintf(buf, BUFSIZE, "%f", seq_info.TR.front());
+        WRITE_DCM_STRING(key, buf);
+
+        // Echo Time
+        key.set(0x0018, 0x0081);
+        ACE_OS::snprintf(buf, BUFSIZE, "%f", seq_info.TE.front());
+        WRITE_DCM_STRING(key, buf);
+
+        // Inversion Time
+        if (seq_info.TI.size()>0)
+        {
+            key.set(0x0018, 0x0082);
+            ACE_OS::snprintf(buf, BUFSIZE, "%f", seq_info.TI.front());
+            WRITE_DCM_STRING(key, buf);
+        }
+
+        // Flip Angle
+        if (seq_info.flipAngle_deg.size()>0)
+        {
+            key.set(0x0018, 0x1314);
+            ACE_OS::snprintf(buf, BUFSIZE, "%ld", (long)seq_info.flipAngle_deg.front());
+            WRITE_DCM_STRING(key, buf);
+        }
+
+        // Imaging Frequency: written as the H1 resonance frequency in Hz divided by 1e7
+        // (DICOM tag (0018,0084) nominally expects MHz)
+        key.set(0x0018, 0x0084);
+        ACE_OS::snprintf(buf, BUFSIZE, "%f", (float)exp_cond.H1resonanceFrequency_Hz / 10000000.);
+        WRITE_DCM_STRING(key, buf);
+
+        // Magnetic Field Strength (T)
+        key.set(0x0018, 0x0087);
+        if (sys_info.systemFieldStrength_T) {
+            ACE_OS::snprintf(buf, BUFSIZE, "%f", *sys_info.systemFieldStrength_T);
+            WRITE_DCM_STRING(key, buf);
+        }
+        else {
+            WRITE_DCM_STRING(key, "3.0");
+        }
+
+
+        // Echo Train Length
+        // set the tag before branching so the default value is written to the correct element
+        key.set(0x0018, 0x0091);
+        if (h.encoding[0].echoTrainLength) {
+            ACE_OS::snprintf(buf, BUFSIZE, "%ld", (long)*h.encoding[0].echoTrainLength);
+            WRITE_DCM_STRING(key, buf);
+        }
+        else {
+            WRITE_DCM_STRING(key, "1");
+        }
+
+        // Percent Sampling
+        // TODO: hardcoded
+        key.set(0x0018, 0x0093);
+        WRITE_DCM_STRING(key, "100");
+
+        // Percent Phase FOV
+        // TODO: hardcoded
+        key.set(0x0018, 0x0094);
+        WRITE_DCM_STRING(key, "100");
+
+        // Protocol Name
+        key.set(0x0018, 0x1030);
+        if (meas_info.protocolName) {
+            WRITE_DCM_STRING(key, meas_info.protocolName.get().c_str());
+        }
+        else {
+            WRITE_DCM_STRING(key, "");
+        }
+
+        // Trigger Time - TODO: use Image Meta Data
+        key.set(0x0018, 0x1060);
+        WRITE_DCM_STRING(key, "0.0");
+
+        // Reconstruction Diameter (FOV) - TODO: the tag is addressed but no value is written yet
+        key.set(0x0018, 0x1100);
+
+        // Frequency Encoding Direction - TODO: use Image Meta Data
+        key.set(0x0018, 0x1312);
+        WRITE_DCM_STRING(key, "ROW");
+
+        // Patient Position
+        key.set(0x0018, 0x5100);
+        WRITE_DCM_STRING(key, meas_info.patientPosition.c_str());
+
+        /****************************************/
+        // Group Length
+        key.set(0x0020, 0x0000);
+        status = dataset->insertEmptyElement(key);
+        if (!status.good()) {
+            GDEBUG("Failed to write 0x0020 Group Length\n");
+            return GADGET_FAIL;
+        }
+
+        // Study Instance UID
+        key.set(0x0020, 0x000D);
+        if (study_info.studyInstanceUID) {
+            WRITE_DCM_STRING(key, study_info.studyInstanceUID->c_str());
+        }
+
+        // Study ID
+        key.set(0x0020, 0x0010);
+        if (study_info.studyID) {
+            WRITE_DCM_STRING(key, study_info.studyID->c_str());
+        }
+        else {
+            WRITE_DCM_STRING(key, "0");
+        }
+
+        // Store Series Instance UID for later
+        if (meas_info.seriesInstanceUIDRoot) {
+            seriesIUIDRoot = *meas_info.seriesInstanceUIDRoot;
+        }
+
+        // Frame of Reference UID
+        if (meas_info.frameOfReferenceUID) {
+            key.set(0x0020, 0x0052);
+            WRITE_DCM_STRING(key, meas_info.frameOfReferenceUID->c_str());
+        }
+
+        /****************************************/
+        // Group Length
+        key.set(0x0028, 0x0000);
+        status = dataset->insertEmptyElement(key);
+        if (!status.good()) {
+            GDEBUG("Failed to write 0x0028 Group Length\n");
+            return GADGET_FAIL;
+        }
+
+        // Samples Per Pixel
+        key.set(0x0028, 0x0002);
+        // TODO: hardcoded
+        WRITE_DCM_STRING(key, "1");
+
+        // Photometric Interpretation
+        key.set(0x0028, 0x0004);
+        // TODO: hardcoded
+        WRITE_DCM_STRING(key, "MONOCHROME2");
+
+        // Pixel Spacing (Array of len 2)
+        key.set(0x0028, 0x0030);
+        float pixel_spacing_X = r_space.fieldOfView_mm.x / r_space.matrixSize.x;
+        float pixel_spacing_Y = r_space.fieldOfView_mm.y / r_space.matrixSize.y;
+        ACE_OS::snprintf(buf, BUFSIZE, "%.3f\\%.3f", pixel_spacing_X, pixel_spacing_Y);
+        WRITE_DCM_STRING(key, buf);
+
+        // Bits Allocated
+        key.set(0x0028, 0x0100);
+        WRITE_DCM_STRING(key, "16");
+        // Bits Stored
+        key.set(0x0028, 0x0101);
+        WRITE_DCM_STRING(key, "16");
+        // High Bit
+        key.set(0x0028, 0x0102);
+        WRITE_DCM_STRING(key, "15");
+        // Pixel Representation
+        key.set(0x0028, 0x0103);
+        WRITE_DCM_STRING(key, "1");
+
+        //GDEBUG("Finished populating DICOM fields\n");
+
+        /* clean up the buffer we created for ACE_OS::snprintf */
+        delete[] buf;
+
+        return GADGET_OK;
+    }
+
+    int DicomFinishGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1)
+    {
+        if (!this->controller_) {
+            GERROR("Cannot return result to controller, no controller set");
+            return GADGET_FAIL;
+        }
+
+        // --------------------------------------------------
+        ISMRMRD::ImageHeader *img = m1->getObjectPtr();
+
+        uint16_t data_type = img->data_type;
+
+        if (data_type == ISMRMRD::ISMRMRD_USHORT)
+        {
+            GadgetContainerMessage< hoNDArray< unsigned short > >* datamb = AsContainerMessage< hoNDArray< unsigned short > >(m1->cont());
+            if (!datamb)
+            {
+                GERROR("DicomFinishGadget::process, invalid image message objects\n");
+                return GADGET_FAIL;
+            }
+
+            if (this->write_data_attrib(m1, datamb) != GADGET_OK)
+            {
+                GERROR("DicomFinishGadget::write_data_attrib failed for unsigned short ... \n");
+                return GADGET_FAIL;
+            }
+        }
+        else if (data_type == ISMRMRD::ISMRMRD_SHORT)
+        {
+            GadgetContainerMessage< hoNDArray< short > >* datamb = AsContainerMessage< hoNDArray< short > >(m1->cont());
+            if (!datamb)
+            {
+                GERROR("DicomFinishGadget::process, invalid image message objects\n");
+                return GADGET_FAIL;
+            }
+
+            if (this->write_data_attrib(m1, datamb) != GADGET_OK)
+            {
+                GERROR("DicomFinishGadget::write_data_attrib failed for short ... \n");
+                return GADGET_FAIL;
+            }
+        }
+        else if (data_type == ISMRMRD::ISMRMRD_UINT)
+        {
+            GadgetContainerMessage< hoNDArray< unsigned int > >* datamb = AsContainerMessage< hoNDArray< unsigned int > >(m1->cont());
+            if (!datamb)
+            {
+                GERROR("DicomFinishGadget::process, invalid image message objects\n");
+                return GADGET_FAIL;
+            }
+
+            if (this->write_data_attrib(m1, datamb) != GADGET_OK)
+            {
+                GERROR("DicomFinishGadget::write_data_attrib failed for unsigned int ... \n");
+                return GADGET_FAIL;
+            }
+        }
+        else if (data_type == ISMRMRD::ISMRMRD_INT)
+        {
+            GadgetContainerMessage< hoNDArray< int > >* datamb = AsContainerMessage< hoNDArray< int > >(m1->cont());
+            if (!datamb)
+            {
+                GERROR("DicomFinishGadget::process, invalid image message objects\n");
+                return GADGET_FAIL;
+            }
+
+            if (this->write_data_attrib(m1, datamb) != GADGET_OK)
+            {
+                GERROR("DicomFinishGadget::write_data_attrib failed for int ... \n");
+                return GADGET_FAIL;
+            }
+        }
+        else if (data_type == ISMRMRD::ISMRMRD_FLOAT)
+        {
+            GadgetContainerMessage< hoNDArray< float > >* datamb = AsContainerMessage< hoNDArray< float > >(m1->cont());
+            if (!datamb)
+            {
+                GERROR("DicomFinishGadget::process, invalid image message objects\n");
+                return GADGET_FAIL;
+            }
+
+            if (this->write_data_attrib(m1, datamb) != GADGET_OK)
+            {
+                GERROR("DicomFinishGadget::write_data_attrib failed for float ... \n");
+                return GADGET_FAIL;
+            }
+        }
+        else if (data_type == ISMRMRD::ISMRMRD_DOUBLE)
+        {
+            GadgetContainerMessage< hoNDArray< double > >* datamb = AsContainerMessage< hoNDArray< double > >(m1->cont());
+            if (!datamb)
+            {
+                GERROR("DicomFinishGadget::process, invalid image message objects\n");
+                return GADGET_FAIL;
+            }
+
+            if (this->write_data_attrib(m1, datamb) != GADGET_OK)
+            {
+                GERROR("DicomFinishGadget::write_data_attrib failed for double ... \n");
+                return GADGET_FAIL;
+            }
+        }
+        else if (data_type == ISMRMRD::ISMRMRD_CXFLOAT)
+        {
+            GERROR("DicomFinishGadget::process, does not supprot ISMRMRD_CXFLOAT data type\n");
+            return GADGET_FAIL;
+        }
+        else if (data_type == ISMRMRD::ISMRMRD_CXDOUBLE)
+        {
+            GERROR("DicomFinishGadget::process, does not supprot ISMRMRD_CXDOUBLE data type\n");
+            return GADGET_FAIL;
+        }
+
+        return GADGET_OK;
+    }
+
+    GADGET_FACTORY_DECLARE(DicomFinishGadget)
+
+} /* namespace Gadgetron */
diff --git a/gadgets/dicom/DicomFinishGadget.h b/gadgets/dicom/DicomFinishGadget.h
new file mode 100644
index 0000000..7e70a01
--- /dev/null
+++ b/gadgets/dicom/DicomFinishGadget.h
@@ -0,0 +1,414 @@
+/** \file   DicomFinishGadget.h
+\brief      Assemble the DICOM images and send them out
+
+The DICOM image is sent out as: message id -> DICOM image -> DICOM image name -> meta attributes
+\author     Hui Xue
+*/
+
+#ifndef DICOMFINISHGADGET_H
+#define DICOMFINISHGADGET_H
+
+#include "gadgetron_dicom_export.h"
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "ismrmrd/meta.h"
+#include "GadgetMRIHeaders.h"
+#include "ismrmrd/ismrmrd.h"
+#include "GadgetStreamController.h"
+
+#include "dcmtk/config/osconfig.h"
+#include "dcmtk/ofstd/ofstdinc.h"
+#define INCLUDE_CSTDLIB
+#define INCLUDE_CSTDIO
+#define INCLUDE_CSTRING
+#include "dcmtk/dcmdata/dctk.h"
+#include "dcmtk/dcmdata/dcostrmb.h"
+
+#include "mri_core_def.h"
+
+#include <string>
+#include <map>
+#include <complex>
+
+namespace Gadgetron
+{
+
+    // Used for windowing using short ints
+#define PIX_RANGE_MAX    (+32767)
+#define PIX_RANGE_MIN    (-32768)
+
+
+    // Writes a DICOM string value at the given location in the header
+    // Saves keystrokes
+#define WRITE_DCM_STRING(k, s)                                              \
+    do {                                                                    \
+        status = dataset->putAndInsertString(k, s);                         \
+        if (!status.good()) {                                               \
+            GDEBUG("Failed to insert DICOM field (0x%04X,0x%04X) at "       \
+                "line %u\n", k.getGroup(), k.getElement(), __LINE__);       \
+            return GADGET_FAIL;                                             \
+        }                                                                   \
+    } while (0)
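+
+    // Illustrative usage (inside a member function that has a DcmDataset* 'dataset', an
+    // OFCondition 'status', and returns a Gadget status code):
+    //     DcmTagKey key;
+    //     key.set(0x0008, 0x0060);      // Modality
+    //     WRITE_DCM_STRING(key, "MR");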
+
+    class EXPORTGADGETSDICOM DicomFinishGadget : public Gadget1< ISMRMRD::ImageHeader >
+    {
+    public:
+
+        typedef Gadget1<ISMRMRD::ImageHeader> BaseClass;
+
+        GADGET_DECLARE(DicomFinishGadget);
+
+        DicomFinishGadget()
+            : BaseClass()
+            , dcmFile()
+            , initialSeriesNumber(0)
+            , seriesIUIDRoot()
+        { }
+
+    protected:
+
+        virtual int process_config(ACE_Message_Block * mb);
+        virtual int process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1);
+
+        template <typename T>
+        int write_data_attrib(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1, GadgetContainerMessage< hoNDArray< T > >* m2)
+        {
+
+            GadgetContainerMessage< ISMRMRD::MetaContainer >* m3 = AsContainerMessage< ISMRMRD::MetaContainer >(m2->cont());
+
+            std::string filename;
+
+            if (m3)
+            {
+                ISMRMRD::MetaContainer* img_attrib = m3->getObjectPtr();
+
+                size_t n;
+
+                size_t num = img_attrib->length(GADGETRON_DATA_ROLE);
+
+                std::vector<std::string> dataRole;
+                if (num == 0)
+                {
+                    dataRole.push_back("Image");
+                }
+                else
+                {
+                    dataRole.resize(num);
+                    for (n = 0; n < num; n++)
+                    {
+                        dataRole[n] = std::string(img_attrib->as_str(GADGETRON_DATA_ROLE, n));
+                    }
+                }
+
+                long imageNumber = img_attrib->as_long(GADGETRON_IMAGENUMBER, 0);
+
+                long slc, con, phs, rep, set, ave;
+                slc = m1->getObjectPtr()->slice;
+                con = m1->getObjectPtr()->contrast;
+                phs = m1->getObjectPtr()->phase;
+                rep = m1->getObjectPtr()->repetition;
+                set = m1->getObjectPtr()->set;
+                ave = m1->getObjectPtr()->average;
+
+                std::ostringstream ostr;
+
+                for (n = 0; n < dataRole.size(); n++)
+                {
+                    ostr << dataRole[n] << "_";
+                }
+
+                ostr << "SLC" << slc << "_"
+                    << "CON" << con << "_"
+                    << "PHS" << phs << "_"
+                    << "REP" << rep << "_"
+                    << "SET" << set << "_"
+                    << "AVE" << ave << "_"
+                    << imageNumber;
+
+                filename = ostr.str();
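+                // e.g. "Image_SLC0_CON0_PHS0_REP0_SET0_AVE0_1" for a single "Image" data role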
+            }
+            else
+            {
+                std::ostringstream ostr;
+                ostr << "Image_" << m1->getObjectPtr()->image_index << "_" << m1->getObjectPtr()->image_series_index;
+                filename = ostr.str();
+            }
+
+            GadgetContainerMessage<std::string>* mfilename = new GadgetContainerMessage<std::string>();
+            *(mfilename->getObjectPtr()) = filename;
+
+            // --------------------------------------------------
+
+            GadgetContainerMessage<hoNDArray< ACE_INT16 > > *pixels = new GadgetContainerMessage<hoNDArray< ACE_INT16 > >();
+            boost::shared_ptr< std::vector<size_t> > dims = m2->getObjectPtr()->get_dimensions();
+
+            try {
+                pixels->getObjectPtr()->create(dims.get());
+            }
+            catch (std::bad_alloc&) {
+                GDEBUG("Unable to create short storage in DicomFinishGadget");
+                return GADGET_FAIL;
+            }
+
+            /* create ImageHeader and hoNDArray pointers for better readability */
+            hoNDArray<ACE_INT16>* data = pixels->getObjectPtr();
+
+            /* grab pointers to both the original and new data arrays
+            * The original is of type T
+            * The new is of type ACE_INT16 */
+            T *src = m2->getObjectPtr()->get_data_ptr();
+            ACE_INT16 *dst = data->get_data_ptr();
+
+            /* Convert/cast each element in the data array
+            * and simultaneously find the min/max pixel value, which
+            * will be used later for some crude windowing */
+            T min_pix_val, max_pix_val, sum_pix_val = 0;
+            if (pixels->getObjectPtr()->get_number_of_elements() > 0)
+            {
+                min_pix_val = src[0];
+                max_pix_val = src[0];
+            }
+
+            for (unsigned long i = 0; i < pixels->getObjectPtr()->get_number_of_elements(); i++)
+            {
+                T pix_val = src[i];
+                // search for minimum and maximum pixel values
+                if (pix_val < min_pix_val) min_pix_val = pix_val;
+                if (pix_val > max_pix_val) max_pix_val = pix_val;
+                sum_pix_val += pix_val / 4; // scale by 25% to avoid overflow
+
+                // copy/cast the pixel value to a short int
+                dst[i] = static_cast<ACE_INT16>(pix_val);
+            }
+            T mean_pix_val = (T)((sum_pix_val * 4) / (T)pixels->getObjectPtr()->get_number_of_elements());
+
+            /* replace the old 'message2' with the new data */
+            m1->cont(pixels);
+            /* release the old data array */
+            m2->cont(NULL);
+            m2->release();
+            /* update the image data_type.
+            * There is currently no SIGNED SHORT type so this will have to suffice */
+            m1->getObjectPtr()->data_type = ISMRMRD::ISMRMRD_USHORT;
+
+            unsigned int BUFSIZE = 1024;
+            char *buf = new char[BUFSIZE];
+            OFCondition status;
+            DcmTagKey key;
+            DcmDataset *dataset = dcmFile.getDataset();
+
+            // Echo Number
+            // TODO: it is often the case the img->contrast is not properly set
+            // likely due to the allocated ISMRMRD::ImageHeader being uninitialized
+            key.set(0x0018, 0x0086);
+            ACE_OS::snprintf(buf, BUFSIZE, "%d", m1->getObjectPtr()->contrast);
+            WRITE_DCM_STRING(key, buf);
+
+            // Acquisition Matrix ... Image Dimensions
+            // Defined as: [frequency rows, frequency columns, phase rows, phase columns]
+            // But at this point in the gadget I don't know the frequency encode direction
+            key.set(0x0018, 0x1310);
+            ACE_UINT16 im_dim[4] = { 0, 0, 0, 0 };
+            /* if (frequence_encode_dir == "ROW")) {
+            // frequency encoding direction is ROW
+            im_dim[1] = img->matrix_size[0];
+            im_dim[2] = img->matrix_size[1];
+            } */
+            // frequency encoding direction is COLUMN
+            /*im_dim[0] = img->matrix_size[0];
+            im_dim[3] = img->matrix_size[1];*/
+
+            im_dim[1] = m1->getObjectPtr()->matrix_size[0];
+            im_dim[2] = m1->getObjectPtr()->matrix_size[1];
+
+            status = dataset->putAndInsertUint16Array(key, im_dim, 4);
+            if (!status.good())
+            {
+                GDEBUG("Failed to stuff image dimensions\n");
+                return GADGET_FAIL;
+            }
+
+            // Series Number
+            // Derived from the initial series number and the current image_series_index
+            key.set(0x0020, 0x0011);
+            ACE_OS::snprintf(buf, BUFSIZE, "%ld", this->initialSeriesNumber * 100 + m1->getObjectPtr()->image_series_index);
+            WRITE_DCM_STRING(key, buf);
+
+            // Image Number
+            key.set(0x0020, 0x0013);
+            ACE_OS::snprintf(buf, BUFSIZE, "%d", m1->getObjectPtr()->image_index + 1);
+            WRITE_DCM_STRING(key, buf);
+
+            // Image Position (Patient)
+            float corner[3];
+
+            corner[0] = m1->getObjectPtr()->position[0] -
+                (m1->getObjectPtr()->field_of_view[0] / 2.0f) * m1->getObjectPtr()->read_dir[0] -
+                (m1->getObjectPtr()->field_of_view[1] / 2.0f) * m1->getObjectPtr()->phase_dir[0];
+            corner[1] = m1->getObjectPtr()->position[1] -
+                (m1->getObjectPtr()->field_of_view[0] / 2.0f) * m1->getObjectPtr()->read_dir[1] -
+                (m1->getObjectPtr()->field_of_view[1] / 2.0f) * m1->getObjectPtr()->phase_dir[1];
+            corner[2] = m1->getObjectPtr()->position[2] -
+                (m1->getObjectPtr()->field_of_view[0] / 2.0f) * m1->getObjectPtr()->read_dir[2] -
+                (m1->getObjectPtr()->field_of_view[1] / 2.0f) * m1->getObjectPtr()->phase_dir[2];
+
+            key.set(0x0020, 0x0032);
+            ACE_OS::snprintf(buf, BUFSIZE, "%.4f\\%.4f\\%.4f", corner[0], corner[1], corner[2]);
+            WRITE_DCM_STRING(key, buf);
+
+            // Image Orientation
+            // read_dir, phase_dir, and slice_dir were calculated in
+            // a DICOM/patient coordinate system, so just plug them in
+            key.set(0x0020, 0x0037);
+            ACE_OS::snprintf(buf, BUFSIZE, "%.4f\\%.4f\\%.4f\\%.4f\\%.4f\\%.4f",
+                m1->getObjectPtr()->read_dir[0], m1->getObjectPtr()->read_dir[1], m1->getObjectPtr()->read_dir[2],
+                m1->getObjectPtr()->phase_dir[0], m1->getObjectPtr()->phase_dir[1], m1->getObjectPtr()->phase_dir[2]);
+            WRITE_DCM_STRING(key, buf);
+
+            // Slice Location
+            key.set(0x0020, 0x1041);
+            ACE_OS::snprintf(buf, BUFSIZE, "%f", m1->getObjectPtr()->position[2]);
+            WRITE_DCM_STRING(key, buf);
+
+            // Columns
+            key.set(0x0028, 0x0010);
+            ACE_OS::snprintf(buf, BUFSIZE, "%d", m1->getObjectPtr()->matrix_size[1]);
+            WRITE_DCM_STRING(key, buf);
+
+            // Rows
+            key.set(0x0028, 0x0011);
+            ACE_OS::snprintf(buf, BUFSIZE, "%d", m1->getObjectPtr()->matrix_size[0]);
+            WRITE_DCM_STRING(key, buf);
+
+            // Simple windowing using pixel values calculated earlier...
+            int mid_pix_val = (int)(max_pix_val + min_pix_val) / 2;
+            int window_center = (int)(mid_pix_val + mean_pix_val) / 2;
+            int window_width_left = (int)(window_center - min_pix_val);
+            int window_width_right = (int)(max_pix_val - window_center);
+            int window_width = (window_width_right > window_width_left) ?
+                window_width_right : window_width_left;
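+            // The window is centered midway between the pixel mid-range and the mean value,
+            // and widened to reach whichever of the min/max extremes lies farther from that center.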
+
+            // Window Center
+            key.set(0x0028, 0x1050);
+            ACE_OS::snprintf(buf, BUFSIZE, "%d", window_center);
+            WRITE_DCM_STRING(key, buf);
+
+            // Window Width
+            key.set(0x0028, 0x1051);
+            ACE_OS::snprintf(buf, BUFSIZE, "%d", window_width);
+            WRITE_DCM_STRING(key, buf);
+
+            // ACR_NEMA_2C_VariablePixelDataGroupLength
+            key.set(0x7fe0, 0x0000);
+            status = dataset->insertEmptyElement(key);
+            if (!status.good()) {
+                GDEBUG("Failed to write 0x7fe0 Group Length\n");
+                return GADGET_FAIL;
+            }
+
+            // Pixel Data
+            if ((unsigned long)m1->getObjectPtr()->matrix_size[0] * (unsigned long)m1->getObjectPtr()->matrix_size[1] !=
+                data->get_number_of_elements()) {
+                GDEBUG("Mismatch in image dimensions and available data\n");
+                return GADGET_FAIL;
+            }
+            key.set(0x7fe0, 0x0010);
+            status = dataset->putAndInsertUint16Array(key, (unsigned short *)data->get_data_ptr(), (unsigned long)data->get_number_of_elements());
+            if (!status.good()) {
+                GDEBUG("Failed to stuff Pixel Data\n");
+                return GADGET_FAIL;
+            }
+
+            // Series Instance UID = generated here
+            key.set(0x0020, 0x000E);
+            unsigned short series_number = m1->getObjectPtr()->image_series_index + 1;
+
+            // Try to find an already-generated Series Instance UID in our map
+            std::map<unsigned int, std::string>::iterator it = seriesIUIDs.find(series_number);
+
+            if (it == seriesIUIDs.end()) {
+                // Didn't find a Series Instance UID for this series number
+                char prefix[32];
+                char newuid[96];
+                if (seriesIUIDRoot.length() > 20) {
+                    memcpy(prefix, seriesIUIDRoot.c_str(), 20);
+                    prefix[20] = '\0';
+                    dcmGenerateUniqueIdentifier(newuid, prefix);
+                }
+                else {
+                    dcmGenerateUniqueIdentifier(newuid);
+                }
+                seriesIUIDs[series_number] = std::string(newuid);
+            }
+            WRITE_DCM_STRING(key, seriesIUIDs[series_number].c_str());
+
+            // At a minimum, to put the DICOM image back into the database,
+            // you must change the SOPInstanceUID.
+            key.set(0x0008, 0x0018);        // SOPInstanceUID
+            // Keep the UID root in a named string so the pointer passed to
+            // dcmGenerateUniqueIdentifier remains valid (c_str() of a temporary would dangle).
+            std::string root;
+            if (seriesIUIDRoot.length() > 0) {
+                root = seriesIUIDRoot.substr(0, 20);
+            }
+            else {
+                root = "1.2.840.113619.2.156";
+            }
+            char newuid[65];
+            dcmGenerateUniqueIdentifier(newuid, root.c_str());
+            WRITE_DCM_STRING(key, newuid);
+
+            //// set the private fields to store meta attributes
+            //key.set(0x0051, 0x0000);
+            //status = dataset->insertEmptyElement(key);
+            //if (!status.good())
+            //{
+            //    GDEBUG("Failed to write 0x0051 Group Length\n");
+            //    return GADGET_FAIL;
+            //}
+
+            //key.set(0x0051, 0x0019);
+            //WRITE_DCM_STRING(key, buf+sizeof(size_t_type));
+
+            //delete [] meta_buf;
+
+            /* clean up the char[] we created for ACE_OS::snprintf */
+            delete[] buf;
+
+            GadgetContainerMessage<DcmFileFormat>* mdcm = new GadgetContainerMessage<DcmFileFormat>();
+
+            *mdcm->getObjectPtr() = dcmFile;
+
+            GadgetContainerMessage<GadgetMessageIdentifier>* mb =
+                new GadgetContainerMessage<GadgetMessageIdentifier>();
+
+            mb->getObjectPtr()->id = GADGET_MESSAGE_DICOM_WITHNAME;
+
+            mb->cont(mdcm);
+            mdcm->cont(mfilename);
+
+            if (m3)
+            {
+                mfilename->cont(m3);
+            }
+
+            int ret = this->controller_->output_ready(mb);
+
+            if ((ret < 0))
+            {
+                GDEBUG("Failed to return message to controller\n");
+                return GADGET_FAIL;
+            }
+
+            return GADGET_OK;
+        }
+
+    private:
+        DcmFileFormat dcmFile;
+        std::string seriesIUIDRoot;
+        long initialSeriesNumber;
+        std::map <unsigned int, std::string> seriesIUIDs;
+    };
+
+} /* namespace Gadgetron */
+
+#endif // DICOMFINISHGADGET_H
diff --git a/gadgets/dicom/DicomImageWriter.cpp b/gadgets/dicom/DicomImageWriter.cpp
new file mode 100644
index 0000000..3e154b4
--- /dev/null
+++ b/gadgets/dicom/DicomImageWriter.cpp
@@ -0,0 +1,157 @@
+#include <complex>
+#include <fstream>
+#include <time.h>
+
+// Gadgetron includes
+#include "GadgetIsmrmrdReadWrite.h"
+#include "DicomImageWriter.h"
+#include "GadgetContainerMessage.h"
+#include "hoNDArray.h"
+#include "ismrmrd/meta.h"
+
+// DCMTK includes
+#include "dcmtk/config/osconfig.h"
+#include "dcmtk/ofstd/ofstdinc.h"
+#define INCLUDE_CSTDLIB
+#define INCLUDE_CSTDIO
+#define INCLUDE_CSTRING
+#include "dcmtk/dcmdata/dctk.h"
+#include "dcmtk/dcmdata/dcostrmb.h"
+
+namespace Gadgetron {
+
+int DicomImageWriter::write(ACE_SOCK_Stream* sock, ACE_Message_Block* mb)
+{
+    GadgetContainerMessage<DcmFileFormat>* dcm_file_message = AsContainerMessage<DcmFileFormat>(mb);
+    if (!dcm_file_message)
+    {
+      GERROR("DicomImageWriter::write, invalid image message objects\n");
+      return -1;
+    }
+
+    DcmFileFormat *dcmFile = dcm_file_message->getObjectPtr();
+
+    // Initialize transfer state of DcmDataset
+    dcmFile->transferInit();
+
+    // Calculate size of DcmFileFormat and create a SUFFICIENTLY sized buffer
+    long buffer_length = dcmFile->calcElementLength(EXS_LittleEndianExplicit, EET_ExplicitLength) * 2;
+    std::vector<char> bufferChar(buffer_length);
+    char* buffer = &bufferChar[0];
+
+    DcmOutputBufferStream out_stream(buffer, buffer_length);
+
+    OFCondition status;
+
+    status = dcmFile->write(out_stream, EXS_LittleEndianExplicit, EET_ExplicitLength, NULL);
+    if (!status.good()) {
+      GERROR("Failed to write DcmFileFormat to DcmOutputStream(%s)\n", status.text());
+      return GADGET_FAIL;
+    }
+
+    void *serialized = NULL;
+    offile_off_t serialized_length = 0;
+    out_stream.flushBuffer(serialized, serialized_length);
+
+    // finalize transfer state of DcmDataset
+    dcmFile->transferEnd();
+
+    ssize_t send_cnt = 0;
+
+    GadgetMessageIdentifier id;
+    id.id = GADGET_MESSAGE_DICOM_WITHNAME;
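+
+    // Wire format produced below: GadgetMessageIdentifier, a uint32 byte count, the serialized
+    // DICOM file, then (if present) an unsigned long long filename length + filename, followed by
+    // an unsigned long long meta-attribute length + serialized meta attributes.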
+
+    if ((send_cnt = sock->send_n (&id, sizeof(GadgetMessageIdentifier))) <= 0)
+    {
+      GERROR("Unable to send DICOM message identifier\n");
+      return -1;
+    }
+
+    uint32_t nbytes = (uint32_t)serialized_length;
+    if ((send_cnt = sock->send_n (&nbytes, sizeof(nbytes))) <= 0)
+    {
+      GERROR("Unable to send DICOM bytes length\n");
+      return -1;
+    }
+
+    if ((send_cnt = sock->send_n (serialized, serialized_length)) <= 0)
+    {
+      GERROR("Unable to send DICOM bytes\n");
+      return -1;
+    }
+
+    // check whether the image filename is attached
+    GadgetContainerMessage<std::string>* dcm_filename_message = AsContainerMessage<std::string>(mb->cont());
+    if (dcm_filename_message)
+    {
+        unsigned long long len = dcm_filename_message->getObjectPtr()->length();
+        if ((send_cnt = sock->send_n (&len, sizeof(unsigned long long))) <= 0)
+        {
+          GERROR("Unable to send DICOM filename length\n");
+          return -1;
+        }
+
+        const char* filename = dcm_filename_message->getObjectPtr()->c_str();
+        if ((send_cnt = sock->send_n (filename, len)) <= 0)
+        {
+          GERROR("Unable to send DICOM filename\n");
+          return -1;
+        }
+
+        GadgetContainerMessage<ISMRMRD::MetaContainer>* dcm_meta_message = AsContainerMessage<ISMRMRD::MetaContainer>(dcm_filename_message->cont());
+
+        char* buf = NULL;
+        len = 0;
+
+        if (dcm_meta_message)
+        {
+            try
+            {
+                std::stringstream str;
+                ISMRMRD::serialize( *dcm_meta_message->getObjectPtr(), str);
+                std::string attribContent = str.str();
+                len = attribContent.length();
+
+                buf = new char[len];
+                GADGET_CHECK_THROW(buf != NULL);
+
+                memcpy(buf, attribContent.c_str(), len);
+            }
+            catch(...)
+            {
+              GERROR("Unable to serialize dicom image meta attributes \n");
+              return -1;
+            }
+
+            if ((send_cnt = sock->send_n(&len, sizeof(unsigned long long))) <= 0)
+            {
+                GERROR("Unable to send dicom image meta attributes length\n");
+                if (buf != NULL) delete[] buf;
+                return -1;
+            }
+
+            if ( (send_cnt = sock->send_n (buf, len)) <= 0 )
+            {
+              GERROR("Unable to send dicom image meta attributes\n");
+              if ( buf != NULL ) delete [] buf;
+              return -1;
+            }
+
+            if ( buf != NULL ) delete [] buf;
+        }
+        else
+        {
+            if ((send_cnt = sock->send_n(&len, sizeof(unsigned long long))) <= 0)
+            {
+                GERROR("Unable to send dicom image meta attributes length\n");
+                return -1;
+            }
+        }
+    }
+
+    return 0;
+}
+
+GADGETRON_WRITER_FACTORY_DECLARE(DicomImageWriter)
+
+} /* namespace Gadgetron */
diff --git a/gadgets/dicom/DicomImageWriter.h b/gadgets/dicom/DicomImageWriter.h
new file mode 100644
index 0000000..e912e7f
--- /dev/null
+++ b/gadgets/dicom/DicomImageWriter.h
@@ -0,0 +1,22 @@
+#ifndef DICOMIMAGEWRITER_H
+#define DICOMIMAGEWRITER_H
+
+#include "gadgetron_dicom_export.h"
+#include "GadgetMessageInterface.h"
+#include "GadgetMRIHeaders.h"
+#include "ismrmrd/ismrmrd.h"
+
+
+namespace Gadgetron {
+
+class EXPORTGADGETSDICOM DicomImageWriter : public GadgetMessageWriter
+{
+ public:
+  virtual int write(ACE_SOCK_Stream* sock, ACE_Message_Block* mb);
+
+  GADGETRON_WRITER_DECLARE(DicomImageWriter);
+};
+
+} /* namespace Gadgetron */
+
+#endif
diff --git a/gadgets/dicom/dicom.xml b/gadgets/dicom/dicom.xml
new file mode 100644
index 0000000..d78a647
--- /dev/null
+++ b/gadgets/dicom/dicom.xml
@@ -0,0 +1,90 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+    xmlns="http://gadgetron.sf.net/gadgetron"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <writer>
+        <slot>1018</slot>
+        <dll>gadgetron_dicom</dll>
+        <classname>DicomImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+        <property>
+            <name>constant_noise_variance</name>
+            <value>false</value>
+        </property>
+    </gadget>
+
+    <gadget>
+        <name>AccTrig</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AcquisitionAccumulateTriggerGadget</classname>
+        <property>
+            <name>trigger_dimension</name>
+            <value>repetition</value>
+        </property>
+        <property>
+          <name>sorting_dimension</name>
+          <value>slice</value>
+        </property>
+    </gadget>
+
+    <gadget>
+        <name>Buff</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>BucketToBufferGadget</classname>
+        <property>
+            <name>N_dimension</name>
+            <value></value>
+        </property>
+        <property>
+          <name>S_dimension</name>
+          <value></value>
+        </property>
+        <property>
+          <name>split_slices</name>
+          <value>true</value>
+        </property>
+    </gadget>
+
+    <gadget>
+        <name>FFT</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FFTGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>Combine</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>CombineGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>Extract</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ExtractGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>AutoScale</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AutoScaleGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>DicomFinish</name>
+        <dll>gadgetron_dicom</dll>
+        <classname>DicomFinishGadget</classname>
+    </gadget>
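+
+    <!-- Reconstruction chain configured above: RemoveROOversampling -> AccTrig -> Buff -> FFT
+         -> Combine -> Extract -> AutoScale -> DicomFinish, with results sent via DicomImageWriter -->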
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/dicom/gadgetron_dicom_export.h b/gadgets/dicom/gadgetron_dicom_export.h
new file mode 100644
index 0000000..4f4a544
--- /dev/null
+++ b/gadgets/dicom/gadgetron_dicom_export.h
@@ -0,0 +1,15 @@
+#ifndef DICOM_EXPORT_H_
+#define DICOM_EXPORT_H_
+
+
+#if defined (WIN32)
+#if defined (gadgetron_dicom_EXPORTS)
+#define EXPORTGADGETSDICOM __declspec(dllexport)
+#else
+#define EXPORTGADGETSDICOM __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETSDICOM
+#endif
+
+#endif /* DICOM_EXPORT_H_ */
diff --git a/gadgets/epi/CMakeLists.txt b/gadgets/epi/CMakeLists.txt
new file mode 100644
index 0000000..0c95309
--- /dev/null
+++ b/gadgets/epi/CMakeLists.txt
@@ -0,0 +1,50 @@
+IF (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_EPI__)
+ENDIF (WIN32)
+
+find_package(Ismrmrd REQUIRED)
+
+include_directories(
+  ${CMAKE_SOURCE_DIR}/gadgets/mri_core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/fft/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+  ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools
+  ${CMAKE_SOURCE_DIR}/toolboxes/mri/epi
+  ${ARMADILLO_INCLUDE_DIRS}
+)
+
+add_library(gadgetron_epi SHARED 
+  EPIReconXGadget.h EPIReconXGadget.cpp
+  EPICorrGadget.h EPICorrGadget.cpp
+  FFTXGadget.h FFTXGadget.cpp
+  CutXGadget.h CutXGadget.cpp
+  epi.xml
+  epi_gtplus_grappa.xml
+)
+
+set_target_properties(gadgetron_epi PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(
+  gadgetron_epi
+  gadgetron_gadgetbase
+  gadgetron_toolbox_cpucore
+  gadgetron_toolbox_cpufft
+  gadgetron_toolbox_cpucore_math
+  gadgetron_toolbox_log
+  optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} 
+)
+
+install(FILES 
+  EPIReconXGadget.h
+  EPICorrGadget.h
+  FFTXGadget.h
+  gadgetron_epi_export.h
+  DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
+install(TARGETS gadgetron_epi DESTINATION lib COMPONENT main)
+
+install(FILES
+  epi.xml
+  epi_gtplus_grappa.xml
+  DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH} COMPONENT main)
diff --git a/gadgets/epi/CutXGadget.cpp b/gadgets/epi/CutXGadget.cpp
new file mode 100644
index 0000000..4bab98d
--- /dev/null
+++ b/gadgets/epi/CutXGadget.cpp
@@ -0,0 +1,91 @@
+#include "CutXGadget.h"
+#include "hoNDFFT.h"
+#include "hoNDArray_utils.h"
+#include "ismrmrd/xml.h"
+
+namespace Gadgetron{
+
+    CutXGadget::CutXGadget() {}
+    CutXGadget::~CutXGadget() {}
+
+    int CutXGadget::process_config(ACE_Message_Block* mb)
+    {
+      ISMRMRD::IsmrmrdHeader h;
+      ISMRMRD::deserialize(mb->rd_ptr(),h);
+      
+      
+      if (h.encoding.size() == 0) {
+	GDEBUG("Number of encoding spaces: %d\n", h.encoding.size());
+	GDEBUG("This Gadget needs an encoding description\n");
+	return GADGET_FAIL;
+      }
+      
+      // Get the encoding space and trajectory description
+      ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+      ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+      ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+      ISMRMRD::TrajectoryDescription traj_desc;
+      
+      // Primary encoding space is for EPI
+      encodeNx_  = e_space.matrixSize.x;
+      encodeFOV_ = e_space.fieldOfView_mm.x;
+      reconNx_   = r_space.matrixSize.x;
+      reconFOV_  = r_space.fieldOfView_mm.x;
+      
+      cutNx_ = encodeNx_;
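+
+      // cutNx_ defaults to the encoded matrix size, so process() only trims lines whose
+      // readout is longer than the encoded size (e.g. oversampled readouts).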
+
+      return 0;
+    }
+
+    int CutXGadget::process( GadgetContainerMessage< ISMRMRD::AcquisitionHeader>* m1,
+        GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+    {
+        try
+        {
+            // cut the central half from the kspace line
+            if ( m1->getObjectPtr()->number_of_samples > cutNx_ )
+            {
+                size_t RO = m1->getObjectPtr()->number_of_samples;
+
+                uint16_t startX = m1->getObjectPtr()->center_sample - cutNx_/2;
+                uint16_t endX = startX + cutNx_ - 1;
+
+                float ratio = RO / (float)cutNx_;
+                m1->getObjectPtr()->number_of_samples = cutNx_;
+                m1->getObjectPtr()->center_sample = (uint16_t)(m1->getObjectPtr()->center_sample / ratio );
+
+                GadgetContainerMessage< hoNDArray< std::complex<float> > >* m3 = new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+
+                std::vector<size_t> dim(2);
+                dim[0] = cutNx_;
+                dim[1] = m2->getObjectPtr()->get_size(1);
+
+                m3->getObjectPtr()->create(&dim);
+
+                size_t cha;
+                for ( cha=0; cha<dim[1]; cha++ )
+                {
+                    memcpy(m3->getObjectPtr()->begin()+cha*cutNx_, 
+                            m2->getObjectPtr()->begin()+cha*RO+startX, 
+                            sizeof( std::complex<float> )*cutNx_);
+                }
+
+                m1->cont(m3);
+                m2->release();
+            }
+
+            if (this->next()->putq(m1) < 0)
+            {
+                return GADGET_FAIL;
+            }
+        }
+        catch(...)
+        {
+            return GADGET_FAIL;
+        }
+
+        return GADGET_OK;
+    }
+
+    GADGET_FACTORY_DECLARE(CutXGadget)
+}
diff --git a/gadgets/epi/CutXGadget.h b/gadgets/epi/CutXGadget.h
new file mode 100644
index 0000000..37a59eb
--- /dev/null
+++ b/gadgets/epi/CutXGadget.h
@@ -0,0 +1,33 @@
+#ifndef CutXGADGET_H
+#define CutXGADGET_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_epi_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+  class   EXPORTGADGETS_EPI CutXGadget : 
+  public Gadget2<ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+  {
+    public:
+      CutXGadget();
+      virtual ~CutXGadget();
+
+    protected:
+      virtual int process_config(ACE_Message_Block* mb);
+      virtual int process( GadgetContainerMessage< ISMRMRD::AcquisitionHeader>* m1,
+                       GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+
+      size_t encodeNx_;
+      float encodeFOV_;
+      size_t reconNx_;
+      float reconFOV_;
+
+      size_t cutNx_;
+  };
+}
+#endif //CutXGADGET_H
diff --git a/gadgets/epi/EPICorrGadget.cpp b/gadgets/epi/EPICorrGadget.cpp
new file mode 100644
index 0000000..4b0c1f5
--- /dev/null
+++ b/gadgets/epi/EPICorrGadget.cpp
@@ -0,0 +1,192 @@
+#include "EPICorrGadget.h"
+#include "ismrmrd/xml.h"
+
+namespace Gadgetron{
+
+  EPICorrGadget::EPICorrGadget() {}
+  EPICorrGadget::~EPICorrGadget() {}
+
+int EPICorrGadget::process_config(ACE_Message_Block* mb)
+{
+  ISMRMRD::IsmrmrdHeader h;
+  ISMRMRD::deserialize(mb->rd_ptr(),h);
+
+  if (h.encoding.size() == 0) {
+    GDEBUG("Number of encoding spaces: %d\n", h.encoding.size());
+    GDEBUG("This Gadget needs an encoding description\n");
+    return GADGET_FAIL;
+  }
+
+  // Get the encoding space and trajectory description
+  ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+  ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+  ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+  ISMRMRD::TrajectoryDescription traj_desc;
+
+  if (h.encoding[0].trajectoryDescription) {
+    traj_desc = *h.encoding[0].trajectoryDescription;
+  } else {
+    GDEBUG("Trajectory description missing");
+    return GADGET_FAIL;
+  }
+
+  if (traj_desc.identifier != "ConventionalEPI") {
+    GDEBUG("Expected trajectory description identifier 'ConventionalEPI', not found.");
+    return GADGET_FAIL;
+  }
+
+
+  for (std::vector<ISMRMRD::UserParameterLong>::iterator i (traj_desc.userParameterLong.begin()); i != traj_desc.userParameterLong.end(); ++i) {
+    if (i->name == "numberOfNavigators") {
+      numNavigators_ = i->value;
+    } else if (i->name == "etl") {
+      etl_ = i->value;
+    }
+  }
+
+  verboseMode_ = verboseMode.value();
+
+  corrComputed_ = false;
+  navNumber_ = -1;
+  epiEchoNumber_ = -1;
+
+  return 0;
+}
+
+int EPICorrGadget::process(
+          GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+      GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+
+  //GDEBUG_STREAM("Nav: " << navNumber_ << "    " << "Echo: " << epiEchoNumber_ << std::endl);
+
+  // Get a reference to the acquisition header
+  ISMRMRD::AcquisitionHeader &hdr = *m1->getObjectPtr();
+
+  // Pass on the non-EPI data (e.g. FLASH Calibration)
+  if (hdr.encoding_space_ref > 0) {
+    // It is enough to put the first one, since they are linked
+    if (this->next()->putq(m1) == -1) {
+      m1->release();
+      GERROR("EPICorrGadget::process, passing data on to next gadget");
+      return -1;
+    }
+    return 0;
+  }
+
+  // We have data from encoding space 0.
+
+  // Make an armadillo matrix of the data
+  arma::cx_fmat adata = as_arma_matrix(m2->getObjectPtr());
+
+  // Check to see if the data is a navigator line or an imaging line
+  if (hdr.isFlagSet(ISMRMRD::ISMRMRD_ACQ_IS_PHASECORR_DATA)) {
+
+    // Increment the navigator counter
+    navNumber_ += 1;
+
+    // If the number of navigators per shot is exceeded, then
+    // we are at the beginning of the next shot
+    if (navNumber_ == numNavigators_) {
+      corrComputed_ = false;
+      navNumber_ = 0;
+      epiEchoNumber_ = -1;
+    }
+    
+    // If we are at the beginning of a shot, then initialize
+    if (navNumber_==0) {
+      // Set the size of the corrections and storage arrays
+      corrpos_.set_size( adata.n_rows);
+      corrneg_.set_size( adata.n_rows );
+      navdata_.set_size( adata.n_rows, hdr.active_channels, numNavigators_);
+      // Store the first navigator's polarity
+      startNegative_ = hdr.isFlagSet(ISMRMRD::ISMRMRD_ACQ_IS_REVERSE);
+    }
+
+    // Store the navigator data
+    navdata_.slice(navNumber_) = adata;
+
+    // If this is the last of the navigators for this shot, then
+    // compute the correction operator
+    if (navNumber_ == (numNavigators_-1)) {
+      arma::cx_fvec ctemp =  arma::zeros<arma::cx_fvec>(adata.n_rows);    // temp column complex
+      arma::fvec tvec = arma::zeros<arma::fvec>(adata.n_rows);            // temp column real
+      arma::fvec x = arma::linspace<arma::fvec>(-0.5, 0.5, adata.n_rows); // Evenly spaced x-space locations
+      int p; // counter
+      
+      // Accumulate over navigator triplets and sum over coils
+      // this is the average phase difference between odd and even navigators
+      for (p=0; p<numNavigators_-2; p=p+2) {
+        ctemp += arma::sum(arma::conj(navdata_.slice(p)+navdata_.slice(p+2)) % navdata_.slice(p+1),1);
+      }
+      
+      // TODO: Add a configuration toggle to switch between correction types
+
+      // Point-wise phase estimate
+      //for (p=0; p<adata.n_rows; p++) {
+      //  tvec[p] = std::arg(ctemp[p]);
+      //}
+
+      // Robust fit to a straight line
+      float slope = ctemp.n_rows * std::arg(arma::cdot(ctemp.rows(0,ctemp.n_rows-2), ctemp.rows(1,ctemp.n_rows-1)));
+      ctemp = ctemp % arma::exp(arma::cx_fvec(arma::zeros<arma::fvec>(x.n_rows), -slope*x));
+      float intercept = std::arg(arma::sum(ctemp));
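+      // The slope is estimated from the (magnitude-weighted) mean phase increment between
+      // neighbouring samples, scaled to the full readout extent, and the intercept from the
+      // residual mean phase after removing that slope.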
+      //GDEBUG_STREAM("Slope = " << slope << std::endl);
+      //GDEBUG_STREAM("Intercept = " << intercept << std::endl);
+      tvec = slope*x + intercept;
+      
+      // Odd and even phase corrections
+      if (!startNegative_) {
+        // if the first navigator is a positive readout, we need to flip the sign of our correction
+        tvec = -1.0*tvec;
+      }
+      corrpos_ = arma::exp(arma::cx_fvec(arma::zeros<arma::fvec>(x.n_rows), -0.5*tvec));
+      corrneg_ = arma::exp(arma::cx_fvec(arma::zeros<arma::fvec>(x.n_rows), +0.5*tvec));
+      corrComputed_ = true;
+    }
+
+  }
+  else {
+    // Increment the echo number
+    epiEchoNumber_ += 1;
+    // TODO: use this to apply the B0 correction
+
+    // Apply the correction
+    // We use the armadillo notation that loops over all the columns
+    if (hdr.isFlagSet(ISMRMRD::ISMRMRD_ACQ_IS_REVERSE)) {
+      // Negative readout
+      for (int p=0; p<adata.n_cols; p++) {
+    adata.col(p) %= corrneg_;
+      }
+      // Now that we have corrected we set the readout direction to positive
+      hdr.clearFlag(ISMRMRD::ISMRMRD_ACQ_IS_REVERSE);
+    } 
+    else {
+      // Positive readout
+      for (int p=0; p<adata.n_cols; p++) {
+    adata.col(p) %= corrpos_;
+      }
+    }
+  }
+
+  // Pass on the imaging data
+  // TODO: this should be controlled by a flag
+  if (hdr.isFlagSet(ISMRMRD::ISMRMRD_ACQ_IS_PHASECORR_DATA)) {
+    m1->release();
+  } 
+  else {
+    // It is enough to put the first one, since they are linked
+    if (this->next()->putq(m1) == -1) {
+      m1->release();
+      GERROR("EPICorrGadget::process, passing data on to next gadget");
+      return -1;
+    }
+  }
+
+  return 0;
+}
+
+GADGET_FACTORY_DECLARE(EPICorrGadget)
+}
+
+
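The navigator correction above fits the coil-summed odd/even phase difference to a straight line phi(x) = slope*x + intercept across the readout, then applies exp(-i*phi/2) to positive readouts and exp(+i*phi/2) to negative ones (with the sign flipped when the first navigator is a positive readout). A minimal standalone sketch of that fit, assuming only Armadillo; the synthetic vector stands in for the accumulated navigator cross-correlation and all values are illustrative:

    #include <armadillo>
    #include <complex>
    #include <iostream>

    int main()
    {
      const int nx = 128;
      arma::fvec x = arma::linspace<arma::fvec>(-0.5, 0.5, nx);

      // Synthetic odd/even cross-correlation with a known linear phase
      // (illustrative values only).
      float true_slope = 2.0f, true_intercept = 0.3f;
      arma::cx_fvec ctemp = arma::exp(arma::cx_fvec(arma::zeros<arma::fvec>(nx),
                                                    true_slope * x + true_intercept));

      // Slope from the phase of the lag-1 autocorrelation, scaled by the length.
      float slope = ctemp.n_rows *
          std::arg(arma::cdot(ctemp.rows(0, ctemp.n_rows - 2),
                              ctemp.rows(1, ctemp.n_rows - 1)));

      // Remove the slope, then take the phase of the sum as the intercept.
      arma::cx_fvec flat = ctemp % arma::exp(arma::cx_fvec(arma::zeros<arma::fvec>(nx),
                                                           -slope * x));
      float intercept = std::arg(arma::sum(flat));

      // Odd/even corrections: half the fitted phase, with opposite signs.
      arma::fvec tvec = slope * x + intercept;
      arma::cx_fvec corrpos = arma::exp(arma::cx_fvec(arma::zeros<arma::fvec>(nx), -0.5f * tvec));
      arma::cx_fvec corrneg = arma::exp(arma::cx_fvec(arma::zeros<arma::fvec>(nx), +0.5f * tvec));

      std::cout << "slope = " << slope << ", intercept = " << intercept
                << ", corrpos(0) = " << corrpos(0) << ", corrneg(0) = " << corrneg(0) << std::endl;
      return 0;
    }

The slope estimate takes the phase of the lag-1 autocorrelation scaled by the vector length, so it recovers the true slope only up to a factor n/(n-1); for typical readout lengths the difference is negligible.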
diff --git a/gadgets/epi/EPICorrGadget.h b/gadgets/epi/EPICorrGadget.h
new file mode 100644
index 0000000..70321a1
--- /dev/null
+++ b/gadgets/epi/EPICorrGadget.h
@@ -0,0 +1,50 @@
+#ifndef EPICORRGADGET_H
+#define EPICORRGADGET_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "hoArmadillo.h"
+#include "gadgetron_epi_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+#define _USE_MATH_DEFINES
+#include <math.h>
+
+namespace Gadgetron{
+
+  class  EXPORTGADGETS_EPI EPICorrGadget :
+  public Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > >
+    {
+    public:
+      EPICorrGadget();
+      virtual ~EPICorrGadget();
+
+    protected:
+      GADGET_PROPERTY(verboseMode, bool, "Verbose output", false);
+
+      virtual int process_config(ACE_Message_Block* mb);
+      virtual int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+              GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+
+      // in verbose mode, more info is printed out
+      bool verboseMode_;
+
+      arma::cx_fvec corrpos_;
+      arma::cx_fvec corrneg_;  
+      arma::cx_fcube navdata_;
+
+      // epi parameters
+      int numNavigators_;
+      int etl_;
+
+      // for a given shot
+      bool corrComputed_;
+      int navNumber_;
+      int epiEchoNumber_;
+      bool startNegative_;
+
+    };
+}
+#endif //EPICORRGADGET_H
diff --git a/gadgets/epi/EPIReconXGadget.cpp b/gadgets/epi/EPIReconXGadget.cpp
new file mode 100644
index 0000000..7b3da82
--- /dev/null
+++ b/gadgets/epi/EPIReconXGadget.cpp
@@ -0,0 +1,131 @@
+#include "EPIReconXGadget.h"
+#include "ismrmrd/xml.h"
+
+namespace Gadgetron{
+
+  EPIReconXGadget::EPIReconXGadget() {}
+  EPIReconXGadget::~EPIReconXGadget() {}
+
+int EPIReconXGadget::process_config(ACE_Message_Block* mb)
+{
+  ISMRMRD::IsmrmrdHeader h;
+  ISMRMRD::deserialize(mb->rd_ptr(),h);
+  
+  
+  verboseMode_ = verboseMode.value();
+
+  if (h.encoding.size() == 0) {
+    GDEBUG("Number of encoding spaces: %d\n", h.encoding.size());
+    GDEBUG("This Gadget needs an encoding description\n");
+    return GADGET_FAIL;
+  }
+
+  GDEBUG("Number of encoding spaces = %d\n", h.encoding.size());
+
+  // Get the encoding space and trajectory description
+  ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+  ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+  ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+  ISMRMRD::TrajectoryDescription traj_desc;
+
+  if (h.encoding[0].trajectoryDescription) {
+    traj_desc = *h.encoding[0].trajectoryDescription;
+  } else {
+    GDEBUG("Trajectory description missing");
+    return GADGET_FAIL;
+  }
+
+  if (traj_desc.identifier != "ConventionalEPI") {
+    GDEBUG("Expected trajectory description identifier 'ConventionalEPI', not found.");
+    return GADGET_FAIL;
+  }
+
+  // Primary encoding space is for EPI
+  reconx.encodeNx_  = e_space.matrixSize.x;
+  reconx.encodeFOV_ = e_space.fieldOfView_mm.x;
+  reconx.reconNx_   = r_space.matrixSize.x;
+  reconx.reconFOV_  = r_space.fieldOfView_mm.x;
+  
+  // TODO: we need a flag that says it's a balanced readout.
+  for (std::vector<ISMRMRD::UserParameterLong>::iterator i (traj_desc.userParameterLong.begin()); i != traj_desc.userParameterLong.end(); ++i) {
+    if (i->name == "rampUpTime") {
+      reconx.rampUpTime_ = i->value;
+    } else if (i->name == "rampDownTime") {
+      reconx.rampDownTime_ = i->value;
+    } else if (i->name == "flatTopTime") {
+      reconx.flatTopTime_ = i->value;
+    } else if (i->name == "acqDelayTime") {
+      reconx.acqDelayTime_ = i->value;
+    } else if (i->name == "numSamples") {
+      reconx.numSamples_ = i->value;
+    }
+  }
+
+  for (std::vector<ISMRMRD::UserParameterDouble>::iterator i (traj_desc.userParameterDouble.begin()); i != traj_desc.userParameterDouble.end(); ++i) {
+    if (i->name == "dwellTime") {
+      reconx.dwellTime_ = i->value;
+    }
+  }
+
+  // If the flat top time is not set in the header, then we assume that rampSampling is off
+  // and we set the flat top time from the number of samples and the dwell time.
+  if (reconx.flatTopTime_ == 0) {
+      reconx.flatTopTime_ = reconx.dwellTime_ * reconx.numSamples_;
+  }
+
+  // Compute the trajectory
+  reconx.computeTrajectory();
+
+  // Second encoding space is an even readout for PAT REF e.g. FLASH
+  if ( h.encoding.size() > 1 ) {
+    ISMRMRD::EncodingSpace e_space2 = h.encoding[1].encodedSpace;
+    ISMRMRD::EncodingSpace r_space2 = h.encoding[1].reconSpace;
+    reconx_other.encodeNx_  = r_space2.matrixSize.x;
+    reconx_other.encodeFOV_ = r_space2.fieldOfView_mm.x;
+    reconx_other.reconNx_   = r_space2.matrixSize.x;
+    reconx_other.reconFOV_  = r_space2.fieldOfView_mm.x;
+    reconx_other.numSamples_ = e_space2.matrixSize.x;
+    reconx_other.dwellTime_ = 1.0;
+    reconx_other.computeTrajectory();
+  }
+
+  return 0;
+}
+
+int EPIReconXGadget::process(
+          GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+      GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+
+  ISMRMRD::AcquisitionHeader hdr_in = *(m1->getObjectPtr());
+  ISMRMRD::AcquisitionHeader hdr_out;
+  hoNDArray<std::complex<float> > data_out;
+
+  data_out.create(reconx.reconNx_, m2->getObjectPtr()->get_size(1));
+
+  // Switch the reconstruction based on the encoding space (e.g. for FLASH Calibration)
+  if (hdr_in.encoding_space_ref == 0) {
+    reconx.apply(*m1->getObjectPtr(), *m2->getObjectPtr(), hdr_out, data_out);
+  }
+  else {
+    reconx_other.apply(*m1->getObjectPtr(), *m2->getObjectPtr(), hdr_out, data_out);
+  }
+
+  // Replace the contents of m1 with the new header and the contents of m2 with the new data
+  *m1->getObjectPtr() = hdr_out;
+  *m2->getObjectPtr() = data_out;
+
+  // It is enough to put the first one, since they are linked
+  if (this->next()->putq(m1) == -1) {
+    m1->release();
+    GERROR("EPIReconXGadget::process, passing data on to next gadget");
+    return -1;
+  }
+
+  return 0;
+}
+
+GADGET_FACTORY_DECLARE(EPIReconXGadget)
+}
+
+
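The trapezoid readout handled above is described entirely by the trajectory user parameters (rampUpTime, rampDownTime, flatTopTime, acqDelayTime, numSamples, dwellTime); when the header leaves flatTopTime at zero the gadget assumes ramp sampling is off and derives the flat top from the dwell time and sample count. A small illustration with assumed numbers (the units are whatever the sequence wrote into the header, commonly microseconds):

    #include <iostream>

    int main()
    {
      // Illustrative values only; not taken from any particular protocol.
      float dwellTime   = 2.5f;
      int   numSamples  = 128;
      float flatTopTime = 0.0f;          // header did not provide it

      if (flatTopTime == 0) {
        // No ramp sampling: the whole readout sits on the flat top.
        flatTopTime = dwellTime * numSamples;
      }
      std::cout << "flatTopTime = " << flatTopTime << std::endl;   // prints 320
      return 0;
    }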
diff --git a/gadgets/epi/EPIReconXGadget.h b/gadgets/epi/EPIReconXGadget.h
new file mode 100644
index 0000000..bada42a
--- /dev/null
+++ b/gadgets/epi/EPIReconXGadget.h
@@ -0,0 +1,40 @@
+#ifndef EPIRECONXGADGET_H
+#define EPIRECONXGADGET_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_epi_export.h"
+#include "hoArmadillo.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+#include "EPIReconXObjectFlat.h"
+#include "EPIReconXObjectTrapezoid.h"
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_EPI EPIReconXGadget : 
+  public Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > >
+    {
+    public:
+      EPIReconXGadget();
+      virtual ~EPIReconXGadget();
+      
+    protected:
+      GADGET_PROPERTY(verboseMode, bool, "Verbose output", false);
+
+      virtual int process_config(ACE_Message_Block* mb);
+      virtual int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+			  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+
+      // in verbose mode, more info is printed out
+      bool verboseMode_;
+
+      // A set of reconstruction objects
+      EPI::EPIReconXObjectTrapezoid<std::complex<float> > reconx;
+      EPI::EPIReconXObjectFlat<std::complex<float> > reconx_other;
+
+    };
+}
+#endif //EPIRECONXGADGET_H
diff --git a/gadgets/epi/FFTXGadget.cpp b/gadgets/epi/FFTXGadget.cpp
new file mode 100644
index 0000000..b66ac68
--- /dev/null
+++ b/gadgets/epi/FFTXGadget.cpp
@@ -0,0 +1,27 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "FFTXGadget.h"
+#include "hoNDFFT.h"
+#include "hoNDArray_utils.h"
+
+namespace Gadgetron{
+
+  FFTXGadget::FFTXGadget() {}
+  FFTXGadget::~FFTXGadget() {}
+
+  int FFTXGadget::process( GadgetContainerMessage< ISMRMRD::AcquisitionHeader>* m1,
+                GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+  {
+
+    // FFT along 1st dimensions (x)
+    hoNDFFT<float>::instance()->fft1c( *m2->getObjectPtr() );
+
+    if (this->next()->putq(m1) < 0)
+    {
+      return GADGET_FAIL;
+    }
+    
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(FFTXGadget)
+}
diff --git a/gadgets/epi/FFTXGadget.h b/gadgets/epi/FFTXGadget.h
new file mode 100644
index 0000000..2c8ca98
--- /dev/null
+++ b/gadgets/epi/FFTXGadget.h
@@ -0,0 +1,25 @@
+#ifndef FFTXGADGET_H
+#define FFTXGADGET_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_epi_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+  class   EXPORTGADGETS_EPI FFTXGadget : 
+  public Gadget2<ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+  {
+    public:
+      FFTXGadget();
+      virtual ~FFTXGadget();
+
+    protected:
+      virtual int process( GadgetContainerMessage< ISMRMRD::AcquisitionHeader>* m1,
+                       GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+  };
+}
+#endif //FFTXGADGET_H
diff --git a/gadgets/epi/epi.xml b/gadgets/epi/epi.xml
new file mode 100644
index 0000000..9cbccbe
--- /dev/null
+++ b/gadgets/epi/epi.xml
@@ -0,0 +1,124 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+        
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+  
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ReconX</name>
+      <dll>gadgetron_epi</dll>
+      <classname>EPIReconXGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>EPICorr</name>
+      <dll>gadgetron_epi</dll>
+      <classname>EPICorrGadget</classname>
+    </gadget>
+
+    <!-- FFT in X back to k -->
+    <gadget>
+        <name>FFTX</name>
+        <dll>gadgetron_epi</dll>
+        <classname>FFTXGadget</classname>
+    </gadget>
+
+<!--
+    <gadget>
+      <name>IsmrmrdDump</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>IsmrmrdDumpGadget</classname>
+      <property><name>file_prefix</name><value>ISMRMRD_DUMP</value></property>
+      <property><name>append_timestamp</name><value>1</value></property>
+    </gadget>
+-->
+
+    <gadget>
+        <name>AccTrig</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AcquisitionAccumulateTriggerGadget</classname>
+        <property>
+            <name>trigger_dimension</name>
+            <value>repetition</value>
+        </property>
+        <property>
+          <name>sorting_dimension</name>
+          <value>slice</value>
+        </property>
+    </gadget>
+
+    <gadget>
+        <name>Buff</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>BucketToBufferGadget</classname>
+        <property>
+            <name>N_dimension</name>
+            <value></value>
+        </property>
+        <property>
+          <name>S_dimension</name>
+          <value></value>
+        </property>
+        <property>
+          <name>split_slices</name>
+          <value>true</value>
+        </property>
+        <property>
+          <name>ignore_segment</name>
+          <value>true</value>
+        </property>
+    </gadget>
+
+    <gadget>
+      <name>FFT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FFTGadget</classname>
+    </gadget>
+    
+    <gadget>
+      <name>Combine</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CombineGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>  
+
+   <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+ 
+     <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+</gadgetronStreamConfiguration>
diff --git a/gadgets/epi/epi_gtplus_grappa.xml b/gadgets/epi/epi_gtplus_grappa.xml
new file mode 100644
index 0000000..8bd9450
--- /dev/null
+++ b/gadgets/epi/epi_gtplus_grappa.xml
@@ -0,0 +1,483 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!-- 
+        GT Plus configuration file for general 2D epi reconstruction
+
+        Author: Souheil Inati
+        Email: souheil.inati at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- Recon in X -->
+    <gadget>
+        <name>ReconX</name>
+        <dll>gadgetron_epi</dll>
+        <classname>EPIReconXGadget</classname>
+    </gadget>
+
+    <!-- EPI Corr -->
+    <gadget>
+        <name>EPICorr</name>
+        <dll>gadgetron_epi</dll>
+        <classname>EPICorrGadget</classname>
+    </gadget>
+
+    <!-- FFT in X back to k -->
+    <gadget>
+        <name>FFTX</name>
+        <dll>gadgetron_epi</dll>
+        <classname>FFTXGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!--Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Contrast</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_2DT.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>32</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>10240</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>2</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation 
+             The cloud should be defined as the following: CloudNodeX_IP/Port/XMLConfiguration etc.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GtProg_2DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/epi/gadgetron_epi_export.h b/gadgets/epi/gadgetron_epi_export.h
new file mode 100644
index 0000000..b7b3b8e
--- /dev/null
+++ b/gadgets/epi/gadgetron_epi_export.h
@@ -0,0 +1,14 @@
+#ifndef GADGETRON_EPI_EXPORT_H_
+#define GADGETRON_EPI_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_EPI__)
+#define EXPORTGADGETS_EPI __declspec(dllexport)
+#else
+#define EXPORTGADGETS_EPI __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETS_EPI
+#endif
+
+#endif /* GADGETRON_EPI_EXPORT_H_ */
diff --git a/gadgets/gpu/CMakeLists.txt b/gadgets/gpu/CMakeLists.txt
new file mode 100644
index 0000000..030e53a
--- /dev/null
+++ b/gadgets/gpu/CMakeLists.txt
@@ -0,0 +1,42 @@
+IF (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_GPUGADGET__)
+ENDIF (WIN32)
+
+find_package(Ismrmrd REQUIRED)
+
+include_directories(
+  ${CMAKE_SOURCE_DIR}/gadgets/mri_core
+  ${CMAKE_SOURCE_DIR}/gadgets/pmri
+  ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/fft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators/gpu
+  ${ARMADILLO_INCLUDE_DIRS}
+  )
+
+include_directories(${CUDA_INCLUDE_DIRS})
+
+add_library(gadgetron_gpugadget SHARED 
+ cuFFTGadget.h cuFFTGadget.cpp) 
+
+set_target_properties(gadgetron_gpugadget PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(gadgetron_gpugadget
+  gadgetron_gadgetbase
+  gadgetron_toolbox_gpunfft gadgetron_toolbox_gpusolvers gadgetron_toolbox_gpuoperators gadgetron_toolbox_cpucore gadgetron_toolbox_cpucore_math gadgetron_toolbox_gpucore
+  ${ISMRMRD_LIBRARIES} ${FFTW3_LIBRARIES} ${CUDA_LIBRARIES}
+  optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY}
+  )
+
+
+install (FILES  cuFFTGadget.h
+                DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
+install (TARGETS gadgetron_gpugadget DESTINATION lib COMPONENT main)
+
diff --git a/gadgets/gpu/cuFFTGadget.cpp b/gadgets/gpu/cuFFTGadget.cpp
new file mode 100644
index 0000000..e7724f9
--- /dev/null
+++ b/gadgets/gpu/cuFFTGadget.cpp
@@ -0,0 +1,26 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "cuFFTGadget.h"
+#include "cuNDFFT.h"
+namespace Gadgetron{
+
+  int cuFFTGadget::process( GadgetContainerMessage< ISMRMRD::ImageHeader>* m1,
+			  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+  	{
+	  hoNDArray<complext<float>> * tmp = (hoNDArray<complext<float>>*) m2->getObjectPtr();
+	  cuNDArray< complext<float> > cu_data(*tmp);
+	  cu_data.squeeze();
+	  std::cout << "PENGUIN: ";
+	  for (int i = 0; i < cu_data.get_number_of_dimensions(); i++) std::cout << cu_data.get_size(i) <<  " ";
+	  std::cout << std::endl;
+	  cuNDFFT<float>::instance()->ifft(&cu_data);
+	  cu_data.to_host(tmp);
+
+    if (this->next()->putq(m1) < 0) {
+      return GADGET_FAIL;
+    }
+
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(cuFFTGadget)
+}
diff --git a/gadgets/gpu/cuFFTGadget.h b/gadgets/gpu/cuFFTGadget.h
new file mode 100644
index 0000000..e8e7d0a
--- /dev/null
+++ b/gadgets/gpu/cuFFTGadget.h
@@ -0,0 +1,33 @@
+#pragma once
+#include "Gadget.h"
+#include "hoNDArray.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+#include <boost/make_shared.hpp>
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_GPUGADGET__)
+#define EXPORTGPUGADGET __declspec(dllexport)
+#else
+#define EXPORTGPUGADGET __declspec(dllimport)
+#endif
+#else
+#define EXPORTGPUGADGET
+#endif
+
+
+namespace Gadgetron{
+
+  class EXPORTGPUGADGET cuFFTGadget :
+  public Gadget2<ISMRMRD::ImageHeader, hoNDArray< std::complex<float> > >
+    {
+    public:
+      GADGET_DECLARE(cuFFTGadget)
+
+	protected:
+      virtual int process( GadgetContainerMessage< ISMRMRD::ImageHeader>* m1,
+			   GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+    };
+}
diff --git a/gadgets/grappa/CMakeLists.txt b/gadgets/grappa/CMakeLists.txt
new file mode 100644
index 0000000..e7ec524
--- /dev/null
+++ b/gadgets/grappa/CMakeLists.txt
@@ -0,0 +1,75 @@
+if (WIN32)
+    ADD_DEFINITIONS(-D__BUILD_GADGETRON_GRAPPA__)
+endif (WIN32)
+
+find_package(Ismrmrd REQUIRED)
+
+include_directories(
+    ${CMAKE_SOURCE_DIR}/gadgets/mri_core
+    ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/core
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/fft/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/fft/gpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/mri_core
+    ${ARMADILLO_INCLUDE_DIRS}
+)
+
+add_library(gadgetron_grappa SHARED 
+    gadgetron_grappa_export.h
+    GrappaCalibrationBuffer.h
+    GrappaGadget.h
+    GrappaUnmixingGadget.h
+    GrappaWeights.h
+    GrappaWeightsCalculator.h
+    GrappaGadget.cpp
+    GrappaCalibrationBuffer.cpp
+    GrappaWeights.cpp
+    GrappaWeightsCalculator.cpp
+    GrappaUnmixingGadget.cpp
+    )
+
+set_target_properties(gadgetron_grappa PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+if(CUDA_FOUND)
+    target_link_libraries(gadgetron_grappa 
+      gadgetron_gadgetbase
+      gadgetron_toolbox_log
+      gadgetron_toolbox_gpuparallelmri
+      gadgetron_toolbox_cpucore
+      gadgetron_toolbox_cpucore_math
+      gadgetron_toolbox_cpufft
+      gadgetron_toolbox_gpufft
+      gadgetron_toolbox_mri_core
+      ${Boost_LIBRARIES}
+      ${ISMRMRD_LIBRARIES} ${FFTW3_LIBRARIES} 
+      optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} 
+      )
+else (CUDA_FOUND)
+    target_link_libraries(gadgetron_grappa 
+      gadgetron_gadgetbase
+      gadgetron_toolbox_log
+      gadgetron_toolbox_cpucore
+      gadgetron_toolbox_cpucore_math
+      gadgetron_toolbox_cpufft
+      gadgetron_toolbox_mri_core
+      ${Boost_LIBRARIES}
+      ${ISMRMRD_LIBRARIES} ${FFTW3_LIBRARIES} 
+      optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} 
+      )
+endif (CUDA_FOUND)
+
+install (FILES  gadgetron_grappa_export.h
+                GrappaCalibrationBuffer.h
+                GrappaGadget.h
+                GrappaUnmixingGadget.h
+                GrappaWeights.h
+                GrappaWeightsCalculator.h 
+                DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
+install (TARGETS gadgetron_grappa DESTINATION lib COMPONENT main)
+
+add_subdirectory(config)
diff --git a/gadgets/grappa/GrappaCalibrationBuffer.cpp b/gadgets/grappa/GrappaCalibrationBuffer.cpp
new file mode 100644
index 0000000..b2c4767
--- /dev/null
+++ b/gadgets/grappa/GrappaCalibrationBuffer.cpp
@@ -0,0 +1,144 @@
+#include "GrappaCalibrationBuffer.h"
+#include "GadgetIsmrmrdReadWrite.h"
+
+namespace Gadgetron{
+
+  GrappaCalibrationBuffer::GrappaCalibrationBuffer(std::vector<size_t> dimensions,
+                                                   boost::shared_ptr<GrappaWeights<float> > w,
+                                                   GrappaWeightsCalculator<float>* weights_calculator)
+    : weights_(w)
+    , weights_calculator_(weights_calculator)
+    , buffer_counter_(dimensions[1])
+    , biggest_gap_current_(0)
+    , acceleration_factor_(0)
+    , last_line_(0)
+    , weights_invalid_(true)
+  {
+    dimensions_ = dimensions;
+    try {
+      buffer_.create(&dimensions_);
+      buffer_.fill(std::complex<float>(0.0,0.0));
+    } catch (std::runtime_error & err){
+      GEXCEPTION(err,"Unable to allocate memory for GRAPPA buffer");
+    }
+  
+  }
+
+  int GrappaCalibrationBuffer::add_data(ISMRMRD::AcquisitionHeader* m1, hoNDArray< std::complex<float> >* m2,
+					 unsigned short line_offset, unsigned short partition_offset)
+  {
+    if (!buffer_.get_data_ptr()) {
+      GDEBUG("Buffer not allocated, cannot add data");
+      return GADGET_FAIL;
+    }
+  
+    unsigned int samples =  m1->number_of_samples;
+    unsigned int line = m1->idx.kspace_encode_step_1 + line_offset;
+    unsigned int partition = m1->idx.kspace_encode_step_2 + partition_offset;
+    unsigned int slice = m1->idx.slice; //We should probably check this
+
+    if (samples != dimensions_[0]) {
+      GDEBUG("Wrong number of samples received\n");
+      return GADGET_FAIL;    
+    }
+
+    std::complex<float>* b = buffer_.get_data_ptr();
+    std::complex<float>* d = m2->get_data_ptr();
+
+    size_t offset= 0;
+    //Copy the data for all the channels
+    for (int c = 0; c < m1->active_channels; c++) {
+      offset = 
+        c*dimensions_[0]*dimensions_[1]*dimensions_[2] +
+        partition*dimensions_[0]*dimensions_[1] +
+        line*dimensions_[0];
+      memcpy(b+offset,d+c*samples,sizeof(std::complex<float>)*samples);
+    }
+
+    int buf_update  = buffer_counter_.update_line(line, m1->position,
+                                                  m1->read_dir, m1->phase_dir, m1->slice_dir);
+
+    if ( buf_update < 0) {
+      GDEBUG("Unable to update buffer counter for line %d\n", line);
+      return GADGET_FAIL;
+    }
+
+    //Let's figure out if we should start a weight calculation job
+  
+    //This means that the orientation changed
+    if (buf_update == 1) {
+      weights_invalid_ = true;
+    }
+
+    bool is_first_scan_in_slice = m1->isFlagSet(ISMRMRD::ISMRMRD_ACQ_FIRST_IN_SLICE);
+
+
+    //Depending on the sequence used, we could get into trouble if the sequence switches slice acquisition scheme before finishing a slice.
+    bool acquiring_sequentially = line > last_line_;
+
+    if (is_first_scan_in_slice) {
+      biggest_gap_current_ = 0;
+    } else if (acquiring_sequentially){
+      unsigned int gap = std::abs(static_cast<int>(last_line_) - static_cast<int>(line));
+      if (gap != biggest_gap_current_) biggest_gap_current_ = gap;
+    } else {
+      biggest_gap_current_ = 0;
+    }
+    last_line_ = line;
+
+
+    bool is_last_scan_in_slice = m1->isFlagSet(ISMRMRD::ISMRMRD_ACQ_LAST_IN_SLICE);
+
+    if (is_last_scan_in_slice && acquiring_sequentially) {
+      unsigned int min_ky, max_ky;
+
+      if (biggest_gap_current_ != acceleration_factor_) {
+        acceleration_factor_ = biggest_gap_current_;
+        weights_invalid_ = true;
+      }
+ 
+      if (buffer_counter_.get_region_of_support(min_ky, max_ky) < 0) {
+        GDEBUG("Unable to query min_ky, max_ky\n");
+        return GADGET_FAIL;
+      }
+    
+      //If there is nothing on the queue, we might as well recalculate
+      if (weights_calculator_->msg_queue()->message_count() < 1) {
+        //GDEBUG("Queue is empty, invalidating weights\n");
+        weights_invalid_ = true;
+      } else {
+        //GDEBUG("Queue is NOT EMPTY, calculation not triggered\n");
+      }
+
+      if (weights_invalid_ && ((max_ky-min_ky) > acceleration_factor_)) {
+        std::vector< std::pair<unsigned int, unsigned int> > sampled_region;
+        sampled_region.push_back(std::pair<unsigned int, unsigned int>(0, samples-1));
+        sampled_region.push_back(std::pair<unsigned int, unsigned int>(min_ky, max_ky));
+
+        std::vector<unsigned int> uncombined_channel_weights;
+
+        //GDEBUG_STREAM("==========================================================================");
+        //GDEBUG_STREAM("compute weights on scan : " << m1->scan_counter);
+        //GDEBUG("sampled_region[0] = %d,%d\n", sampled_region[0].first, sampled_region[0].second);
+        //GDEBUG("sampled_region[1] = %d,%d\n", sampled_region[1].first, sampled_region[1].second);
+
+        if (!weights_calculator_) {
+          GDEBUG("Weights calculator not defined\n");
+          return GADGET_FAIL;
+        }
+
+        weights_calculator_->add_job( &buffer_,
+                                      sampled_region,
+                                      acceleration_factor_,
+                                      weights_,
+                                      uncombined_channel_weights,
+                                      true);
+
+        weights_invalid_ = false;
+      }
+    }
+
+
+    return GADGET_OK;
+  }
+}
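The buffer above estimates the in-plane acceleration factor from the largest ky step seen between sequentially acquired lines in a slice, and re-triggers a GRAPPA weight computation on the last scan of a slice only when the weights have been invalidated and the calculator's queue is empty. A standalone sketch of the gap-based estimate, using an assumed R=2 sampling pattern and the same update rule as the buffer:

    #include <vector>
    #include <iostream>

    int main()
    {
      // Assumed example: ky lines acquired sequentially with R=2 undersampling.
      std::vector<unsigned int> ky = {0, 2, 4, 6, 8, 10};

      unsigned int biggest_gap = 0;
      unsigned int last_line = ky[0];
      for (size_t i = 1; i < ky.size(); ++i) {
        if (ky[i] > last_line) {                       // sequential acquisition only
          unsigned int gap = ky[i] - last_line;
          if (gap != biggest_gap) biggest_gap = gap;   // same update rule as above
        }
        last_line = ky[i];
      }
      std::cout << "estimated acceleration factor: " << biggest_gap << std::endl;
      return 0;
    }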
diff --git a/gadgets/grappa/GrappaCalibrationBuffer.h b/gadgets/grappa/GrappaCalibrationBuffer.h
new file mode 100644
index 0000000..84e4b8e
--- /dev/null
+++ b/gadgets/grappa/GrappaCalibrationBuffer.h
@@ -0,0 +1,150 @@
+#ifndef GRAPPACALIBRATIONBUFFER_H
+#define GRAPPACALIBRATIONBUFFER_H
+
+#include "gadgetron_grappa_export.h"
+#include "ismrmrd/ismrmrd.h"
+#include "hoNDArray.h"
+#include "GrappaWeights.h"
+#include "GrappaWeightsCalculator.h"
+
+#include <vector>
+#include <string.h>
+#include <memory>
+#include <complex>
+
+namespace Gadgetron{
+
+class EXPORTGADGETSGRAPPA CalibrationBufferCounter
+{
+
+ public:
+  CalibrationBufferCounter(unsigned int lines)  {
+    lines_sampled_ = std::vector<unsigned int>(lines,0);
+    memset(position_, 0, 3*sizeof(float));
+    memset(read_dir_, 0, 3*sizeof(float));
+    memset(phase_dir_, 0, 3*sizeof(float));
+    memset(slice_dir_, 0, 3*sizeof(float));
+  }
+
+
+  virtual ~CalibrationBufferCounter() {}
+
+  int update_line(unsigned int ky_index, float* position,
+        float* read_dir, float* phase_dir, float* slice_dir)
+  {
+    int ret_val = 0;
+
+    if (!read_dir_equal(read_dir) || 
+                !phase_dir_equal(phase_dir) ||
+                !slice_dir_equal(slice_dir) ||
+                !position_equal(position)) {
+      for (unsigned int i = 0; i < lines_sampled_.size(); i++) {
+	lines_sampled_[i] = 0;
+      }
+      memcpy(position_,position,3*sizeof(float));
+      memcpy(read_dir_,read_dir,3*sizeof(float));
+      memcpy(phase_dir_,phase_dir,3*sizeof(float));
+      memcpy(slice_dir_,slice_dir,3*sizeof(float));
+      ret_val = 1;
+    }
+
+    if (ky_index >= lines_sampled_.size()) {
+      return -1;
+    }
+
+    lines_sampled_[ky_index] = 1;
+
+    return ret_val;
+  }
+
+  int get_region_of_support(unsigned int& min_ky_index, unsigned int& max_ky_index) {
+    
+    unsigned int current_start_line = 0;
+    min_ky_index = 0;
+    max_ky_index = 0;
+    while (current_start_line < lines_sampled_.size() ) {
+      while ((current_start_line < lines_sampled_.size()) && (lines_sampled_[current_start_line] == 0) ) {	
+       	current_start_line++;
+      }
+      if (current_start_line >= lines_sampled_.size()) continue;
+
+      unsigned int region_start = current_start_line;
+      while ((current_start_line < lines_sampled_.size()) && (lines_sampled_[current_start_line] > 0)) {	
+       	current_start_line++;
+      }
+      unsigned int region_end = current_start_line-1;
+      if ((region_start < region_end) && ((region_end-region_start) > (max_ky_index-min_ky_index))) {
+	min_ky_index = region_start;
+	max_ky_index = region_end;
+      }
+    }
+    return 0;
+  }
+
+ protected:
+  float           position_[3];
+  float           read_dir_[3];
+  float           phase_dir_[3];
+  float           slice_dir_[3];
+
+  bool position_equal(float* position) {
+    for (unsigned int i = 0; i < 3; i++) {
+      if (position_[i] != position[i]) return false;
+    }
+    return true;
+  }
+
+  bool read_dir_equal(float* cosines) {
+    for (unsigned int i = 0; i < 3; i++) {
+      if (read_dir_[i] != cosines[i]) return false;
+    }
+    return true;
+  }
+
+  bool phase_dir_equal(float* cosines) {
+    for (unsigned int i = 0; i < 3; i++) {
+      if (phase_dir_[i] != cosines[i]) return false;
+    }
+    return true;
+  }
+
+  bool slice_dir_equal(float* cosines) {
+    for (unsigned int i = 0; i < 3; i++) {
+      if (slice_dir_[i] != cosines[i]) return false;
+    }
+    return true;
+  }
+
+ private:
+  std::vector<unsigned int> lines_sampled_;
+
+};
+
+class EXPORTGADGETSGRAPPA GrappaCalibrationBuffer
+{
+
+ public:
+  GrappaCalibrationBuffer(std::vector<size_t> dimensions, 
+			  boost::shared_ptr< GrappaWeights<float> > w,
+			  GrappaWeightsCalculator<float>* weights_calculator);
+  virtual ~GrappaCalibrationBuffer() {}
+
+  int add_data(ISMRMRD::AcquisitionHeader* m1, hoNDArray< std::complex<float> >* m2, 
+	       unsigned short line_offset = 0, unsigned short partition_offset = 0);
+
+ private:
+  hoNDArray< std::complex<float> > buffer_;
+  std::vector<size_t> dimensions_;
+  boost::shared_ptr< GrappaWeights<float> > weights_;
+  GrappaWeightsCalculator<float>* weights_calculator_;
+  CalibrationBufferCounter buffer_counter_;
+
+  unsigned int biggest_gap_current_;
+  unsigned int acceleration_factor_;
+  unsigned int last_line_;
+  bool weights_invalid_;
+};
+
+}
+
+#endif
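get_region_of_support above scans the per-slice sampled-line mask for the longest contiguous run of acquired ky lines and returns its bounds; that run becomes the calibration region handed to the weights calculator (as in the buffer, a single isolated line never forms a region). A standalone sketch of the same scan over an assumed example mask:

    #include <vector>
    #include <iostream>

    int main()
    {
      // 1 = line sampled, 0 = not sampled (assumed example pattern).
      std::vector<unsigned int> lines = {0, 0, 1, 1, 1, 1, 0, 1, 1, 0};

      unsigned int min_ky = 0, max_ky = 0, i = 0;
      while (i < lines.size()) {
        while (i < lines.size() && lines[i] == 0) ++i;       // skip an unsampled gap
        unsigned int start = i;
        while (i < lines.size() && lines[i] > 0) ++i;        // walk the sampled run
        if (i > start && (i - 1 - start) > (max_ky - min_ky)) {
          min_ky = start;
          max_ky = i - 1;
        }
      }
      std::cout << "region of support: [" << min_ky << ", " << max_ky << "]" << std::endl;
      return 0;
    }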
diff --git a/gadgets/grappa/GrappaGadget.cpp b/gadgets/grappa/GrappaGadget.cpp
new file mode 100644
index 0000000..c69e0a4
--- /dev/null
+++ b/gadgets/grappa/GrappaGadget.cpp
@@ -0,0 +1,416 @@
+#include "GrappaGadget.h"
+#include "GrappaUnmixingGadget.h"
+#include "ismrmrd/xml.h"
+
+#include <ace/OS_NS_stdlib.h>
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/split.hpp>
+
+
+namespace Gadgetron{
+
+  GrappaGadget::GrappaGadget()
+    : image_counter_(0)
+    , image_series_(0)
+    , first_call_(true)
+    , target_coils_(0)
+    , use_gpu_(true)
+  {
+  }
+
+  GrappaGadget::~GrappaGadget()
+  {
+    for (unsigned int i = 0; i < buffers_.size(); i++) {
+      if (buffers_[i]) delete buffers_[i];
+      buffers_[i] = 0;
+
+
+      if (image_data_[i]) {
+        image_data_[i]->release();
+        image_data_[i] = 0;
+      }
+    }
+  }
+
+  int GrappaGadget::close(unsigned long flags) {
+    int ret = Gadget::close(flags);
+    GDEBUG("Shutting down GRAPPA Gadget\n");
+
+    if (weights_calculator_.close(flags) < 0) {
+      GDEBUG("Failed to close down weights calculator\n");
+      return GADGET_FAIL;
+    }
+
+    return ret;
+  }
+
+  int GrappaGadget::process_config(ACE_Message_Block* mb)
+  {
+    ISMRMRD::IsmrmrdHeader h;
+    ISMRMRD::deserialize(mb->rd_ptr(),h);
+
+    if (h.encoding.size() != 1) {
+      GDEBUG("Number of encoding spaces: %d\n", h.encoding.size());
+      GDEBUG("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+
+    ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+    ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+    ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+
+    unsigned int slices = e_limits.slice ? e_limits.slice->maximum + 1 : 1;
+    dimensions_.push_back(e_space.matrixSize.x);
+    dimensions_.push_back(e_space.matrixSize.y);
+    dimensions_.push_back(e_space.matrixSize.z);
+    dimensions_.push_back((h.acquisitionSystemInformation && h.acquisitionSystemInformation->receiverChannels) ?
+                          *(h.acquisitionSystemInformation->receiverChannels) : 1);
+    dimensions_.push_back(slices);
+
+
+    GDEBUG("Dimensions %d, %d, %d, %d, %d\n", dimensions_[0], dimensions_[1], dimensions_[2], dimensions_[3], dimensions_[4]);
+
+    image_dimensions_.push_back(r_space.matrixSize.x); 
+    image_dimensions_.push_back(r_space.matrixSize.y);
+    image_dimensions_.push_back(r_space.matrixSize.z);
+    image_dimensions_.push_back(dimensions_[3]);
+
+    fov_.push_back(r_space.fieldOfView_mm.x);
+    fov_.push_back(r_space.fieldOfView_mm.y);
+    fov_.push_back(r_space.fieldOfView_mm.z);
+
+    line_offset_ = (dimensions_[1]>>1)-e_limits.kspace_encoding_step_1->center;
+
+    if (h.userParameters) {
+      for (size_t i = 0; i < h.userParameters->userParameterString.size(); i++) {
+	std::string name = h.userParameters->userParameterString[i].name;
+	std::string value = h.userParameters->userParameterString[i].value;
+	if (name.substr(0,5) == std::string("COIL_")) {
+	  int coil_num = std::atoi(name.substr(5,name.size()-5).c_str());
+	  channel_map_[value] = coil_num;
+	}
+      }
+    }
+
+    return GADGET_OK;
+  }
+
+
+  int GrappaGadget::initial_setup()
+  {
+
+
+    weights_ = std::vector< boost::shared_ptr<GrappaWeights<float> > >(dimensions_[4]);
+
+    buffers_ = std::vector<GrappaCalibrationBuffer* >(dimensions_[4],0);
+    time_stamps_ = std::vector<ACE_UINT32>(dimensions_[4],0);
+
+    //Let's figure out the number of target coils
+    target_coils_ = target_coils.value();
+    if ((target_coils_ <= 0) || (target_coils_ > dimensions_[3])) {
+      target_coils_ = dimensions_[3];
+    }
+
+    GDEBUG("Running GRAPPA recon with %d source channels and %d target channels\n", dimensions_[3], target_coils_);
+
+    weights_calculator_.set_number_of_target_coils(target_coils_);
+
+    use_gpu_ = use_gpu.value();
+    GDEBUG_STREAM("use_gpu_ is " << use_gpu_);
+
+    weights_calculator_.set_use_gpu(use_gpu_);
+
+    if (device_channels.value()) {
+      GDEBUG("We got the number of device channels from other gadget: %d\n", device_channels.value());
+      for (int i = 0; i < device_channels.value(); i++) {
+	weights_calculator_.add_uncombined_channel((unsigned int)i);
+      }
+    } else {
+      //Let's figure out if we have channels that are supposed to be uncombined
+      std::string uncomb_str = uncombined_channels.value();
+      std::vector<std::string> uncomb;
+      boost::split(uncomb, uncomb_str, boost::is_any_of(","));
+      for (unsigned int i = 0; i < uncomb.size(); i++) {
+	std::string ch = boost::algorithm::trim_copy(uncomb[i]);
+	if (ch.size() > 0) {
+	  unsigned int channel_id = static_cast<unsigned int>(ACE_OS::atoi(ch.c_str()));
+	  weights_calculator_.add_uncombined_channel(channel_id);
+	}
+      }
+      
+      uncomb_str = uncombined_channels_by_name.value();
+      if (uncomb_str.size()) {
+	GDEBUG("uncomb_str: %s\n",  uncomb_str.c_str());
+	boost::split(uncomb, uncomb_str, boost::is_any_of(","));
+	for (unsigned int i = 0; i < uncomb.size(); i++) {
+	std::string ch = boost::algorithm::trim_copy(uncomb[i]);
+	map_type_::iterator it = channel_map_.find(ch);
+	if (it != channel_map_.end()) {
+	  unsigned int channel_id = static_cast<unsigned int>(it->second);
+	  GDEBUG("Device channel: %s (%d)\n",  uncomb[i].c_str(), channel_id);
+	  weights_calculator_.add_uncombined_channel(channel_id);
+	}
+	/*
+	  if (ch.size() > 0) {
+	  unsigned int channel_id = static_cast<unsigned int>(ACE_OS::atoi(ch.c_str()));
+	  weights_calculator_.add_uncombined_channel(channel_id);
+	  }
+	*/
+	}
+      }
+    }
+    
+
+    for (unsigned int i = 0; i < buffers_.size(); i++) {
+      weights_[i] = boost::shared_ptr<GrappaWeights<float> >(new GrappaWeights<float>());
+
+      //Let's set some default GRAPPA weights, so that we have something to work with the first couple of frames.
+      /*
+        std::vector<unsigned int> wdims = image_dimensions_;
+        if (weights_calculator_.get_number_of_uncombined_channels()) {
+        wdims.push_back(weights_calculator_.get_number_of_uncombined_channels()+1);
+        }
+
+        hoNDArray< std::complex<float> > tmp_w;
+        if (!tmp_w.create(&wdims)) {
+        GDEBUG("Unable to create temporary array with dimensions\n");
+        return GADGET_FAIL;
+        }
+        tmp_w.clear(std::complex<float>(1.0,0));
+        weights_[i]->update(&tmp_w);
+      */
+
+      buffers_[i] = new GrappaCalibrationBuffer(image_dimensions_,
+                                                weights_[i],
+                                                &weights_calculator_);
+    }
+
+
+    if (weights_calculator_.open() < 0) {
+      GDEBUG("Failed to open GrappaWeightsCalculator\n");
+      return GADGET_FAIL;
+    }
+
+    image_data_ = std::vector< GadgetContainerMessage< hoNDArray< std::complex<float> > >* >(dimensions_[4],0);
+    for (unsigned int i = 0; i < image_data_.size(); i++) {
+      if (create_image_buffer(i) != GADGET_OK) {
+        GDEBUG("Unable to create image buffers");
+        return GADGET_FAIL;
+      }
+    }
+
+    image_series_ = image_series.value();
+
+    return GADGET_OK;
+  }
+
+
+  int GrappaGadget::
+  process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+          GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+  {
+      bool is_noise = m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_IS_NOISE_MEASUREMENT);
+
+      //We should not be receiving noise here
+      if (is_noise) {
+	m1->release();
+	return GADGET_OK;
+      }
+
+
+    if (first_call_) {
+      if (m1->getObjectPtr()->active_channels != dimensions_[3]) {
+        GDEBUG("Detected coil number change. Maybe due to upstream channel reduction\n");
+        dimensions_[3] = m1->getObjectPtr()->active_channels;
+      }
+
+      if (initial_setup() != GADGET_OK) {
+        GDEBUG("Initial Setup Failed\n");
+        m1->release();
+        return GADGET_FAIL;
+      }
+      first_call_ = false;
+    }
+
+    ISMRMRD::AcquisitionHeader* acq_head = m1->getObjectPtr();
+
+    unsigned int samples =  acq_head->number_of_samples;
+    unsigned int line = acq_head->idx.kspace_encode_step_1 + line_offset_;
+    unsigned int partition = acq_head->idx.kspace_encode_step_2;
+    unsigned int slice = acq_head->idx.slice;
+
+    if (samples != image_dimensions_[0]) {
+      GDEBUG("GrappaGadget: wrong number of samples received %d, expected %d\n", samples, image_dimensions_[0]);
+      return GADGET_FAIL;
+    }
+
+    if (slice >= image_data_.size()) {
+      GDEBUG("Invalid slice number received\n");
+      return GADGET_FAIL;
+    }
+
+    if (!image_data_[slice]) {
+      if (create_image_buffer(slice) != GADGET_OK) {
+        GDEBUG("Failed to allocate new slice buffer\n");
+        return GADGET_FAIL;
+      }
+    }
+
+    std::complex<float>* b = image_data_[slice]->getObjectPtr()->get_data_ptr();
+    std::complex<float>* d = m2->getObjectPtr()->get_data_ptr();
+
+    size_t offset= 0;
+    //Copy the data for all the channels
+    for (int c = 0; c < m1->getObjectPtr()->active_channels; c++) {
+      offset =
+        c*image_dimensions_[0]*image_dimensions_[1]*image_dimensions_[2] +
+        partition*image_dimensions_[0]*image_dimensions_[1] +
+        line*image_dimensions_[0];
+
+      memcpy(b+offset,d+c*samples,sizeof(std::complex<float>)*samples);
+    }
+
+
+    bool is_last_scan_in_slice = m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_LAST_IN_SLICE);
+
+    bool is_first_scan_in_slice = m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_FIRST_IN_SLICE);
+
+    if (is_first_scan_in_slice) {
+      time_stamps_[slice] = m1->getObjectPtr()->acquisition_time_stamp;
+    }
+
+    if (is_last_scan_in_slice) {
+
+      GadgetContainerMessage<GrappaUnmixingJob>* cm0 =
+        new GadgetContainerMessage<GrappaUnmixingJob>();
+
+      GadgetContainerMessage<ISMRMRD::ImageHeader>* cm1 =
+        new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+
+
+      /*
+        GadgetContainerMessage< hoNDArray<std::complex<float> > >* cm2 =
+        new GadgetContainerMessage< hoNDArray<std::complex<float> > >();
+
+        std::vector<unsigned int> combined_dims(3,0);
+        combined_dims[0] = image_dimensions_[0];
+        combined_dims[1] = image_dimensions_[1];
+        combined_dims[2] = image_dimensions_[2];
+
+        if (weights_calculator_.get_number_of_uncombined_channels()) {
+        combined_dims.push_back(weights_calculator_.get_number_of_uncombined_channels()+1);
+        }
+
+        if (!cm2->getObjectPtr()->create(&combined_dims)) {
+        GDEBUG("Unable to create combined image array\n");
+        return GADGET_FAIL;
+        }
+
+        cm1->cont(cm2);
+      */
+
+      cm1->getObjectPtr()->matrix_size[0] = image_dimensions_[0];
+      cm1->getObjectPtr()->matrix_size[1] = image_dimensions_[1];
+      cm1->getObjectPtr()->matrix_size[2] = image_dimensions_[2];
+
+      cm1->getObjectPtr()->field_of_view[0] = fov_[0];
+      cm1->getObjectPtr()->field_of_view[1] = fov_[1];
+      cm1->getObjectPtr()->field_of_view[2] = fov_[2];
+
+      cm1->getObjectPtr()->channels       = 1+weights_calculator_.get_number_of_uncombined_channels();
+      cm1->getObjectPtr()->slice              = m1->getObjectPtr()->idx.slice;
+      cm1->getObjectPtr()->acquisition_time_stamp         = time_stamps_[slice];
+
+      memcpy(cm1->getObjectPtr()->position,m1->getObjectPtr()->position,
+             sizeof(float)*3);
+
+      memcpy(cm1->getObjectPtr()->read_dir,m1->getObjectPtr()->read_dir,
+             sizeof(float)*3);
+
+      memcpy(cm1->getObjectPtr()->phase_dir,m1->getObjectPtr()->phase_dir,
+             sizeof(float)*3);
+
+      memcpy(cm1->getObjectPtr()->slice_dir,m1->getObjectPtr()->slice_dir,
+             sizeof(float)*3);
+
+      memcpy(cm1->getObjectPtr()->patient_table_position,m1->getObjectPtr()->patient_table_position, sizeof(float)*3);
+
+      cm1->getObjectPtr()->image_index = ++image_counter_;
+      cm1->getObjectPtr()->image_series_index = image_series_;
+
+
+      cm0->getObjectPtr()->weights_ = weights_[slice];
+      cm0->cont(cm1);
+      cm1->cont(image_data_[slice]);
+
+      image_data_[slice] = 0;
+      if (create_image_buffer(slice) != GADGET_OK) {
+        GDEBUG("Failed to create image buffer");
+        return GADGET_FAIL;
+      }
+
+      if (this->next()->putq(cm0) < 0) {
+        GDEBUG("Failed to pass image on to next Gadget in chain\n");
+        return GADGET_FAIL;
+      }
+
+      /*
+        hoFFT<float>::instance()->ifft(image_data_[slice]->getObjectPtr(),0);
+        hoFFT<float>::instance()->ifft(image_data_[slice]->getObjectPtr(),1);
+        hoFFT<float>::instance()->ifft(image_data_[slice]->getObjectPtr(),2);
+
+        //apply weights
+        float scale_factor = (dimensions_[0] *dimensions_[1] *dimensions_[0] *dimensions_[1])/10;
+
+        int appl_result = weights_[slice]->apply(image_data_[slice]->getObjectPtr(), cm2->getObjectPtr(), scale_factor);
+        if (appl_result < 0) {
+        GDEBUG("Failed to apply GRAPPA weights: error code %d\n", appl_result);
+        return GADGET_FAIL;
+        }
+
+        if (this->next()->putq(cm1) < 0) {
+        GDEBUG("Failed to pass image on to next Gadget in chain\n");
+        return GADGET_FAIL;
+        }
+        image_data_[slice]->getObjectPtr()->clear(std::complex<float>(0.0f,0.0f));
+      */
+    }
+
+    if (buffers_[slice]->add_data(m1->getObjectPtr(),m2->getObjectPtr(), line_offset_) < 0) {
+      GDEBUG("Failed to add incoming data to grappa calibration buffer\n");
+      return GADGET_FAIL;
+    }
+
+    m1->release();
+    return GADGET_OK;
+  }
+
+
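+  // Allocate (or re-allocate) the per-slice image accumulation buffer and zero-fill it
+  // so that newly arriving k-space lines can be added as the slice is acquired.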
+  int GrappaGadget::create_image_buffer(unsigned int slice)
+  {
+    if (slice >= image_data_.size()) {
+      return GADGET_FAIL;
+    }
+
+    if (image_data_[slice] != 0) {
+      image_data_[slice]->release();
+      image_data_[slice] = 0;
+    }
+
+    image_data_[slice] = new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+    try{ image_data_[slice]->getObjectPtr()->create(&image_dimensions_);}
+    catch (std::runtime_error &err){
+      GEXCEPTION(err,"Unable to create image buffers");
+      return GADGET_FAIL;
+    }
+
+    std::fill(image_data_[slice]->getObjectPtr()->get_data_ptr(),
+              image_data_[slice]->getObjectPtr()->get_data_ptr()+image_data_[slice]->getObjectPtr()->get_number_of_elements(),
+              std::complex<float>(0.0f,0.0f));
+
+    return GADGET_OK;
+
+  }
+
+  GADGET_FACTORY_DECLARE(GrappaGadget)
+}
diff --git a/gadgets/grappa/GrappaGadget.h b/gadgets/grappa/GrappaGadget.h
new file mode 100644
index 0000000..7b2ffdd
--- /dev/null
+++ b/gadgets/grappa/GrappaGadget.h
@@ -0,0 +1,75 @@
+#ifndef GRAPPAGADGET_H
+#define GRAPPAGADGET_H
+
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+#include "GrappaCalibrationBuffer.h"
+#include "gadgetron_grappa_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+#include <map>
+
+namespace Gadgetron{
+struct EXPORTGADGETSGRAPPA GrappaBufferInfo
+{
+  float           position[3];
+  float           read_dir[3];
+  float           phase_dir[3];
+  float           slice_dir[3];
+  unsigned int    acceleration_factor;
+};
+
+class EXPORTGADGETSGRAPPA GrappaGadget : 
+public Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+{
+  
+ public:
+  GADGET_DECLARE(GrappaGadget);
+
+  GrappaGadget();
+  virtual ~GrappaGadget();
+
+ protected:
+
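+  // process() accumulates incoming acquisitions in the per-slice calibration buffers and,
+  // when the last scan in a slice arrives, dispatches a GrappaUnmixingJob downstream.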
+  virtual int process_config(ACE_Message_Block* mb);
+  virtual int process( GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1,
+		  GadgetContainerMessage< hoNDArray< std::complex<float> > > * m2 );
+
+  virtual int create_image_buffer(unsigned int slice);
+
+  // We have to override close in this gadget to make sure we wait for the weights calculator.
+  virtual int close(unsigned long flags);
+
+  virtual int initial_setup();
+  bool first_call_;
+
+  GADGET_PROPERTY(target_coils,int, "Number of target coils for GRAPPA recon", 0);
+  GADGET_PROPERTY(use_gpu,bool,"If true, recon will try to use GPU resources (when available)", true);
+  GADGET_PROPERTY(device_channels,int,"Number of device channels", 0);
+  GADGET_PROPERTY(uncombined_channels,std::string,"Uncombined channels (as a comma-separated list of channel indices)", "");
+  GADGET_PROPERTY(uncombined_channels_by_name,std::string,"Uncombined channels (as a comma-separated list of channel names)", "");
+  GADGET_PROPERTY(image_series,int,"Image series number for output images", 0);
+
+ private:
+  typedef std::map< std::string, int > map_type_;
+
+  std::vector< GrappaCalibrationBuffer* > buffers_;
+  std::vector<unsigned int> fov_;
+  std::vector<size_t> dimensions_;
+  std::vector<size_t> image_dimensions_;
+  std::vector< GadgetContainerMessage<  hoNDArray< std::complex<float> > >* > image_data_;
+  std::vector< boost::shared_ptr<GrappaWeights<float> > > weights_;
+  GrappaWeightsCalculator<float> weights_calculator_;
+  std::vector<ACE_UINT32> time_stamps_;
+  int image_counter_;
+  int image_series_;
+  int target_coils_;
+  float phase_encoding_resolution_;
+  unsigned int line_offset_;
+  map_type_ channel_map_;
+  bool use_gpu_;
+};
+}
+#endif //GRAPPAGADGET_H
diff --git a/gadgets/grappa/GrappaUnmixingGadget.cpp b/gadgets/grappa/GrappaUnmixingGadget.cpp
new file mode 100644
index 0000000..b31b7c7
--- /dev/null
+++ b/gadgets/grappa/GrappaUnmixingGadget.cpp
@@ -0,0 +1,70 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "GrappaUnmixingGadget.h"
+#include "hoNDFFT.h"
+
+namespace Gadgetron{
+
+  GrappaUnmixingGadget::GrappaUnmixingGadget() {
+    // TODO Auto-generated constructor stub
+
+  }
+
+  GrappaUnmixingGadget::~GrappaUnmixingGadget() {
+    // TODO Auto-generated destructor stub
+  }
+
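+  // Transform the accumulated k-space to image space and apply the precomputed GRAPPA
+  // unmixing weights attached to the incoming job, passing the combined image downstream.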
+  int GrappaUnmixingGadget::process(GadgetContainerMessage<GrappaUnmixingJob>* m1,
+                                    GadgetContainerMessage<ISMRMRD::ImageHeader>* m2, GadgetContainerMessage<hoNDArray<std::complex<float> > >* m3)
+  {
+    GadgetContainerMessage< hoNDArray<std::complex<float> > >* cm2 =
+			new GadgetContainerMessage< hoNDArray<std::complex<float> > >();
+
+    std::vector<size_t> combined_dims(3,0);
+    combined_dims[0] = m2->getObjectPtr()->matrix_size[0];
+    combined_dims[1] = m2->getObjectPtr()->matrix_size[1];
+    combined_dims[2] = m2->getObjectPtr()->matrix_size[2];
+
+    if (m2->getObjectPtr()->channels > 1) {
+      combined_dims.push_back(m2->getObjectPtr()->channels);
+    }
+
+    try{cm2->getObjectPtr()->create(&combined_dims);}
+    catch (std::runtime_error &err ){
+      GEXCEPTION(err,"Unable to create combined image array\n");
+      return GADGET_FAIL;
+    }
+
+    m1->cont(0);
+    m2->cont(cm2);
+
+    hoNDFFT<float>::instance()->ifft3c(*m3->getObjectPtr());
+    /*
+    hoNDFFT<float>::instance()->ifft(m3->getObjectPtr(),0);
+    hoNDFFT<float>::instance()->ifft(m3->getObjectPtr(),1);
+    hoNDFFT<float>::instance()->ifft(m3->getObjectPtr(),2);
+    */
+
+    if (!m1->getObjectPtr()->weights_) {
+      GDEBUG("Weights are a NULL\n");
+      return GADGET_FAIL;
+    }
+
+    float scale_factor = 1.0;
+    int appl_result = m1->getObjectPtr()->weights_->apply(m3->getObjectPtr(), cm2->getObjectPtr(), scale_factor);
+    if (appl_result < 0) {
+      GDEBUG("Failed to apply GRAPPA weights: error code %d\n", appl_result);
+      return GADGET_FAIL;
+    }
+
+    m1->release();
+    m3->release();
+
+    if (this->next()->putq(m2) < 0) {
+      return GADGET_FAIL;
+    }
+
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(GrappaUnmixingGadget)
+}
diff --git a/gadgets/grappa/GrappaUnmixingGadget.h b/gadgets/grappa/GrappaUnmixingGadget.h
new file mode 100644
index 0000000..a6258fc
--- /dev/null
+++ b/gadgets/grappa/GrappaUnmixingGadget.h
@@ -0,0 +1,32 @@
+#ifndef GRAPPAUNMIXINGGADGET_H_
+#define GRAPPAUNMIXINGGADGET_H_
+
+#include "gadgetron_grappa_export.h"
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "ismrmrd/ismrmrd.h"
+#include "GrappaWeights.h"
+
+#include <complex>
+
+namespace Gadgetron{
+
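+  // Job payload passed from GrappaGadget to GrappaUnmixingGadget: a shared pointer to the
+  // GRAPPA unmixing weights to apply to the accompanying image data.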
+  struct EXPORTGADGETSGRAPPA GrappaUnmixingJob
+  {
+    boost::shared_ptr< GrappaWeights<float> > weights_;
+  };
+
+  class EXPORTGADGETSGRAPPA GrappaUnmixingGadget: public Gadget3<GrappaUnmixingJob, ISMRMRD::ImageHeader, hoNDArray<std::complex<float> > > {
+  public:
+    GADGET_DECLARE(GrappaUnmixingGadget);
+
+    GrappaUnmixingGadget();
+    virtual ~GrappaUnmixingGadget();
+
+  protected:
+    virtual int process(GadgetContainerMessage<GrappaUnmixingJob>* m1,
+                        GadgetContainerMessage<ISMRMRD::ImageHeader>* m2, GadgetContainerMessage<hoNDArray<std::complex<float> > >* m3);
+  };
+}
+
+#endif /* GRAPPAUNMIXINGGADGET_H_ */
diff --git a/gadgets/grappa/GrappaWeights.cpp b/gadgets/grappa/GrappaWeights.cpp
new file mode 100644
index 0000000..49d4dce
--- /dev/null
+++ b/gadgets/grappa/GrappaWeights.cpp
@@ -0,0 +1,111 @@
+#include "../mri_core/GadgetIsmrmrdReadWrite.h"
+#include "GrappaWeights.h"
+#include "hoNDArray_fileio.h"
+
+namespace Gadgetron{
+
+template <class T> int GrappaWeights<T>::
+update(hoNDArray< std::complex<T> >* new_weights)
+{
+  /*
+  ACE_Guard<ACE_Thread_Mutex> guard(mutex_);
+  if (!guard.locked()) {
+    return -1;
+  }
+  */
+
+  mutex_.acquire();
+
+  if (!weights_.dimensions_equal(new_weights)) {
+    try{weights_.create(new_weights->get_dimensions());}
+    catch (std::runtime_error & err){
+      return -2;
+    }
+  }
+
+  memcpy(weights_.get_data_ptr(), new_weights->get_data_ptr(),
+	 weights_.get_number_of_elements()*sizeof(T)*2);
+
+  weights_are_valid_ = true;
+  mutex_.release();
+  cond_.broadcast();
+
+  return 0;
+}
+
+template<class T> int GrappaWeights<T>::
+apply(hoNDArray< std::complex<T> >* data_in,
+      hoNDArray< std::complex<T> >* data_out,
+      T scale)
+{
+  /*
+  ACE_Guard<ACE_Thread_Mutex> guard(mutex_);
+  if (!guard.locked()) {
+    return -1;
+  }
+  */
+
+  mutex_.acquire();
+  if (!weights_are_valid_) {
+	  GDEBUG("Releasing Mutex to Wait for result\n");
+	  mutex_.release();
+	  cond_.wait();
+	  mutex_.acquire();
+  }
+
+
+  if (weights_.get_number_of_elements()%data_in->get_number_of_elements()) {
+    return -3;
+  }
+
+  unsigned int sets = weights_.get_number_of_elements()/data_in->get_number_of_elements();
+  
+  if (sets < 1) {
+    return -4;
+  }
+
+  if (data_out->get_size(data_out->get_number_of_dimensions()-1) != sets) {
+    return -5;
+  }
+
+  unsigned long image_elements = data_out->get_number_of_elements()/sets;
+  unsigned int coils = weights_.get_number_of_elements()/(sets*image_elements);
+  
+  if (weights_.get_number_of_elements() != (image_elements*coils*sets)) {
+    return -6;
+  }
+
+  if (data_in->get_number_of_elements() != (image_elements*coils)) {
+    return -7;
+  }
+
+  if (data_out->get_number_of_elements() != (image_elements*sets)) {
+    return -8;
+  }
+
+  std::complex<T>* weights_ptr = weights_.get_data_ptr();
+  std::complex<T>* in_ptr = data_in->get_data_ptr();
+  std::complex<T>* out_ptr = data_out->get_data_ptr();
+
+  for (unsigned int i = 0; i < image_elements*sets; i++) {
+    out_ptr[i] = 0;
+  }
+
+  for (unsigned int s = 0; s < sets; s++) {
+    for (unsigned int p = 0; p < image_elements; p++) {
+      for (unsigned int c = 0; c < coils; c++) {
+	out_ptr[s*image_elements + p] += 
+	  weights_ptr[s*image_elements*coils + c*image_elements + p] * 
+	  in_ptr[c*image_elements + p]*scale;
+      }
+    }
+  }
+
+  mutex_.release();
+  return 0;
+}
+
+// Template instantiation
+template class EXPORTGADGETSGRAPPA GrappaWeights<float>;
+template class EXPORTGADGETSGRAPPA GrappaWeights<double>;
+}
diff --git a/gadgets/grappa/GrappaWeights.h b/gadgets/grappa/GrappaWeights.h
new file mode 100644
index 0000000..e9de58f
--- /dev/null
+++ b/gadgets/grappa/GrappaWeights.h
@@ -0,0 +1,37 @@
+#pragma once 
+
+#include "gadgetron_grappa_export.h"
+#include "hoNDArray.h"
+
+#include <ace/Synch.h>
+#include <complex>
+
+namespace Gadgetron{
+
+template <class T> class EXPORTGADGETSGRAPPA GrappaWeights
+{
+ public:
+  GrappaWeights()
+    : weights_are_valid_(false)
+    , cond_(cond_mutex_)
+  {
+  }
+  virtual ~GrappaWeights() {}
+  
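+  // Replace the stored weights with new_weights; thread-safe and wakes any thread waiting in apply().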
+  int update(hoNDArray< std::complex<T> >* new_weights);
+
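+  // Apply the stored unmixing weights to data_in, writing the combined (and any uncombined)
+  // images to data_out. Blocks until valid weights are available; returns 0 on success,
+  // a negative code on dimension mismatch.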
+  int apply(hoNDArray< std::complex<T> >* data_in,
+	    hoNDArray< std::complex<T> >* data_out, 
+	    T scale = 1.0);
+
+ private:
+  ACE_Thread_Mutex mutex_;
+  bool weights_are_valid_;
+
+  ACE_Thread_Mutex cond_mutex_;
+  ACE_Condition_Thread_Mutex cond_;
+  hoNDArray< std::complex<T> > weights_;
+
+};
+}
diff --git a/gadgets/grappa/GrappaWeightsCalculator.cpp b/gadgets/grappa/GrappaWeightsCalculator.cpp
new file mode 100644
index 0000000..fa8992d
--- /dev/null
+++ b/gadgets/grappa/GrappaWeightsCalculator.cpp
@@ -0,0 +1,470 @@
+#include "GrappaWeightsCalculator.h"
+#include "GadgetContainerMessage.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "hoNDArray_fileio.h"
+#include "hoNDArray_reductions.h"
+#include "GadgetronTimer.h"
+
+#ifdef USE_CUDA
+    #include "GPUTimer.h"
+    #include "cuNDFFT.h"
+    #include "b1_map.h"
+    #include "htgrappa.h"
+    #include <cuComplex.h>
+#endif // USE_CUDA
+
+#include "complext.h"
+
+#include "hoNDFFT.h"
+#include "hoNDArray_elemwise.h"
+#include "mri_core_grappa.h"
+#include "mri_core_coil_map_estimation.h"
+
+namespace Gadgetron{
+
+template <class T> class EXPORTGADGETSGRAPPA GrappaWeightsDescription
+{
+
+public:
+    std::vector< std::pair<unsigned int, unsigned int> > sampled_region;
+    unsigned int acceleration_factor;
+    boost::shared_ptr<GrappaWeights<T> > destination;
+    std::vector<unsigned int> uncombined_channel_weights;
+    bool include_uncombined_channels_in_combined_weights;
+};
+
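+// Worker thread loop: dequeue weight-calculation jobs, compute the GRAPPA unmixing
+// coefficients (on the GPU when available, otherwise on the CPU) and hand them to the
+// destination GrappaWeights object.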
+template <class T> int GrappaWeightsCalculator<T>::svc(void)  {
+    ACE_Message_Block *mb;
+
+    while (this->getq(mb) >= 0) {
+        if (mb->msg_type() == ACE_Message_Block::MB_HANGUP) {
+            GDEBUG("Hanging up in weights calculator\n");
+            if (this->putq(mb) == -1) {
+              GERROR("GrappaWeightsCalculator::svc, putq");
+              return -1;
+            }
+            break;
+        }
+
+        GadgetContainerMessage< GrappaWeightsDescription<T> >* mb1
+        = AsContainerMessage< GrappaWeightsDescription<T> >(mb);
+
+        if (!mb1) {
+            mb->release();
+            return -2;
+        }
+
+        GadgetContainerMessage< hoNDArray< std::complex<T> > >* mb2
+        = AsContainerMessage< hoNDArray< std::complex<T> > >(mb1->cont());
+
+        if (!mb2) {
+            mb->release();
+            return -3;
+        }
+
+        hoNDArray<float_complext>* host_data =
+                reinterpret_cast< hoNDArray<float_complext>* >(mb2->getObjectPtr());
+
+        size_t ks = 5;
+        size_t power = 3;
+
+#ifndef USE_CUDA
+        use_gpu_ = false;
+#endif // USE_CUDA
+
+        if (use_gpu_)
+        {
+#ifdef USE_CUDA
+            // Copy the image data to the device
+            cuNDArray<float_complext> device_data(host_data);
+            device_data.squeeze();
+
+            std::vector<size_t> ftdims(2,0); ftdims[1] = 1;
+
+            //Go to image space
+             cuNDFFT<float>::instance()->ifft( &device_data, &ftdims);
+
+            size_t RO = device_data.get_size(0);
+            size_t E1 = device_data.get_size(1);
+            size_t CHA = device_data.get_size(2);
+
+            boost::shared_ptr< cuNDArray<float_complext> > csm;
+            {
+                //GPUTimer timer("GRAPPA CSM");
+                csm = estimate_b1_map<float,2>( &device_data, target_coils_ );
+
+                // estimate_b1_map_2D_NIH_Souheil( &device_data, &csm, ks, power, D, DH_D, V1, U1 );
+
+                //GDEBUG("Coils in csm: %d\n", csm->get_size(2));
+            }
+            //Go back to kspace
+            cuNDFFT<float>::instance()->fft(&device_data, &ftdims);
+
+            cuNDArray<complext<float> > unmixing_dev;
+            boost::shared_ptr< std::vector<size_t> > data_dimensions = device_data.get_dimensions();
+
+            if (uncombined_channels_.size() > 0) {
+                data_dimensions->push_back(uncombined_channels_.size()+1);
+            }
+
+            try{unmixing_dev.create(data_dimensions.get());}
+            catch (std::runtime_error &err){
+                GEXCEPTION(err,"Unable to allocate device memory for unmixing coeffcients\n");
+                return GADGET_FAIL;
+            }
+
+            {
+                //GPUTimer unmix_timer("GRAPPA Unmixing");
+                //GadgetronTimer timer("GRAPPA unmixing", true);
+                std::vector<unsigned int> kernel_size;
+
+                //TODO: Add parameters for kernel size
+                kernel_size.push_back(5);
+                kernel_size.push_back(4);
+                if ( htgrappa_calculate_grappa_unmixing(reinterpret_cast< cuNDArray<complext<float> >* >(&device_data),
+                        csm.get(),
+                        (unsigned int)(mb1->getObjectPtr()->acceleration_factor),
+                        &kernel_size,
+                        &unmixing_dev,
+                        &(mb1->getObjectPtr()->sampled_region),
+                        &uncombined_channels_) < 0) {
+                    GDEBUG("GRAPPA unmixing coefficients calculation failed\n");
+                    return GADGET_FAIL;
+                }
+            }
+
+            if (mb1->getObjectPtr()->destination) {
+                boost::shared_ptr< hoNDArray<complext<float> > > unmixing_host = unmixing_dev.to_host();
+
+                //TODO: This reshaping needs to take uncombined channels into account
+                boost::shared_ptr< std::vector<size_t> > tmp_dims = mb2->getObjectPtr()->get_dimensions();
+                if (uncombined_channels_.size()) tmp_dims->push_back((size_t)(uncombined_channels_.size() + 1));
+
+                try {
+                    unmixing_host->reshape(tmp_dims.get());
+                }
+                catch (std::runtime_error &err){
+                    GEXCEPTION(err, "Reshaping of GRAPPA weights failed\n");
+                    return GADGET_FAIL;
+                }
+
+                if (mb1->getObjectPtr()->destination->update(reinterpret_cast<hoNDArray<std::complex<float> >* >(unmixing_host.get())) < 0) {
+                    GDEBUG("Update of GRAPPA weights failed\n");
+                    return GADGET_FAIL;
+                }
+            }
+            else {
+                GDEBUG("Undefined GRAPPA weights destination\n");
+                return GADGET_FAIL;
+            }
+#endif // USE_CUDA
+        }
+        else
+        {
+            host_data->squeeze();
+
+            size_t RO = host_data->get_size(0);
+            size_t E1 = host_data->get_size(1);
+            size_t CHA = host_data->get_size(2);
+
+            std::vector<size_t> data_dimensions;
+            host_data->get_dimensions(data_dimensions);
+
+            if (uncombined_channels_.size() > 0) {
+                data_dimensions.push_back(uncombined_channels_.size() + 1);
+            }
+
+            try{ unmixing_.create(data_dimensions); }
+            catch (std::runtime_error &err){
+                GEXCEPTION(err, "Unable to allocate host memory for unmixing coeffcients\n");
+                return GADGET_FAIL;
+            }
+
+            // compute the unmixing coefficients
+            size_t numUnCombined = uncombined_channels_.size();
+
+            double thres = 0.0005;
+            size_t kRO = 5;
+            size_t kNE1 = 4;
+
+            if (numUnCombined==0)
+            {
+                hoNDArray< std::complex<float> > acs(RO, E1, target_coils_, reinterpret_cast< std::complex<float>* >(host_data->begin()));
+                hoNDArray< std::complex<float> > target_acs(RO, E1, target_coils_, acs.begin());
+
+                // estimate coil map
+                if (!complex_im_.dimensions_equal(&target_acs))
+                {
+                    complex_im_.create(RO, E1, target_coils_);
+                }
+
+                hoNDFFT<float>::instance()->ifft2c(target_acs, complex_im_);
+                Gadgetron::coil_map_2d_Inati(complex_im_, coil_map_, ks, power);
+
+                // compute unmixing coefficients
+                if (mb1->getObjectPtr()->acceleration_factor == 1)
+                {
+                    Gadgetron::conjugate(coil_map_, coil_map_);
+                    Gadgetron::clear(unmixing_);
+                    memcpy(unmixing_.begin(), coil_map_.begin(), coil_map_.get_number_of_bytes());
+                }
+                else
+                {
+                    size_t startRO = mb1->getObjectPtr()->sampled_region[0].first;
+                    size_t endRO = mb1->getObjectPtr()->sampled_region[0].second;
+
+                    size_t startE1 = mb1->getObjectPtr()->sampled_region[1].first;
+                    size_t endE1 = mb1->getObjectPtr()->sampled_region[1].second;
+
+                    Gadgetron::grappa2d_calib_convolution_kernel(acs, target_acs,
+                        (size_t)(mb1->getObjectPtr()->acceleration_factor),
+                        thres, kRO, kNE1, startRO, endRO, startE1, endE1, conv_ker_);
+
+                    Gadgetron::grappa2d_image_domain_kernel(conv_ker_, RO, E1, kIm_);
+
+                    Gadgetron::clear(unmixing_);
+
+                    Gadgetron::grappa2d_unmixing_coeff(kIm_, coil_map_, (size_t)(mb1->getObjectPtr()->acceleration_factor), unmixing_, gFactor_);
+
+                    // GDEBUG_STREAM("cpu triggered - unmixing_ : " << Gadgetron::norm2(unmixing_));
+                }
+            }
+            else
+            {
+                hoNDArray< std::complex<float> > acs(RO, E1, CHA, reinterpret_cast< std::complex<float>* >(host_data->begin()));
+
+                std::vector<size_t> dimTarget(3);
+                dimTarget[0] = RO;
+                dimTarget[1] = E1;
+                dimTarget[2] = target_coils_ + numUnCombined;
+
+                if (!target_acs_.dimensions_equal(&dimTarget))
+                {
+                    target_acs_.create(RO, E1, target_coils_ + numUnCombined);
+                }
+
+                // copy first target_coils_ channels and all uncombined channels to target_acs_
+                size_t sCha, ind(0), ind_uncombined(0);
+                std::list<unsigned int>::iterator it;
+
+                // record from which src channel a target channel is selected
+                std::vector<size_t> srcChaLoc(target_coils_ + numUnCombined);
+                for (sCha = 0; sCha<CHA; sCha++)
+                {
+                    bool uncombined = false;
+                    for (it = uncombined_channels_.begin(); it != uncombined_channels_.end(); it++)
+                    {
+                        if (sCha == *it)
+                        {
+                            uncombined = true;
+                            break;
+                        }
+                    }
+
+                    if (!uncombined)
+                    {
+                        if (ind<target_coils_)
+                        {
+                            memcpy(target_acs_.begin() + ind * RO*E1, acs.begin() + sCha * RO*E1, sizeof(std::complex<float>)*RO*E1);
+                            srcChaLoc[ind] = sCha;
+                            ind++;
+                        }
+                    }
+                    else
+                    {
+                        memcpy(target_acs_.begin() + (target_coils_ + ind_uncombined) * RO*E1, acs.begin() + sCha * RO*E1, sizeof(std::complex<float>)*RO*E1);
+                        srcChaLoc[target_coils_ + ind_uncombined] = sCha;
+                        ind_uncombined++;
+                    }
+                }
+
+                // estimate coil map
+                if (!complex_im_.dimensions_equal(&target_acs_))
+                {
+                    complex_im_.create(RO, E1, target_acs_.get_size(2));
+                }
+
+                hoNDFFT<float>::instance()->ifft2c(target_acs_, complex_im_);
+
+                Gadgetron::coil_map_2d_Inati(complex_im_, coil_map_, ks, power);
+
+                // compute unmixing coefficients
+                if (mb1->getObjectPtr()->acceleration_factor == 1)
+                {
+                    Gadgetron::conjugate(coil_map_, coil_map_);
+
+                    Gadgetron::clear(unmixing_);
+
+                    // copy back to unmixing
+                    size_t t;
+                    for (t = 0; t<target_coils_ + numUnCombined; t++)
+                    {
+                        memcpy(unmixing_.begin() + srcChaLoc[t] * RO*E1, coil_map_.begin() + t*RO*E1, sizeof(std::complex<float>)*RO*E1);
+                    }
+
+                    // set uncombined channels
+                    ind = 1;
+                    for (it = uncombined_channels_.begin(); it != uncombined_channels_.end(); it++)
+                    {
+                        std::complex<float>* pUnmixing = unmixing_.begin() + ind*RO*E1*CHA + (*it)*RO*E1;
+                        for (size_t p = 0; p<RO*E1; p++)
+                        {
+                            pUnmixing[p] = 1;
+                        }
+
+                        ind++;
+                    }
+                }
+                else
+                {
+                    Gadgetron::grappa2d_calib_convolution_kernel(acs, target_acs_,
+                        (size_t)(mb1->getObjectPtr()->acceleration_factor),
+                        thres, kRO, kNE1, conv_ker_);
+
+                    Gadgetron::grappa2d_image_domain_kernel(conv_ker_, RO, E1, kIm_);
+
+                    Gadgetron::clear(unmixing_);
+
+                    hoNDArray< std::complex<float> > unmixing_all_channels(RO, E1, CHA, unmixing_.begin());
+                    Gadgetron::grappa2d_unmixing_coeff(kIm_, coil_map_, (size_t)(mb1->getObjectPtr()->acceleration_factor), unmixing_all_channels, gFactor_);
+
+                    // set unmixing coefficients for uncombined channels
+                    size_t ind = 1;
+                    for (it = uncombined_channels_.begin(); it != uncombined_channels_.end(); it++)
+                    {
+                        memcpy(unmixing_.begin() + ind*RO*E1*CHA, kIm_.begin() + (target_coils_ + ind - 1)*RO*E1*CHA, sizeof(std::complex<float>)*RO*E1*CHA);
+                        ind++;
+                    }
+                }
+            }
+
+            // pass the unmixing coefficients
+            if (mb1->getObjectPtr()->destination)
+            {
+                boost::shared_ptr< hoNDArray< std::complex<float> > > unmixing_host(new hoNDArray< std::complex<float> >());
+                boost::shared_ptr< std::vector<size_t> > tmp_dims = mb2->getObjectPtr()->get_dimensions();
+                if (uncombined_channels_.size()) tmp_dims->push_back((size_t)(uncombined_channels_.size() + 1));
+
+                try {
+                    unmixing_host->create(tmp_dims.get());
+                    Gadgetron::clear(*unmixing_host);
+                }
+                catch (std::runtime_error &err){
+                    GEXCEPTION(err, "Allocation of GRAPPA weights failed\n");
+                    return GADGET_FAIL;
+                }
+
+                memcpy(unmixing_host->begin(), unmixing_.begin(), unmixing_.get_number_of_bytes());
+
+                // GDEBUG_STREAM("cpu triggered ... : " << Gadgetron::norm2(*unmixing_host));
+
+                if (mb1->getObjectPtr()->destination->update(unmixing_host.get()) < 0) {
+                    GDEBUG("Update of GRAPPA weights failed\n");
+                    return GADGET_FAIL;
+                }
+            }
+            else {
+                GDEBUG("Undefined GRAPPA weights destination\n");
+                return GADGET_FAIL;
+            }
+        }
+
+        mb->release();
+    }
+
+    return 0;
+}
+
+template <class T> int GrappaWeightsCalculator<T>::close(unsigned long flags) {
+    int rval = 0;
+    if (flags == 1) {
+        ACE_Message_Block *hangup = new ACE_Message_Block();
+        hangup->msg_type( ACE_Message_Block::MB_HANGUP );
+        if (this->putq(hangup) == -1) {
+            hangup->release();
+            GERROR("GrappaWeightsCalculator::close, putq");
+            return -1;
+        }
+        //GDEBUG("Waiting for weights calculator to finish\n");
+        rval = this->wait();
+        //GDEBUG("Weights calculator to finished\n");
+    }
+    return rval;
+}
+
+
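+// Queue a weight-calculation job: the reference (calibration) data is copied into the
+// message, so the caller's buffer can be reused immediately after this call returns.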
+template <class T> int GrappaWeightsCalculator<T>::
+add_job( hoNDArray< std::complex<T> >* ref_data,
+        std::vector< std::pair<unsigned int, unsigned int> > sampled_region,
+        unsigned int acceleration_factor,
+        boost::shared_ptr< GrappaWeights<T> > destination,
+        std::vector<unsigned int> uncombined_channel_weights,
+        bool include_uncombined_channels_in_combined_weights)
+{
+
+    GadgetContainerMessage< GrappaWeightsDescription<T> >* mb1 =
+            new GadgetContainerMessage< GrappaWeightsDescription<T> >();
+
+    if (!mb1) {
+        return -1;
+    }
+
+    /*
+  for (unsigned int i = 0; i < sampled_region.size(); i++) {
+      GDEBUG("Sampled region %d: [%d, %d]\n", i, sampled_region[i].first, sampled_region[i].second);
+  }
+     */
+
+    mb1->getObjectPtr()->sampled_region = sampled_region;
+    mb1->getObjectPtr()->acceleration_factor = acceleration_factor;
+    mb1->getObjectPtr()->destination = destination;
+    mb1->getObjectPtr()->uncombined_channel_weights = uncombined_channel_weights;
+    mb1->getObjectPtr()->include_uncombined_channels_in_combined_weights =
+            include_uncombined_channels_in_combined_weights;
+
+
+    GadgetContainerMessage< hoNDArray< std::complex<T> > >* mb2 =
+            new GadgetContainerMessage< hoNDArray< std::complex<T> > >();
+
+    if (!mb2) {
+        mb1->release();
+        return -2;
+    }
+
+    mb1->cont(mb2);
+
+    try{mb2->getObjectPtr()->create(ref_data->get_dimensions().get());}
+    catch (std::runtime_error &err ){
+        mb1->release();
+        return -3;
+    }
+
+    memcpy(mb2->getObjectPtr()->get_data_ptr(), ref_data->get_data_ptr(),
+            ref_data->get_number_of_elements()*sizeof(T)*2);
+
+    this->putq(mb1);
+
+    return 0;
+}
+
+template <class T> int GrappaWeightsCalculator<T>::add_uncombined_channel(unsigned int channel_id)
+{
+    remove_uncombined_channel(channel_id);
+    uncombined_channels_.push_back(channel_id);
+    return 0;
+}
+
+template <class T> int GrappaWeightsCalculator<T>::remove_uncombined_channel(unsigned int channel_id)
+{
+    uncombined_channels_.remove(channel_id);
+    return 0;
+}
+
+
+
+template class EXPORTGADGETSGRAPPA GrappaWeightsDescription<float>;
+template class EXPORTGADGETSGRAPPA GrappaWeightsCalculator<float>;
+//template class EXPORTGADGETSGRAPPA GrappaWeightsCalculator<double>; //TODO
+//template class EXPORTGADGETSGRAPPA GrappaWeightsDescription<double>;
+
+}
diff --git a/gadgets/grappa/GrappaWeightsCalculator.h b/gadgets/grappa/GrappaWeightsCalculator.h
new file mode 100644
index 0000000..c608e62
--- /dev/null
+++ b/gadgets/grappa/GrappaWeightsCalculator.h
@@ -0,0 +1,85 @@
+#pragma once
+
+#include "gadgetron_grappa_export.h"
+#include "GrappaWeights.h"
+
+#include <ace/Task.h>
+#include <list>
+
+namespace Gadgetron{
+
+template <class T> class EXPORTGADGETSGRAPPA GrappaWeightsCalculator : public ACE_Task<ACE_MT_SYNCH>
+{
+  typedef ACE_Task<ACE_MT_SYNCH> inherited;
+
+ public:
+  GrappaWeightsCalculator() 
+    : inherited()
+    , target_coils_(0)
+  {
+    #ifdef USE_CUDA
+      use_gpu_ = true;
+    #else
+      use_gpu_ = false;
+    #endif // USE_CUDA
+  }
+
+  virtual ~GrappaWeightsCalculator() { }
+
+  virtual int init(void)
+  {
+    return 0;
+  }
+
+  virtual int open(void* = 0) 
+  {
+    return this->activate( THR_NEW_LWP | THR_JOINABLE, 1 );
+  }
+
+  virtual int close(unsigned long flags);
+  virtual int svc(void);
+
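+  // Enqueue reference (calibration) data for asynchronous GRAPPA weight computation.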
+  virtual int add_job( hoNDArray< std::complex<T> >* ref_data,
+		       std::vector< std::pair<unsigned int, unsigned int> > sampled_region,
+		       unsigned int acceleration_factor,
+		       boost::shared_ptr<GrappaWeights<T> > destination,
+		       std::vector<unsigned int> uncombined_channel_weights,
+		       bool include_uncombined_channels_in_combined_weights = true);
+
+  virtual int add_uncombined_channel(unsigned int channel_id);
+  virtual int remove_uncombined_channel(unsigned int channel_id);
+  virtual int get_number_of_uncombined_channels() {
+    return uncombined_channels_.size();
+  }
+
+  virtual int get_number_of_target_coils() {
+	  return target_coils_;
+  }
+
+  virtual void set_number_of_target_coils(int n) {
+	  target_coils_ = n;
+  }
+
+  bool get_use_gpu() {
+    return use_gpu_;
+  }
+
+  void set_use_gpu(bool v) {
+      use_gpu_ = v;
+  }
+
+ private:
+  std::list<unsigned int> uncombined_channels_;
+  int target_coils_;
+  bool use_gpu_;
+
+  hoNDArray< std::complex<T> > target_acs_;
+
+  hoNDArray< std::complex<T> > complex_im_;
+  hoNDArray< std::complex<T> > conv_ker_;
+  hoNDArray< std::complex<T> > kIm_;
+  hoNDArray< std::complex<T> > coil_map_;
+  hoNDArray< std::complex<T> > unmixing_;
+  hoNDArray< T > gFactor_;
+};
+}
diff --git a/gadgets/grappa/config/CMakeLists.txt b/gadgets/grappa/config/CMakeLists.txt
new file mode 100644
index 0000000..82e7053
--- /dev/null
+++ b/gadgets/grappa/config/CMakeLists.txt
@@ -0,0 +1,6 @@
+
+install (FILES grappa_unoptimized.xml grappa_unoptimized_float.xml DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH} COMPONENT main)
+
+if(ARMADILLO_FOUND)
+  install (FILES grappa.xml grappa_float.xml grappa_float_cpu.xml DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH} COMPONENT main)
+endif(ARMADILLO_FOUND)
diff --git a/gadgets/grappa/config/grappa.xml b/gadgets/grappa/config/grappa.xml
new file mode 100644
index 0000000..22fe4bf
--- /dev/null
+++ b/gadgets/grappa/config/grappa.xml
@@ -0,0 +1,88 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
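+    <!-- GRAPPA reconstruction chain: noise adjustment, PCA-based coil compression and
+         reduction, readout oversampling removal, GRAPPA weight estimation and unmixing,
+         magnitude extraction, scaling and image finishing. -->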
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+ 
+     <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+   <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>RemoveROOversampling</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>Grappa</name>
+      <dll>gadgetron_grappa</dll>
+      <classname>GrappaGadget</classname>
+      <property><name>target_coils</name><value>8</value></property>
+    </gadget>
+
+    <gadget>
+      <name>GrappaUnmixing</name>
+      <dll>gadgetron_grappa</dll>
+      <classname>GrappaUnmixingGadget</classname>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+    
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/grappa/config/grappa_float.xml b/gadgets/grappa/config/grappa_float.xml
new file mode 100644
index 0000000..1fc1e55
--- /dev/null
+++ b/gadgets/grappa/config/grappa_float.xml
@@ -0,0 +1,93 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+ 
+     <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+   <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>RemoveROOversampling</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>Grappa</name>
+      <dll>gadgetron_grappa</dll>
+      <classname>GrappaGadget</classname>
+      <property><name>target_coils</name><value>8</value></property>
+      <property><name>use_gpu</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>GrappaUnmixing</name>
+      <dll>gadgetron_grappa</dll>
+      <classname>GrappaUnmixingGadget</classname>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <!--
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+    -->
+    
+    <!--
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+     -->
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/grappa/config/grappa_float_cpu.xml b/gadgets/grappa/config/grappa_float_cpu.xml
new file mode 100644
index 0000000..1d65a02
--- /dev/null
+++ b/gadgets/grappa/config/grappa_float_cpu.xml
@@ -0,0 +1,94 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+ 
+     <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+   <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>RemoveROOversampling</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>Grappa</name>
+      <dll>gadgetron_grappa</dll>
+      <classname>GrappaGadget</classname>
+      <property><name>target_coils</name><value>8</value></property>
+      <property><name>use_gpu</name><value>false</value></property>
+    </gadget>
+
+    <gadget>
+      <name>GrappaUnmixing</name>
+      <dll>gadgetron_grappa</dll>
+      <classname>GrappaUnmixingGadget</classname>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <!--
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+    -->
+    
+    <!--
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+     -->
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/grappa/config/grappa_unoptimized.xml b/gadgets/grappa/config/grappa_unoptimized.xml
new file mode 100644
index 0000000..006c711
--- /dev/null
+++ b/gadgets/grappa/config/grappa_unoptimized.xml
@@ -0,0 +1,69 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+ 
+     <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>RemoveROOversampling</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>Grappa</name>
+      <dll>gadgetron_grappa</dll>
+      <classname>GrappaGadget</classname>
+      <property><name>target_coils</name><value>8</value></property>
+    </gadget>
+
+    <gadget>
+      <name>GrappaUnmixing</name>
+      <dll>gadgetron_grappa</dll>
+      <classname>GrappaUnmixingGadget</classname>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+    
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/grappa/config/grappa_unoptimized_float.xml b/gadgets/grappa/config/grappa_unoptimized_float.xml
new file mode 100644
index 0000000..4aad6f3
--- /dev/null
+++ b/gadgets/grappa/config/grappa_unoptimized_float.xml
@@ -0,0 +1,73 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+ 
+     <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>RemoveROOversampling</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>Grappa</name>
+      <dll>gadgetron_grappa</dll>
+      <classname>GrappaGadget</classname>
+      <property><name>target_coils</name><value>8</value></property>
+    </gadget>
+
+    <gadget>
+      <name>GrappaUnmixing</name>
+      <dll>gadgetron_grappa</dll>
+      <classname>GrappaUnmixingGadget</classname>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <!--
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+    -->
+    
+    <!--
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+     -->
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/grappa/gadgetron_grappa_export.h b/gadgets/grappa/gadgetron_grappa_export.h
new file mode 100644
index 0000000..457a5bc
--- /dev/null
+++ b/gadgets/grappa/gadgetron_grappa_export.h
@@ -0,0 +1,14 @@
+#ifndef GADGETRON_GRAPPA_EXPORT_H_
+#define GADGETRON_GRAPPA_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_GRAPPA__)
+#define EXPORTGADGETSGRAPPA __declspec(dllexport)
+#else
+#define EXPORTGADGETSGRAPPA __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETSGRAPPA
+#endif
+
+#endif /* GADGETRON_GRAPPA_EXPORT_H_ */
diff --git a/gadgets/gtPlus/CMakeLists.txt b/gadgets/gtPlus/CMakeLists.txt
new file mode 100644
index 0000000..4d4be04
--- /dev/null
+++ b/gadgets/gtPlus/CMakeLists.txt
@@ -0,0 +1,177 @@
+
+include_directories( 
+    ${ACE_INCLUDE_DIR} 
+    ${Boost_INCLUDE_DIR}
+    ${ISMRMRD_INCLUDE_DIR}
+    ${FFTW3_INCLUDE_DIR}
+    ${ARMADILLO_INCLUDE_DIRS}
+    ${CMAKE_SOURCE_DIR}/toolboxes/cloudbus
+    ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/core
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+    ${CMAKE_SOURCE_DIR}/toolboxes/fft/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/operators
+    ${CMAKE_SOURCE_DIR}/toolboxes/operators/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+    ${CMAKE_SOURCE_DIR}/toolboxes/solvers/cpu
+    ${CMAKE_SOURCE_DIR}/gadgets/mri_core
+    ${HDF5_INCLUDE_DIR}
+    ${HDF5_INCLUDE_DIR}/cpp
+    ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow
+    ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/mri_core
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/util
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/workflow
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/algorithm
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/solver
+    ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools
+    ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools/ismrmrd
+    ${CMAKE_SOURCE_DIR}/apps/gadgetron
+    ${CMAKE_SOURCE_DIR}/gadgets/gtPlus
+  )
+
+if (WIN32)
+    add_definitions(-DTIXML_USE_STL)
+    add_definitions(-D__BUILD_GADGETS__)
+    link_directories(${Boost_LIBRARY_DIRS})
+endif (WIN32)
+
+find_package(Ismrmrd REQUIRED)
+
+set( gtCloud_files 
+        config/gtCloud/myCloud_2DT.txt 
+        config/gtCloud/myCloud_3DT.txt 
+        config/gtCloud/myCloud_2DT_DualLayer.txt 
+        config/gtCloud/myCloud_2DT_DualLayer_FirstLayer.txt )
+
+set( config_files 
+
+    config/GT_2DT_Cartesian.xml
+    config/GT_2DT_Cartesian_CloudNode.xml
+    config/GT_2DT_Cartesian_Dicom.xml
+    config/GT_2DT_Cartesian_DualLayer_Gateway_L1SPIRIT.xml
+    config/GT_2DT_Cartesian_DualLayer_Gateway_SPIRIT.xml
+    config/GT_2DT_Cartesian_FirstLayer_CloudNode.xml
+    config/GT_2DT_Cartesian_GFactor.xml
+    config/GT_2DT_Cartesian_ImageTrigger_Dicom.xml
+    config/GT_2DT_Cartesian_L1SPIRIT.xml
+    config/GT_2DT_Cartesian_PseudoReplica_SNRUnitRecon.xml
+    config/GT_2DT_Cartesian_SingleLayer_CloudNode.xml
+    config/GT_2DT_Cartesian_SPIRIT.xml
+    config/GT_2DT_FatWater.xml
+
+    config/GT_2DT_HASTE.xml
+    config/GT_2DT_HASTE_MOCO_AVE.xml
+
+    config/GT_2DT_T2W.xml
+
+    config/GT_2DT_LGE.xml
+
+    config/GT_2DT_MOLLI.xml
+    config/GT_2DT_MOLLI_Offline.xml
+
+    config/GT_2DT_Perfusion.xml
+
+    config/GT_2DT_PseudoReplica_SNRUnitRecon_DataExport.xml
+
+    config/GT_2DT_RealTimeCine.xml
+    config/GT_2DT_RealTimeFlow.xml
+
+    config/GT_2DT_RTCine_L1SPIRIT_PhysioInterp.xml
+    config/GT_2DT_RTCine_L1SPIRIT_PhysioInterp_DualLayer_Gateway.xml
+
+    config/GT_3DT_Cartesian.xml
+    config/GT_3DT_Cartesian_CloudNode.xml
+    config/GT_3DT_Cartesian_GFactor.xml
+    config/GT_3DT_Cartesian_L1SPIRIT.xml
+    config/GT_3DT_Cartesian_SingleLayer_L1SPIRIT.xml )
+
+set( gadgetronPlus_header_files GtPlusGadgetImageArray.h
+                                GtPlusAccumulatorWorkOrderTriggerGadget.h
+                                GtPlusAccumulatorImageTriggerGadget.h
+                                GtPlusGadgetOpenMP.h
+                                GtPlusReconGadget.h
+                                GtPlusRecon2DTGadget.h
+                                GtPlusRecon3DTGadget.h
+                                GtPlusRecon2DTGadgetCloud.h
+                                GtPlusRecon2DTCloudPackage.h
+                                GadgetCloudJobMessageReadWrite.h
+                                GtPlusReconJob2DTGadget.h 
+                                GtPlusReconJob3DTGadget.h 
+                                GtPlusReconJob2DTGadgetCloud.h 
+                                GtPlusImageReconGadget.h 
+                                GtPlusReconGadgetUtil.h 
+                                )
+
+set( gadgetronPlus_src_files GtPlusGadgetImageArray.cpp
+                            GtPlusAccumulatorWorkOrderTriggerGadget.cpp
+                            GtPlusAccumulatorImageTriggerGadget.cpp
+                            GtPlusGadgetOpenMP.cpp
+                            GtPlusReconGadget.cpp
+                            GtPlusRecon2DTGadget.cpp
+                            GtPlusRecon3DTGadget.cpp
+                            GtPlusRecon2DTGadgetCloud.cpp
+                            GadgetCloudJobMessageReadWrite.cpp
+                            GtPlusReconJob2DTGadget.cpp 
+                            GtPlusReconJob3DTGadget.cpp 
+                            GtPlusReconJob2DTGadgetCloud.cpp 
+                            GtPlusImageReconGadget.cpp 
+                            GtPlusReconGadgetUtil.cpp 
+                            )
+
+set( config_gtCloud_files ${config_files} ${gtCloud_files} )
+source_group(config FILES ${config_gtCloud_files})
+
+add_library(gadgetronPlus SHARED
+            GtPlusGadgetExport.h 
+            ${gadgetronPlus_header_files} 
+            ${gadgetronPlus_src_files} 
+            ${config_gtCloud_files} 
+        )
+
+set_target_properties(gadgetronPlus PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(gadgetronPlus 
+    gadgetron_gadgetbase
+    gadgetron_toolbox_log
+    gadgetron_toolbox_cpucore 
+    gadgetron_toolbox_cpucore_math 
+    gadgetron_toolbox_cpufft
+    gadgetron_toolbox_gtplus 
+    gadgetron_toolbox_gadgettools 
+    gadgetron_toolbox_cloudbus 
+    gadgetron_toolbox_mri_core 
+    ${Boost_LIBRARIES}
+    ${ISMRMRD_LIBRARIES} ${FFTW3_LIBRARIES} 
+    optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} 
+    )
+
+if (CUDA_FOUND)  
+  include_directories( 
+    ${CUDA_INCLUDE_DIRS}
+    )
+  target_link_libraries(gadgetronPlus gadgetron_toolbox_gpuparallelmri)
+endif(CUDA_FOUND)
+
+install (FILES 
+        GtPlusGadgetExport.h 
+        ${gadgetronPlus_header_files} 
+        DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
+install (FILES  ${config_files} 
+        DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH} COMPONENT main)
+
+install (FILES  ${gtCloud_files} 
+        DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH}/gtCloud COMPONENT main)
+
+install(TARGETS gadgetronPlus DESTINATION lib COMPONENT main)
diff --git a/gadgets/gtPlus/GadgetCloudJobMessageReadWrite.cpp b/gadgets/gtPlus/GadgetCloudJobMessageReadWrite.cpp
new file mode 100644
index 0000000..4e8ac2c
--- /dev/null
+++ b/gadgets/gtPlus/GadgetCloudJobMessageReadWrite.cpp
@@ -0,0 +1,11 @@
+
+#include "GadgetCloudJobMessageReadWrite.h"
+
+namespace Gadgetron
+{
+    GADGETRON_READER_FACTORY_DECLARE(GtPlusCloudJobMessageReaderCPFL)
+    GADGETRON_WRITER_FACTORY_DECLARE(GtPlusCloudJobMessageWriterCPFL)
+
+    GADGETRON_READER_FACTORY_DECLARE(GtPlus2DTGadgetCloudJobMessageReaderCPFL)
+    GADGETRON_WRITER_FACTORY_DECLARE(GtPlus2DTGadgetCloudJobMessageWriterCPFL)
+}
diff --git a/gadgets/gtPlus/GadgetCloudJobMessageReadWrite.h b/gadgets/gtPlus/GadgetCloudJobMessageReadWrite.h
new file mode 100644
index 0000000..7274968
--- /dev/null
+++ b/gadgets/gtPlus/GadgetCloudJobMessageReadWrite.h
@@ -0,0 +1,243 @@
+/** \file   GadgetCloudJobMessageReadWrite.h
+    \brief  Implements the reader/writer for the GtPlus cloud job data package.
+            This implementation requires that the cloud job type support the serialize and deserialize functions.
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "GtPlusGadgetExport.h"
+
+#include "GadgetImageMessageReader.h"
+#include "GadgetImageMessageWriter.h"
+#include "gtPlusISMRMRDReconWorker.h"
+#include "GtPlusRecon2DTCloudPackage.h"
+
+namespace Gadgetron
+{
+
+    template <typename JobType> 
+    class GadgetCloudJobMessageReader : public GadgetMessageReader
+    {
+
+    public:
+        virtual ACE_Message_Block* read(ACE_SOCK_Stream* stream) 
+        {
+            GadgetContainerMessage<int>* jobID = new GadgetContainerMessage<int>();
+            GadgetContainerMessage<JobType>* job = new GadgetContainerMessage<JobType>();
+
+            jobID->cont(job);
+
+            int id = 0;
+            size_t sizeOfJob = 0;
+
+            ssize_t recv_count = 0;
+
+            if ((recv_count = stream->recv_n(&id, sizeof(int))) <= 0)
+            {
+	      GERROR("GadgetCloudJobMessageReader, failed to read job id\n");
+	      job->release();
+	      return 0;
+            }
+
+            *(jobID->getObjectPtr()) = id;
+
+            if ((recv_count = stream->recv_n(&sizeOfJob, sizeof(size_t))) <= 0)
+            {
+	      GERROR("GadgetCloudJobMessageReader, failed to read job size\n");
+	      job->release();
+	      return 0;
+            }
+
+            hoNDArray<char> jobBuf;
+            try
+            {
+                jobBuf.create(sizeOfJob);
+            }
+            catch(...)
+            {
+	      GERROR("GadgetCloudJobMessageReader, failed to allocate memory\n");
+	      job->release();
+	      return 0;
+            }
+
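+            // Large jobs are received in chunks of at most 512 MB to avoid oversized
+            // single socket reads; the writer below uses the same chunk size.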
+            size_t maxBytesPerSend = (size_t)(512.0*1024*1024);
+
+            if ( sizeOfJob > maxBytesPerSend )
+            {
+                size_t receivedBytes = 0;
+                size_t receivingBytes = maxBytesPerSend;
+
+                while ( receivingBytes > 0 )
+                {
+                    if ((recv_count = stream->recv_n(jobBuf.get_data_ptr()+receivedBytes, receivingBytes)) <= 0)
+                    {
+		      GERROR("GadgetCloudJobMessageReader, failed to read data from socket\n");
+		      job->release();
+		      return 0;
+                    }
+
+                    receivedBytes += receivingBytes;
+                    if ( receivedBytes >= sizeOfJob ) break;
+
+                    if ( sizeOfJob-receivedBytes < maxBytesPerSend )
+                    {
+                        receivingBytes = sizeOfJob-receivedBytes;
+                    }
+                }
+            }
+            else
+            {
+                if ((recv_count = stream->recv_n(jobBuf.get_data_ptr(), sizeOfJob)) <= 0)
+                {
+		  GERROR("GadgetCloudJobMessageReader, failed to read data from socket\n");
+		  job->release();
+		  return 0;
+                }
+            }
+
+            if ( !job->getObjectPtr()->deserialize(jobBuf.get_data_ptr(), sizeOfJob) )
+            {
+                GERROR("GadgetCloudJobMessageReader, failed to deserialize the job\n");
+                jobID->release();
+                return 0;
+            }
+
+            return jobID;
+        }
+    };
+
+    template <typename JobType> 
+    class GadgetCloudJobMessageWriter : public GadgetMessageWriter
+    {
+
+    public:
+
+        ACE_UINT16 msg_id_;
+
+        GadgetCloudJobMessageWriter() : msg_id_(GADGET_MESSAGE_CLOUD_JOB) {}
+
+        virtual int write(ACE_SOCK_Stream* sock, ACE_Message_Block* mb) 
+        {
+
+            GadgetContainerMessage<int>* m1 =
+                dynamic_cast< GadgetContainerMessage<int>* >(mb);
+
+            GadgetContainerMessage<JobType>* job =
+                dynamic_cast< GadgetContainerMessage<JobType>* >(mb->cont());
+
+            if (!m1 || !job)
+            {
+                GERROR("GadgetCloudJobMessageWriter invalid job message objects\n");
+                return -1;
+            }
+
+            int jobID = *(m1->getObjectPtr());
+
+            ssize_t send_cnt = 0;
+            GadgetMessageIdentifier id;
+            id.id = msg_id_;
+
+            if ((send_cnt = sock->send_n (&id, sizeof(GadgetMessageIdentifier))) <= 0)
+            {
+                GERROR("Unable to send job message identifier\n");
+                return -1;
+            }
+
+            if ((send_cnt = sock->send_n (&jobID, sizeof(int))) <= 0)
+            {
+                GERROR("Unable to send job id\n");
+                return -1;
+            }
+
+            size_t sizeOfJob=0;
+            char* buf = NULL;
+            if ( !job->getObjectPtr()->serialize(buf, sizeOfJob) )
+            {
+                GERROR("GadgetCloudJobMessageWriter, failed to serialize the job\n");
+                return -1;
+            }
+
+            if ((send_cnt = sock->send_n (&sizeOfJob, sizeof(size_t))) <= 0)
+            {
+                GERROR("Unable to send job size\n");
+                delete [] buf;
+                return -1;
+            }
+
+            GDEBUG("--> send job, size of job : %f MBytes ... \n", sizeOfJob/1024.0/1024);
+
+            size_t maxBytesPerSend = (size_t)(512.0*1024*1024);
+
+            if ( sizeOfJob > maxBytesPerSend )
+            {
+                size_t sentBytes = 0;
+                size_t sendingBytes = maxBytesPerSend;
+
+                while ( sendingBytes > 0 )
+                {
+                    if ((send_cnt = sock->send_n (buf+sentBytes, sendingBytes)) <= 0)
+                    {
+                        GERROR("Unable to send job data\n");
+                        delete [] buf;
+                        return -1;
+                    }
+
+                    sentBytes += sendingBytes;
+                    if ( sentBytes >= sizeOfJob ) break;
+
+                    if ( sizeOfJob-sentBytes < maxBytesPerSend )
+                    {
+                        sendingBytes = sizeOfJob-sentBytes;
+                    }
+                }
+            }
+            else
+            {
+                if ((send_cnt = sock->send_n (buf, sizeOfJob)) <= 0)
+                {
+                    GERROR("Unable to send job data\n");
+                    delete [] buf;
+                    return -1;
+                }
+            }
+
+            delete [] buf;
+
+            return 0;
+        }
+
+    };
+
+    typedef Gadgetron::gtPlus::gtPlusReconJob2DT< std::complex<float> > GtPlusReconJobTypeCPFL;
+
+    class EXPORTGTPLUSGADGET GtPlusCloudJobMessageReaderCPFL : public GadgetCloudJobMessageReader<GtPlusReconJobTypeCPFL>
+    {
+    public:
+        GADGETRON_WRITER_DECLARE(GtPlusCloudJobMessageReaderCPFL);
+    };
+
+    class EXPORTGTPLUSGADGET GtPlusCloudJobMessageWriterCPFL : public GadgetCloudJobMessageWriter<GtPlusReconJobTypeCPFL>
+    {
+    public:
+        GADGETRON_WRITER_DECLARE(GtPlusCloudJobMessageWriterCPFL);
+    };
+
+    // gadget-level cloud computing
+
+    class EXPORTGTPLUSGADGET GtPlus2DTGadgetCloudJobMessageReaderCPFL : public GadgetCloudJobMessageReader<GtPlusRecon2DTCloudPackageCPFL>
+    {
+    public:
+        GADGETRON_WRITER_DECLARE(GtPlus2DTGadgetCloudJobMessageReaderCPFL);
+    };
+
+    class EXPORTGTPLUSGADGET GtPlus2DTGadgetCloudJobMessageWriterCPFL : public GadgetCloudJobMessageWriter<GtPlusRecon2DTCloudPackageCPFL>
+    {
+    public:
+
+        typedef GadgetCloudJobMessageWriter<GtPlusRecon2DTCloudPackageCPFL> BaseClass;
+
+        GtPlus2DTGadgetCloudJobMessageWriterCPFL() : BaseClass()
+        {
+            msg_id_ = GADGET_MESSAGE_GADGETCLOUD_JOB;
+        }
+
+        GADGETRON_WRITER_DECLARE(GtPlus2DTGadgetCloudJobMessageWriterCPFL);
+    };
+}
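
As an aside, the writer/reader pair above exchanges a job on the wire as [GadgetMessageIdentifier][int job id][size_t payload size][payload bytes], splitting the payload into chunks of at most 512 MB per send_n/recv_n call. The following is a minimal sketch of only that chunking logic; it is not part of the imported source, and the send_all callback is a hypothetical stand-in for ACE_SOCK_Stream::send_n.

    #include <cstddef>
    #include <functional>

    // Send "total" bytes from "buf" in chunks of at most max_chunk bytes.
    // send_all is assumed to either transmit all requested bytes or fail.
    bool send_in_chunks(const char* buf, size_t total,
                        const std::function<bool(const char*, size_t)>& send_all,
                        size_t max_chunk = size_t(512) * 1024 * 1024)
    {
        size_t sent = 0;
        while (sent < total)
        {
            size_t n = total - sent;
            if (n > max_chunk) n = max_chunk;          // cap each call at 512 MB
            if (!send_all(buf + sent, n)) return false;
            sent += n;
        }
        return true;
    }

The receiving side mirrors this loop, accumulating into a preallocated buffer of the announced payload size until all bytes have arrived.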
diff --git a/gadgets/gtPlus/GadgetMRIHeaders.cpp b/gadgets/gtPlus/GadgetMRIHeaders.cpp
new file mode 100644
index 0000000..fc67992
--- /dev/null
+++ b/gadgets/gtPlus/GadgetMRIHeaders.cpp
@@ -0,0 +1,262 @@
+
+#include "GadgetMRIHeaders.h"
+
+// --------------------------------------------------------------------
+
+LoopCounters::LoopCounters() 
+{
+    line = 0;
+    acquisition = 0;
+    slice = 0;
+    partition = 0;
+    echo = 0;
+    phase = 0;
+    repetition = 0;
+    set = 0;
+    segment = 0;
+    channel = 0;
+}
+
+LoopCounters::~LoopCounters() {}
+
+void LoopCounters::dump()
+{
+    std::cout << "[Line Cha Slice Partition Echo Phase Rep Set Seg] = [" 
+                    << line 
+                    << " " << channel 
+                    << " " << slice 
+                    << " " << partition 
+                    << " " << echo 
+                    << " " << phase 
+                    << " " << repetition 
+                    << " " << set 
+                    << " " << segment << "]" << std::endl;
+}
+
+// --------------------------------------------------------------------
+
+GadgetMessageAcquisition::GadgetMessageAcquisition() 
+{
+    flags = 0;
+    meas_uid = 0;
+    scan_counter = 0;
+    time_stamp = 0;
+    pmu_time_stamp = 0;
+    samples = 0;
+    channels = 0;
+    centre_column = 0;
+    position[0] = 0.0f; position[1] = 0.0f; position[2] = 0.0f;
+    quarternion[0] = 1.0f; quarternion[1] = 0.0f; quarternion[2] = 0.0f; quarternion[3] = 0.0f;
+    table_position = 0.0f;
+}
+
+GadgetMessageAcquisition::~GadgetMessageAcquisition() {}
+
+float GadgetMessageAcquisition::get_position(unsigned int index) 
+{
+    if (index < 3) 
+    {
+        return position[index];
+    }
+    else
+    {
+        return 0.0f;
+    }
+}
+
+void GadgetMessageAcquisition::set_position(unsigned int index, float pos)
+{
+    if (index < 3)
+    {
+        position[index] = pos;
+    }
+}
+
+float GadgetMessageAcquisition::get_quarternion(unsigned int index) 
+{
+    if (index < 4) 
+    {
+        return quarternion[index];
+    }
+    else
+    {
+        return 0.0f;
+    }
+}
+
+void GadgetMessageAcquisition::set_quarternion(unsigned int index, float quar)
+{
+    if (index < 4) 
+    {
+        quarternion[index] = quar;
+    }
+}
+
+void GadgetMessageAcquisition::dump()
+{
+    GDEBUG_STREAM("GadgetMessageAcquisition" << std::endl);
+    GDEBUG_STREAM("----------------------------------------------------------" << std::endl);
+    GDEBUG_STREAM("flags            : " << flags << std::endl);
+    GDEBUG_STREAM("meas_uid         : " << meas_uid << std::endl);
+    GDEBUG_STREAM("scan_counter     : " << scan_counter << std::endl);
+    GDEBUG_STREAM("time_stamp       : " << time_stamp << std::endl);
+    GDEBUG_STREAM("pmu_time_stamp   : " << pmu_time_stamp << std::endl);
+    GDEBUG_STREAM("samples          : " << samples << std::endl);
+    GDEBUG_STREAM("channels         : " << channels << std::endl);
+    GDEBUG_STREAM("position         : " << position[0] << " " << position[1] << " " << position[2] << std::endl);
+    GDEBUG_STREAM("quarternion      : " << quarternion[0] << " " << quarternion[1] << " " << quarternion[2] << " " << quarternion[3] << std::endl);
+    GDEBUG_STREAM("table_position   : " << table_position << std::endl);
+    GDEBUG_STREAM("idx     : ";            idx.dump());
+    GDEBUG_STREAM("min_idx : ";            min_idx.dump());
+    GDEBUG_STREAM("max_idx : ";            max_idx.dump());
+    GDEBUG_STREAM("----------------------------------------------------------" << std::endl);
+}
+
+// --------------------------------------------------------------------
+
+GadgetMessageImage::GadgetMessageImage()
+{
+    flags = 0;
+
+    matrix_size[0] = 0;
+    matrix_size[1] = 0;
+    matrix_size[2] = 0;
+
+    channels = 0;
+
+    position[0] = 0.0f;
+    position[1] = 0.0f;
+    position[2] = 0.0f;
+
+    quarternion[0] = 1.0f;
+    quarternion[1] = 0.0f;
+    quarternion[2] = 0.0f;
+    quarternion[3] = 0.0f;
+
+    table_position = 0.0f;
+
+    time_stamp = 0;
+    pmu_time_stamp = 0;
+    image_format = 0;
+    image_type = 0;
+    image_index = 0;
+    image_series_index = 0;
+}
+
+GadgetMessageImage::~GadgetMessageImage() {}
+
+void GadgetMessageImage::copy(GadgetMessageImage& aMessageImage)
+{
+    flags = aMessageImage.flags;
+
+    matrix_size[0] = aMessageImage.matrix_size[0];
+    matrix_size[1] = aMessageImage.matrix_size[1];
+    matrix_size[2] = aMessageImage.matrix_size[2];
+
+    channels = aMessageImage.channels;
+
+    position[0] = aMessageImage.position[0];
+    position[1] = aMessageImage.position[1];
+    position[2] = aMessageImage.position[2];
+
+    quarternion[0] = aMessageImage.quarternion[0];
+    quarternion[1] = aMessageImage.quarternion[1];
+    quarternion[2] = aMessageImage.quarternion[2];
+    quarternion[3] = aMessageImage.quarternion[3];
+
+    table_position = aMessageImage.table_position;
+
+    time_stamp = aMessageImage.time_stamp;
+    pmu_time_stamp = aMessageImage.pmu_time_stamp;
+    image_format = aMessageImage.image_format;
+    image_type = aMessageImage.image_type;
+    image_index = aMessageImage.image_index;
+    image_series_index = aMessageImage.image_series_index;
+}
+
+ACE_UINT16 GadgetMessageImage::get_matrix_size(unsigned int index) 
+{
+    if (index < 3) 
+    {
+        return matrix_size[index];
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+void GadgetMessageImage::set_matrix_size(unsigned int index, ACE_UINT16 size)
+{
+    if (index < 3) 
+    {
+        matrix_size[index] = size;
+    }
+}
+
+float GadgetMessageImage::get_position(unsigned int index) 
+{
+    if (index < 3) 
+    {
+        return position[index];
+    }
+    else
+    {
+        return 0.0f;
+    }
+}
+
+void GadgetMessageImage::set_position(unsigned int index, float pos)
+{
+    if (index < 3)
+    {
+        position[index] = pos;
+    }
+}
+
+float GadgetMessageImage::get_quarternion(unsigned int index)
+{
+    if (index < 4)
+    {
+        return quarternion[index];
+    }
+    else
+    {
+        return 0.0f;
+    }
+}
+
+void GadgetMessageImage::set_quarternion(unsigned int index, float quar)
+{
+    if (index < 4)
+    {
+        quarternion[index] = quar;
+    }
+}
+
+void GadgetMessageImage::dumpInfo()
+{
+    GDEBUG_STREAM("flags                 : " << flags << std::endl);
+    GDEBUG_STREAM("matrix_size           : " << matrix_size[0] << " " << matrix_size[1] << " " << matrix_size[2] << std::endl);
+    GDEBUG_STREAM("channels              : " << channels << std::endl);
+    GDEBUG_STREAM("position              : " << position[0] << " " << position[1] << " " << position[2] << std::endl);
+    GDEBUG_STREAM("quarternion           : " << quarternion[0] << " " << quarternion[1] << " " << quarternion[2] << " " << quarternion[3] << std::endl);
+    GDEBUG_STREAM("table_position        : " << table_position << std::endl);
+    GDEBUG_STREAM("data_idx_min          : ";   data_idx_min.dump());
+    GDEBUG_STREAM("data_idx_max          : ";   data_idx_max.dump());
+    GDEBUG_STREAM("data_idx_current      : ";   data_idx_current.dump());
+    GDEBUG_STREAM("time_stamp            : " << time_stamp << std::endl);
+    GDEBUG_STREAM("pmu_time_stamp        : " << pmu_time_stamp << std::endl);
+    GDEBUG_STREAM("image_format          : " << image_format << std::endl);
+    GDEBUG_STREAM("image_type            : " << image_type << std::endl);
+    GDEBUG_STREAM("image_index           : " << image_index << std::endl);
+    GDEBUG_STREAM("image_series_index    : " << image_series_index << std::endl);
+}
+
+void GadgetMessageImage::dump()
+{
+    GDEBUG_STREAM("GadgetMessageImage" << std::endl);
+    GDEBUG_STREAM("----------------------------------------------------------" << std::endl);
+    dumpInfo();
+    GDEBUG_STREAM("----------------------------------------------------------" << std::endl);
+}
diff --git a/gadgets/gtPlus/GadgetMRIHeadersExt.cpp b/gadgets/gtPlus/GadgetMRIHeadersExt.cpp
new file mode 100644
index 0000000..0fab366
--- /dev/null
+++ b/gadgets/gtPlus/GadgetMRIHeadersExt.cpp
@@ -0,0 +1,428 @@
+
+#include "GadgetMRIHeadersExt.h"
+#include "GadgetIsmrmrdReadWrite.h"
+// #include <iostream>
+
+// --------------------------------------------------------------------
+
+GadgetMessageImageExt::GadgetMessageImageExt() : ISMRMRD::ImageHeader()
+{
+    time_stamps.clear();
+    pmu_time_stamps.clear();
+}
+
+GadgetMessageImageExt::~GadgetMessageImageExt() { }
+
+void GadgetMessageImageExt::set_matrix_size(unsigned int index, ACE_UINT16 size)
+{
+    if (index < 3) 
+    {
+        matrix_size[index] = size;
+    }
+
+    if ( index == 1 )
+    {
+        time_stamps.clear();
+        time_stamps.resize(matrix_size[1], -1);
+        pmu_time_stamps.clear();
+        pmu_time_stamps.resize(matrix_size[1], -1);
+    }
+}
+
+void GadgetMessageImageExt::copy(GadgetMessageImageExt& aMessageImage)
+{
+    flags = aMessageImage.flags;
+
+    matrix_size[0] = aMessageImage.matrix_size[0];
+    matrix_size[1] = aMessageImage.matrix_size[1];
+    matrix_size[2] = aMessageImage.matrix_size[2];
+
+    channels = aMessageImage.channels;
+
+    position[0] = aMessageImage.position[0];
+    position[1] = aMessageImage.position[1];
+    position[2] = aMessageImage.position[2];
+
+    read_dir[0] = aMessageImage.read_dir[0];
+    read_dir[1] = aMessageImage.read_dir[1];
+    read_dir[2] = aMessageImage.read_dir[2];
+
+    phase_dir[0] = aMessageImage.phase_dir[0];
+    phase_dir[1] = aMessageImage.phase_dir[1];
+    phase_dir[2] = aMessageImage.phase_dir[2];
+
+    slice_dir[0] = aMessageImage.slice_dir[0];
+    slice_dir[1] = aMessageImage.slice_dir[1];
+    slice_dir[2] = aMessageImage.slice_dir[2];
+
+    patient_table_position[0] = aMessageImage.patient_table_position[0];
+    patient_table_position[1] = aMessageImage.patient_table_position[1];
+    patient_table_position[2] = aMessageImage.patient_table_position[2];
+
+    acquisition_time_stamp = aMessageImage.acquisition_time_stamp;
+
+    physiology_time_stamp[0] = aMessageImage.physiology_time_stamp[0];
+    physiology_time_stamp[1] = aMessageImage.physiology_time_stamp[1];
+    physiology_time_stamp[2] = aMessageImage.physiology_time_stamp[2];
+
+    image_data_type = aMessageImage.image_data_type;
+    image_type = aMessageImage.image_type;
+    image_index = aMessageImage.image_index;
+    image_series_index = aMessageImage.image_series_index;
+
+    memcpy(user_int, aMessageImage.user_int, sizeof(int32_t)*ISMRMRD_USER_INTS);
+    memcpy(user_float, aMessageImage.user_float, sizeof(float)*ISMRMRD_USER_FLOATS);
+
+    time_stamps = aMessageImage.time_stamps;
+    pmu_time_stamps = aMessageImage.pmu_time_stamps;
+}
+
+void GadgetMessageImageExt::dump()
+{
+    GDEBUG_STREAM("GadgetMessageImageExt" << std::endl);
+    GDEBUG_STREAM("----------------------------------------------------------" << std::endl);
+    //dumpInfo();
+    GDEBUG_STREAM("----------------------------------------------------------" << std::endl);
+}
+
+// --------------------------------------------------------------------
+
+// [Col Line Cha Slice Partition Echo Phase Rep Set Seg]
+//   0   1    2   3     4         5    6     7   8   9
+// store a scan with 10 dimensions
+GadgetMessageImageArray::GadgetMessageImageArray() 
+:   imageArray_(0),
+    kSpace_centre_col_no(0), 
+    kSpace_centre_line_no(0), 
+    kSpace_centre_partition_no(0), 
+    kSpace_max_acquired_col_no(0), 
+    kSpace_max_acquired_line_no(0), 
+    kSpace_max_acquired_partition_no(0)
+{
+
+}
+
+GadgetMessageImageArray::GadgetMessageImageArray(int aSize[10])
+{
+    try
+    {
+        unsigned int ii;
+        for ( ii=0; ii<10; ii++ )
+        {
+            matrix_size[ii] = aSize[ii];
+        }
+
+        unsigned int len = 1;
+        for ( ii=3; ii<10; ii++ )
+        {
+            len *= matrix_size[ii];
+        }
+
+        if ( len > 0 )
+        {
+            imageArray_ = new GadgetMessageImageExt[len];
+        }
+
+        kSpace_centre_col_no = matrix_size[0]/2;
+        kSpace_centre_line_no = matrix_size[1]/2;
+        kSpace_centre_partition_no = matrix_size[4]/2;
+
+        kSpace_max_acquired_col_no = matrix_size[0]-1;
+        kSpace_max_acquired_line_no = matrix_size[1]-1;
+        kSpace_max_acquired_partition_no = matrix_size[4]-1;
+    }
+    catch(...)
+    {
+        GDEBUG_STREAM("Failed to allocate imageArray_" << std::endl);
+    }
+}
+
+GadgetMessageImageArray::~GadgetMessageImageArray()
+{
+    if (imageArray_)
+    {
+        delete [] imageArray_;
+    }
+}
+
+void GadgetMessageImageArray::resize(int aSize[10])
+{
+    try
+    {
+        unsigned int ii;
+        for ( ii=0; ii<10; ii++ )
+        {
+            matrix_size[ii] = aSize[ii];
+        }
+
+        unsigned int len = 1;
+        for ( ii=3; ii<10; ii++ )
+        {
+            len *= matrix_size[ii];
+        }
+
+        if ( imageArray_ ) 
+        {
+            delete [] imageArray_;
+            imageArray_ = NULL;
+        }
+
+        if ( len > 0 )
+        {
+            imageArray_ = new GadgetMessageImageExt[len];
+        }
+
+        kSpace_centre_col_no = matrix_size[0]/2;
+        kSpace_centre_line_no = matrix_size[1]/2;
+        kSpace_centre_partition_no = matrix_size[4]/2;
+
+        kSpace_max_acquired_col_no = matrix_size[0]-1;
+        kSpace_max_acquired_line_no = matrix_size[1]-1;
+        kSpace_max_acquired_partition_no = matrix_size[4]-1;
+    }
+    catch(...)
+    {
+        GDEBUG_STREAM("Failed to resize GadgetMessageImageArray " << std::endl);
+    }
+}
+
+void GadgetMessageImageArray::copy(GadgetMessageImageArray& imageArray)
+{
+    if (imageArray_)
+    {
+        delete [] imageArray_;
+        imageArray_ = NULL;
+    }
+
+    unsigned int ii;
+    for ( ii=0; ii<10; ii++ )
+    {
+        matrix_size[ii] = imageArray.matrix_size[ii];
+    }
+
+    unsigned int len = 1;
+    for ( ii=3; ii<10; ii++ )
+    {
+        len *= matrix_size[ii];
+    }
+
+    kSpace_centre_col_no = imageArray.kSpace_centre_col_no;
+    kSpace_centre_line_no = imageArray.kSpace_centre_line_no;
+    kSpace_centre_partition_no = imageArray.kSpace_centre_partition_no;
+
+    kSpace_max_acquired_col_no = imageArray.kSpace_max_acquired_col_no;
+    kSpace_max_acquired_line_no = imageArray.kSpace_max_acquired_line_no;
+    kSpace_max_acquired_partition_no = imageArray.kSpace_max_acquired_partition_no;
+
+    if ( len > 0 )
+    {
+        imageArray_ = new GadgetMessageImageExt[len];
+    }
+
+    for ( unsigned int i=0; i<len; i++ )
+    {
+        imageArray_[i] = imageArray.imageArray_[i];
+    }
+}
+
+int GadgetMessageImageArray::get_offset(int slc, int par, int eco, int phs, int rep, int set, int seg)
+{
+    int offset = seg*matrix_size[8]*matrix_size[7]*matrix_size[6]*matrix_size[5]*matrix_size[4]*matrix_size[3]
+                    + set*matrix_size[7]*matrix_size[6]*matrix_size[5]*matrix_size[4]*matrix_size[3]
+                    + rep*matrix_size[6]*matrix_size[5]*matrix_size[4]*matrix_size[3]
+                    + phs*matrix_size[5]*matrix_size[4]*matrix_size[3]
+                    + eco*matrix_size[4]*matrix_size[3]
+                    + par*matrix_size[3]
+                    + slc;
+    return offset;
+}
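
get_offset() above linearizes the seven looping dimensions [Slice Partition Echo Phase Rep Set Seg] (matrix_size[3..9]) with the slice index varying fastest. A compact, equivalent formulation of the same stride arithmetic is sketched below; this is an illustration only, not upstream code.

    #include <cstddef>

    // idx7 = {slc, par, eco, phs, rep, set, seg}; dims[3..9] hold the sizes,
    // exactly as in GadgetMessageImageArray::matrix_size.
    size_t linear_offset(const int dims[10], const int idx7[7])
    {
        size_t offset = 0, stride = 1;
        for (int d = 0; d < 7; ++d)      // d = 0 -> slc (dims[3]), d = 6 -> seg (dims[9])
        {
            offset += size_t(idx7[d]) * stride;
            stride *= size_t(dims[d + 3]);
        }
        return offset;
    }

For example, with every size equal to 2, index {1,0,0,0,0,0,0} maps to offset 1 and {0,1,0,0,0,0,0} maps to offset 2, matching the explicit formula in get_offset().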
+
+void GadgetMessageImageArray::extractMessageImageArrayForSLC(int slc, GadgetMessageImageArray& imageArray)
+{
+    if ( slc >= matrix_size[3] )
+    {
+        GDEBUG_STREAM("extractMessageImageArrayForSLC error - slc >= matrix_size[3] " << std::endl);
+        return;
+    }
+
+    int aSize[10];
+
+    unsigned int ii;
+    for ( ii=0; ii<10; ii++ )
+    {
+        aSize[ii] = matrix_size[ii];
+    }
+
+    aSize[3] = 1;
+
+    imageArray.resize(aSize);
+
+    imageArray.kSpace_centre_col_no = kSpace_centre_col_no;
+    imageArray.kSpace_centre_line_no = kSpace_centre_line_no;
+    imageArray.kSpace_centre_partition_no = kSpace_centre_partition_no;
+    imageArray.kSpace_max_acquired_col_no = kSpace_max_acquired_col_no;
+    imageArray.kSpace_max_acquired_line_no = kSpace_max_acquired_line_no;
+    imageArray.kSpace_max_acquired_partition_no = kSpace_max_acquired_partition_no;
+
+    int par, eco, phs, rep, set, seg;
+
+    int PAR = matrix_size[4];
+    int ECO = matrix_size[5];
+    int PHS = matrix_size[6];
+    int REP = matrix_size[7];
+    int SET = matrix_size[8];
+    int SEG = matrix_size[9];
+
+    for ( seg=0; seg<SEG; seg++ )
+    {
+        for ( set=0; set<SET; set++ )
+        {
+            for ( rep=0; rep<REP; rep++ )
+            {
+                for ( phs=0; phs<PHS; phs++ )
+                {
+                    for ( eco=0; eco<ECO; eco++ )
+                    {
+                        for ( par=0; par<PAR; par++ )
+                        {
+                            int offset = this->get_offset(slc, par, eco, phs, rep, set, seg);
+                            int offsetSLC = imageArray.get_offset(0, par, eco, phs, rep, set, seg);
+
+                            imageArray.imageArray_[offsetSLC] = imageArray_[offset];
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+void GadgetMessageImageArray::extractMessageImageArrayForREP(int rep, GadgetMessageImageArray& imageArray)
+{
+    if ( rep >= matrix_size[7] )
+    {
+        GDEBUG_STREAM("extractMessageImageArrayForREP error - rep >= matrix_size[7] " << std::endl);
+        return;
+    }
+
+    int aSize[10];
+
+    unsigned int ii;
+    for ( ii=0; ii<10; ii++ )
+    {
+        aSize[ii] = matrix_size[ii];
+    }
+
+    aSize[7] = 1;
+
+    imageArray.resize(aSize);
+
+    imageArray.kSpace_centre_col_no = kSpace_centre_col_no;
+    imageArray.kSpace_centre_line_no = kSpace_centre_line_no;
+    imageArray.kSpace_centre_partition_no = kSpace_centre_partition_no;
+    imageArray.kSpace_max_acquired_col_no = kSpace_max_acquired_col_no;
+    imageArray.kSpace_max_acquired_line_no = kSpace_max_acquired_line_no;
+    imageArray.kSpace_max_acquired_partition_no = kSpace_max_acquired_partition_no;
+
+    int par, eco, phs, slc, set, seg;
+
+    int SLC = matrix_size[3];
+    int PAR = matrix_size[4];
+    int ECO = matrix_size[5];
+    int PHS = matrix_size[6];
+    int SET = matrix_size[8];
+    int SEG = matrix_size[9];
+
+    for ( seg=0; seg<SEG; seg++ )
+    {
+        for ( set=0; set<SET; set++ )
+        {
+            for ( slc=0; slc<SLC; slc++ )
+            {
+                for ( phs=0; phs<PHS; phs++ )
+                {
+                    for ( eco=0; eco<ECO; eco++ )
+                    {
+                        for ( par=0; par<PAR; par++ )
+                        {
+                            int offset = this->get_offset(slc, par, eco, phs, rep, set, seg);
+                            int offsetREP = imageArray.get_offset(slc, par, eco, phs, 0, set, seg);
+
+                            imageArray.imageArray_[offsetREP] = imageArray_[offset];
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+void GadgetMessageImageArray::dump()
+{
+    unsigned int ii;
+    GDEBUG_STREAM("GadgetMessageImageArray" << std::endl);
+    GDEBUG_STREAM("==========================================================" << std::endl);
+    GDEBUG_STREAM("matrix_size           : ");
+    for ( ii=0; ii<10; ii++ )
+    {
+        GDEBUG_STREAM(matrix_size[ii] << " ");
+    }
+    GDEBUG_STREAM(std::endl);
+    GDEBUG_STREAM("----------------------------------------------------------" << std::endl);
+    GDEBUG_STREAM("kSpace_centre_col_no             : " << kSpace_centre_col_no << std::endl);
+    GDEBUG_STREAM("kSpace_max_acquired_col_no       : " << kSpace_max_acquired_col_no << std::endl);
+    GDEBUG_STREAM("----------------------------------------------------------" << std::endl);
+    GDEBUG_STREAM("kSpace_centre_line_no            : " << kSpace_centre_line_no << std::endl);
+    GDEBUG_STREAM("kSpace_max_acquired_line_no      : " << kSpace_max_acquired_line_no << std::endl);
+    GDEBUG_STREAM("----------------------------------------------------------" << std::endl);
+    GDEBUG_STREAM("kSpace_centre_partition_no       : " << kSpace_centre_partition_no << std::endl);
+    GDEBUG_STREAM("kSpace_max_acquired_partition_no : " << kSpace_max_acquired_partition_no << std::endl);
+    GDEBUG_STREAM("----------------------------------------------------------" << std::endl);
+    if ( imageArray_ )
+    {
+        int slc, par, eco, phs, rep, set, seg;
+        for ( seg=0; seg<matrix_size[9]; seg++ )
+        {
+            for ( set=0; set<matrix_size[8]; set++ )
+            {
+                for ( rep=0; rep<matrix_size[7]; rep++ )
+                {
+                    for ( phs=0; phs<matrix_size[6]; phs++ )
+                    {
+                        for ( eco=0; eco<matrix_size[5]; eco++ )
+                        {
+                            for ( par=0; par<matrix_size[4]; par++ )
+                            {
+                                for ( slc=0; slc<matrix_size[3]; slc++ )
+                                {
+                                    int offset = get_offset(slc, par, eco, phs, rep, set, seg);
+                                    std::cout << "[Slice Partition Echo Phase Rep Set Seg] = [" 
+                                                << " " << slc 
+                                                << " " << par 
+                                                << " " << eco 
+                                                << " " << phs 
+                                                << " " << rep 
+                                                << " " << set 
+                                                << " " << seg << "]" << std::endl;
+
+                                    imageArray_[offset].dump();
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+    }
+    GDEBUG_STREAM("==========================================================" << std::endl);
+}
+
+// --------------------------------------------------------------------
+
+KSpaceBuffer::KSpaceBuffer() 
+: isIPAT(false) 
+{
+
+}
+
+KSpaceBuffer::~KSpaceBuffer()
+{
+
+}
diff --git a/gadgets/gtPlus/GtPlusAccumulatorImageTriggerGadget.cpp b/gadgets/gtPlus/GtPlusAccumulatorImageTriggerGadget.cpp
new file mode 100644
index 0000000..5ff5959
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusAccumulatorImageTriggerGadget.cpp
@@ -0,0 +1,746 @@
+
+#include "GtPlusAccumulatorImageTriggerGadget.h"
+#include "GtPlusReconGadgetUtil.h"
+
+using namespace Gadgetron::gtPlus;
+
+namespace Gadgetron
+{
+
+GtPlusAccumulatorImageTriggerGadget::GtPlusAccumulatorImageTriggerGadget() : image_counter_(0), triggered_in_close_(false), verboseMode_(false)
+{
+    cha_trigger_ = false;
+    slc_trigger_ = false;
+    e2_trigger_ = false;
+    con_trigger_ = false;
+    phs_trigger_ = false;
+    rep_trigger_ = false;
+    set_trigger_ = false;
+    ave_trigger_ = false;
+
+    num_of_dimensions_ = 8; // [CHA SLC E2 CON PHS REP SET AVE]
+
+    // this may be changed later if a multi-channel image workflow is used
+    meas_max_channel_ = 1;
+
+    pass_image_immediate_ = false;
+}
+
+GtPlusAccumulatorImageTriggerGadget::~GtPlusAccumulatorImageTriggerGadget()
+{
+
+}
+
+// extract necessary configuration information from the xml
+int GtPlusAccumulatorImageTriggerGadget::process_config(ACE_Message_Block* mb)
+{
+    // gadget parameters
+    verboseMode_ = this->get_bool_value("verboseMode");
+
+    cha_trigger_ = this->get_bool_value("TriggerChannel");
+    slc_trigger_ = this->get_bool_value("TriggerSlice");
+    e2_trigger_  = this->get_bool_value("TriggerE2");
+    con_trigger_ = this->get_bool_value("TriggerContrast");
+    phs_trigger_ = this->get_bool_value("TriggerPhase");
+    rep_trigger_ = this->get_bool_value("TriggerRepetition");
+    set_trigger_ = this->get_bool_value("TriggerSet");
+    ave_trigger_ = this->get_bool_value("TriggerAverage");
+
+    pass_image_immediate_ = this->get_bool_value("PassImageImmediately");
+
+    // ---------------------------------------------------------------------------------------------------------
+    // parse the xml header
+    ISMRMRD::IsmrmrdHeader h;
+    try {
+        deserialize(mb->rd_ptr(), h);
+    } catch (...) {
+        GDEBUG("Error parsing ISMRMRD Header\n");
+        throw;
+    }
+
+    // seq object
+    if (h.encoding.size() != 1)
+    {
+        GDEBUG("Number of encoding spaces: %d\n", (int)h.encoding.size());
+        GDEBUG("This simple GtPlusAccumulatorImageTriggerGadget only supports one encoding space\n");
+        return GADGET_FAIL;
+    }
+
+    // ---------------------------------------------------------------------------------------------------------
+
+    // find out the encoding space 
+    findMatrixSizeEncoding(h, matrix_size_encoding_);
+    findFOVEncoding(h, field_of_view_encoding_);
+
+    findMatrixSizeRecon(h, matrix_size_recon_);
+    findFOVRecon(h, field_of_view_recon_);
+
+    GDEBUG_CONDITION_STREAM(verboseMode_, "Encoding matrix size: " << matrix_size_encoding_[0] << " " << matrix_size_encoding_[1] << " " << matrix_size_encoding_[2]);
+    GDEBUG_CONDITION_STREAM(verboseMode_, "Encoding field_of_view : " << field_of_view_encoding_[0] << " " << field_of_view_encoding_[1] << " " << field_of_view_encoding_[2]);
+    GDEBUG_CONDITION_STREAM(verboseMode_, "Recon matrix size : " << matrix_size_recon_[0] << " " << matrix_size_recon_[1] << " " << matrix_size_recon_[2]);
+    GDEBUG_CONDITION_STREAM(verboseMode_, "Recon field_of_view :  " << field_of_view_recon_[0] << " " << field_of_view_recon_[1] << " " << field_of_view_recon_[2]);
+
+    // ---------------------------------------------------------------------------------------------------------
+    // encoding limits
+    GADGET_CHECK_RETURN(findEncodingLimits(h, meas_max_idx_, verboseMode_), GADGET_FAIL);
+
+    //ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+
+    //if (e_limits.kspace_encoding_step_1) 
+    //{
+    //    meas_max_idx_.kspace_encode_step_1 = (uint16_t)(matrix_size_encoding_[1]-1); // e_limits.kspace_encoding_step_1().get().maximum();
+    //}
+    //else
+    //{
+    //    meas_max_idx_.kspace_encode_step_1 = 0;
+    //    GDEBUG_STREAM("Setting number of kspace_encode_step_1 to 0" << std::endl);
+    //    return GADGET_FAIL;
+    //}
+
+    //if (e_limits.set)
+    //{
+    //    if ( e_limits.set->maximum > 0 )
+    //        meas_max_idx_.set = e_limits.set->maximum - 1;
+    //    else
+    //        meas_max_idx_.set = 0;
+
+    //    if ( meas_max_idx_.set < 0 ) meas_max_idx_.set = 0;
+    //}
+    //else
+    //{
+    //    meas_max_idx_.set = 0;
+    //}
+
+    //if (e_limits.phase)
+    //{
+    //    if ( e_limits.phase->maximum > 0 )
+    //        meas_max_idx_.phase = e_limits.phase->maximum-1;
+    //    else
+    //        meas_max_idx_.phase = 0;
+
+    //    if ( meas_max_idx_.phase < 0 ) meas_max_idx_.phase = 0;
+    //}
+    //else
+    //{
+    //    meas_max_idx_.phase = 0;
+    //}
+
+    //if (e_limits.kspace_encoding_step_2)
+    //{
+    //    meas_max_idx_.kspace_encode_step_2 = (uint16_t)(matrix_size_encoding_[2] - 1); // e_limits.kspace_encoding_step_2().get().maximum();
+    //}
+    //else
+    //{
+    //    meas_max_idx_.kspace_encode_step_2 = 0;
+    //}
+    //meas_max_idx_.kspace_encode_step_2 = (uint16_t)(matrix_size_recon_[2]);
+
+    //if (e_limits.contrast)
+    //{
+    //    if ( e_limits.contrast->maximum > 0 )
+    //        meas_max_idx_.contrast = e_limits.contrast->maximum-1;
+    //    else
+    //        meas_max_idx_.contrast = 0;
+
+    //    if ( meas_max_idx_.contrast < 0 ) meas_max_idx_.contrast = 0;
+    //}
+    //else
+    //{
+    //    meas_max_idx_.contrast = 0;
+    //}
+
+    //if (e_limits.slice)
+    //{
+    //    meas_max_idx_.slice = e_limits.slice->maximum;
+    //}
+    //else
+    //{
+    //    meas_max_idx_.slice = 0;
+    //}
+
+    //if (e_limits.repetition)
+    //{
+    //    meas_max_idx_.repetition = e_limits.repetition->maximum;
+    //}
+    //else
+    //{
+    //    meas_max_idx_.repetition = 0;
+    //}
+
+    //if (e_limits.average)
+    //{
+    //    meas_max_idx_.average = e_limits.average->maximum-1;
+    //}
+    //else
+    //{
+    //    meas_max_idx_.average = 0;
+    //}
+
+    //if (e_limits.segment)
+    //{
+    //    // meas_max_idx_.segment = e_limits.segment().get().maximum()-1;
+    //    meas_max_idx_.segment = 0;
+    //}
+    //else
+    //{
+    //    meas_max_idx_.segment = 0;
+    //}
+
+    // allocate the image buffers
+    // [Cha Slice E2 Con Phase Rep Set Ave]
+    //   0    1    2   3   4    5   6   7
+
+    meas_max_idx_.kspace_encode_step_2 = (uint16_t)(matrix_size_recon_[2]);
+
+    dimensions_.resize(GT_DIM_NUM_IMAGE, 0);
+    dimensions_[0] = meas_max_channel_;
+    dimensions_[1] = meas_max_idx_.slice+1;
+    dimensions_[2] = meas_max_idx_.kspace_encode_step_2;
+    dimensions_[3] = meas_max_idx_.contrast+1;
+    dimensions_[4] = meas_max_idx_.phase+1;
+    dimensions_[5] = meas_max_idx_.repetition+1;
+    dimensions_[6] = meas_max_idx_.set+1;
+    dimensions_[7] = meas_max_idx_.average+1;
+
+    imageBuffer_.create(dimensions_);
+    imageSent_.create(dimensions_);
+
+    otherBuffer_.create(dimensions_);
+    otherSent_.create(dimensions_);
+
+    size_t nElem = imageBuffer_.get_number_of_elements();
+    size_t ii;
+    for ( ii=0; ii<nElem; ii++ )
+    {
+        imageBuffer_(ii) = NULL;
+        otherBuffer_(ii) = NULL;
+        imageSent_(ii) = false;
+        otherSent_(ii) = false;
+    }
+
+    // set the dimensions under/not under trigger
+    this->setDimensionsUnderTrigger();
+
+    GDEBUG_CONDITION_STREAM(verboseMode_, "dimension limits                [Cha Slice E2 Con Phase Rep Set Ave] = [" 
+                               << " " << dimensions_[0] 
+                               << " " << dimensions_[1] 
+                               << " " << dimensions_[2] 
+                               << " " << dimensions_[3]
+                               << " " << dimensions_[4]
+                               << " " << dimensions_[5]
+                               << " " << dimensions_[6] 
+                               << " " << dimensions_[7] << "]");
+
+    GDEBUG_CONDITION_STREAM(verboseMode_, "dimension under trigger         [Cha Slice E2 Con Phase Rep Set Ave] = [" 
+                               << " " << dim_under_trigger_[0] 
+                               << " " << dim_under_trigger_[1] 
+                               << " " << dim_under_trigger_[2] 
+                               << " " << dim_under_trigger_[3]
+                               << " " << dim_under_trigger_[4]
+                               << " " << dim_under_trigger_[5]
+                               << " " << dim_under_trigger_[6] 
+                               << " " << dim_under_trigger_[7] << "]");
+
+    GDEBUG_CONDITION_STREAM(verboseMode_, "dimension limits under trigger  [Cha Slice E2 Con Phase Rep Set Ave] = [" 
+                               << " " << dim_limit_under_trigger_[0] 
+                               << " " << dim_limit_under_trigger_[1] 
+                               << " " << dim_limit_under_trigger_[2] 
+                               << " " << dim_limit_under_trigger_[3]
+                               << " " << dim_limit_under_trigger_[4]
+                               << " " << dim_limit_under_trigger_[5]
+                               << " " << dim_limit_under_trigger_[6] 
+                               << " " << dim_limit_under_trigger_[7] << "]");
+
+    GDEBUG_CONDITION_STREAM(verboseMode_, "dimension NOT under trigger     [Cha Slice E2 Con Phase Rep Set Ave] = [" 
+                               << " " << dim_not_under_trigger_[0] 
+                               << " " << dim_not_under_trigger_[1] 
+                               << " " << dim_not_under_trigger_[2] 
+                               << " " << dim_not_under_trigger_[3]
+                               << " " << dim_not_under_trigger_[4]
+                               << " " << dim_not_under_trigger_[5]
+                               << " " << dim_not_under_trigger_[6] 
+                               << " " << dim_not_under_trigger_[7] << "]");
+
+    GDEBUG_CONDITION_STREAM(verboseMode_, "dimension limits NOT under trigger [Cha Slice E2 Con Phase Rep Set Ave] = [" 
+                               << " " << dim_limit_not_under_trigger_[0] 
+                               << " " << dim_limit_not_under_trigger_[1] 
+                               << " " << dim_limit_not_under_trigger_[2] 
+                               << " " << dim_limit_not_under_trigger_[3]
+                               << " " << dim_limit_not_under_trigger_[4]
+                               << " " << dim_limit_not_under_trigger_[5]
+                               << " " << dim_limit_not_under_trigger_[6] 
+                               << " " << dim_limit_not_under_trigger_[7] << "]");
+
+    return GADGET_OK;
+}
+
+void GtPlusAccumulatorImageTriggerGadget::setDimensionsUnderTrigger()
+{
+    dim_under_trigger_.resize(num_of_dimensions_, false);
+    dim_not_under_trigger_.resize(num_of_dimensions_, false);
+
+    dim_limit_under_trigger_.resize(num_of_dimensions_, 1);
+    dim_limit_not_under_trigger_.resize(num_of_dimensions_, 1);
+
+    if (cha_trigger_)
+    {
+        dim_under_trigger_[0] = true;
+        dim_limit_under_trigger_[0] = dimensions_[0];
+    }
+    else
+    {
+        dim_not_under_trigger_[0] = true;
+        dim_limit_not_under_trigger_[0] = dimensions_[0];
+    }
+
+    if (slc_trigger_)
+    {
+        dim_under_trigger_[1] = true;
+        dim_limit_under_trigger_[1] = dimensions_[1];
+    }
+    else
+    {
+        dim_not_under_trigger_[1] = true;
+        dim_limit_not_under_trigger_[1] = dimensions_[1];
+    }
+
+    if (e2_trigger_)
+    {
+        dim_under_trigger_[2] = true;
+        dim_limit_under_trigger_[2] = dimensions_[2];
+    }
+    else
+    {
+        dim_not_under_trigger_[2] = true;
+        dim_limit_not_under_trigger_[2] = dimensions_[2];
+    }
+
+    if (con_trigger_)
+    {
+        dim_under_trigger_[3] = true;
+        dim_limit_under_trigger_[3] = dimensions_[3];
+    }
+    else
+    {
+        dim_not_under_trigger_[3] = true;
+        dim_limit_not_under_trigger_[3] = dimensions_[3];
+    }
+
+    if (phs_trigger_)
+    {
+        dim_under_trigger_[4] = true;
+        dim_limit_under_trigger_[4] = dimensions_[4];
+    }
+    else
+    {
+        dim_not_under_trigger_[4] = true;
+        dim_limit_not_under_trigger_[4] = dimensions_[4];
+    }
+
+    if (rep_trigger_)
+    {
+        dim_under_trigger_[5] = true;
+        dim_limit_under_trigger_[5] = dimensions_[5];
+    }
+    else
+    {
+        dim_not_under_trigger_[5] = true;
+        dim_limit_not_under_trigger_[5] = dimensions_[5];
+    }
+
+    if (set_trigger_)
+    {
+        dim_under_trigger_[6] = true;
+        dim_limit_under_trigger_[6] = dimensions_[6];
+    }
+    else
+    {
+        dim_not_under_trigger_[6] = true;
+        dim_limit_not_under_trigger_[6] = dimensions_[6];
+    }
+
+    if (ave_trigger_)
+    {
+        dim_under_trigger_[7] = true;
+        dim_limit_under_trigger_[7] = dimensions_[7];
+    }
+    else
+    {
+        dim_not_under_trigger_[7] = true;
+        dim_limit_not_under_trigger_[7] = dimensions_[7];
+    }
+
+    imageSentBuffer_.create(dim_limit_under_trigger_);
+    imageSentBuffer_.delete_data_on_destruct(false);
+}
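
Each Trigger* property moves one of the eight dimensions [CHA SLC E2 CON PHS REP SET AVE] from the outer "not under trigger" loops into the package that is sent downstream; dimensions that are not triggered keep a package extent of 1. The sketch below illustrates that split with a hypothetical helper; it is not part of the imported source.

    #include <array>
    #include <cstddef>

    struct TriggerSplit
    {
        std::array<bool, 8>   under{};   // true  -> dimension is collected into one package
        std::array<size_t, 8> limit{};   // extent of that dimension inside the package
    };

    // flags and dims follow the [CHA SLC E2 CON PHS REP SET AVE] ordering used above.
    TriggerSplit split_dimensions(const std::array<bool, 8>& trigger_flags,
                                  const std::array<size_t, 8>& dims)
    {
        TriggerSplit s;
        for (size_t d = 0; d < 8; ++d)
        {
            s.under[d] = trigger_flags[d];
            s.limit[d] = trigger_flags[d] ? dims[d] : 1;  // non-triggered dims stay size 1
        }
        return s;
    }

For instance, with only TriggerPhase set, the sent package has extent [1 1 1 1 PHS 1 1 1] and is emitted once all phases for the current CHA/SLC/E2/CON/REP/SET/AVE position have been buffered.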
+
+int GtPlusAccumulatorImageTriggerGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1, GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2, GadgetContainerMessage<ISMRMRD::MetaContainer>* m3)
+{
+    // find the data role
+    std::string dataRole;
+    dataRole = std::string(m3->getObjectPtr()->as_str(GADGETRON_DATA_ROLE, 0));
+
+    GDEBUG_CONDITION_STREAM(verboseMode_, "--> receive image : " << m1->getObjectPtr()->image_index << " -- " << dataRole);
+
+    if ( dataRole == GADGETRON_IMAGE_REGULAR )
+    {
+        GADGET_CHECK_RETURN(this->storeImage(*m1->getObjectPtr(), *m2->getObjectPtr(), *m3->getObjectPtr(), imageBuffer_), GADGET_FAIL);
+        GADGET_CHECK_RETURN(this->trigger(imageBuffer_, imageSent_, false), GADGET_FAIL);
+    }
+
+    if ( dataRole == GADGETRON_IMAGE_OTHER )
+    {
+        GADGET_CHECK_RETURN(this->storeImage(*m1->getObjectPtr(), *m2->getObjectPtr(), *m3->getObjectPtr(), otherBuffer_), GADGET_FAIL);
+        GADGET_CHECK_RETURN(this->trigger(otherBuffer_, otherSent_, false), GADGET_FAIL);
+    }
+
+    if ( dataRole == GADGETRON_IMAGE_GFACTOR )
+    {
+        // pass the image to the next gadget
+        Gadgetron::GadgetContainerMessage<ImageBufferType>* cm1 = new Gadgetron::GadgetContainerMessage<ImageBufferType>();
+
+        ImageBufferType& imgBuf = *(cm1->getObjectPtr());
+
+        std::vector<size_t> dim2D(num_of_dimensions_, 1);
+        imgBuf.create(dim2D);
+        imgBuf(0) = new ImageType();
+        GADGET_CHECK_RETURN(imgBuf(0)!=NULL, GADGET_FAIL);
+
+        // set image content
+        imgBuf(0)->from_NDArray( *m2->getObjectPtr() );
+        // set image attrib
+        imgBuf(0)->attrib_ = *m3->getObjectPtr();
+
+        // pass the ISMRMRD header info
+        GADGET_CHECK_RETURN(gtPlus_util_.setMetaAttributesFromImageHeaderISMRMRD(*m1->getObjectPtr(), imgBuf(0)->attrib_), GADGET_FAIL);
+
+        if (this->next()->putq(cm1) < 0) 
+        {
+            m1->release();
+            cm1->release();
+            return GADGET_FAIL;
+        }
+    }
+
+    m1->release();
+    return GADGET_OK;
+}
+
+bool GtPlusAccumulatorImageTriggerGadget::trigger(ImageBufferType& buf, ImageSentFlagBufferType& sentFlagBuf, bool inClose)
+{
+    try
+    {
+        // scan the buffered images; if the trigger dimensions are complete, send out this package
+
+        // not under trigger
+        size_t cha, slc, e2, con, phs, rep, set, ave;
+
+        // under trigger
+        size_t cha_t, slc_t, e2_t, con_t, phs_t, rep_t, set_t, ave_t;
+
+        std::vector<size_t> image_ind(num_of_dimensions_, 0);
+        std::vector<size_t> image_sent_ind(num_of_dimensions_, 0);
+
+        size_t numOfElem = imageSentBuffer_.get_number_of_elements();
+        size_t ii;
+        for ( ii=0; ii<numOfElem; ii++ ) { imageSentBuffer_(ii) = NULL; }
+
+        for ( ave=0; ave<dim_limit_not_under_trigger_[7]; ave++ )
+        {
+            if ( dim_not_under_trigger_[7] ) image_ind[7] = ave;
+            // -------------------
+            for ( set=0; set<dim_limit_not_under_trigger_[6]; set++ )
+            {
+                if ( dim_not_under_trigger_[6] ) image_ind[6] = set;
+                // -------------------
+                for ( rep=0; rep<dim_limit_not_under_trigger_[5]; rep++ )
+                {
+                    if ( dim_not_under_trigger_[5] ) image_ind[5] = rep;
+                    // -------------------
+                    for ( phs=0; phs<dim_limit_not_under_trigger_[4]; phs++ )
+                    {
+                        if ( dim_not_under_trigger_[4] ) image_ind[4] = phs;
+                        // -------------------
+                        for ( con=0; con<dim_limit_not_under_trigger_[3]; con++ )
+                        {
+                            if ( dim_not_under_trigger_[3] ) image_ind[3] = con;
+                            // -------------------
+                            for ( e2=0; e2<dim_limit_not_under_trigger_[2]; e2++ )
+                            {
+                                if ( dim_not_under_trigger_[2] ) image_ind[2] = e2;
+                                // -------------------
+                                for ( slc=0; slc<dim_limit_not_under_trigger_[1]; slc++ )
+                                {
+                                    if ( dim_not_under_trigger_[1] ) image_ind[1] = slc;
+                                    // -------------------
+                                    for ( cha=0; cha<dim_limit_not_under_trigger_[0]; cha++ )
+                                    {
+                                        if ( dim_not_under_trigger_[0] ) image_ind[0] = cha;
+                                        // -------------------
+
+                                        // loop over the under-trigger dimensions and check whether all images are present
+                                        bool needTrigger = true;
+                                        if ( inClose )
+                                        {
+                                            needTrigger = false;
+                                        }
+
+                                        {
+                                            for ( ii=0; ii<numOfElem; ii++ ) { imageSentBuffer_(ii) = NULL; }
+
+                                            // =================================================
+
+                                            for ( ave_t=0; ave_t<dim_limit_under_trigger_[7]; ave_t++ )
+                                            {
+                                                if ( dim_under_trigger_[7] ) image_ind[7] = ave_t;
+                                                image_sent_ind[7] = ave_t;
+                                                // -------------------
+                                                for ( set_t=0; set_t<dim_limit_under_trigger_[6]; set_t++ )
+                                                {
+                                                    if ( dim_under_trigger_[6] ) image_ind[6] = set_t;
+                                                    image_sent_ind[6] = set_t;
+                                                    // -------------------
+                                                    for ( rep_t=0; rep_t<dim_limit_under_trigger_[5]; rep_t++ )
+                                                    {
+                                                        if ( dim_under_trigger_[5] ) image_ind[5] = rep_t;
+                                                        image_sent_ind[5] = rep_t;
+                                                        // -------------------
+                                                        for ( phs_t=0; phs_t<dim_limit_under_trigger_[4]; phs_t++ )
+                                                        {
+                                                            if ( dim_under_trigger_[4] ) image_ind[4] = phs_t;
+                                                            image_sent_ind[4] = phs_t;
+                                                            // -------------------
+                                                            for ( con_t=0; con_t<dim_limit_under_trigger_[3]; con_t++ )
+                                                            {
+                                                                if ( dim_under_trigger_[3] ) image_ind[3] = con_t;
+                                                                image_sent_ind[3] = con_t;
+                                                                // -------------------
+                                                                for ( e2_t=0; e2_t<dim_limit_under_trigger_[2]; e2_t++ )
+                                                                {
+                                                                    if ( dim_under_trigger_[2] ) image_ind[2] = e2_t;
+                                                                    image_sent_ind[2] = e2_t;
+                                                                    // -------------------
+                                                                    for ( slc_t=0; slc_t<dim_limit_under_trigger_[1]; slc_t++ )
+                                                                    {
+                                                                        if ( dim_under_trigger_[1] ) image_ind[1] = slc_t;
+                                                                        image_sent_ind[1] = slc_t;
+                                                                        // -------------------
+                                                                        for ( cha_t=0; cha_t<dim_limit_under_trigger_[0]; cha_t++ )
+                                                                        {
+                                                                            if ( dim_under_trigger_[0] ) image_ind[0] = cha_t;
+                                                                            image_sent_ind[0] = cha_t;
+                                                                            // -------------------
+
+                                                                            ImageType* pImage = buf(image_ind);
+                                                                            bool sentFlag = sentFlagBuf(image_ind);
+
+                                                                            if ( inClose )
+                                                                            {
+                                                                                // if in close call, send out all unsent images
+                                                                                if ( pImage != NULL && !sentFlag )
+                                                                                {
+                                                                                    imageSentBuffer_(image_sent_ind) = pImage;
+                                                                                    buf(image_ind) = NULL;
+                                                                                    needTrigger = true;
+                                                                                }
+                                                                            }
+                                                                            else
+                                                                            {
+                                                                                if ( pImage != NULL && !sentFlag )
+                                                                                {
+                                                                                    imageSentBuffer_(image_sent_ind) = pImage;
+                                                                                    // buf(image_ind) = NULL;
+                                                                                }
+                                                                                else
+                                                                                {
+                                                                                    needTrigger = false; // an image for the current under-trigger dimensions is still missing, do not trigger yet
+                                                                                    break;
+                                                                                }
+                                                                            }
+                                                                        }
+                                                                    }
+                                                                }
+                                                            }
+                                                        }
+                                                    }
+                                                }
+                                            }
+
+                                            if ( needTrigger )
+                                            {
+                                                // if an image has already been sent, do not send it again
+                                                for ( ave_t=0; ave_t<dim_limit_under_trigger_[7]; ave_t++ )
+                                                {
+                                                    if ( dim_under_trigger_[7] ) image_ind[7] = ave_t;
+                                                    for ( set_t=0; set_t<dim_limit_under_trigger_[6]; set_t++ )
+                                                    {
+                                                        if ( dim_under_trigger_[6] ) image_ind[6] = set_t;
+                                                        for ( rep_t=0; rep_t<dim_limit_under_trigger_[5]; rep_t++ )
+                                                        {
+                                                            if ( dim_under_trigger_[5] ) image_ind[5] = rep_t;
+                                                            for ( phs_t=0; phs_t<dim_limit_under_trigger_[4]; phs_t++ )
+                                                            {
+                                                                if ( dim_under_trigger_[4] ) image_ind[4] = phs_t;
+                                                                for ( con_t=0; con_t<dim_limit_under_trigger_[3]; con_t++ )
+                                                                {
+                                                                    if ( dim_under_trigger_[3] ) image_ind[3] = con_t;
+                                                                    for ( e2_t=0; e2_t<dim_limit_under_trigger_[2]; e2_t++ )
+                                                                    {
+                                                                        if ( dim_under_trigger_[2] ) image_ind[2] = e2_t;
+                                                                        for ( slc_t=0; slc_t<dim_limit_under_trigger_[1]; slc_t++ )
+                                                                        {
+                                                                            if ( dim_under_trigger_[1] ) image_ind[1] = slc_t;
+                                                                            for ( cha_t=0; cha_t<dim_limit_under_trigger_[0]; cha_t++ )
+                                                                            {
+                                                                                if ( dim_under_trigger_[0] ) image_ind[0] = cha_t;
+
+                                                                                bool sentFlag = sentFlagBuf(image_ind);
+                                                                                if ( sentFlag )
+                                                                                {
+                                                                                    imageSentBuffer_(cha_t, slc_t, e2_t, con_t, phs_t, rep_t, set_t, ave_t) = NULL;
+                                                                                }
+                                                                                else
+                                                                                {
+                                                                                    sentFlagBuf(image_ind) = true;
+                                                                                }
+
+                                                                                buf(image_ind) = NULL;
+                                                                            }
+                                                                        }
+                                                                    }
+                                                                }
+                                                            }
+                                                        }
+                                                    }
+                                                }
+
+                                                GDEBUG_STREAM("--> Accumulator image trigger for [CHA SLC E2 CON PHS REP SET AVE] : [" 
+                                                                                                                            << image_ind[0] << " " 
+                                                                                                                            << image_ind[1] << " " 
+                                                                                                                            << image_ind[2] << " " 
+                                                                                                                            << image_ind[3] << " " 
+                                                                                                                            << image_ind[4] << " " 
+                                                                                                                            << image_ind[5] << " " 
+                                                                                                                            << image_ind[6] << " " 
+                                                                                                                            << image_ind[7] << "]" );
+
+                                                Gadgetron::GadgetContainerMessage<ImageBufferType>* cm1 = new Gadgetron::GadgetContainerMessage<ImageBufferType>();
+                                                ImageBufferType& imgBuf = *(cm1->getObjectPtr());
+                                                imgBuf = imageSentBuffer_;
+                                                imgBuf.delete_data_on_destruct(true);
+
+                                                if (this->next()->putq(cm1) < 0) 
+                                                {
+                                                    cm1->release();
+                                                    return false;
+                                                }
+                                            }
+                                            else
+                                            {
+                                                for ( ii=0; ii<numOfElem; ii++ )
+                                                {
+                                                    imageSentBuffer_(ii) = NULL;
+                                                }
+                                            }
+
+                                            // =================================================
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happens in GtPlusAccumulatorImageTriggerGadget::trigger(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusAccumulatorImageTriggerGadget::storeImage(const ISMRMRD::ImageHeader& imgHeader, const hoNDArray<ValueType>& img, const ISMRMRD::MetaContainer& attrib, ImageBufferType& buf)
+{
+    try
+    {
+        long long cha = attrib.as_long(GADGETRON_CHA, 0);
+
+        size_t slc = imgHeader.slice;
+
+        long long e2 = attrib.as_long(GADGETRON_E2, 0);
+
+        size_t con = imgHeader.contrast;
+        size_t phs = imgHeader.phase;
+        size_t rep = imgHeader.repetition;
+        size_t set = imgHeader.set;
+        size_t ave = imgHeader.average;
+
+        // create image
+        ImageType* storedImage = new ImageType();
+        GADGET_CHECK_RETURN_FALSE(storedImage!=NULL);
+
+        storedImage->from_NDArray(img);
+        storedImage->attrib_ = attrib;
+        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.setMetaAttributesFromImageHeaderISMRMRD(imgHeader, storedImage->attrib_));
+
+        storedImage->attrib_.set(GADGETRON_PASS_IMMEDIATE, (long)0);
+        buf(cha, slc, e2, con, phs, rep, set, ave) = storedImage;
+
+        if ( pass_image_immediate_ )
+        {
+            Gadgetron::GadgetContainerMessage<ImageBufferType>* cm1 = new Gadgetron::GadgetContainerMessage<ImageBufferType>();
+
+            ImageBufferType& imgBuf = *(cm1->getObjectPtr());
+
+            std::vector<size_t> dim2D(num_of_dimensions_, 1);
+            imgBuf.create(dim2D);
+
+            imgBuf(0) = new ImageType();
+            *imgBuf(0) = *storedImage;
+
+            // set the pass_image flag, so the next gadget knows
+            imgBuf(0)->attrib_.set(GADGETRON_PASS_IMMEDIATE, (long)1);
+
+            if (this->next()->putq(cm1) < 0) 
+            {
+                cm1->release();
+                return false;
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happens in GtPlusAccumulatorImageTriggerGadget::storeImage(const ISMRMRD::ImageHeader& imgHeader, const hoNDArray<ValueType>& img, const ISMRMRD::MetaContainer& attrib, ImageBufferType& buf) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+int GtPlusAccumulatorImageTriggerGadget::close(unsigned long flags)
+{
+    GDEBUG_CONDITION_STREAM(true, "GtPlusAccumulatorImageTriggerGadget - close(flags) : " << flags);
+
+    if ( BaseClass::close(flags) != GADGET_OK ) return GADGET_FAIL;
+
+    if ( flags!=0 && !triggered_in_close_ )
+    {
+        triggered_in_close_ = true;
+
+        GDEBUG_CONDITION_STREAM(true, "GtPlusAccumulatorImageTriggerGadget - trigger in close(flags) ... ");
+
+        GADGET_CHECK_RETURN(this->trigger(imageBuffer_, imageSent_, true), GADGET_FAIL);
+        GADGET_CHECK_RETURN(this->trigger(otherBuffer_, otherSent_, true), GADGET_FAIL);
+    }
+
+    return GADGET_OK;
+}
+
+GADGET_FACTORY_DECLARE(GtPlusAccumulatorImageTriggerGadget)
+
+}
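
The trigger(...) loop that ends above walks the [CHA SLC E2 CON PHS REP SET AVE] buffer, clears entries that were already dispatched out of the outgoing batch, and marks the remaining ones in the sent-flag array so that every buffered image is forwarded exactly once. Below is a minimal, self-contained sketch of that send-once bookkeeping; it deliberately avoids the Gadgetron types (hoNDObjectArray, GadgetContainerMessage), and the two-dimensional flat index is an assumption made purely for illustration.

    // Minimal sketch of the send-once bookkeeping in trigger(...) above.
    // Names and the flat [CHA SLC] index are illustrative, not Gadgetron API.
    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main()
    {
        const std::size_t CHA = 2, SLC = 3;          // two small trigger dimensions
        std::vector<bool> sent(CHA * SLC, false);    // stand-in for the sent-flag buffer

        // pass 0 forwards everything and sets the flags;
        // pass 1 finds every flag set and forwards nothing.
        for (int pass = 0; pass < 2; ++pass)
        {
            std::size_t forwarded = 0;
            for (std::size_t slc = 0; slc < SLC; ++slc)
            {
                for (std::size_t cha = 0; cha < CHA; ++cha)
                {
                    const std::size_t idx = slc * CHA + cha;
                    if (!sent[idx])
                    {
                        sent[idx] = true;            // mark as sent exactly once
                        ++forwarded;
                    }
                }
            }
            std::cout << "pass " << pass << ": forwarded " << forwarded << " images\n";
        }
        return 0;
    }

Compiled stand-alone, the first pass reports 6 forwarded images and the second pass 0, mirroring how the gadget suppresses duplicate sends when trigger(...) fires more than once for the same indices.
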
diff --git a/gadgets/gtPlus/GtPlusAccumulatorImageTriggerGadget.h b/gadgets/gtPlus/GtPlusAccumulatorImageTriggerGadget.h
new file mode 100644
index 0000000..91047f1
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusAccumulatorImageTriggerGadget.h
@@ -0,0 +1,150 @@
+/** \file   GtPlusAccumulatorImageTriggerGadget.h
+    \brief  The GtPlus image accumulation and triggering gadget, used after GtPlus reconstruction for image data
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include <complex>
+#include "GtPlusGadgetExport.h"
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "hoNDObjectArray.h"
+#include "ismrmrd/ismrmrd.h"
+#include "ismrmrd/meta.h"
+#include "GadgetIsmrmrdReadWrite.h"
+
+#include "hoNDArray_utils.h"
+#include "hoNDImage.h"
+
+#include "GtPlusGadgetImageArray.h"
+
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusISMRMRDReconUtil.h"
+
+namespace Gadgetron
+{
+
+// the dimensional order of buffered images
+// [Cha Slice E2 Con Phase Rep Set Ave]
+//   0    1    2   3   4    5   6   7
+#define GT_DIM_NUM_IMAGE 8
+
+class EXPORTGTPLUSGADGET GtPlusAccumulatorImageTriggerGadget : public Gadget3< ISMRMRD::ImageHeader, hoNDArray< std::complex<float> >, ISMRMRD::MetaContainer >
+{
+public:
+    GADGET_DECLARE(GtPlusAccumulatorImageTriggerGadget);
+
+    typedef std::complex<float> ValueType;
+
+    typedef Gadget3< ISMRMRD::ImageHeader, hoNDArray< ValueType >, ISMRMRD::MetaContainer > BaseClass;
+
+    typedef hoNDImage<ValueType, 2> ImageType;
+
+    typedef hoNDObjectArray<ImageType> ImageBufferType;
+    typedef hoNDArray<bool> ImageSentFlagBufferType;
+
+    GtPlusAccumulatorImageTriggerGadget();
+    ~GtPlusAccumulatorImageTriggerGadget();
+
+    virtual int close(unsigned long flags);
+
+    /// parameters to control the triggering
+
+    /// for every dimension, the user can define whether it is under the trigger
+    /// if the dimensional index of buffered images reaches the maximum for all dimensions under the trigger, 
+    /// the image buffer will be sent to the next gadget
+    /// e.g., if the PHS dimension limit is 40 and the dimension PHS is under the trigger, all 40 images 
+    /// will be sent to the next gadget as a data buffer
+    /// every buffered image will only be sent once
+    /// gfactor images (GADGETRON_IMAGE_GFACTOR) are sent to the next gadget immediately
+
+    /// dimension limits
+    /// the dimension limits are read from the protocol by default, but 
+    /// the user can set them via the input parameters
+    ISMRMRD::EncodingCounters meas_max_idx_;
+
+    /// whether a dimension is under the trigger
+    /// if no dimension is under the trigger, images will be passed to the next gadget right away
+    bool cha_trigger_;
+    bool slc_trigger_;
+    bool e2_trigger_;
+    bool con_trigger_;
+    bool phs_trigger_;
+    bool rep_trigger_;
+    bool set_trigger_;
+    bool ave_trigger_;
+
+    /// whether to immediately pass the image to the next gadget
+    bool pass_image_immediate_;
+
+protected:
+
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1, GadgetContainerMessage< hoNDArray<ValueType> >* m2, GadgetContainerMessage<ISMRMRD::MetaContainer>* m3);
+
+    // perform the triggering
+    virtual bool trigger(ImageBufferType& buf, ImageSentFlagBufferType& sentFlagBuf, bool inClose);
+
+    // store the incoming image
+    // if pass_image_immediate_==true, the image will be immediately passed to the next gadget with GADGETRON_PASS_IMMEDIATE set
+    virtual bool storeImage(const ISMRMRD::ImageHeader& imgHeader, const hoNDArray<ValueType>& img, const ISMRMRD::MetaContainer& attrib, ImageBufferType& buf);
+
+    // set dimensions under trigger
+    void setDimensionsUnderTrigger();
+
+    // buffer for regular images whose data role is GADGETRON_IMAGE_REGULAR
+    ImageBufferType imageBuffer_;
+    ImageSentFlagBufferType imageSent_;
+
+    // buffer for other images whose data role is not GADGETRON_IMAGE_REGULAR and not GADGETRON_IMAGE_GFACTOR
+    ImageBufferType otherBuffer_;
+    ImageSentFlagBufferType otherSent_;
+
+    // buffer sent to next gadget
+    ImageBufferType imageSentBuffer_;
+
+    // number of total dimensions
+    size_t num_of_dimensions_;
+
+    // dimensions under trigger
+    std::vector<bool> dim_under_trigger_;
+    std::vector<size_t> dim_limit_under_trigger_;
+
+    std::vector<bool> dim_not_under_trigger_;
+    std::vector<size_t> dim_limit_not_under_trigger_;
+
+    // whether the next gadget has been triggered in close(...)
+    bool triggered_in_close_;
+
+    // dimension for image kspace
+    std::vector<size_t> dimensions_;
+
+    // encoding matrix size (the real sampled size)
+    size_t matrix_size_encoding_[3];
+
+    // encoding space size (the logical kspace size)
+    size_t space_size_[3];
+
+    // encoding field of view [mm]
+    float field_of_view_encoding_[3];
+
+    // recon matrix size (the final image size)
+    size_t matrix_size_recon_[3];
+
+    // recon field of view [mm]
+    float field_of_view_recon_[3];
+
+    int image_counter_;
+
+    int meas_max_ro_;
+    int meas_max_channel_;
+
+    // util for gtplus
+    Gadgetron::gtPlus::gtPlusISMRMRDReconUtil< std::complex<float> > gtPlus_util_;
+
+    // in verbose mode, more info is printed out
+    bool verboseMode_;
+};
+
+}
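
The comment block near the top of this header describes the trigger rule: every dimension of the image buffer can individually be placed under the trigger, and the buffer is only sent downstream once each under-trigger dimension has reached its limit. The sketch below illustrates that completeness check with plain standard-library containers; the helper name reachedTriggerLimits and the hard-coded example limits are assumptions for illustration only, not part of the gadget's interface.

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Returns true when every dimension flagged as "under the trigger" has reached
    // its limit (index == limit - 1), i.e. when the buffered images are complete
    // for those dimensions and can be sent downstream in one batch.
    static bool reachedTriggerLimits(const std::vector<bool>&        dim_under_trigger,
                                     const std::vector<std::size_t>& dim_limit,
                                     const std::vector<std::size_t>& curr_index)
    {
        for (std::size_t d = 0; d < dim_under_trigger.size(); ++d)
        {
            if (dim_under_trigger[d] && curr_index[d] + 1 < dim_limit[d])
                return false;   // this triggered dimension is not yet full
        }
        return true;
    }

    int main()
    {
        // e.g. PHS under the trigger with a limit of 40, everything else free-running
        std::vector<bool>        under{false, false, false, false, true, false, false, false};
        std::vector<std::size_t> limit{1, 1, 1, 1, 40, 1, 1, 1};

        std::vector<std::size_t> idx(8, 0);
        idx[4] = 10;
        std::cout << reachedTriggerLimits(under, limit, idx) << "\n"; // 0: phase 10 of 40
        idx[4] = 39;
        std::cout << reachedTriggerLimits(under, limit, idx) << "\n"; // 1: last phase reached
        return 0;
    }

With PHS under the trigger and a limit of 40 (the example used in the header comment), the check only succeeds once the phase index reaches 39, i.e. the last buffered phase.
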
diff --git a/gadgets/gtPlus/GtPlusAccumulatorWorkOrderTriggerGadget.cpp b/gadgets/gtPlus/GtPlusAccumulatorWorkOrderTriggerGadget.cpp
new file mode 100644
index 0000000..ad341af
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusAccumulatorWorkOrderTriggerGadget.cpp
@@ -0,0 +1,2412 @@
+#include "GtPlusAccumulatorWorkOrderTriggerGadget.h"
+#include "GtPlusReconGadgetUtil.h"
+
+using namespace Gadgetron::gtPlus;
+
+namespace Gadgetron
+{
+
+GtPlusAccumulatorWorkOrderTriggerGadget::GtPlusAccumulatorWorkOrderTriggerGadget() : 
+                                            image_counter_(0), image_series_(100), first_kspace_scan_(true), 
+                                            triggered_in_close_(false), triggered_in_process_(false), triggered_in_process_last_acq_(false), 
+                                            triggered_in_process_by_numOfKSpace_triggerDim1_(false), 
+                                            prev_dim1_(-1), curr_dim1_(-1), 
+                                            prev_dim2_(-1), curr_dim2_(-1), 
+                                            count_dim1_(0), 
+                                            last_acq_arrived_(false), 
+                                            verboseMode_(false), 
+                                            other_kspace_matching_Dim_(DIM_NONE)
+{
+    space_matrix_offset_E1_ = 0;
+    space_matrix_offset_E2_ = 0;
+
+    gtPlusISMRMRDReconUtil<ValueType>().clearAcquisitionHeaderISMRMRD(prev_acq_header_);
+    memset(&meas_max_idx_ref_, 0, sizeof(ISMRMRD::EncodingCounters));
+
+    ind_time_stamp_.resize(GT_DIM_NUM, 0);
+
+    embedded_ref_lines_E1_ = 0;
+    embedded_ref_lines_E2_ = 0;
+
+    timeStampResolution_ = 0.0025f;
+}
+
+GtPlusAccumulatorWorkOrderTriggerGadget::~GtPlusAccumulatorWorkOrderTriggerGadget()
+{
+
+}
+
+// extract necessary configuration information from the xml
+int GtPlusAccumulatorWorkOrderTriggerGadget::process_config(ACE_Message_Block* mb)
+{
+    // gadget parameters
+    image_series_ = this->get_int_value("image_series");
+
+    noacceleration_triggerDim1_ = gtPlus_util_.getISMRMRDDimFromName(*(this->get_string_value("noacceleration_triggerDim1")));
+    noacceleration_triggerDim2_ = gtPlus_util_.getISMRMRDDimFromName(*(this->get_string_value("noacceleration_triggerDim2")));
+    noacceleration_numOfKSpace_triggerDim1_ = this->get_int_value("noacceleration_numOfKSpace_triggerDim1"); 
+
+    interleaved_triggerDim1_ = gtPlus_util_.getISMRMRDDimFromName(*(this->get_string_value("interleaved_triggerDim1")));
+    interleaved_triggerDim2_ = gtPlus_util_.getISMRMRDDimFromName(*(this->get_string_value("interleaved_triggerDim2")));
+    interleaved_numOfKSpace_triggerDim1_ = this->get_int_value("interleaved_numOfKSpace_triggerDim1"); 
+
+    embedded_triggerDim1_ = gtPlus_util_.getISMRMRDDimFromName(*(this->get_string_value("embedded_triggerDim1")));
+    embedded_triggerDim2_ = gtPlus_util_.getISMRMRDDimFromName(*(this->get_string_value("embedded_triggerDim2")));
+    embedded_numOfKSpace_triggerDim1_ = this->get_int_value("embedded_numOfKSpace_triggerDim1");
+
+    separate_triggerDim1_ = gtPlus_util_.getISMRMRDDimFromName(*(this->get_string_value("separate_triggerDim1")));
+    separate_triggerDim2_ = gtPlus_util_.getISMRMRDDimFromName(*(this->get_string_value("separate_triggerDim2")));
+    separate_numOfKSpace_triggerDim1_ = this->get_int_value("separate_numOfKSpace_triggerDim1");
+
+    other_kspace_matching_Dim_ = gtPlus_util_.getISMRMRDDimFromName(*(this->get_string_value("other_kspace_matching_Dim")));
+
+    verboseMode_ = this->get_bool_value("verboseMode");
+
+    timeStampResolution_ = (float)this->get_double_value("timeStampResolution");
+    if ( timeStampResolution_ < FLT_EPSILON ) timeStampResolution_ = 0.0025f;
+    GDEBUG_CONDITION_STREAM(verboseMode_, "timeStampResolution_ is " << timeStampResolution_);
+
+    // ---------------------------------------------------------------------------------------------------------
+    // parse the xml file
+    ISMRMRD::IsmrmrdHeader h;
+    try {
+      deserialize(mb->rd_ptr(),h);
+    } catch (...) {
+      GDEBUG("Error parsing ISMRMRD Header");
+      throw;
+      return GADGET_FAIL;
+    }
+
+
+    // This only supports two encoding spaces where the recon_space is the same size
+    // e.g. Parallel imaging reference scan collected with GRE and data with EPI
+    if (h.encoding.size() > 2)
+    {
+        GDEBUG("Number of encoding spaces: %d\n", h.encoding.size());
+        GDEBUG("This GtPlusAccumulatorWorkOrderTriggerGadget only supports two encoding space\n");
+        return GADGET_FAIL;
+    } 
+    else if (h.encoding.size() == 2)
+    {
+        if (! ((h.encoding[0].reconSpace.matrixSize.x == h.encoding[1].reconSpace.matrixSize.x) && 
+            (h.encoding[0].reconSpace.matrixSize.y == h.encoding[1].reconSpace.matrixSize.y) && 
+            (h.encoding[0].reconSpace.matrixSize.z == h.encoding[1].reconSpace.matrixSize.z) && 
+            (h.encoding[0].reconSpace.fieldOfView_mm.x == h.encoding[1].reconSpace.fieldOfView_mm.x) &&
+            (h.encoding[0].reconSpace.fieldOfView_mm.y == h.encoding[1].reconSpace.fieldOfView_mm.y) &&
+            (h.encoding[0].reconSpace.fieldOfView_mm.z == h.encoding[1].reconSpace.fieldOfView_mm.z)) )
+        {
+            GDEBUG("Number of encoding spaces: %d\n", h.encoding.size());
+            GDEBUG("This GtPlusAccumulatorWorkOrderTriggerGadget only supports two encoding spaces with identical recon spaces.\n");
+            return GADGET_FAIL;
+        }
+    }
+
+    // find out the PAT mode
+    if (!h.encoding[0].parallelImaging)
+    {
+      GDEBUG("Parallel Imaging section not found in header");
+      return GADGET_FAIL;
+    }
+
+    ISMRMRD::ParallelImaging p_imaging = *h.encoding[0].parallelImaging;
+
+    workOrder_.acceFactorE1_ = (double)(p_imaging.accelerationFactor.kspace_encoding_step_1);
+    workOrder_.acceFactorE2_ = (double)(p_imaging.accelerationFactor.kspace_encoding_step_2);
+
+    GDEBUG_CONDITION_STREAM(verboseMode_, "acceFactorE1_ is " << workOrder_.acceFactorE1_);
+    GDEBUG_CONDITION_STREAM(verboseMode_, "acceFactorE2_ is " << workOrder_.acceFactorE2_);
+
+    workOrder_.InterleaveDim_ = Gadgetron::DIM_NONE;
+
+    if ( !p_imaging.calibrationMode.is_present() )
+    {
+        GDEBUG("Parallel Imaging calibrationMode not found in header");
+        return GADGET_FAIL;
+    }
+
+    std::string calib = *p_imaging.calibrationMode;
+    if ( calib.compare("interleaved") == 0 )
+    {
+        workOrder_.CalibMode_ = Gadgetron::ISMRMRD_interleaved;
+        GDEBUG_CONDITION_STREAM(verboseMode_, "Calibration mode is interleaved");
+
+        if ( p_imaging.interleavingDimension )
+        {
+            if ( p_imaging.interleavingDimension->compare("phase") == 0 )
+            {
+                workOrder_.InterleaveDim_ = Gadgetron::DIM_Phase;
+            }
+            else if ( p_imaging.interleavingDimension->compare("repetition") == 0 )
+            {
+                workOrder_.InterleaveDim_ = Gadgetron::DIM_Repetition;
+            }
+            else if ( p_imaging.interleavingDimension->compare("average") == 0 )
+            {
+                workOrder_.InterleaveDim_ = Gadgetron::DIM_Average;
+            }
+            else if ( p_imaging.interleavingDimension->compare("contrast") == 0 )
+            {
+                workOrder_.InterleaveDim_ = Gadgetron::DIM_Contrast;
+            }
+            else if ( p_imaging.interleavingDimension->compare("other") == 0 )
+            {
+                workOrder_.InterleaveDim_ = Gadgetron::DIM_other1;
+            }
+            else
+            {
+                GDEBUG("Unknown interleaving dimension. Bailing out");
+                return GADGET_FAIL;
+            }
+            GDEBUG_CONDITION_STREAM(verboseMode_, "InterleaveDim is " << gtPlus_util_.getISMRMRDDimName(workOrder_.InterleaveDim_));
+        }
+    }
+    else if ( calib.compare("embedded") == 0 )
+    {
+        workOrder_.CalibMode_ = Gadgetron::ISMRMRD_embedded;
+        GDEBUG_CONDITION_STREAM(verboseMode_, "Calibration mode is embedded");
+    }
+    else if ( calib.compare("separate") == 0 )
+    {
+        workOrder_.CalibMode_ = Gadgetron::ISMRMRD_separate;
+        GDEBUG_CONDITION_STREAM(verboseMode_, "Calibration mode is separate");
+    }
+    else if ( calib.compare("external") == 0 )
+    {
+        workOrder_.CalibMode_ = Gadgetron::ISMRMRD_external;
+    }
+    else if ( (calib.compare("other") == 0) && workOrder_.acceFactorE1_==1 && workOrder_.acceFactorE2_==1 )
+    {
+        workOrder_.CalibMode_ = Gadgetron::ISMRMRD_noacceleration;
+        workOrder_.acceFactorE1_=1;
+    }
+    else if ( (calib.compare("other") == 0) &&  (workOrder_.acceFactorE1_>1 || workOrder_.acceFactorE2_>1) )
+    {
+        workOrder_.CalibMode_ = Gadgetron::ISMRMRD_interleaved;
+        workOrder_.acceFactorE1_=2;
+        workOrder_.InterleaveDim_ = Gadgetron::DIM_Phase;
+    }
+    else
+    {
+        GDEBUG("Failed to process parallel imaging calibration mode");
+        return GADGET_FAIL;
+    }
+    
+    // ---------------------------------------------------------------------------------------------------------
+
+    // find out the encoding space 
+
+    findMatrixSizeEncoding(h, matrix_size_encoding_);
+    findFOVEncoding(h, field_of_view_encoding_);
+
+    findMatrixSizeRecon(h, matrix_size_recon_);
+    findFOVRecon(h, field_of_view_recon_);
+
+    GDEBUG_CONDITION_STREAM(verboseMode_, "Encoding matrix size: " << matrix_size_encoding_[0] << " " << matrix_size_encoding_[1] << " " << matrix_size_encoding_[2]);
+    GDEBUG_CONDITION_STREAM(verboseMode_, "Encoding field_of_view : " << field_of_view_encoding_[0] << " " << field_of_view_encoding_[1] << " " << field_of_view_encoding_[2]);
+    GDEBUG_CONDITION_STREAM(verboseMode_, "Recon matrix size : " << matrix_size_recon_[0] << " " << matrix_size_recon_[1] << " " << matrix_size_recon_[2]);
+    GDEBUG_CONDITION_STREAM(verboseMode_, "Recon field_of_view :  " << field_of_view_recon_[0] << " " << field_of_view_recon_[1] << " " << field_of_view_recon_[2]);
+
+    // ---------------------------------------------------------------------------------------------------------
+    // handle partial fourier
+
+    ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+    ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+
+    workOrder_.kSpaceMaxEncode1_ = matrix_size_encoding_[1]-1;
+    GDEBUG_CONDITION_STREAM(verboseMode_, "matrix size kSpaceMaxEncode1_ is " << workOrder_.kSpaceMaxEncode1_);
+
+    workOrder_.kSpaceMaxEncode2_ = matrix_size_encoding_[2]-1;
+    GDEBUG_CONDITION_STREAM(verboseMode_, "matrix size kSpaceMaxEncode2_ is " << workOrder_.kSpaceMaxEncode2_);
+
+    space_size_[1] = workOrder_.kSpaceMaxEncode1_+1;
+    space_size_[2] = workOrder_.kSpaceMaxEncode2_+1;
+
+    if ( (!e_limits.kspace_encoding_step_1) || (!e_limits.kspace_encoding_step_2))
+    {
+        GDEBUG("kspace_encoding_step_1 and kspace_encoding_step_2 limits are required. Not found. Bailing out.");
+        return GADGET_FAIL;
+    }
+
+    max_sampled_E1_ = e_limits.kspace_encoding_step_1->maximum;
+    max_sampled_E2_ = e_limits.kspace_encoding_step_2->maximum;
+
+    GDEBUG_CONDITION_STREAM(verboseMode_, "max_sampled_E1_ is " << max_sampled_E1_);
+    GDEBUG_CONDITION_STREAM(verboseMode_, "max_sampled_E2_ is " << max_sampled_E2_);
+
+    center_line_E1_ = e_limits.kspace_encoding_step_1->center;
+    center_line_E2_ = e_limits.kspace_encoding_step_2->center;
+
+    GDEBUG_CONDITION_STREAM(verboseMode_, "center_line_E1_ is " << center_line_E1_);
+    GDEBUG_CONDITION_STREAM(verboseMode_, "center_line_E2_ is " << center_line_E2_);
+
+    workOrder_.kSpaceCenterEncode1_ = center_line_E1_;
+    GDEBUG_CONDITION_STREAM(verboseMode_, "kSpaceCenterEncode1_ is " << workOrder_.kSpaceCenterEncode1_);
+
+    workOrder_.kSpaceCenterEncode2_ = center_line_E2_;
+    GDEBUG_CONDITION_STREAM(verboseMode_, "kSpaceCenterEncode2_ is " << workOrder_.kSpaceCenterEncode2_);
+
+    // ---------------------------------------------------------------------------------------------------------
+    // handle retro-gating
+    if (h.userParameters)
+    {
+        for (std::vector<ISMRMRD::UserParameterLong>::const_iterator  i = h.userParameters->userParameterLong.begin (); i != h.userParameters->userParameterLong.end(); ++i)
+        {
+            if (i->name == "RetroGatedImages")
+            {
+                workOrder_.retro_gated_images_ = i->value;
+            }
+            else if ( i->name == "RetroGatedSegmentSize")
+            {
+                workOrder_.retro_gated_segment_size_ = i->value;
+            }
+            else if ( i->name == "EmbeddedRefLinesE1")
+            {
+                embedded_ref_lines_E1_ = i->value;
+            }
+            else if ( i->name == "EmbeddedRefLinesE2")
+            {
+                embedded_ref_lines_E2_ = i->value;
+            }
+        }
+    }
+
+    // ---------------------------------------------------------------------------------------------------------
+    // encoding limits
+
+    if ( std::abs(2*field_of_view_recon_[0]-field_of_view_encoding_[0]) < 1.0 )
+    {
+        meas_max_ro_ = e_space.matrixSize.x/2;
+    }
+    else
+    {
+        meas_max_ro_ = e_space.matrixSize.x;
+    }
+    space_size_[0] = meas_max_ro_;
+
+    meas_max_idx_.kspace_encode_step_1 = (uint16_t)matrix_size_encoding_[1]-1;
+
+    meas_max_idx_.set = (e_limits.set && (e_limits.set->maximum>0)) ? e_limits.set->maximum : 0;
+    meas_max_idx_.phase = (e_limits.phase && (e_limits.phase->maximum>0)) ? e_limits.phase->maximum : 0;
+
+    // if it is retro-gating
+    if ( workOrder_.retro_gated_images_ > 0 )
+    {
+        meas_max_idx_.phase = (uint16_t)(workOrder_.retro_gated_images_ - 1);
+    }
+
+    meas_max_idx_.kspace_encode_step_2 = (uint16_t)matrix_size_encoding_[2]-1;
+
+    meas_max_idx_.contrast = (e_limits.contrast && (e_limits.contrast->maximum > 0)) ? e_limits.contrast->maximum : 0;
+
+    meas_max_idx_.slice = (e_limits.slice && (e_limits.slice->maximum > 0)) ? e_limits.slice->maximum : 0;
+
+    meas_max_idx_.repetition = e_limits.repetition ? e_limits.repetition->maximum : 0;
+
+    meas_max_idx_.average = e_limits.average ? e_limits.average->maximum : 0;
+
+    meas_max_idx_.segment = 0;
+
+    return GADGET_OK;
+}
+
+int GtPlusAccumulatorWorkOrderTriggerGadget::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+    // logic to control whether to store kspace and ref data
+    bool bIsKSpace, bIsRef, bIsNoise, bIsPhaseCorr, bIsReflect, bIsOther, bIsNavigator, bIsRTFeedback, bIsHPFeedback, bIsDummyScan;
+    if ( !checkStatus(m1->getObjectPtr()->flags, m1->getObjectPtr()->number_of_samples, 
+            bIsKSpace, bIsRef, bIsNoise, bIsPhaseCorr, bIsReflect, bIsOther,
+            bIsNavigator, bIsRTFeedback, bIsHPFeedback, bIsDummyScan) )
+    {
+        GDEBUG("Failed check readout status\n");
+        return GADGET_FAIL;
+    }
+
+    size_t scan_counter = m1->getObjectPtr()->scan_counter;
+
+    if ( scan_counter%1000 == 0 )
+    {
+        GDEBUG_CONDITION_STREAM(verboseMode_, "--> receive scan : " << scan_counter);
+    }
+
+    // combine the segments
+    //if ( workOrder_.retro_gated_images_ == 0 )
+    //{
+        m1->getObjectPtr()->idx.segment = 0;
+    //}
+
+    if ( (bIsNavigator || bIsRTFeedback || bIsHPFeedback || bIsDummyScan) && !bIsKSpace && !bIsRef )
+    {
+        m1->release();
+        return GADGET_OK;
+    }
+
+    if ( !bIsRTFeedback && bIsKSpace && first_kspace_scan_ && m1->getObjectPtr()->center_sample>0 )
+    {
+        if ( (workOrder_.start_RO_<0) && (workOrder_.end_RO_<0) )
+        {
+            gtPlus_util_.findStartEndROAfterZeroFilling(m1->getObjectPtr()->center_sample, m1->getObjectPtr()->number_of_samples, workOrder_.start_RO_, workOrder_.end_RO_);
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "start_RO : " << workOrder_.start_RO_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "end_RO : " << workOrder_.end_RO_);
+
+            workOrder_.kSpaceCenterRO_ = m1->getObjectPtr()->center_sample;
+            workOrder_.kSpaceMaxRO_ = m1->getObjectPtr()->number_of_samples;
+        }
+
+        // if partial fourier or asymmetric echo is used, correct the kSpaceCenter
+        if ( std::abs( (long long)(space_size_[1])-(long long)max_sampled_E1_) > workOrder_.acceFactorE1_ )
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "Partial fourier along E1 ... ");
+
+            // if ( (m1->getObjectPtr()->idx.user[5]>0) && (std::abs( (long long)m1->getObjectPtr()->idx.user[5] - (long long)space_size_[1]/2 )<2) )
+            if ( (m1->getObjectPtr()->idx.user[5]>0) )
+            {
+                workOrder_.kSpaceCenterEncode1_ = m1->getObjectPtr()->idx.user[5];
+            }
+
+            if ( 2*workOrder_.kSpaceCenterEncode1_ >= (max_sampled_E1_+1) )
+            {
+                space_matrix_offset_E1_ = 0;
+
+                workOrder_.start_E1_ = 0;
+                workOrder_.end_E1_ = (int)max_sampled_E1_;
+            }
+            else
+            {
+                space_matrix_offset_E1_ = space_size_[1] - max_sampled_E1_ -1;
+
+                workOrder_.start_E1_ = (int)space_matrix_offset_E1_;
+                workOrder_.end_E1_ = (int)workOrder_.kSpaceMaxEncode1_;
+            }
+
+            workOrder_.kSpaceMaxEncode1_ = 2*workOrder_.kSpaceCenterEncode1_-1;
+        }
+        else
+        {
+            space_matrix_offset_E1_ = 0;
+        }
+
+        if ( std::abs( (long long)space_size_[2] - (long long)max_sampled_E2_) > workOrder_.acceFactorE2_ )
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "Partial fourier along E2 ... ");
+
+            // if ( (m1->getObjectPtr()->idx.user[6]>0) && (std::abs( (long long)m1->getObjectPtr()->idx.user[6] - (long long)space_size_[2]/2 )<2) )
+            if ( (m1->getObjectPtr()->idx.user[6]>0) )
+            {
+                workOrder_.kSpaceCenterEncode2_ = m1->getObjectPtr()->idx.user[6];
+            }
+
+            if ( 2*workOrder_.kSpaceCenterEncode2_ >= (max_sampled_E2_+1) )
+            {
+                space_matrix_offset_E2_ = 0;
+
+                workOrder_.start_E2_ = 0;
+                workOrder_.end_E2_ = (int)max_sampled_E2_;
+            }
+            else
+            {
+                space_matrix_offset_E2_ = space_size_[2] - max_sampled_E2_-1;
+
+                workOrder_.start_E2_ = (int)space_matrix_offset_E2_;
+                workOrder_.end_E2_ = (int)workOrder_.kSpaceMaxEncode2_;
+            }
+
+            workOrder_.kSpaceMaxEncode2_ = 2*workOrder_.kSpaceCenterEncode2_-1;
+        }
+        else
+        {
+            space_matrix_offset_E2_ = 0;
+        }
+
+        first_kspace_scan_ = false;
+    }
+
+    // hack for UCL data
+    //if ( bIsKSpace && bIsRef )
+    //{
+    //    if ( m1->getObjectPtr()->idx.kspace_encode_step_1%2 == 1 )
+    //    {
+    //        bIsKSpace = false;
+    //    }
+    //}
+
+    // store kspace read out
+    if ( bIsKSpace )
+    {
+        if ( !storeImageData(m1, m2, bIsReflect) )
+        {
+            GDEBUG("Failed check readout status\n");
+            return GADGET_FAIL;
+        }
+    }
+
+    // store ref read out
+    if ( bIsRef && (workOrder_.CalibMode_ != Gadgetron::ISMRMRD_interleaved) )
+    {
+        if ( !storeRefData(m1, m2, bIsReflect) )
+        {
+            GDEBUG("Failed check readout status\n");
+            return GADGET_FAIL;
+        }
+    }
+
+    // store phaseCorr read out
+    if ( bIsPhaseCorr )
+    {
+        ISMRMRD::AcquisitionHeader* pMDH = m1->getObjectPtr();
+        hoNDArray< ValueType >* pRefLine = m2->getObjectPtr();
+
+        ReadOutBuffer item;
+        item.acqHead_ = *pMDH;
+        item.data_ = *pRefLine;
+        item.isReflect_ = bIsReflect;
+        phaseCorrBuffer_.push_back(item);
+    }
+
+    // store noise read out
+    if ( bIsNoise )
+    {
+        ISMRMRD::AcquisitionHeader* pMDH = m1->getObjectPtr();
+        hoNDArray< ValueType >* pRefLine = m2->getObjectPtr();
+
+        ReadOutBuffer item;
+        item.acqHead_ = *pMDH;
+        item.data_ = *pRefLine;
+        item.isReflect_ = bIsReflect;
+        noiseBuffer_.push_back(item);
+    }
+
+    // store other read out
+    if ( bIsOther )
+    {
+        ISMRMRD::AcquisitionHeader* pMDH = m1->getObjectPtr();
+        hoNDArray< ValueType >* pRefLine = m2->getObjectPtr();
+
+        if ( other_kspace_matching_Dim_ != DIM_NONE )
+        {
+            if ( prev_acq_header_.measurement_uid != 0 )
+            {
+                size_t v = getDimValue(prev_acq_header_, other_kspace_matching_Dim_);
+                setDimValue(*pMDH, other_kspace_matching_Dim_, v+1);
+            }
+        }
+
+        ReadOutBuffer item;
+        item.acqHead_ = *pMDH;
+        item.data_ = *pRefLine;
+        item.isReflect_ = bIsReflect;
+        otherBuffer_.push_back(item);
+    }
+
+    // perform triggering
+    if ( !triggerWorkOrder(m1, false, bIsKSpace) )
+    {
+        GDEBUG("Failed triggerWorkOrder(m1)\n");
+        return GADGET_FAIL;
+    }
+
+    m1->release();
+    return GADGET_OK;
+}
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::needTriggerWorkOrderAllInClose()
+{
+    // already triggered for last acquisition
+    if ( triggered_in_process_last_acq_ ) return false;
+
+    // if never triggered in process(...) and the last acquisition does arrive
+    // if the last acquisition does not arrive, the user may have cancelled the scan
+    if ( !triggered_in_process_ && !triggered_in_process_last_acq_ && last_acq_arrived_ ) return true;
+
+    if ( workOrder_.CalibMode_ == ISMRMRD_interleaved )
+    {
+        return ((interleaved_triggerDim1_==DIM_NONE)&&(interleaved_triggerDim2_==DIM_NONE));
+    }
+    else if ( workOrder_.CalibMode_ == ISMRMRD_embedded )
+    {
+        return ((embedded_triggerDim1_==DIM_NONE)&&(embedded_triggerDim2_==DIM_NONE));
+    }
+    else if ( (workOrder_.CalibMode_ == ISMRMRD_separate) 
+            || (workOrder_.CalibMode_ == ISMRMRD_external) )
+    {
+        return ((separate_triggerDim1_==DIM_NONE)&&(separate_triggerDim2_==DIM_NONE));
+    }
+    else if ( (workOrder_.CalibMode_ == ISMRMRD_noacceleration) )
+    {
+        return ((noacceleration_triggerDim1_==DIM_NONE)&&(noacceleration_triggerDim2_==DIM_NONE));
+    }
+    else
+    {
+        GERROR_STREAM("Unsupported calibration mode : " << workOrder_.CalibMode_);
+        return true;
+    }
+
+    return true;
+}
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::triggerWorkOrder(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, bool inClose, bool isKSpace)
+{
+    if ( workOrder_.CalibMode_ == ISMRMRD_interleaved )
+    {
+        if ( inClose )
+        {
+            GADGET_CHECK_RETURN_FALSE(triggerWorkOrderLastCountInClose(interleaved_triggerDim1_, interleaved_triggerDim2_, interleaved_numOfKSpace_triggerDim1_));
+        }
+        else
+        {
+            if ( isKSpace )
+            {
+                GADGET_CHECK_RETURN_FALSE(triggerWorkOrder(m1, interleaved_triggerDim1_, interleaved_triggerDim2_, interleaved_numOfKSpace_triggerDim1_));
+            }
+        }
+    }
+    else if ( workOrder_.CalibMode_ == ISMRMRD_embedded )
+    {
+        if ( inClose )
+        {
+            GADGET_CHECK_RETURN_FALSE(triggerWorkOrderLastCountInClose(embedded_triggerDim1_, embedded_triggerDim2_, embedded_numOfKSpace_triggerDim1_));
+        }
+        else
+        {
+            if ( isKSpace )
+            {
+                GADGET_CHECK_RETURN_FALSE(triggerWorkOrder(m1, embedded_triggerDim1_, embedded_triggerDim2_, embedded_numOfKSpace_triggerDim1_));
+            }
+        }
+    }
+    else if ( (workOrder_.CalibMode_ == ISMRMRD_separate) 
+            || (workOrder_.CalibMode_ == ISMRMRD_external) )
+    {
+        if ( inClose )
+        {
+            GADGET_CHECK_RETURN_FALSE(triggerWorkOrderLastCountInClose(separate_triggerDim1_, separate_triggerDim2_, separate_numOfKSpace_triggerDim1_));
+        }
+        else
+        {
+            if ( isKSpace )
+            {
+                GADGET_CHECK_RETURN_FALSE(triggerWorkOrder(m1, separate_triggerDim1_, separate_triggerDim2_, separate_numOfKSpace_triggerDim1_));
+            }
+        }
+    }
+    else if ( workOrder_.CalibMode_ == ISMRMRD_noacceleration )
+    {
+        if ( inClose )
+        {
+            GADGET_CHECK_RETURN_FALSE(triggerWorkOrderLastCountInClose(noacceleration_triggerDim1_, noacceleration_triggerDim2_, noacceleration_numOfKSpace_triggerDim1_));
+        }
+        else
+        {
+            if ( isKSpace )
+            {
+                GADGET_CHECK_RETURN_FALSE(triggerWorkOrder(m1, noacceleration_triggerDim1_, noacceleration_triggerDim2_, noacceleration_numOfKSpace_triggerDim1_));
+            }
+        }
+    }
+    else
+    {
+        GERROR_STREAM("Unsupported calibration mode : " << workOrder_.CalibMode_);
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::
+resetTriggerStatus(GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1)
+{
+    // return !gtPlusISMRMRDReconUtil<ValueType>().hasIdenticalGeometryISMRMRD(*(m1->getObjectPtr()), prev_acq_header_);
+    return false;
+}
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::
+triggerWorkOrder(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+            Gadgetron::ISMRMRDDIM& triggerDim1_, 
+            Gadgetron::ISMRMRDDIM& triggerDim2_,
+            int numOfKSpace_triggerDim1_)
+{
+    //bool is_first_acq_in_slice = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_FIRST_IN_SLICE).isSet(m1->getObjectPtr()->flags);
+    //if ( !is_first_acq_in_slice ) return true;
+
+    bool is_last_acq = ( ((ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_LAST_IN_REPETITION).isSet(m1->getObjectPtr()->flags)) || (ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_LAST_IN_SLICE).isSet(m1->getObjectPtr()->flags)) ) 
+                                && (m1->getObjectPtr()->idx.repetition==meas_max_idx_.repetition)
+                                && (m1->getObjectPtr()->idx.slice==meas_max_idx_.slice)
+                                && (m1->getObjectPtr()->idx.set==meas_max_idx_.set)
+                                && (m1->getObjectPtr()->idx.contrast==meas_max_idx_.contrast)
+                                && (m1->getObjectPtr()->idx.phase==meas_max_idx_.phase)
+                                && (m1->getObjectPtr()->idx.average==meas_max_idx_.average) );
+
+    // if retro gating, use the end of acq flag
+    if ( !is_last_acq && (workOrder_.retro_gated_images_ > 0) )
+    {
+        is_last_acq = (ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_LAST_IN_MEASUREMENT).isSet(m1->getObjectPtr()->flags));
+    }
+
+    if ( is_last_acq ) last_acq_arrived_ = true;
+
+    curr_dim1_ = getDimValue(*(m1->getObjectPtr()), triggerDim1_);
+    curr_dim2_ = getDimValue(*(m1->getObjectPtr()), triggerDim2_);
+
+    if ( is_last_acq && ( (triggerDim1_!=DIM_NONE) || (triggerDim2_!=DIM_NONE) ) )
+    {
+        GDEBUG_CONDITION_STREAM(true, "Last scan in measurement - " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << curr_dim1_ << " - " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << curr_dim2_);
+
+        if ( curr_dim1_==0 && curr_dim2_== 0 )
+        {
+            GDEBUG_CONDITION_STREAM(true, "Last scan in measurement - not trigger ... ");
+            return true;
+        }
+
+        triggered_in_process_last_acq_ = true;
+        GDEBUG_CONDITION_STREAM(true, "Last scan in measurement - triggered_in_process_last_acq_ : " << triggered_in_process_last_acq_);
+
+        if ( workOrder_.CalibMode_ == ISMRMRD_interleaved )
+        {
+            GADGET_CHECK_RETURN_FALSE(triggerWorkOrderLastCountInClose(interleaved_triggerDim1_, interleaved_triggerDim2_, interleaved_numOfKSpace_triggerDim1_));
+        }
+        else if ( workOrder_.CalibMode_ == ISMRMRD_embedded )
+        {
+            GADGET_CHECK_RETURN_FALSE(triggerWorkOrderLastCountInClose(embedded_triggerDim1_, embedded_triggerDim2_, embedded_numOfKSpace_triggerDim1_));
+        }
+        else if ( (workOrder_.CalibMode_ == ISMRMRD_separate) 
+                || (workOrder_.CalibMode_ == ISMRMRD_external) )
+        {
+            GADGET_CHECK_RETURN_FALSE(triggerWorkOrderLastCountInClose(separate_triggerDim1_, separate_triggerDim2_, separate_numOfKSpace_triggerDim1_));
+        }
+        else if ( workOrder_.CalibMode_ == ISMRMRD_noacceleration )
+        {
+            GADGET_CHECK_RETURN_FALSE(triggerWorkOrderLastCountInClose(noacceleration_triggerDim1_, noacceleration_triggerDim2_, noacceleration_numOfKSpace_triggerDim1_));
+        }
+        else
+        {
+            triggered_in_process_last_acq_ = false;
+            GERROR_STREAM("Unsupported calibration mode : " << workOrder_.CalibMode_);
+            return false;
+        }
+
+        return true;
+    }
+
+    if ( prev_dim1_ == -1 )
+    {
+        prev_dim1_ = curr_dim1_;
+        count_dim1_ = 0;
+        GDEBUG_CONDITION_STREAM(verboseMode_, "Current Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << curr_dim1_);
+    }
+
+    if ( prev_dim2_ == -1 )
+    {
+        prev_dim2_ = curr_dim2_;
+        count_dim1_ = 0;
+        GDEBUG_CONDITION_STREAM(verboseMode_, "Current Dim2 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << curr_dim2_);
+    }
+
+    if ( prev_acq_header_.measurement_uid == 0 ) prev_acq_header_ = *(m1->getObjectPtr());
+
+    bool workFlow_BufferKernel_ = false;
+    bool workFlow_use_BufferedKernel_ = false;
+
+    if ( prev_dim1_ != curr_dim1_ )
+    {
+        count_dim1_++;
+        GDEBUG_CONDITION_STREAM(verboseMode_, "Current Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << curr_dim1_);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "Current Dim2 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << curr_dim2_);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "count_dim1_ : " << count_dim1_);
+    }
+
+    if ( (triggerDim1_==DIM_NONE) && (triggerDim2_==DIM_NONE) )
+    {
+        prev_dim1_ = curr_dim1_;
+        prev_dim2_ = curr_dim2_;
+        prev_acq_header_ = *(m1->getObjectPtr());
+        return true;
+    }
+
+    int numOfAcquiredKSpaceForTriggerDim1 = numOfKSpace_triggerDim1_;
+    if ( workOrder_.CalibMode_ == ISMRMRD_interleaved )
+    {
+        numOfAcquiredKSpaceForTriggerDim1 = (int)(numOfKSpace_triggerDim1_ * workOrder_.acceFactorE1_ * workOrder_.acceFactorE2_);
+    }
+
+    // trigger whenever the Dim2 is changed
+    if (  triggerDim1_==DIM_NONE && triggerDim2_!=DIM_NONE  )
+    {
+        prev_dim1_ = curr_dim1_;
+        prev_acq_header_ = *(m1->getObjectPtr());
+
+        size_t prev_dim2_local_ = prev_dim2_;
+        prev_dim2_ = curr_dim2_;
+
+        if ( curr_dim2_!= prev_dim2_local_ )
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "Trigger Dim2 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << prev_dim2_local_);
+            GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim2_, prev_dim2_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+            triggered_in_process_ = true;
+        }
+    }
+
+    // trigger whenever the Dim1 is changed
+    if (  triggerDim1_!=DIM_NONE && triggerDim2_==DIM_NONE  )
+    {
+        prev_dim2_ = curr_dim2_;
+
+        size_t prev_dim1_local_ = prev_dim1_;
+        prev_dim1_ = curr_dim1_;
+
+        if ( numOfKSpace_triggerDim1_ > 0 )
+        {
+            if ( curr_dim1_!= prev_dim1_local_ )
+            {
+                if ( resetTriggerStatus(m1) )
+                {
+                    count_dim1_ = 0;
+                    GDEBUG_CONDITION_STREAM(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << prev_dim1_local_);
+
+                    workFlow_BufferKernel_ = false;
+                    workFlow_use_BufferedKernel_ = true;
+                    GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim1_, prev_dim1_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+                    triggered_in_process_ = true;
+                }
+
+                if ( count_dim1_ == numOfAcquiredKSpaceForTriggerDim1 )
+                {
+                    GDEBUG_CONDITION_STREAM(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << prev_dim1_local_);
+
+                    workFlow_BufferKernel_ = true;
+                    workFlow_use_BufferedKernel_ = false;
+                    GADGET_CHECK_RETURN_FALSE(triggerByDimLessEqual(triggerDim1_, prev_dim1_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+                    triggered_in_process_ = true;
+                }
+                else if ( count_dim1_ > numOfAcquiredKSpaceForTriggerDim1 )
+                {
+                    GDEBUG_CONDITION_STREAM(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << prev_dim1_local_);
+
+                    workFlow_BufferKernel_ = false;
+                    workFlow_use_BufferedKernel_ = true;
+                    GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim1_, prev_dim1_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+                    triggered_in_process_ = true;
+                }
+            }
+
+            prev_acq_header_ = *(m1->getObjectPtr());
+        }
+        else
+        {
+            if ( curr_dim1_!= prev_dim1_local_ )
+            {
+                GDEBUG_CONDITION_STREAM(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << prev_dim1_local_);
+                GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim1_, prev_dim1_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+                triggered_in_process_ = true;
+            }
+
+            prev_acq_header_ = *(m1->getObjectPtr());
+        }
+    }
+
+    if (  triggerDim1_!=DIM_NONE && triggerDim2_!=DIM_NONE  )
+    {
+        size_t prev_dim1_local_ = prev_dim1_;
+        size_t prev_dim2_local_ = prev_dim2_;
+
+        prev_dim1_ = curr_dim1_;
+        prev_dim2_ = curr_dim2_;
+
+        if ( numOfKSpace_triggerDim1_ > 0 )
+        {
+            if ( (curr_dim2_!=prev_dim2_local_) || resetTriggerStatus(m1) )
+            {
+                if ( count_dim1_ > numOfAcquiredKSpaceForTriggerDim1 )
+                {
+                    count_dim1_ = 0;
+                    GDEBUG_CONDITION_STREAM(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << prev_dim1_local_ 
+                        << "; Dim2 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << prev_dim2_local_);
+
+                    workFlow_BufferKernel_ = false;
+                    workFlow_use_BufferedKernel_ = true;
+
+                    GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim1_, prev_dim1_local_, triggerDim2_, prev_dim2_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+                }
+
+                if ( count_dim1_ <= numOfAcquiredKSpaceForTriggerDim1 && !triggered_in_process_by_numOfKSpace_triggerDim1_ ) // the trigger never happened
+                {
+                    count_dim1_ = 0;
+                    GDEBUG_CONDITION_STREAM(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << prev_dim1_local_ 
+                        << "; Dim2 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << prev_dim2_local_);
+
+                    workFlow_BufferKernel_ = false;
+                    workFlow_use_BufferedKernel_ = false;
+
+                    GADGET_CHECK_RETURN_FALSE(triggerByDim1LessEqualDim2Equal(triggerDim1_, prev_dim1_local_, triggerDim2_, prev_dim2_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+                }
+
+                triggered_in_process_ = true;
+                triggered_in_process_by_numOfKSpace_triggerDim1_ = false; // reset this flag to be false for next dim2
+            }
+
+            if (curr_dim1_!=prev_dim1_local_)
+            {
+                if ( count_dim1_ == numOfAcquiredKSpaceForTriggerDim1 )
+                {
+                    GDEBUG_CONDITION_STREAM(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << prev_dim1_local_ 
+                        << "; Dim2 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << prev_dim2_local_);
+
+                    workFlow_BufferKernel_ = true;
+                    workFlow_use_BufferedKernel_ = false;
+                    GADGET_CHECK_RETURN_FALSE(triggerByDim1LessEqualDim2Equal(triggerDim1_, prev_dim1_local_, triggerDim2_, prev_dim2_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+                    triggered_in_process_ = true;
+                    triggered_in_process_by_numOfKSpace_triggerDim1_ = true;
+                }
+                else if ( count_dim1_ > numOfAcquiredKSpaceForTriggerDim1 )
+                {
+                    GDEBUG_CONDITION_STREAM(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << prev_dim1_local_ 
+                        << "; Dim2 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << prev_dim2_local_);
+
+                    workFlow_BufferKernel_ = false;
+                    workFlow_use_BufferedKernel_ = true;
+                    GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim1_, prev_dim1_local_, triggerDim2_, prev_dim2_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+                    triggered_in_process_ = true;
+                    triggered_in_process_by_numOfKSpace_triggerDim1_ = true;
+                }
+            }
+
+            prev_acq_header_ = *(m1->getObjectPtr());
+        }
+        else
+        {
+            // trigger whenever the Dim2 is changed
+            if ( curr_dim2_!= prev_dim2_local_ )
+            {
+                GDEBUG_CONDITION_STREAM(verboseMode_, "Trigger Dim2 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << prev_dim2_local_);
+                GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim2_, prev_dim2_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+                triggered_in_process_ = true;
+            }
+
+            prev_acq_header_ = *(m1->getObjectPtr());
+        }
+    }
+
+    return true;
+}
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::
+triggerWorkOrderLastCountInClose(Gadgetron::ISMRMRDDIM& triggerDim1_, Gadgetron::ISMRMRDDIM& triggerDim2_, int numOfKSpace_triggerDim1_)
+{
+    GDEBUG_CONDITION_STREAM(verboseMode_, "Current Dim1 InClose : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << curr_dim1_);
+    GDEBUG_CONDITION_STREAM(verboseMode_, "Current Dim2 InClose : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << curr_dim2_);
+
+    if ( prev_dim1_ != curr_dim1_ )
+    {
+        count_dim1_++;
+    }
+
+    bool workFlow_BufferKernel_ = false;
+    bool workFlow_use_BufferedKernel_ = false;
+
+    int numOfAcquiredKSpaceForTriggerDim1 = numOfKSpace_triggerDim1_;
+    if ( workOrder_.CalibMode_ == ISMRMRD_interleaved )
+    {
+        numOfAcquiredKSpaceForTriggerDim1 = (int)(numOfKSpace_triggerDim1_ * workOrder_.acceFactorE1_ * workOrder_.acceFactorE2_);
+    }
+
+    size_t prev_dim1_local_ = prev_dim1_;
+    size_t prev_dim2_local_ = prev_dim2_;
+
+    prev_dim1_ = curr_dim1_;
+    prev_dim2_ = curr_dim2_;
+
+    // trigger whenever the Dim2 is changed
+    if (  triggerDim1_==DIM_NONE && triggerDim2_!=DIM_NONE  )
+    {
+        GDEBUG_CONDITION_STREAM(verboseMode_, "Trigger Dim2 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << prev_dim2_local_);
+        GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim2_, prev_dim2_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+    }
+
+    // trigger whenever the Dim1 is changed
+    if (  triggerDim1_!=DIM_NONE && triggerDim2_==DIM_NONE  )
+    {
+        if ( numOfKSpace_triggerDim1_ > 0 )
+        {
+            if ( count_dim1_ <= numOfAcquiredKSpaceForTriggerDim1 )
+            {
+                GDEBUG_CONDITION_STREAM(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " <= " << prev_dim1_local_);
+                workFlow_BufferKernel_ = true;
+                workFlow_use_BufferedKernel_ = false;
+                GADGET_CHECK_RETURN_FALSE(triggerByDimLessEqual(triggerDim1_, prev_dim1_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+            }
+            else if ( count_dim1_ > numOfAcquiredKSpaceForTriggerDim1 )
+            {
+                GDEBUG_CONDITION_STREAM(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << prev_dim1_local_);
+                workFlow_BufferKernel_ = false;
+                workFlow_use_BufferedKernel_ = true;
+                GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim1_, prev_dim1_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+            }
+        }
+        else
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << prev_dim1_local_);
+            GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim1_, prev_dim1_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+        }
+    }
+
+    if (  triggerDim1_!=DIM_NONE && triggerDim2_!=DIM_NONE  )
+    {
+        if ( numOfKSpace_triggerDim1_ > 0 )
+        {
+            if ( count_dim1_ <= numOfAcquiredKSpaceForTriggerDim1 ) // no more data will be available, so have to do the recon
+            {
+                GDEBUG_CONDITION_STREAM(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " <= " << prev_dim1_local_);
+                workFlow_BufferKernel_ = true;
+                workFlow_use_BufferedKernel_ = false;
+                GADGET_CHECK_RETURN_FALSE(triggerByDim1LessEqualDim2Equal(triggerDim1_, prev_dim1_local_, triggerDim2_, prev_dim2_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+            }
+            else if ( count_dim1_ > numOfAcquiredKSpaceForTriggerDim1 )
+            {
+                GDEBUG_CONDITION_STREAM(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << prev_dim1_local_);
+                workFlow_BufferKernel_ = false;
+                workFlow_use_BufferedKernel_ = true;
+                GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim1_, prev_dim1_local_, triggerDim2_, prev_dim2_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+            }
+        }
+        else
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "Trigger Dim2 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << prev_dim2_local_);
+            GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim2_, prev_dim2_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+        }
+    }
+
+    return true;
+}
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::checkStatus(uint64_t flag, int samples, 
+    bool& bIsKSpace, bool& bIsRef, bool& bIsNoise, bool& bIsPhaseCorr, bool& bIsReflect, bool& bIsOther,
+    bool& bIsNavigator, bool& bIsRTFeedback, bool& bIsHPFeedback, bool& bIsDummyScan)
+{
+    bIsNoise = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_NOISE_MEASUREMENT).isSet(flag);
+    bool is_ref = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_PARALLEL_CALIBRATION).isSet(flag);
+    bool is_ref_kspace = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_PARALLEL_CALIBRATION_AND_IMAGING).isSet(flag);
+    bIsReflect = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_REVERSE).isSet(flag);
+    bIsPhaseCorr = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_PHASECORR_DATA).isSet(flag);
+    bIsNavigator = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_NAVIGATION_DATA).isSet(flag);
+    bIsRTFeedback = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_RTFEEDBACK_DATA).isSet(flag);
+    bIsHPFeedback = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_HPFEEDBACK_DATA).isSet(flag);
+    bIsDummyScan = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_DUMMYSCAN_DATA).isSet(flag);
+
+    bIsKSpace = false;
+    bIsRef = false;
+    bIsOther = false;
+
+    if ( bIsNoise || bIsDummyScan )
+    {
+        return true;
+    }
+
+    if ( workOrder_.CalibMode_==ISMRMRD_noacceleration )
+    {
+        bIsKSpace = true;
+        bIsRef = false;
+    }
+
+    // in interleaved mode, only store the image data
+    if ( workOrder_.CalibMode_==ISMRMRD_interleaved )
+    {
+        bIsKSpace = true;
+        bIsRef = false;
+    }
+
+    // in embedded mode, kspace stores only the undersampled lines
+    // ref stores all lines used for reference
+    if ( workOrder_.CalibMode_==ISMRMRD_embedded )
+    {
+        if ( is_ref && !is_ref_kspace )
+        {
+            bIsKSpace = false;
+            bIsRef = true;
+        }
+
+        if ( !is_ref && is_ref_kspace )
+        {
+            bIsKSpace = true;
+            bIsRef = true;
+        }
+
+        if ( is_ref && is_ref_kspace )
+        {
+            bIsKSpace = true;
+            bIsRef = true;
+        }
+
+        if ( !is_ref && !is_ref_kspace )
+        {
+            bIsKSpace = true;
+            bIsRef = false;
+        }
+    }
+
+    // in separate mode
+    if ( workOrder_.CalibMode_==ISMRMRD_separate 
+    || workOrder_.CalibMode_==ISMRMRD_external )
+    {
+        if ( is_ref )
+        {
+            bIsKSpace = false;
+            bIsRef = true;
+        }
+
+        if ( !is_ref )
+        {
+            bIsKSpace = true;
+            bIsRef = false;
+        }
+    }
+
+    // store other data, e.g. AIF
+    // only for tpat
+    if ( !is_ref && !is_ref_kspace && (samples != meas_max_ro_) )
+    {
+        bIsOther = true;
+        bIsKSpace = false;
+        bIsRef = false;
+    }
+
+    return true;
+}
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::storeImageData(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2, bool isReflect)
+{
+    try
+    {
+        size_t ii;
+        size_t samples =  m1->getObjectPtr()->number_of_samples;
+        ISMRMRD::EncodingCounters idx = m1->getObjectPtr()->idx;
+
+        /*if ( workOrder_.retro_gated_images_ == 0 )
+        {*/
+            idx.segment = 0; // combine the segments
+        //}
+
+        if ( workOrder_.data_.get_number_of_elements() <= 0 )
+        {
+            meas_max_channel_ = m1->getObjectPtr()->active_channels;
+
+            size_t E1 = workOrder_.kSpaceMaxEncode1_+1;
+            size_t E2 = workOrder_.kSpaceMaxEncode2_+1;
+            if ( E2 == 0 ) E2 = 1;
+
+            if ( E1 < matrix_size_encoding_[1] ) E1 = matrix_size_encoding_[1];
+            if ( E2 < matrix_size_encoding_[2] ) E2 = matrix_size_encoding_[2];
+
+            if ( samples > meas_max_ro_ ) meas_max_ro_ = samples;
+
+            // find the loop counter boundary and allocate the buffer
+            GDEBUG_CONDITION_STREAM(verboseMode_, "[RO E1 Cha Slice E2 Con Phase Rep Set Seg Ave] = [" 
+                               << meas_max_ro_ 
+                               << " " << E1 
+                               << " " << meas_max_channel_ 
+                               << " " << meas_max_idx_.slice+1 
+                               << " " << E2 
+                               << " " << meas_max_idx_.contrast+1 
+                               << " " << meas_max_idx_.phase+1 
+                               << " " << meas_max_idx_.repetition+1 
+                               << " " << meas_max_idx_.set+1 
+                               << " " << meas_max_idx_.segment+1 
+                               << " " << meas_max_idx_.average+1 << "]");
+
+            dimensions_.clear();
+            dimensions_.push_back(meas_max_ro_);
+            dimensions_.push_back(E1);
+            dimensions_.push_back(meas_max_channel_);
+            dimensions_.push_back(meas_max_idx_.slice+1);
+            dimensions_.push_back(E2);
+            dimensions_.push_back(meas_max_idx_.contrast+1);
+            dimensions_.push_back(meas_max_idx_.phase+1);
+            dimensions_.push_back(meas_max_idx_.repetition+1);
+            dimensions_.push_back(meas_max_idx_.set+1);
+            dimensions_.push_back(meas_max_idx_.segment+1);
+            dimensions_.push_back(meas_max_idx_.average+1);
+
+            size_t N = dimensions_.size();
+            for ( ii=0; ii<N; ii++ )
+            {
+                GDEBUG_CONDITION_STREAM(verboseMode_, "dimensions_[" << ii << "] = " << dimensions_[ii]);
+            }
+
+            // allocate data buffer
+            try
+            {
+                workOrder_.data_.create(&dimensions_);
+                Gadgetron::clear(workOrder_.data_);
+
+                std::vector<size_t> reflect_dimensions_(dimensions_);
+                reflect_dimensions_[0] = 1;
+                reflect_dimensions_[2] = 1;
+                workOrder_.reflect_.create(&reflect_dimensions_);
+                Gadgetron::clear(workOrder_.reflect_);
+
+                std::vector<size_t> dim(dimensions_);
+                dim[0] = 1;
+                dim[2] = 1;
+                workOrder_.time_stamp_.create(dim);
+                Gadgetron::fill(workOrder_.time_stamp_, (real_value_type)(-1) );
+
+                workOrder_.physio_time_stamp_.create(dim);
+                Gadgetron::fill(workOrder_.physio_time_stamp_, (real_value_type)(-1) );
+            }
+            catch(...)
+            {
+                GDEBUG("Failed create buffer\n");
+                return false;
+            }
+
+            // allocate message buffer
+            size_t matrix_size[GT_DIM_NUM];
+            for ( ii=0; ii<GT_DIM_NUM; ii++ )
+            {
+                matrix_size[ii] = dimensions_[ii];
+            }
+
+            if (!(messageImage_ = new GtPlusGadgetImageArray(matrix_size))) 
+            {
+                GDEBUG("Failed create buffer\n");
+                return false;
+            }
+        }
+
+        // if necessary, shift the E1/E2 indexes
+        if ( workOrder_.start_E1_ > 0 )
+        {
+            idx.kspace_encode_step_1 += workOrder_.start_E1_;
+        }
+
+        if ( workOrder_.start_E2_ > 0 )
+        {
+            idx.kspace_encode_step_2 += workOrder_.start_E2_;
+        }
+
+        if ( idx.kspace_encode_step_1 >= dimensions_[1] )
+        {
+            return true;
+        }
+
+        size_t dataN = workOrder_.data_.get_number_of_elements();
+        std::complex<float>* b = workOrder_.data_.begin();
+        std::complex<float>* d = m2->getObjectPtr()->get_data_ptr();
+        if (samples != static_cast<size_t>(dimensions_[0])) 
+        {
+            GDEBUG("Wrong number of samples received\n");
+            return false;
+        }
+
+        //Copy the data for all the channels
+        hoNDArray<std::complex<float> > reflectBuf;
+        if ( isReflect )
+        {
+            reflectBuf.create(samples);
+        }
+
+        std::vector<size_t> pos(GT_DIM_NUM);
+        for (size_t c = 0; c < m1->getObjectPtr()->active_channels; c++) 
+        {
+            pos[0] = 0;
+            pos[1] = idx.kspace_encode_step_1;
+            pos[2] = c;
+            pos[3] = idx.slice;
+            pos[4] = idx.kspace_encode_step_2;
+            pos[5] = idx.contrast;
+            pos[6] = idx.phase;
+            pos[7] = idx.repetition;
+            pos[8] = idx.set;
+            pos[9] = idx.segment;
+            pos[10] = idx.average;
+            size_t offsetBuffer = workOrder_.data_.calculate_offset(pos);
+
+            if ( offsetBuffer >= dataN )
+            {
+                break;
+            }
+
+            if ( isReflect )
+            {
+                for ( size_t s=0; s<samples; s++ )
+                {
+                    reflectBuf(samples-1-s) = d[c*samples+s];
+                }
+
+                memcpy(b+offsetBuffer, reflectBuf.begin(), sizeof(std::complex<float>)*samples);
+            }
+            else
+            {
+                memcpy(b+offsetBuffer, d+c*samples, sizeof(std::complex<float>)*samples);
+            }
+
+            pos[2] = 0;
+            offsetBuffer = workOrder_.reflect_.calculate_offset(pos);
+            workOrder_.reflect_.at(offsetBuffer) = isReflect;
+        }
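+
+        // Note on the copy loop above: pos holds the 11 indices
+        // [RO E1 CHA SLC E2 CON PHS REP SET SEG AVE]; calculate_offset maps them to a
+        // linear offset into workOrder_.data_, and one full readout of `samples`
+        // complex floats is copied there per channel, reversed first when the
+        // ACQ_IS_REVERSE flag was set on the acquisition (isReflect == true).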
+
+        if ( !fillImageInfo(m1, messageImage_, idx) )
+        {
+            GDEBUG("Failed in fillImageInfo(m1, messageImage_, idx)\n");
+            return false;
+        }
+    }
+    catch(...)
+    {
+        GDEBUG("Errors in GtPlusAccumulatorWorkOrderTriggerGadget::storeImageData(...) ... \n");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::
+storeRefData(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2, bool isReflect)
+{
+    try
+    {
+        size_t ii;
+        size_t samples =  m1->getObjectPtr()->number_of_samples;
+        ISMRMRD::EncodingCounters idx = m1->getObjectPtr()->idx;
+
+        /*if ( workOrder_.retro_gated_images_ == 0 )
+        {*/
+            idx.segment = 0; // combine the segments
+        //}
+
+        if ( workOrder_.ref_.get_number_of_elements() <= 0 )
+        {
+            meas_max_channel_ = m1->getObjectPtr()->active_channels;
+
+            size_t E1 = workOrder_.kSpaceMaxEncode1_+1;
+            size_t E2 = workOrder_.kSpaceMaxEncode2_+1;
+            if ( E2 == 0 ) E2 = 1;
+
+            if ( E1 < matrix_size_encoding_[1] ) E1 = matrix_size_encoding_[1];
+            if ( E2 < matrix_size_encoding_[2] ) E2 = matrix_size_encoding_[2];
+
+            size_t RO = meas_max_ro_;
+
+            if ( (samples < meas_max_ro_) 
+                && (( workOrder_.CalibMode_==ISMRMRD_separate || workOrder_.CalibMode_==ISMRMRD_external )) )
+            {
+                RO = samples;
+            }
+
+            if ( RO < samples ) RO = samples;
+
+            // find the loop counter boundary and allocate the buffer
+            GDEBUG_CONDITION_STREAM(verboseMode_, "[RO E1 Cha Slice E2 Con Phase Rep Set Seg Ave] = [" 
+                               << RO 
+                               << " " << E1 
+                               << " " << meas_max_channel_ 
+                               << " " << meas_max_idx_.slice+1 
+                               << " " << E2 
+                               << " " << meas_max_idx_.contrast+1 
+                               << " " << meas_max_idx_.phase+1 
+                               << " " << meas_max_idx_.repetition+1 
+                               << " " << meas_max_idx_.set+1 
+                               << " " << meas_max_idx_.segment+1 
+                               << " " << meas_max_idx_.average+1 << "]");
+
+            dimensions_.clear();
+            dimensions_.push_back(RO);
+            dimensions_.push_back(E1);
+            dimensions_.push_back(meas_max_channel_);
+            dimensions_.push_back(meas_max_idx_.slice+1);
+            dimensions_.push_back(E2);
+            dimensions_.push_back(meas_max_idx_.contrast+1);
+            dimensions_.push_back(meas_max_idx_.phase+1);
+            dimensions_.push_back(meas_max_idx_.repetition+1);
+            dimensions_.push_back(meas_max_idx_.set+1);
+            dimensions_.push_back(meas_max_idx_.segment+1);
+            dimensions_.push_back(meas_max_idx_.average+1);
+
+            size_t N = dimensions_.size();
+            for ( ii=0; ii<N; ii++ )
+            {
+                GDEBUG_CONDITION_STREAM(verboseMode_, "ref dimensions_[" << ii << "] = " << dimensions_[ii]);
+            }
+
+            // allocate data buffer
+            try
+            {
+                workOrder_.ref_.create(&dimensions_);
+                Gadgetron::clear(workOrder_.ref_);
+
+                std::vector<size_t> reflect_dimensions_(dimensions_);
+                reflect_dimensions_[0] = 1;
+                reflect_dimensions_[2] = 1;
+                workOrder_.reflect_ref_.create(&reflect_dimensions_);
+                Gadgetron::clear(workOrder_.reflect_ref_);
+            }
+            catch(...)
+            {
+                GDEBUG("Failed create ref buffer\n");
+                return false;
+            }
+        }
+
+        // if necessary, shift the E1/E2 indexes
+        if ( workOrder_.CalibMode_ == ISMRMRD_embedded )
+        {
+            if ( workOrder_.start_E1_ > 0 )
+            {
+                idx.kspace_encode_step_1 += workOrder_.start_E1_;
+            }
+
+            if ( workOrder_.start_E2_ > 0 )
+            {
+                idx.kspace_encode_step_2 += workOrder_.start_E2_;
+            }
+        }
+
+        // for the separate or external mode, store the maximal idx
+        if ( (workOrder_.CalibMode_ == ISMRMRD_separate) || (workOrder_.CalibMode_ == ISMRMRD_external) )
+        {
+            if ( idx.kspace_encode_step_1 > meas_max_idx_ref_.kspace_encode_step_1 )    meas_max_idx_ref_.kspace_encode_step_1 = idx.kspace_encode_step_1;
+            if ( idx.kspace_encode_step_2 > meas_max_idx_ref_.kspace_encode_step_2 )    meas_max_idx_ref_.kspace_encode_step_2 = idx.kspace_encode_step_2;
+            if ( idx.average > meas_max_idx_ref_.average )                              meas_max_idx_ref_.average = idx.average;
+            if ( idx.slice > meas_max_idx_ref_.slice )                                  meas_max_idx_ref_.slice = idx.slice;
+            if ( idx.contrast > meas_max_idx_ref_.contrast )                            meas_max_idx_ref_.contrast = idx.contrast;
+            if ( idx.phase > meas_max_idx_ref_.phase )                                  meas_max_idx_ref_.phase = idx.phase;
+            if ( idx.repetition > meas_max_idx_ref_.repetition )                        meas_max_idx_ref_.repetition = idx.repetition;
+            if ( idx.set > meas_max_idx_ref_.set )                                      meas_max_idx_ref_.set = idx.set;
+            if ( idx.segment > meas_max_idx_ref_.segment )                              meas_max_idx_ref_.segment = idx.segment;
+
+            size_t ii;
+            for ( ii=0; ii<ISMRMRD::ISMRMRD_USER_INTS; ii++ )
+            {
+                if ( idx.user[ii] > meas_max_idx_ref_.user[ii] ) meas_max_idx_ref_.user[ii] = idx.user[ii];
+            }
+        }
+
+        size_t refN = workOrder_.ref_.get_number_of_elements();
+        std::complex<float>* b = workOrder_.ref_.begin();
+        std::complex<float>* d = m2->getObjectPtr()->get_data_ptr();
+        if (samples != static_cast<size_t>(workOrder_.ref_.get_size(0))) 
+        {
+            GDEBUG("Wrong number of samples received\n");
+            return false;
+        }
+
+        //Copy the data for all the channels
+        hoNDArray<std::complex<float> > reflectBuf;
+        if ( isReflect )
+        {
+            reflectBuf.create(samples);
+        }
+
+        std::vector<size_t> pos(GT_DIM_NUM);
+        for (uint16_t c = 0; c < m1->getObjectPtr()->active_channels; c++) 
+        {
+            pos[0] = 0;
+            pos[1] = idx.kspace_encode_step_1;
+            pos[2] = c;
+            pos[3] = idx.slice;
+            pos[4] = idx.kspace_encode_step_2;
+            pos[5] = idx.contrast;
+            pos[6] = idx.phase;
+            pos[7] = idx.repetition;
+            pos[8] = idx.set;
+            pos[9] = idx.segment;
+            pos[10] = idx.average;
+
+            size_t offsetBuffer = workOrder_.ref_.calculate_offset(pos);
+            if ( offsetBuffer >= refN )
+            {
+                break;
+            }
+
+            if ( isReflect )
+            {
+                for ( size_t s=0; s<samples; s++ )
+                {
+                    reflectBuf(samples-1-s) = d[c*samples+s];
+                }
+
+                memcpy(b+offsetBuffer, reflectBuf.begin(), sizeof(std::complex<float>)*samples);
+            }
+            else
+            {
+                memcpy(b+offsetBuffer, d+c*samples, sizeof(std::complex<float>)*samples);
+            }
+
+            pos[2] = 0;
+            offsetBuffer = workOrder_.reflect_ref_.calculate_offset(pos);
+            workOrder_.reflect_ref_.at(offsetBuffer) = isReflect;
+        }
+
+        // if it is embedded mode, store the acquisition and physio time stamp
+        if ( workOrder_.CalibMode_ == ISMRMRD_embedded )
+        {
+            ind_time_stamp_[0] = 0;
+            ind_time_stamp_[1] = idx.kspace_encode_step_1;
+            ind_time_stamp_[2] = 0;
+            ind_time_stamp_[3] = idx.slice;
+            ind_time_stamp_[4] = idx.kspace_encode_step_2;
+            ind_time_stamp_[5] = idx.contrast;
+            ind_time_stamp_[6] = idx.phase;
+            ind_time_stamp_[7] = idx.repetition;
+            ind_time_stamp_[8] = idx.set;
+            ind_time_stamp_[9] = idx.segment;
+            ind_time_stamp_[10] = idx.average;
+
+            workOrder_.time_stamp_(ind_time_stamp_) = (real_value_type)(m1->getObjectPtr()->acquisition_time_stamp) * timeStampResolution_;
+            workOrder_.physio_time_stamp_(ind_time_stamp_) = (real_value_type)(m1->getObjectPtr()->physiology_time_stamp[0]) * timeStampResolution_;
+        }
+    }
+    catch(...)
+    {
+        GDEBUG("Errors in GtPlusAccumulatorWorkOrderTriggerGadget::storeRefData(...) ... \n");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::
+fillBuffer(ReadOutBufferType& readOutBuffer, BufferType& buf, ReflectBufferType& reflectBuf)
+{
+    try
+    {
+        // find the maximal dimensions of all buffered ICE readouts
+        size_t numOfReadOuts = readOutBuffer.size();
+        ISMRMRD::EncodingCounters max_idx;
+        max_idx.kspace_encode_step_1 = 0;
+        max_idx.average = 0;
+        max_idx.slice = 0;
+        max_idx.kspace_encode_step_2 = 0;
+        max_idx.contrast = 0;
+        max_idx.phase = 0;
+        max_idx.repetition = 0;
+        max_idx.set = 0;
+        max_idx.segment = 0;
+        size_t max_channel = 0;
+        size_t max_col = 0;
+
+        size_t a;
+        for (a = 0; a < numOfReadOuts; a++) 
+        {
+            ISMRMRD::EncodingCounters idx = readOutBuffer[a].acqHead_.idx;
+
+            if ( readOutBuffer[a].acqHead_.number_of_samples > max_col ) 
+                max_col=readOutBuffer[a].acqHead_.number_of_samples;
+
+            if ( idx.kspace_encode_step_1 > max_idx.kspace_encode_step_1 ) 
+                max_idx.kspace_encode_step_1=idx.kspace_encode_step_1;
+
+            if ( idx.slice > max_idx.slice ) 
+                max_idx.slice = idx.slice;
+
+            if ( idx.kspace_encode_step_2 > max_idx.kspace_encode_step_2 ) 
+                max_idx.kspace_encode_step_2 = idx.kspace_encode_step_2;
+
+            if ( idx.contrast > max_idx.contrast ) 
+                max_idx.contrast = idx.contrast;
+
+            if ( idx.phase > max_idx.phase ) 
+                max_idx.phase = idx.phase;
+
+            if ( idx.repetition > max_idx.repetition ) 
+                max_idx.repetition = idx.repetition;
+
+            if ( idx.set > max_idx.set ) 
+                max_idx.set = idx.set;
+
+            if ( idx.segment > max_idx.segment ) 
+                max_idx.segment = idx.segment;
+
+            if ( idx.average > max_idx.average ) 
+                max_idx.average = idx.average;
+
+            if ( readOutBuffer[a].acqHead_.active_channels > max_channel ) 
+                max_channel = readOutBuffer[a].acqHead_.active_channels;
+        }
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "[RO E1 Cha Slice E2 Contrast Phase Rep Set Seg Ave] = [" 
+                               << max_col 
+                               << " " << max_idx.kspace_encode_step_1+1 
+                               << " " << max_channel 
+                               << " " << max_idx.slice+1 
+                               << " " << max_idx.kspace_encode_step_2+1 
+                               << " " << max_idx.contrast+1 
+                               << " " << max_idx.phase+1 
+                               << " " << max_idx.repetition+1 
+                               << " " << max_idx.set+1 
+                               << " " << max_idx.segment+1 
+                               << " " << max_idx.average+1 << "]");
+
+        // allocate buffer for data
+        std::vector<size_t> dims(GT_DIM_NUM);
+        dims[0] = max_col;
+        dims[1] = max_idx.kspace_encode_step_1+1;
+        dims[2] = max_channel;
+        dims[3] = max_idx.slice+1;
+        dims[4] = max_idx.kspace_encode_step_2+1;
+        dims[5] = max_idx.contrast+1;
+        dims[6] = max_idx.phase+1;
+        dims[7] = max_idx.repetition+1;
+        dims[8] = max_idx.set+1;
+        dims[9] = max_idx.segment+1;
+        dims[10] = max_idx.average+1;
+
+        try
+        {
+            buf.create(&dims);
+            Gadgetron::clear(buf);
+
+            std::vector<size_t> reflect_dims(dims);
+            reflect_dims[0] = 1;
+            reflect_dims[2] = 1;
+            reflectBuf.create(&reflect_dims);
+            Gadgetron::clear(reflectBuf);
+        }
+        catch(...)
+        {
+            GDEBUG("Failed create buffer\n");
+            return false;
+        }
+
+        std::complex<float>* b = buf.begin();
+
+        // copy the data
+        uint16_t c;
+        std::vector<size_t> pos(GT_DIM_NUM);
+
+        for ( a=0; a<numOfReadOuts; a++) 
+        {
+            ISMRMRD::EncodingCounters idx = readOutBuffer[a].acqHead_.idx;
+            std::complex<float>* d = const_cast<std::complex<float>*>(readOutBuffer[a].data_.begin());
+
+            for ( c=0; c<readOutBuffer[a].acqHead_.active_channels; c++) 
+            {
+                pos[0] = 0;
+                pos[1] = idx.kspace_encode_step_1;
+                pos[2] = c;
+                pos[3] = idx.slice;
+                pos[4] = idx.kspace_encode_step_2;
+                pos[5] = idx.contrast;
+                pos[6] = idx.phase;
+                pos[7] = idx.repetition;
+                pos[8] = idx.set;
+                pos[9] = idx.segment;
+                pos[10] = idx.average;
+                long long offsetBuffer = buf.calculate_offset(pos);
+
+                memcpy(b+offsetBuffer, d+c*readOutBuffer[a].acqHead_.number_of_samples, sizeof(std::complex<float>)*readOutBuffer[a].acqHead_.number_of_samples);
+
+                pos[2] = 0;
+                offsetBuffer = reflectBuf.calculate_offset(pos);
+                reflectBuf.at(offsetBuffer) = readOutBuffer[a].isReflect_;
+            }
+        }
+    }
+    catch(...)
+    {
+        GDEBUG("Errors in GtPlusAccumulatorWorkOrderTriggerGadget::fillBuffer(...) ... \n");
+        return false;
+    }
+
+    return true;
+}
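+// fillBuffer mirrors storeImageData for the auxiliary readout buffers (phase
+// correction, noise and "other" data): it scans the buffered acquisition headers
+// for the maximal encoding counters, allocates an
+// [RO E1 CHA SLC E2 CON PHS REP SET SEG AVE] array, and copies every buffered
+// readout into its slot while recording the reflect flag alongside it.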
+
+//XUE-TODO: Functions DO NOT return booleans in the Gadgetron
+bool GtPlusAccumulatorWorkOrderTriggerGadget::fillImageInfo(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GtPlusGadgetImageArray* messageImage, const ISMRMRD::EncodingCounters& idx)
+{
+
+    try
+    {
+        // fill the message info
+        size_t offset = messageImage->get_offset(idx.slice, idx.kspace_encode_step_2, idx.contrast, idx.phase, idx.repetition, idx.set, idx.segment, idx.average);
+
+        if( (offset >= messageImage->max_num_of_images_)
+            || (idx.slice>=messageImage->matrix_size[3])
+            || (idx.kspace_encode_step_2>=messageImage->matrix_size[4])
+            || (idx.contrast>=messageImage->matrix_size[5])
+            || (idx.phase>=messageImage->matrix_size[6])
+            || (idx.repetition>=messageImage->matrix_size[7])
+            || (idx.set>=messageImage->matrix_size[8])
+            || (idx.segment>=messageImage->matrix_size[9])
+            || (idx.average>=messageImage->matrix_size[10]) )
+        {
+            GWARN_STREAM("Incoming image is over the boundary of buffer [SLC E2 CON PHS REP SET SEG AVE] = [ " 
+                                                                            << idx.slice << " " << idx.kspace_encode_step_2 << " " 
+                                                                            << idx.contrast << " " << idx.phase << " " 
+                                                                            << idx.repetition << " " << idx.set << " " 
+                                                                            << idx.segment << " " << idx.average << " ] ");
+            return true;
+        }
+
+        // if it is the first acq in a slice, fill in all information
+        bool is_first_acq_in_slice = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_FIRST_IN_SLICE).isSet(m1->getObjectPtr()->flags);
+
+        /*if ( is_first_acq_in_slice 
+            || ( messageImage->imageArray_[offset].version==0 
+                    && messageImage->imageArray_[offset].flags==0 
+                    && messageImage->imageArray_[offset].measurement_uid==0 ) )*/
+        if ( messageImage->imageArray_[offset].version==0 
+                    && messageImage->imageArray_[offset].flags==0 
+                    && messageImage->imageArray_[offset].measurement_uid==0 )
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "--> buffer image header - offset = " << offset << " - [SLC E2 CON PHS REP SET SEG AVE] = [" 
+                                                                      << idx.slice << " " 
+                                                                      << idx.kspace_encode_step_2 << " " 
+                                                                      << idx.contrast << " " 
+                                                                      << idx.phase << " " 
+                                                                      << idx.repetition << " " 
+                                                                      << idx.set << " " 
+                                                                      << idx.segment << " " 
+                                                                      << idx.average << "]");
+
+            messageImage->imageArray_[offset].version = m1->getObjectPtr()->version;
+            messageImage->imageArray_[offset].flags = m1->getObjectPtr()->flags;
+            messageImage->imageArray_[offset].measurement_uid = m1->getObjectPtr()->measurement_uid;
+
+            //messageImage->imageArray_[offset].matrix_size[0] = dimensions_[0];
+            //messageImage->imageArray_[offset].matrix_size[1] = dimensions_[1];
+            //messageImage->imageArray_[offset].matrix_size[2] = dimensions_[2];
+
+            messageImage->imageArray_[offset].set_matrix_size(0, dimensions_[0]);
+            messageImage->imageArray_[offset].set_matrix_size(1, dimensions_[1]);
+            messageImage->imageArray_[offset].set_matrix_size(2, dimensions_[2]);
+
+            messageImage->imageArray_[offset].field_of_view[0] = field_of_view_recon_[0];
+            messageImage->imageArray_[offset].field_of_view[1] = field_of_view_recon_[1];
+            messageImage->imageArray_[offset].field_of_view[2] = field_of_view_recon_[2];
+
+            messageImage->imageArray_[offset].channels = m1->getObjectPtr()->active_channels;
+
+            messageImage->imageArray_[offset].position[0] = m1->getObjectPtr()->position[0];
+            messageImage->imageArray_[offset].position[1] = m1->getObjectPtr()->position[1];
+            messageImage->imageArray_[offset].position[2] = m1->getObjectPtr()->position[2];
+
+            //messageImage->imageArray_[offset].quaternion[0] = m1->getObjectPtr()->quaternion[0];
+            //messageImage->imageArray_[offset].quaternion[1] = m1->getObjectPtr()->quaternion[1];
+            //messageImage->imageArray_[offset].quaternion[2] = m1->getObjectPtr()->quaternion[2];
+            //messageImage->imageArray_[offset].quaternion[3] = m1->getObjectPtr()->quaternion[3];
+
+            messageImage->imageArray_[offset].read_dir[0] = m1->getObjectPtr()->read_dir[0];
+            messageImage->imageArray_[offset].read_dir[1] = m1->getObjectPtr()->read_dir[1];
+            messageImage->imageArray_[offset].read_dir[2] = m1->getObjectPtr()->read_dir[2];
+
+            messageImage->imageArray_[offset].phase_dir[0] = m1->getObjectPtr()->phase_dir[0];
+            messageImage->imageArray_[offset].phase_dir[1] = m1->getObjectPtr()->phase_dir[1];
+            messageImage->imageArray_[offset].phase_dir[2] = m1->getObjectPtr()->phase_dir[2];
+
+            messageImage->imageArray_[offset].slice_dir[0] = m1->getObjectPtr()->slice_dir[0];
+            messageImage->imageArray_[offset].slice_dir[1] = m1->getObjectPtr()->slice_dir[1];
+            messageImage->imageArray_[offset].slice_dir[2] = m1->getObjectPtr()->slice_dir[2];
+
+            messageImage->imageArray_[offset].patient_table_position[0] = m1->getObjectPtr()->patient_table_position[0];
+            messageImage->imageArray_[offset].patient_table_position[1] = m1->getObjectPtr()->patient_table_position[1];
+            messageImage->imageArray_[offset].patient_table_position[2] = m1->getObjectPtr()->patient_table_position[2];
+
+            messageImage->imageArray_[offset].average = m1->getObjectPtr()->idx.average;
+            messageImage->imageArray_[offset].slice = m1->getObjectPtr()->idx.slice;
+            messageImage->imageArray_[offset].contrast = m1->getObjectPtr()->idx.contrast;
+            messageImage->imageArray_[offset].phase = m1->getObjectPtr()->idx.phase;
+            messageImage->imageArray_[offset].repetition = m1->getObjectPtr()->idx.repetition;
+            messageImage->imageArray_[offset].set = m1->getObjectPtr()->idx.set;
+
+            messageImage->imageArray_[offset].acquisition_time_stamp = m1->getObjectPtr()->acquisition_time_stamp;
+
+            messageImage->imageArray_[offset].physiology_time_stamp[0] = m1->getObjectPtr()->physiology_time_stamp[0];
+            messageImage->imageArray_[offset].physiology_time_stamp[1] = m1->getObjectPtr()->physiology_time_stamp[1];
+            messageImage->imageArray_[offset].physiology_time_stamp[2] = m1->getObjectPtr()->physiology_time_stamp[2];
+
+            messageImage->imageArray_[offset].data_type = ISMRMRD::ISMRMRD_CXFLOAT;
+
+            messageImage->imageArray_[offset].image_type = ISMRMRD::ISMRMRD_IMTYPE_MAGNITUDE;
+
+            messageImage->imageArray_[offset].image_index = (uint16_t)(++image_counter_);
+            messageImage->imageArray_[offset].image_series_index = (uint16_t)image_series_;
+
+            // need to store the free user parameters
+            memcpy(messageImage->imageArray_[offset].user_int, m1->getObjectPtr()->user_int, sizeof(int32_t)*8);
+            memcpy(messageImage->imageArray_[offset].user_float, m1->getObjectPtr()->user_float, sizeof(float)*8);
+        }
+
+        // whether or not this acq is the first in a slice, we need to fill the TimeStamps and PMUTimeStamps
+        if ( idx.kspace_encode_step_1 < messageImage->imageArray_[offset].time_stamps.size() )
+        {
+            messageImage->imageArray_[offset].time_stamps[idx.kspace_encode_step_1] = m1->getObjectPtr()->acquisition_time_stamp;
+            messageImage->imageArray_[offset].pmu_time_stamps[idx.kspace_encode_step_1] = m1->getObjectPtr()->physiology_time_stamp[0];
+
+            ind_time_stamp_[0] = 0;
+            ind_time_stamp_[1] = idx.kspace_encode_step_1;
+            ind_time_stamp_[2] = 0;
+            ind_time_stamp_[3] = idx.slice;
+            ind_time_stamp_[4] = idx.kspace_encode_step_2;
+            ind_time_stamp_[5] = idx.contrast;
+            ind_time_stamp_[6] = idx.phase;
+            ind_time_stamp_[7] = idx.repetition;
+            ind_time_stamp_[8] = idx.set;
+            ind_time_stamp_[9] = idx.segment;
+            ind_time_stamp_[10] = idx.average;
+
+            workOrder_.time_stamp_(ind_time_stamp_) = (real_value_type)(m1->getObjectPtr()->acquisition_time_stamp) * timeStampResolution_;
+            workOrder_.physio_time_stamp_(ind_time_stamp_) = (real_value_type)(m1->getObjectPtr()->physiology_time_stamp[0]) * timeStampResolution_;
+        }
+    }
+    catch(...)
+    {
+        GDEBUG("Errors in GtPlusAccumulatorWorkOrderTriggerGadget::fillImageInfo(...) ... \n");
+        return false;
+    }
+
+    return true;
+}
+
+size_t GtPlusAccumulatorWorkOrderTriggerGadget::
+computeEncodedSizeE1(size_t centerE1, size_t maxE1)
+{
+    size_t E1;
+    if ( (maxE1+1)%2 == 0 )
+    {
+        E1 = 2*centerE1;
+    }
+    else
+    {
+        E1 = 2*centerE1+1;
+    }
+
+    return E1;
+}
+
+size_t GtPlusAccumulatorWorkOrderTriggerGadget::
+computeEncodedSizeE2(size_t centerE2, size_t maxE2)
+{
+    size_t E2;
+    if ( (maxE2+1)%2 == 0 )
+    {
+        E2 = 2*centerE2;
+    }
+    else
+    {
+        E2 = 2*centerE2+1;
+    }
+
+    return E2;
+}
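+// Worked example for the two helpers above: with centerE1 = 64 and maxE1 = 127,
+// (maxE1+1) % 2 == 0, so the encoded size is E1 = 2*64 = 128;
+// with centerE1 = 64 and maxE1 = 128, (maxE1+1) % 2 == 1, so E1 = 2*64 + 1 = 129.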
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::
+triggerByDimEqual(Gadgetron::ISMRMRDDIM& triggerDim, size_t value, bool workFlow_BufferKernel_, bool workFlow_use_BufferedKernel_)
+{
+    try
+    {
+        GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusAccumulatorWorkOrderTriggerGadget - triggerByDimEqual(triggerDim, value) ... ");
+
+        GadgetContainerMessage<GtPlusGadgetImageArray>* cm1 = new GadgetContainerMessage<GtPlusGadgetImageArray>();
+        GadgetContainerMessage< WorkOrderType >* cm2 = new GadgetContainerMessage< WorkOrderType >();
+        cm1->cont(cm2);
+
+        workOrder_.duplicate(*cm2->getObjectPtr());
+        cm2->getObjectPtr()->workFlow_BufferKernel_ = workFlow_BufferKernel_;
+        cm2->getObjectPtr()->workFlow_use_BufferedKernel_ = workFlow_use_BufferedKernel_;
+
+        bool lessEqual = false;
+
+        // copy the image content
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim(workOrder_.data_, cm2->getObjectPtr()->data_, triggerDim, value, lessEqual));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForDim(workOrder_.reflect_, cm2->getObjectPtr()->reflect_, triggerDim, value, lessEqual));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<real_value_type>().extractSubArrayForDim(workOrder_.time_stamp_, cm2->getObjectPtr()->time_stamp_, triggerDim, value, lessEqual));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<real_value_type>().extractSubArrayForDim(workOrder_.physio_time_stamp_, cm2->getObjectPtr()->physio_time_stamp_, triggerDim, value, lessEqual));
+
+        // copy the ref
+        if ( workOrder_.ref_.get_number_of_elements()>0 
+                && workOrder_.ref_.get_number_of_dimensions()==workOrder_.data_.get_number_of_dimensions() )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim(workOrder_.ref_, cm2->getObjectPtr()->ref_, triggerDim, value, lessEqual));
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForDim(workOrder_.reflect_ref_, cm2->getObjectPtr()->reflect_ref_, triggerDim, value, lessEqual));
+
+            // for separate and external mode, further truncate the reference data
+            if ( (workOrder_.CalibMode_ == ISMRMRD_separate) || (workOrder_.CalibMode_ == ISMRMRD_external) )
+            {
+                hoNDArray<ValueType> ref;
+                hoNDArray<unsigned short> reflect_ref;
+
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForMaxEncodingCounters(cm2->getObjectPtr()->ref_, ref, meas_max_idx_ref_));
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForMaxEncodingCounters(cm2->getObjectPtr()->reflect_ref_, reflect_ref, meas_max_idx_ref_));
+
+                cm2->getObjectPtr()->ref_ = ref;
+                cm2->getObjectPtr()->reflect_ref_ = reflect_ref;
+            }
+        }
+
+        // copy the message image array
+        GADGET_CHECK_RETURN_FALSE(messageImage_->extractGadgetImageArrayEqual(triggerDim, value, *(cm1->getObjectPtr()) ));
+
+        if (!phaseCorrBuffer_.empty())
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusAccumulatorWorkOrderTriggerGadget - phase correction signal found : " << phaseCorrBuffer_.size());
+
+            if ( !fillBuffer(phaseCorrBuffer_, workOrder_.phaseCorr_, workOrder_.reflect_phaseCorr_) )
+            {
+                GDEBUG("fillBuffer(phaseCorrBuffer_) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            cm2->getObjectPtr()->phaseCorr_ = workOrder_.phaseCorr_;
+            cm2->getObjectPtr()->reflect_phaseCorr_ = workOrder_.reflect_phaseCorr_;
+        }
+
+        if (!noiseBuffer_.empty())
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusAccumulatorGadget - noise signal found : " << noiseBuffer_.size());
+
+            ReflectBufferType tmpBuf;
+            if ( !fillBuffer(noiseBuffer_, workOrder_.noise_, tmpBuf) )
+            {
+                GDEBUG("fillBuffer(noiseBuffer_) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            cm2->getObjectPtr()->noise_ = workOrder_.noise_;
+        }
+
+        if (!otherBuffer_.empty())
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusAccumulatorGadget - other signal found : " << otherBuffer_.size());
+
+            if ( !fillBuffer(otherBuffer_, workOrder_.other_, workOrder_.reflect_other_) )
+            {
+                GDEBUG("fillBuffer(otherBuffer_) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim(workOrder_.other_, cm2->getObjectPtr()->other_, triggerDim, value, lessEqual));
+
+            cm2->getObjectPtr()->reflect_other_ = workOrder_.reflect_other_;
+        }
+
+        // send to next gadget
+        if (this->next()->putq(cm1) < 0) 
+        {
+            return false;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GtPlusAccumulatorWorkOrderTriggerGadget::triggerByDimEqual(triggerDim, value) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::
+triggerByDimLessEqual(Gadgetron::ISMRMRDDIM& triggerDim, size_t value, bool workFlow_BufferKernel_, bool workFlow_use_BufferedKernel_)
+{
+    try
+    {
+        GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusAccumulatorWorkOrderTriggerGadget - triggerByDimLessEqual(triggerDim, value) ... ");
+
+        GadgetContainerMessage<GtPlusGadgetImageArray>* cm1 = new GadgetContainerMessage<GtPlusGadgetImageArray>();
+        GadgetContainerMessage< WorkOrderType >* cm2 = new GadgetContainerMessage< WorkOrderType >();
+        cm1->cont(cm2);
+
+        workOrder_.duplicate(*cm2->getObjectPtr());
+        cm2->getObjectPtr()->workFlow_BufferKernel_ = workFlow_BufferKernel_;
+        cm2->getObjectPtr()->workFlow_use_BufferedKernel_ = workFlow_use_BufferedKernel_;
+
+        bool lessEqual = true;
+
+        // copy the image content
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim(workOrder_.data_, cm2->getObjectPtr()->data_, triggerDim, value, lessEqual));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForDim(workOrder_.reflect_, cm2->getObjectPtr()->reflect_, triggerDim, value, lessEqual));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<real_value_type>().extractSubArrayForDim(workOrder_.time_stamp_, cm2->getObjectPtr()->time_stamp_, triggerDim, value, lessEqual));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<real_value_type>().extractSubArrayForDim(workOrder_.physio_time_stamp_, cm2->getObjectPtr()->physio_time_stamp_, triggerDim, value, lessEqual));
+
+        // copy the ref
+        if ( workOrder_.ref_.get_number_of_elements()>0 
+                && workOrder_.ref_.get_number_of_dimensions()==workOrder_.data_.get_number_of_dimensions() )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim(workOrder_.ref_, cm2->getObjectPtr()->ref_, triggerDim, value, lessEqual));
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForDim(workOrder_.reflect_ref_, cm2->getObjectPtr()->reflect_ref_, triggerDim, value, lessEqual));
+
+            // for separate and external mode, further truncate the reference data
+            if ( (workOrder_.CalibMode_ == ISMRMRD_separate) || (workOrder_.CalibMode_ == ISMRMRD_external) )
+            {
+                hoNDArray<ValueType> ref;
+                hoNDArray<unsigned short> reflect_ref;
+
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForMaxEncodingCounters(cm2->getObjectPtr()->ref_, ref, meas_max_idx_ref_));
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForMaxEncodingCounters(cm2->getObjectPtr()->reflect_ref_, reflect_ref, meas_max_idx_ref_));
+
+                cm2->getObjectPtr()->ref_ = ref;
+                cm2->getObjectPtr()->reflect_ref_ = reflect_ref;
+            }
+        }
+
+        // copy the message image array
+        GADGET_CHECK_RETURN_FALSE(messageImage_->extractGadgetImageArrayLessEqual(triggerDim, value, *(cm1->getObjectPtr()) ));
+
+        if (!phaseCorrBuffer_.empty())
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusAccumulatorWorkOrderTriggerGadget - phase correction signal found : " << phaseCorrBuffer_.size());
+
+            if ( !fillBuffer(phaseCorrBuffer_, workOrder_.phaseCorr_, workOrder_.reflect_phaseCorr_) )
+            {
+                GDEBUG("fillBuffer(phaseCorrBuffer_) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            cm2->getObjectPtr()->phaseCorr_ = workOrder_.phaseCorr_;
+            cm2->getObjectPtr()->reflect_phaseCorr_ = workOrder_.reflect_phaseCorr_;
+        }
+
+        if (!noiseBuffer_.empty())
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusAccumulatorGadget - noise signal found : " << noiseBuffer_.size());
+
+            ReflectBufferType tmpBuf;
+            if ( !fillBuffer(noiseBuffer_, workOrder_.noise_, tmpBuf) )
+            {
+                GDEBUG("fillBuffer(noiseBuffer_) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            cm2->getObjectPtr()->noise_ = workOrder_.noise_;
+        }
+
+        if (!otherBuffer_.empty())
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusAccumulatorGadget - other signal found : " << otherBuffer_.size());
+
+            if ( !fillBuffer(otherBuffer_, workOrder_.other_, workOrder_.reflect_other_) )
+            {
+                GDEBUG("fillBuffer(otherBuffer_) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim(workOrder_.other_, cm2->getObjectPtr()->other_, triggerDim, value, lessEqual));
+
+            cm2->getObjectPtr()->reflect_other_ = workOrder_.reflect_other_;
+        }
+
+        // send to next gadget
+        if (this->next()->putq(cm1) < 0) 
+        {
+            return false;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GtPlusAccumulatorWorkOrderTriggerGadget::triggerByDimLessEqual(triggerDim, value) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::
+triggerByDimEqual(Gadgetron::ISMRMRDDIM& triggerDim1, size_t value1, Gadgetron::ISMRMRDDIM& triggerDim2, size_t value2, bool workFlow_BufferKernel_, bool workFlow_use_BufferedKernel_)
+{
+    try
+    {
+        GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusAccumulatorWorkOrderTriggerGadget - triggerByDimEqual(triggerDim1, value1, triggerDim2, value2) ... ");
+
+        GadgetContainerMessage<GtPlusGadgetImageArray>* cm1 = new GadgetContainerMessage<GtPlusGadgetImageArray>();
+        GadgetContainerMessage< WorkOrderType >* cm2 = new GadgetContainerMessage< WorkOrderType >();
+        cm1->cont(cm2);
+
+        workOrder_.duplicate(*cm2->getObjectPtr());
+        cm2->getObjectPtr()->workFlow_BufferKernel_ = workFlow_BufferKernel_;
+        cm2->getObjectPtr()->workFlow_use_BufferedKernel_ = workFlow_use_BufferedKernel_;
+
+        bool lessEqual = false;
+
+        // copy the image content
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim(workOrder_.data_, cm2->getObjectPtr()->data_, triggerDim1, value1, triggerDim2, value2, lessEqual));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForDim(workOrder_.reflect_, cm2->getObjectPtr()->reflect_, triggerDim1, value1, triggerDim2, value2, lessEqual));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<real_value_type>().extractSubArrayForDim(workOrder_.time_stamp_, cm2->getObjectPtr()->time_stamp_, triggerDim1, value1, triggerDim2, value2, lessEqual));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<real_value_type>().extractSubArrayForDim(workOrder_.physio_time_stamp_, cm2->getObjectPtr()->physio_time_stamp_, triggerDim1, value1, triggerDim2, value2, lessEqual));
+
+        // copy the ref
+        if ( workOrder_.ref_.get_number_of_elements()>0 
+                && workOrder_.ref_.get_number_of_dimensions()==workOrder_.data_.get_number_of_dimensions() )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim(workOrder_.ref_, cm2->getObjectPtr()->ref_, triggerDim1, value1, triggerDim2, value2, lessEqual));
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForDim(workOrder_.reflect_ref_, cm2->getObjectPtr()->reflect_ref_, triggerDim1, value1, triggerDim2, value2, lessEqual));
+
+            // for separate and external mode, further truncate the reference data
+            if ( (workOrder_.CalibMode_ == ISMRMRD_separate) || (workOrder_.CalibMode_ == ISMRMRD_external) )
+            {
+                hoNDArray<ValueType> ref;
+                hoNDArray<unsigned short> reflect_ref;
+
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForMaxEncodingCounters(cm2->getObjectPtr()->ref_, ref, meas_max_idx_ref_));
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForMaxEncodingCounters(cm2->getObjectPtr()->reflect_ref_, reflect_ref, meas_max_idx_ref_));
+
+                cm2->getObjectPtr()->ref_ = ref;
+                cm2->getObjectPtr()->reflect_ref_ = reflect_ref;
+            }
+        }
+
+        // copy the message image array
+        GADGET_CHECK_RETURN_FALSE(messageImage_->extractGadgetImageArrayEqual(triggerDim1, value1, triggerDim2, value2, *(cm1->getObjectPtr()) ));
+
+        if (!phaseCorrBuffer_.empty())
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusAccumulatorWorkOrderTriggerGadget - phase correction signal found : " << phaseCorrBuffer_.size());
+
+            if ( !fillBuffer(phaseCorrBuffer_, workOrder_.phaseCorr_, workOrder_.reflect_phaseCorr_) )
+            {
+                GDEBUG("fillBuffer(phaseCorrBuffer_) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            cm2->getObjectPtr()->phaseCorr_ = workOrder_.phaseCorr_;
+            cm2->getObjectPtr()->reflect_phaseCorr_ = workOrder_.reflect_phaseCorr_;
+        }
+
+        if (!noiseBuffer_.empty())
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusAccumulatorGadget - noise signal found : " << noiseBuffer_.size());
+
+            ReflectBufferType tmpBuf;
+            if ( !fillBuffer(noiseBuffer_, workOrder_.noise_, tmpBuf) )
+            {
+                GDEBUG("fillBuffer(noiseBuffer_) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            cm2->getObjectPtr()->noise_ = workOrder_.noise_;
+        }
+
+        if (!otherBuffer_.empty())
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusAccumulatorGadget - other signal found : " << otherBuffer_.size());
+
+            if ( !fillBuffer(otherBuffer_, workOrder_.other_, workOrder_.reflect_other_) )
+            {
+                GDEBUG("fillBuffer(otherBuffer_) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim(workOrder_.other_, cm2->getObjectPtr()->other_, triggerDim1, value1, false));
+
+            cm2->getObjectPtr()->reflect_other_ = workOrder_.reflect_other_;
+        }
+
+        // send to next gadget
+        if (this->next()->putq(cm1) < 0) 
+        {
+            return false;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GtPlusAccumulatorWorkOrderTriggerGadget::triggerByDimEqual(triggerDim1, value1, triggerDim2, value2) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::
+triggerByDim1LessEqualDim2Equal(Gadgetron::ISMRMRDDIM& triggerDim1, size_t value1, Gadgetron::ISMRMRDDIM& triggerDim2, size_t value2, bool workFlow_BufferKernel_, bool workFlow_use_BufferedKernel_)
+{
+    try
+    {
+        GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusAccumulatorWorkOrderTriggerGadget - triggerByDim1LessEqualDim2Equal(triggerDim1, value1, triggerDim2, value2) ... ");
+
+        GadgetContainerMessage<GtPlusGadgetImageArray>* cm1 = new GadgetContainerMessage<GtPlusGadgetImageArray>();
+        GadgetContainerMessage< WorkOrderType >* cm2 = new GadgetContainerMessage< WorkOrderType >();
+
+        workOrder_.duplicate(*cm2->getObjectPtr());
+        cm2->getObjectPtr()->workFlow_BufferKernel_ = workFlow_BufferKernel_;
+        cm2->getObjectPtr()->workFlow_use_BufferedKernel_ = workFlow_use_BufferedKernel_;
+
+        cm1->cont(cm2);
+
+        // copy the image content
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim1LessEqualDim2Equal(workOrder_.data_, cm2->getObjectPtr()->data_, triggerDim1, value1, triggerDim2, value2));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForDim1LessEqualDim2Equal(workOrder_.reflect_, cm2->getObjectPtr()->reflect_, triggerDim1, value1, triggerDim2, value2));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<real_value_type>().extractSubArrayForDim1LessEqualDim2Equal(workOrder_.time_stamp_, cm2->getObjectPtr()->time_stamp_, triggerDim1, value1, triggerDim2, value2));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<real_value_type>().extractSubArrayForDim1LessEqualDim2Equal(workOrder_.physio_time_stamp_, cm2->getObjectPtr()->physio_time_stamp_, triggerDim1, value1, triggerDim2, value2));
+
+        // copy the ref
+        if ( workOrder_.ref_.get_number_of_elements()>0 
+                && workOrder_.ref_.get_number_of_dimensions()==workOrder_.data_.get_number_of_dimensions() )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim1LessEqualDim2Equal(workOrder_.ref_, cm2->getObjectPtr()->ref_, triggerDim1, value1, triggerDim2, value2));
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForDim1LessEqualDim2Equal(workOrder_.reflect_ref_, cm2->getObjectPtr()->reflect_ref_, triggerDim1, value1, triggerDim2, value2));
+
+            // for separate and external mode, further truncate the reference data
+            if ( (workOrder_.CalibMode_ == ISMRMRD_separate) || (workOrder_.CalibMode_ == ISMRMRD_external) )
+            {
+                hoNDArray<ValueType> ref;
+                hoNDArray<unsigned short> reflect_ref;
+
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForMaxEncodingCounters(cm2->getObjectPtr()->ref_, ref, meas_max_idx_ref_));
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForMaxEncodingCounters(cm2->getObjectPtr()->reflect_ref_, reflect_ref, meas_max_idx_ref_));
+
+                cm2->getObjectPtr()->ref_ = ref;
+                cm2->getObjectPtr()->reflect_ref_ = reflect_ref;
+            }
+        }
+
+        // copy the message image array
+        GADGET_CHECK_RETURN_FALSE(messageImage_->extractGadgetImageArray_Dim1LessEqual_Dim2Equal(triggerDim1, value1, triggerDim2, value2, *(cm1->getObjectPtr()) ));
+
+        if (!phaseCorrBuffer_.empty())
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusAccumulatorWorkOrderTriggerGadget - phase correction signal found : " << phaseCorrBuffer_.size());
+
+            if ( !fillBuffer(phaseCorrBuffer_, workOrder_.phaseCorr_, workOrder_.reflect_phaseCorr_) )
+            {
+                GDEBUG("fillBuffer(phaseCorrBuffer_) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            cm2->getObjectPtr()->phaseCorr_ = workOrder_.phaseCorr_;
+            cm2->getObjectPtr()->reflect_phaseCorr_ = workOrder_.reflect_phaseCorr_;
+        }
+
+        if (!noiseBuffer_.empty())
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusAccumulatorGadget - noise signal found : " << noiseBuffer_.size());
+
+            ReflectBufferType tmpBuf;
+            if ( !fillBuffer(noiseBuffer_, workOrder_.noise_, tmpBuf) )
+            {
+                GDEBUG("fillBuffer(noiseBuffer_) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            cm2->getObjectPtr()->noise_ = workOrder_.noise_;
+        }
+
+        if (!otherBuffer_.empty())
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusAccumulatorGadget - other signal found : " << otherBuffer_.size());
+
+            if ( !fillBuffer(otherBuffer_, workOrder_.other_, workOrder_.reflect_other_) )
+            {
+                GDEBUG("fillBuffer(otherBuffer_) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim(workOrder_.other_, cm2->getObjectPtr()->other_, triggerDim1, value1, true));
+
+            cm2->getObjectPtr()->reflect_other_ = workOrder_.reflect_other_;
+        }
+
+        // send to next gadget
+        if (this->next()->putq(cm1) < 0) 
+        {
+            return false;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GtPlusAccumulatorWorkOrderTriggerGadget::triggerByDim1LessEqualDim2Equal(triggerDim1, value1, triggerDim2, value2) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::triggerWorkOrderAllInClose()
+{
+    try
+    {
+        GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusAccumulatorWorkOrderTriggerGadget - triggerWorkOrderAllInClose ... ");
+
+        GadgetContainerMessage<GtPlusGadgetImageArray>* cm1 = new GadgetContainerMessage<GtPlusGadgetImageArray>();
+        GadgetContainerMessage< WorkOrderType >* cm2 = new GadgetContainerMessage< WorkOrderType >();
+
+        workOrder_.duplicate(*cm2->getObjectPtr());
+        cm2->getObjectPtr()->workFlow_BufferKernel_ = false;
+        cm2->getObjectPtr()->workFlow_use_BufferedKernel_ = false;
+
+        cm1->cont(cm2);
+
+        // copy the image content
+        cm2->getObjectPtr()->data_ = workOrder_.data_;
+        cm2->getObjectPtr()->time_stamp_ = workOrder_.time_stamp_;
+        cm2->getObjectPtr()->physio_time_stamp_ = workOrder_.physio_time_stamp_;
+        cm2->getObjectPtr()->reflect_ = workOrder_.reflect_;
+
+        // copy the ref
+        cm2->getObjectPtr()->ref_ = workOrder_.ref_;
+        cm2->getObjectPtr()->reflect_ref_ = workOrder_.reflect_ref_;
+
+        // for separate and external mode, further truncate the reference data
+        if ( (workOrder_.CalibMode_ == ISMRMRD_separate) || (workOrder_.CalibMode_ == ISMRMRD_external) )
+        {
+            hoNDArray<ValueType> ref;
+            hoNDArray<unsigned short> reflect_ref;
+
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForMaxEncodingCounters(cm2->getObjectPtr()->ref_, ref, meas_max_idx_ref_));
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForMaxEncodingCounters(cm2->getObjectPtr()->reflect_ref_, reflect_ref, meas_max_idx_ref_));
+
+            cm2->getObjectPtr()->ref_ = ref;
+            cm2->getObjectPtr()->reflect_ref_ = reflect_ref;
+        }
+
+        // copy the message image array
+        GADGET_CHECK_RETURN_FALSE(cm1->getObjectPtr()->copy(*messageImage_));
+
+        if (!phaseCorrBuffer_.empty())
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusAccumulatorWorkOrderTriggerGadget - phase correction signal found : " << phaseCorrBuffer_.size());
+
+            if ( !fillBuffer(phaseCorrBuffer_, workOrder_.phaseCorr_, workOrder_.reflect_phaseCorr_) )
+            {
+                GDEBUG("fillBuffer(phaseCorrBuffer_) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            cm2->getObjectPtr()->phaseCorr_ = workOrder_.phaseCorr_;
+            cm2->getObjectPtr()->reflect_phaseCorr_ = workOrder_.reflect_phaseCorr_;
+        }
+
+        if (!noiseBuffer_.empty())
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusAccumulatorGadget - noise signal found : " << noiseBuffer_.size());
+
+            ReflectBufferType tmpBuf;
+            if ( !fillBuffer(noiseBuffer_, workOrder_.noise_, tmpBuf) )
+            {
+                GDEBUG("fillBuffer(noiseBuffer_) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            cm2->getObjectPtr()->noise_ = workOrder_.noise_;
+        }
+
+        if (!otherBuffer_.empty())
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusAccumulatorGadget - other signal found : " << otherBuffer_.size());
+
+            if ( !fillBuffer(otherBuffer_, workOrder_.other_, workOrder_.reflect_other_) )
+            {
+                GDEBUG("fillBuffer(otherBuffer_) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            cm2->getObjectPtr()->other_ = workOrder_.other_;
+            cm2->getObjectPtr()->reflect_other_ = workOrder_.reflect_other_;
+        }
+
+        // send to next gadget
+        if (this->next()->putq(cm1) < 0) 
+        {
+            return false;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GtPlusAccumulatorWorkOrderTriggerGadget::triggerWorkOrderAllInClose() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+size_t GtPlusAccumulatorWorkOrderTriggerGadget::
+getDimValue(const ISMRMRD::AcquisitionHeader& acqHeader, Gadgetron::ISMRMRDDIM& dim)
+{
+    if ( dim == DIM_Encoding1 )             return acqHeader.idx.kspace_encode_step_1;
+    if ( dim == DIM_Slice )                 return acqHeader.idx.slice;
+    if ( dim == DIM_Encoding2 )             return acqHeader.idx.kspace_encode_step_2;
+    if ( dim == DIM_Contrast )              return acqHeader.idx.contrast;
+    if ( dim == DIM_Phase )                 return acqHeader.idx.phase;
+    if ( dim == DIM_Repetition )            return acqHeader.idx.repetition;
+    if ( dim == DIM_Set )                   return acqHeader.idx.set;
+    if ( dim == DIM_Segment )               return acqHeader.idx.segment;
+    if ( dim == DIM_Average )               return acqHeader.idx.average;
+
+    return 0;
+}
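[Editorial illustration, not part of the upstream source] getDimValue()/setDimValue() simply map an ISMRMRDDIM enum onto the matching encoding counter of an AcquisitionHeader. Both take the dimension by non-const reference, so a named variable is needed; a hypothetical usage sketch from inside the gadget:

    ISMRMRD::AcquisitionHeader acq;                        // hypothetical readout header
    Gadgetron::ISMRMRDDIM d = Gadgetron::DIM_Encoding1;
    size_t e1 = this->getDimValue(acq, d);                 // reads acq.idx.kspace_encode_step_1
    d = Gadgetron::DIM_Repetition;
    this->setDimValue(acq, d, e1);                         // writes acq.idx.repetition = (uint16_t)e1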
+
+void GtPlusAccumulatorWorkOrderTriggerGadget::
+setDimValue(ISMRMRD::AcquisitionHeader& acqHeader, Gadgetron::ISMRMRDDIM& dim, size_t value)
+{
+    if ( dim == DIM_Encoding1 ) acqHeader.idx.kspace_encode_step_1  = (uint16_t)value;
+    if ( dim == DIM_Slice ) acqHeader.idx.slice                     = (uint16_t)value;
+    if ( dim == DIM_Encoding2 ) acqHeader.idx.kspace_encode_step_2  = (uint16_t)value;
+    if ( dim == DIM_Contrast ) acqHeader.idx.contrast               = (uint16_t)value;
+    if ( dim == DIM_Phase ) acqHeader.idx.phase                     = (uint16_t)value;
+    if ( dim == DIM_Repetition ) acqHeader.idx.repetition           = (uint16_t)value;
+    if ( dim == DIM_Set ) acqHeader.idx.set                         = (uint16_t)value;
+    if ( dim == DIM_Segment ) acqHeader.idx.segment                 = (uint16_t)value;
+    if ( dim == DIM_Average ) acqHeader.idx.average                 = (uint16_t)value;
+
+    return;
+}
+
+int GtPlusAccumulatorWorkOrderTriggerGadget::close(unsigned long flags)
+{
+    GDEBUG_CONDITION_STREAM(true, "GtPlusAccumulatorWorkOrderTriggerGadget - close(flags) : " << flags);
+
+    if ( BaseClass::close(flags) != GADGET_OK ) return GADGET_FAIL;
+
+    if ( flags!=0 && !triggered_in_close_ )
+    // if ( !triggered_in_close_ )
+    {
+        triggered_in_close_ = true;
+
+        GDEBUG_CONDITION_STREAM(true, "GtPlusAccumulatorWorkOrderTriggerGadget - trigger in close(flags) ... ");
+
+        if ( needTriggerWorkOrderAllInClose() )
+        {
+            // never been triggered, so need to trigger with all data buffered
+            if ( !triggerWorkOrderAllInClose() )
+            {
+                GDEBUG("triggerWorkOrderAllInClose() failed ... \n");
+                return GADGET_FAIL;
+            }
+        }
+        else
+        {
+            // need to trigger the last portion of kspace
+            //if ( !triggerWorkOrder(NULL, true, true) )
+            //{
+            //    GDEBUG("Failed triggerWorkOrder(inClose)\n");
+            //    return GADGET_FAIL;
+            //}
+        }
+    }
+
+    // return BaseClass::close(flags);
+    return GADGET_OK;
+}
+
+GADGET_FACTORY_DECLARE(GtPlusAccumulatorWorkOrderTriggerGadget)
+
+}
diff --git a/gadgets/gtPlus/GtPlusAccumulatorWorkOrderTriggerGadget.h b/gadgets/gtPlus/GtPlusAccumulatorWorkOrderTriggerGadget.h
new file mode 100644
index 0000000..b774898
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusAccumulatorWorkOrderTriggerGadget.h
@@ -0,0 +1,297 @@
+/** \file   GtPlusAccumulatorWorkOrderTriggerGadget.h
+    \brief  The GtPlus reconstruction data accumulation and triggering gadget
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include <complex>
+#include "GtPlusGadgetExport.h"
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "ismrmrd/ismrmrd.h"
+#include "ismrmrd/xml.h"
+
+#include "hoNDArray_utils.h"
+
+#include "GtPlusGadgetImageArray.h"
+
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorkOrder.h"
+
+namespace Gadgetron
+{
+
+// [Ro E1 Cha Slice E2 Con Phase Rep Set Seg AVE]
+//   0  1  2   3    4   5    6     7  8   9  10
+
+struct ReadOutBuffer
+{
+    ISMRMRD::AcquisitionHeader acqHead_;
+    hoNDArray< std::complex<float> > data_;
+    bool isReflect_;
+};
+
+class EXPORTGTPLUSGADGET GtPlusAccumulatorWorkOrderTriggerGadget : public Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+{
+public:
+    GADGET_DECLARE(GtPlusAccumulatorWorkOrderTriggerGadget);
+
+    typedef float real_value_type;
+    typedef std::complex<real_value_type> ValueType;
+
+    typedef Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< ValueType > > BaseClass;
+
+    typedef std::vector< ReadOutBuffer > ReadOutBufferType;
+    typedef hoNDArray< std::complex<float> > BufferType;
+    typedef hoNDArray< int > TimeStampBufferType;
+    typedef hoNDArray< unsigned short > ReflectBufferType;
+
+    // typedef Gadgetron::gtPlus::gtPlusReconWorkOrder2DT<ValueType> WorkOrderType;
+    typedef Gadgetron::gtPlus::gtPlusReconWorkOrder<ValueType> WorkOrderType;
+
+    GtPlusAccumulatorWorkOrderTriggerGadget();
+    ~GtPlusAccumulatorWorkOrderTriggerGadget();
+
+    virtual int close(unsigned long flags);
+
+    /// parameters to control the triggering
+
+    /// for interleaved mode
+    // if DIM_NONE, the trigger is performed in the close function
+
+    // the triggering for the interleaved mode is defined as follows:
+    // a) if triggerDim1_==DIM_NONE and triggerDim2_ != DIM_NONE,
+    // the trigger is performed whenever triggerDim2_ changes
+
+    // b) if triggerDim2_==DIM_NONE and triggerDim1_ != DIM_NONE:
+    // if numOfKSpace_triggerDim1_==0,
+    // the trigger is performed whenever triggerDim1_ changes
+
+    // if numOfKSpace_triggerDim1_>0,
+    // the trigger is first performed once numOfKSpace_triggerDim1_ kspaces of triggerDim1_ have been buffered,
+    // and then whenever a new triggerDim1_ kspace arrives;
+    // the new triggerDim1_ kspace is reconstructed using the kernel estimated from those numOfKSpace_triggerDim1_ kspaces
+
+    // when resetTriggerStatus(m1)==true, the status is reset and numOfKSpace_triggerDim1_ kspaces of triggerDim1_
+    // are buffered again before the recon is triggered
+
+    // c) if both triggerDim1_ and triggerDim2_ are DIM_NONE, the trigger is performed
+    // in the close(flags) function
+
+    // d) if both triggerDim1_ and triggerDim2_ are NOT DIM_NONE and numOfKSpace_triggerDim1_>0, the trigger is first
+    // performed once numOfKSpace_triggerDim1_ kspaces of triggerDim1_ have been buffered, and then whenever a new
+    // triggerDim1_ kspace arrives;
+    // the new triggerDim1_ kspace is reconstructed using the kernel estimated from those numOfKSpace_triggerDim1_ kspaces;
+    // when triggerDim2_ changes or resetTriggerStatus(m1)==true, the status is reset and numOfKSpace_triggerDim1_ kspaces
+    // of triggerDim1_ are buffered again before the recon is triggered
+
+    // e) if numOfKSpace_triggerDim1_==0 and both triggerDim1_ and triggerDim2_ are NOT DIM_NONE,
+    // the trigger is performed whenever triggerDim2_ changes
+
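[Editorial illustration, not part of the upstream source] The rules a)-e) above can be condensed into a small decision helper. The sketch below uses hypothetical names (shouldTrigger, bufferedDim1Kspaces) and intentionally ignores the buffered-kernel bookkeeping and the status-reset path:

    // Hypothetical, simplified decision helper.
    // dim1Changed/dim2Changed: whether the corresponding trigger dimension changed on the incoming readout;
    // bufferedDim1Kspaces: how many triggerDim1 kspaces have been buffered so far.
    static bool shouldTrigger(Gadgetron::ISMRMRDDIM dim1, Gadgetron::ISMRMRDDIM dim2,
                              bool dim1Changed, bool dim2Changed,
                              int numOfKSpaceDim1, size_t bufferedDim1Kspaces)
    {
        if (dim1 == Gadgetron::DIM_NONE && dim2 == Gadgetron::DIM_NONE) return false;  // case c: trigger only in close()
        if (dim1 == Gadgetron::DIM_NONE) return dim2Changed;                           // case a
        if (dim2 == Gadgetron::DIM_NONE)                                               // case b
        {
            if (numOfKSpaceDim1 == 0) return dim1Changed;
            return dim1Changed && (bufferedDim1Kspaces >= (size_t)numOfKSpaceDim1);
        }
        if (numOfKSpaceDim1 == 0) return dim2Changed;                                  // case e
        // case d; the reset on a triggerDim2 change or resetTriggerStatus() is handled elsewhere
        return dim1Changed && (bufferedDim1Kspaces >= (size_t)numOfKSpaceDim1);
    }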
+    // noacceleration
+    Gadgetron::ISMRMRDDIM noacceleration_triggerDim1_;
+    Gadgetron::ISMRMRDDIM noacceleration_triggerDim2_;
+    int noacceleration_numOfKSpace_triggerDim1_;
+
+    // interleaved
+    Gadgetron::ISMRMRDDIM interleaved_triggerDim1_;
+    Gadgetron::ISMRMRDDIM interleaved_triggerDim2_;
+    int interleaved_numOfKSpace_triggerDim1_;
+
+    // embedded
+    Gadgetron::ISMRMRDDIM embedded_triggerDim1_;
+    Gadgetron::ISMRMRDDIM embedded_triggerDim2_;
+    int embedded_numOfKSpace_triggerDim1_;
+
+    // separate
+    Gadgetron::ISMRMRDDIM separate_triggerDim1_;
+    Gadgetron::ISMRMRDDIM separate_triggerDim2_;
+    int separate_numOfKSpace_triggerDim1_;
+
+    // for other kspace data: if other_kspace_matching_Dim_ != DIM_NONE, that dimension of the other data
+    // is made to match the image data
+    Gadgetron::ISMRMRDDIM other_kspace_matching_Dim_;
+
+    // default behavior is to compare the readout geometry
+    // if the imaging slice changes, the trigger status is reset
+    virtual bool resetTriggerStatus(GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1);
+
+protected:
+
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1, GadgetContainerMessage< hoNDArray< ValueType > > * m2);
+
+    // check the status of incoming readout
+    // bIsKSpace: whether this data is for image
+    // bIsRef: whether this data is for calibration signal
+    // bIsNoise: whether this data is a noise scan
+    // bIsPhaseCorr: whether this data is for phase correction
+    // bIsReflect: whether this readout is acquired along the reversed (reflected) direction (for EPI and similar scans)
+    // bIsOther: other scans
+    virtual bool checkStatus(uint64_t flag, int samples, 
+                        bool& bIsKSpace, bool& bIsRef, bool& bIsNoise, bool& bIsPhaseCorr, bool& bIsReflect, bool& bIsOther,
+                        bool& bIsNavigator, bool& bIsRTFeedback, bool& bIsHPFeedback, bool& bIsDummyScan);
+
+    // store the image data
+    virtual bool storeImageData(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2, bool isReflect);
+
+    // store the ref data
+    virtual bool storeRefData(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2, bool isReflect);
+
+    // fill the dynamically buffered data
+    virtual bool fillBuffer(ReadOutBufferType& readOutBuffer, BufferType& buf, ReflectBufferType& reflectBuf);
+
+    // fill the per 2D image info
+    virtual bool fillImageInfo(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GtPlusGadgetImageArray* messageImage, const ISMRMRD::EncodingCounters& idx);
+
+    // compute the encoded size
+    size_t computeEncodedSizeE1(size_t centerE1, size_t maxE1);
+    size_t computeEncodedSizeE2(size_t centerE2, size_t maxE2);
+
+    // perform the triggering
+    virtual bool triggerWorkOrder(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, bool inClose, bool isKSpace);
+
+    // workFlow_BufferKernel_ and workFlow_use_BufferedKernel_ are commands to the work flow
+    // if workFlow_BufferKernel_ == true, the work flow will buffer the kernels computed for this work order
+    // if workFlow_use_BufferedKernel_ == true, the work flow will recon this work order using the buffered kernels
+    // if both triggerDim1_ and triggerDim2_ are NOT DIM_NONE and numOfKSpace_triggerDim1_ > 0,
+    // the first work order with workFlow_BufferKernel_==true is sent out when the value of triggerDim1_ equals numOfKSpace_triggerDim1_-1
+    // the following work orders are sent out whenever triggerDim1_ changes, with workFlow_BufferKernel_==false and workFlow_use_BufferedKernel_==true
+    // when triggerDim2_ changes, the status is reset
+    virtual bool triggerWorkOrder(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, 
+            Gadgetron::ISMRMRDDIM& triggerDim1_, Gadgetron::ISMRMRDDIM& triggerDim2_, int numOfKSpace_triggerDim1_);
+
+    // trigger by extracting the sub-array where triggerDim equals value
+    virtual bool triggerByDimEqual(Gadgetron::ISMRMRDDIM& triggerDim, size_t value, bool workFlow_BufferKernel_, bool workFlow_use_BufferedKernel_);
+    virtual bool triggerByDimLessEqual(Gadgetron::ISMRMRDDIM& triggerDim, size_t value, bool workFlow_BufferKernel_, bool workFlow_use_BufferedKernel_);
+    virtual bool triggerByDimEqual(Gadgetron::ISMRMRDDIM& triggerDim1, size_t value1, Gadgetron::ISMRMRDDIM& triggerDim2, size_t value2, bool workFlow_BufferKernel_, bool workFlow_use_BufferedKernel_);
+
+    // trigger by extracting the sub-array where triggerDim1 <= value1 and triggerDim2 equals value2
+    virtual bool triggerByDim1LessEqualDim2Equal(Gadgetron::ISMRMRDDIM& triggerDim1, size_t value1, Gadgetron::ISMRMRDDIM& triggerDim2, size_t value2, bool workFlow_BufferKernel_, bool workFlow_use_BufferedKernel_);
+
+    // whether need to trigger all buffered data in close()
+    bool needTriggerWorkOrderAllInClose();
+    // trigger with all buffered data
+    virtual bool triggerWorkOrderAllInClose();
+
+    // trigger the last count in the close function
+    virtual bool triggerWorkOrderLastCountInClose(Gadgetron::ISMRMRDDIM& triggerDim1_, Gadgetron::ISMRMRDDIM& triggerDim2_, int numOfKSpace_triggerDim1_);
+
+    size_t getDimValue(const ISMRMRD::AcquisitionHeader& acqHeader, Gadgetron::ISMRMRDDIM& dim);
+    void setDimValue(ISMRMRD::AcquisitionHeader& acqHeader, Gadgetron::ISMRMRDDIM& dim, size_t value);
+
+    // buffer for per 2D image information
+    GtPlusGadgetImageArray* messageImage_;
+
+    // buffer for image kspace data
+    // if partial Fourier is used, the kspace center is placed at the center of the buffer,
+    // which means zeros are padded accordingly
+    BufferType kspaceBuffer_;
+    BufferType refBuffer_;
+
+    // dynamic buffer for other kspace data
+    ReadOutBufferType noiseBuffer_;
+    ReadOutBufferType phaseCorrBuffer_;
+    ReadOutBufferType otherBuffer_;
+
+    // dimension for image kspace
+    std::vector<size_t> dimensions_;
+
+    // encoding matrix size (the real sampled size)
+    size_t matrix_size_encoding_[3];
+
+    // maximal sampled line along E1 and E2
+    size_t max_sampled_E1_;
+    size_t max_sampled_E2_;
+
+    // index of center line along E1 and E2
+    size_t center_line_E1_;
+    size_t center_line_E2_;
+
+    // encoding space size (the logical kspace size)
+    size_t space_size_[3];
+
+    // offset along E1 and E2 directions for incoming readouts
+    size_t space_matrix_offset_E1_;
+    size_t space_matrix_offset_E2_;
+
+    // encoding field of view [mm]
+    float field_of_view_encoding_[3];
+
+    // recon matrix size (the final image size)
+    size_t matrix_size_recon_[3];
+
+    // recon field of view [mm]
+    float field_of_view_recon_[3];
+
+    // for the embedded mode
+    size_t embedded_ref_lines_E1_;
+    size_t embedded_ref_lines_E2_;
+
+    size_t image_counter_;
+    size_t image_series_;
+
+    // mark the first kspace line
+    bool first_kspace_scan_;
+
+    // whether the next gadget has been triggered in close(...)
+    bool triggered_in_close_;
+
+    // whether the next gadget has been triggered in process(...)
+    bool triggered_in_process_;
+
+    // whether the next gadget has been triggered because triggerDim1 changed after the required number of kspaces was buffered
+    // only used for triggerDim1!=DIM_NONE && triggerDim2!=DIM_NONE
+    bool triggered_in_process_by_numOfKSpace_triggerDim1_;
+
+    // whether the next gadget has been triggered in process(...) for the last acquisition
+    // if so, extra triggering in close(...) is not needed
+    bool triggered_in_process_last_acq_;
+
+    size_t meas_max_ro_;
+    ISMRMRD::EncodingCounters meas_max_idx_;
+    size_t meas_max_channel_;
+
+    // maximal idx for reference data
+    ISMRMRD::EncodingCounters meas_max_idx_ref_;
+
+    // track the trigger dim1 and dim2
+    size_t prev_dim1_;
+    size_t curr_dim1_;
+
+    size_t prev_dim2_;
+    size_t curr_dim2_;
+
+    // store the previous acquisition head
+    ISMRMRD::AcquisitionHeader prev_acq_header_;
+
+    // for trigger dim1, need to count its times
+    size_t count_dim1_;
+
+    // a general workorder to store the buffered data
+    WorkOrderType workOrder_;
+
+    // indicator for the arrival of last acq
+    bool last_acq_arrived_;
+
+    // time stamp resolution (default, 0.0025s)
+    float timeStampResolution_;
+
+    // exporter
+    Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+    // util for gtplus
+    Gadgetron::gtPlus::gtPlusISMRMRDReconUtil< std::complex<float> > gtPlus_util_;
+
+    // in verbose mode, more info is printed out
+    bool verboseMode_;
+
+private:
+
+    // index for the time stamp
+    std::vector<size_t> ind_time_stamp_;
+};
+
+}
diff --git a/gadgets/gtPlus/GtPlusGadgetExport.h b/gadgets/gtPlus/GtPlusGadgetExport.h
new file mode 100644
index 0000000..b7369d4
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusGadgetExport.h
@@ -0,0 +1,16 @@
+/** \file   GtPlusGadgetExport.h
+    \brief  The windows export/import definition for the GtPlus reconstruction gadget
+    \author Hui Xue
+*/
+
+#pragma once
+
+#if defined (WIN32)
+    #if defined (__BUILD_GADGETS__) || defined (gadgetronPlus_EXPORTS)
+        #define EXPORTGTPLUSGADGET __declspec(dllexport)
+    #else
+        #define EXPORTGTPLUSGADGET __declspec(dllimport)
+    #endif
+#else
+    #define EXPORTGTPLUSGADGET
+#endif
diff --git a/gadgets/gtPlus/GtPlusGadgetImageArray.cpp b/gadgets/gtPlus/GtPlusGadgetImageArray.cpp
new file mode 100644
index 0000000..f1fb861
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusGadgetImageArray.cpp
@@ -0,0 +1,685 @@
+
+#include "GtPlusGadgetImageArray.h"
+
+namespace Gadgetron
+{
+
+GtPlusGadgetImageExt::GtPlusGadgetImageExt() : ISMRMRD::ImageHeader()
+{
+    version = 0;
+    flags = 0;
+    measurement_uid = 0;
+
+    matrix_size[0] = 0; matrix_size[1] = 0; matrix_size[2] = 0;
+    field_of_view[0] = 0; field_of_view[1] = 0; field_of_view[2] = 0;
+    channels = 0;
+    memset(position, 0, sizeof(float)*ISMRMRD::ISMRMRD_POSITION_LENGTH);
+    memset(read_dir, 0, sizeof(float)*ISMRMRD::ISMRMRD_POSITION_LENGTH);
+    memset(phase_dir, 0, sizeof(float)*ISMRMRD::ISMRMRD_POSITION_LENGTH);
+    memset(slice_dir, 0, sizeof(float)*ISMRMRD::ISMRMRD_POSITION_LENGTH);
+    memset(patient_table_position, 0, sizeof(float)*ISMRMRD::ISMRMRD_POSITION_LENGTH);
+
+    average = 0;
+    slice = 0;
+    contrast = 0;
+    phase = 0;
+    repetition = 0;
+    set = 0;
+    acquisition_time_stamp = 0;
+    memset(physiology_time_stamp, 0, sizeof(uint32_t)*ISMRMRD::ISMRMRD_PHYS_STAMPS);
+
+    data_type = 0;
+    image_type = 0;
+    image_index = 0;
+    image_series_index = 0;
+
+    memset(user_int, 0, sizeof(int32_t)*ISMRMRD::ISMRMRD_USER_INTS);
+    memset(user_float, 0, sizeof(float)*ISMRMRD::ISMRMRD_USER_FLOATS);
+
+    time_stamps.clear();
+    pmu_time_stamps.clear();
+}
+
+GtPlusGadgetImageExt::~GtPlusGadgetImageExt()
+{
+}
+
+void GtPlusGadgetImageExt::set_matrix_size(size_t index, size_t size)
+{
+    if (index < 3) 
+    {
+        matrix_size[index] = (uint16_t)size;
+    }
+
+    if ( index == 1 )
+    {
+        time_stamps.clear();
+        time_stamps.resize(matrix_size[1], -1);
+        pmu_time_stamps.clear();
+        pmu_time_stamps.resize(matrix_size[1], -1);
+    }
+}
+
+void GtPlusGadgetImageExt::copy(GtPlusGadgetImageExt& aMessageImage)
+{
+    version = aMessageImage.version;
+    flags = aMessageImage.flags;
+    measurement_uid = aMessageImage.measurement_uid;
+
+    matrix_size[0] = aMessageImage.matrix_size[0];
+    matrix_size[1] = aMessageImage.matrix_size[1];
+    matrix_size[2] = aMessageImage.matrix_size[2];
+
+    field_of_view[0] = aMessageImage.field_of_view[0];
+    field_of_view[1] = aMessageImage.field_of_view[1];
+    field_of_view[2] = aMessageImage.field_of_view[2];
+
+    channels = aMessageImage.channels;
+
+    memcpy(position, aMessageImage.position, sizeof(float)*ISMRMRD::ISMRMRD_POSITION_LENGTH);
+    memcpy(read_dir, aMessageImage.read_dir, sizeof(float)*ISMRMRD::ISMRMRD_DIRECTION_LENGTH);
+    memcpy(phase_dir, aMessageImage.phase_dir, sizeof(float)*ISMRMRD::ISMRMRD_DIRECTION_LENGTH);
+    memcpy(slice_dir, aMessageImage.slice_dir, sizeof(float)*ISMRMRD::ISMRMRD_DIRECTION_LENGTH);
+    memcpy(patient_table_position, aMessageImage.patient_table_position, sizeof(float)*ISMRMRD::ISMRMRD_POSITION_LENGTH);
+
+    average = aMessageImage.average;
+    slice = aMessageImage.slice;
+    contrast = aMessageImage.contrast;
+    phase = aMessageImage.phase;
+    repetition = aMessageImage.repetition;
+    set = aMessageImage.set;
+
+    acquisition_time_stamp = aMessageImage.acquisition_time_stamp;
+
+    memcpy(physiology_time_stamp, aMessageImage.physiology_time_stamp, sizeof(uint32_t)*ISMRMRD::ISMRMRD_PHYS_STAMPS);
+
+    data_type = aMessageImage.data_type;
+    image_type = aMessageImage.image_type;
+    image_index = aMessageImage.image_index;
+    image_series_index = aMessageImage.image_series_index;
+
+    memcpy(user_int, aMessageImage.user_int, sizeof(int32_t)*ISMRMRD::ISMRMRD_USER_INTS);
+    memcpy(user_float, aMessageImage.user_float, sizeof(float)*ISMRMRD::ISMRMRD_USER_FLOATS);
+
+    time_stamps = aMessageImage.time_stamps;
+    pmu_time_stamps = aMessageImage.pmu_time_stamps;
+}
+
+void GtPlusGadgetImageExt::recomputeHeader(const GtPlusGadgetImageExt& aMessageImage, double weight)
+{
+    size_t ii;
+    for ( ii=0; ii<ISMRMRD::ISMRMRD_POSITION_LENGTH; ii++ )
+    {
+        position[ii] = (float)((position[ii]*weight) + (1.0-weight)*aMessageImage.position[ii]);
+        patient_table_position[ii] = (float)((patient_table_position[ii]*weight) + (1.0-weight)*aMessageImage.patient_table_position[ii]);
+    }
+
+    acquisition_time_stamp = (uint32_t)((acquisition_time_stamp*weight) + (1.0-weight)*aMessageImage.acquisition_time_stamp + 0.5);
+
+    for ( ii=0; ii<ISMRMRD::ISMRMRD_PHYS_STAMPS; ii++ )
+    {
+        physiology_time_stamp[ii] = (uint32_t)((physiology_time_stamp[ii]*weight) + (1.0-weight)*aMessageImage.physiology_time_stamp[ii] + 0.5);
+    }
+}
+
+void GtPlusGadgetImageExt::dump()
+{
+    using namespace std;
+
+    cout << "GtPlusGadgetImageExt" << endl;
+    cout << "----------------------------------------------------------" << endl;
+    cout << "version            : " << version << endl;
+    cout << "flags              : " << flags << endl;
+    cout << "measurement_uid    : " << measurement_uid << endl;
+    cout << "matrix_size[3]     : " << matrix_size[0] << " " << matrix_size[1] << " " << matrix_size[2] << endl;
+    cout << "field_of_view[3]   : " << field_of_view[0] << " " << field_of_view[1] << " " << field_of_view[2] << endl;
+    cout << "channels           : " << channels << endl;
+
+    size_t ii;
+
+    cout << "position[ISMRMRD::ISMRMRD_POSITION_LENGTH]      : ";
+    for ( ii=0; ii<ISMRMRD::ISMRMRD_POSITION_LENGTH; ii++ )
+    {
+        cout << position[ii] << " ";
+    }
+    cout << endl;
+
+    cout << "read_dir[ISMRMRD::ISMRMRD_POSITION_LENGTH]      : ";
+    for ( ii=0; ii<ISMRMRD::ISMRMRD_POSITION_LENGTH; ii++ )
+    {
+        cout << read_dir[ii] << " ";
+    }
+    cout << endl;
+
+    cout << "phase_dir[ISMRMRD::ISMRMRD_POSITION_LENGTH]      : ";
+    for ( ii=0; ii<ISMRMRD::ISMRMRD_POSITION_LENGTH; ii++ )
+    {
+        cout << phase_dir[ii] << " ";
+    }
+    cout << endl;
+
+    cout << "slice_dir[ISMRMRD::ISMRMRD_POSITION_LENGTH]      : ";
+    for ( ii=0; ii<ISMRMRD::ISMRMRD_POSITION_LENGTH; ii++ )
+    {
+        cout << slice_dir[ii] << " ";
+    }
+    cout << endl;
+
+    cout << "patient_table_position[ISMRMRD::ISMRMRD_POSITION_LENGTH]      : ";
+    for ( ii=0; ii<ISMRMRD::ISMRMRD_POSITION_LENGTH; ii++ )
+    {
+        cout << patient_table_position[ii] << " ";
+    }
+    cout << endl;
+
+    cout << "average            : " << average << endl;
+    cout << "slice              : " << slice << endl;
+    cout << "contrast           : " << contrast << endl;
+    cout << "phase              : " << phase << endl;
+    cout << "repetition         : " << repetition << endl;
+    cout << "set                : " << set << endl;
+    cout << "acquisition_time_stamp : " << acquisition_time_stamp << endl;
+
+    cout << "physiology_time_stamp[ISMRMRD::ISMRMRD_PHYS_STAMPS] : ";
+    for ( ii=0; ii<ISMRMRD::ISMRMRD_PHYS_STAMPS; ii++ )
+    {
+        cout << physiology_time_stamp[ii] << " ";
+    }
+    cout << endl;
+
+    cout << "data_type          : " << data_type << endl;
+    cout << "image_type         : " << image_type << endl;
+    cout << "image_index        : " << image_index << endl;
+    cout << "image_series_index : " << image_series_index << endl;
+
+    cout << "user_int[ISMRMRD::ISMRMRD_USER_INTS]        : ";
+    for ( ii=0; ii<ISMRMRD::ISMRMRD_USER_INTS; ii++ )
+    {
+        cout << user_int[ii] << " ";
+    }
+    cout << endl;
+
+    cout << "user_float[ISMRMRD::ISMRMRD_USER_FLOATS]    : ";
+    for ( ii=0; ii<ISMRMRD::ISMRMRD_USER_FLOATS; ii++ )
+    {
+        cout << user_float[ii] << " ";
+    }
+    cout << endl;
+    cout << "----------------------------------------------------------" << endl;
+}
+
+// [Ro E1 Cha Slice E2 Con Phase Rep Set Seg Ave]
+//   0  1  2   3     4  5    6     7   8   9  10
+// store a scan with 11 dimensions
+
+GtPlusGadgetImageArray::GtPlusGadgetImageArray() 
+:   imageArray_(0)
+{
+    size_t ii;
+    for ( ii=0; ii<GT_DIM_NUM; ii++ )
+    {
+        matrix_size[ii] = 0;
+    }
+
+    max_num_of_images_ = 0;
+}
+
+GtPlusGadgetImageArray::GtPlusGadgetImageArray(const GtPlusGadgetImageArray& imArray) : imageArray_(0) 
+{
+    this->copy(imArray);
+}
+
+GtPlusGadgetImageArray::GtPlusGadgetImageArray(size_t aSize[GT_DIM_NUM])
+{
+    try
+    {
+        size_t ii;
+        for ( ii=0; ii<GT_DIM_NUM; ii++ )
+        {
+            matrix_size[ii] = aSize[ii];
+        }
+
+        size_t len = 1;
+        for ( ii=3; ii<GT_DIM_NUM; ii++ )
+        {
+            len *= matrix_size[ii];
+        }
+
+        max_num_of_images_ = len;
+
+        if ( len > 0 )
+        {
+            imageArray_ = new GtPlusGadgetImageExt[len];
+        }
+    }
+    catch(...)
+    {
+        GDEBUG_STREAM("Failed in allocate imageArray_" << std::endl);
+    }
+}
+
+GtPlusGadgetImageArray::~GtPlusGadgetImageArray()
+{
+    if (imageArray_)
+    {
+        delete [] imageArray_;
+    }
+}
+
+void GtPlusGadgetImageArray::resize(size_t aSize[GT_DIM_NUM])
+{
+    try
+    {
+        size_t ii;
+        for ( ii=0; ii<GT_DIM_NUM; ii++ )
+        {
+            matrix_size[ii] = aSize[ii];
+        }
+
+        size_t len = 1;
+        for ( ii=3; ii<GT_DIM_NUM; ii++ )
+        {
+            len *= matrix_size[ii];
+        }
+
+        if ( imageArray_ ) 
+        {
+            delete [] imageArray_;
+            imageArray_ = NULL;
+        }
+
+        max_num_of_images_ = len;
+
+        if ( len > 0 )
+        {
+            imageArray_ = new GtPlusGadgetImageExt[len];
+        }
+    }
+    catch(...)
+    {
+        GDEBUG_STREAM("Failed in resize GtPlusGadgetImageArray " << std::endl);
+    }
+}
+
+bool GtPlusGadgetImageArray::copy(const GtPlusGadgetImageArray& imageArray)
+{
+    try
+    {
+        if (imageArray_) { delete [] imageArray_; imageArray_ = NULL; } // avoid a dangling pointer if reallocation is skipped
+        max_num_of_images_ = 0;
+
+        size_t ii;
+        for ( ii=0; ii<GT_DIM_NUM; ii++ )
+        {
+            matrix_size[ii] = imageArray.matrix_size[ii];
+        }
+
+        size_t len = 1;
+        for ( ii=3; ii<GT_DIM_NUM; ii++ )
+        {
+            len *= matrix_size[ii];
+        }
+
+        max_num_of_images_ = len;
+
+        if ( len > 0 )
+        {
+            imageArray_ = new GtPlusGadgetImageExt[len];
+        }
+
+        for ( size_t i=0; i<len; i++ )
+        {
+            imageArray_[i] = imageArray.imageArray_[i];
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GtPlusGadgetImageArray::copy(const GtPlusGadgetImageArray& imageArray) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+size_t GtPlusGadgetImageArray::get_offset(size_t slc, size_t e2, size_t con, size_t phs, size_t rep, size_t set, size_t seg, size_t ave)
+{
+    size_t offset = ave  *matrix_size[9]*matrix_size[8]*matrix_size[7]*matrix_size[6]*matrix_size[5]*matrix_size[4]*matrix_size[3]
+                    + seg*matrix_size[8]*matrix_size[7]*matrix_size[6]*matrix_size[5]*matrix_size[4]*matrix_size[3]
+                    + set*matrix_size[7]*matrix_size[6]*matrix_size[5]*matrix_size[4]*matrix_size[3]
+                    + rep*matrix_size[6]*matrix_size[5]*matrix_size[4]*matrix_size[3]
+                    + phs*matrix_size[5]*matrix_size[4]*matrix_size[3]
+                    + con*matrix_size[4]*matrix_size[3]
+                    + e2 *matrix_size[3]
+                    + slc;
+    return offset;
+}
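[Editorial illustration, not part of the upstream source] get_offset() linearizes the eight stored dimensions [Slice E2 Con Phase Rep Set Seg Ave] with slice varying fastest and average slowest. A small worked example with hypothetical sizes:

    // matrix_size[3..10] = {2, 1, 1, 3, 4, 1, 1, 1}   (2 slices, 3 phases, 4 repetitions, all others 1)
    // get_offset(slc=1, e2=0, con=0, phs=2, rep=3, set=0, seg=0, ave=0)
    //   = rep*(3*1*1*2) + phs*(1*1*2) + slc
    //   = 3*6 + 2*2 + 1 = 23                            // the last of the 2*3*4 = 24 stored headers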
+
+// Slice E2 Con Phase Rep Set Seg
+void GtPlusGadgetImageArray::findDimIndex(Gadgetron::ISMRMRDDIM& dim, size_t& ind)
+{
+    switch (dim)
+    {
+        case Gadgetron::DIM_Slice:
+            ind = 3;
+        break;
+
+        case Gadgetron::DIM_Encoding2:
+            ind = 4;
+        break;
+
+        case Gadgetron::DIM_Contrast:
+            ind = 5;
+        break;
+
+        case Gadgetron::DIM_Phase:
+            ind = 6;
+        break;
+
+        case Gadgetron::DIM_Repetition:
+            ind = 7;
+        break;
+
+        case Gadgetron::DIM_Set:
+            ind = 8;
+        break;
+
+        case Gadgetron::DIM_Segment:
+            ind = 9;
+        break;
+
+        case Gadgetron::DIM_Average:
+            ind = 10;
+        break;
+
+        default:
+            ind = 0;
+    }
+
+    return;
+}
+
+bool GtPlusGadgetImageArray::
+extractGadgetImageArrayEqual(Gadgetron::ISMRMRDDIM& dim, size_t value, GtPlusGadgetImageArray& imageArray)
+{
+    try
+    {
+        size_t dimInd;
+        findDimIndex(dim, dimInd);
+
+        GADGET_DEBUG_CHECK_RETURN_FALSE( value >= matrix_size[dimInd] );
+
+        size_t startInd[GT_DIM_NUM-3];
+        size_t endInd[GT_DIM_NUM-3];
+
+        for ( size_t d=Gadgetron::DIM_Slice; d<=Gadgetron::DIM_Average; d++ )
+        {
+            if ( d == dim )
+            {
+                startInd[d-Gadgetron::DIM_Slice] = value;
+                endInd[d-Gadgetron::DIM_Slice] = value+1;
+            }
+            else
+            {
+                startInd[d-Gadgetron::DIM_Slice] = 0;
+                endInd[d-Gadgetron::DIM_Slice] = matrix_size[d-Gadgetron::DIM_Slice+3];
+            }
+        }
+
+        GADGET_CHECK_RETURN_FALSE(getSubImageArray(startInd, endInd, imageArray));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GtPlusGadgetImageArray::extractGadgetImageArrayEqual(dim, value) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusGadgetImageArray::
+extractGadgetImageArrayEqual(Gadgetron::ISMRMRDDIM& dim1, size_t value1, Gadgetron::ISMRMRDDIM& dim2, size_t value2, GtPlusGadgetImageArray& imageArray)
+{
+    try
+    {
+        size_t dimInd1;
+        findDimIndex(dim1, dimInd1);
+        GADGET_DEBUG_CHECK_RETURN_FALSE( value1 >= matrix_size[dimInd1] );
+
+
+        size_t dimInd2;
+        findDimIndex(dim2, dimInd2);
+        GADGET_DEBUG_CHECK_RETURN_FALSE( value2 >= matrix_size[dimInd2] );
+
+        size_t startInd[GT_DIM_NUM-3];
+        size_t endInd[GT_DIM_NUM-3];
+
+        for ( size_t d=Gadgetron::DIM_Slice; d<=Gadgetron::DIM_Average; d++ )
+        {
+            if ( d == dim1 )
+            {
+                startInd[d-Gadgetron::DIM_Slice] = value1;
+                endInd[d-Gadgetron::DIM_Slice] = value1+1;
+            }
+            else if ( d == dim2 )
+            {
+                startInd[d-Gadgetron::DIM_Slice] = value2;
+                endInd[d-Gadgetron::DIM_Slice] = value2+1;
+            }
+            else
+            {
+                startInd[d-Gadgetron::DIM_Slice] = 0;
+                endInd[d-Gadgetron::DIM_Slice] = matrix_size[d-Gadgetron::DIM_Slice+3];
+            }
+        }
+
+        GADGET_CHECK_RETURN_FALSE(getSubImageArray(startInd, endInd, imageArray));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GtPlusGadgetImageArray::extractGadgetImageArrayEqual(dim1, value1, dim2, value2) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusGadgetImageArray::
+extractGadgetImageArrayLessEqual(Gadgetron::ISMRMRDDIM& dim, size_t value, GtPlusGadgetImageArray& imageArray)
+{
+    try
+    {
+        size_t dimInd;
+        findDimIndex(dim, dimInd);
+        GADGET_DEBUG_CHECK_RETURN_FALSE( value >= matrix_size[dimInd] );
+
+        size_t startInd[GT_DIM_NUM-3];
+        size_t endInd[GT_DIM_NUM-3];
+
+        for ( size_t d=Gadgetron::DIM_Slice; d<=Gadgetron::DIM_Average; d++ )
+        {
+            if ( d == dim )
+            {
+                startInd[d-Gadgetron::DIM_Slice] = 0;
+                endInd[d-Gadgetron::DIM_Slice] = value+1;
+            }
+            else
+            {
+                startInd[d-Gadgetron::DIM_Slice] = 0;
+                endInd[d-Gadgetron::DIM_Slice] = matrix_size[d-Gadgetron::DIM_Slice+3];
+            }
+        }
+
+        GADGET_CHECK_RETURN_FALSE(getSubImageArray(startInd, endInd, imageArray));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GtPlusGadgetImageArray::extractGadgetImageArrayLessEqual(dim, value) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusGadgetImageArray::
+extractGadgetImageArray_Dim1LessEqual_Dim2Equal(Gadgetron::ISMRMRDDIM& dim1, size_t value1, 
+        Gadgetron::ISMRMRDDIM& dim2, size_t value2, GtPlusGadgetImageArray& imageArray)
+{
+    try
+    {
+        size_t dimInd1;
+        findDimIndex(dim1, dimInd1);
+
+        size_t dimInd2;
+        findDimIndex(dim2, dimInd2);
+
+        GADGET_DEBUG_CHECK_RETURN_FALSE( value1 >= matrix_size[dimInd1] );
+        GADGET_DEBUG_CHECK_RETURN_FALSE( value2 >= matrix_size[dimInd2] );
+
+        size_t startInd[GT_DIM_NUM];
+        size_t endInd[GT_DIM_NUM];
+
+        for ( size_t d=Gadgetron::DIM_Slice; d<=Gadgetron::DIM_Average; d++ )
+        {
+            if ( d == dim1 )
+            {
+                startInd[d-Gadgetron::DIM_Slice] = 0;
+                endInd[d-Gadgetron::DIM_Slice] = value1+1;
+            }
+            else if ( d == dim2 )
+            {
+                startInd[d-Gadgetron::DIM_Slice] = value2;
+                endInd[d-Gadgetron::DIM_Slice] = value2+1;
+            }
+            else
+            {
+                startInd[d-Gadgetron::DIM_Slice] = 0;
+                endInd[d-Gadgetron::DIM_Slice] = matrix_size[d-Gadgetron::DIM_Slice+3];
+            }
+        }
+
+        GADGET_CHECK_RETURN_FALSE(getSubImageArray(startInd, endInd, imageArray));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GtPlusGadgetImageArray::extractGadgetImageArray_Dim1LessEqual_Dim2Equal(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusGadgetImageArray::
+getSubImageArray(size_t* startInd, size_t* endInd, GtPlusGadgetImageArray& imageArray)
+{
+    try
+    {
+        size_t aSize[GT_DIM_NUM];
+        aSize[0] = matrix_size[0];
+        aSize[1] = matrix_size[1];
+        aSize[2] = matrix_size[2];
+
+        size_t ii;
+        for ( ii=3; ii<GT_DIM_NUM; ii++ )
+        {
+            aSize[ii] = endInd[ii-3]-startInd[ii-3];
+        }
+
+        imageArray.resize(aSize);
+
+        size_t slc, e2, con, phs, rep, set, seg, ave;
+
+        for ( ave=startInd[7]; ave<endInd[7]; ave++ )
+        {
+            for ( seg=startInd[6]; seg<endInd[6]; seg++ )
+            {
+                for ( set=startInd[5]; set<endInd[5]; set++ )
+                {
+                    for ( rep=startInd[4]; rep<endInd[4]; rep++ )
+                    {
+                        for ( phs=startInd[3]; phs<endInd[3]; phs++ )
+                        {
+                            for ( con=startInd[2]; con<endInd[2]; con++ )
+                            {
+                                for ( e2=startInd[1]; e2<endInd[1]; e2++ )
+                                {
+                                    for ( slc=startInd[0]; slc<endInd[0]; slc++ )
+                                    {
+                                        size_t offset = this->get_offset(slc, e2, con, phs, rep, set, seg, ave);
+                                        size_t offsetDst= imageArray.get_offset(slc-startInd[0], e2-startInd[1], con-startInd[2], phs-startInd[3], rep-startInd[4], set-startInd[5], seg-startInd[6], ave-startInd[7]);
+
+                                        imageArray.imageArray_[offsetDst] = imageArray_[offset];
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GtPlusGadgetImageArray::getSubImageArray(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+void GtPlusGadgetImageArray::dump()
+{
+    size_t ii;
+    GDEBUG_STREAM("GtPlusGadgetImageArray" << std::endl);
+    GDEBUG_STREAM("==========================================================" << std::endl);
+    GDEBUG_STREAM("matrix_size           : ");
+    for ( ii=0; ii<GT_DIM_NUM; ii++ )
+    {
+        GDEBUG_STREAM(matrix_size[ii] << " ");
+    }
+    GDEBUG_STREAM(std::endl);
+    GDEBUG_STREAM("----------------------------------------------------------" << std::endl);
+    if ( imageArray_ )
+    {
+        int slc, e2, con, phs, rep, set, seg, ave;
+
+        for ( ave=0; ave<matrix_size[10]; ave++ )
+        {
+            for ( seg=0; seg<matrix_size[9]; seg++ )
+            {
+                for ( set=0; set<matrix_size[8]; set++ )
+                {
+                    for ( rep=0; rep<matrix_size[7]; rep++ )
+                    {
+                        for ( phs=0; phs<matrix_size[6]; phs++ )
+                        {
+                            for ( con=0; con<matrix_size[5]; con++ )
+                            {
+                                for ( e2=0; e2<matrix_size[4]; e2++ )
+                                {
+                                    for ( slc=0; slc<matrix_size[3]; slc++ )
+                                    {
+                                        size_t offset = get_offset(slc, e2, con, phs, rep, set, seg, ave);
+                                        std::cout << "[Slice E2 Contrast Phase Rep Set Seg Ave] = [" 
+                                                    << " " << slc 
+                                                    << " " << e2 
+                                                    << " " << con 
+                                                    << " " << phs 
+                                                    << " " << rep 
+                                                    << " " << set 
+                                                    << " " << seg 
+                                                    << " " << ave << "]" << std::endl;
+
+                                        imageArray_[offset].dump();
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    GDEBUG_STREAM("==========================================================" << std::endl);
+}
+
+}
diff --git a/gadgets/gtPlus/GtPlusGadgetImageArray.h b/gadgets/gtPlus/GtPlusGadgetImageArray.h
new file mode 100644
index 0000000..5700302
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusGadgetImageArray.h
@@ -0,0 +1,76 @@
+/** \file   GtPlusGadgetImageArray.h
+    \brief  The GtPlusGadgetImageArray is used by the triggering gadget to store the ISMRMRD ImageHeader information
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include <complex>
+#include "GtPlusGadgetExport.h"
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "ismrmrd/ismrmrd.h"
+#include "GadgetIsmrmrdReadWrite.h"
+
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusISMRMRDReconUtil.h"
+
+// GtPlusGadgetImageArray stores the ISMRMRD image header info for every
+// 2D kspace
+
+namespace Gadgetron
+{
+
+struct  EXPORTGTPLUSGADGET GtPlusGadgetImageExt : public ISMRMRD::ImageHeader
+{
+    // fields added to store the time_stamp and pmu_time_stamp for every incoming read-out line
+    // if a line is not acquired, the corresponding time stamp is -1
+    std::vector<int>     time_stamps;
+    std::vector<int>     pmu_time_stamps;
+
+    GtPlusGadgetImageExt();
+    ~GtPlusGadgetImageExt();
+
+    void copy(GtPlusGadgetImageExt& aMessageImage);
+    void set_matrix_size(size_t index, size_t size);
+
+    // interpolation is performed
+    // this = weight * this + (1-weight)*aMessageImage
+    void recomputeHeader(const GtPlusGadgetImageExt& aMessageImage, double weight);
+    void dump();
+}; 
+
+// [Ro E1 Cha Slice E2 Con Phase Rep Set Seg Ave]
+//  0  1  2   3     4  5   6     7   8   9   10
+// store a scan with 11 dimensions
+#define GT_DIM_NUM 11
+
+struct  EXPORTGTPLUSGADGET GtPlusGadgetImageArray
+{
+    // size of the image array
+    size_t matrix_size[GT_DIM_NUM];
+
+    size_t max_num_of_images_;
+
+    // message information for every 2D image [RO E1 Cha Slice E2 Contrast Phase Rep Set Seg Ave]
+    GtPlusGadgetImageExt* imageArray_;
+
+    GtPlusGadgetImageArray();
+    GtPlusGadgetImageArray(const GtPlusGadgetImageArray& imArray);
+    GtPlusGadgetImageArray(size_t aSize[GT_DIM_NUM]);
+    ~GtPlusGadgetImageArray();
+
+    void findDimIndex(Gadgetron::ISMRMRDDIM& dim, size_t& ind);
+    bool getSubImageArray(size_t* startInd, size_t* endInd, GtPlusGadgetImageArray& imageArray);
+    void resize(size_t aSize[GT_DIM_NUM]);
+    bool copy(const GtPlusGadgetImageArray& imageArray);
+    size_t get_offset(size_t slc, size_t e2, size_t con, size_t phs, size_t rep, size_t set, size_t seg, size_t ave);
+    bool extractGadgetImageArrayEqual(Gadgetron::ISMRMRDDIM& dim, size_t value, GtPlusGadgetImageArray& imageArray);
+    bool extractGadgetImageArrayEqual(Gadgetron::ISMRMRDDIM& dim1, size_t value1, Gadgetron::ISMRMRDDIM& dim2, size_t value2, GtPlusGadgetImageArray& imageArray);
+    bool extractGadgetImageArrayLessEqual(Gadgetron::ISMRMRDDIM& dim, size_t value, GtPlusGadgetImageArray& imageArray);
+    bool extractGadgetImageArray_Dim1LessEqual_Dim2Equal(Gadgetron::ISMRMRDDIM& dim1, size_t value1, Gadgetron::ISMRMRDDIM& dim2, size_t value2, GtPlusGadgetImageArray& imageArray);
+
+    void dump();
+};
+
+}
diff --git a/gadgets/gtPlus/GtPlusGadgetOpenMP.cpp b/gadgets/gtPlus/GtPlusGadgetOpenMP.cpp
new file mode 100644
index 0000000..ed03d29
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusGadgetOpenMP.cpp
@@ -0,0 +1,71 @@
+
+#include "GtPlusGadgetOpenMP.h"
+
+namespace Gadgetron
+{
+
+#ifdef USE_OMP
+
+bool prepOpenMP()
+{
+    try
+    {
+        GDEBUG_STREAM("--> OpenMP info <--");
+        GDEBUG_STREAM("--------------------------------------------------------");
+
+        int numOpenMPProcs = omp_get_num_procs();
+        GDEBUG_STREAM("GtPlusRecon, numOpenMPProcs : " << numOpenMPProcs);
+
+        #ifndef WIN32
+            int maxOpenMPLevels = omp_get_max_active_levels();
+            GDEBUG_STREAM("GtPlusRecon, maxOpenMPLevels : " << maxOpenMPLevels);
+        #endif // WIN32
+
+        int maxOpenMPThreads = omp_get_max_threads();
+        GDEBUG_STREAM("GtPlusRecon, maxOpenMPThreads : " << maxOpenMPThreads);
+
+        if ( numOpenMPProcs != maxOpenMPThreads )
+        {
+            GDEBUG_STREAM("GtPlusRecon, numOpenMPProcs != maxOpenMPThreads , hyperthreading must be disabled ... ");
+            omp_set_num_threads(numOpenMPProcs);
+        }
+
+        // omp_set_nested(1);
+        int allowOpenMPNested = omp_get_nested();
+        GDEBUG_STREAM("GtPlusRecon, allowOpenMPNested : " << allowOpenMPNested);
+
+        #ifdef WIN32
+            GDEBUG_STREAM("----------------------------------");
+            GDEBUG_STREAM("GtPlus, set thread affinity ... ");
+
+            /// pin each OpenMP thread to a dedicated core
+            #pragma omp parallel default(shared)
+            {
+                int tid = omp_get_thread_num();
+                DWORD_PTR mask = (1 << tid);
+                GDEBUG_STREAM("thread id : " << tid << " - mask : " << mask);
+                SetThreadAffinityMask( GetCurrentThread(), mask );
+            }
+        #endif // WIN32
+
+        GDEBUG_STREAM("--------------------------------------------------------");
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GtPlus prepOpenMP() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+#else
+
+bool prepOpenMP()
+{
+    return true;
+}
+
+#endif // USE_OMP
+
+}
diff --git a/gadgets/gtPlus/GtPlusGadgetOpenMP.h b/gadgets/gtPlus/GtPlusGadgetOpenMP.h
new file mode 100644
index 0000000..81c4338
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusGadgetOpenMP.h
@@ -0,0 +1,26 @@
+/** \file   GtPlusGadgetOpenMP.h
+    \brief  OpenMP setup helpers for the GtPlus gadgets
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include <complex>
+#include "GtPlusGadgetExport.h"
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "ismrmrd/ismrmrd.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "GadgetronTimer.h"
+#include "gtPlusISMRMRDReconUtil.h"
+
+#ifdef USE_OMP
+    #include <omp.h>
+#endif // USE_OMP
+
+namespace Gadgetron
+{
+
+bool EXPORTGTPLUSGADGET prepOpenMP();
+
+}
diff --git a/gadgets/gtPlus/GtPlusImageReconGadget.cpp b/gadgets/gtPlus/GtPlusImageReconGadget.cpp
new file mode 100644
index 0000000..34ffd52
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusImageReconGadget.cpp
@@ -0,0 +1,714 @@
+
+#include "GtPlusImageReconGadget.h"
+#include "GtPlusGadgetOpenMP.h"
+#include <iomanip>
+
+using namespace Gadgetron::gtPlus;
+
+namespace Gadgetron
+{
+
+    GtPlusImageReconGadget::GtPlusImageReconGadget()
+    {
+        image_series_num_ = 100;
+
+        debugFolder_ = "DebugOutput";
+
+        performTiming_ = true;
+
+        verboseMode_ = false;
+
+        gt_timer1_.set_timing_in_destruction(false);
+        gt_timer2_.set_timing_in_destruction(false);
+        gt_timer3_.set_timing_in_destruction(false);
+
+        Gadgetron::prepOpenMP();
+    }
+
+    GtPlusImageReconGadget::~GtPlusImageReconGadget()
+    {
+
+    }
+
+    bool GtPlusImageReconGadget::readParameters()
+    {
+        try
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "------> GtPlusImageReconGadget parameters <------");
+
+            verboseMode_ = this->get_bool_value("verboseMode");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "verboseMode_ is " << verboseMode_);
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+            boost::shared_ptr<std::string> str = this->get_string_value("debugFolder");
+            debugFolder_ = *str;
+            GDEBUG_CONDITION_STREAM(verboseMode_, "debugFolder_ is " << debugFolder_);
+
+            if ( !debugFolder_.empty() )
+            {
+                Gadgetron::getDebugFolderPath(debugFolder_, debugFolder_fullPath_, verboseMode_);
+            }
+            else
+            {
+                GDEBUG_STREAM("GtPlusImageRecon, debugFolder is not set ...");
+            }
+
+            performTiming_ = this->get_bool_value("performTiming");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "performTiming_ is " << performTiming_);
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in GtPlusImageReconGadget::readParameters() ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    int GtPlusImageReconGadget::process_config(ACE_Message_Block* mb)
+    {
+        // read in parameters from the xml
+        GADGET_CHECK_RETURN(this->readParameters(), GADGET_FAIL);
+
+        ISMRMRD::IsmrmrdHeader h;
+        try {
+          deserialize(mb->rd_ptr(),h);
+        } catch (...) {
+          GDEBUG("Error parsing ISMRMRD Header");
+          throw;
+        }
+
+        // seq object
+        if (h.encoding.size() != 1)
+        {
+            GDEBUG("Number of encoding spaces: %d\n", h.encoding.size());
+            GDEBUG("This simple GtPlusAccumulatorImageTriggerGadget only supports one encoding space\n");
+            return GADGET_FAIL;
+        }
+
+        GADGET_CHECK_RETURN(findEncodingLimits(h, meas_max_idx_, verboseMode_), GADGET_FAIL);
+
+        return GADGET_OK;
+    }
+
+    int GtPlusImageReconGadget::process(GadgetContainerMessage<ImageBufferType>* m1)
+    {
+        GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusImageReconGadget::process(...) starts ... ");
+
+        std::vector<std::string> processStr;
+        std::vector<std::string> dataRole;
+
+        ImageBufferType& ori = *m1->getObjectPtr();
+
+        if ( ori.get_number_of_elements() == 1 )
+        {
+            size_t num = (*ori(0)).attrib_.length(GADGETRON_DATA_ROLE);
+            GADGET_CHECK_RETURN(num>0, GADGET_FAIL);
+
+            dataRole.resize(num);
+
+            for ( size_t ii=0; ii<num; ii++ )
+            {
+                dataRole[ii] = std::string( (*ori(0)).attrib_.as_str(GADGETRON_DATA_ROLE, ii) );
+            }
+
+            if ( (dataRole[0] == GADGETRON_IMAGE_GFACTOR) 
+                || (dataRole[0] == GADGETRON_IMAGE_SNR_MAP) 
+                || (dataRole[0] == GADGETRON_IMAGE_STD_MAP) 
+                || (dataRole[0] == GADGETRON_IMAGE_WRAPAROUNDMAP) )
+            {
+                GADGET_CHECK_RETURN(this->sendOutImages(ori, image_series_num_++, processStr, dataRole), GADGET_FAIL);
+                GADGET_CHECK_RETURN(this->releaseImageBuffer(ori), GADGET_FAIL);
+                return GADGET_OK;
+            }
+        }
+
+        this->processImageBuffer(ori);
+
+        this->releaseImageBuffer(ori);
+
+        m1->release();
+
+        return GADGET_OK;
+    }
+
+    int GtPlusImageReconGadget::processImageBuffer(ImageBufferType& ori)
+    {
+        std::vector<std::string> processStr;
+        std::vector<std::string> dataRole;
+
+        boost::shared_ptr< std::vector<size_t> > dims = ori.get_dimensions();
+        GDEBUG_CONDITION_STREAM(verboseMode_, "[Cha Slice E2 Con Phase Rep Set Ave] = [" << (*dims)[0] << " " << (*dims)[1] << " " << (*dims)[2] << " " 
+            << (*dims)[3] << " " << (*dims)[4]  << " " << (*dims)[5] << " " 
+            << (*dims)[6] << " " << (*dims)[7] << "]");
+
+        this->sendOutImages(ori, image_series_num_++, processStr, dataRole);
+
+        return GADGET_OK;
+    }
+
+    bool GtPlusImageReconGadget::fillWithNULL(ImageBufferType& buf)
+    {
+        try
+        {
+            size_t N = buf.get_number_of_elements();
+            size_t ii;
+            for ( ii=0; ii<N; ii++ )
+            {
+                buf(ii) = NULL;
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in GtPlusImageReconGadget::fillWithNULL(ImageBufferType& buf) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    bool GtPlusImageReconGadget::releaseImageBuffer(ImageBufferType& buf)
+    {
+        try
+        {
+            size_t N = buf.get_number_of_elements();
+            size_t ii;
+            for ( ii=0; ii<N; ii++ )
+            {
+                ImageType* pImage = buf(ii);
+                if ( pImage != NULL )
+                {
+                    delete pImage;
+                    buf(ii) = NULL;
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in GtPlusImageReconGadget::releaseImageBuffer(ImageBufferType& buf) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    hoNDImage<std::complex<float>, 3>* GtPlusImageReconGadget::getImage3DFromImage2D(ImageBufferType& ori, size_t cha, size_t slc, size_t con, size_t phs, size_t rep, size_t set, size_t ave)
+    {
+        ImageType* pImage2D = ori(cha, slc, 0, con, phs, rep, set, ave);
+        GADGET_CHECK_THROW(pImage2D!=NULL);
+
+        size_t RO = pImage2D->get_size(0);
+        size_t E1 = pImage2D->get_size(1);
+        size_t E2 = ori.get_size(2);
+
+        Image3DType* pImage3D = new Image3DType(RO, E1, E2);
+        GADGET_CHECK_THROW(pImage3D!=NULL);
+
+        pImage3D->attrib_ = pImage2D->attrib_;
+
+        size_t e2;
+        for ( e2=0; e2<E2; e2++ )
+        {
+            pImage2D = ori(cha, slc, e2, con, phs, rep, set, ave);
+            GADGET_CHECK_THROW(pImage2D!=NULL);
+
+            memcpy(pImage3D->begin()+e2*RO*E1, pImage2D->begin(), sizeof(ValueType)*RO*E1 );
+        }
+
+        return pImage3D;
+    }
+
+    bool GtPlusImageReconGadget::getImage2DFromImage3D(Image3DType& image3D, ImageBufferType& image2DBuf)
+    {
+        size_t RO = image3D.get_size(0);
+        size_t E1 = image3D.get_size(1);
+        size_t E2 = image3D.get_size(2);
+
+        std::vector<size_t> dim(1);
+        dim[0] = E2;
+        image2DBuf.create(dim);
+
+        size_t e2;
+        for ( e2=0; e2<E2; e2++ )
+        {
+            ImageType* pImage2D = new ImageType(RO, E1);
+            GADGET_CHECK_RETURN_FALSE(pImage2D!=NULL);
+
+            memcpy(pImage2D->begin(), image3D.begin()+e2*RO*E1, sizeof(ValueType)*RO*E1);
+
+            image2DBuf(e2) = pImage2D;
+        }
+
+        return true;
+    }
+
+    size_t GtPlusImageReconGadget::computeSeriesImageNumber (ISMRMRD::ImageHeader& imheader, size_t nCHA, size_t cha, size_t nE2, size_t e2)
+    {
+        size_t nSET = meas_max_idx_.set+1;
+        size_t nREP = meas_max_idx_.repetition+1;
+        size_t nPHS = meas_max_idx_.phase+1;
+        size_t nSLC = meas_max_idx_.slice+1;
+        size_t nCON = meas_max_idx_.contrast+1;
+        if ( nE2 == 0 ) nE2 = 1;
+
+        size_t imageNum = imheader.average*nREP*nSET*nPHS*nCON*nSLC*nE2*nCHA 
+            + imheader.repetition*nSET*nPHS*nCON*nSLC*nE2*nCHA 
+            + imheader.set*nPHS*nCON*nSLC*nE2*nCHA 
+            + imheader.phase*nCON*nSLC*nE2*nCHA 
+            + imheader.contrast*nSLC*nE2*nCHA
+            + imheader.slice*nE2*nCHA 
+            + e2*nCHA 
+            + cha 
+            + 1;
+
+        return imageNum;
+    }
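[Editorial illustration, not part of the upstream source] computeSeriesImageNumber() builds a 1-based image number with channel varying fastest and average slowest. A worked example with hypothetical encoding limits:

    // nCHA=1, nE2=1, nSLC=3, nCON=1, nPHS=1, nSET=1, nREP=2     (derived from meas_max_idx_)
    // header: average=0, repetition=1, set=0, phase=0, contrast=0, slice=2, with e2=0, cha=0
    // imageNum = repetition*(1*1*1*3*1*1) + slice*(1*1) + 1
    //          = 1*3 + 2 + 1 = 6                                 // the last of the 2*3 = 6 images in this series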
+
+    bool GtPlusImageReconGadget::sendOutImages(ImageBufferType& images, int seriesNum, const std::vector<std::string>& processStr, const std::vector<std::string>& dataRole, const std::vector<float>& windowCenter, const std::vector<float>& windowWidth)
+    {
+        try
+        {
+            size_t CHA = images.get_size(0);
+            size_t SLC = images.get_size(1);
+            size_t E2  = images.get_size(2);
+            size_t CON = images.get_size(3);
+            size_t PHS = images.get_size(4);
+            size_t REP = images.get_size(5);
+            size_t SET = images.get_size(6);
+            size_t AVE = images.get_size(7);
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "--> GtPlusImageReconGadget, sending out images, array boundary [CHA SLC E2 CON PHS REP SET AVE] = [" 
+                << CHA << " " << SLC << " " 
+                << E2 << " " << CON << " " 
+                << PHS << " " << REP << " " 
+                << SET << " " << AVE << "] " );
+
+            size_t ave(0), set(0), rep(0), phs(0), con(0), e2(0), slc(0), cha(0);
+            std::vector<size_t> dim2D(2);
+
+            for ( ave=0; ave<AVE; ave++ )
+            {
+                for ( set=0; set<SET; set++ )
+                {
+                    for ( rep=0; rep<REP; rep++ )
+                    {
+                        for ( phs=0; phs<PHS; phs++ )
+                        {
+                            for ( con=0; con<CON; con++ )
+                            {
+                                for ( e2=0; e2<E2; e2++ )
+                                {
+                                    for ( slc=0; slc<SLC; slc++ )
+                                    {
+                                        for ( cha=0; cha<CHA; cha++ )
+                                        {
+                                            ImageType* pImage = images(cha, slc, e2, con, phs, rep, set, ave);
+                                            if ( pImage != NULL )
+                                            {
+                                                Gadgetron::GadgetContainerMessage<ISMRMRD::ImageHeader>* cm1 = new Gadgetron::GadgetContainerMessage<ISMRMRD::ImageHeader>();
+                                                Gadgetron::GadgetContainerMessage<ImgArrayType>* cm2 = new Gadgetron::GadgetContainerMessage<ImgArrayType>();
+                                                Gadgetron::GadgetContainerMessage<ISMRMRD::MetaContainer>* cm3 = new Gadgetron::GadgetContainerMessage<ISMRMRD::MetaContainer>();
+
+                                                try
+                                                {
+                                                    cm1->cont(cm2);
+                                                    cm2->cont(cm3);
+
+                                                    // set the ISMRMRD image header
+                                                    GADGET_CHECK_THROW( gtPlus_util_.setImageHeaderISMRMRDFromMetaAttributes(pImage->attrib_, *cm1->getObjectPtr()) );
+
+                                                    //long long imageNum(0);
+                                                    //if ( pImage->attrib_.attributeInteger_.get(GADGETRON_IMAGENUMBER, 0, imageNum) )
+                                                    //{
+                                                    //    cm1->getObjectPtr()->image_index = (uint16_t)imageNum;
+                                                    //}
+
+                                                    long long imageNum = this->computeSeriesImageNumber (*cm1->getObjectPtr(), CHA, cha, E2, e2);
+                                                    cm1->getObjectPtr()->image_index = (uint16_t)imageNum;
+                                                    pImage->attrib_.set(GADGETRON_IMAGENUMBER, (long)imageNum);
+
+                                                    cm1->getObjectPtr()->image_series_index = seriesNum;
+
+                                                    // set the image data
+                                                    size_t RO = pImage->get_size(0);
+                                                    size_t E1 = pImage->get_size(1);
+
+                                                    dim2D[0] = RO;
+                                                    dim2D[1] = E1;
+
+                                                    cm2->getObjectPtr()->create(dim2D);
+                                                    memcpy(cm2->getObjectPtr()->get_data_ptr(), pImage->get_data_ptr(), pImage->get_number_of_bytes());
+
+                                                    // set the attributes
+                                                    *cm3->getObjectPtr() = pImage->attrib_;
+
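+                                                    // non-regular images (e.g. PSIR, parametric maps) carry extra meta data:
+                                                    // data role, image comments, window level and processing history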
+                                                    if ( !dataRole.empty() && (dataRole[0]!=GADGETRON_IMAGE_REGULAR) )
+                                                    {
+                                                        std::string str;
+
+                                                        // data role
+                                                        bool isRealImage = false;
+                                                        bool isParametricMap = false;
+                                                        bool isParametricT1Map = false;
+                                                        bool isParametricT1SDMap = false;
+                                                        bool isParametricT2Map = false;
+                                                        bool isParametricT2SDMap = false;
+                                                        bool isParametricT2StarMap = false;
+                                                        bool isParametricT2StarMaskMap = false;
+                                                        bool isParametricT2StarSDMap = false;
+                                                        bool isParametricT2StarAMap = false;
+                                                        bool isParametricT2StarTruncMap = false;
+
+                                                        if ( !dataRole.empty() )
+                                                        {
+                                                            size_t n;
+                                                            for ( n=0; n<dataRole.size(); n++ )
+                                                            {
+                                                                if ( dataRole[n] == GADGETRON_IMAGE_PSIR )
+                                                                {
+                                                                    isRealImage = true;
+                                                                }
+
+                                                                if ( (dataRole[n]==GADGETRON_IMAGE_T1MAP) 
+                                                                    || (dataRole[n]==GADGETRON_IMAGE_T1SDMAP)
+                                                                    || (dataRole[n]==GADGETRON_IMAGE_T2MAP)
+                                                                    || (dataRole[n]==GADGETRON_IMAGE_T2SDMAP)
+                                                                    || (dataRole[n]==GADGETRON_IMAGE_T2STARMAP)
+                                                                    || (dataRole[n]==GADGETRON_IMAGE_T2STARMASKMAP)
+                                                                    || (dataRole[n]==GADGETRON_IMAGE_T2STARSDMAP)
+                                                                    || (dataRole[n]==GADGETRON_IMAGE_T2STARAMAP)
+                                                                    || (dataRole[n]==GADGETRON_IMAGE_T2STARTRUNCMAP)
+                                                                    || (dataRole[n]==GADGETRON_IMAGE_FREQMAP)
+                                                                    || (dataRole[n]==GADGETRON_IMAGE_B1MAP)
+                                                                    || (dataRole[n]==GADGETRON_IMAGE_FLIPANGLEMAP) )
+                                                                {
+                                                                    isParametricMap = true;
+                                                                }
+
+                                                                if ( dataRole[n]==GADGETRON_IMAGE_T1MAP )
+                                                                {
+                                                                    isParametricT1Map = true;
+                                                                }
+
+                                                                if ( dataRole[n]==GADGETRON_IMAGE_T1SDMAP )
+                                                                {
+                                                                    isParametricT1SDMap = true;
+                                                                }
+
+                                                                if ( dataRole[n]==GADGETRON_IMAGE_T2MAP )
+                                                                {
+                                                                    isParametricT2Map = true;
+                                                                }
+
+                                                                if ( dataRole[n]==GADGETRON_IMAGE_T2SDMAP )
+                                                                {
+                                                                    isParametricT2SDMap = true;
+                                                                }
+
+                                                                if ( dataRole[n]==GADGETRON_IMAGE_T2STARMAP )
+                                                                {
+                                                                    isParametricT2StarMap = true;
+                                                                }
+
+                                                                if ( dataRole[n]==GADGETRON_IMAGE_T2STARSDMAP )
+                                                                {
+                                                                    isParametricT2StarSDMap = true;
+                                                                }
+
+                                                                if ( dataRole[n]==GADGETRON_IMAGE_T2STARAMAP )
+                                                                {
+                                                                    isParametricT2StarAMap = true;
+                                                                }
+
+                                                                if ( dataRole[n]==GADGETRON_IMAGE_T2STARTRUNCMAP )
+                                                                {
+                                                                    isParametricT2StarTruncMap = true;
+                                                                }
+
+                                                                if ( dataRole[n]==GADGETRON_IMAGE_T2STARMASKMAP )
+                                                                {
+                                                                    isParametricT2StarMaskMap = true;
+                                                                }
+                                                            }
+
+                                                            std::vector<std::string> dataRoleAll;
+                                                            Gadgetron::getISMRMRMetaValues(*cm3->getObjectPtr(), GADGETRON_DATA_ROLE, dataRoleAll);
+
+                                                            if ( !debugFolder_fullPath_.empty() )
+                                                            {
+                                                                std::ostringstream ostr;
+                                                                for ( n=0; n<dataRoleAll.size(); n++ )
+                                                                {
+                                                                    ostr << dataRoleAll[n] << "_";
+                                                                }
+                                                                ostr << cm1->getObjectPtr()->image_index;
+
+                                                                if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(*cm2->getObjectPtr(), debugFolder_fullPath_+ostr.str()); }
+                                                            }
+
+                                                            // double check the image type
+                                                            if ( isRealImage )
+                                                            {
+                                                                cm1->getObjectPtr()->image_type = ISMRMRD::ISMRMRD_IMTYPE_REAL;
+                                                            }
+
+                                                            // image comment
+                                                            if ( isParametricMap )
+                                                            {
+                                                                // reset the image comment for maps
+
+                                                                std::vector<std::string> commentStr(dataRole.size()+1);
+
+                                                                commentStr[0] = "GT";
+                                                                for ( n=0; n<dataRole.size(); n++ )
+                                                                {
+                                                                    commentStr[n+1] = dataRole[n];
+                                                                }
+
+                                                                Gadgetron::setISMRMRMetaValues(*cm3->getObjectPtr(), GADGETRON_IMAGECOMMENT, commentStr);
+
+                                                                // get the scaling ratio
+                                                                float scalingRatio = 1;
+                                                                try
+                                                                {
+                                                                    scalingRatio = (float)(cm3->getObjectPtr()->as_double(GADGETRON_IMAGE_SCALE_RATIO, 0));
+
+                                                                    std::ostringstream ostr;
+                                                                    ostr << "x" << scalingRatio;
+                                                                    std::string scalingStr = ostr.str();
+                                                                    cm3->getObjectPtr()->append(GADGETRON_IMAGECOMMENT, scalingStr.c_str());
+
+                                                                    if ( isParametricT1Map || isParametricT1SDMap || isParametricT2Map || isParametricT2SDMap || isParametricT2StarMap || isParametricT2StarSDMap )
+                                                                    {
+                                                                        std::ostringstream ostr;
+                                                                        ostr << std::setprecision(3) << 1.0f/scalingRatio << "ms";
+                                                                        std::string unitStr = ostr.str();
+
+                                                                        cm3->getObjectPtr()->append(GADGETRON_IMAGECOMMENT, unitStr.c_str());
+                                                                    }
+                                                                }
+                                                                catch(...)
+                                                                {
+                                                                    GWARN_STREAM("Image attrib does not have the scale ratio ...");
+                                                                    scalingRatio = 1;
+                                                                }
+
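+                                                                // the configured window level is scaled by the same ratio applied to the map pixel values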
+                                                                if ( isParametricT1Map || isParametricT2Map || isParametricT2StarMap )
+                                                                {
+                                                                    cm3->getObjectPtr()->set(GADGETRON_IMAGE_WINDOWCENTER, (long)((this->get_double_value("window_center"))*scalingRatio) );
+                                                                    cm3->getObjectPtr()->set(GADGETRON_IMAGE_WINDOWWIDTH, (long)((this->get_double_value("window_width"))*scalingRatio) );
+                                                                }
+
+                                                                if ( isParametricT1SDMap || isParametricT2SDMap || isParametricT2StarSDMap || isParametricT2StarAMap )
+                                                                {
+                                                                    cm3->getObjectPtr()->set(GADGETRON_IMAGE_WINDOWCENTER, (long)((this->get_double_value("sd_window_center"))*scalingRatio) );
+                                                                    cm3->getObjectPtr()->set(GADGETRON_IMAGE_WINDOWWIDTH, (long)((this->get_double_value("sd_window_width"))*scalingRatio) );
+                                                                }
+
+                                                                if ( isParametricT2StarTruncMap )
+                                                                {
+                                                                    cm3->getObjectPtr()->set(GADGETRON_IMAGE_WINDOWCENTER, (long)(4) );
+                                                                    cm3->getObjectPtr()->set(GADGETRON_IMAGE_WINDOWWIDTH, (long)(8) );
+                                                                }
+
+                                                                /* if ( isParametricT2Map )
+                                                                {
+                                                                cm3->getObjectPtr()->attributeInteger_.set(GADGETRON_IMAGE_WINDOWCENTER, 0, (long long)(60*scalingRatio) );
+                                                                cm3->getObjectPtr()->attributeInteger_.set(GADGETRON_IMAGE_WINDOWWIDTH, 0, (long long)(120*scalingRatio) );
+                                                                }
+
+                                                                if ( isParametricT2StarMap )
+                                                                {
+                                                                cm3->getObjectPtr()->attributeInteger_.set(GADGETRON_IMAGE_WINDOWCENTER, 0, (long long)(25*scalingRatio) );
+                                                                cm3->getObjectPtr()->attributeInteger_.set(GADGETRON_IMAGE_WINDOWWIDTH, 0, (long long)(50*scalingRatio) );
+                                                                } */
+                                                            }
+                                                            else
+                                                            {
+                                                                for ( n=0; n<dataRole.size(); n++ )
+                                                                {
+                                                                    cm3->getObjectPtr()->append(GADGETRON_IMAGECOMMENT, dataRole[n].c_str());
+                                                                }
+                                                            }
+
+                                                            // seq description
+                                                            Gadgetron::appendISMRMRMetaValues(*cm3->getObjectPtr(), GADGETRON_SEQUENCEDESCRIPTION, dataRoleAll);
+                                                        }
+
+                                                        GDEBUG_CONDITION_STREAM(verboseMode_, "--> GtPlusImageReconGadget, sending out 2D image [CHA SLC E2 CON PHS REP SET AVE] = [" 
+                                                            << cha << " " 
+                                                            << cm1->getObjectPtr()->slice << " " 
+                                                            << e2 << " " 
+                                                            << cm1->getObjectPtr()->contrast << " " 
+                                                            << cm1->getObjectPtr()->phase << " " 
+                                                            << cm1->getObjectPtr()->repetition << " " 
+                                                            << cm1->getObjectPtr()->set << " " 
+                                                            << cm1->getObjectPtr()->average << "] \t" 
+                                                            << " -- Image number -- " << cm1->getObjectPtr()->image_index);
+
+                                                        // image processing history
+                                                        if ( !processStr.empty() )
+                                                        {
+                                                            size_t n;
+                                                            for ( n=0; n<processStr.size(); n++ )
+                                                            {
+                                                                cm3->getObjectPtr()->append(GADGETRON_IMAGEPROCESSINGHISTORY, processStr[n].c_str());
+                                                            }
+                                                        }
+
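+                                                        // apply caller-provided per-slice window level when one value is supplied per SLC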
+                                                        if ( windowCenter.size()==SLC && windowWidth.size()==SLC )
+                                                        {
+                                                            cm3->getObjectPtr()->set(GADGETRON_IMAGE_WINDOWCENTER, (long)windowCenter[slc]);
+                                                            cm3->getObjectPtr()->set(GADGETRON_IMAGE_WINDOWWIDTH, (long)windowWidth[slc]);
+                                                        }
+                                                    }
+
+                                                    if ( this->next()->putq(cm1) < 0 ) 
+                                                    {
+                                                        cm1->release();
+                                                        return false;
+                                                    }
+                                                }
+                                                catch(...)
+                                                {
+                                                    cm1->release();
+                                                    throw;
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in GtPlusImageReconGadget::sendOutImages(images, seriesNum, processStr, dataRole) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    int GtPlusImageReconGadget::close(unsigned long flags)
+    {
+        GDEBUG_CONDITION_STREAM(true, "GtPlusImageReconGadget - close(flags) : " << flags);
+
+        if ( BaseClass::close(flags) != GADGET_OK ) return GADGET_FAIL;
+
+        if ( flags != 0 )
+        {
+            std::string procTime;
+            gtPlus_util_.getCurrentMoment(procTime);
+
+            GDEBUG_STREAM("* ============================================================================== *");
+            GDEBUG_STREAM("---> Image recon phase, Current processing time : " << procTime << " <---");
+            GDEBUG_STREAM("* ============================================================================== *");
+        }
+
+        return GADGET_OK;
+    }
+
+    bool GtPlusImageReconGadget::exportImageContainer2D(ImageContainer2DType& input, const std::string& prefix)
+    {
+        if ( !this->debugFolder_.empty() )
+        {
+            size_t R = input.rows();
+
+            size_t r;
+
+            hoNDArray<ValueType> outArray;
+
+            for ( r=0; r<R; r++ )
+            {
+                input.to_NDArray(r, outArray);
+
+                std::ostringstream ostr;
+                ostr << prefix << "_" << r;
+
+                if ( !this->debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(outArray, this->debugFolder_fullPath_+ostr.str()); }
+            }
+        }
+
+        return true;
+    }
+
+    bool GtPlusImageReconGadget::exportImageContainer2D(ImageContainer2DMagType& input, const std::string& prefix)
+    {
+        if ( !this->debugFolder_.empty() )
+        {
+            size_t R = input.rows();
+
+            size_t r;
+
+            hoNDArray<T> outArray;
+
+            for ( r=0; r<R; r++ )
+            {
+                input.to_NDArray(r, outArray);
+
+                std::ostringstream ostr;
+                ostr << prefix << "_" << r;
+
+                if ( !this->debugFolder_fullPath_.empty() ) { gt_exporter_.exportArray(outArray, this->debugFolder_fullPath_+ostr.str()); }
+            }
+        }
+
+        return true;
+    }
+
+    bool GtPlusImageReconGadget::exportImageContainer3D(ImageContainer3DType& input, const std::string& prefix)
+    {
+        if ( !this->debugFolder_.empty() )
+        {
+            size_t R = input.rows();
+
+            size_t r, c;
+            for ( r=0; r<R; r++ )
+            {
+                for ( c=0; c<input.cols(r); c++ )
+                {
+                    std::ostringstream ostr;
+                    ostr << prefix << "_" << r << "_" << c;
+
+                    if ( !this->debugFolder_fullPath_.empty() )
+                    {
+                        gt_exporter_.exportImageComplex(input(r, c), this->debugFolder_fullPath_+ostr.str());
+                    }
+                }
+            }
+        }
+
+        return true;
+    }
+
+    bool GtPlusImageReconGadget::exportImageContainer3D(ImageContainer3DMagType& input, const std::string& prefix)
+    {
+        if ( !this->debugFolder_.empty() )
+        {
+            size_t R = input.rows();
+
+            size_t r, c;
+            for ( r=0; r<R; r++ )
+            {
+                for ( c=0; c<input.cols(r); c++ )
+                {
+                    std::ostringstream ostr;
+                    ostr << prefix << "_" << r << "_" << c;
+
+                    if ( !this->debugFolder_fullPath_.empty() ) { gt_exporter_.exportImage(input(r, c), this->debugFolder_fullPath_+ostr.str()); }
+                }
+            }
+        }
+
+        return true;
+    }
+
+}
diff --git a/gadgets/gtPlus/GtPlusImageReconGadget.h b/gadgets/gtPlus/GtPlusImageReconGadget.h
new file mode 100644
index 0000000..2bf7721
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusImageReconGadget.h
@@ -0,0 +1,134 @@
+/** \file   GtPlusImageReconGadget.h
+    \brief  The GtPlus image reconstruction gadget, used after GtPlus kspace reconstruction
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include <complex>
+#include "GtPlusGadgetExport.h"
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "hoNDObjectArray.h"
+#include "ismrmrd/ismrmrd.h"
+#include "GadgetIsmrmrdReadWrite.h"
+
+#include "hoNDImageContainer2D.h"
+#include "hoNDArray_utils.h"
+#include "hoNDImage.h"
+
+#include "GtPlusGadgetImageArray.h"
+
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusISMRMRDReconUtil.h"
+
+#include "GadgetStreamController.h"
+#include "GtPlusReconGadgetUtil.h"
+
+#ifdef USE_OMP
+    #include "omp.h"
+#endif // USE_OMP
+
+namespace Gadgetron
+{
+
+// the dimensional order of buffered images
+// [Cha Slice E2 Con Phase Rep Set Ave]
+//   0    1    2   3   4    5   6   7
+
+class EXPORTGTPLUSGADGET GtPlusImageReconGadget : public Gadget1< hoNDObjectArray< hoNDImage<std::complex<float>, 2> > >
+{
+public:
+    GADGET_DECLARE(GtPlusImageReconGadget);
+
+    typedef float T;
+    typedef std::complex<T> ValueType;
+
+    typedef hoNDImage<ValueType, 2> ImageType;
+    typedef hoNDImage<ValueType, 3> Image3DType;
+
+    typedef hoNDImage<T, 2> ImageMagType;
+    typedef hoNDImage<T, 3> Image3DMagType;
+
+    // typedef hoNDArray<ImageType*> ImageBufferType;
+    typedef hoNDObjectArray<ImageType> ImageBufferType;
+    typedef hoNDArray<ValueType> ImgArrayType;
+
+    typedef hoNDImageContainer2D<ImageType> ImageContainer2DType;
+    typedef hoNDImageContainer2D<Image3DType> ImageContainer3DType;
+
+    typedef hoNDImageContainer2D<ImageMagType> ImageContainer2DMagType;
+    typedef hoNDImageContainer2D<Image3DMagType> ImageContainer3DMagType;
+
+    typedef Gadget1< ImageBufferType > BaseClass;
+
+    GtPlusImageReconGadget();
+    ~GtPlusImageReconGadget();
+
+    virtual int close(unsigned long flags);
+
+    /// image series number
+    int image_series_num_;
+
+    // debug folder
+    std::string debugFolder_;
+    std::string debugFolder_fullPath_;
+
+    // whether to perform timing
+    bool performTiming_;
+
+protected:
+
+    // encoding space size
+    ISMRMRD::EncodingCounters meas_max_idx_;
+
+    // read in parameters
+    bool readParameters();
+
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(GadgetContainerMessage<ImageBufferType>* m1);
+
+    virtual int processImageBuffer(ImageBufferType& ori);
+
+    /// fill image buffer with null pointers
+    bool fillWithNULL(ImageBufferType& buf);
+
+    /// release the image buffer
+    bool releaseImageBuffer(ImageBufferType& buf);
+
+    /// get the 3D images from 2D buffer
+    Image3DType* getImage3DFromImage2D(ImageBufferType& ori, size_t cha, size_t slc, size_t con, size_t phs, size_t rep, size_t set, size_t ave);
+
+    /// get the 2D image in buffer from a 3D image
+    bool getImage2DFromImage3D(Image3DType& image3D, ImageBufferType& image2DBuf);
+
+    /// compute the image number
+    size_t computeSeriesImageNumber (ISMRMRD::ImageHeader& imheader, size_t nCHA, size_t cha, size_t nE2, size_t e2);
+
+    /// send out the images as a Gadget3 message
+    /// windowCenter and windowWidth are given for every SLC
+    virtual bool sendOutImages(ImageBufferType& images, int seriesNum, const std::vector<std::string>& processStr, const std::vector<std::string>& dataRole, const std::vector<float>& windowCenter=std::vector<float>(), const std::vector<float>& windowWidth=std::vector<float>());
+
+    /// utility function to export image container
+    bool exportImageContainer2D(ImageContainer2DType& input, const std::string& prefix);
+    bool exportImageContainer2D(ImageContainer2DMagType& input, const std::string& prefix);
+
+    bool exportImageContainer3D(ImageContainer3DType& input, const std::string& prefix);
+    bool exportImageContainer3D(ImageContainer3DMagType& input, const std::string& prefix);
+
+    // util for gtplus
+    Gadgetron::gtPlus::gtPlusISMRMRDReconUtil< std::complex<float> > gtPlus_util_;
+
+    // clock for timing
+    Gadgetron::GadgetronTimer gt_timer1_;
+    Gadgetron::GadgetronTimer gt_timer2_;
+    Gadgetron::GadgetronTimer gt_timer3_;
+
+    // exporter
+    Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+    // in verbose mode, more info is printed out
+    bool verboseMode_;
+};
+
+}
diff --git a/gadgets/gtPlus/GtPlusRecon2DTCloudPackage.h b/gadgets/gtPlus/GtPlusRecon2DTCloudPackage.h
new file mode 100644
index 0000000..cb4d754
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusRecon2DTCloudPackage.h
@@ -0,0 +1,301 @@
+/** \file   GtPlusRecon2DTCloudPackage.h
+    \brief  This cloud job type is defined to support the dual-layer GtPlus cloud
+
+            Reference:
+
+            Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen.
+            Distributed MRI Reconstruction using Gadgetron based Cloud Computing.
+            Magnetic Resonance in Medicine, doi: 10.1002/mrm.25213.
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+namespace Gadgetron
+{
+
+struct GtPlusRecon2DTPara
+{
+    size_t reconSizeRO_;
+    size_t reconSizeE1_;
+    size_t reconSizeE2_;
+
+    float encodingFOV_RO_;
+    float encodingFOV_E1_;
+    float encodingFOV_E2_;
+
+    float reconFOV_RO_;
+    float reconFOV_E1_;
+    float reconFOV_E2_;
+
+    Gadgetron::ISMRMRDDIM dim_4th_;
+    Gadgetron::ISMRMRDDIM dim_5th_;
+    Gadgetron::ISMRMRDDIM workOrder_ShareDim_;
+
+    bool no_acceleration_averageall_ref_;
+    int no_acceleration_ref_numOfModes_;
+    bool no_acceleration_same_combinationcoeff_allS_;
+    int no_acceleration_whichS_combinationcoeff_;
+
+    bool interleaved_same_combinationcoeff_allS_;
+    int interleaved_whichS_combinationcoeff_;
+    int interleaved_ref_numOfModes_;
+
+    bool embedded_averageall_ref_;
+    int embedded_ref_numOfModes_;
+    bool embedded_fullres_coilmap_;
+    bool embedded_fullres_coilmap_useHighestSignal_;
+    bool embedded_same_combinationcoeff_allS_;
+    int embedded_whichS_combinationcoeff_;
+    bool embedded_ref_fillback_;
+
+    bool separate_averageall_ref_;
+    int separate_ref_numOfModes_;
+    bool separate_fullres_coilmap_;
+    bool separate_same_combinationcoeff_allS_;
+    int separate_whichS_combinationcoeff_;
+
+    bool same_coil_compression_coeff_allS_;
+
+    bool recon_kspace_needed_;
+
+    Gadgetron::gtPlus::gtPlusReconWorkOrderPara workOrderPara_;
+};
+
+template <typename T> 
+struct GtPlusRecon2DTCloudPackage
+{
+    typedef typename realType<T>::Type real_value_type;
+
+    GtPlusRecon2DTPara para;
+
+    hoNDArray<T> kspace;
+    hoNDArray<real_value_type> timeStamp;
+    hoNDArray<real_value_type> physioTimeStamp;
+
+    hoNDArray<T> ref;
+
+    hoNDArray<T> complexIm;
+    hoNDArray<T> res;
+
+    // extra recon images
+    hoNDArray<T> complexImSecond;
+    // optional time stamps for the second recon images, in units of seconds
+    hoNDArray<real_value_type> resTimeStampSecond;
+    hoNDArray<real_value_type> resPhysioTimeStampSecond;
+
+    GtPlusRecon2DTCloudPackage();
+    GtPlusRecon2DTCloudPackage(const GtPlusRecon2DTCloudPackage& pack);
+
+    ~GtPlusRecon2DTCloudPackage();
+
+    GtPlusRecon2DTCloudPackage<T>& operator=(const GtPlusRecon2DTCloudPackage<T>& pack);
+
+    virtual bool serialize(char*& buf, size_t& len) const ;
+    virtual bool deserialize(char* buf, size_t& len);
+};
+
+template <typename T> 
+GtPlusRecon2DTCloudPackage<T>::GtPlusRecon2DTCloudPackage()
+{
+    kspace.clear();
+    timeStamp.clear();
+    physioTimeStamp.clear();
+    ref.clear();
+    complexIm.clear();
+    complexImSecond.clear();
+    resTimeStampSecond.clear();
+    resPhysioTimeStampSecond.clear();
+    res.clear();
+}
+
+template <typename T> 
+GtPlusRecon2DTCloudPackage<T>::~GtPlusRecon2DTCloudPackage()
+{
+
+}
+
+template <typename T> 
+GtPlusRecon2DTCloudPackage<T>::GtPlusRecon2DTCloudPackage(const GtPlusRecon2DTCloudPackage& pack)
+{
+    para = pack.para;
+    kspace = pack.kspace;
+    timeStamp = pack.timeStamp;
+    physioTimeStamp = pack.physioTimeStamp;
+    ref = pack.ref;
+    complexIm = pack.complexIm;
+    complexImSecond = pack.complexImSecond;
+    resTimeStampSecond = pack.resTimeStampSecond;
+    resPhysioTimeStampSecond = pack.resPhysioTimeStampSecond;
+    res = pack.res;
+}
+
+template <typename T> 
+GtPlusRecon2DTCloudPackage<T>& GtPlusRecon2DTCloudPackage<T>::operator=(const GtPlusRecon2DTCloudPackage& pack)
+{
+    if ( this == &pack ) return *this;
+
+    para = pack.para;
+    kspace = pack.kspace;
+    timeStamp = pack.timeStamp;
+    physioTimeStamp = pack.physioTimeStamp;
+    ref = pack.ref;
+    complexIm = pack.complexIm;
+    complexImSecond = pack.complexImSecond;
+    resTimeStampSecond = pack.resTimeStampSecond;
+    resPhysioTimeStampSecond = pack.resPhysioTimeStampSecond;
+    res = pack.res;
+
+    return *this;
+}
+
+template <typename T> 
+bool GtPlusRecon2DTCloudPackage<T>::serialize(char*& buf, size_t& len) const 
+{
+    char *bufKSpace(NULL), *bufTimeStamp(NULL), *bufPhysioTimeStamp(NULL), *bufRef(NULL), *bufComplexIm(NULL), *bufRes(NULL), *bufComplexImSecond(NULL), *bufResTimeStampSecond(NULL), *bufResPhysioTimeStampSecond(NULL);
+    try
+    {
+        if ( buf != NULL ) delete[] buf;
+
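+        // buffer layout: the raw GtPlusRecon2DTPara struct, followed by the serialized arrays in the order
+        // kspace, timeStamp, physioTimeStamp, ref, complexIm, res, complexImSecond, resTimeStampSecond, resPhysioTimeStampSecond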
+        // find the total len
+        size_t lenKSpace, lenTimeStamp, lenPhysioTimeStamp, lenRef, lenComplexIm, lenRes, lenComplexImSecond, lenResTimeStampSecond, lenResPhyisoTimeStampSecond;
+
+        GADGET_CHECK_THROW(kspace.serialize(bufKSpace, lenKSpace));
+        GADGET_CHECK_THROW(timeStamp.serialize(bufTimeStamp, lenTimeStamp));
+        GADGET_CHECK_THROW(physioTimeStamp.serialize(bufPhysioTimeStamp, lenPhysioTimeStamp));
+        GADGET_CHECK_THROW(ref.serialize(bufRef, lenRef));
+        GADGET_CHECK_THROW(complexIm.serialize(bufComplexIm, lenComplexIm));
+        GADGET_CHECK_THROW(res.serialize(bufRes, lenRes));
+        GADGET_CHECK_THROW(complexImSecond.serialize(bufComplexImSecond, lenComplexImSecond));
+        GADGET_CHECK_THROW(resTimeStampSecond.serialize(bufResTimeStampSecond, lenResTimeStampSecond));
+        GADGET_CHECK_THROW(resPhysioTimeStampSecond.serialize(bufResPhysioTimeStampSecond, lenResPhyisoTimeStampSecond));
+
+        // total length
+        len = sizeof(GtPlusRecon2DTPara) + lenTimeStamp + lenPhysioTimeStamp + lenKSpace + lenRef + lenComplexIm + lenRes + lenComplexImSecond + lenResTimeStampSecond + lenResPhyisoTimeStampSecond;
+
+        buf = new char[len];
+        GADGET_CHECK_RETURN_FALSE( buf != NULL );
+
+        size_t offset = 0, currLen=0;
+
+        currLen = sizeof(GtPlusRecon2DTPara);
+        memcpy(buf+offset, &para, currLen);
+        offset += currLen;
+
+        currLen = lenKSpace;
+        memcpy(buf+offset, bufKSpace, currLen);
+        offset += currLen;
+        delete [] bufKSpace;
+
+        currLen = lenTimeStamp;
+        memcpy(buf+offset, bufTimeStamp, currLen);
+        offset += currLen;
+        delete [] bufTimeStamp;
+
+        currLen = lenPhysioTimeStamp;
+        memcpy(buf+offset, bufPhysioTimeStamp, currLen);
+        offset += currLen;
+        delete [] bufPhysioTimeStamp;
+
+        currLen = lenRef;
+        memcpy(buf+offset, bufRef, currLen);
+        offset += currLen;
+        delete [] bufRef;
+
+        currLen = lenComplexIm;
+        memcpy(buf+offset, bufComplexIm, currLen);
+        offset += currLen;
+        delete [] bufComplexIm;
+
+        currLen = lenRes;
+        memcpy(buf+offset, bufRes, currLen);
+        offset += currLen;
+        delete [] bufRes;
+
+        currLen = lenComplexImSecond;
+        memcpy(buf+offset, bufComplexImSecond, currLen);
+        offset += currLen;
+        delete [] bufComplexImSecond;
+
+        currLen = lenResTimeStampSecond;
+        memcpy(buf+offset, bufResTimeStampSecond, currLen);
+        offset += currLen;
+        delete [] bufResTimeStampSecond;
+
+        currLen = lenResPhyisoTimeStampSecond;
+        memcpy(buf+offset, bufResPhysioTimeStampSecond, currLen);
+        offset += currLen;
+        delete [] bufResPhysioTimeStampSecond;
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors happened in GtPlusRecon2DTCloudPackage<T>::serialize(...) ... ");
+
+        if ( bufKSpace != NULL ) delete [] bufKSpace;
+        if ( bufTimeStamp != NULL ) delete [] bufTimeStamp;
+        if ( bufPhysioTimeStamp != NULL ) delete [] bufPhysioTimeStamp;
+        if ( bufRef != NULL ) delete [] bufRef;
+        if ( bufComplexIm != NULL ) delete [] bufComplexIm;
+        if ( bufRes != NULL ) delete [] bufRes;
+        if ( bufComplexImSecond != NULL ) delete [] bufComplexImSecond;
+        if ( bufResTimeStampSecond != NULL ) delete [] bufResTimeStampSecond;
+        if ( bufResPhysioTimeStampSecond != NULL ) delete [] bufResPhysioTimeStampSecond;
+
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool GtPlusRecon2DTCloudPackage<T>::deserialize(char* buf, size_t& len)
+{
+    try
+    {
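+        // read back in the same order as serialize(); on return, len holds the number of bytes consumed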
+        memcpy(&para, buf, sizeof(GtPlusRecon2DTPara));
+
+        size_t offset(sizeof(GtPlusRecon2DTPara)), currLen=0;
+
+        GADGET_CHECK_RETURN_FALSE(kspace.deserialize(buf+offset, currLen));
+        offset += currLen;
+
+        GADGET_CHECK_RETURN_FALSE(timeStamp.deserialize(buf+offset, currLen));
+        offset += currLen;
+
+        GADGET_CHECK_RETURN_FALSE(physioTimeStamp.deserialize(buf+offset, currLen));
+        offset += currLen;
+
+        GADGET_CHECK_RETURN_FALSE(ref.deserialize(buf+offset, currLen));
+        offset += currLen;
+
+        GADGET_CHECK_RETURN_FALSE(complexIm.deserialize(buf+offset, currLen));
+        offset += currLen;
+
+        GADGET_CHECK_RETURN_FALSE(res.deserialize(buf+offset, currLen));
+        offset += currLen;
+
+        GADGET_CHECK_RETURN_FALSE(complexImSecond.deserialize(buf+offset, currLen));
+        offset += currLen;
+
+        GADGET_CHECK_RETURN_FALSE(resTimeStampSecond.deserialize(buf+offset, currLen));
+        offset += currLen;
+
+        GADGET_CHECK_RETURN_FALSE(resPhysioTimeStampSecond.deserialize(buf+offset, currLen));
+        offset += currLen;
+
+        // total length
+        len = offset;
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors happended in GtPlusRecon2DTCloudPackage<T>::deserialize(...) ...");
+        return false;
+    }
+
+    return true;
+}
+
+typedef GtPlusRecon2DTCloudPackage< std::complex<float> > GtPlusRecon2DTCloudPackageCPFL;
+
+}
diff --git a/gadgets/gtPlus/GtPlusRecon2DTGadget.cpp b/gadgets/gtPlus/GtPlusRecon2DTGadget.cpp
new file mode 100644
index 0000000..bbfa39c
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusRecon2DTGadget.cpp
@@ -0,0 +1,550 @@
+
+#include "GtPlusRecon2DTGadget.h"
+
+#ifdef USE_OMP
+    #include <omp.h>
+#endif // USE_OMP
+
+using namespace Gadgetron::gtPlus;
+
+namespace Gadgetron
+{
+
+GtPlusRecon2DTGadget::GtPlusRecon2DTGadget() : BaseClass()
+{
+
+}
+
+GtPlusRecon2DTGadget::~GtPlusRecon2DTGadget()
+{
+
+}
+
+bool GtPlusRecon2DTGadget::readParameters()
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(BaseClass::readParameters());
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "------> GtPlusRecon2DTGadget parameters <------");
+
+        boost::shared_ptr<std::string> str = this->get_string_value("dim_4th");
+        para_.dim_4th_ = gtPlus_util_.getISMRMRDDimFromName(*str);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "dim_4th_ is " << *str);
+
+        str = this->get_string_value("dim_5th");
+        para_.dim_5th_ = gtPlus_util_.getISMRMRDDimFromName(*str);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "dim_5th_ is " << *str);
+
+        str = this->get_string_value("workOrder_ShareDim");
+        para_.workOrder_ShareDim_ = gtPlus_util_.getISMRMRDDimFromName(*str);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "workOrder_ShareDim_ is " << *str);
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+        para_.no_acceleration_averageall_ref_ = this->get_bool_value("no_acceleration_averageall_ref");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "no_acceleration_averageall_ref_ is " << para_.no_acceleration_averageall_ref_);
+
+        para_.no_acceleration_ref_numOfModes_ = this->get_int_value("no_acceleration_ref_numOfModes");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "no_acceleration_ref_numOfModes_ is " << para_.no_acceleration_ref_numOfModes_);
+
+        para_.no_acceleration_same_combinationcoeff_allS_ = this->get_bool_value("no_acceleration_same_combinationcoeff_allS");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "no_acceleration_same_combinationcoeff_allS_ is " << para_.no_acceleration_same_combinationcoeff_allS_);
+
+        para_.no_acceleration_whichS_combinationcoeff_ = this->get_int_value("no_acceleration_whichS_combinationcoeff");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "no_acceleration_whichS_combinationcoeff_ is " << para_.no_acceleration_whichS_combinationcoeff_);
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+        para_.interleaved_same_combinationcoeff_allS_ = this->get_bool_value("interleaved_same_combinationcoeff_allS");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "interleaved_same_combinationcoeff_allS_ is " << para_.interleaved_same_combinationcoeff_allS_);
+
+        para_.interleaved_ref_numOfModes_ = this->get_int_value("interleaved_ref_numOfModes");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "interleaved_ref_numOfModes_ is " << para_.interleaved_ref_numOfModes_);
+
+        para_.interleaved_whichS_combinationcoeff_ = this->get_int_value("interleaved_whichS_combinationcoeff");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "interleaved_whichS_combinationcoeff_ is " << para_.interleaved_whichS_combinationcoeff_);
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+        para_.embedded_averageall_ref_ = this->get_bool_value("embedded_averageall_ref");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "embedded_averageall_ref_ is " << para_.embedded_averageall_ref_);
+
+        para_.embedded_ref_numOfModes_ = this->get_int_value("embedded_ref_numOfModes");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "embedded_ref_numOfModes_ is " << para_.embedded_ref_numOfModes_);
+
+        para_.embedded_fullres_coilmap_ = this->get_bool_value("embedded_fullres_coilmap");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "embedded_fullres_coilmap_ is " << para_.embedded_fullres_coilmap_);
+
+        para_.embedded_fullres_coilmap_useHighestSignal_ = this->get_bool_value("embedded_fullres_coilmap_useHighestSignal");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "embedded_fullres_coilmap_useHighestSignal_ is " << para_.embedded_fullres_coilmap_useHighestSignal_);
+
+        para_.embedded_same_combinationcoeff_allS_ = this->get_bool_value("embedded_same_combinationcoeff_allS");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "embedded_same_combinationcoeff_allS_ is " << para_.embedded_same_combinationcoeff_allS_);
+
+        para_.embedded_whichS_combinationcoeff_ = this->get_int_value("embedded_whichS_combinationcoeff");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "embedded_whichS_combinationcoeff_ is " << para_.embedded_whichS_combinationcoeff_);
+
+        para_.embedded_ref_fillback_ = this->get_bool_value("embedded_ref_fillback");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "embedded_ref_fillback_ is " << para_.embedded_ref_fillback_);
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+        para_.separate_averageall_ref_ = this->get_bool_value("separate_averageall_ref");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "separate_averageall_ref_ is " << para_.separate_averageall_ref_);
+
+        para_.separate_ref_numOfModes_ = this->get_int_value("separate_ref_numOfModes");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "separate_ref_numOfModes_ is " << para_.separate_ref_numOfModes_);
+
+        para_.separate_fullres_coilmap_ = this->get_bool_value("separate_fullres_coilmap");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "separate_fullres_coilmap_ is " << para_.separate_fullres_coilmap_);
+
+        para_.separate_same_combinationcoeff_allS_ = this->get_bool_value("separate_same_combinationcoeff_allS");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "separate_same_combinationcoeff_allS_ is " << para_.separate_same_combinationcoeff_allS_);
+
+        para_.separate_whichS_combinationcoeff_ = this->get_int_value("separate_whichS_combinationcoeff");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "separate_whichS_combinationcoeff_ is " << para_.separate_whichS_combinationcoeff_);
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+        para_.same_coil_compression_coeff_allS_ = this->get_bool_value("same_coil_compression_coeff_allS");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "same_coil_compression_coeff_allS_ is " << para_.same_coil_compression_coeff_allS_);
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+        // get the parameters from base class
+        // BaseClass::readParameters();
+
+        para_.recon_kspace_needed_ = recon_kspace_needed_;
+        para_.workOrderPara_ = workOrderPara_;
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GtPlusRecon2DTGadget::readParameters() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusRecon2DTGadget::setWorkOrder2DTParameters(WorkOrder2DTType* workOrder)
+{
+    workOrder->recon_kspace_needed_ = para_.recon_kspace_needed_;
+
+    if ( para_.workOrderPara_.coil_compression_thres_>0 || para_.workOrderPara_.coil_compression_num_modesKept_>0 )
+    {
+        workOrder->coil_compression_ = true;
+    }
+    else
+    {
+        workOrder->coil_compression_ = false;
+    }
+
+    workOrder->same_coil_compression_coeff_allS_ = para_.same_coil_compression_coeff_allS_;
+
+    workOrder->embedded_averageall_ref_ = para_.embedded_averageall_ref_;
+    workOrder->embedded_ref_numOfModes_ = para_.embedded_ref_numOfModes_;
+    workOrder->embedded_fullres_coilmap_ = para_.embedded_fullres_coilmap_;
+    workOrder->embedded_fullres_coilmap_useHighestSignal_ = para_.embedded_fullres_coilmap_useHighestSignal_;
+    workOrder->embedded_same_combinationcoeff_allS_ = para_.embedded_same_combinationcoeff_allS_;
+    workOrder->embedded_whichS_combinationcoeff_ = para_.embedded_whichS_combinationcoeff_;
+    workOrder->embedded_ref_fillback_ = para_.embedded_ref_fillback_;
+
+    workOrder->separate_averageall_ref_ = para_.separate_averageall_ref_;
+    workOrder->separate_ref_numOfModes_ = para_.separate_ref_numOfModes_;
+    workOrder->separate_fullres_coilmap_ = para_.separate_fullres_coilmap_;
+    workOrder->separate_same_combinationcoeff_allS_ = para_.separate_same_combinationcoeff_allS_;
+    workOrder->separate_whichS_combinationcoeff_ = para_.separate_whichS_combinationcoeff_;
+
+    workOrder->interleaved_same_combinationcoeff_allS_ = para_.interleaved_same_combinationcoeff_allS_;
+    workOrder->interleaved_whichS_combinationcoeff_ = para_.interleaved_whichS_combinationcoeff_;
+    workOrder->interleaved_ref_numOfModes_ = para_.interleaved_ref_numOfModes_;
+
+    workOrder->no_acceleration_averageall_ref_ = para_.no_acceleration_averageall_ref_;
+    workOrder->no_acceleration_ref_numOfModes_ = para_.no_acceleration_ref_numOfModes_;
+    workOrder->no_acceleration_same_combinationcoeff_allS_ = para_.no_acceleration_same_combinationcoeff_allS_;
+    workOrder->no_acceleration_whichS_combinationcoeff_ = para_.no_acceleration_whichS_combinationcoeff_;
+
+    return true;
+}
+
+int GtPlusRecon2DTGadget::process_config(ACE_Message_Block* mb)
+{
+    // [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+    //   0  1  2   3    4   5    6     7  8   9
+    GADGET_CHECK_RETURN(BaseClass::process_config(mb)==GADGET_OK, GADGET_FAIL);
+
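+    // cloud computing is disabled if the node file cannot be parsed or lists no nodes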
+    if ( CloudComputing_ )
+    {
+        bool parseSuccess = this->parseGTCloudNodeFile(cloud_node_file_, gt_cloud_);
+        if ( parseSuccess )
+        {
+            CloudSize_ = (unsigned int)gt_cloud_.size();
+            if ( CloudSize_ == 0 ) CloudComputing_ = false;
+        }
+        else
+        {
+            CloudComputing_ = false;
+        }
+    }
+
+    return GADGET_OK;
+}
+
+int GtPlusRecon2DTGadget::process(Gadgetron::GadgetContainerMessage< GtPlusGadgetImageArray >* m1, Gadgetron::GadgetContainerMessage< WorkOrderType > * m2)
+{
+    GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusRecon2DTGadget::process(...) starts ... ");
+
+    processed_called_times_++;
+
+    GtPlusGadgetImageArray* images = m1->getObjectPtr();
+
+    WorkOrderType* workOrder = m2->getObjectPtr();
+
+    boost::shared_ptr< std::vector<size_t> > dims = workOrder->data_.get_dimensions();
+
+    size_t SEG = (*dims)[9];
+
+    GDEBUG_CONDITION_STREAM(verboseMode_, "[Ro E1 Cha Slice E2 Con Phase Rep Set Seg Ave] = [" 
+                                                << (*dims)[0] << " " << (*dims)[1] << " " << (*dims)[2] << " " 
+                                                << (*dims)[3] << " " << (*dims)[4] << " " << (*dims)[5] << " " 
+                                                << (*dims)[6] << " " << (*dims)[7] << " " << (*dims)[8] << " " 
+                                                << (*dims)[9] << " " << (*dims)[10] << "]");
+
+    dimensions_ = *dims;
+
+    // fill in more parameters
+    para_.reconSizeRO_ = std::max(matrix_size_recon_[0], (*dims)[0]);
+    para_.reconSizeE1_ = reconE1_;
+    para_.reconSizeE2_ = reconE2_;
+    para_.encodingFOV_RO_ = field_of_view_encoding_[0];
+    para_.encodingFOV_E1_ = field_of_view_encoding_[1];
+    para_.encodingFOV_E2_ = field_of_view_encoding_[2];
+    para_.reconFOV_RO_ = field_of_view_recon_[0];
+    para_.reconFOV_E1_ = field_of_view_recon_[1];
+    para_.reconFOV_E2_ = field_of_view_recon_[2];
+
+    para_.workOrderPara_.CalibMode_ = workOrder->CalibMode_;
+    para_.workOrderPara_.InterleaveDim_ = workOrder->InterleaveDim_;
+
+    para_.workOrderPara_.acceFactorE1_ = workOrder->acceFactorE1_;
+    para_.workOrderPara_.acceFactorE2_ = workOrder->acceFactorE2_;
+
+    para_.workOrderPara_.kSpaceCenterRO_ = workOrder->kSpaceCenterRO_;
+    para_.workOrderPara_.kSpaceCenterEncode1_ = workOrder->kSpaceCenterEncode1_;
+    para_.workOrderPara_.kSpaceCenterEncode2_ = workOrder->kSpaceCenterEncode2_;
+
+    para_.workOrderPara_.kSpaceMaxRO_ = workOrder->kSpaceMaxRO_;
+    para_.workOrderPara_.kSpaceMaxEncode1_ = workOrder->kSpaceMaxEncode1_;
+    para_.workOrderPara_.kSpaceMaxEncode2_ = workOrder->kSpaceMaxEncode2_;
+
+    para_.workOrderPara_.start_RO_ = workOrder->start_RO_;
+    para_.workOrderPara_.end_RO_ = workOrder->end_RO_;
+
+    para_.workOrderPara_.start_E1_ = workOrder->start_E1_;
+    para_.workOrderPara_.end_E1_ = workOrder->end_E1_;
+
+    para_.workOrderPara_.start_E2_ = workOrder->start_E2_;
+    para_.workOrderPara_.end_E2_ = workOrder->end_E2_;
+
+    para_.workOrderPara_.retro_gated_images_ = workOrder->retro_gated_images_;
+    para_.workOrderPara_.retro_gated_segment_size_ = workOrder->retro_gated_segment_size_;
+
+    para_.workOrderPara_.workFlow_BufferKernel_ = workOrder->workFlow_BufferKernel_;
+    para_.workOrderPara_.workFlow_use_BufferedKernel_ = workOrder->workFlow_use_BufferedKernel_;
+    para_.workOrderPara_.num_channels_res_ = workOrder->num_channels_res_;
+
+    bool perform_retro_gating = (para_.workOrderPara_.retro_gated_images_>0);
+
+    // ---------------------------------------------------------
+    // set the work flow
+    // ---------------------------------------------------------
+    workflow_.reconSizeRO_ = para_.reconSizeRO_;
+    workflow_.reconSizeE1_ = para_.reconSizeE1_;
+    workflow_.reconSizeE2_ = para_.reconSizeE2_;
+    workflow_.encodingFOV_RO_ = para_.encodingFOV_RO_;
+    workflow_.encodingFOV_E1_ = para_.encodingFOV_E1_;
+    workflow_.encodingFOV_E2_ = para_.encodingFOV_E2_;
+    workflow_.reconFOV_RO_ = para_.reconFOV_RO_;
+    workflow_.reconFOV_E1_ = para_.reconFOV_E1_;
+    workflow_.reconFOV_E2_ = para_.reconFOV_E2_;
+
+    workflow_.dataDimStartingIndexes_ = workOrder->dataDimStartingIndexes_;
+    workflow_.dim4th_ = para_.dim_4th_;
+    workflow_.dim5th_ = para_.dim_5th_;
+    workflow_.WorkOrderShareDim_ = para_.workOrder_ShareDim_;
+    workflow_.performTiming_ = performTiming_;
+
+    // ---------------------------------------------------------
+    // set work order
+    // ---------------------------------------------------------
+    workOrder->copyFromPara(para_.workOrderPara_);
+
+    workOrder->CloudComputing_ = CloudComputing_;
+    workOrder->CloudSize_ = CloudSize_;
+    workOrder->gt_cloud_ = gt_cloud_;
+
+    // ---------------------------------------------------------
+    // set the worker
+    // ---------------------------------------------------------
+    worker_grappa_.verbose_ = verboseMode_;
+    worker_grappa_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_grappa_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_noacceleration_.verbose_ = verboseMode_;
+    worker_noacceleration_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_noacceleration_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_spirit_.verbose_ = verboseMode_;
+    worker_spirit_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_spirit_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_spirit_L1_ncg_.verbose_ = verboseMode_;
+    worker_spirit_L1_ncg_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_spirit_L1_ncg_.debugFolder_ = debugFolder_fullPath_;
+
+    if ( !debugFolder_fullPath_.empty() ) workflow_.debugFolder_ = debugFolder_fullPath_;
+
+    // if 'other' data is coming in
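+    // 'other' data gets a plain no-acceleration FFT recon and is sent out with the GADGETRON_IMAGE_OTHER role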
+    if ( workOrder->other_.get_number_of_elements() > 0 )
+    {
+        workOrder->duplicate(workOrder_recon_other_);
+        setWorkOrder2DTParameters(&workOrder_recon_other_);
+        workflow_.workOrder_ = &workOrder_recon_other_;
+
+        // perform a simple FFT recon
+        workOrder_recon_other_.CalibMode_ = ISMRMRD_noacceleration;
+        workOrder_recon_other_.acceFactorE1_ = 1;
+        workOrder_recon_other_.acceFactorE2_ = 1;
+
+        workOrder_recon_other_.start_RO_ = -1;
+        workOrder_recon_other_.end_RO_ = -1;
+        workOrder_recon_other_.start_E1_ = -1;
+        workOrder_recon_other_.end_E1_ = -1;
+        workOrder_recon_other_.start_E2_ = -1;
+        workOrder_recon_other_.end_E2_ = -1;
+
+        workflow_.worker_ = &worker_noacceleration_;
+        workflow_.setDataArray(workOrder->other_);
+        GADGET_CHECK_RETURN(workflow_.recon(), GADGET_FAIL);
+
+        //hoNDArray<ValueType> resResized;
+        //GADGET_CHECK_RETURN(gtPlus_util_complex_.zpadResize2D(workflow_.res_, workflow_.reconSizeRO_, workflow_.reconSizeE1_, resResized), GADGET_FAIL);
+        //GADGET_CHECK_RETURN(this->sendOutRecon(images, resResized, image_series_+1, workOrder->dataDimStartingIndexes_, "Other"), GADGET_FAIL);
+
+        GADGET_CHECK_RETURN(this->scalingImages(workflow_.res_), GADGET_FAIL);
+        GADGET_CHECK_RETURN(this->sendOutRecon(images, workflow_.res_, image_series_+1, workOrder->dataDimStartingIndexes_, "Other", GADGETRON_IMAGE_OTHER), GADGET_FAIL);
+
+        workflow_.res_.clear();
+        workflow_.data_ = NULL;
+        workflow_.ref_ = NULL;
+        workflow_.workOrder_ = NULL;
+
+        workOrder_recon_other_.reset();
+    }
+
+    // ------------------------------------------------------------------
+    // perform the recon
+    // ------------------------------------------------------------------
+    if ( performTiming_ ) { gt_timer1_.start("Recon 2DT workorder ..."); }
+
+    GADGET_CHECK_RETURN(this->generateKSpaceFilter(*workOrder), GADGET_FAIL);
+
+    /// set the work order
+    workOrder->duplicate(workOrder_recon_);
+    setWorkOrder2DTParameters(&workOrder_recon_);
+
+    workflow_.workOrder_ = &workOrder_recon_;
+
+    if ( verboseMode_ )
+    {
+        workflow_.workOrder_->print(std::cout);
+    }
+
+    /// set the data
+    workflow_.setDataArray(workOrder->data_, workOrder->time_stamp_, workOrder->physio_time_stamp_);
+
+    if ( workOrder->ref_.get_number_of_elements() > 0 )
+    {
+        workflow_.setRefArray(workOrder->ref_);
+    }
+    else if ( CalibMode_==Gadgetron::ISMRMRD_interleaved )
+    {
+        workOrder->ref_ = workOrder->data_;
+        workflow_.setRefArray(workOrder->ref_);
+    }
+
+    // set the work flow for worker and workOrder
+    if ( workOrder->acceFactorE1_ > 1 )
+    {
+        if ( para_.workOrderPara_.recon_algorithm_ == Gadgetron::ISMRMRD_SPIRIT )
+        {
+            workflow_.worker_ = &worker_spirit_;
+        }
+        else if ( para_.workOrderPara_.recon_algorithm_ == Gadgetron::ISMRMRD_L1SPIRIT )
+        {
+            workflow_.worker_ = &worker_spirit_L1_ncg_;
+        }
+        else
+        {
+            workflow_.worker_ = &worker_grappa_;
+        }
+    }
+    else
+    {
+        workflow_.worker_ = &worker_noacceleration_;
+    }
+
+    if ( workflow_.worker_ != &worker_grappa_ )
+    {
+        GWARN_STREAM("The gfactor computation is currently only available for grappa reconstruction ... ");
+        workflow_.workOrder_->gfactor_needed_ = false;
+
+        GWARN_STREAM("The wrap-around map computation is currently only available for grappa reconstruction ... ");
+        workflow_.workOrder_->wrap_around_map_needed_ = false;
+    }
+
+    /// perform the recon
+    GADGET_CHECK_RETURN(workflow_.preProcessing(), GADGET_FAIL);
+    GADGET_CHECK_RETURN(workflow_.recon(), GADGET_FAIL);
+    GADGET_CHECK_RETURN(workflow_.postProcessing(), GADGET_FAIL);
+
+    if ( performTiming_ ) { gt_timer1_.stop(); }
+
+    if ( !debugFolder2_fullPath_.empty() )
+    {
+        std::ostringstream ostr;
+        ostr << "Recon2DT_" << processed_called_times_;
+
+        hoNDArray< std::complex<float> > res = workflow_.res_;
+        res.squeeze();
+        if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder2_fullPath_+ostr.str()); }
+
+        if ( workflow_.workOrder_->gfactor_needed_ )
+        {
+            std::ostringstream ostr;
+            ostr << "Recon2DT_GFactor_" << processed_called_times_;
+
+            hoNDArray< std::complex<float> > gfactor = workflow_.gfactor_;
+            gfactor.squeeze();
+            if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(gfactor, debugFolder2_fullPath_+ostr.str()); }
+        }
+
+        if ( workflow_.workOrder_->wrap_around_map_needed_ )
+        {
+            std::ostringstream ostr;
+            ostr << "Recon2DT_WrapAroundMap_" << processed_called_times_;
+
+            hoNDArray< std::complex<float> > wrap_around_map = workflow_.wrap_around_map_;
+            wrap_around_map.squeeze();
+            if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(wrap_around_map, debugFolder2_fullPath_+ostr.str()); }
+        }
+
+        if ( workflow_.res_second_.get_number_of_elements() > 0 )
+        {
+            hoNDArray< std::complex<float> > res = workflow_.res_second_;
+            res.squeeze();
+
+            std::ostringstream ostr;
+            ostr << "Recon2DT_second_" << processed_called_times_;
+
+            if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder2_fullPath_+ostr.str()); }
+        }
+    }
+
+    // compute SNR image and stdmap
+    hoNDArray<ValueType> snrImage, stdMap;
+    bool snrImageComputed = false;
+    bool stdMapComputed = false;
+
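+    // the SNR map relies on the gfactor from parallel imaging; for unaccelerated data (acceFactorE1_==1) only the std map is sent out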
+    if ( workflow_.workOrder_->gfactor_needed_ || workOrder->acceFactorE1_==1 )
+    {
+        if ( scalingFactor_snr_image_>0 || scalingFactor_std_map_>0)
+        {
+            bool withAcceleration = (workOrder->acceFactorE1_>1);
+
+            if ( !this->computeSNRImage(workflow_.res_, workflow_.gfactor_, 
+                    start_frame_for_std_map_, withAcceleration, snrImage, stdMap) )
+            {
+                snrImage.clear();
+                stdMap.clear();
+            }
+            else
+            {
+                snrImageComputed = true;
+                stdMapComputed = true;
+            }
+
+            if ( workOrder->acceFactorE1_==1 ) snrImageComputed = false;
+        }
+    }
+
+    // send out the results
+    GADGET_CHECK_RETURN(this->scalingImages(workflow_.res_), GADGET_FAIL);
+
+    if ( send_out_recon_ )
+    {
+        if ( perform_retro_gating )
+        {
+            GADGET_CHECK_RETURN(this->sendOutRecon(images, workflow_.res_, workflow_.res_time_stamp_, workflow_.res_physio_time_stamp_, image_series_, workOrder->dataDimStartingIndexes_, "ImageRetro", GADGETRON_IMAGE_RETRO), GADGET_FAIL);
+        }
+        else
+        {
+            GADGET_CHECK_RETURN(this->sendOutRecon(images, workflow_.res_, workflow_.res_time_stamp_, workflow_.res_physio_time_stamp_, image_series_, workOrder->dataDimStartingIndexes_, "Image", GADGETRON_IMAGE_REGULAR), GADGET_FAIL);
+        }
+
+        if ( workflow_.workOrder_->gfactor_needed_ )
+        {
+            Gadgetron::scal((float)scalingFactor_gfactor_, workflow_.gfactor_);
+            GADGET_CHECK_RETURN(this->sendOutRecon(images, workflow_.gfactor_, workflow_.res_time_stamp_, workflow_.res_physio_time_stamp_, image_series_+1, workOrder->dataDimStartingIndexes_, "gfactor", GADGETRON_IMAGE_GFACTOR), GADGET_FAIL);
+        }
+
+        if ( workflow_.workOrder_->wrap_around_map_needed_ )
+        {
+            Gadgetron::scal((float)scalingFactor_wrap_around_map_, workflow_.wrap_around_map_);
+            GADGET_CHECK_RETURN(this->sendOutRecon(images, workflow_.wrap_around_map_, workflow_.res_time_stamp_, workflow_.res_physio_time_stamp_, image_series_+2, workOrder->dataDimStartingIndexes_, "wrap_around_map", GADGETRON_IMAGE_WRAPAROUNDMAP), GADGET_FAIL);
+        }
+
+        if ( scalingFactor_snr_image_>0 && snrImage.get_number_of_elements()>0 && snrImageComputed )
+        {
+            Gadgetron::scal((float)scalingFactor_snr_image_, snrImage);
+            GADGET_CHECK_RETURN(this->sendOutRecon(images, snrImage, workflow_.res_time_stamp_, workflow_.res_physio_time_stamp_, image_series_+3, workOrder->dataDimStartingIndexes_, "snr_map", GADGETRON_IMAGE_SNR_MAP), GADGET_FAIL);
+        }
+
+        if ( scalingFactor_std_map_>0 && stdMap.get_number_of_elements()>0 && stdMapComputed )
+        {
+            Gadgetron::scal((float)scalingFactor_std_map_, stdMap);
+            GADGET_CHECK_RETURN(this->sendOutRecon(images, stdMap, workflow_.res_time_stamp_, workflow_.res_physio_time_stamp_, image_series_+4, workOrder->dataDimStartingIndexes_, "std_map", GADGETRON_IMAGE_STD_MAP), GADGET_FAIL);
+        }
+    }
+
+    if ( send_out_recon_second_ )
+    {
+        if ( workflow_.res_second_.get_number_of_elements() > 0 )
+        {
+            Gadgetron::scal((float)scalingFactor_, workflow_.res_second_);
+            GADGET_CHECK_RETURN(this->sendOutRecon(images, workflow_.res_second_, workflow_.res_time_stamp_second_, workflow_.res_physio_time_stamp_second_, image_series_+5, workOrder->dataDimStartingIndexes_, "ImageSecond", GADGETRON_IMAGE_REGULAR), GADGET_FAIL);
+        }
+    }
+
+    GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusRecon2DTGadget::process(...) ends ... ");
+
+    // reset the status
+    workflow_.data_ = NULL;
+    workflow_.time_stamp_ = NULL;
+    workflow_.physio_time_stamp_ = NULL;
+    workflow_.ref_ = NULL;
+    workflow_.noise_ = NULL;
+    workflow_.workOrder_ = NULL;
+    // Gadgetron::clear(&workflow_.res_);
+
+    m1->release();
+    return GADGET_OK;
+}
+
+GADGET_FACTORY_DECLARE(GtPlusRecon2DTGadget)
+
+}
diff --git a/gadgets/gtPlus/GtPlusRecon2DTGadget.h b/gadgets/gtPlus/GtPlusRecon2DTGadget.h
new file mode 100644
index 0000000..4bc36c6
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusRecon2DTGadget.h
@@ -0,0 +1,63 @@
+/** \file   GtPlusRecon2DTGadget.h
+    \brief  This gadget encapsulates the reconstruction for 2DT cases.
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "GtPlusReconGadget.h"
+#include "gtPlusISMRMRDReconWorkFlowCartesian2DT.h"
+#include "gtPlusISMRMRDReconWorker2DTGRAPPA.h"
+#include "gtPlusISMRMRDReconWorker2DTSPIRIT.h"
+#include "gtPlusISMRMRDReconWorker2DTNoAcceleration.h"
+#include "gtPlusISMRMRDReconWorker2DTL1SPIRITNCG.h"
+#include "GtPlusRecon2DTCloudPackage.h"
+
+namespace Gadgetron
+{
+
+class EXPORTGTPLUSGADGET GtPlusRecon2DTGadget : public GtPlusReconGadget
+{
+public:
+    GADGET_DECLARE(GtPlusRecon2DTGadget);
+
+    typedef GtPlusReconGadget BaseClass;
+
+    typedef BaseClass::ValueType ValueType;
+
+    typedef BaseClass::WorkOrderType WorkOrderType;
+    typedef Gadgetron::gtPlus::gtPlusReconWorkOrder2DT<ValueType> WorkOrder2DTType;
+
+    typedef BaseClass::DimensionRecordType DimensionRecordType;
+
+    GtPlusRecon2DTGadget();
+    ~GtPlusRecon2DTGadget();
+
+    GtPlusRecon2DTPara para_;
+
+protected:
+
+    virtual bool readParameters();
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(Gadgetron::GadgetContainerMessage< GtPlusGadgetImageArray >* m1, Gadgetron::GadgetContainerMessage< WorkOrderType > * m2);
+
+    // set 2DT specific work order parameters
+    bool setWorkOrder2DTParameters(WorkOrder2DTType* workOrder);
+
+    // work flow
+    Gadgetron::gtPlus::gtPlusISMRMRDReconWorkFlowCartesian2DT<ValueType> workflow_;
+
+    // worker
+    Gadgetron::gtPlus::gtPlusReconWorker2DTGRAPPA<ValueType> worker_grappa_;
+    Gadgetron::gtPlus::gtPlusReconWorker2DTNoAcceleration<ValueType> worker_noacceleration_;
+    Gadgetron::gtPlus::gtPlusReconWorker2DTSPIRIT<ValueType> worker_spirit_;
+    Gadgetron::gtPlus::gtPlusReconWorker2DTL1SPIRITNCG<ValueType> worker_spirit_L1_ncg_;
+
+    // workOrder for recon
+    WorkOrder2DTType workOrder_recon_;
+
+    // workOrder for recon 'other' data
+    WorkOrder2DTType workOrder_recon_other_;
+};
+
+}
diff --git a/gadgets/gtPlus/GtPlusRecon2DTGadgetCloud.cpp b/gadgets/gtPlus/GtPlusRecon2DTGadgetCloud.cpp
new file mode 100644
index 0000000..c5262e8
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusRecon2DTGadgetCloud.cpp
@@ -0,0 +1,726 @@
+
+#include "GtPlusRecon2DTGadgetCloud.h"
+
+#ifdef USE_OMP
+    #include <omp.h>
+#endif // USE_OMP
+
+using namespace Gadgetron::gtPlus;
+
+namespace Gadgetron
+{
+
+GtPlusRecon2DTGadgetCloud::GtPlusRecon2DTGadgetCloud() : BaseClass(), curr_node_(0), num_of_jobs_(0)
+{
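+    // pre-allocate bookkeeping slots for up to 1024 cloud jobs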
+    packages_sent_.resize(1024);
+    packages_received_.resize(1024);
+    packages_passed_to_next_gadget_.resize(1024);
+    gt_timer_2DT_cloud_.set_timing_in_destruction(false);
+}
+
+GtPlusRecon2DTGadgetCloud::~GtPlusRecon2DTGadgetCloud()
+{
+
+}
+
+int GtPlusRecon2DTGadgetCloud::process_config(ACE_Message_Block* mb)
+{
+    // [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+    //   0  1  2   3    4   5    6     7  8   9
+    GADGET_CHECK_RETURN(BaseClass::process_config(mb)==GADGET_OK, GADGET_FAIL);
+
+    if ( CloudComputing_ )
+    {
+        bool parseSuccess = this->parseGTCloudNodeFile(cloud_node_file_, gt_cloud_);
+        if ( parseSuccess )
+        {
+            CloudSize_ = (unsigned int)gt_cloud_.size();
+            if ( CloudSize_ == 0 ) CloudComputing_ = false;
+        }
+
+        if ( CloudComputing_ )
+        {
+            // set up the cloud
+            if (controller_.open () == -1)
+            {
+                GERROR_STREAM("Cloud controller cannot open the cloud ...");
+                controller_.handle_close (ACE_INVALID_HANDLE, 0);
+                CloudComputing_ = false;
+            }
+            else
+            {
+                readers_.resize(CloudSize_, NULL);
+                writers_.resize(CloudSize_, NULL);
+
+                unsigned int j;
+                for ( j=0; j<CloudSize_; j++ )
+                {
+                    readers_[j] = new GtPlus2DTGadgetCloudJobMessageReaderCPFL();
+                    writers_[j] = new GtPlus2DTGadgetCloudJobMessageWriterCPFL();
+                }
+
+                if ( controller_.createConnector(gt_cloud_, GADGET_MESSAGE_GADGETCLOUD_JOB, readers_, GADGET_MESSAGE_GADGETCLOUD_JOB, writers_) != 0 )
+                {
+                    GERROR_STREAM("Cloud controller_ failed to create connectors ...");
+                    controller_.handle_close (ACE_INVALID_HANDLE, 0);
+                    CloudComputing_ = false;
+                }
+                else if ( controller_.connectToCloud(gt_cloud_) != 0 )
+                {
+                    GERROR_STREAM("Cloud controller_ cannot connect to the cloud ...");
+                    controller_.handle_close (ACE_INVALID_HANDLE, 0);
+                    CloudComputing_ = false;
+                }
+            }
+        }
+    }
+
+    return GADGET_OK;
+}
+
+int GtPlusRecon2DTGadgetCloud::process(Gadgetron::GadgetContainerMessage< GtPlusGadgetImageArray >* m1, Gadgetron::GadgetContainerMessage< WorkOrderType > * m2)
+{
+    GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusRecon2DTGadgetCloud::process(...) starts ... ");
+
+    processed_called_times_++;
+
+    // start a gadget level timer
+    if ( processed_called_times_ == 1 )
+    {
+        gt_timer_2DT_cloud_.start("GtPlusRecon2DTGadgetCloud::process(...) gadget level timer ... ");
+    }
+
+    // send out the package to current node
+    if ( CloudComputing_ )
+    {
+        GtPlusGadgetImageArray* images = m1->getObjectPtr();
+
+        WorkOrderType* workOrder = m2->getObjectPtr();
+
+        boost::shared_ptr< std::vector<size_t> > dims = workOrder->data_.get_dimensions();
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "[Ro E1 Cha Slice E2 Con Phase Rep Set Seg Ave] = [" 
+            << (*dims)[0] << " " << (*dims)[1] << " " << (*dims)[2] << " " << (*dims)[3] << " " << (*dims)[4] 
+            << " " << (*dims)[5] << " " << (*dims)[6] << " " << (*dims)[7] << " " << (*dims)[8] << " " << (*dims)[9] << " " << (*dims)[10] << "]");
+
+        dimensions_ = *dims;
+
+        // fill in more parameters
+        para_.reconSizeRO_ = std::max(matrix_size_recon_[0], (*dims)[0]);
+        para_.reconSizeE1_ = reconE1_;
+        para_.reconSizeE2_ = reconE2_;
+        para_.encodingFOV_RO_ = field_of_view_encoding_[0];
+        para_.encodingFOV_E1_ = field_of_view_encoding_[1];
+        para_.encodingFOV_E2_ = field_of_view_encoding_[2];
+        para_.reconFOV_RO_ = field_of_view_recon_[0];
+        para_.reconFOV_E1_ = field_of_view_recon_[1];
+        para_.reconFOV_E2_ = field_of_view_recon_[2];
+
+        para_.workOrderPara_.CalibMode_ = workOrder->CalibMode_;
+        para_.workOrderPara_.InterleaveDim_ = workOrder->InterleaveDim_;
+
+        para_.workOrderPara_.acceFactorE1_ = workOrder->acceFactorE1_;
+        para_.workOrderPara_.acceFactorE2_ = workOrder->acceFactorE2_;
+
+        para_.workOrderPara_.kSpaceCenterRO_ = workOrder->kSpaceCenterRO_;
+        para_.workOrderPara_.kSpaceCenterEncode1_ = workOrder->kSpaceCenterEncode1_;
+        para_.workOrderPara_.kSpaceCenterEncode2_ = workOrder->kSpaceCenterEncode2_;
+
+        para_.workOrderPara_.kSpaceMaxRO_ = workOrder->kSpaceMaxRO_;
+        para_.workOrderPara_.kSpaceMaxEncode1_ = workOrder->kSpaceMaxEncode1_;
+        para_.workOrderPara_.kSpaceMaxEncode2_ = workOrder->kSpaceMaxEncode2_;
+
+        para_.workOrderPara_.start_RO_ = workOrder->start_RO_;
+        para_.workOrderPara_.end_RO_ = workOrder->end_RO_;
+
+        para_.workOrderPara_.start_E1_ = workOrder->start_E1_;
+        para_.workOrderPara_.end_E1_ = workOrder->end_E1_;
+
+        para_.workOrderPara_.start_E2_ = workOrder->start_E2_;
+        para_.workOrderPara_.end_E2_ = workOrder->end_E2_;
+
+        para_.workOrderPara_.workFlow_BufferKernel_ = workOrder->workFlow_BufferKernel_;
+        para_.workOrderPara_.workFlow_use_BufferedKernel_ = workOrder->workFlow_use_BufferedKernel_;
+        para_.workOrderPara_.num_channels_res_ = workOrder->num_channels_res_;
+
+        // set up a cloud package
+        CloudPackageType package;
+        package.para = para_;
+
+        packages_sent_[num_of_jobs_] = package;
+        packages_sent_[num_of_jobs_].kspace = workOrder->data_;
+
+        packages_received_[num_of_jobs_] = package;
+
+        packages_passed_to_next_gadget_[num_of_jobs_].first = num_of_jobs_;
+        packages_passed_to_next_gadget_[num_of_jobs_].second = false;
+
+        // store image headers
+        GtPlusGadgetImageArray imArray;
+        image_headers_.push_back(imArray);
+        image_headers_[image_headers_.size()-1].copy(*images);
+
+        // send the package to current node
+        std::vector<CloudPackageType* > jobListCloud(1);
+        std::vector<CloudPackageType* > completedJobListCloud(1);
+        std::vector<int> node_ids(1, curr_node_);
+
+        jobListCloud[0] = &packages_sent_[num_of_jobs_];
+        completedJobListCloud[0] = &packages_received_[num_of_jobs_];
+
+        // set the data and ref arrays
+
+        // put the data into compressed format (keep only the sampled lines for interleaved undersampling)
+        if ( workOrder->acceFactorE1_>1 && workOrder->CalibMode_==Gadgetron::ISMRMRD_interleaved )
+        {
+            Gadgetron::extractSampledLinesUpTo11DArray(workOrder->data_, jobListCloud[0]->kspace, workOrder->time_stamp_, workOrder->acceFactorE1_, workOrder->acceFactorE2_);
+        }
+        else
+        {
+            jobListCloud[0]->kspace = workOrder->data_;
+        }
+
+        jobListCloud[0]->timeStamp = workOrder->time_stamp_;
+        jobListCloud[0]->physioTimeStamp = workOrder->physio_time_stamp_;
+        if ( workOrder->ref_.get_number_of_elements() > 0 )
+        {
+            jobListCloud[0]->ref = workOrder->ref_;
+        }
+        else if ( CalibMode_==Gadgetron::ISMRMRD_interleaved )
+        {
+            // jobListCloud[0]->ref = workOrder->data_;
+            jobListCloud[0]->ref.clear();
+        }
+
+        num_of_jobs_++;
+
+        if ( controller_.runJobsOnCloud(jobListCloud, completedJobListCloud, node_ids) != 0 )
+        {
+            GERROR_STREAM("Cloud controller failed to run jobs on the cloud ...");
+            controller_.handle_close (ACE_INVALID_HANDLE, 0);
+
+            // run locally
+            int retval = BaseClass::process(m1, m2);
+            packages_passed_to_next_gadget_[num_of_jobs_ - 1].second = true; // num_of_jobs_ was already incremented above
+
+            return retval;
+        }
+
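+        // advance to the next cloud node; incoming packages are dispatched round-robin across the cloud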
+        curr_node_++;
+        if ( curr_node_ >= CloudSize_ ) curr_node_ = 0;
+
+        m1->release();
+    }
+    else
+    {
+        return BaseClass::process(m1, m2);
+    }
+
+    return GADGET_OK;
+}
+
+bool GtPlusRecon2DTGadgetCloud::processJob(CloudPackageType& jobSent, CloudPackageType& jobReceived)
+{
+    try
+    {
+        GtPlusRecon2DTCloudPackageCPFL* job = &jobSent;
+
+        boost::shared_ptr< std::vector<size_t> > dims = job->kspace.get_dimensions();
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "job array size : [Ro E1 Cha Slice E2 Con Phase Rep Set Seg Ave] = [" 
+            << (*dims)[0] << " " << (*dims)[1] << " " << (*dims)[2] << " " << (*dims)[3] << " " << (*dims)[4] 
+            << " " << (*dims)[5] << " " << (*dims)[6] << " " << (*dims)[7] << " " << (*dims)[8] << " " << (*dims)[9] << " " << (*dims)[10] << "]");
+
+        GtPlusRecon2DTPara& para = job->para;
+
+        // ---------------------------------------------------------
+        // set the work flow
+        // ---------------------------------------------------------
+        workflow_.reconSizeRO_ = para.reconSizeRO_;
+        workflow_.reconSizeE1_ = para.reconSizeE1_;
+        workflow_.reconSizeE2_ = para.reconSizeE2_;
+        workflow_.encodingFOV_RO_ = para.encodingFOV_RO_;
+        workflow_.encodingFOV_E1_ = para.encodingFOV_E1_;
+        workflow_.encodingFOV_E2_ = para.encodingFOV_E2_;
+        workflow_.reconFOV_RO_ = para.reconFOV_RO_;
+        workflow_.reconFOV_E1_ = para.reconFOV_E1_;
+        workflow_.reconFOV_E2_ = para.reconFOV_E2_;
+
+        // workflow_.dataDimStartingIndexes_ = workOrder->dataDimStartingIndexes_;
+        workflow_.dim4th_ = para.dim_4th_;
+        workflow_.dim5th_ = para.dim_5th_;
+        workflow_.WorkOrderShareDim_ = para.workOrder_ShareDim_;
+        workflow_.performTiming_ = performTiming_;
+
+        // ---------------------------------------------------------
+        // set work order
+        // ---------------------------------------------------------
+        WorkOrder2DTType workOrder;
+
+        workOrder.copyFromPara(para.workOrderPara_);
+
+        workOrder.CloudComputing_ = CloudComputing_;
+        workOrder.CloudSize_ = CloudSize_;
+        workOrder.gt_cloud_ = gt_cloud_;
+
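+        // restore the full k-space array; for accelerated interleaved acquisitions only the sampled lines were transferred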
+        if ( workOrder.acceFactorE1_ <= 1 )
+        {
+            workOrder.data_ = job->kspace;
+        }
+        else
+        {
+            Gadgetron::fillSampledLinesUpTo11DArray(job->kspace, workOrder.data_, job->timeStamp);
+        }
+
+        workOrder.time_stamp_ = job->timeStamp;
+        workOrder.physio_time_stamp_ = job->physioTimeStamp;
+        workOrder.ref_ = job->ref;
+
+        // ---------------------------------------------------------
+        // set the worker
+        // ---------------------------------------------------------
+        worker_grappa_.verbose_ = verboseMode_;
+        worker_grappa_.performTiming_ = performTiming_;
+        if ( !debugFolder_fullPath_.empty() ) worker_grappa_.debugFolder_ = debugFolder_fullPath_;
+
+        worker_noacceleration_.verbose_ = verboseMode_;
+        worker_noacceleration_.performTiming_ = performTiming_;
+        if ( !debugFolder_fullPath_.empty() ) worker_noacceleration_.debugFolder_ = debugFolder_fullPath_;
+
+        worker_spirit_.verbose_ = verboseMode_;
+        worker_spirit_.performTiming_ = performTiming_;
+        if ( !debugFolder_fullPath_.empty() ) worker_spirit_.debugFolder_ = debugFolder_fullPath_;
+
+        worker_spirit_L1_ncg_.verbose_ = verboseMode_;
+        worker_spirit_L1_ncg_.performTiming_ = performTiming_;
+        if ( !debugFolder_fullPath_.empty() ) worker_spirit_L1_ncg_.debugFolder_ = debugFolder_fullPath_;
+
+        if ( !debugFolder_fullPath_.empty() ) workflow_.debugFolder_ = debugFolder_fullPath_;
+
+        if ( verboseMode_ )
+        {
+            workOrder.print(std::cout);
+        }
+
+        // perform the recon
+        if ( performTiming_ ) { gt_timer1_.start("Recon 2DT workorder on master node ... "); }
+
+        GADGET_CHECK_RETURN_FALSE(this->generateKSpaceFilter(workOrder));
+
+        workOrder.duplicate(workOrder_recon_);
+        this->setWorkOrder2DTParameters(&workOrder_recon_);
+
+        workflow_.workOrder_ = &workOrder_recon_;
+        if ( verboseMode_ )
+        {
+            workflow_.workOrder_->print(std::cout);
+        }
+
+        workflow_.setDataArray(workOrder.data_, workOrder.time_stamp_, workOrder.physio_time_stamp_);
+
+        if ( workOrder.ref_.get_number_of_elements() > 0 )
+        {
+            workflow_.setRefArray(workOrder.ref_);
+        }
+        else if ( para.workOrderPara_.CalibMode_==Gadgetron::ISMRMRD_interleaved )
+        {
+            workOrder.ref_ = workOrder.data_;
+            workflow_.setRefArray(workOrder.ref_);
+        }
+
+        // set the work flow for worker and workOrder
+        if ( workOrder.acceFactorE1_ > 1 )
+        {
+            if ( para.workOrderPara_.recon_algorithm_ == Gadgetron::ISMRMRD_SPIRIT )
+            {
+                workflow_.worker_ = &worker_spirit_;
+            }
+            else if ( para.workOrderPara_.recon_algorithm_ == Gadgetron::ISMRMRD_L1SPIRIT )
+            {
+                workflow_.worker_ = &worker_spirit_L1_ncg_;
+            }
+            else
+            {
+                workflow_.worker_ = &worker_grappa_;
+            }
+        }
+        else
+        {
+            workflow_.worker_ = &worker_noacceleration_;
+        }
+
+        bool succeed = true;
+        succeed = workflow_.preProcessing();
+        if ( succeed )
+        {
+            succeed = workflow_.recon();
+            if ( succeed )
+            {
+                succeed = workflow_.postProcessing();
+            }
+        }
+
+        if ( performTiming_ ) { gt_timer1_.stop(); }
+
+        if ( !debugFolder_fullPath_.empty() )
+        {
+            std::ostringstream ostr;
+            ostr << "Recon2DT";
+
+            hoNDArray< std::complex<float> > res = workflow_.res_;
+            res.squeeze();
+            if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_fullPath_+ostr.str()); }
+
+            if ( workflow_.res_second_.get_number_of_elements() > 0 )
+            {
+                hoNDArray< std::complex<float> > res = workflow_.res_second_;
+                res.squeeze();
+
+                std::ostringstream ostr;
+                ostr << "Recon2DT_Second_" << processed_called_times_;
+
+                if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_fullPath_+ostr.str()); }
+
+                if ( workflow_.res_time_stamp_second_.get_number_of_elements() > 0 )
+                {
+                    std::ostringstream ostr;
+                    ostr << "Recon2DT_Second_TimeStamp_" << processed_called_times_;
+
+                    hoNDArray<float> res = workflow_.res_time_stamp_second_;
+                    res.squeeze();
+                    if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArray(res, debugFolder_fullPath_+ostr.str()); }
+                }
+
+                if ( workflow_.res_physio_time_stamp_second_.get_number_of_elements() > 0 )
+                {
+                    std::ostringstream ostr;
+                    ostr << "Recon2DT_Second_PhysioTimeStamp_" << processed_called_times_;
+
+                    hoNDArray<float> res = workflow_.res_physio_time_stamp_second_;
+                    res.squeeze();
+                    if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArray(res, debugFolder_fullPath_+ostr.str()); }
+                }
+            }
+        }
+
+        if ( succeed )
+        {
+            jobReceived.complexIm = workflow_.res_;
+            jobReceived.complexImSecond = workflow_.res_second_;
+            jobReceived.resTimeStampSecond = workflow_.res_time_stamp_second_;
+            jobReceived.resPhysioTimeStampSecond = workflow_.res_physio_time_stamp_second_;
+        }
+        else
+        {
+            jobReceived.complexIm.clear();
+            jobReceived.complexImSecond.clear();
+            jobReceived.resTimeStampSecond.clear();
+            jobReceived.resPhysioTimeStampSecond.clear();
+            jobReceived.res.clear();
+        }
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusRecon2DTGadgetCloud::processJob(...) ends ... ");
+
+        // reset the status
+        workflow_.data_ = NULL;
+        workflow_.time_stamp_ = NULL;
+        workflow_.physio_time_stamp_ = NULL;
+        workflow_.ref_ = NULL;
+        workflow_.noise_ = NULL;
+        workflow_.workOrder_ = NULL;
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in GtPlusRecon2DTGadgetCloud::processJob(CloudPackageType& jobSent, CloudPackageType& jobReceived) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+int GtPlusRecon2DTGadgetCloud::close(unsigned long flags)
+{
+    GDEBUG_CONDITION_STREAM(true, "GtPlusRecon2DTGadgetCloud - close(flags) : " << flags);
+
+    if ( BaseClass::close(flags) != GADGET_OK ) return GADGET_FAIL;
+
+    if ( flags!=0 )
+    {
+        GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusRecon2DTGadgetCloud number of total jobs : " << num_of_jobs_ << " ... ");
+
+        if ( CloudComputing_ )
+        {
+            controller_.closeCloudNode();
+
+            // register a job handler
+            GtPlusRecon2DTGadgetCloudSender gadgetJobHandler;
+            gadgetJobHandler.gadget_ = this;
+            controller_.job_handler_ = &gadgetJobHandler;
+
+            controller_.waitForJobToComplete();
+
+            // if some jobs are not completed successfully, reprocess them; otherwise, send out images
+            std::vector<DimensionRecordType> dataDimStartingIndexes;
+            unsigned int N = (unsigned int)image_headers_.size();
+            unsigned int ii;
+            for ( ii=0; ii<N; ii++ )
+            {
+                bool jobIsOk = true;
+
+                bool recomputeJob = (packages_received_[ii].complexIm.get_number_of_elements() == 0);
+
+                // special check if the second set of recon results is needed
+                if ( recon_res_second_required_ )
+                {
+                    GDEBUG_STREAM("Check received recon results (second set) ... ");
+
+                    if (packages_received_[ii].complexImSecond.get_number_of_elements() == 0)
+                    {
+                        recomputeJob = true;
+                    }
+                    else
+                    {
+                        // check the images are not empty
+                        real_value_type v(0);
+                        Gadgetron::norm2(packages_received_[ii].complexImSecond, v);
+
+                        if ( std::abs(v) < FLT_EPSILON )
+                        {
+                            recomputeJob = true;
+                            GWARN_STREAM("Received recon results (second set) contain no content ... ");
+                        }
+                    }
+                }
+
+                if ( recomputeJob )
+                {
+                    // if the cloud goes wrong, do not try again
+                    CloudComputing_ = false;
+                    jobIsOk = this->processJob(packages_sent_[ii], packages_received_[ii]);
+                }
+
+                if ( jobIsOk )
+                {
+                    if ( !packages_passed_to_next_gadget_[ii].second )
+                    {
+                        GADGET_CHECK_RETURN(this->scalingImages(packages_received_[ii].complexIm), GADGET_FAIL);
+
+                        if ( this->send_out_recon_ )
+                        {
+                            GADGET_CHECK_RETURN(this->sendOutRecon(&image_headers_[ii], packages_received_[ii].complexIm, image_series_, dataDimStartingIndexes, "Image", GADGETRON_IMAGE_REGULAR), GADGET_FAIL);
+                        }
+
+                        if ( this->send_out_recon_second_ )
+                        {
+                            if ( packages_received_[ii].complexImSecond.get_number_of_elements() > 0 )
+                            {
+                                Gadgetron::scal((float)scalingFactor_, packages_received_[ii].complexImSecond);
+
+                                if ( this->para_.workOrderPara_.retro_gated_images_>0 )
+                                {
+                                    GADGET_CHECK_RETURN(this->sendOutRecon(&image_headers_[ii], 
+                                                                            packages_received_[ii].complexImSecond, 
+                                                                            packages_received_[ii].resTimeStampSecond, 
+                                                                            packages_received_[ii].resPhysioTimeStampSecond, 
+                                                                            image_series_+1, dataDimStartingIndexes, 
+                                                                            "ImageRetro", GADGETRON_IMAGE_RETRO), GADGET_FAIL);
+                                }
+                                else
+                                {
+                                    GADGET_CHECK_RETURN(this->sendOutRecon(&image_headers_[ii], 
+                                                                            packages_received_[ii].complexImSecond, 
+                                                                            packages_received_[ii].resTimeStampSecond, 
+                                                                            packages_received_[ii].resPhysioTimeStampSecond, 
+                                                                            image_series_+1, dataDimStartingIndexes, 
+                                                                            "Image", GADGETRON_IMAGE_REGULAR), GADGET_FAIL);
+                                }
+                            }
+                        }
+                    }
+                }
+
+                if ( !debugFolder2_fullPath_.empty() )
+                {
+                    std::ostringstream ostr;
+                    ostr << "GadgetCloud_Recon2DT_" << ii;
+
+                    hoNDArray< std::complex<float> > res = packages_received_[ii].complexIm;
+                    res.squeeze();
+                    if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder2_fullPath_+ostr.str()); }
+
+                    if (packages_received_[ii].complexImSecond.get_number_of_elements() > 0 )
+                    {
+                        hoNDArray< std::complex<float> > res = packages_received_[ii].complexImSecond;
+                        res.squeeze();
+
+                        std::ostringstream ostr;
+                        ostr << "GadgetCloud_Recon2DT_Second_" << ii;
+
+                        if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder2_fullPath_+ostr.str()); }
+
+                        if ( packages_received_[ii].resTimeStampSecond.get_number_of_elements() > 0 )
+                        {
+                            std::ostringstream ostr;
+                            ostr << "GadgetCloud_Recon2DT_Second_TimeStamp_" << ii;
+
+                            hoNDArray<float> res = packages_received_[ii].resTimeStampSecond;
+                            res.squeeze();
+                            if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArray(res, debugFolder2_fullPath_+ostr.str()); }
+                        }
+
+                        if ( packages_received_[ii].resPhysioTimeStampSecond.get_number_of_elements() > 0 )
+                        {
+                            std::ostringstream ostr;
+                            ostr << "GadgetCloud_Recon2DT_Second_PhysioTimeStamp_" << ii;
+
+                            hoNDArray<float> res = packages_received_[ii].resPhysioTimeStampSecond;
+                            res.squeeze();
+                            if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArray(res, debugFolder2_fullPath_+ostr.str()); }
+                        }
+                    }
+                }
+            }
+        }
+
+        gt_timer_2DT_cloud_.stop();
+    }
+
+    return GADGET_OK;
+}
+
+GADGET_FACTORY_DECLARE(GtPlusRecon2DTGadgetCloud)
+
+// -------------------------------------------------------------------------------------------
+// GtPlusRecon2DTGadgetCloudSender
+// -------------------------------------------------------------------------------------------
+
+GtPlusRecon2DTGadgetCloudSender::GtPlusRecon2DTGadgetCloudSender()
+{
+}
+
+GtPlusRecon2DTGadgetCloudSender::~GtPlusRecon2DTGadgetCloudSender()
+{
+}
+
+bool GtPlusRecon2DTGadgetCloudSender::processJob(int jobID, GtPlusRecon2DTCloudPackage< std::complex<float> >& ajob)
+{
+    try
+    {
+        bool jobIsOk = true;
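+        // an empty result is not an error here; close() will detect the empty package and recompute the job locally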
+        if ( (gadget_->packages_received_[jobID].complexIm.get_number_of_elements() == 0) 
+            && (gadget_->packages_received_[jobID].res.get_number_of_elements() == 0) )
+        {
+            jobIsOk = false;
+            return true;
+        }
+
+        if ( jobIsOk )
+        {
+            std::vector<DimensionRecordType> dataDimStartingIndexes;
+
+            if ( !gadget_->packages_passed_to_next_gadget_[jobID].second )
+            {
+                gadget_->packages_passed_to_next_gadget_[jobID].second = true;
+
+                GADGET_CHECK_RETURN(gadget_->scalingImages(gadget_->packages_received_[jobID].complexIm), false);
+
+                if ( gadget_->send_out_recon_ )
+                {
+                    GADGET_CHECK_RETURN(gadget_->sendOutRecon(&gadget_->image_headers_[jobID], 
+                        gadget_->packages_received_[jobID].complexIm, gadget_->image_series_, dataDimStartingIndexes, "Image", GADGETRON_IMAGE_REGULAR), false);
+                }
+
+                if ( gadget_->send_out_recon_second_ )
+                {
+                    if ( gadget_->packages_received_[jobID].complexImSecond.get_number_of_elements() > 0 )
+                    {
+                        GDEBUG_STREAM("Check received recon results (second set) in cloud sender ... ");
+
+                        // check the images are not empty
+                        float v(0);
+                        Gadgetron::norm2(gadget_->packages_received_[jobID].complexImSecond, v);
+
+                        bool reconResSecondValid = true;
+                        if ( std::abs(v) < FLT_EPSILON )
+                        {
+                            reconResSecondValid = false;
+                            GWARN_STREAM("Received recon results (second set) contain no content ... ");
+                        }
+
+                        if ( reconResSecondValid )
+                        {
+                            Gadgetron::scal((float)gadget_->scalingFactor_, gadget_->packages_received_[jobID].complexImSecond);
+                            if ( gadget_->para_.workOrderPara_.retro_gated_images_ > 0 )
+                            {
+                                GADGET_CHECK_RETURN(gadget_->sendOutRecon(&gadget_->image_headers_[jobID], 
+                                                                        gadget_->packages_received_[jobID].complexImSecond, 
+                                                                        gadget_->packages_received_[jobID].resTimeStampSecond,
+                                                                        gadget_->packages_received_[jobID].resPhysioTimeStampSecond,
+                                                                        gadget_->image_series_+1, dataDimStartingIndexes, 
+                                                                        "ImageRetro", GADGETRON_IMAGE_RETRO), false);
+                            }
+                            else
+                            {
+                                GADGET_CHECK_RETURN(gadget_->sendOutRecon(&gadget_->image_headers_[jobID], 
+                                                                        gadget_->packages_received_[jobID].complexImSecond, 
+                                                                        gadget_->packages_received_[jobID].resTimeStampSecond,
+                                                                        gadget_->packages_received_[jobID].resPhysioTimeStampSecond,
+                                                                        gadget_->image_series_+1, dataDimStartingIndexes, 
+                                                                        "Image", GADGETRON_IMAGE_REGULAR), false);
+                            }
+                        }
+                    }
+                }
+
+                if ( !gadget_->debugFolder2_fullPath_.empty() )
+                {
+                    std::ostringstream ostr;
+                    ostr << "Recon2DT_" << jobID;
+
+                    hoNDArray< std::complex<float> > res = gadget_->packages_received_[jobID].complexIm;
+                    res.squeeze();
+                    if ( !gadget_->debugFolder2_fullPath_.empty() ) { gadget_->gt_exporter_.exportArrayComplex(res, gadget_->debugFolder2_fullPath_+ostr.str()); }
+
+                    if ( gadget_->packages_received_[jobID].complexImSecond.get_number_of_elements() > 0 )
+                    {
+                        std::ostringstream ostr;
+                        ostr << "Recon2DT_Second_" << jobID;
+
+                        hoNDArray< std::complex<float> > res = gadget_->packages_received_[jobID].complexImSecond;
+                        res.squeeze();
+                        if ( !gadget_->debugFolder2_fullPath_.empty() ) { gadget_->gt_exporter_.exportArrayComplex(res, gadget_->debugFolder2_fullPath_+ostr.str()); }
+
+                        if ( gadget_->packages_received_[jobID].resTimeStampSecond.get_number_of_elements() > 0 )
+                        {
+                            std::ostringstream ostr;
+                            ostr << "Recon2DT_Second_TimeStamp_" << jobID;
+
+                            hoNDArray<float> res = gadget_->packages_received_[jobID].resTimeStampSecond;
+                            res.squeeze();
+                            if ( !gadget_->debugFolder2_fullPath_.empty() ) { gadget_->gt_exporter_.exportArray(res, gadget_->debugFolder2_fullPath_+ostr.str()); }
+                        }
+
+                        if ( gadget_->packages_received_[jobID].resPhysioTimeStampSecond.get_number_of_elements() > 0 )
+                        {
+                            std::ostringstream ostr;
+                            ostr << "Recon2DT_Second_PhysioTimeStamp_" << jobID;
+
+                            hoNDArray<float> res = gadget_->packages_received_[jobID].resPhysioTimeStampSecond;
+                            res.squeeze();
+                            if ( !gadget_->debugFolder2_fullPath_.empty() ) { gadget_->gt_exporter_.exportArray(res, gadget_->debugFolder2_fullPath_+ostr.str()); }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GDEBUG("Errors happened in GtPlusRecon2DTGadgetCloudSender::processJob(...) ...\n");
+        return false;
+    }
+
+    return true;
+}
+
+}
diff --git a/gadgets/gtPlus/GtPlusRecon2DTGadgetCloud.h b/gadgets/gtPlus/GtPlusRecon2DTGadgetCloud.h
new file mode 100644
index 0000000..3ede0b7
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusRecon2DTGadgetCloud.h
@@ -0,0 +1,91 @@
+/** \file   GtPlusRecon2DTGadgetCloud.h
+    \brief  This is the gateway gadget for the dual layer GtPlus cloud.
+            Every incoming k-space data package is sent to a first-layer gadget.
+            If a data package is not processed successfully and its results are not returned to this gadget,
+            the reconstruction is performed locally.
+
+            Ref to: 
+
+            Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen. 
+            Distributed MRI Reconstruction using Gadgetron based Cloud Computing. 
+            Magnetic Resonance in Medicine, doi: 10.1002/mrm.25213.
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "GtPlusRecon2DTGadget.h"
+#include "GadgetCloudController.h"
+#include "GadgetCloudJobMessageReadWrite.h"
+#include "GtPlusRecon2DTCloudPackage.h"
+
+namespace Gadgetron
+{
+
+class EXPORTGTPLUSGADGET GtPlusRecon2DTGadgetCloud : public GtPlusRecon2DTGadget
+{
+public:
+    GADGET_DECLARE(GtPlusRecon2DTGadgetCloud);
+
+    typedef GtPlusRecon2DTGadget BaseClass;
+
+    typedef BaseClass::ValueType ValueType;
+    typedef BaseClass::WorkOrderType WorkOrderType;
+    typedef BaseClass::WorkOrder2DTType WorkOrder2DTType;
+    typedef BaseClass::DimensionRecordType DimensionRecordType;
+
+    typedef GtPlusRecon2DTCloudPackage<ValueType> CloudPackageType;
+
+    typedef Gadgetron::GadgetCloudController< CloudPackageType > GTCloudControllerType;
+
+    GtPlusRecon2DTGadgetCloud();
+    ~GtPlusRecon2DTGadgetCloud();
+
+    virtual int close(unsigned long flags);
+
+    std::vector<CloudPackageType> packages_sent_;
+    std::vector<CloudPackageType> packages_received_;
+
+    // indicate whether the results of all sent packages have been passed to next gadget or not
+    std::vector< std::pair<unsigned int, bool> >  packages_passed_to_next_gadget_;
+
+    // store the image headers for every incoming package
+    std::vector<GtPlusGadgetImageArray> image_headers_;
+
+protected:
+
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(Gadgetron::GadgetContainerMessage< GtPlusGadgetImageArray >* m1, Gadgetron::GadgetContainerMessage< WorkOrderType > * m2);
+
+    virtual bool processJob(CloudPackageType& jobSent, CloudPackageType& jobReceived);
+
+    GTCloudControllerType controller_;
+
+    unsigned int curr_node_;
+
+    unsigned int num_of_jobs_;
+
+    std::vector<GadgetMessageReader*> readers_;
+    std::vector<GadgetMessageWriter*> writers_;
+
+    // clock for timing
+    Gadgetron::GadgetronTimer gt_timer_2DT_cloud_;
+};
+
+class GtPlusRecon2DTGadgetCloudSender : public GadgetCloudJobProcessHandler< GtPlusRecon2DTCloudPackage< std::complex<float> > >
+{
+public:
+
+    typedef std::pair<Gadgetron::ISMRMRDDIM, size_t> DimensionRecordType;
+
+    GtPlusRecon2DTGadgetCloudSender();
+    virtual ~GtPlusRecon2DTGadgetCloudSender();
+
+    virtual bool processJob(int jobID, GtPlusRecon2DTCloudPackage< std::complex<float> >& ajob);
+
+    // pointer to the gadget
+    GtPlusRecon2DTGadgetCloud* gadget_;
+};
+
+}
diff --git a/gadgets/gtPlus/GtPlusRecon3DTGadget.cpp b/gadgets/gtPlus/GtPlusRecon3DTGadget.cpp
new file mode 100644
index 0000000..04f802e
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusRecon3DTGadget.cpp
@@ -0,0 +1,453 @@
+
+#include "GtPlusRecon3DTGadget.h"
+
+#ifdef USE_OMP
+    #include <omp.h>
+#endif // USE_OMP
+
+using namespace Gadgetron::gtPlus;
+
+namespace Gadgetron
+{
+
+GtPlusRecon3DTGadget::GtPlusRecon3DTGadget() : BaseClass()
+{
+
+}
+
+GtPlusRecon3DTGadget::~GtPlusRecon3DTGadget()
+{
+
+}
+
+bool GtPlusRecon3DTGadget::readParameters()
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(BaseClass::readParameters());
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "------> GtPlusRecon3DTGadget parameters <------");
+
+        boost::shared_ptr<std::string> str = this->get_string_value("dim_5th");
+        para_.dim_5th_ = gtPlus_util_.getISMRMRDDimFromName(*str);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "dim_5th_ is " << *str);
+
+        str = this->get_string_value("workOrder_ShareDim");
+        para_.workOrder_ShareDim_ = gtPlus_util_.getISMRMRDDimFromName(*str);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "workOrder_ShareDim_ is " << *str);
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+        para_.no_acceleration_averageall_ref_ = this->get_bool_value("no_acceleration_averageall_ref");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "no_acceleration_averageall_ref_ is " << para_.no_acceleration_averageall_ref_);
+
+        para_.no_acceleration_same_combinationcoeff_allN_ = this->get_bool_value("no_acceleration_same_combinationcoeff_allN");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "no_acceleration_same_combinationcoeff_allN_ is " << para_.no_acceleration_same_combinationcoeff_allN_);
+
+        para_.no_acceleration_whichN_combinationcoeff_ = this->get_int_value("no_acceleration_whichN_combinationcoeff");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "no_acceleration_whichN_combinationcoeff_ is " << para_.no_acceleration_whichN_combinationcoeff_);
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+        para_.interleaved_same_combinationcoeff_allN_ = this->get_bool_value("interleaved_same_combinationcoeff_allN");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "interleaved_same_combinationcoeff_allN_ is " << para_.interleaved_same_combinationcoeff_allN_);
+
+        para_.interleaved_whichN_combinationcoeff_ = this->get_int_value("interleaved_whichN_combinationcoeff");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "interleaved_whichN_combinationcoeff_ is " << para_.interleaved_whichN_combinationcoeff_);
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+        para_.embedded_averageall_ref_ = this->get_bool_value("embedded_averageall_ref");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "embedded_averageall_ref_ is " << para_.embedded_averageall_ref_);
+
+        para_.embedded_fullres_coilmap_ = this->get_bool_value("embedded_fullres_coilmap");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "embedded_fullres_coilmap_ is " << para_.embedded_fullres_coilmap_);
+
+        para_.embedded_same_combinationcoeff_allN_ = this->get_bool_value("embedded_same_combinationcoeff_allN");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "embedded_same_combinationcoeff_allN_ is " << para_.embedded_same_combinationcoeff_allN_);
+
+        para_.embedded_whichN_combinationcoeff_ = this->get_int_value("embedded_whichN_combinationcoeff");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "embedded_whichN_combinationcoeff_ is " << para_.embedded_whichN_combinationcoeff_);
+
+        para_.embedded_ref_fillback_ = this->get_bool_value("embedded_ref_fillback");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "embedded_ref_fillback_ is " << para_.embedded_ref_fillback_);
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+        para_.separate_averageall_ref_ = this->get_bool_value("separate_averageall_ref");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "separate_averageall_ref_ is " << para_.separate_averageall_ref_);
+
+        para_.separate_fullres_coilmap_ = this->get_bool_value("separate_fullres_coilmap");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "separate_fullres_coilmap_ is " << para_.separate_fullres_coilmap_);
+
+        para_.separate_same_combinationcoeff_allN_ = this->get_bool_value("separate_same_combinationcoeff_allN");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "separate_same_combinationcoeff_allN_ is " << para_.separate_same_combinationcoeff_allN_);
+
+        para_.separate_whichN_combinationcoeff_ = this->get_int_value("separate_whichN_combinationcoeff");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "separate_whichN_combinationcoeff_ is " << para_.separate_whichN_combinationcoeff_);
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+        para_.same_coil_compression_coeff_allN_ = this->get_bool_value("same_coil_compression_coeff_allN");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "same_coil_compression_coeff_allN_ is " << para_.same_coil_compression_coeff_allN_);
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+        // get the parameters from base class
+        // BaseClass::readParameters();
+
+        para_.recon_kspace_needed_ = recon_kspace_needed_;
+        para_.workOrderPara_ = workOrderPara_;
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GtPlusRecon3DTGadget::readParameters() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusRecon3DTGadget::setWorkOrder3DTParameters(WorkOrder3DTType* workOrder)
+{
+    workOrder->recon_kspace_needed_ = recon_kspace_needed_;
+
+    if ( para_.workOrderPara_.coil_compression_thres_>0 || para_.workOrderPara_.coil_compression_num_modesKept_>0 )
+    {
+        workOrder->coil_compression_ = true;
+    }
+    else
+    {
+        workOrder->coil_compression_ = false;
+    }
+
+    workOrder->same_coil_compression_coeff_allN_ = para_.same_coil_compression_coeff_allN_;
+
+    workOrder->embedded_averageall_ref_ = para_.embedded_averageall_ref_;
+    workOrder->embedded_fullres_coilmap_ = para_.embedded_fullres_coilmap_;
+    workOrder->embedded_same_combinationcoeff_allN_ = para_.embedded_same_combinationcoeff_allN_;
+    workOrder->embedded_whichN_combinationcoeff_ = para_.embedded_whichN_combinationcoeff_;
+    workOrder->embedded_ref_fillback_ = para_.embedded_ref_fillback_;
+
+    workOrder->separate_averageall_ref_ = para_.separate_averageall_ref_;
+    workOrder->separate_fullres_coilmap_ = para_.separate_fullres_coilmap_;
+    workOrder->separate_same_combinationcoeff_allN_ = para_.separate_same_combinationcoeff_allN_;
+    workOrder->separate_whichN_combinationcoeff_ = para_.separate_whichN_combinationcoeff_;
+
+    //workOrder->interleaved_same_combinationcoeff_allN_ = interleaved_same_combinationcoeff_allN_;
+    //workOrder->interleaved_whichN_combinationcoeff_ = interleaved_whichN_combinationcoeff_;
+
+    workOrder->no_acceleration_averageall_ref_ = para_.no_acceleration_averageall_ref_;
+    workOrder->no_acceleration_same_combinationcoeff_allN_ = para_.no_acceleration_same_combinationcoeff_allN_;
+    workOrder->no_acceleration_whichN_combinationcoeff_ = para_.no_acceleration_whichN_combinationcoeff_;
+
+    return true;
+}
+
+int GtPlusRecon3DTGadget::process_config(ACE_Message_Block* mb)
+{
+    // [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+    //   0  1  2   3    4   5    6     7  8   9
+    GADGET_CHECK_RETURN(BaseClass::process_config(mb)==GADGET_OK, GADGET_FAIL);
+
+    if ( CloudComputing_ )
+    {
+        bool parseSuccess = this->parseGTCloudNodeFile(cloud_node_file_, gt_cloud_);
+        if ( parseSuccess )
+        {
+            CloudSize_ = (unsigned int)gt_cloud_.size();
+            if ( CloudSize_ == 0 ) CloudComputing_ = false;
+        }
+        else
+        {
+            CloudComputing_ = false;
+        }
+    }
+
+    return GADGET_OK;
+}
+
+int GtPlusRecon3DTGadget::process(Gadgetron::GadgetContainerMessage< GtPlusGadgetImageArray >* m1, Gadgetron::GadgetContainerMessage< WorkOrderType > * m2)
+{
+    GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusRecon3DTGadget::process(...) starts ... ");
+
+    processed_called_times_++;
+
+    GtPlusGadgetImageArray* images = m1->getObjectPtr();
+
+    WorkOrderType* workOrder = m2->getObjectPtr();
+
+    boost::shared_ptr< std::vector<size_t> > dims = workOrder->data_.get_dimensions();
+
+    GDEBUG_CONDITION_STREAM(verboseMode_, "[Ro E1 Cha Slice E2 Con Phase Rep Set Seg Ave] = [" 
+        << (*dims)[0] << " " << (*dims)[1] << " " << (*dims)[2] << " " << (*dims)[3] << " " << (*dims)[4] 
+        << " " << (*dims)[5] << " " << (*dims)[6] << " " << (*dims)[7] << " " << (*dims)[8] << " " << (*dims)[9] << " " << (*dims)[10] << "]");
+
+    dimensions_ = *dims;
+
+    // fill in more parameters
+    para_.reconSizeRO_ = matrix_size_recon_[0];
+    para_.reconSizeE1_ = reconE1_;
+    para_.reconSizeE2_ = reconE2_;
+    para_.encodingFOV_RO_ = field_of_view_encoding_[0];
+    para_.encodingFOV_E1_ = field_of_view_encoding_[1];
+    para_.encodingFOV_E2_ = field_of_view_encoding_[2];
+    para_.reconFOV_RO_ = field_of_view_recon_[0];
+    para_.reconFOV_E1_ = field_of_view_recon_[1];
+    para_.reconFOV_E2_ = field_of_view_recon_[2];
+
+    para_.workOrderPara_.CalibMode_ = workOrder->CalibMode_;
+    para_.workOrderPara_.InterleaveDim_ = workOrder->InterleaveDim_;
+
+    para_.workOrderPara_.acceFactorE1_ = workOrder->acceFactorE1_;
+    para_.workOrderPara_.acceFactorE2_ = workOrder->acceFactorE2_;
+
+    para_.workOrderPara_.kSpaceCenterRO_ = workOrder->kSpaceCenterRO_;
+    para_.workOrderPara_.kSpaceCenterEncode1_ = workOrder->kSpaceCenterEncode1_;
+    para_.workOrderPara_.kSpaceCenterEncode2_ = workOrder->kSpaceCenterEncode2_;
+
+    para_.workOrderPara_.kSpaceMaxRO_ = workOrder->kSpaceMaxRO_;
+    para_.workOrderPara_.kSpaceMaxEncode1_ = workOrder->kSpaceMaxEncode1_;
+    para_.workOrderPara_.kSpaceMaxEncode2_ = workOrder->kSpaceMaxEncode2_;
+
+    para_.workOrderPara_.start_RO_ = workOrder->start_RO_;
+    para_.workOrderPara_.end_RO_ = workOrder->end_RO_;
+
+    para_.workOrderPara_.start_E1_ = workOrder->start_E1_;
+    para_.workOrderPara_.end_E1_ = workOrder->end_E1_;
+
+    para_.workOrderPara_.start_E2_ = workOrder->start_E2_;
+    para_.workOrderPara_.end_E2_ = workOrder->end_E2_;
+
+    para_.workOrderPara_.workFlow_BufferKernel_ = workOrder->workFlow_BufferKernel_;
+    para_.workOrderPara_.workFlow_use_BufferedKernel_ = workOrder->workFlow_use_BufferedKernel_;
+    para_.workOrderPara_.num_channels_res_ = workOrder->num_channels_res_;
+
+    // ---------------------------------------------------------
+    // set the work flow
+    // ---------------------------------------------------------
+    workflow_.reconSizeRO_ = para_.reconSizeRO_;
+    workflow_.reconSizeE1_ = para_.reconSizeE1_;
+    workflow_.reconSizeE2_ = para_.reconSizeE2_;
+    workflow_.encodingFOV_RO_ = para_.encodingFOV_RO_;
+    workflow_.encodingFOV_E1_ = para_.encodingFOV_E1_;
+    workflow_.encodingFOV_E2_ = para_.encodingFOV_E2_;
+    workflow_.reconFOV_RO_ = para_.reconFOV_RO_;
+    workflow_.reconFOV_E1_ = para_.reconFOV_E1_;
+    workflow_.reconFOV_E2_ = para_.reconFOV_E2_;
+
+    workflow_.dataDimStartingIndexes_ = workOrder->dataDimStartingIndexes_;
+    workflow_.dim5th_ = para_.dim_5th_;
+    workflow_.WorkOrderShareDim_ = para_.workOrder_ShareDim_;
+    workflow_.performTiming_ = performTiming_;
+
+    // ---------------------------------------------------------
+    // set work order
+    // ---------------------------------------------------------
+    workOrder->copyFromPara(para_.workOrderPara_);
+
+    workOrder->CloudComputing_ = CloudComputing_;
+    workOrder->CloudSize_ = CloudSize_;
+    workOrder->gt_cloud_ = gt_cloud_;
+
+    // ---------------------------------------------------------
+    // set the worker
+    // ---------------------------------------------------------
+    worker_grappa_.verbose_ = verboseMode_;
+    worker_grappa_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_grappa_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_noacceleration_.verbose_ = verboseMode_;
+    worker_noacceleration_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_noacceleration_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_spirit_.verbose_ = verboseMode_;
+    worker_spirit_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_spirit_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_spirit_L1_ncg_.verbose_ = verboseMode_;
+    worker_spirit_L1_ncg_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_spirit_L1_ncg_.debugFolder_ = debugFolder_fullPath_;
+
+    if ( !debugFolder_fullPath_.empty() ) workflow_.debugFolder_ = debugFolder_fullPath_;
+
+    // if 'other' data is coming in
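+    // ('other' data is reconstructed with the no-acceleration worker, i.e. a plain FFT recon,
+    //  and sent out as GADGETRON_IMAGE_OTHER on image series image_series_+1)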
+    if ( workOrder->other_.get_number_of_elements() > 0 )
+    {
+        workOrder->duplicate(workOrder_recon_other_);
+        setWorkOrder3DTParameters(&workOrder_recon_other_);
+        workflow_.workOrder_ = &workOrder_recon_other_;
+
+        // perform a simple FFT recon
+        workOrder_recon_other_.CalibMode_ = ISMRMRD_noacceleration;
+        workOrder_recon_other_.acceFactorE1_ = 1;
+        workOrder_recon_other_.acceFactorE2_ = 1;
+
+        workOrder_recon_other_.start_RO_ = -1;
+        workOrder_recon_other_.end_RO_ = -1;
+        workOrder_recon_other_.start_E1_ = -1;
+        workOrder_recon_other_.end_E1_ = -1;
+        workOrder_recon_other_.start_E2_ = -1;
+        workOrder_recon_other_.end_E2_ = -1;
+
+        workflow_.worker_ = &worker_noacceleration_;
+        workflow_.setDataArray(workOrder->other_);
+        GADGET_CHECK_RETURN(workflow_.recon(), GADGET_FAIL);
+
+        GADGET_CHECK_RETURN(this->scalingImages(workflow_.res_), GADGET_FAIL);
+        GADGET_CHECK_RETURN(this->sendOutRecon(images, workflow_.res_, image_series_+1, workOrder->dataDimStartingIndexes_, "Other", GADGETRON_IMAGE_OTHER), GADGET_FAIL);
+
+        workflow_.res_.clear();
+        workflow_.data_ = NULL;
+        workflow_.ref_ = NULL;
+        workflow_.workOrder_ = NULL;
+
+        workOrder_recon_other_.reset();
+    }
+
+    // perform the recon
+    if ( performTiming_ ) { gt_timer1_.start("Recon 3DT workorder ... "); }
+
+    GADGET_CHECK_RETURN(this->generateKSpaceFilter(*workOrder), GADGET_FAIL);
+
+    workOrder->duplicate(workOrder_recon_);
+    setWorkOrder3DTParameters(&workOrder_recon_);
+
+    workflow_.workOrder_ = &workOrder_recon_;
+    if ( verboseMode_ )
+    {
+        workflow_.workOrder_->print(std::cout);
+    }
+
+    workflow_.setDataArray(workOrder->data_, workOrder->time_stamp_, workOrder->physio_time_stamp_);
+
+    if ( workOrder->ref_.get_number_of_elements() > 0 )
+    {
+        workflow_.setRefArray(workOrder->ref_);
+    }
+    else if ( CalibMode_==Gadgetron::ISMRMRD_interleaved )
+    {
+        workOrder->ref_ = workOrder->data_;
+        workflow_.setRefArray(workOrder->ref_);
+    }
+
+    // set the work flow for worker and workOrder
+    if ( workOrder->acceFactorE1_>1 || workOrder->acceFactorE2_>1 )
+    {
+        if ( para_.workOrderPara_.recon_algorithm_ == Gadgetron::ISMRMRD_SPIRIT )
+        {
+            workflow_.worker_ = &worker_spirit_;
+        }
+        else if ( para_.workOrderPara_.recon_algorithm_ == Gadgetron::ISMRMRD_L1SPIRIT )
+        {
+            workflow_.worker_ = &worker_spirit_L1_ncg_;
+        }
+        else
+        {
+            workflow_.worker_ = &worker_grappa_;
+        }
+    }
+    else
+    {
+        workflow_.worker_ = &worker_noacceleration_;
+    }
+
+    if ( workflow_.worker_ != &worker_grappa_ )
+    {
+        GWARN_STREAM("The gfactor computation is currently only avaialbe for grappa reconstruction ... ");
+        workflow_.workOrder_->gfactor_needed_ = false;
+    }
+
+    GADGET_CHECK_RETURN(workflow_.preProcessing(), GADGET_FAIL);
+    GADGET_CHECK_RETURN(workflow_.recon(), GADGET_FAIL);
+    GADGET_CHECK_RETURN(workflow_.postProcessing(), GADGET_FAIL);
+
+    if ( performTiming_ ) { gt_timer1_.stop(); }
+
+    if ( !debugFolder2_fullPath_.empty() )
+    {
+        std::ostringstream ostr;
+        ostr << "Recon3DT";
+
+        hoNDArray< std::complex<float> > res = workflow_.res_;
+        res.squeeze();
+        gt_exporter_.exportArrayComplex(res, debugFolder2_fullPath_+ostr.str());
+
+        if ( workflow_.workOrder_->gfactor_needed_ )
+        {
+            std::ostringstream ostr;
+            ostr << "Recon3DT_GFactor";
+
+            hoNDArray< std::complex<float> > gfactor = workflow_.gfactor_;
+            gfactor.squeeze();
+            gt_exporter_.exportArrayComplex(gfactor, debugFolder2_fullPath_+ostr.str());
+        }
+    }
+
+    // compute SNR image and stdmap
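+    // (only attempted when a gfactor map is available or the data is unaccelerated, and only
+    //  when the corresponding scaling factors are positive; for unaccelerated data the SNR map
+    //  is suppressed and only the std map can be sent out)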
+    hoNDArray<ValueType> snrImage, stdMap;
+    bool snrImageComputed = false;
+    bool stdMapComputed = false;
+
+    if ( workflow_.workOrder_->gfactor_needed_ || workOrder->acceFactorE1_*workOrder->acceFactorE2_==1 )
+    {
+        if ( scalingFactor_snr_image_>0 || scalingFactor_std_map_>0)
+        {
+            bool withAcceleration = (workOrder->acceFactorE1_*workOrder->acceFactorE2_>1);
+
+            if ( !this->computeSNRImage(workflow_.res_, workflow_.gfactor_, 
+                    start_frame_for_std_map_, withAcceleration, snrImage, stdMap) )
+            {
+                snrImage.clear();
+                stdMap.clear();
+            }
+            else
+            {
+                snrImageComputed = true;
+                stdMapComputed = true;
+            }
+
+            if ( workOrder->acceFactorE1_*workOrder->acceFactorE2_==1 ) snrImageComputed = false;
+        }
+    }
+
+    // send out the results
+    GADGET_CHECK_RETURN(this->scalingImages(workflow_.res_), GADGET_FAIL);
+    GADGET_CHECK_RETURN(this->sendOutRecon(images, workflow_.res_, image_series_, workOrder->dataDimStartingIndexes_, "Image", GADGETRON_IMAGE_REGULAR), GADGET_FAIL);
+
+    if ( workflow_.workOrder_->gfactor_needed_ )
+    {
+        Gadgetron::scal((float)scalingFactor_gfactor_, workflow_.gfactor_);
+        GADGET_CHECK_RETURN(this->sendOutRecon(images, workflow_.gfactor_, image_series_+1, workOrder->dataDimStartingIndexes_, "gfactor", GADGETRON_IMAGE_GFACTOR), GADGET_FAIL);
+    }
+
+    if ( scalingFactor_snr_image_>0 && snrImage.get_number_of_elements()>0 && snrImageComputed )
+    {
+        Gadgetron::scal((float)scalingFactor_snr_image_, snrImage);
+        GADGET_CHECK_RETURN(this->sendOutRecon(images, snrImage, image_series_+2, workOrder->dataDimStartingIndexes_, "snr_map", GADGETRON_IMAGE_SNR_MAP), GADGET_FAIL);
+    }
+
+    if ( scalingFactor_std_map_>0 && stdMap.get_number_of_elements()>0 && stdMapComputed )
+    {
+        Gadgetron::scal((float)scalingFactor_std_map_, stdMap);
+        GADGET_CHECK_RETURN(this->sendOutRecon(images, stdMap, image_series_+3, workOrder->dataDimStartingIndexes_, "std_map", GADGETRON_IMAGE_STD_MAP), GADGET_FAIL);
+    }
+
+    GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusRecon3DTGadget::process(...) ends ... ");
+
+    // reset the status
+    workflow_.data_ = NULL;
+    workflow_.ref_ = NULL;
+    workflow_.noise_ = NULL;
+    workflow_.workOrder_ = NULL;
+    // Gadgetron::clear(&workflow_.res_);
+
+    m1->release();
+    return GADGET_OK;
+}
+
+GADGET_FACTORY_DECLARE(GtPlusRecon3DTGadget)
+
+}
diff --git a/gadgets/gtPlus/GtPlusRecon3DTGadget.h b/gadgets/gtPlus/GtPlusRecon3DTGadget.h
new file mode 100644
index 0000000..fe6c845
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusRecon3DTGadget.h
@@ -0,0 +1,104 @@
+/** \file   GtPlusRecon3DTGadget.h
+    \brief  This gadget encapsulates the reconstruction for 3DT cases.
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "GtPlusReconGadget.h"
+#include "gtPlusISMRMRDReconWorkFlowCartesian3DT.h"
+#include "gtPlusISMRMRDReconWorker3DTGRAPPA.h"
+#include "gtPlusISMRMRDReconWorker3DTNoAcceleration.h"
+#include "gtPlusISMRMRDReconWorker3DTSPIRIT.h"
+#include "gtPlusISMRMRDReconWorker3DTL1SPIRITNCG.h"
+
+namespace Gadgetron
+{
+
+struct EXPORTGTPLUSGADGET GtPlusRecon3DTPara
+{
+    size_t reconSizeRO_;
+    size_t reconSizeE1_;
+    size_t reconSizeE2_;
+
+    float encodingFOV_RO_;
+    float encodingFOV_E1_;
+    float encodingFOV_E2_;
+
+    float reconFOV_RO_;
+    float reconFOV_E1_;
+    float reconFOV_E2_;
+
+    Gadgetron::ISMRMRDDIM dim_5th_;
+    Gadgetron::ISMRMRDDIM workOrder_ShareDim_;
+
+    bool no_acceleration_averageall_ref_;
+    bool no_acceleration_same_combinationcoeff_allN_;
+    int no_acceleration_whichN_combinationcoeff_;
+
+    bool interleaved_same_combinationcoeff_allN_;
+    int interleaved_whichN_combinationcoeff_;
+
+    bool embedded_averageall_ref_;
+    bool embedded_fullres_coilmap_;
+    bool embedded_same_combinationcoeff_allN_;
+    int embedded_whichN_combinationcoeff_;
+    bool embedded_ref_fillback_;
+
+    bool separate_averageall_ref_;
+    bool separate_fullres_coilmap_;
+    bool separate_same_combinationcoeff_allN_;
+    int separate_whichN_combinationcoeff_;
+
+    bool same_coil_compression_coeff_allN_;
+
+    bool recon_kspace_needed_;
+
+    Gadgetron::gtPlus::gtPlusReconWorkOrderPara workOrderPara_;
+};
+
+class EXPORTGTPLUSGADGET GtPlusRecon3DTGadget : public GtPlusReconGadget
+{
+public:
+    GADGET_DECLARE(GtPlusRecon3DTGadget);
+
+    typedef GtPlusReconGadget BaseClass;
+
+    typedef BaseClass::ValueType ValueType;
+
+    typedef BaseClass::WorkOrderType WorkOrderType;
+    typedef Gadgetron::gtPlus::gtPlusReconWorkOrder3DT<ValueType> WorkOrder3DTType;
+
+    typedef BaseClass::DimensionRecordType DimensionRecordType;
+
+    GtPlusRecon3DTGadget();
+    ~GtPlusRecon3DTGadget();
+
+    GtPlusRecon3DTPara para_;
+
+protected:
+
+    virtual bool readParameters();
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(Gadgetron::GadgetContainerMessage< GtPlusGadgetImageArray >* m1, Gadgetron::GadgetContainerMessage< WorkOrderType > * m2);
+
+    // set 3DT specific work order parameters
+    bool setWorkOrder3DTParameters(WorkOrder3DTType* workOrder);
+
+    // work flow
+    Gadgetron::gtPlus::gtPlusISMRMRDReconWorkFlowCartesian3DT<ValueType> workflow_;
+
+    // worker
+    Gadgetron::gtPlus::gtPlusReconWorker3DTGRAPPA<ValueType> worker_grappa_;
+    Gadgetron::gtPlus::gtPlusReconWorker3DTNoAcceleration<ValueType> worker_noacceleration_;
+    Gadgetron::gtPlus::gtPlusReconWorker3DTSPIRIT<ValueType> worker_spirit_;
+    Gadgetron::gtPlus::gtPlusReconWorker3DTL1SPIRITNCG<ValueType> worker_spirit_L1_ncg_;
+
+    // workOrder for recon
+    WorkOrder3DTType workOrder_recon_;
+
+    // workOrder for recon 'other' data
+    WorkOrder3DTType workOrder_recon_other_;
+};
+
+}
diff --git a/gadgets/gtPlus/GtPlusReconGadget.cpp b/gadgets/gtPlus/GtPlusReconGadget.cpp
new file mode 100644
index 0000000..46bc7be
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusReconGadget.cpp
@@ -0,0 +1,1909 @@
+
+#include "GtPlusReconGadget.h"
+#include "GtPlusGadgetOpenMP.h"
+#include "gadgetron_paths.h"
+#include <iomanip>
+#include "CloudBus.h"
+
+using namespace Gadgetron::gtPlus;
+
+namespace Gadgetron
+{
+
+    GtPlusReconGadget::GtPlusReconGadget()
+    {
+        image_series_ = 100;
+
+        min_intensity_value_ = 64;
+        max_intensity_value_ = 4095;
+
+        max_intensity_value_US_ = 2048;
+
+        scalingFactor_ = -1;
+        scalingFactor_gfactor_ = 100;
+        scalingFactor_wrap_around_map_ = 1000;
+        scalingFactor_snr_image_ = 10;
+        scalingFactor_std_map_ = 1000;
+
+        start_frame_for_std_map_ = 5;
+
+        use_constant_scalingFactor_ = false;
+
+        timeStampResolution_ = 0.0025f;
+
+        aSpacing_[0] = 2.0;
+        aSpacing_[1] = 2.0;
+        aSpacing_[2] = 6.0;
+        aSpacing_[3] = 1.0;
+        aSpacing_[4] = 1.0;
+        aSpacing_[5] = 1.0;
+
+        reconE1_ = 1;
+        reconE2_ = 1;
+
+        processed_called_times_ = 0;
+
+        thread_number_ratio_ = 0;
+
+        kSpaceMaxAcqE2No_ = 0;
+
+        filterRO_type_ = ISMRMRD_FILTER_GAUSSIAN;
+        filterRO_sigma_ = 1.5;
+        filterRO_width_ = 0.15;
+
+        filterE1_type_ = ISMRMRD_FILTER_GAUSSIAN;
+        filterE1_sigma_ = 1.5;
+        filterE1_width_ = 0.15;
+
+        filterE2_type_ = ISMRMRD_FILTER_GAUSSIAN;
+        filterE2_sigma_ = 1.5;
+        filterE2_width_ = 0.15;
+
+        filterRO_ref_type_ = ISMRMRD_FILTER_HANNING;
+        filterRO_ref_sigma_ = 1.5;
+        filterRO_ref_width_ = 0.15;
+
+        filterE1_ref_type_ = ISMRMRD_FILTER_HANNING;
+        filterE1_ref_sigma_ = 1.5;
+        filterE1_ref_width_ = 0.15;
+
+        filterE2_ref_type_ = ISMRMRD_FILTER_HANNING;
+        filterE2_ref_sigma_ = 1.5;
+        filterE2_ref_width_ = 0.15;
+
+        filterRO_pf_type_ = ISMRMRD_FILTER_HANNING;
+        filterRO_pf_sigma_ = 1.5;
+        filterRO_pf_width_ = 0.15;
+        filterRO_pf_densityComp_ = false;
+
+        filterE1_pf_type_ = ISMRMRD_FILTER_HANNING;
+        filterE1_pf_sigma_ = 1.5;
+        filterE1_pf_width_ = 0.15;
+        filterE1_pf_densityComp_ = false;
+
+        filterE2_pf_type_ = ISMRMRD_FILTER_HANNING;
+        filterE2_pf_sigma_ = 1.5;
+        filterE2_pf_width_ = 0.15;
+        filterE2_pf_densityComp_ = false;
+
+        recon_res_second_required_ = false;
+
+        send_out_recon_ = true;
+        send_out_recon_second_ = true;
+
+        debugFolder_ = "DebugOutput";
+        debugFolder2_ = debugFolder_;
+
+        performTiming_ = true;
+
+        verboseMode_ = false;
+
+        CloudComputing_ = false;
+        CloudSize_ = 0;
+
+        gt_timer1_.set_timing_in_destruction(false);
+        gt_timer2_.set_timing_in_destruction(false);
+        gt_timer3_.set_timing_in_destruction(false);
+
+        Gadgetron::prepOpenMP();
+    }
+
+    GtPlusReconGadget::~GtPlusReconGadget()
+    {
+    }
+
+    bool GtPlusReconGadget::readParameters()
+    {
+        try
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "------> GtPlusReconGadget parameters <------");
+
+            min_intensity_value_ = this->get_int_value("min_intensity_value");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "min_intensity_value_ is " << min_intensity_value_);
+
+            max_intensity_value_ = this->get_int_value("max_intensity_value");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "max_intensity_value_ is " << max_intensity_value_);
+
+            scalingFactor_ = this->get_double_value("scalingFactor");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "scalingFactor_ is " << scalingFactor_);
+
+            scalingFactor_gfactor_ = this->get_double_value("scalingFactor_gfactor");
+            if ( scalingFactor_gfactor_ == 0 ) scalingFactor_gfactor_ = 100;
+            GDEBUG_CONDITION_STREAM(verboseMode_, "scalingFactor_gfactor_ is " << scalingFactor_gfactor_);
+
+            scalingFactor_wrap_around_map_ = this->get_double_value("scalingFactor_wrap_around_map");
+            if ( scalingFactor_wrap_around_map_ == 0 ) scalingFactor_wrap_around_map_ = 1000;
+            GDEBUG_CONDITION_STREAM(verboseMode_, "scalingFactor_wrap_around_map_ is " << scalingFactor_wrap_around_map_);
+
+            scalingFactor_snr_image_ = this->get_double_value("scalingFactor_snr_image");
+            if ( scalingFactor_snr_image_ == 0 ) scalingFactor_snr_image_ = 10;
+            GDEBUG_CONDITION_STREAM(verboseMode_, "scalingFactor_snr_image_ is " << scalingFactor_snr_image_);
+
+            scalingFactor_std_map_ = this->get_double_value("scalingFactor_std_map");
+            if ( scalingFactor_std_map_ == 0 ) scalingFactor_std_map_ = 1000;
+            GDEBUG_CONDITION_STREAM(verboseMode_, "scalingFactor_std_map_ is " << scalingFactor_std_map_);
+
+            start_frame_for_std_map_ = this->get_int_value("start_frame_for_std_map");
+            if ( start_frame_for_std_map_ == 0 ) start_frame_for_std_map_ = 5;
+            GDEBUG_CONDITION_STREAM(verboseMode_, "start_frame_for_std_map_ is " << start_frame_for_std_map_);
+
+            use_constant_scalingFactor_ = this->get_bool_value("use_constant_scalingFactor");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "use_constant_scalingFactor_ is " << use_constant_scalingFactor_);
+
+            boost::shared_ptr<std::string> str = this->get_string_value("debugFolder");
+            debugFolder_ = *str;
+            GDEBUG_CONDITION_STREAM(verboseMode_, "debugFolder_ is " << debugFolder_);
+
+            boost::shared_ptr<std::string> str2 = this->get_string_value("debugFolder2");
+            debugFolder2_ = *str2;
+            GDEBUG_CONDITION_STREAM(verboseMode_, "debugFolder2_ is " << debugFolder2_);
+
+            timeStampResolution_ = (float)this->get_double_value("timeStampResolution");
+            if ( timeStampResolution_ < FLT_EPSILON ) timeStampResolution_ = 0.0025f;
+            GDEBUG_CONDITION_STREAM(verboseMode_, "timeStampResolution_ is " << timeStampResolution_);
+
+            str = this->get_string_value("send_out_recon");
+            if ( !str->empty() )
+            {
+                send_out_recon_ = this->get_bool_value("send_out_recon");
+            }
+            else
+            {
+                send_out_recon_ = true;
+            }
+            GDEBUG_CONDITION_STREAM(verboseMode_, "send_out_recon_ is " << send_out_recon_);
+
+            str = this->get_string_value("send_out_recon_second");
+            if ( !str->empty() )
+            {
+                send_out_recon_second_ = this->get_bool_value("send_out_recon_second");
+            }
+            else
+            {
+                send_out_recon_second_ = true;
+            }
+            GDEBUG_CONDITION_STREAM(verboseMode_, "send_out_recon_second_ is " << send_out_recon_second_);
+
+            performTiming_ = this->get_bool_value("performTiming");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "performTiming_ is " << performTiming_);
+
+            performTiming_ = this->get_bool_value("performTiming");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "performTiming_ is " << performTiming_);
+
+            // kspace filter parameters
+            str = this->get_string_value("filterRO");
+            filterRO_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+            filterRO_sigma_ = this->get_double_value("filterRO_sigma");
+            filterRO_width_ = this->get_double_value("filterRO_width");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterRO_type_ is " << *str);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterRO_sigma_ is " << filterRO_sigma_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterRO_width_ is " << filterRO_width_);
+
+            str = this->get_string_value("filterE1");
+            filterE1_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+            filterE1_sigma_ = this->get_double_value("filterE1_sigma");
+            filterE1_width_ = this->get_double_value("filterE1_width");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterE1_type_ is " << *str);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterE1_sigma_ is " << filterE1_sigma_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterE1_width_ is " << filterE1_width_);
+
+            str = this->get_string_value("filterE2");
+            filterE2_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+            filterE2_sigma_ = this->get_double_value("filterE2_sigma");
+            filterE2_width_ = this->get_double_value("filterE2_width");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterE2_type_ is " << *str);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterE2_sigma_ is " << filterE2_sigma_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterE2_width_ is " << filterE2_width_);
+
+            str = this->get_string_value("filterRefRO");
+            filterRO_ref_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+            filterRO_ref_sigma_ = this->get_double_value("filterRefRO_sigma");
+            filterRO_ref_width_ = this->get_double_value("filterRefRO_width");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterRO_ref_type_ is " << *str);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterRO_ref_sigma_ is " << filterRO_ref_sigma_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterRO_ref_width_ is " << filterRO_ref_width_);
+
+            str = this->get_string_value("filterRefE1");
+            filterE1_ref_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+            filterE1_ref_sigma_ = this->get_double_value("filterRefE1_sigma");
+            filterE1_ref_width_ = this->get_double_value("filterRefE1_width");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterE1_ref_type_ is " << *str);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterE1_ref_sigma_ is " << filterE1_ref_sigma_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterE1_ref_width_ is " << filterE1_ref_width_);
+
+            str = this->get_string_value("filterRefE2");
+            filterE2_ref_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+            filterE2_ref_sigma_ = this->get_double_value("filterRefE2_sigma");
+            filterE2_ref_width_ = this->get_double_value("filterRefE2_width");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterE2_ref_type_ is " << *str);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterE2_ref_sigma_ is " << filterE2_ref_sigma_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterE2_ref_width_ is " << filterE2_ref_width_);
+
+            str = this->get_string_value("filterPartialFourierRO");
+            filterRO_pf_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+            filterRO_pf_sigma_ = this->get_double_value("filterPartialFourierRO_sigma");
+            filterRO_pf_width_ = this->get_double_value("filterPartialFourierRO_width");
+            filterRO_pf_densityComp_ = this->get_bool_value("filterPartialFourierRO_densityComp");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterRO_pf_type_ is " << *str);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterRO_pf_sigma_ is " << filterRO_pf_sigma_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterRO_pf_width_ is " << filterRO_pf_width_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterRO_pf_densityComp_ is " << filterRO_pf_densityComp_);
+
+            str = this->get_string_value("filterPartialFourierE1");
+            filterE1_pf_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+            filterE1_pf_sigma_ = this->get_double_value("filterPartialFourierE1_sigma");
+            filterE1_pf_width_ = this->get_double_value("filterPartialFourierE1_width");
+            filterE1_pf_densityComp_ = this->get_bool_value("filterPartialFourierE1_densityComp");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterE1_pf_type_ is " << *str);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterE1_pf_sigma_ is " << filterE1_pf_sigma_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterE1_pf_width_ is " << filterE1_pf_width_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterE1_pf_densityComp_ is " << filterE1_pf_densityComp_);
+
+            str = this->get_string_value("filterPartialFourierE2");
+            filterE2_pf_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+            filterE2_pf_sigma_ = this->get_double_value("filterPartialFourierE2_sigma");
+            filterE2_pf_width_ = this->get_double_value("filterPartialFourierE2_width");
+            filterE2_pf_densityComp_ = this->get_bool_value("filterPartialFourierE2_densityComp");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterE2_pf_type_ is " << *str);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterE2_pf_sigma_ is " << filterE2_pf_sigma_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterE2_pf_width_ is " << filterE2_pf_width_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "filterE2_pf_densityComp_ is " << filterE2_pf_densityComp_);
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+            CloudComputing_ = this->get_bool_value("CloudComputing");
+            CloudSize_ = (unsigned int)(this->get_int_value("CloudSize"));
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "CloudComputing_ is " << CloudComputing_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "CloudSize_ is " << CloudSize_);
+
+            str = this->get_string_value("cloudNodeFile");
+            cloud_node_file_ = *str;
+            GDEBUG_CONDITION_STREAM(verboseMode_, "cloud_node_file_ is " << cloud_node_file_);
+
+            // read in the cloud information for every node
+            gt_cloud_.resize(CloudSize_);
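+            // each node is described by four XML parameters, named as constructed below:
+            // CloudNode<i>_IP, CloudNode<i>_Port, CloudNode<i>_XMLConfiguration and CloudNode<i>_ComputingPowerIndex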
+
+            for ( unsigned int ii=0; ii<CloudSize_; ii++ )
+            {
+                std::ostringstream ostreamstr1;
+                ostreamstr1 << "CloudNode" << ii << "_IP" << std::ends;
+                boost::shared_ptr<std::string> IP = this->get_string_value(ostreamstr1.str().c_str());
+                gt_cloud_[ii].get<0>() = *IP;
+
+                std::ostringstream ostreamstr2;
+                ostreamstr2 << "CloudNode" << ii << "_Port" << std::ends;
+                boost::shared_ptr<std::string> Port = this->get_string_value(ostreamstr2.str().c_str());
+                gt_cloud_[ii].get<1>() = *Port;
+
+                std::ostringstream ostreamstr3;
+                ostreamstr3 << "CloudNode" << ii << "_XMLConfiguration" << std::ends;
+                boost::shared_ptr<std::string> xmlName = this->get_string_value(ostreamstr3.str().c_str());
+                gt_cloud_[ii].get<2>() = *xmlName;
+
+                std::ostringstream ostreamstr4;
+                ostreamstr4 << "CloudNode" << ii << "_ComputingPowerIndex" << std::ends;
+                unsigned int computingPowerIndex = this->get_int_value(ostreamstr4.str().c_str());
+                gt_cloud_[ii].get<3>() = computingPowerIndex;
+
+                GDEBUG_CONDITION_STREAM(verboseMode_, "Cloud Node " << ii << " : " << gt_cloud_[ii]);
+            }
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+            thread_number_ratio_ = (float)this->get_double_value("thread_number_ratio");
+            if ( thread_number_ratio_>1 || thread_number_ratio_<0 ) thread_number_ratio_ = 0;
+            GDEBUG_CONDITION_STREAM(verboseMode_, "thread_number_ratio_ is " << thread_number_ratio_);
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "==================================================================");
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "------> GtPlus recon parameters <------");
+
+            workOrderPara_.upstream_coil_compression_ = this->get_bool_value("upstream_coil_compression");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "upstream_coil_compression_ is " << workOrderPara_.upstream_coil_compression_);
+
+            workOrderPara_.upstream_coil_compression_thres_ = this->get_double_value("upstream_coil_compression_thres");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "upstream_coil_compression_thres_ is " << workOrderPara_.upstream_coil_compression_thres_);
+
+            workOrderPara_.upstream_coil_compression_num_modesKept_ = this->get_int_value("upstream_coil_compression_num_modesKept");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "upstream_coil_compression_num_modesKept_ is " << workOrderPara_.upstream_coil_compression_num_modesKept_);
+
+            workOrderPara_.downstream_coil_compression_ = this->get_bool_value("downstream_coil_compression");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "downstream_coil_compression_ is " << workOrderPara_.downstream_coil_compression_);
+
+            workOrderPara_.coil_compression_thres_ = this->get_double_value("coil_compression_thres");
+
+            if (workOrderPara_.upstream_coil_compression_ && (workOrderPara_.upstream_coil_compression_thres_>0) && (workOrderPara_.coil_compression_thres_ > workOrderPara_.upstream_coil_compression_thres_))
+                workOrderPara_.coil_compression_thres_ = workOrderPara_.upstream_coil_compression_thres_;
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "coil_compression_thres_ is " << workOrderPara_.coil_compression_thres_);
+
+            workOrderPara_.coil_compression_num_modesKept_ = this->get_int_value("coil_compression_num_modesKept");
+
+            if (workOrderPara_.upstream_coil_compression_ && (workOrderPara_.upstream_coil_compression_num_modesKept_>0) && (workOrderPara_.coil_compression_num_modesKept_ > workOrderPara_.upstream_coil_compression_num_modesKept_))
+                workOrderPara_.coil_compression_num_modesKept_ = workOrderPara_.upstream_coil_compression_num_modesKept_;
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "coil_compression_num_modesKept_ is " << workOrderPara_.coil_compression_num_modesKept_);
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+            str = this->get_string_value("coil_map_algorithm");
+            workOrderPara_.coil_map_algorithm_ = gtPlus_util_.getISMRMRDCoilMapAlgoFromName(*str);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "coil_map_algorithm_ is " << *str);
+
+            workOrderPara_.csm_kSize_ = (size_t)(this->get_int_value("csm_kSize"));
+            GDEBUG_CONDITION_STREAM(verboseMode_, "csm_kSize_ is " << workOrderPara_.csm_kSize_);
+
+            workOrderPara_.csm_powermethod_num_ = (size_t)(this->get_int_value("csm_powermethod_num"));
+            GDEBUG_CONDITION_STREAM(verboseMode_, "csm_powermethod_num_ is " << workOrderPara_.csm_powermethod_num_);
+
+            workOrderPara_.csm_true_3D_ = this->get_bool_value("csm_true_3D");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "csm_true_3D_ is " << workOrderPara_.csm_true_3D_);
+
+            workOrderPara_.csm_iter_num_ = (size_t)(this->get_int_value("csm_iter_num"));
+            GDEBUG_CONDITION_STREAM(verboseMode_, "csm_iter_num_ is " << workOrderPara_.csm_iter_num_);
+
+            workOrderPara_.csm_iter_thres_ = this->get_double_value("csm_iter_thres");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "csm_iter_thres_ is " << workOrderPara_.csm_iter_thres_);
+
+            workOrderPara_.csm_use_gpu_ = this->get_bool_value("csm_use_gpu");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "csm_use_gpu_ is " << workOrderPara_.csm_use_gpu_);
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+            str = this->get_string_value("recon_algorithm");
+            workOrderPara_.recon_algorithm_ = gtPlus_util_.getISMRMRDReconAlgoFromName(*str);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "recon_algorithm_ is " << *str);
+
+            workOrderPara_.recon_auto_parameters_ = this->get_bool_value("recon_auto_parameters");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "recon_auto_parameters_ is " << workOrderPara_.recon_auto_parameters_);
+
+            workOrderPara_.gfactor_needed_ = this->get_bool_value("gfactor_needed");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "gfactor_needed_ is " << workOrderPara_.gfactor_needed_);
+
+            workOrderPara_.wrap_around_map_needed_ = this->get_bool_value("wrap_around_map_needed");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "wrap_around_map_needed_ is " << workOrderPara_.wrap_around_map_needed_);
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+            workOrderPara_.grappa_kSize_RO_ = (size_t)(this->get_int_value("grappa_kSize_RO"));
+            workOrderPara_.grappa_kSize_E1_ = (size_t)(this->get_int_value("grappa_kSize_E1"));
+            workOrderPara_.grappa_kSize_E2_ = (size_t)(this->get_int_value("grappa_kSize_E2"));
+            workOrderPara_.grappa_reg_lamda_ = this->get_double_value("grappa_reg_lamda");
+            workOrderPara_.grappa_calib_over_determine_ratio_ = this->get_double_value("grappa_calib_over_determine_ratio");
+            workOrderPara_.grappa_use_gpu_ = this->get_bool_value("grappa_use_gpu");
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "grappa_kSize_RO_ is " << workOrderPara_.grappa_kSize_RO_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "grappa_kSize_E1_ is " << workOrderPara_.grappa_kSize_E1_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "grappa_kSize_E2_ is " << workOrderPara_.grappa_kSize_E2_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "grappa_reg_lamda_ is " << workOrderPara_.grappa_reg_lamda_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "grappa_calib_over_determine_ratio_ is " << workOrderPara_.grappa_calib_over_determine_ratio_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "grappa_use_gpu_ is " << workOrderPara_.grappa_use_gpu_);
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+            workOrderPara_.spirit_kSize_RO_ = (size_t)(this->get_int_value("spirit_kSize_RO"));
+            if ( workOrderPara_.spirit_kSize_RO_ == 0 ) workOrderPara_.spirit_kSize_RO_ = 7;
+
+            workOrderPara_.spirit_kSize_E1_ = (size_t)(this->get_int_value("spirit_kSize_E1"));
+            if ( workOrderPara_.spirit_kSize_E1_ == 0 ) workOrderPara_.spirit_kSize_E1_ = 7;
+
+            workOrderPara_.spirit_kSize_E2_ = (size_t)(this->get_int_value("spirit_kSize_E2"));
+            if ( workOrderPara_.spirit_kSize_E2_ == 0 ) workOrderPara_.spirit_kSize_E2_ = 5;
+
+            workOrderPara_.spirit_reg_lamda_ = this->get_double_value("spirit_reg_lamda");
+            if ( workOrderPara_.spirit_reg_lamda_ < FLT_EPSILON ) workOrderPara_.spirit_reg_lamda_ = 0.005;
+
+            workOrderPara_.spirit_use_gpu_ = this->get_bool_value("spirit_use_gpu");
+            workOrderPara_.spirit_calib_over_determine_ratio_ = this->get_double_value("spirit_calib_over_determine_ratio");
+            workOrderPara_.spirit_solve_symmetric_ = this->get_bool_value("spirit_solve_symmetric");
+
+            workOrderPara_.spirit_iter_max_ = (size_t)(this->get_int_value("spirit_iter_max"));
+            if ( workOrderPara_.spirit_iter_max_ == 0 ) workOrderPara_.spirit_iter_max_ = 100;
+
+            workOrderPara_.spirit_iter_thres_ = this->get_double_value("spirit_iter_thres");
+            if ( workOrderPara_.spirit_iter_thres_ < FLT_EPSILON ) workOrderPara_.spirit_iter_thres_ = 0.0015;
+
+            workOrderPara_.spirit_print_iter_ = this->get_bool_value("spirit_print_iter");
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_kSize_RO_ is " << workOrderPara_.spirit_kSize_RO_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_kSize_E1_ is " << workOrderPara_.spirit_kSize_E1_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_kSize_E2_ is " << workOrderPara_.spirit_kSize_E2_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_reg_lamda_ is " << workOrderPara_.spirit_reg_lamda_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_use_gpu_ is " << workOrderPara_.spirit_use_gpu_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_calib_over_determine_ratio_ is " << workOrderPara_.spirit_calib_over_determine_ratio_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_solve_symmetric_ is " << workOrderPara_.spirit_solve_symmetric_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_iter_max_ is " << workOrderPara_.spirit_iter_max_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_iter_thres_ is " << workOrderPara_.spirit_iter_thres_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_print_iter_ is " << workOrderPara_.spirit_print_iter_);
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+            workOrderPara_.spirit_perform_linear_ = this->get_bool_value("spirit_perform_linear");
+            workOrderPara_.spirit_perform_nonlinear_ = this->get_bool_value("spirit_perform_nonlinear");
+            workOrderPara_.spirit_parallel_imaging_lamda_ = this->get_double_value("spirit_parallel_imaging_lamda");
+            workOrderPara_.spirit_image_reg_lamda_ = this->get_double_value("spirit_image_reg_lamda");
+            workOrderPara_.spirit_data_fidelity_lamda_ = this->get_double_value("spirit_data_fidelity_lamda");
+            workOrderPara_.spirit_ncg_iter_max_ = (size_t)(this->get_int_value("spirit_ncg_iter_max"));
+            workOrderPara_.spirit_ncg_iter_thres_ = this->get_double_value("spirit_ncg_iter_thres");
+            workOrderPara_.spirit_ncg_print_iter_ = this->get_bool_value("spirit_ncg_print_iter");
+            // spirit_ncg_scale_factor_ is computed from the data
+
+            workOrderPara_.spirit_use_coil_sen_map_ = this->get_bool_value("spirit_use_coil_sen_map");
+            workOrderPara_.spirit_use_moco_enhancement_ = this->get_bool_value("spirit_use_moco_enhancement");
+            workOrderPara_.spirit_recon_moco_images_ = this->get_bool_value("spirit_recon_moco_images");
+            workOrderPara_.spirit_RO_enhancement_ratio_ = this->get_double_value("spirit_RO_enhancement_ratio");
+            workOrderPara_.spirit_E1_enhancement_ratio_ = this->get_double_value("spirit_E1_enhancement_ratio");
+            workOrderPara_.spirit_E2_enhancement_ratio_ = this->get_double_value("spirit_E2_enhancement_ratio");
+            workOrderPara_.spirit_temporal_enhancement_ratio_ = this->get_double_value("spirit_temporal_enhancement_ratio");
+            workOrderPara_.spirit_2D_scale_per_chunk_ = this->get_bool_value("spirit_2D_scale_per_chunk");
+            workOrderPara_.spirit_3D_scale_per_chunk_ = this->get_bool_value("spirit_3D_scale_per_chunk");
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_perform_linear_ is " << workOrderPara_.spirit_perform_linear_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_perform_nonlinear_ is " << workOrderPara_.spirit_perform_nonlinear_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_parallel_imaging_lamda_ is " << workOrderPara_.spirit_parallel_imaging_lamda_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_image_reg_lamda_ is " << workOrderPara_.spirit_image_reg_lamda_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_data_fidelity_lamda_ is " << workOrderPara_.spirit_data_fidelity_lamda_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_ncg_iter_max_ is " << workOrderPara_.spirit_ncg_iter_max_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_ncg_iter_thres_ is " << workOrderPara_.spirit_ncg_iter_thres_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_ncg_print_iter_ is " << workOrderPara_.spirit_ncg_print_iter_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_use_coil_sen_map_ is " << workOrderPara_.spirit_use_coil_sen_map_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_use_moco_enhancement_ is " << workOrderPara_.spirit_use_moco_enhancement_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_recon_moco_images_ is " << workOrderPara_.spirit_recon_moco_images_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_RO_enhancement_ratio_ is " << workOrderPara_.spirit_RO_enhancement_ratio_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_E1_enhancement_ratio_ is " << workOrderPara_.spirit_E1_enhancement_ratio_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_E2_enhancement_ratio_ is " << workOrderPara_.spirit_E2_enhancement_ratio_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_temporal_enhancement_ratio_ is " << workOrderPara_.spirit_temporal_enhancement_ratio_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_2D_scale_per_chunk_ is " << workOrderPara_.spirit_2D_scale_per_chunk_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "spirit_3D_scale_per_chunk_ is " << workOrderPara_.spirit_3D_scale_per_chunk_);
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+            str = this->get_string_value("retro_gated_interp_method");
+            workOrderPara_.retro_gated_interp_method_ = gtPlus_util_.getISMRMRDRetroGatingInterpFromName(*str);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "retro_gated_interp_method_ is " << *str);
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+            workOrderPara_.job_split_by_S_ = this->get_bool_value("job_split_by_S");
+            workOrderPara_.job_num_of_N_ = (size_t)(this->get_int_value("job_num_of_N"));
+            workOrderPara_.job_max_Megabytes_ = (size_t)(this->get_int_value("job_max_Megabytes"));
+            workOrderPara_.job_overlap_ = (size_t)(this->get_int_value("job_overlap"));
+            workOrderPara_.job_perform_on_control_node_ = this->get_bool_value("job_perform_on_control_node");
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "job_split_by_S_ is " << workOrderPara_.job_split_by_S_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "job_num_of_N_ is " << workOrderPara_.job_num_of_N_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "job_max_Megabytes_ is " << workOrderPara_.job_max_Megabytes_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "job_overlap_ is " << workOrderPara_.job_overlap_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "job_perform_on_control_node_ is " << workOrderPara_.job_perform_on_control_node_);
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+            str = this->get_string_value("partialFourier_algo");
+            workOrderPara_.partialFourier_algo_ = gtPlus_util_.getISMRMRDPartialFourierReconAlgoFromName(*str);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "partialFourier_algo_ is " << *str);
+
+            workOrderPara_.partialFourier_homodyne_iters_ = (size_t)(this->get_int_value("partialFourier_homodyne_iters"));
+            workOrderPara_.partialFourier_homodyne_thres_ = this->get_double_value("partialFourier_homodyne_thres");
+            workOrderPara_.partialFourier_homodyne_densityComp_ = this->get_bool_value("partialFourier_homodyne_densityComp");
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "partialFourier_homodyne_iters_ is " << workOrderPara_.partialFourier_homodyne_iters_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "partialFourier_homodyne_thres_ is " << workOrderPara_.partialFourier_homodyne_thres_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "partialFourier_homodyne_densityComp_ is " << workOrderPara_.partialFourier_homodyne_densityComp_);
+
+            workOrderPara_.partialFourier_POCS_iters_ = (size_t)(this->get_int_value("partialFourier_POCS_iters"));
+            workOrderPara_.partialFourier_POCS_thres_ = this->get_double_value("partialFourier_POCS_thres");
+            workOrderPara_.partialFourier_POCS_transitBand_ = (size_t)(this->get_int_value("partialFourier_POCS_transitBand"));
+            workOrderPara_.partialFourier_POCS_transitBand_E2_ = (size_t)(this->get_int_value("partialFourier_POCS_transitBand_E2"));
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "partialFourier_POCS_iters_ is " << workOrderPara_.partialFourier_POCS_iters_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "partialFourier_POCS_thres_ is " << workOrderPara_.partialFourier_POCS_thres_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "partialFourier_POCS_transitBand_ is " << workOrderPara_.partialFourier_POCS_transitBand_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "partialFourier_POCS_transitBand_ is " << workOrderPara_.partialFourier_POCS_transitBand_E2_);
+
+            workOrderPara_.partialFourier_FengHuang_kSize_RO_ = (size_t)(this->get_int_value("partialFourier_FengHuang_kSize_RO"));
+            workOrderPara_.partialFourier_FengHuang_kSize_E1_ = (size_t)(this->get_int_value("partialFourier_FengHuang_kSize_E1"));
+            workOrderPara_.partialFourier_FengHuang_kSize_E2_ = (size_t)(this->get_int_value("partialFourier_FengHuang_kSize_E2"));
+            workOrderPara_.partialFourier_FengHuang_thresReg_ = this->get_double_value("partialFourier_FengHuang_thresReg");
+            workOrderPara_.partialFourier_FengHuang_sameKernel_allN_ = this->get_bool_value("partialFourier_FengHuang_sameKernel_allN");
+            workOrderPara_.partialFourier_FengHuang_transitBand_ = (size_t)(this->get_int_value("partialFourier_FengHuang_transitBand"));
+            workOrderPara_.partialFourier_FengHuang_transitBand_E2_ = (size_t)(this->get_int_value("partialFourier_FengHuang_transitBand_E2"));
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "partialFourier_FengHuang_kSize_RO_ is " << workOrderPara_.partialFourier_FengHuang_kSize_RO_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "partialFourier_FengHuang_kSize_E1_ is " << workOrderPara_.partialFourier_FengHuang_kSize_E1_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "partialFourier_FengHuang_kSize_E2_ is " << workOrderPara_.partialFourier_FengHuang_kSize_E2_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "partialFourier_FengHuang_thresReg_ is " << workOrderPara_.partialFourier_FengHuang_thresReg_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "partialFourier_FengHuang_sameKernel_allN_ is " << workOrderPara_.partialFourier_FengHuang_sameKernel_allN_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "partialFourier_FengHuang_transitBand_ is " << workOrderPara_.partialFourier_FengHuang_transitBand_);
+            GDEBUG_CONDITION_STREAM(verboseMode_, "partialFourier_FengHuang_transitBand_E2_ is " << workOrderPara_.partialFourier_FengHuang_transitBand_E2_);
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+            recon_kspace_needed_ = this->get_bool_value("recon_kspace_needed");
+            GDEBUG_CONDITION_STREAM(verboseMode_, "recon_kspace_needed_ is " << recon_kspace_needed_);
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in GtPlusReconGadget::readParameters() ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    bool GtPlusReconGadget::parseGTCloudNodeFile(const std::string& filename, CloudType& gtCloud)
+    {
+
+        bool using_cloudbus = this->get_bool_value("using_cloudbus");
+        bool has_cloud_node_xml_configuration = !this->get_string_value("CloudNodeXMLConfiguration")->empty();
+
+        if (using_cloudbus && has_cloud_node_xml_configuration) {
+            std::vector<GadgetronNodeInfo> nodes;
+            CloudBus::instance()->get_node_info(nodes);
+            gtCloud.resize(nodes.size());
+
+            unsigned int n;
+            for ( n=0; n<nodes.size(); n++ )
+            {
+                std::stringstream ss;
+                gtCloud[n].get<0>() = nodes[n].address;
+                ss << nodes[n].port;
+                gtCloud[n].get<1>() = ss.str();
+                gtCloud[n].get<2>() = *this->get_string_value("CloudNodeXMLConfiguration");
+                gtCloud[n].get<3>() = nodes[n].compute_capability;
+
+                GDEBUG_CONDITION_STREAM(verboseMode_, "Gadget Node " << n << " : " << gt_cloud_[n]);
+            }
+
+            return true; //We will leave the function here
+
+        }
+
+        std::string nodeFileName = get_gadgetron_home();
+        nodeFileName.append("/share/gadgetron/config/gtCloud/");
+        nodeFileName.append(filename);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "Cloud node file name is " << nodeFileName);
+
+        std::ifstream fs(nodeFileName.c_str(), std::ios::in);
+        if (!fs.is_open()) 
+        {
+            GWARN_STREAM("Cannot open GT CloudNodeFile; use the local setting instead ... ");
+            return false;
+        }
+
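+        // expected plain-text layout of the node file (as read below):
+        //   <control node hostname> <control node port>
+        //   <number of gadget-level nodes N>
+        //   <hostname> <port> <xml configuration name> <computing power index>   (repeated N times)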
+        // control node hostname
+        std::string controlNode;
+        fs >> controlNode;
+
+        std::string portControlNode;
+        fs >> portControlNode;
+
+        // number of GadgetLevel nodes
+        unsigned int num;
+        fs >> num;
+
+        gtCloud.resize(num);
+
+        unsigned int n;
+        for ( n=0; n<num; n++ )
+        {
+            std::string gadgetNode;
+            fs >> gadgetNode;
+
+            std::string portGadgetNode;
+            fs >> portGadgetNode;
+
+            std::string xmlGadgetNode;
+            fs >> xmlGadgetNode;
+
+            unsigned int computingPowerIndex;
+            fs >> computingPowerIndex;
+
+            gtCloud[n].get<0>() = gadgetNode;
+            gtCloud[n].get<1>() = portGadgetNode;
+            gtCloud[n].get<2>() = xmlGadgetNode;
+            gtCloud[n].get<3>() = computingPowerIndex;
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "Gadget Node " << n << " : " << gt_cloud_[n]);
+        }
+
+        fs.close();
+
+        return true;
+    }
+
+    int GtPlusReconGadget::process_config(ACE_Message_Block* mb)
+    {
+        // [Ro E1 Cha Slice E2 Con Phase Rep Set Seg Ave]
+        //   0  1  2   3    4   5    6     7  8   9   10
+
+        verboseMode_ = this->get_bool_value("verboseMode");
+
+        // read parameters from xml
+        image_series_ = this->get_int_value("image_series");
+
+        // read in parameters from the xml
+        GADGET_CHECK_RETURN(this->readParameters(), GADGET_FAIL);
+
+        // check whether the second set of recon results is required
+        recon_res_second_required_ = false;
+
+        ISMRMRD::IsmrmrdHeader h;
+        try {
+            deserialize(mb->rd_ptr(),h);
+        } catch (...) {
+            GDEBUG("Error parsing ISMRMRD Header");
+        }
+
+        if (!h.acquisitionSystemInformation) {
+            GDEBUG("acquisitionSystemInformation not found in header. Bailing out");
+            return GADGET_FAIL;
+        }
+        num_acq_channels_ = h.acquisitionSystemInformation->receiverChannels;
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "Number of acquisition channels : " << num_acq_channels_);
+
+        if (h.encoding.size() < 1 || h.encoding.size() > 2) {
+            GDEBUG("Number of encoding spaces: %d\n", h.encoding.size());
+            GDEBUG("This GtPlusReconGadget only supports one or two encoding spaces\n");
+            return GADGET_FAIL;
+        }
+
+        // find out the encoding space 
+        ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+        ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+        ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+
+        matrix_size_encoding_[0] = e_space.matrixSize.x;
+        matrix_size_encoding_[1] = e_space.matrixSize.y;
+        matrix_size_encoding_[2] = e_space.matrixSize.z;
+        GDEBUG_CONDITION_STREAM(verboseMode_, "Encoding matrix size: " << matrix_size_encoding_[0] << " " << matrix_size_encoding_[1] << " " << matrix_size_encoding_[2]);
+
+        field_of_view_encoding_[0] = e_space.fieldOfView_mm.x;
+        field_of_view_encoding_[1] = e_space.fieldOfView_mm.y;
+        field_of_view_encoding_[2] = e_space.fieldOfView_mm.z;
+        GDEBUG_CONDITION_STREAM(verboseMode_, "Encoding field_of_view : " << field_of_view_encoding_[0] << " " << field_of_view_encoding_[1] << " " << field_of_view_encoding_[2]);
+
+        // find the recon space
+        matrix_size_recon_[0] = r_space.matrixSize.x;
+        matrix_size_recon_[1] = r_space.matrixSize.y;
+        matrix_size_recon_[2] = r_space.matrixSize.z;
+        GDEBUG_CONDITION_STREAM(verboseMode_, "Recon matrix size : " << matrix_size_recon_[0] << " " << matrix_size_recon_[1] << " " << matrix_size_recon_[2]);
+
+        field_of_view_recon_[0] = r_space.fieldOfView_mm.x;
+        field_of_view_recon_[1] = r_space.fieldOfView_mm.y;
+        field_of_view_recon_[2] = r_space.fieldOfView_mm.z;
+        GDEBUG_CONDITION_STREAM(verboseMode_, "Recon field_of_view :  " << field_of_view_recon_[0] << " " << field_of_view_recon_[1] << " " << field_of_view_recon_[2]);
+
+        // this gadget supports two encoding spaces only if the
+        // second encoding space has the same field of view and resolution as the first
+        // e.g. for FLASH PAT reference scans.
+        if (h.encoding.size() == 2)
+        {
+            if (! ((h.encoding[0].reconSpace.matrixSize.x == h.encoding[1].reconSpace.matrixSize.x) && 
+                (h.encoding[0].reconSpace.matrixSize.y == h.encoding[1].reconSpace.matrixSize.y) && 
+                (h.encoding[0].reconSpace.matrixSize.z == h.encoding[1].reconSpace.matrixSize.z) && 
+                (h.encoding[0].reconSpace.fieldOfView_mm.x == h.encoding[1].reconSpace.fieldOfView_mm.x) &&
+                (h.encoding[0].reconSpace.fieldOfView_mm.y == h.encoding[1].reconSpace.fieldOfView_mm.y) &&
+                (h.encoding[0].reconSpace.fieldOfView_mm.z == h.encoding[1].reconSpace.fieldOfView_mm.z)) )
+            {
+                GDEBUG("Number of encoding spaces: %d\n", h.encoding.size());
+                GDEBUG("This GtPlusAccumulatorWorkOrderTriggerGadget only supports two encoding spaces with identical recon spaces.\n");
+                return GADGET_FAIL;
+            }
+        }
+
+        reconE1_ = matrix_size_recon_[1];
+        GDEBUG_CONDITION_STREAM(verboseMode_, "reconE1_ is " << reconE1_);
+
+        reconE2_ = matrix_size_recon_[2];
+        GDEBUG_CONDITION_STREAM(verboseMode_, "reconE2_ is " << reconE2_);
+
+        kSpaceMaxAcqE1No_ = matrix_size_encoding_[1]-1;
+        GDEBUG_CONDITION_STREAM(verboseMode_, "kSpaceMaxAcqE1No_ is " << kSpaceMaxAcqE1No_);
+
+        kSpaceMaxAcqE2No_ = matrix_size_encoding_[2]-1;
+        GDEBUG_CONDITION_STREAM(verboseMode_, "kSpaceMaxAcqE2No_ is " << kSpaceMaxAcqE2No_);
+
+        aSpacing_[0] = field_of_view_recon_[0]/matrix_size_recon_[0];
+        aSpacing_[1] = field_of_view_recon_[1]/reconE1_;
+        aSpacing_[2] = field_of_view_recon_[2]/reconE2_;
+
+        gt_exporter_.setPixelSize(aSpacing_[0], aSpacing_[1], aSpacing_[2], aSpacing_[3], aSpacing_[4], aSpacing_[5]);
+
+        //XUE-TODO: This is actually wrong. This assumes that you always zeropad, which is probably bad practice
+        meas_max_idx_.kspace_encode_step_1 = (uint16_t)matrix_size_encoding_[1]-1;
+        meas_max_idx_.set = (e_limits.set && (e_limits.set->maximum>0)) ? e_limits.set->maximum : 0;
+        meas_max_idx_.phase = (e_limits.phase && (e_limits.phase->maximum>0)) ? e_limits.phase->maximum : 0;
+
+        meas_max_idx_.kspace_encode_step_2 = (uint16_t)matrix_size_encoding_[2]-1; 
+
+        meas_max_idx_.contrast = (e_limits.contrast && (e_limits.contrast->maximum > 0)) ? e_limits.contrast->maximum : 0;
+        meas_max_idx_.slice = (e_limits.slice && (e_limits.slice->maximum > 0)) ? e_limits.slice->maximum : 0;
+        meas_max_idx_.repetition = e_limits.repetition ? e_limits.repetition->maximum : 0;
+        meas_max_idx_.average = e_limits.average ? e_limits.average->maximum : 0;
+
+        // combine all incoming segments
+        meas_max_idx_.segment = 0;
+
+        // find out the PAT mode
+        if (!h.encoding[0].parallelImaging) {
+            GDEBUG("Parallel Imaging section not found in header");
+            return GADGET_FAIL;
+        }
+
+        ISMRMRD::ParallelImaging p_imaging = *h.encoding[0].parallelImaging;
+
+        acceFactorE1_ = (long)(p_imaging.accelerationFactor.kspace_encoding_step_1);
+        acceFactorE2_ = (long)(p_imaging.accelerationFactor.kspace_encoding_step_2);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "acceFactorE1 is " << acceFactorE1_);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "acceFactorE2 is " << acceFactorE2_);
+
+        std::string calib = *p_imaging.calibrationMode;
+
+        bool separate = (calib.compare("separate") == 0);
+        bool embedded = (calib.compare("embedded") == 0);
+        bool external = (calib.compare("external") == 0);
+        bool interleaved = (calib.compare("interleaved") == 0);
+        bool other = (calib.compare("other") == 0);
+
+        if ( separate )
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "Colibration mode is separate");
+        }
+        else if ( embedded )
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "Colibration mode is embedded");
+        }
+        else if ( interleaved )
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "Colibration mode is interleaved");
+        }
+        else if ( external )
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "Colibration mode is external");
+        }
+        else if ( other )
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "Colibration mode is other");
+        }
+
+        //if ( other_ && acceFactorE1_==1 && acceFactorE2_==1 )
+        //{
+        //    GDEBUG_CONDITION_STREAM(verboseMode_, "Colibration mode is changed to ISMRMRD_interleaved");
+        //    CalibMode_ = Gadgetron::ISMRMRD_interleaved;
+        //    acceFactorE1_ = 2;
+        //}
+
+        CalibMode_ = Gadgetron::ISMRMRD_noacceleration;
+
+        if ( interleaved )
+        {
+            CalibMode_ = Gadgetron::ISMRMRD_interleaved;
+
+            if ( p_imaging.interleavingDimension )
+            {
+                if ( p_imaging.interleavingDimension->compare("phase") == 0 ) {
+                    InterleaveDim_ = Gadgetron::DIM_Phase;
+                } else if ( p_imaging.interleavingDimension->compare("repetition") == 0 ) {
+                    InterleaveDim_ = Gadgetron::DIM_Repetition;
+                } else if ( p_imaging.interleavingDimension->compare("average") == 0 ) {
+                    InterleaveDim_ = Gadgetron::DIM_Average;
+                } else if ( p_imaging.interleavingDimension->compare("contrast") == 0 ) {
+                    InterleaveDim_ = Gadgetron::DIM_Contrast;
+                } else if ( p_imaging.interleavingDimension->compare("other") == 0 ) {
+                    InterleaveDim_ = Gadgetron::DIM_other1;
+                } else {
+                    GDEBUG("Unknown interleaving dimension. Bailing out");
+                    return GADGET_FAIL;
+                }
+            }
+        }
+        else if ( embedded )
+        {
+            CalibMode_ = Gadgetron::ISMRMRD_embedded;
+        }
+        else if ( separate )
+        {
+            CalibMode_ = Gadgetron::ISMRMRD_separate;
+        }
+        else if ( external )
+        {
+            CalibMode_ = Gadgetron::ISMRMRD_external;
+        }
+        else if ( other )
+        {
+            CalibMode_ = Gadgetron::ISMRMRD_other;
+        }
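+        // Example (hypothetical header): calibrationMode "embedded" with accelerationFactor
+        // kspace_encoding_step_1 = 2 results in CalibMode_ = Gadgetron::ISMRMRD_embedded and
+        // acceFactorE1_ = 2; for "interleaved", interleavingDimension (e.g. "phase") additionally
+        // selects InterleaveDim_.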
+
+        // ---------------------------------------------------------------------------------------------------------
+        // generate the destination folder
+        if ( !debugFolder_.empty() )
+        {
+            Gadgetron::getDebugFolderPath(debugFolder_, debugFolder_fullPath_, verboseMode_);
+        }
+        else
+        {
+            GDEBUG_STREAM("GtPlusRecon, debugFolder is not set ...");
+        }
+
+        if ( !debugFolder2_.empty() )
+        {
+            Gadgetron::getDebugFolderPath(debugFolder2_, debugFolder2_fullPath_, verboseMode_);
+        }
+        else
+        {
+            GDEBUG_STREAM("GtPlusRecon, debugFolder2 is not set ...");
+        }
+
+        // ---------------------------------------------------------------------------------------------------------
+        // set the maximal number of threads used
+        // (note: the body of this check is empty in this version, so no thread limit is actually applied)
+        if ( thread_number_ratio_>0 && thread_number_ratio_<1 )
+        {
+        }
+
+        return GADGET_OK;
+    }
+
+    int GtPlusReconGadget::process(Gadgetron::GadgetContainerMessage< GtPlusGadgetImageArray >* m1, Gadgetron::GadgetContainerMessage< WorkOrderType > * m2)
+    {
+        GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusReconGadget::process(...) starts ... ");
+
+        processed_called_times_++;
+
+        GtPlusGadgetImageArray* images = m1->getObjectPtr();
+
+        boost::shared_ptr< std::vector<size_t> > dims = m2->getObjectPtr()->data_.get_dimensions();
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "[Ro E1 Cha Slice E2 Con Phase Rep Set Seg Ave] = [" 
+            << (*dims)[0] << " " << (*dims)[1] << " " << (*dims)[2] << " " 
+            << (*dims)[3] << " " << (*dims)[4] << " " << (*dims)[5] << " " 
+            << (*dims)[6] << " " << (*dims)[7] << " " << (*dims)[8] << " " 
+            << (*dims)[9] << " " << (*dims)[10] << "]");
+
+        dimensions_ = *dims;
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusReconGadget::process(...) ends ... ");
+
+        m1->release();
+        return GADGET_OK;
+    }
+
+    size_t GtPlusReconGadget::computeSeriesImageNumber (ISMRMRD::ImageHeader& imheader, size_t nCHA, size_t cha, size_t nE2, size_t e2)
+    {
+        size_t nSET = meas_max_idx_.set+1;
+        size_t nREP = meas_max_idx_.repetition+1;
+        size_t nPHS = meas_max_idx_.phase+1;
+        size_t nSLC = meas_max_idx_.slice+1;
+        size_t nCON = meas_max_idx_.contrast+1;
+        if ( nE2 == 0 ) nE2 = 1;
+
+        size_t imageNum = imheader.average*nREP*nSET*nPHS*nCON*nSLC*nE2*nCHA 
+            + imheader.repetition*nSET*nPHS*nCON*nSLC*nE2*nCHA 
+            + imheader.set*nPHS*nCON*nSLC*nE2*nCHA 
+            + imheader.phase*nCON*nSLC*nE2*nCHA 
+            + imheader.contrast*nSLC*nE2*nCHA
+            + imheader.slice*nE2*nCHA 
+            + e2*nCHA 
+            + cha 
+            + 1;
+
+        return imageNum;
+    }
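+    // Example (hypothetical sizes): with nSET = nREP = nPHS = nCON = 1, nSLC = 3, nE2 = 1 and
+    // nCHA = 1, the image at slice 2 (cha = 0, e2 = 0) receives number 2*1*1 + 0 + 0 + 1 = 3,
+    // i.e. numbering is 1-based and increments fastest over cha, then e2, slice, contrast,
+    // phase, set, repetition and average.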
+
+    bool GtPlusReconGadget::
+        addPrePostZeros(int centreNo, int sampleNo, int& PrePostZeros)
+    {
+        // 1 : pre zeros
+        // 2 : post zeros
+        // 0 : no zeros
+        PrePostZeros = 0;
+
+        if ( sampleNo <= 1 )
+            return true;
+
+        if ( 2*centreNo == sampleNo )
+        {
+            PrePostZeros = 0;
+        }
+
+        if ( 2*centreNo < sampleNo )
+        {
+            PrePostZeros = 1;
+        }
+
+        if ( 2*centreNo > sampleNo )
+        {
+            PrePostZeros = 2;
+        }
+
+        return true;
+    }
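+    // Example: centreNo = 40, sampleNo = 80 gives PrePostZeros = 0 (no zeros); centreNo = 24,
+    // sampleNo = 80 gives 1 (pre zeros); centreNo = 56, sampleNo = 80 gives 2 (post zeros).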
+
+    bool GtPlusReconGadget::
+        scalingImages(hoNDArray<ValueType>& res)
+    {
+        if ( scalingFactor_ < 0 && !use_constant_scalingFactor_ )
+        {
+            hoNDArray<float> mag(res.get_dimensions());
+            Gadgetron::abs(res, mag);
+            GADGET_CHECK_RETURN_FALSE(this->scalingMagnitude(mag));
+        }
+
+        scal((float)scalingFactor_, res);
+
+        return true;
+    }
+
+    bool GtPlusReconGadget::
+        scalingMagnitude(hoNDArray<float>& mag)
+    {
+        if ( scalingFactor_ < 0 && !use_constant_scalingFactor_ )
+        {
+            // perform the scaling to [0 max_inten_value_]
+            size_t ind;
+            float maxInten;
+
+            size_t RO = mag.get_size(0);
+            size_t E1 = mag.get_size(1);
+            size_t num = mag.get_number_of_elements()/(RO*E1);
+
+            if ( num <= 24 )
+            {
+                Gadgetron::maxAbsolute(mag, maxInten, ind);
+            }
+            else
+            {
+                hoNDArray<float> magPartial(RO, E1, 24, mag.get_data_ptr()+(num/2 - 12)*RO*E1);
+                Gadgetron::maxAbsolute(magPartial, maxInten, ind);
+            }
+            if ( maxInten < FLT_EPSILON ) maxInten = 1.0f;
+
+            if ( (maxInten<min_intensity_value_) || (maxInten>max_intensity_value_) )
+            {
+                GDEBUG_CONDITION_STREAM(verboseMode_, "Using the dynamic intensity scaling factor - may not have noise prewhitening performed ... ");
+                scalingFactor_ = (float)(max_intensity_value_US_)/maxInten;
+            }
+            else
+            {
+                GDEBUG_CONDITION_STREAM(verboseMode_, "Using the fixed intensity scaling factor - must have noise prewhitening performed ... ");
+                scalingFactor_ = SNR_NOISEFLOOR_SCALEFACTOR;
+
+                while ( (maxInten*scalingFactor_ > max_intensity_value_) && (scalingFactor_>=2) )
+                {
+                    scalingFactor_ /= 2;
+                }
+
+                if (maxInten*scalingFactor_ > max_intensity_value_)
+                {
+                    GDEBUG_CONDITION_STREAM(verboseMode_, "The fixed intensity scaling factor leads to dynamic range overflow - switch to dyanmic intensity scaling ... ");
+                    scalingFactor_ = (float)(max_intensity_value_)/maxInten;
+                }
+
+                use_constant_scalingFactor_ = true;
+            }
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "scalingFactor_ : " << scalingFactor_);
+            scal((float)scalingFactor_, mag);
+        }
+        else
+        {
+            GDEBUG_CONDITION_STREAM(verboseMode_, "Using the fixed intensity scaling factor - scaling factor has been preset to be : " << scalingFactor_ << " ... ");
+            scal((float)scalingFactor_, mag);
+        }
+
+        return true;
+    }
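+    // Example (hypothetical limits): with min_intensity_value_ = 64 and max_intensity_value_ = 4096,
+    // a measured maxInten of 12000 lies outside [64, 4096], so the dynamic branch is used and
+    // scalingFactor_ = max_intensity_value_US_ / 12000; a maxInten of 500 lies inside the range, so
+    // the fixed branch starts at SNR_NOISEFLOOR_SCALEFACTOR (8) and keeps it, since 500*8 <= 4096.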
+
+    bool GtPlusReconGadget::
+        generateKSpaceFilter(WorkOrderType& workOrder)
+    {
+        try
+        {
+            size_t RO = workOrder.data_.get_size(0);
+            size_t E1 = workOrder.data_.get_size(1);
+            size_t E2 = workOrder.data_.get_size(4);
+
+            size_t RO_ref = workOrder.ref_.get_size(0);
+            size_t E1_ref = workOrder.ref_.get_size(1);
+            size_t E2_ref = workOrder.ref_.get_size(4);
+
+            if ( workOrder.CalibMode_ == Gadgetron::ISMRMRD_interleaved )
+            {
+                RO_ref = RO;
+                E1_ref = E1;
+                E2_ref = E2;
+            }
+
+            // image data filter
+            if ( RO>1 && filterRO_type_ != ISMRMRD_FILTER_NONE )
+            {
+                workOrder.filterRO_.create(RO);
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilter(RO, workOrder.start_RO_, workOrder.end_RO_, workOrder.filterRO_, filterRO_type_, filterRO_sigma_, (size_t)std::ceil(filterRO_width_*RO)));
+                if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(workOrder.filterRO_, debugFolder_fullPath_+"filterRO"); }
+            }
+
+            if ( E1>1 && filterE1_type_ != ISMRMRD_FILTER_NONE )
+            {
+                workOrder.filterE1_.create(E1);
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilter(E1, workOrder.start_E1_, workOrder.end_E1_, workOrder.filterE1_, filterE1_type_, filterE1_sigma_, (size_t)std::ceil(filterE1_width_*E1)));
+                if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(workOrder.filterE1_, debugFolder_fullPath_+"filterE1"); }
+            }
+
+            if ( E2>1 && filterE2_type_ != ISMRMRD_FILTER_NONE )
+            {
+                workOrder.filterE2_.create(E2);
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilter(E2, workOrder.start_E2_, workOrder.end_E2_, workOrder.filterE2_, filterE2_type_, filterE2_sigma_, (size_t)std::ceil(filterE2_width_*E2)));
+                if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(workOrder.filterE2_, debugFolder_fullPath_+"filterE2"); }
+            }
+
+            // ref data filter
+            if ( workOrder.ref_.get_number_of_elements() > 0 )
+            {
+                size_t startRO(0), endRO(0), startE1(0), endE1(0), startE2(0), endE2(0);
+                if ( E2_ref == 1 )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_complex_.detectSampledRegion2D(workOrder.ref_, startRO, endRO, startE1, endE1));
+                }
+                else
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_complex_.detectSampledRegion3D(workOrder.ref_, startRO, endRO, startE1, endE1, startE2, endE2));
+                }
+
+                if ( (workOrder.CalibMode_ == ISMRMRD_interleaved) || (workOrder.CalibMode_ == ISMRMRD_embedded) )
+                {
+                    // use the image data sample range
+                    startRO = ( workOrder.start_RO_ < 0 ) ? 0 : (size_t)workOrder.start_RO_;
+                    endRO = ( workOrder.end_RO_ < 0 ) ? (RO_ref-1) : (size_t)workOrder.end_RO_;
+                }
+
+                if ( RO_ref > 1 && filterRO_ref_type_ != ISMRMRD_FILTER_NONE )
+                {
+                    workOrder.filterRO_ref_.create(RO_ref);
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(RO_ref, startRO, endRO, workOrder.filterRO_ref_, filterRO_ref_type_, filterRO_ref_sigma_, (size_t)std::ceil(filterRO_ref_width_*RO_ref)));
+                    if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(workOrder.filterRO_ref_, debugFolder_fullPath_+"filterRO_ref"); }
+                }
+
+                if ( (workOrder.CalibMode_ == ISMRMRD_separate) || (workOrder.CalibMode_ == ISMRMRD_external) )
+                {
+                    if ( E1_ref > 1 && filterE1_ref_type_ != ISMRMRD_FILTER_NONE )
+                    {
+                        size_t len = endE1-startE1+1;
+                        workOrder.filterE1_ref_.create(len);
+                        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilter(len, 0, len-1, workOrder.filterE1_ref_, filterE1_ref_type_, filterE1_ref_sigma_, (size_t)std::ceil(filterE1_ref_width_*len)));
+                        if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(workOrder.filterE1_ref_, debugFolder_fullPath_+"filterE1_ref"); }
+                    }
+
+                    if ( E2_ref > 1 && filterE2_ref_type_ != ISMRMRD_FILTER_NONE )
+                    {
+                        size_t len = endE2-startE2+1;
+                        workOrder.filterE2_ref_.create(len);
+                        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilter(len, 0, len-1, workOrder.filterE2_ref_, filterE2_ref_type_, filterE2_ref_sigma_, (size_t)std::ceil(filterE2_ref_width_*len)));
+                        if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(workOrder.filterE2_ref_, debugFolder_fullPath_+"filterE2_ref"); }
+                    }
+                }
+                else
+                {
+                    // this makes sure that, for interleaved and embedded modes, the kspace filter is applied at the correct lines
+                    if ( E1_ref > 1 && filterE1_ref_type_ != ISMRMRD_FILTER_NONE )
+                    {
+                        size_t len = E1_ref;
+                        workOrder.filterE1_ref_.create(len);
+                        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(len, startE1, endE1, workOrder.filterE1_ref_, filterE1_ref_type_, filterE1_ref_sigma_, (size_t)std::ceil(filterE1_ref_width_*len)));
+                        if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(workOrder.filterE1_ref_, debugFolder_fullPath_+"filterE1_ref"); }
+                    }
+
+                    if ( E2_ref > 1 && filterE2_ref_type_ != ISMRMRD_FILTER_NONE )
+                    {
+                        size_t len = E2_ref;
+                        workOrder.filterE2_ref_.create(len);
+                        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(len, startE2, endE2, workOrder.filterE2_ref_, filterE2_ref_type_, filterE2_ref_sigma_, (size_t)std::ceil(filterE2_ref_width_*len)));
+                        if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(workOrder.filterE2_ref_, debugFolder_fullPath_+"filterE2_ref"); }
+                    }
+                }
+            }
+
+            // partial fourier handling filter
+            if ( RO>1 && workOrder.start_RO_>=0 && workOrder.end_RO_>0 )
+            {
+                workOrder.filterRO_partialfourier_.create(RO);
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateAsymmetricFilter(RO, workOrder.start_RO_, workOrder.end_RO_, workOrder.filterRO_partialfourier_, filterRO_pf_type_, (size_t)std::ceil(filterRO_pf_width_*RO), filterRO_pf_densityComp_));
+                if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(workOrder.filterRO_partialfourier_, debugFolder_fullPath_+"filterRO_partialfourier"); }
+            }
+
+            if ( E1>1 && workOrder.start_E1_>=0 && workOrder.end_E1_>0 )
+            {
+                workOrder.filterE1_partialfourier_.create(E1);
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateAsymmetricFilter(E1, workOrder.start_E1_, workOrder.end_E1_, workOrder.filterE1_partialfourier_, filterE1_pf_type_, (size_t)std::ceil(filterE1_pf_width_*E1), filterE1_pf_densityComp_));
+                if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(workOrder.filterE1_partialfourier_, debugFolder_fullPath_+"filterE1_partialfourier"); }
+            }
+
+            if ( E2>1 && workOrder.start_E2_>=0 && workOrder.end_E2_>0 )
+            {
+                workOrder.filterE2_partialfourier_.create(E2);
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateAsymmetricFilter(E2, workOrder.start_E2_, workOrder.end_E2_, workOrder.filterE2_partialfourier_, filterE2_pf_type_, (size_t)std::ceil(filterE2_pf_width_*E2), filterE2_pf_densityComp_));
+                if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(workOrder.filterE2_partialfourier_, debugFolder_fullPath_+"filterE2_partialfourier"); }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in GtPlusReconGadget::generateKSpaceFilter(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
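+    // Note (illustrative, based on the calls above): the image filters are symmetric windows over the
+    // full RO/E1/E2 extent, the *_ref_ filters cover the detected calibration region, and the partial
+    // fourier filters are asymmetric windows built over the acquired range [start, end] only; e.g. with
+    // a hypothetical RO = 256, start_RO_ = 32 and end_RO_ = 255, filterRO_partialfourier_ has length 256
+    // but is shaped around samples 32..255.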
+
+    bool GtPlusReconGadget::
+        recomputeImageGeometry(GtPlusGadgetImageArray* images, GtPlusGadgetImageExt& imageHeader, size_t slc, size_t e2, size_t con, size_t phs, size_t rep, size_t set, size_t seg, size_t ave, size_t maxE2)
+    {
+        size_t E2 = images->matrix_size[4];
+
+        // need to recompute image geometry
+        // no need to consider RO and E1, because image position vector points to the image center
+
+        if ( e2 >= E2 ) e2 = E2/2;
+
+        size_t offsetCurr = images->get_offset(slc, e2, con, phs, rep, set, 0, ave);
+        imageHeader = images->imageArray_[offsetCurr];
+
+        // find the center partition
+        if ( E2 > 1 )
+        {
+            size_t midE2 = E2/2;
+            size_t offset = images->get_offset(slc, midE2, con, phs, rep, set, 0, ave);
+
+            while ( std::abs(imageHeader.slice_dir[0])<1e-6 && std::abs(imageHeader.slice_dir[1])<1e-6 && std::abs(imageHeader.slice_dir[2])<1e-6 )
+            {
+                imageHeader = images->imageArray_[offset];
+                midE2++;
+                offset = images->get_offset(slc, midE2, con, phs, rep, set, 0, ave);
+            }
+
+            // position vector for the center partition
+            float posVec[3];
+            posVec[0] = imageHeader.position[0];
+            posVec[1] = imageHeader.position[1];
+            posVec[2] = imageHeader.position[2];
+
+            // slice direction
+            float sliceVec[3];
+            sliceVec[0] = imageHeader.slice_dir[0];
+            sliceVec[1] = imageHeader.slice_dir[1];
+            sliceVec[2] = imageHeader.slice_dir[2];
+
+            midE2 = E2/2;
+
+            // compute slice position vector for this partition
+            float posVecCurr[3];
+            posVecCurr[0] = (float)(posVec[0] + aSpacing_[2]*sliceVec[0]*(e2-midE2+0.5f));
+            posVecCurr[1] = (float)(posVec[1] + aSpacing_[2]*sliceVec[1]*(e2-midE2+0.5f));
+            posVecCurr[2] = (float)(posVec[2] + aSpacing_[2]*sliceVec[2]*(e2-midE2+0.5f));
+
+            imageHeader.position[0] = posVecCurr[0];
+            imageHeader.position[1] = posVecCurr[1];
+            imageHeader.position[2] = posVecCurr[2];
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "--> image position : [" << imageHeader.position[0] << " , " << imageHeader.position[1] << " , " << imageHeader.position[2] << "]");
+
+            imageHeader.field_of_view[2] = (float)(aSpacing_[2]);
+
+            imageHeader.user_int[0] = (int32_t)e2;
+        }
+
+        if ( imageHeader.measurement_uid == 0 )
+        {
+            GWARN_STREAM("imageHeader.measurement_uid == 0");
+        }
+
+        return true;
+    }
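+    // Example: with aSpacing_[2] = 1.5 mm, E2 = 8 (midE2 = 4) and e2 = 6, the image position is
+    // offset from the centre-partition position by 1.5*(6-4+0.5) = 3.75 mm along the slice direction.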
+
+    bool GtPlusReconGadget::
+        sendOutRecon(GtPlusGadgetImageArray* images, const hoNDArray<ValueType>& res, int seriesNum, const std::vector<DimensionRecordType>& dimStartingIndexes, const std::string& prefix, const std::string& dataRole)
+    {
+        try
+        {
+            hoNDArray<real_value_type> timeStamp, physioTimeStamp;
+            GADGET_CHECK_RETURN_FALSE( this->sendOutRecon(images, res, timeStamp, physioTimeStamp, seriesNum, dimStartingIndexes, prefix, dataRole) );
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in GtPlusReconGadget::sendOutRecon(complex float) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    bool GtPlusReconGadget::
+        sendOutRecon(GtPlusGadgetImageArray* images, const hoNDArray<ValueType>& res, const hoNDArray<real_value_type>& timeStamp, const hoNDArray<real_value_type>& physioTimeStamp, 
+        int seriesNum, const std::vector<DimensionRecordType>& dimStartingIndexes, const std::string& prefix, const std::string& dataRole)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > dims = res.get_dimensions();
+            size_t RO =  (*dims)[0];
+            size_t E1 =  (*dims)[1];
+            size_t CHA = (*dims)[2];
+            size_t SLC = (*dims)[3];
+            size_t E2 =  (*dims)[4];
+            size_t CON = (*dims)[5];
+            size_t PHS = (*dims)[6];
+            size_t REP = (*dims)[7];
+            size_t SET = (*dims)[8];
+            size_t AVE = (*dims)[9];
+
+            GDEBUG_CONDITION_STREAM(true, "sending out images, acquisition boundary [RO E1 CHA SLC E2 CON PHS REP SET AVE] = [" 
+                << RO << " " << E1 << " " << CHA << " " 
+                << SLC << " " << E2 << " " << CON << " " 
+                << PHS << " " << REP << " " << SET << " " 
+                << AVE << "] " );
+
+            bool hasTimeStamp = false;
+            if ( timeStamp.get_number_of_elements()>0 
+                && timeStamp.get_size(9)==AVE 
+                && timeStamp.get_size(8)==SET 
+                && timeStamp.get_size(7)==REP 
+                && timeStamp.get_size(6)==PHS 
+                && timeStamp.get_size(5)==CON 
+                && timeStamp.get_size(4)==E2 
+                && timeStamp.get_size(3)==SLC )
+            {
+                hasTimeStamp = true;
+            }
+
+            bool hasPhysioTimeStamp = false;
+            if ( physioTimeStamp.get_number_of_elements()>0 
+                && physioTimeStamp.get_size(9)==AVE 
+                && physioTimeStamp.get_size(8)==SET 
+                && physioTimeStamp.get_size(7)==REP 
+                && physioTimeStamp.get_size(6)==PHS 
+                && physioTimeStamp.get_size(5)==CON 
+                && physioTimeStamp.get_size(4)==E2 
+                && physioTimeStamp.get_size(3)==SLC )
+            {
+                hasPhysioTimeStamp = true;
+            }
+
+            // info string for image, gfactor, snr map and std map
+            std::ostringstream ostr_image;
+            ostr_image << "x" << std::setprecision(4) << this->scalingFactor_;
+            std::string imageInfo = ostr_image.str();
+
+            std::ostringstream ostr_gfactor;
+            ostr_gfactor << "x" << this->scalingFactor_gfactor_;
+            std::string gfactorInfo = ostr_gfactor.str();
+
+            std::ostringstream ostr_wrap_around_map;
+            ostr_wrap_around_map << "x" << this->scalingFactor_wrap_around_map_;
+            std::string wrapAroundMapInfo = ostr_wrap_around_map.str();
+
+            std::ostringstream ostr_snr;
+            ostr_snr << "x" << this->scalingFactor_snr_image_;
+            std::string snrMapInfo = ostr_snr.str();
+
+            std::ostringstream ostr_std;
+            ostr_std << "x" << this->scalingFactor_std_map_;
+            std::string stdMapInfo = ostr_std.str();
+
+            // ------------------------------------------------------------- //
+
+            std::vector<size_t> ind(10, 0);
+
+            std::vector<size_t> dim2D(2);
+            dim2D[0] = RO;
+            dim2D[1] = E1;
+
+            size_t set(0), rep(0), phs(0), con(0), e2(0), slc(0), cha(0), seg(0), ave(0);
+            for ( ave=0; ave<AVE; ave++ )
+            {
+                for ( e2=0; e2<E2; e2++ )
+                {
+                    for ( slc=0; slc<SLC; slc++ )
+                    {
+                        for ( rep=0; rep<REP; rep++ )
+                        {
+                            for ( phs=0; phs<PHS; phs++ )
+                            {
+                                for ( set=0; set<SET; set++ )
+                                {
+                                    for ( con=0; con<CON; con++ )
+                                    {
+                                        GtPlusGadgetImageExt imageHeaderSent;
+
+                                        GADGET_CHECK_RETURN_FALSE(recomputeImageGeometry(images, imageHeaderSent, slc, e2, con, phs, rep, set, 0, ave, E2));
+
+                                        if ( imageHeaderSent.measurement_uid == 0 )
+                                        {
+                                            continue;
+                                        }
+
+                                        ind[0] = 0;
+                                        ind[1] = 0;
+                                        ind[2] = 0;
+                                        ind[3] = slc;
+                                        ind[4] = e2;
+                                        ind[5] = con;
+                                        ind[6] = phs;
+                                        ind[7] = rep;
+                                        ind[8] = set;
+                                        ind[9] = ave;
+
+                                        if ( hasTimeStamp )
+                                        {
+                                            if ( timeStamp(ind) > 0 )
+                                            {
+                                                imageHeaderSent.acquisition_time_stamp = (uint32_t)( (double)(timeStamp(ind)/timeStampResolution_) + 0.5 );
+                                                GDEBUG_CONDITION_STREAM(verboseMode_, "Set acquisition time stamp : " << imageHeaderSent.acquisition_time_stamp);
+                                            }
+                                        }
+
+                                        if ( hasPhysioTimeStamp )
+                                        {
+                                            if ( physioTimeStamp(ind) > 0 )
+                                            {
+                                                imageHeaderSent.physiology_time_stamp[0] = (uint32_t)( (double)(physioTimeStamp(ind)/timeStampResolution_) + 0.5 );
+                                                GDEBUG_CONDITION_STREAM(verboseMode_, "Set physio time stamp : " << imageHeaderSent.physiology_time_stamp[0]);
+                                            }
+                                        }
+
+                                        for ( cha=0; cha<CHA; cha++ )
+                                        {
+                                            ind[0] = 0;
+                                            ind[1] = 0;
+                                            ind[2] = cha;
+                                            ind[3] = slc;
+                                            ind[4] = e2;
+                                            ind[5] = con;
+                                            ind[6] = phs;
+                                            ind[7] = rep;
+                                            ind[8] = set;
+                                            ind[9] = ave;
+
+                                            hoNDArray<ValueType> currIm(dim2D, const_cast<ValueType*>(res.begin()+res.calculate_offset(ind)) );
+
+                                            Gadgetron::GadgetContainerMessage<ISMRMRD::ImageHeader>* cm1 = new Gadgetron::GadgetContainerMessage<ISMRMRD::ImageHeader>();
+
+                                            Gadgetron::GadgetContainerMessage<ISMRMRD::MetaContainer>* cm3 = new Gadgetron::GadgetContainerMessage<ISMRMRD::MetaContainer>();
+
+                                            *(cm1->getObjectPtr()) = imageHeaderSent;
+
+                                            cm1->getObjectPtr()->flags = 0;
+                                            cm1->getObjectPtr()->data_type = ISMRMRD::ISMRMRD_CXFLOAT;
+
+                                            // image number and image series
+                                            cm1->getObjectPtr()->image_index = (uint16_t)computeSeriesImageNumber ( *(cm1->getObjectPtr()), CHA, cha, E2, e2);
+                                            cm1->getObjectPtr()->image_series_index = seriesNum;
+                                            // GDEBUG_CONDITION_STREAM(verboseMode_, "image number " << cm1->getObjectPtr()->image_index << "    image series " << cm1->getObjectPtr()->image_series_index << " ... ");
+
+                                            // ----------------------------------------------------------
+                                            // set the image attributes
+                                            cm3->getObjectPtr()->set(GADGETRON_IMAGENUMBER, (long)cm1->getObjectPtr()->image_index);
+
+                                            cm3->getObjectPtr()->set(GADGETRON_CHA,        (long)cha);
+                                            cm3->getObjectPtr()->set(GADGETRON_SLC,        (long)cm1->getObjectPtr()->slice);
+                                            cm3->getObjectPtr()->set(GADGETRON_E2,         (long)e2);
+                                            cm3->getObjectPtr()->set(GADGETRON_CONTRAST,   (long)cm1->getObjectPtr()->contrast);
+                                            cm3->getObjectPtr()->set(GADGETRON_PHASE,      (long)cm1->getObjectPtr()->phase);
+                                            cm3->getObjectPtr()->set(GADGETRON_REP,        (long)cm1->getObjectPtr()->repetition);
+                                            cm3->getObjectPtr()->set(GADGETRON_SET,        (long)cm1->getObjectPtr()->set);
+                                            cm3->getObjectPtr()->set(GADGETRON_AVERAGE,    (long)cm1->getObjectPtr()->average);
+
+                                            cm3->getObjectPtr()->set(GADGETRON_IMAGEPROCESSINGHISTORY, "GT");
+
+                                            if ( dataRole == GADGETRON_IMAGE_REGULAR )
+                                            {
+                                                cm1->getObjectPtr()->image_type = ISMRMRD::ISMRMRD_IMTYPE_MAGNITUDE;
+
+                                                cm3->getObjectPtr()->set(GADGETRON_IMAGECOMMENT, "GT");
+                                                cm3->getObjectPtr()->append(GADGETRON_IMAGECOMMENT, imageInfo.c_str());
+
+                                                cm3->getObjectPtr()->append(GADGETRON_SEQUENCEDESCRIPTION, "_GT");
+                                                cm3->getObjectPtr()->set(GADGETRON_DATA_ROLE, GADGETRON_IMAGE_REGULAR);
+                                                cm3->getObjectPtr()->set(GADGETRON_IMAGE_SCALE_RATIO, (double)(this->scalingFactor_));
+                                            }
+                                            else if ( dataRole == GADGETRON_IMAGE_RETRO )
+                                            {
+                                                cm1->getObjectPtr()->image_type = ISMRMRD::ISMRMRD_IMTYPE_MAGNITUDE;
+
+                                                cm3->getObjectPtr()->set(GADGETRON_IMAGECOMMENT, "GT");
+                                                cm3->getObjectPtr()->append(GADGETRON_IMAGECOMMENT, "RETRO");
+                                                cm3->getObjectPtr()->append(GADGETRON_IMAGECOMMENT, imageInfo.c_str());
+
+                                                cm3->getObjectPtr()->set(GADGETRON_IMAGEPROCESSINGHISTORY, "RETRO");
+
+                                                cm3->getObjectPtr()->set(GADGETRON_SEQUENCEDESCRIPTION, "_GT_RETRO");
+                                                cm3->getObjectPtr()->set(GADGETRON_DATA_ROLE, GADGETRON_IMAGE_RETRO);
+                                                cm3->getObjectPtr()->set(GADGETRON_IMAGE_SCALE_RATIO, (double)(this->scalingFactor_));
+                                            }
+                                            else if ( dataRole == GADGETRON_IMAGE_PHASE )
+                                            {
+                                                cm1->getObjectPtr()->image_type = ISMRMRD::ISMRMRD_IMTYPE_PHASE;
+
+                                                cm3->getObjectPtr()->set(GADGETRON_IMAGECOMMENT, "PHS_GT");
+                                                cm3->getObjectPtr()->set(GADGETRON_SEQUENCEDESCRIPTION, "PHS_GT");
+                                                cm3->getObjectPtr()->set(GADGETRON_DATA_ROLE, GADGETRON_IMAGE_PHASE);
+                                                cm3->getObjectPtr()->set(GADGETRON_IMAGE_SCALE_RATIO, (double)(this->scalingFactor_));
+                                            }
+                                            else if ( dataRole == GADGETRON_IMAGE_GFACTOR )
+                                            {
+                                                cm1->getObjectPtr()->image_type = ISMRMRD::ISMRMRD_IMTYPE_MAGNITUDE;
+
+                                                std::string comment = gfactorInfo;
+                                                comment.append("_");
+                                                comment.append("gfactor_GT");
+
+                                                cm3->getObjectPtr()->set(GADGETRON_IMAGECOMMENT, comment.c_str());
+                                                cm3->getObjectPtr()->set(GADGETRON_SEQUENCEDESCRIPTION, "_gfactor_GT");
+                                                cm3->getObjectPtr()->set(GADGETRON_DATA_ROLE, GADGETRON_IMAGE_GFACTOR);
+                                                cm3->getObjectPtr()->set(GADGETRON_IMAGE_SCALE_RATIO, (double)(this->scalingFactor_gfactor_));
+                                            }
+                                            else if ( dataRole == GADGETRON_IMAGE_WRAPAROUNDMAP )
+                                            {
+                                                cm1->getObjectPtr()->image_type = ISMRMRD::ISMRMRD_IMTYPE_MAGNITUDE;
+
+                                                std::string comment = wrapAroundMapInfo;
+                                                comment.append("_");
+                                                comment.append("WrapAround_Map_GT");
+
+                                                cm3->getObjectPtr()->set(GADGETRON_IMAGECOMMENT, comment.c_str());
+                                                cm3->getObjectPtr()->set(GADGETRON_SEQUENCEDESCRIPTION, "_WrapAround_Map_GT");
+                                                cm3->getObjectPtr()->set(GADGETRON_DATA_ROLE, GADGETRON_IMAGE_WRAPAROUNDMAP);
+                                                cm3->getObjectPtr()->set(GADGETRON_IMAGE_SCALE_RATIO, (float)(this->scalingFactor_wrap_around_map_));
+                                            }
+                                            else if ( dataRole == GADGETRON_IMAGE_SNR_MAP )
+                                            {
+                                                cm1->getObjectPtr()->image_type = ISMRMRD::ISMRMRD_IMTYPE_MAGNITUDE;
+
+                                                std::string comment = snrMapInfo;
+                                                comment.append("_");
+                                                comment.append("SNR_Map_GT");
+
+                                                cm3->getObjectPtr()->set(GADGETRON_IMAGECOMMENT, comment.c_str());
+                                                cm3->getObjectPtr()->set(GADGETRON_SEQUENCEDESCRIPTION, "_SNR_Map_GT");
+                                                cm3->getObjectPtr()->set(GADGETRON_DATA_ROLE, GADGETRON_IMAGE_SNR_MAP);
+                                                cm3->getObjectPtr()->set(GADGETRON_IMAGE_SCALE_RATIO, (double)(this->scalingFactor_snr_image_));
+                                            }
+                                            else if ( dataRole == GADGETRON_IMAGE_STD_MAP )
+                                            {
+                                                cm1->getObjectPtr()->image_type = ISMRMRD::ISMRMRD_IMTYPE_MAGNITUDE;
+
+                                                std::string comment = stdMapInfo;
+                                                comment.append("_");
+                                                comment.append("Std_Map_GT");
+
+                                                cm3->getObjectPtr()->set(GADGETRON_IMAGECOMMENT, comment.c_str());
+                                                cm3->getObjectPtr()->set(GADGETRON_SEQUENCEDESCRIPTION, "_Std_Map_GT");
+                                                cm3->getObjectPtr()->set(GADGETRON_DATA_ROLE, GADGETRON_IMAGE_STD_MAP);
+                                                cm3->getObjectPtr()->set(GADGETRON_IMAGE_SCALE_RATIO, (double)(this->scalingFactor_std_map_));
+
+                                                cm3->getObjectPtr()->set(GADGETRON_IMAGE_WINDOWCENTER, (long)(this->scalingFactor_std_map_));
+                                                cm3->getObjectPtr()->set(GADGETRON_IMAGE_WINDOWWIDTH, (long)(2*this->scalingFactor_std_map_));
+                                            }
+                                            else if ( dataRole == GADGETRON_IMAGE_OTHER )
+                                            {
+                                                cm1->getObjectPtr()->image_type = ISMRMRD::ISMRMRD_IMTYPE_MAGNITUDE;
+
+                                                cm3->getObjectPtr()->set(GADGETRON_IMAGECOMMENT, "GT");
+                                                cm3->getObjectPtr()->set(GADGETRON_SEQUENCEDESCRIPTION, "_GT");
+                                                cm3->getObjectPtr()->set(GADGETRON_DATA_ROLE, GADGETRON_IMAGE_OTHER);
+                                                cm3->getObjectPtr()->set(GADGETRON_IMAGE_SCALE_RATIO, (double)(this->scalingFactor_));
+                                            }
+
+                                            // ----------------------------------------------------------
+
+                                            // set the time stamp
+                                            // the time stamp of the first readout line in this 2D kspace is used
+
+                                            Gadgetron::GadgetContainerMessage< Gadgetron::hoNDArray<ValueType> >* cm2 = new Gadgetron::GadgetContainerMessage< Gadgetron::hoNDArray<ValueType> >();
+                                            cm1->cont(cm2);
+                                            cm2->cont(cm3);
+
+                                            std::vector<size_t> img_dims(2);
+                                            img_dims[0] = RO;
+                                            img_dims[1] = E1;
+
+                                            //Fixing array dimensions (MSH)
+                                            cm1->getObjectPtr()->matrix_size[0] = (uint16_t)RO;
+                                            cm1->getObjectPtr()->matrix_size[1] = (uint16_t)E1;
+                                            cm1->getObjectPtr()->matrix_size[2] = 1;
+                                            cm1->getObjectPtr()->channels = 1;
+
+                                            try
+                                            {
+                                                cm2->getObjectPtr()->create(&img_dims);
+                                                Gadgetron::clear(cm2->getObjectPtr());
+                                            }
+                                            catch(...)
+                                            {
+                                                GDEBUG("Unable to allocate new image\n");
+                                                cm1->release();
+                                                return false;
+                                            }
+
+                                            memcpy(cm2->getObjectPtr()->begin(), currIm.begin(), sizeof(ValueType)*RO*E1);
+
+                                            if ( !debugFolder2_fullPath_.empty() )
+                                            {
+                                                std::ostringstream ostr;
+                                                ostr << prefix << "_" << cm1->getObjectPtr()->image_index;
+                                                if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(*cm2->getObjectPtr(), debugFolder2_fullPath_+ostr.str()); }
+                                            }
+
+                                            GDEBUG_CONDITION_STREAM(verboseMode_, "sending out " << dataRole << " image [CHA SLC E2 CON PHS REP SET AVE] = [" 
+                                                << cha << " " 
+                                                << cm1->getObjectPtr()->slice << " " 
+                                                << e2 << " " 
+                                                << cm1->getObjectPtr()->contrast << " " 
+                                                << cm1->getObjectPtr()->phase << " " 
+                                                << cm1->getObjectPtr()->repetition << " " 
+                                                << cm1->getObjectPtr()->set << " " 
+                                                << cm1->getObjectPtr()->average << " " << "] " 
+                                                << " -- Image number -- " << cm1->getObjectPtr()->image_index);
+
+                                            // send out the images
+                                            if (this->next()->putq(cm1) < 0) 
+                                            {
+                                                GERROR_STREAM("Put image to Q failed ... ");
+                                                return false;
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in GtPlusReconGadget::sendOutRecon(complex float, time stamp) ... ");
+            return false;
+        }
+
+        return true;
+    }
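+    // Note: the loops above emit one image message per [cha, con, set, phs, rep, slc, e2, ave]
+    // combination, ave being the outermost and cha the innermost loop; each message is an
+    // ImageHeader -> hoNDArray -> MetaContainer chain (cm1 -> cm2 -> cm3).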
+
+    bool GtPlusReconGadget::sendOutRecon2D(GtPlusGadgetImageArray* images, const hoNDArray<ValueType>& res, int seriesNum, int imageNum)
+    {
+        try
+        {
+            // extract the magnitude
+            hoNDArray<float> mag(res.get_dimensions());
+            Gadgetron::abs(res, mag);
+            GADGET_CHECK_RETURN_FALSE(scalingMagnitude(mag));
+            GADGET_CHECK_RETURN_FALSE(sendOutRecon2D(images, mag, seriesNum, imageNum));
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Exceptions happened in GtPlusReconGadget::sendOutRecon2D(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    bool GtPlusReconGadget::sendOutRecon2D(GtPlusGadgetImageArray* images, const hoNDArray<float>& res, int seriesNum, int imageNum)
+    {
+        try
+        {
+            Gadgetron::GadgetContainerMessage<ISMRMRD::ImageHeader>* cm1 = new Gadgetron::GadgetContainerMessage<ISMRMRD::ImageHeader>();
+            Gadgetron::GadgetContainerMessage<ISMRMRD::MetaContainer>* cm3 = new Gadgetron::GadgetContainerMessage<ISMRMRD::MetaContainer>();
+
+            *(cm1->getObjectPtr()) = images->imageArray_[0];
+
+            cm1->getObjectPtr()->flags = 0;
+            cm1->getObjectPtr()->data_type = ISMRMRD::ISMRMRD_FLOAT;
+            cm1->getObjectPtr()->image_type = ISMRMRD::ISMRMRD_IMTYPE_MAGNITUDE;
+
+            // image number and image series
+            cm1->getObjectPtr()->image_index = imageNum;
+            cm1->getObjectPtr()->image_series_index = seriesNum;
+
+            Gadgetron::GadgetContainerMessage< Gadgetron::hoNDArray<float> >* cm2 = new Gadgetron::GadgetContainerMessage< Gadgetron::hoNDArray<float> >();
+            cm1->cont(cm2);
+            cm2->cont(cm3);
+
+            std::vector<size_t> img_dims(2);
+            img_dims[0] = res.get_size(0);
+            img_dims[1] = res.get_size(1);
+
+            // set the image attributes
+            cm3->getObjectPtr()->set(GADGETRON_IMAGECOMMENT, "GT");
+            cm3->getObjectPtr()->set(GADGETRON_SEQUENCEDESCRIPTION, "_GT");
+            cm3->getObjectPtr()->set(GADGETRON_IMAGEPROCESSINGHISTORY, "GT");
+            cm3->getObjectPtr()->set(GADGETRON_DATA_ROLE, GADGETRON_IMAGE_REGULAR);
+
+            cm3->getObjectPtr()->set(GADGETRON_CHA,        (long)0);
+            cm3->getObjectPtr()->set(GADGETRON_SLC,        (long)cm1->getObjectPtr()->slice);
+            cm3->getObjectPtr()->set(GADGETRON_E2,         (long)0);
+            cm3->getObjectPtr()->set(GADGETRON_CONTRAST,   (long)cm1->getObjectPtr()->contrast);
+            cm3->getObjectPtr()->set(GADGETRON_PHASE,      (long)cm1->getObjectPtr()->phase);
+            cm3->getObjectPtr()->set(GADGETRON_REP,        (long)cm1->getObjectPtr()->repetition);
+            cm3->getObjectPtr()->set(GADGETRON_SET,        (long)cm1->getObjectPtr()->set);
+            cm3->getObjectPtr()->set(GADGETRON_AVERAGE,    (long)cm1->getObjectPtr()->average);
+
+            cm3->getObjectPtr()->set(GADGETRON_IMAGE_SCALE_RATIO, (double)(this->scalingFactor_));
+
+            //Fixing array dimensions (MSH)
+            cm1->getObjectPtr()->matrix_size[0] = (uint16_t)res.get_size(0);
+            cm1->getObjectPtr()->matrix_size[1] = (uint16_t)res.get_size(1);
+            cm1->getObjectPtr()->matrix_size[2] = 1;
+            cm1->getObjectPtr()->channels = 1;
+
+            try
+            {
+                cm2->getObjectPtr()->create(&img_dims);
+            }
+            catch(...)
+            {
+                GDEBUG("Unable to allocate new image\n");
+                cm1->release();
+                return false;
+            }
+
+            memcpy(cm2->getObjectPtr()->begin(), res.begin(), sizeof(float)*res.get_size(0)*res.get_size(1));
+
+            if ( !debugFolder2_fullPath_.empty() )
+            {
+                std::ostringstream ostr;
+                ostr << "SentImage2D" << "_" << cm1->getObjectPtr()->image_index;
+                if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArray(*cm2->getObjectPtr(), debugFolder2_fullPath_+ostr.str()); }
+            }
+
+            // send out the images
+            if (this->next()->putq(cm1) < 0) 
+            {
+                return false;
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in GtPlusReconGadget::sendOutRecon2D(float) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    bool GtPlusReconGadget::computeSNRImage(const hoNDArray<ValueType>& res, const hoNDArray<ValueType>& gfactor, unsigned int startInd, bool withAcceleration, hoNDArray<ValueType>& snrImage, hoNDArray<ValueType>& stdMap)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > dims = res.get_dimensions();
+            size_t RO = (*dims)[0];
+            size_t E1 = (*dims)[1];
+            size_t CHA = (*dims)[2];
+            size_t SLC = (*dims)[3];
+            size_t E2 = (*dims)[4];
+            size_t CON = (*dims)[5];
+            size_t PHS = (*dims)[6];
+            size_t REP = (*dims)[7];
+            size_t SET = (*dims)[8];
+            size_t AVE = (*dims)[9];
+
+            snrImage = gfactor;
+
+            if ( withAcceleration )
+            {
+                Gadgetron::addEpsilon(snrImage);
+                Gadgetron::divide(res, snrImage, snrImage);
+            }
+            else
+            {
+                snrImage = res;
+            }
+
+            if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(snrImage, debugFolder2_fullPath_+"snrImage"); }
+
+            std::vector<size_t> dimStdMap(*dims);
+
+            std::vector<size_t> ind(10, 0);
+            size_t set(0), rep(0), phs(0), con(0), e2(0), slc(0), cha(0), seg(0), ave(0);
+
+            if ( REP > startInd+2 )
+            {
+                dimStdMap[7] = 1;
+                stdMap.create(dimStdMap);
+                Gadgetron::clear(stdMap);
+
+                size_t numOfIm = REP - startInd;
+
+                hoNDArray<ValueType> repBuf(RO, E1, numOfIm);
+                hoNDArray<real_value_type> repBufMag(RO, E1, numOfIm);
+                hoNDArray<real_value_type> stdMap2D(RO, E1);
+
+                for ( ave=0; ave<AVE; ave++ )
+                {
+                    for ( set=0; set<SET; set++ )
+                    {
+                        for ( phs=0; phs<PHS; phs++ )
+                        {
+                            for ( con=0; con<CON; con++ )
+                            {
+                                for ( e2=0; e2<E2; e2++ )
+                                {
+                                    for ( slc=0; slc<SLC; slc++ )
+                                    {
+                                        for ( cha=0; cha<CHA; cha++ )
+                                        {
+                                            Gadgetron::clear(repBuf);
+
+                                            for ( rep=startInd; rep<REP; rep++ )
+                                            {
+                                                ind[2] = cha;
+                                                ind[3] = slc;
+                                                ind[4] = e2;
+                                                ind[5] = con;
+                                                ind[6] = phs;
+                                                ind[7] = rep;
+                                                ind[8] = set;
+                                                ind[9] = ave;
+
+                                                size_t offset = snrImage.calculate_offset(ind);
+
+                                                memcpy(repBuf.begin()+(rep-startInd)*RO*E1, 
+                                                    snrImage.begin()+offset, sizeof(ValueType)*RO*E1);
+                                            }
+
+                                            if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(repBuf, debugFolder2_fullPath_+"repBuf"); }
+
+                                            Gadgetron::abs(repBuf, repBufMag);
+                                            if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArray(repBufMag, debugFolder2_fullPath_+"repBufMag"); }
+
+                                            // compute std
+                                            GADGET_CHECK_RETURN_FALSE(Gadgetron::stdOver3rdDimension(repBufMag, stdMap2D, true));
+                                            if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArray(stdMap2D, debugFolder2_fullPath_+"stdMap2D"); }
+
+                                            // copy it to the std map
+                                            ind[2] = cha;
+                                            ind[3] = slc;
+                                            ind[4] = e2;
+                                            ind[5] = con;
+                                            ind[6] = phs;
+                                            ind[7] = 0;
+                                            ind[8] = set;
+                                            ind[9] = ave;
+
+                                            size_t offset = stdMap.calculate_offset(ind);
+                                            hoNDArray<ValueType> stdMapCurr(RO, E1, stdMap.begin()+offset, false);
+                                            stdMapCurr.copyFrom(stdMap2D);
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            else if ( PHS > startInd+2 )
+            {
+                dimStdMap[6] = 1;
+                stdMap.create(dimStdMap);
+                Gadgetron::clear(stdMap);
+
+                size_t numOfIm = PHS - startInd;
+
+                hoNDArray<ValueType> phsBuf(RO, E1, numOfIm);
+                hoNDArray<real_value_type> phsBufMag(RO, E1, numOfIm);
+                hoNDArray<real_value_type> stdMap2D(RO, E1);
+
+                for ( ave=0; ave<AVE; ave++ )
+                {
+                    for ( set=0; set<SET; set++ )
+                    {
+                        for ( rep=0; rep<REP; rep++ )
+                        {
+                            for ( con=0; con<CON; con++ )
+                            {
+                                for ( e2=0; e2<E2; e2++ )
+                                {
+                                    for ( slc=0; slc<SLC; slc++ )
+                                    {
+                                        for ( cha=0; cha<CHA; cha++ )
+                                        {
+                                            Gadgetron::clear(phsBuf);
+
+                                            for ( phs=startInd; phs<PHS; phs++ )
+                                            {
+                                                ind[2] = cha;
+                                                ind[3] = slc;
+                                                ind[4] = e2;
+                                                ind[5] = con;
+                                                ind[6] = phs;
+                                                ind[7] = rep;
+                                                ind[8] = set;
+                                                ind[9] = ave;
+
+                                                size_t offset = snrImage.calculate_offset(ind);
+
+                                                memcpy(phsBuf.begin()+(phs-startInd)*RO*E1, 
+                                                    snrImage.begin()+offset, sizeof(ValueType)*RO*E1);
+                                            }
+
+                                            if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(phsBuf, debugFolder2_fullPath_+"phsBuf"); }
+
+                                            Gadgetron::abs(phsBuf, phsBufMag);
+                                            if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArray(phsBufMag, debugFolder2_fullPath_+"phsBufMag"); }
+
+                                            // compute std
+                                            GADGET_CHECK_RETURN_FALSE(Gadgetron::stdOver3rdDimension(phsBufMag, stdMap2D, true));
+                                            if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArray(stdMap2D, debugFolder2_fullPath_+"stdMap2D"); }
+
+                                            // copy it to the std map
+                                            ind[2] = cha;
+                                            ind[3] = slc;
+                                            ind[4] = e2;
+                                            ind[5] = con;
+                                            ind[6] = 0;
+                                            ind[7] = rep;
+                                            ind[8] = set;
+                                            ind[9] = ave;
+
+                                            size_t offset = stdMap.calculate_offset(ind);
+                                            hoNDArray<ValueType> stdMapCurr(RO, E1, stdMap.begin()+offset, false);
+                                            stdMapCurr.copyFrom(stdMap2D);
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in GtPlusReconGadget::computeSNRImage(res, gfactor, snrImage, stdmap) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    int GtPlusReconGadget::close(unsigned long flags)
+    {
+        GDEBUG_CONDITION_STREAM(true, "GtPlusReconGadget - close(flags) : " << flags);
+
+        if ( BaseClass::close(flags) != GADGET_OK ) return GADGET_FAIL;
+
+        if ( flags != 0 )
+        {
+            std::string procTime;
+            gtPlus_util_.getCurrentMoment(procTime);
+
+            GDEBUG_STREAM("* ============================================================================== *");
+            GDEBUG_STREAM("---> MR recon phase, Currnt processing time : " << procTime << " <---");
+            GDEBUG_STREAM("* ============================================================================== *");
+        }
+
+        return GADGET_OK;
+    }
+
+    GADGET_FACTORY_DECLARE(GtPlusReconGadget)
+
+}
diff --git a/gadgets/gtPlus/GtPlusReconGadget.h b/gadgets/gtPlus/GtPlusReconGadget.h
new file mode 100644
index 0000000..15ec549
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusReconGadget.h
@@ -0,0 +1,305 @@
+/** \file   GtPlusReconGadget.h
+    \brief  This is the base class gadget for both 2DT and 3DT reconstruction.
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include <complex>
+#include "GtPlusGadgetExport.h"
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "ismrmrd/ismrmrd.h"
+#include "ismrmrd/xml.h"
+#include "ismrmrd/meta.h"
+#include "GadgetronTimer.h"
+
+#include "hoNDArray_utils.h"
+
+#include "GtPlusGadgetImageArray.h"
+
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorkOrder.h"
+
+#include "GadgetStreamController.h"
+
+#include "GtPlusReconGadgetUtil.h"
+
+#ifdef USE_OMP
+    #include "omp.h"
+#endif // USE_OMP
+
+#define SNR_NOISEFLOOR_SCALEFACTOR 8
+
+namespace Gadgetron
+{
+
+class EXPORTGTPLUSGADGET GtPlusReconGadget : public Gadgetron::Gadget2< GtPlusGadgetImageArray, Gadgetron::gtPlus::gtPlusReconWorkOrder<std::complex<float> > >
+{
+public:
+    GADGET_DECLARE(GtPlusReconGadget);
+
+    typedef float real_value_type;
+    typedef std::complex<real_value_type> ValueType;
+
+    typedef Gadgetron::gtPlus::gtPlusReconWorkOrder<ValueType> WorkOrderType;
+
+    typedef Gadget2< GtPlusGadgetImageArray, WorkOrderType > BaseClass;
+
+    typedef std::pair<Gadgetron::ISMRMRDDIM, size_t> DimensionRecordType;
+
+    typedef Gadgetron::gtPlus::gtPlusReconWorkOrder<ValueType>::CloudNodeType CloudNodeType;
+    typedef std::vector<CloudNodeType> CloudType;
+
+    GtPlusReconGadget();
+    ~GtPlusReconGadget();
+
+    // image series number
+    int image_series_;
+
+    // the min/max dynamic range of magnitude images
+    size_t min_intensity_value_;
+    size_t max_intensity_value_;
+
+    // maximal intensity value when converted to unsigned short
+    size_t max_intensity_value_US_;
+
+    // scaling factor for recon results
+    double scalingFactor_;
+
+    // scaling factor for gfactor images
+    double scalingFactor_gfactor_;
+
+    // scaling factor for wrap around map
+    double scalingFactor_wrap_around_map_;
+
+    // scaling factor for snr images
+    double scalingFactor_snr_image_;
+
+    // scaling factor for std map
+    double scalingFactor_std_map_;
+
+    // start frame to compute std map, to avoid transitional signal
+    unsigned int start_frame_for_std_map_;
+
+    // whether to use the fixed intensity scaling factor
+    bool use_constant_scalingFactor_;
+
+    // time stamp resolution (default, 2.5ms)
+    float timeStampResolution_;
+
+    // pixel spacing when exporting the images
+    double aSpacing_[6];
+
+    // field of view in mm
+    double FOV_RO_;
+    double FOV_E1_;
+    double FOV_E2_;
+
+    // debug folder
+    std::string debugFolder_;
+    std::string debugFolder_fullPath_;
+
+    // debug folder 2
+    std::string debugFolder2_;
+    std::string debugFolder2_fullPath_;
+
+    // whether to perform timing
+    bool performTiming_;
+
+    // whether to recon kspace
+    bool recon_kspace_needed_;
+
+    // whether the second set of recon results is required
+    bool recon_res_second_required_;
+
+    // whether to send out recon results
+    bool send_out_recon_;
+    bool send_out_recon_second_;
+
+    // parameters for gt-plus recon
+    Gadgetron::gtPlus::gtPlusReconWorkOrderPara workOrderPara_;
+
+    // --------------------------------------------------
+    // utility functions
+    // --------------------------------------------------
+
+    // compute image number following the ICE convention
+    size_t computeSeriesImageNumber (ISMRMRD::ImageHeader& imheader, size_t nCHA=1, size_t cha=0, size_t nE2=1, size_t e2=0);
+
+    // to handle partial Fourier, add pre or post zeros
+    // PrePostZeros: 0 no zeros; 1 pre zeros; 2 post zeros
+    bool addPrePostZeros(int centreNo, int sampleNo, int& PrePostZeros);
+
+    // find the dimension index
+    bool findStartingDimIndex(const std::vector<DimensionRecordType>& dimStartingIndexes, Gadgetron::ISMRMRDDIM& dim, size_t ind);
+
+    // compute SNR image and std map
+    bool computeSNRImage(const hoNDArray<ValueType>& res, const hoNDArray<ValueType>& gfactor, unsigned int startInd, bool withAcceleration, hoNDArray<ValueType>& snrImage, hoNDArray<ValueType>& stdMap);
+
+    // scale the recon images
+    bool scalingImages(hoNDArray<ValueType>& res);
+
+    // scale the magnitude images
+    bool scalingMagnitude(hoNDArray<float>& mag);
+
+    // recompute the image geometry parameters if the recon FOV is different from encoding FOV
+    bool recomputeImageGeometry(GtPlusGadgetImageArray* images, GtPlusGadgetImageExt& imageHeader, size_t slc, size_t e2, size_t con, size_t phs, size_t rep, size_t set, size_t seg, size_t ave, size_t maxE2);
+
+    // get the acquisition and PMU time stamps
+    bool getTimeStamp(GtPlusGadgetImageArray* images, WorkOrderType& workOrder, hoNDArray<real_value_type>& timeStamp,  hoNDArray<real_value_type>& pmuTimeStamp);
+
+    // send out the recon results
+    virtual bool sendOutRecon(GtPlusGadgetImageArray* images, const hoNDArray<ValueType>& res, int seriesNum, const std::vector<DimensionRecordType>& dimStartingIndexes, const std::string& prefix, const std::string& dataRole);
+    virtual bool sendOutRecon(GtPlusGadgetImageArray* images, const hoNDArray<ValueType>& res, const hoNDArray<real_value_type>& timeStamp, const hoNDArray<real_value_type>& physioTimeStamp, int seriesNum, const std::vector<DimensionRecordType>& dimStartingIndexes, const std::string& prefix, const std::string& dataRole);
+
+    // special sending function for the interactive cases
+    virtual bool sendOutRecon2D(GtPlusGadgetImageArray* images, const hoNDArray<float>& res, int seriesNum, int imageNum);
+    virtual bool sendOutRecon2D(GtPlusGadgetImageArray* images, const hoNDArray<ValueType>& res, int seriesNum, int imageNum);
+
+    // compute the kspace filter
+    bool generateKSpaceFilter(WorkOrderType& workOrder);
+    //void GDEBUG_CONDITION_STREAM(bool verboseMode_, const char* arg2);
+
+protected:
+
+    // --------------------------------------------------
+    // functional functions
+    // --------------------------------------------------
+
+    // default interface function
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(Gadgetron::GadgetContainerMessage< GtPlusGadgetImageArray >* m1, Gadgetron::GadgetContainerMessage< WorkOrderType > * m2);
+
+    // read in parameters
+    virtual bool readParameters();
+
+    // parse the cloud file if any
+    virtual bool parseGTCloudNodeFile(const std::string& filename, CloudType& gtCloud);
+
+    // close call
+    int close(unsigned long flags);
+
+public:
+
+    // --------------------------------------------------
+    // variables used for data buffer and processing
+    // --------------------------------------------------
+
+    // dimension of incoming array
+    std::vector<size_t> dimensions_;
+
+    // number of acquisition channels
+    size_t num_acq_channels_;
+
+    // encoding matrix size (the real sampled size)
+    size_t matrix_size_encoding_[3];
+
+    // encoding field of view [mm]
+    float field_of_view_encoding_[3];
+
+    // recon matrix size (the final image size)
+    size_t matrix_size_recon_[3];
+
+    // recon field of view [mm]
+    float field_of_view_recon_[3];
+
+    // number of E1/E2 after zero-filling resize
+    size_t reconE1_;
+    size_t reconE2_;
+
+    // acceleration factor
+    double acceFactorE1_;
+    double acceFactorE2_;
+
+    // calibration mode
+    Gadgetron::ISMRMRDCALIBMODE CalibMode_;
+    Gadgetron::ISMRMRDDIM InterleaveDim_;
+
+    // acquired max indexes
+    size_t kSpaceMaxAcqE1No_;
+    size_t kSpaceMaxAcqE2No_;
+
+    // number of times the process function is called
+    unsigned int processed_called_times_;
+
+    // kspace filter for RO/E1/E2
+    // used for partial Fourier, zero-padding resize or asymmetric echo
+    // if no kspace filter is selected, the default filter will be used
+
+    // kspace filter
+    Gadgetron::ISMRMRDKSPACEFILTER filterRO_type_;
+    double filterRO_sigma_;
+    double filterRO_width_;
+
+    Gadgetron::ISMRMRDKSPACEFILTER filterE1_type_;
+    double filterE1_sigma_;
+    double filterE1_width_;
+
+    Gadgetron::ISMRMRDKSPACEFILTER filterE2_type_;
+    double filterE2_sigma_;
+    double filterE2_width_;
+
+    // ref data filter
+    Gadgetron::ISMRMRDKSPACEFILTER filterRO_ref_type_;
+    double filterRO_ref_sigma_;
+    double filterRO_ref_width_;
+
+    Gadgetron::ISMRMRDKSPACEFILTER filterE1_ref_type_;
+    double filterE1_ref_sigma_;
+    double filterE1_ref_width_;
+
+    Gadgetron::ISMRMRDKSPACEFILTER filterE2_ref_type_;
+    double filterE2_ref_sigma_;
+    double filterE2_ref_width_;
+
+    // partial Fourier filter
+    Gadgetron::ISMRMRDKSPACEFILTER filterRO_pf_type_;
+    double filterRO_pf_sigma_;
+    double filterRO_pf_width_;
+    bool filterRO_pf_densityComp_;
+
+    Gadgetron::ISMRMRDKSPACEFILTER filterE1_pf_type_;
+    double filterE1_pf_sigma_;
+    double filterE1_pf_width_;
+    bool filterE1_pf_densityComp_;
+
+    Gadgetron::ISMRMRDKSPACEFILTER filterE2_pf_type_;
+    double filterE2_pf_sigma_;
+    double filterE2_pf_width_;
+    bool filterE2_pf_densityComp_;
+
+    /// cloud related definition
+    bool CloudComputing_;
+    unsigned int CloudSize_;
+
+    CloudType gt_cloud_;
+
+    // cloud node file
+    std::string cloud_node_file_;
+
+    // encoding space size
+    ISMRMRD::EncodingCounters meas_max_idx_;
+
+    // define the maximal number of threads used
+    // number_of_used_threads = thread_number_ratio_ * max_available_threads_number
+    // 0 means all threads are used
+    float thread_number_ratio_;
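
A minimal sketch of how the ratio above could translate into an OpenMP thread count; compute_used_threads is a hypothetical helper, not part of this file, and OpenMP availability is assumed:

    #include <omp.h>
    #include <cmath>

    // number_of_used_threads = thread_number_ratio_ * max_available_threads_number;
    // 0 (or any non-positive ratio) means all available threads are used
    static int compute_used_threads(float ratio)
    {
        int max_threads = omp_get_max_threads();
        if ( ratio <= 0.0f || ratio >= 1.0f ) return max_threads;
        int n = (int)std::floor(ratio * max_threads);
        return (n < 1) ? 1 : n;
    }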
+
+    Gadgetron::gtPlus::gtPlusISMRMRDReconUtil<ValueType> gtPlus_util_;
+    Gadgetron::gtPlus::gtPlusISMRMRDReconUtilComplex<ValueType> gtPlus_util_complex_;
+
+    // clock for timing
+    Gadgetron::GadgetronTimer gt_timer1_;
+    Gadgetron::GadgetronTimer gt_timer2_;
+    Gadgetron::GadgetronTimer gt_timer3_;
+
+    // exporter
+    Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+    // in verbose mode, more info is printed out
+    bool verboseMode_;
+};
+
+}
diff --git a/gadgets/gtPlus/GtPlusReconGadgetUtil.cpp b/gadgets/gtPlus/GtPlusReconGadgetUtil.cpp
new file mode 100644
index 0000000..6ced01e
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusReconGadgetUtil.cpp
@@ -0,0 +1,711 @@
+
+#include "GtPlusReconGadgetUtil.h"
+
+#include <boost/filesystem.hpp>
+using namespace boost::filesystem;
+
+namespace Gadgetron
+{
+
+    bool findCalibMode(ISMRMRD::IsmrmrdHeader& h, Gadgetron::ISMRMRDCALIBMODE& CalibMode, ISMRMRDDIM& InterleaveDim, double& acceFactorE1, double& acceFactorE2, bool verbose)
+    {
+        try
+        {
+            if (!h.encoding[0].parallelImaging)
+            {
+                GERROR_STREAM("Parallel Imaging section not found in header");
+                return false;
+            }
+
+            ISMRMRD::ParallelImaging p_imaging = *h.encoding[0].parallelImaging;
+
+            acceFactorE1 = (double)(p_imaging.accelerationFactor.kspace_encoding_step_1);
+            acceFactorE2 = (double)(p_imaging.accelerationFactor.kspace_encoding_step_2);
+
+            GDEBUG_CONDITION_STREAM(verbose, "acceFactorE1 is " << acceFactorE1);
+            GDEBUG_CONDITION_STREAM(verbose, "acceFactorE2 is " << acceFactorE2);
+
+            if ( !p_imaging.calibrationMode.is_present() )
+            {
+                GERROR_STREAM("Parallel calibration mode not found in header");
+                return false;
+            }
+
+            std::string calib = *p_imaging.calibrationMode;
+            if ( calib.compare("interleaved") == 0 )
+            {
+                CalibMode = Gadgetron::ISMRMRD_interleaved;
+                GDEBUG_CONDITION_STREAM(verbose, "Calibration mode is interleaved");
+
+                if ( p_imaging.interleavingDimension )
+                {
+                    if ( p_imaging.interleavingDimension->compare("phase") == 0 )
+                    {
+                        InterleaveDim = Gadgetron::DIM_Phase;
+                    }
+                    else if ( p_imaging.interleavingDimension->compare("repetition") == 0 )
+                    {
+                        InterleaveDim = Gadgetron::DIM_Repetition;
+                    }
+                    else if ( p_imaging.interleavingDimension->compare("average") == 0 )
+                    {
+                        InterleaveDim = Gadgetron::DIM_Average;
+                    }
+                    else if ( p_imaging.interleavingDimension->compare("contrast") == 0 )
+                    {
+                        InterleaveDim = Gadgetron::DIM_Contrast;
+                    }
+                    else if ( p_imaging.interleavingDimension->compare("other") == 0 )
+                    {
+                        InterleaveDim = Gadgetron::DIM_other1;
+                    }
+                    else
+                    {
+                        GERROR_STREAM("Unknown interleaving dimension. Bailing out");
+                        return false;
+                    }
+                }
+            }
+            else if ( calib.compare("embedded") == 0 )
+            {
+                CalibMode = Gadgetron::ISMRMRD_embedded;
+                GDEBUG_CONDITION_STREAM(verbose, "Calibration mode is embedded");
+            }
+            else if ( calib.compare("separate") == 0 )
+            {
+                CalibMode = Gadgetron::ISMRMRD_separate;
+                GDEBUG_CONDITION_STREAM(verbose, "Calibration mode is separate");
+            }
+            else if ( calib.compare("external") == 0 )
+            {
+                CalibMode = Gadgetron::ISMRMRD_external;
+            }
+            else if ( (calib.compare("other") == 0) && acceFactorE1==1 && acceFactorE2==1 )
+            {
+                CalibMode = Gadgetron::ISMRMRD_noacceleration;
+                acceFactorE1=1;
+            }
+            else if ( (calib.compare("other") == 0) &&  (acceFactorE1>1 || acceFactorE2>1) )
+            {
+                CalibMode = Gadgetron::ISMRMRD_interleaved;
+                acceFactorE1=2;
+                InterleaveDim = Gadgetron::DIM_Phase;
+            }
+            else
+            {
+                GERROR_STREAM("Failed to process parallel imaging calibration mode");
+                return false;
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in findCalibMode(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
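
A minimal usage sketch for the function above, assuming the serialized ISMRMRD XML header is available in a std::string xml_config (all variable names here are illustrative only):

    ISMRMRD::IsmrmrdHeader hdr;
    ISMRMRD::deserialize(xml_config.c_str(), hdr);   // parse the XML protocol

    Gadgetron::ISMRMRDCALIBMODE calib_mode;
    Gadgetron::ISMRMRDDIM interleave_dim;
    double accel_E1 = 1.0, accel_E2 = 1.0;

    if ( !Gadgetron::findCalibMode(hdr, calib_mode, interleave_dim, accel_E1, accel_E2, true) )
    {
        GERROR_STREAM("Could not determine the parallel imaging calibration mode");
    }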
+
+    bool findEncodingLimits(ISMRMRD::IsmrmrdHeader& h, ISMRMRD::EncodingCounters& meas_max_idx, bool verbose)
+    {
+        try
+        {
+            ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+            ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+            ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+
+            meas_max_idx.kspace_encode_step_1 = (uint16_t)e_space.matrixSize.y-1;
+
+            meas_max_idx.set = (e_limits.set && (e_limits.set->maximum>0)) ? e_limits.set->maximum : 0;
+            meas_max_idx.phase = (e_limits.phase && (e_limits.phase->maximum>0)) ? e_limits.phase->maximum : 0;
+
+            meas_max_idx.kspace_encode_step_2 = (uint16_t)e_space.matrixSize.z-1;
+
+            meas_max_idx.contrast = (e_limits.contrast && (e_limits.contrast->maximum > 0)) ? e_limits.contrast->maximum : 0;
+
+            meas_max_idx.slice = (e_limits.slice && (e_limits.slice->maximum > 0)) ? e_limits.slice->maximum : 0;
+
+            meas_max_idx.repetition = e_limits.repetition ? e_limits.repetition->maximum : 0;
+
+            meas_max_idx.average = e_limits.average ? e_limits.average->maximum : 0;
+
+            // always combine the SEG
+            meas_max_idx.segment = 0;
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in findEncodingLimits(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    void findMatrixSizeEncoding(ISMRMRD::IsmrmrdHeader& h, size_t matrix_size_encoding[3])
+    {
+        matrix_size_encoding[0] = h.encoding[0].encodedSpace.matrixSize.x;
+        matrix_size_encoding[1] = h.encoding[0].encodedSpace.matrixSize.y;
+        matrix_size_encoding[2] = h.encoding[0].encodedSpace.matrixSize.z;
+    }
+
+    void findFOVEncoding(ISMRMRD::IsmrmrdHeader& h, float field_of_view_encoding[3])
+    {
+        field_of_view_encoding[0] = h.encoding[0].encodedSpace.fieldOfView_mm.x;
+        field_of_view_encoding[1] = h.encoding[0].encodedSpace.fieldOfView_mm.y;
+        field_of_view_encoding[2] = h.encoding[0].encodedSpace.fieldOfView_mm.z;
+    }
+
+    void findMatrixSizeRecon(ISMRMRD::IsmrmrdHeader& h, size_t matrix_size_recon[3])
+    {
+        matrix_size_recon[0] = h.encoding[0].reconSpace.matrixSize.x;
+        matrix_size_recon[1] = h.encoding[0].reconSpace.matrixSize.y;
+        matrix_size_recon[2] = h.encoding[0].reconSpace.matrixSize.z;
+    }
+
+    void findFOVRecon(ISMRMRD::IsmrmrdHeader& h, float field_of_view_recon[3])
+    {
+        field_of_view_recon[0] = h.encoding[0].reconSpace.fieldOfView_mm.x;
+        field_of_view_recon[1] = h.encoding[0].reconSpace.fieldOfView_mm.y;
+        field_of_view_recon[2] = h.encoding[0].reconSpace.fieldOfView_mm.z;
+    }
+
+    bool checkReadoutStatus(uint64_t flag, int samples, Gadgetron::ISMRMRDCALIBMODE& CalibMode, int roLen, 
+        bool& bIsKSpace, bool& bIsRef, bool& bIsNoise, 
+        bool& bIsPhaseCorr, bool& bIsReflect, bool& bIsOther, 
+        bool& bIsNavigator, bool& bIsRTFeedback, bool& bIsHPFeedback, 
+        bool& bIsDummyScan)
+    {
+        try
+        {
+            bIsNoise = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_NOISE_MEASUREMENT).isSet(flag);
+            bool is_ref = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_PARALLEL_CALIBRATION).isSet(flag);
+            bool is_ref_kspace = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_PARALLEL_CALIBRATION_AND_IMAGING).isSet(flag);
+            bIsReflect = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_REVERSE).isSet(flag);
+            bIsPhaseCorr = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_PHASECORR_DATA).isSet(flag);
+            bIsNavigator = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_NAVIGATION_DATA).isSet(flag);
+            bIsRTFeedback = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_RTFEEDBACK_DATA).isSet(flag);
+            bIsHPFeedback = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_HPFEEDBACK_DATA).isSet(flag);
+            bIsDummyScan = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_DUMMYSCAN_DATA).isSet(flag);
+
+            bIsKSpace = false;
+            bIsRef = false;
+            bIsOther = false;
+
+            if ( bIsNoise || bIsDummyScan )
+            {
+                return true;
+            }
+
+            if ( CalibMode==ISMRMRD_noacceleration )
+            {
+                bIsKSpace = true;
+                bIsRef = false;
+            }
+
+            // in interleaved mode, only store the image data
+            if ( CalibMode==ISMRMRD_interleaved )
+            {
+                bIsKSpace = true;
+                bIsRef = false;
+            }
+
+            // in embedded, kspace stores only the undersampled lines
+            // ref stores all lines used for references
+            if ( CalibMode==ISMRMRD_embedded )
+            {
+                if ( is_ref && !is_ref_kspace )
+                {
+                    bIsKSpace = false;
+                    bIsRef = true;
+                }
+
+                if ( !is_ref && is_ref_kspace )
+                {
+                    bIsKSpace = true;
+                    bIsRef = true;
+                }
+
+                if ( is_ref && is_ref_kspace )
+                {
+                    bIsKSpace = true;
+                    bIsRef = true;
+                }
+
+                if ( !is_ref && !is_ref_kspace )
+                {
+                    bIsKSpace = true;
+                    bIsRef = false;
+                }
+            }
+
+            // in separate mode
+            if ( CalibMode==ISMRMRD_separate 
+                || CalibMode==ISMRMRD_external )
+            {
+                if ( is_ref )
+                {
+                    bIsKSpace = false;
+                    bIsRef = true;
+                }
+
+                if ( !is_ref )
+                {
+                    bIsKSpace = true;
+                    bIsRef = false;
+                }
+            }
+
+            // store other data, e.g. AIF
+            // only for tpat
+            if ( !is_ref && !is_ref_kspace && (samples!=roLen) )
+            {
+                bIsOther = true;
+                bIsKSpace = false;
+                bIsRef = false;
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in checkReadoutStatus(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
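
A minimal sketch of classifying one readout with the helper above, assuming an ISMRMRD::AcquisitionHeader acq_header, the nominal readout length RO, and a calibration mode already determined by findCalibMode (illustrative names only):

    bool is_kspace, is_ref, is_noise, is_phase_corr, is_reflect;
    bool is_other, is_navigator, is_rt_feedback, is_hp_feedback, is_dummy;

    Gadgetron::checkReadoutStatus(acq_header.flags, acq_header.number_of_samples,
                                  calib_mode, RO,
                                  is_kspace, is_ref, is_noise,
                                  is_phase_corr, is_reflect, is_other,
                                  is_navigator, is_rt_feedback, is_hp_feedback,
                                  is_dummy);

    if ( is_noise )       { /* feed the noise prewhitening estimation */ }
    else if ( is_ref )    { /* buffer as calibration data */ }
    else if ( is_kspace ) { /* buffer as imaging data */ }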
+
+    bool estimateMaxSEGForRetroGating(Gadgetron::ISMRMRDCALIBMODE CalibMode, 
+        double acceFactorE1, double acceFactorE2, 
+        size_t retro_gated_segment_size, 
+        uint16_t E1, uint16_t embedded_ref_lines_E1, 
+        uint16_t E2, uint16_t embedded_ref_lines_E2, 
+        uint16_t& segment, bool verbose)
+    {
+        try
+        {
+            if ( acceFactorE2 <= 1 )
+            {
+                if ( CalibMode == ISMRMRD_embedded )
+                {
+                    segment = (uint16_t)std::ceil( (double)E1/acceFactorE1/retro_gated_segment_size 
+                        + (acceFactorE1-1)*(double)embedded_ref_lines_E1/acceFactorE1/retro_gated_segment_size );
+                }
+                else
+                {
+                    segment = (uint16_t)std::ceil( (double)E1/acceFactorE1/retro_gated_segment_size );
+                }
+            }
+            else
+            {
+                if ( CalibMode == ISMRMRD_embedded )
+                {
+                    segment = (uint16_t)std::ceil( (double)E1*E2/(acceFactorE1*acceFactorE2*retro_gated_segment_size) 
+                        + (acceFactorE1*acceFactorE2-1)*(double)(embedded_ref_lines_E1*embedded_ref_lines_E2)/(acceFactorE1*acceFactorE2*retro_gated_segment_size) );
+                }
+                else
+                {
+                    segment = (uint16_t)std::ceil( (double)E1*E2/(acceFactorE1*acceFactorE2*retro_gated_segment_size) );
+                }
+            }
+
+            if ( segment > 1 ) segment--;
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in estimateMaxSEGForRetroGating(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
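
A worked example of the formula above, assuming embedded calibration with acceFactorE1 = 2, acceFactorE2 = 1, E1 = 128, embedded_ref_lines_E1 = 24 and retro_gated_segment_size = 16 (numbers chosen for illustration):

    // segment = ceil( E1/acceFactorE1/retro_gated_segment_size
    //                 + (acceFactorE1-1)*embedded_ref_lines_E1/acceFactorE1/retro_gated_segment_size )
    //         = ceil( 128/2/16 + 1*24/2/16 ) = ceil( 4.0 + 0.75 ) = 5
    // segment > 1, so segment-- leaves segment = 4, i.e. 5 segments in total (segment+1)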
+
+    void getDebugFolderPath(const std::string& debugFolder, std::string& debugFolderPath, bool verbose)
+    {
+        const char* debugEnv = getenv("GADGETRON_DEBUG_FOLDER");
+        debugFolderPath = (debugEnv != NULL) ? std::string(debugEnv) : std::string();
+        if ( debugFolderPath.empty() )
+        {
+#ifdef _WIN32
+            debugFolderPath = "c:/temp/gadgetron";
+#else
+            debugFolderPath = "/tmp/gadgetron";
+#endif // _WIN32
+        }
+
+        debugFolderPath.append("/");
+        debugFolderPath.append(debugFolder);
+        debugFolderPath.append("/");
+
+        createFolderWithAllPermissions(debugFolderPath);
+
+        GDEBUG_CONDITION_STREAM(verbose, "Debug folder is " << debugFolderPath);
+    }
+
+    bool createFolderWithAllPermissions(const std::string& workingdirectory)
+    {
+        if ( !boost::filesystem::exists(workingdirectory) )
+        {
+            boost::filesystem::path workingPath(workingdirectory);
+            if ( !boost::filesystem::create_directory(workingPath) )
+            {
+                GERROR("Error creating the working directory.\n");
+                return false;
+            }
+
+            // set the permission for the folder
+#ifdef _WIN32
+            try
+            {
+                boost::filesystem::permissions(workingPath, all_all);
+            }
+            catch(...)
+            {
+                GERROR("Error changing the permission of the working directory.\n");
+                return false;
+            }
+#else
+            // in case an older version of boost is used on a non-Windows system,
+            // fall back to the chmod system call
+            int res = chmod(workingPath.string().c_str(), S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IWGRP|S_IXGRP|S_IROTH|S_IWOTH|S_IXOTH);
+            if ( res != 0 )
+            {
+                GERROR("Error changing the permission of the working directory.\n");
+                return false;
+            }
+#endif // _WIN32
+        }
+
+        return true;
+    }
+
+    bool getISMRMRMetaValues(const ISMRMRD::MetaContainer& attrib, const std::string& name, std::vector<long>& v)
+    {
+        try
+        {
+            size_t num = attrib.length(name.c_str());
+            if ( num == 0 )
+            {
+                v.clear();
+                GWARN_STREAM("getISMRMRMetaValues, can not find field : " << name);
+                return true;
+            }
+
+            v.resize(num);
+
+            size_t ii;
+            for ( ii=0; ii<num; ii++ )
+            {
+                v[ii] = attrib.as_long(name.c_str(), ii);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in getISMRMRMetaValues(const ISMRMRD::MetaContainer& attrib, const std::string& name, std::vector<long>& v) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    bool getISMRMRMetaValues(const ISMRMRD::MetaContainer& attrib, const std::string& name, std::vector<double>& v)
+    {
+        try
+        {
+            size_t num = attrib.length(name.c_str());
+            if ( num == 0 )
+            {
+                v.clear();
+                GWARN_STREAM("getISMRMRMetaValues, can not find field : " << name);
+                return true;
+            }
+
+            v.resize(num);
+
+            size_t ii;
+            for ( ii=0; ii<num; ii++ )
+            {
+                v[ii] = attrib.as_double(name.c_str(), ii);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in getISMRMRMetaValues(const ISMRMRD::MetaContainer& attrib, const std::string& name, std::vector<double>& v) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    bool getISMRMRMetaValues(const ISMRMRD::MetaContainer& attrib, const std::string& name, std::vector<std::string>& v)
+    {
+        try
+        {
+            size_t num = attrib.length(name.c_str());
+            if ( num == 0 )
+            {
+                v.clear();
+                GWARN_STREAM("getISMRMRMetaValues, can not find field : " << name);
+                return true;
+            }
+
+            v.resize(num);
+
+            size_t ii;
+            for ( ii=0; ii<num; ii++ )
+            {
+                v[ii] = std::string( attrib.as_str(name.c_str(), ii) );
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in getISMRMRMetaValues(const ISMRMRD::MetaContainer& attrib, const std::string& name, std::vector<std::string>& v) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T>
+    bool setISMRMRMetaValues(ISMRMRD::MetaContainer& attrib, const std::string& name, const std::vector<T>& v)
+    {
+        try
+        {
+            size_t num = v.size();
+            if ( num == 0 )
+            {
+                GWARN_STREAM("setISMRMRMetaValues, input vector is empty ... " << name);
+                return true;
+            }
+
+            attrib.set(name.c_str(), v[0]);
+
+            size_t ii;
+            for ( ii=1; ii<v.size(); ii++ )
+            {
+                attrib.append(name.c_str(), v[ii]);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in setISMRMRMetaValues(ISMRMRD::MetaContainer& attrib, const std::string& name, const std::vector<T>& v) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template EXPORTGTPLUSGADGET bool setISMRMRMetaValues(ISMRMRD::MetaContainer& attrib, const std::string& name, const std::vector<long>& v);
+    template EXPORTGTPLUSGADGET bool setISMRMRMetaValues(ISMRMRD::MetaContainer& attrib, const std::string& name, const std::vector<double>& v);
+
+    bool setISMRMRMetaValues(ISMRMRD::MetaContainer& attrib, const std::string& name, const std::vector<std::string>& v)
+    {
+        try
+        {
+            size_t num = v.size();
+            if ( num == 0 )
+            {
+                GWARN_STREAM("setISMRMRMetaValues, input vector is empty ... " << name);
+                return true;
+            }
+
+            attrib.set(name.c_str(), v[0].c_str());
+
+            size_t ii;
+            for ( ii=1; ii<v.size(); ii++ )
+            {
+                attrib.append(name.c_str(), v[ii].c_str());
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in setISMRMRMetaValues(ISMRMRD::MetaContainer& attrib, const std::string& name, const std::vector<std::string>& v) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T>
+    bool appendISMRMRMetaValues(ISMRMRD::MetaContainer& attrib, const std::string& name, const std::vector<T>& v)
+    {
+        try
+        {
+            size_t num = v.size();
+            if ( num == 0 )
+            {
+                GWARN_STREAM("appendISMRMRMetaValues, input vector is empty ... " << name);
+                return true;
+            }
+
+            attrib.append(name.c_str(), v[0]);
+
+            size_t ii;
+            for ( ii=1; ii<v.size(); ii++ )
+            {
+                attrib.append(name.c_str(), v[ii]);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in appendISMRMRMetaValues(ISMRMRD::MetaContainer& attrib, const std::string& name, const std::vector<T>& v) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template EXPORTGTPLUSGADGET bool appendISMRMRMetaValues(ISMRMRD::MetaContainer& attrib, const std::string& name, const std::vector<long>& v);
+    template EXPORTGTPLUSGADGET bool appendISMRMRMetaValues(ISMRMRD::MetaContainer& attrib, const std::string& name, const std::vector<double>& v);
+
+    bool appendISMRMRMetaValues(ISMRMRD::MetaContainer& attrib, const std::string& name, const std::vector<std::string>& v)
+    {
+        try
+        {
+            size_t num = v.size();
+            if ( num == 0 )
+            {
+                GWARN_STREAM("appendISMRMRMetaValues, input vector is empty ... " << name);
+                return true;
+            }
+
+            attrib.append(name.c_str(), v[0].c_str());
+
+            size_t ii;
+            for ( ii=1; ii<v.size(); ii++ )
+            {
+                attrib.append(name.c_str(), v[ii].c_str());
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in appendISMRMRMetaValues(ISMRMRD::MetaContainer& attrib, const std::string& name, const std::vector<std::string>& v) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    bool PatientCoordinateSystemToDeviceCoordinateSystem(double& x, double& y, double& z, const std::string& position)
+    {
+        // this is following dicom tag (0020, 0037)
+
+        if ( position == "HFS" ) // Head-first supine (HFS)
+        {
+            y = -y;
+            z = -z;
+        }
+        else if ( position == "HFP" ) // Head-first prone (HFP)
+        {
+            x = -x;
+            z = -z;
+        }
+        else if ( position == "HFDR" ) // Head-first decubitus-right 
+        {
+            double v = x;
+            x = y;
+            y = v;
+            z = -z;
+        }
+        else if ( position == "HFDL" ) // Head-first decubitus-left (HFDL)
+        {
+            double v = x;
+            x = y;
+            y = v;
+
+            x = -x;
+            y = -y;
+            z = -z;
+        }
+        else if ( position == "FFDR" ) // Feet-first decubitus-right (FFDR)
+        {
+            double v = x;
+            x = y;
+            y = v;
+
+            x = -x;
+        }
+        else if ( position == "FFDL" ) // Feet-first decubitus-left (FFDL)
+        {
+            double v = x;
+            x = y;
+            y = v;
+
+            y = -y;
+        }
+        else if ( position == "FFP" ) // Feet-first prone (FFP)
+        {
+        }
+        else if ( position == "FFS" ) // Feet-first supine (FFS)
+        {
+            x = -x;
+            y = -y;
+        }
+        else 
+        {
+            GERROR_STREAM("Unknown position string :" << position);
+            return false;
+        }
+
+        return true;
+    }
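
A minimal usage sketch: mapping a slice-normal vector from the patient to the device coordinate system for a head-first supine (HFS) scan; the vector values are illustrative:

    double nx = 0.0, ny = 0.0, nz = 1.0;   // slice normal in the patient coordinate system
    if ( Gadgetron::PatientCoordinateSystemToDeviceCoordinateSystem(nx, ny, nz, "HFS") )
    {
        // for HFS the y and z components are negated: (0, 0, 1) -> (0, 0, -1)
    }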
+
+    bool DeviceCoordinateSystemToPatientCoordinateSystem(double& x, double& y, double& z, const std::string& position)
+    {
+        if ( position == "HFS" ) // Head-first supine (HFS)
+        {
+            y = -y;
+            z = -z;
+        }
+        else if ( position == "HFP" ) // Head-first prone (HFP)
+        {
+            x = -x;
+            z = -z;
+        }
+        else if ( position == "HFDR" ) // Head-first decubitus-right 
+        {
+            double v = x;
+            x = y;
+            y = v;
+            z = -z;
+        }
+        else if ( position == "HFDL" ) // Head-first decubitus-left (HFDL)
+        {
+            double v = x;
+            x = y;
+            y = v;
+
+            x = -x;
+            y = -y;
+            z = -z;
+        }
+        else if ( position == "FFDR" ) // Feet-first decubitus-right (FFDR)
+        {
+            double v = x;
+            x = y;
+            y = v;
+
+            y = -y;
+        }
+        else if ( position == "FFDL" ) // Feet-first decubitus-left (FFDL)
+        {
+            double v = x;
+            x = y;
+            y = v;
+
+            x = -x;
+        }
+        else if ( position == "FFP" ) // Feet-first prone (FFP)
+        {
+        }
+        else if ( position == "FFS" ) // Feet-first supine (FFS)
+        {
+            x = -x;
+            y = -y;
+        }
+        else 
+        {
+            GERROR_STREAM("Unknown position string :" << position);
+            return false;
+        }
+
+        return true;
+    }
+}
diff --git a/gadgets/gtPlus/GtPlusReconGadgetUtil.h b/gadgets/gtPlus/GtPlusReconGadgetUtil.h
new file mode 100644
index 0000000..bb7eff6
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusReconGadgetUtil.h
@@ -0,0 +1,77 @@
+/** \file   GtPlusReconGadgetUtil.h
+    \brief  Stores some utility functions for reconstruction
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include <complex>
+#include "GtPlusGadgetExport.h"
+#include "hoNDArray.h"
+#include "mri_core_def.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "gtPlusISMRMRDReconUtil.h"
+
+#include "ismrmrd/ismrmrd.h"
+#include "ismrmrd/xml.h"
+#include "ismrmrd/meta.h"
+
+namespace Gadgetron
+{
+
+// [Ro E1 Cha Slice E2 Con Phase Rep Set Seg AVE]
+//   0  1  2   3    4   5    6     7  8   9  10
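
The ordering above corresponds to an 11-dimensional hoNDArray; a minimal sketch of addressing one [RO E1] frame with an index vector (buf and the loop counters are illustrative names):

    std::vector<size_t> ind(11, 0);
    ind[2]  = cha;   // channel
    ind[3]  = slc;   // slice
    ind[4]  = e2;    // partition (E2)
    ind[5]  = con;   // contrast
    ind[6]  = phs;   // phase
    ind[7]  = rep;   // repetition
    ind[8]  = set;   // set
    ind[9]  = 0;     // segment (always combined)
    ind[10] = ave;   // average

    size_t offset = buf.calculate_offset(ind);   // start of the [RO E1] frame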
+
+    // find the calibration mode from protocol
+    bool EXPORTGTPLUSGADGET findCalibMode(ISMRMRD::IsmrmrdHeader& h, Gadgetron::ISMRMRDCALIBMODE& CalibMode, Gadgetron::ISMRMRDDIM& InterleaveDim, double& acceFactorE1, double& acceFactorE2, bool verbose=false);
+
+    // find the encoding limits from protocol
+    bool EXPORTGTPLUSGADGET findEncodingLimits(ISMRMRD::IsmrmrdHeader& h, ISMRMRD::EncodingCounters& meas_max_idx, bool verbose=false);
+
+    // find encoding matrix size and FOV
+    void EXPORTGTPLUSGADGET findMatrixSizeEncoding(ISMRMRD::IsmrmrdHeader& h, size_t matrix_size_encoding[3]);
+    void EXPORTGTPLUSGADGET findFOVEncoding(ISMRMRD::IsmrmrdHeader& h, float field_of_view_encoding[3]);
+
+    // find recon matrix size and FOV
+    void EXPORTGTPLUSGADGET findMatrixSizeRecon(ISMRMRD::IsmrmrdHeader& h, size_t matrix_size_recon[3]);
+    void EXPORTGTPLUSGADGET findFOVRecon(ISMRMRD::IsmrmrdHeader& h, float field_of_view_recon[3]);
+
+    // find the status of a readout line
+    bool EXPORTGTPLUSGADGET checkReadoutStatus(uint64_t flag, int samples, Gadgetron::ISMRMRDCALIBMODE& CalibMode, int roLen, 
+                        bool& bIsKSpace, bool& bIsRef, bool& bIsNoise, 
+                        bool& bIsPhaseCorr, bool& bIsReflect, bool& bIsOther, 
+                        bool& bIsNavigator, bool& bIsRTFeedback, bool& bIsHPFeedback, 
+                        bool& bIsDummyScan);
+
+    // estimate the max SEG for a segmented acquisition (number of total segments is segment+1)
+    // retro_gated_segment_size : number of readout lines acquired in one segment
+    // E1, embedded_ref_lines_E1: number of lines measured along E1 and number of reference lines for embedded mode
+    bool EXPORTGTPLUSGADGET estimateMaxSEGForRetroGating(Gadgetron::ISMRMRDCALIBMODE CalibMode, 
+                                                      double acceFactorE1, double acceFactorE2, 
+                                                      size_t retro_gated_segment_size, 
+                                                      uint16_t E1, uint16_t embedded_ref_lines_E1, 
+                                                      uint16_t E2, uint16_t embedded_ref_lines_E2, 
+                                                      uint16_t& segment, bool verbose=false);
+
+
+    // get debug folder full path
+    void EXPORTGTPLUSGADGET getDebugFolderPath(const std::string& debugFolder, std::string& debugFolderPath, bool verbose=false);
+
+    // create a folder with all permissions for all users
+    bool EXPORTGTPLUSGADGET createFolderWithAllPermissions(const std::string& workingdirectory);
+
+    // get a vector of values from ismrmrd meta
+    bool EXPORTGTPLUSGADGET getISMRMRMetaValues(const ISMRMRD::MetaContainer& attrib, const std::string& name, std::vector<long>& v);
+    bool EXPORTGTPLUSGADGET getISMRMRMetaValues(const ISMRMRD::MetaContainer& attrib, const std::string& name, std::vector<double>& v);
+    bool EXPORTGTPLUSGADGET getISMRMRMetaValues(const ISMRMRD::MetaContainer& attrib, const std::string& name, std::vector<std::string>& v);
+
+    template <typename T> EXPORTGTPLUSGADGET bool setISMRMRMetaValues(ISMRMRD::MetaContainer& attrib, const std::string& name, const std::vector<T>& v);
+    bool EXPORTGTPLUSGADGET setISMRMRMetaValues(ISMRMRD::MetaContainer& attrib, const std::string& name, const std::vector<std::string>& v);
+
+    template <typename T> EXPORTGTPLUSGADGET bool appendISMRMRMetaValues(ISMRMRD::MetaContainer& attrib, const std::string& name, const std::vector<T>& v);
+    bool EXPORTGTPLUSGADGET appendISMRMRMetaValues(ISMRMRD::MetaContainer& attrib, const std::string& name, const std::vector<std::string>& v);
+
+    // perform the patient to device coordinate transformation
+    bool EXPORTGTPLUSGADGET PatientCoordinateSystemToDeviceCoordinateSystem(double& x, double& y, double& z, const std::string& position);
+    bool EXPORTGTPLUSGADGET DeviceCoordinateSystemToPatientCoordinateSystem(double& x, double& y, double& z, const std::string& position);
+}
diff --git a/gadgets/gtPlus/GtPlusReconJob2DTGadget.cpp b/gadgets/gtPlus/GtPlusReconJob2DTGadget.cpp
new file mode 100644
index 0000000..89fe2c1
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusReconJob2DTGadget.cpp
@@ -0,0 +1,201 @@
+
+#include "GtPlusReconJob2DTGadget.h"
+#include "GtPlusGadgetOpenMP.h"
+
+using namespace Gadgetron::gtPlus;
+
+namespace Gadgetron
+{
+
+GtPlusReconJob2DTGadget::GtPlusReconJob2DTGadget()
+{
+    debugFolder_ = "DebugOutput";
+
+    performTiming_ = true;
+
+    verboseMode_ = false;
+
+    gt_timer1_.set_timing_in_destruction(false);
+    gt_timer2_.set_timing_in_destruction(false);
+    gt_timer3_.set_timing_in_destruction(false);
+
+    process_config_called_ = false;
+
+    Gadgetron::prepOpenMP();
+}
+
+GtPlusReconJob2DTGadget::~GtPlusReconJob2DTGadget()
+{
+
+}
+
+bool GtPlusReconJob2DTGadget::readParameters()
+{
+    try
+    {
+        GDEBUG_CONDITION_STREAM(verboseMode_, "------> GtPlusReconJob2DTGadget parameters <------");
+
+        boost::shared_ptr<std::string> str = this->get_string_value("debugFolder");
+        debugFolder_ = *str;
+        GDEBUG_CONDITION_STREAM(verboseMode_, "debugFolder_ is " << debugFolder_);
+
+        performTiming_ = this->get_bool_value("performTiming");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "performTiming_ is " << performTiming_);
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GtPlusReconJob2DTGadget::readParameters() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+int GtPlusReconJob2DTGadget::process_config(ACE_Message_Block* mb)
+{
+    // [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+    //   0  1  2   3    4   5    6     7  8   9
+
+    verboseMode_ = this->get_bool_value("verboseMode");
+
+    // read in parameters from the xml
+    GADGET_CHECK_RETURN(this->readParameters(), GADGET_FAIL);
+
+    // generate the destination folder
+    if ( !debugFolder_.empty() )
+    {
+        Gadgetron::getDebugFolderPath(debugFolder_, debugFolder_fullPath_, verboseMode_);
+    }
+    else
+    {
+        GDEBUG_STREAM("GtPlusRecon, debugFolder is not set ...");
+    }
+
+    return GADGET_OK;
+}
+
+int GtPlusReconJob2DTGadget::process(Gadgetron::GadgetContainerMessage< int >* m1, Gadgetron::GadgetContainerMessage< GtPlusReconJobTypeCPFL > * m2)
+{
+    // because the parameter configuration will not be sent, we need to call process_config explicitly
+    if ( !process_config_called_ )
+    {
+        GADGET_CHECK_RETURN( (this->process_config(m1)==0), GADGET_FAIL);
+        process_config_called_ = true;
+    }
+    GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusReconJob2DTGadget::process(...) starts ... ");
+
+    int* jobID = m1->getObjectPtr();
+    GDEBUG_CONDITION_STREAM(verboseMode_, "--> arriving job : " << *jobID << " ... ");
+
+    GtPlusReconJobTypeCPFL* job = m2->getObjectPtr();
+    GDEBUG_CONDITION_STREAM(verboseMode_, "    job array size : [ " << job->kspace.get_size(0) << " " 
+                                                                 << job->kspace.get_size(1) << " " 
+                                                                 << job->kspace.get_size(2) << " " 
+                                                                 << job->kspace.get_size(3) << " ] ... ");
+
+    // set the worker
+    worker_grappa_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_grappa_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_noacceleration_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_noacceleration_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_spirit_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_spirit_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_spirit_L1_ncg_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_spirit_L1_ncg_.debugFolder_ = debugFolder_fullPath_;
+
+    if ( verboseMode_ )
+    {
+        job->workOrder2DT.print(std::cout);
+    }
+
+    bool succeed = true;
+    if ( performTiming_ ) { gt_timer1_.start("Recon 2DT job ... "); }
+
+    succeed = worker_spirit_L1_ncg_.performUnwarppingImpl(*job);
+
+    if ( performTiming_ ) { gt_timer1_.stop(); }
+
+    // export the results
+    if ( !debugFolder_fullPath_.empty() )
+    {
+        std::ostringstream ostr;
+        ostr << "ReconJob2DT_ID" << *jobID;
+
+        hoNDArray< std::complex<float> > res = job->res;
+        res.squeeze();
+        if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_fullPath_+ostr.str()); }
+    }
+
+    // clean the kspace and ker and coil map
+    job->kspace.clear();
+    job->ker.clear();
+    if ( job->workOrder2DT.coilMap_ ) job->workOrder2DT.coilMap_->clear();
+
+    if ( !succeed )
+    {
+        job->complexIm.clear();
+        job->res.clear();
+    }
+
+    // send out the results
+    GADGET_CHECK_RETURN(this->sendOutJob(*jobID, job), GADGET_FAIL);
+
+    GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusReconJob2DTGadget::process(...) ends ... ");
+
+    m1->release();
+
+    return GADGET_OK;
+}
+
+bool GtPlusReconJob2DTGadget::
+sendOutJob(int jobID, GtPlusReconJobTypeCPFL* job)
+{
+    try
+    {
+        GDEBUG("GtPlusReconJob2DTGadget sendOutJob ...\n");
+
+        if (!this->controller_)
+        {
+            GERROR("Cannot return result to controller, no controller set\n");
+            return false;
+        }
+
+        GadgetContainerMessage<GadgetMessageIdentifier>* mb =
+            new GadgetContainerMessage<GadgetMessageIdentifier>();
+
+        mb->getObjectPtr()->id = GADGET_MESSAGE_CLOUD_JOB;
+
+        GadgetContainerMessage<int>* m1 = new GadgetContainerMessage<int>();
+        *(m1->getObjectPtr()) = jobID;
+
+        GadgetContainerMessage<GtPlusReconJobTypeCPFL>* m2 = new GadgetContainerMessage<GtPlusReconJobTypeCPFL>();
+
+        *(m2->getObjectPtr()) = *job;
+
+        m1->cont(m2);
+        mb->cont(m1);
+
+        int ret =  this->controller_->output_ready(mb);
+        if (ret < 0)
+        {
+            GDEBUG("Failed to return GtPlusReconJob2DTGadget job massage to controller\n");
+            return false;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GtPlusReconJob2DTGadget::sendOutJob(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+GADGET_FACTORY_DECLARE(GtPlusReconJob2DTGadget)
+
+}
diff --git a/gadgets/gtPlus/GtPlusReconJob2DTGadget.h b/gadgets/gtPlus/GtPlusReconJob2DTGadget.h
new file mode 100644
index 0000000..fa32210
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusReconJob2DTGadget.h
@@ -0,0 +1,103 @@
+/** \file   GtPlusReconJob2DTGadget.h
+    \brief  This is a cloud gadget performing the computation for a 2DT job data package.
+
+            This gadget can either serve as the working gadget for the single-layer cloud, or as the
+            second-layer gadget for the dual-layer cloud.
+
+            Ref to: 
+
+            Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen. 
+            Distributed MRI Reconstruction using Gadgetron-based Cloud Computing. 
+            Magnetic Resonance in Medicine, doi: 10.1002/mrm.25213.
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include <complex>
+#include "GtPlusGadgetExport.h"
+#include "Gadget.h"
+#include "GadgetStreamController.h"
+#include "GadgetCloudJobMessageReadWrite.h"
+#include "GadgetronTimer.h"
+
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorkOrder.h"
+#include "gtPlusISMRMRDReconWorkFlowCartesian2DT.h"
+#include "gtPlusISMRMRDReconWorker2DTGRAPPA.h"
+#include "gtPlusISMRMRDReconWorker2DTNoAcceleration.h"
+#include "gtPlusISMRMRDReconWorker2DTSPIRIT.h"
+#include "gtPlusISMRMRDReconWorker2DTL1SPIRITNCG.h"
+#include "GtPlusReconGadgetUtil.h"
+
+#ifdef USE_OMP
+    #include "omp.h"
+#endif // USE_OMP
+
+namespace Gadgetron
+{
+
+class EXPORTGTPLUSGADGET GtPlusReconJob2DTGadget : public Gadgetron::Gadget2< int, GtPlusReconJobTypeCPFL >
+{
+public:
+    GADGET_DECLARE(GtPlusReconJob2DTGadget);
+
+    typedef std::complex<float> ValueType;
+
+    typedef Gadgetron::gtPlus::gtPlusReconWorkOrder<ValueType> WorkOrderType;
+
+    typedef Gadget2< int, GtPlusReconJobTypeCPFL > BaseClass;
+
+    GtPlusReconJob2DTGadget();
+    ~GtPlusReconJob2DTGadget();
+
+    // debug folder
+    std::string debugFolder_;
+    std::string debugFolder_fullPath_;
+
+    // whether to perform timing
+    bool performTiming_;
+
+protected:
+
+    // --------------------------------------------------
+    // functional functions
+    // --------------------------------------------------
+
+    // default interface function
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(Gadgetron::GadgetContainerMessage< int >* m1, Gadgetron::GadgetContainerMessage< GtPlusReconJobTypeCPFL > * m2);
+
+    // process config is only to be called once
+    bool process_config_called_;
+
+    // read in parameters
+    virtual bool readParameters();
+
+    // send the completed job
+    bool sendOutJob(int jobID, GtPlusReconJobTypeCPFL* job);
+
+    // worker
+    Gadgetron::gtPlus::gtPlusReconWorker2DTGRAPPA<ValueType> worker_grappa_;
+    Gadgetron::gtPlus::gtPlusReconWorker2DTNoAcceleration<ValueType> worker_noacceleration_;
+    Gadgetron::gtPlus::gtPlusReconWorker2DTSPIRIT<ValueType> worker_spirit_;
+    Gadgetron::gtPlus::gtPlusReconWorker2DTL1SPIRITNCG<ValueType> worker_spirit_L1_ncg_;
+
+    Gadgetron::gtPlus::gtPlusISMRMRDReconUtil<ValueType> gtPlus_util_;
+    Gadgetron::gtPlus::gtPlusISMRMRDReconUtilComplex<ValueType> gtPlus_util_complex_;
+
+    // clock for timing
+    Gadgetron::GadgetronTimer gt_timer1_;
+    Gadgetron::GadgetronTimer gt_timer2_;
+    Gadgetron::GadgetronTimer gt_timer3_;
+
+    // exporter
+    Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+    // in verbose mode, more info is printed out
+    bool verboseMode_;
+};
+
+}
diff --git a/gadgets/gtPlus/GtPlusReconJob2DTGadgetCloud.cpp b/gadgets/gtPlus/GtPlusReconJob2DTGadgetCloud.cpp
new file mode 100644
index 0000000..7d4af11
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusReconJob2DTGadgetCloud.cpp
@@ -0,0 +1,790 @@
+
+#include "GtPlusReconJob2DTGadgetCloud.h"
+#include "GtPlusGadgetOpenMP.h"
+#include "gadgetron_paths.h"
+#include "log.h"
+
+using namespace Gadgetron::gtPlus;
+
+namespace Gadgetron
+{
+
+GtPlusReconJob2DTGadgetCloud::GtPlusReconJob2DTGadgetCloud()
+{
+    debugFolder_ = "DebugOutput";
+
+    performTiming_ = true;
+
+    verboseMode_ = false;
+
+    gt_timer1_.set_timing_in_destruction(false);
+    gt_timer2_.set_timing_in_destruction(false);
+    gt_timer3_.set_timing_in_destruction(false);
+
+    filterRO_type_ = ISMRMRD_FILTER_GAUSSIAN;
+    filterRO_sigma_ = 1.5;
+    filterRO_width_ = 0.15;
+
+    filterE1_type_ = ISMRMRD_FILTER_GAUSSIAN;
+    filterE1_sigma_ = 1.5;
+    filterE1_width_ = 0.15;
+
+    filterE2_type_ = ISMRMRD_FILTER_GAUSSIAN;
+    filterE2_sigma_ = 1.5;
+    filterE2_width_ = 0.15;
+
+    filterRO_ref_type_ = ISMRMRD_FILTER_HANNING;
+    filterRO_ref_sigma_ = 1.5;
+    filterRO_ref_width_ = 0.15;
+
+    filterE1_ref_type_ = ISMRMRD_FILTER_HANNING;
+    filterE1_ref_sigma_ = 1.5;
+    filterE1_ref_width_ = 0.15;
+
+    filterE2_ref_type_ = ISMRMRD_FILTER_HANNING;
+    filterE2_ref_sigma_ = 1.5;
+    filterE2_ref_width_ = 0.15;
+
+    filterRO_pf_type_ = ISMRMRD_FILTER_HANNING;
+    filterRO_pf_sigma_ = 1.5;
+    filterRO_pf_width_ = 0.15;
+    filterRO_pf_densityComp_ = false;
+
+    filterE1_pf_type_ = ISMRMRD_FILTER_HANNING;
+    filterE1_pf_sigma_ = 1.5;
+    filterE1_pf_width_ = 0.15;
+    filterE1_pf_densityComp_ = false;
+
+    filterE2_pf_type_ = ISMRMRD_FILTER_HANNING;
+    filterE2_pf_sigma_ = 1.5;
+    filterE2_pf_width_ = 0.15;
+    filterE2_pf_densityComp_ = false;
+
+    process_config_called_ = false;
+
+    Gadgetron::prepOpenMP();
+}
+
+GtPlusReconJob2DTGadgetCloud::~GtPlusReconJob2DTGadgetCloud()
+{
+
+}
+
+bool GtPlusReconJob2DTGadgetCloud::readParameters()
+{
+    try
+    {
+        GDEBUG_CONDITION_STREAM(verboseMode_, "------> GtPlusReconJob2DTGadgetCloud parameters <------");
+
+        boost::shared_ptr<std::string> str = this->get_string_value("debugFolder");
+        debugFolder_ = *str;
+        GDEBUG_CONDITION_STREAM(verboseMode_, "debugFolder_ is " << debugFolder_);
+
+        str = this->get_string_value("debugFolder2");
+        debugFolder2_ = *str;
+        GDEBUG_CONDITION_STREAM(verboseMode_, "debugFolder2_ is " << debugFolder2_);
+
+        str = this->get_string_value("cloudNodeFile");
+        cloud_node_file_ = *str;
+        GDEBUG_CONDITION_STREAM(verboseMode_, "cloud_node_file_ is " << cloud_node_file_);
+
+        performTiming_ = this->get_bool_value("performTiming");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "performTiming_ is " << performTiming_);
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+        // kspace filter parameters
+        str = this->get_string_value("filterRO");
+        filterRO_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterRO_sigma_ = this->get_double_value("filterRO_sigma");
+        filterRO_width_ = this->get_double_value("filterRO_width");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterRO_type_ is " << *str);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterRO_sigma_ is " << filterRO_sigma_);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterRO_width_ is " << filterRO_width_);
+
+        str = this->get_string_value("filterE1");
+        filterE1_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterE1_sigma_ = this->get_double_value("filterE1_sigma");
+        filterE1_width_ = this->get_double_value("filterE1_width");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterE1_type_ is " << *str);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterE1_sigma_ is " << filterE1_sigma_);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterE1_width_ is " << filterE1_width_);
+
+        str = this->get_string_value("filterE2");
+        filterE2_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterE2_sigma_ = this->get_double_value("filterE2_sigma");
+        filterE2_width_ = this->get_double_value("filterE2_width");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterE2_type_ is " << *str);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterE2_sigma_ is " << filterE2_sigma_);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterE2_width_ is " << filterE2_width_);
+
+        str = this->get_string_value("filterRefRO");
+        filterRO_ref_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterRO_ref_sigma_ = this->get_double_value("filterRefRO_sigma");
+        filterRO_ref_width_ = this->get_double_value("filterRefRO_width");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterRO_ref_type_ is " << *str);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterRO_ref_sigma_ is " << filterRO_ref_sigma_);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterRO_ref_width_ is " << filterRO_ref_width_);
+
+        str = this->get_string_value("filterRefE1");
+        filterE1_ref_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterE1_ref_sigma_ = this->get_double_value("filterRefE1_sigma");
+        filterE1_ref_width_ = this->get_double_value("filterRefE1_width");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterE1_ref_type_ is " << *str);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterE1_ref_sigma_ is " << filterE1_ref_sigma_);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterE1_ref_width_ is " << filterE1_ref_width_);
+
+        str = this->get_string_value("filterRefE2");
+        filterE2_ref_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterE2_ref_sigma_ = this->get_double_value("filterRefE2_sigma");
+        filterE2_ref_width_ = this->get_double_value("filterRefE2_width");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterE2_ref_type_ is " << *str);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterE2_ref_sigma_ is " << filterE2_ref_sigma_);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterE2_ref_width_ is " << filterE2_ref_width_);
+
+        str = this->get_string_value("filterPartialFourierRO");
+        filterRO_pf_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterRO_pf_sigma_ = this->get_double_value("filterPartialFourierRO_sigma");
+        filterRO_pf_width_ = this->get_double_value("filterPartialFourierRO_width");
+        filterRO_pf_densityComp_ = this->get_bool_value("filterPartialFourierRO_densityComp");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterRO_pf_type_ is " << *str);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterRO_pf_sigma_ is " << filterRO_pf_sigma_);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterRO_pf_width_ is " << filterRO_pf_width_);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterRO_pf_densityComp_ is " << filterRO_pf_densityComp_);
+
+        str = this->get_string_value("filterPartialFourierE1");
+        filterE1_pf_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterE1_pf_sigma_ = this->get_double_value("filterPartialFourierE1_sigma");
+        filterE1_pf_width_ = this->get_double_value("filterPartialFourierE1_width");
+        filterE1_pf_densityComp_ = this->get_bool_value("filterPartialFourierE1_densityComp");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterE1_pf_type_ is " << *str);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterE1_pf_sigma_ is " << filterE1_pf_sigma_);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterE1_pf_width_ is " << filterE1_pf_width_);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterE1_pf_densityComp_ is " << filterE1_pf_densityComp_);
+
+        str = this->get_string_value("filterPartialFourierE2");
+        filterE2_pf_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterE2_pf_sigma_ = this->get_double_value("filterPartialFourierE2_sigma");
+        filterE2_pf_width_ = this->get_double_value("filterPartialFourierE2_width");
+        filterE2_pf_densityComp_ = this->get_bool_value("filterPartialFourierE2_densityComp");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterE2_pf_type_ is " << *str);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterE2_pf_sigma_ is " << filterE2_pf_sigma_);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterE2_pf_width_ is " << filterE2_pf_width_);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "filterE2_pf_densityComp_ is " << filterE2_pf_densityComp_);
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+        job_split_by_S_ = this->get_bool_value("job_split_by_S");
+        job_num_of_N_ = (size_t)(this->get_int_value("job_num_of_N"));
+        job_max_Megabytes_ = (size_t)(this->get_int_value("job_max_Megabytes"));
+        job_overlap_ = (size_t)(this->get_int_value("job_overlap"));
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "job_split_by_S_ is " << job_split_by_S_);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "job_num_of_N_ is " << job_num_of_N_);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "job_max_Megabytes_ is " << job_max_Megabytes_);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "job_overlap_ is " << job_overlap_);
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+
+        CloudComputing_ = this->get_bool_value("CloudComputing");
+        CloudSize_ = (unsigned int)(this->get_int_value("CloudSize"));
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "CloudComputing_ is " << CloudComputing_);
+        GDEBUG_CONDITION_STREAM(verboseMode_, "CloudSize_ is " << CloudSize_);
+
+        // read in the cloud information for every node
+        gt_cloud_.resize(CloudSize_);
+
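+        // each gt_cloud_ entry is <IP/hostname, port, xml configuration name, computing power index>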
+        for ( unsigned int ii=0; ii<CloudSize_; ii++ )
+        {
+            std::ostringstream ostreamstr1;
+            ostreamstr1 << "CloudNode" << ii << "_IP" << std::ends;
+            boost::shared_ptr<std::string> IP = this->get_string_value(ostreamstr1.str().c_str());
+            gt_cloud_[ii].get<0>() = *IP;
+
+            std::ostringstream ostreamstr2;
+            ostreamstr2 << "CloudNode" << ii << "_Port" << std::ends;
+            boost::shared_ptr<std::string> Port = this->get_string_value(ostreamstr2.str().c_str());
+            gt_cloud_[ii].get<1>() = *Port;
+
+            std::ostringstream ostreamstr3;
+            ostreamstr3 << "CloudNode" << ii << "_XMLConfiguration" << std::ends;
+            boost::shared_ptr<std::string> xmlName = this->get_string_value(ostreamstr3.str().c_str());
+            gt_cloud_[ii].get<2>() = *xmlName;
+
+            std::ostringstream ostreamstr4;
+            ostreamstr4 << "CloudNode" << ii << "_ComputingPowerIndex" << std::ends;
+            unsigned int computingPowerIndex = this->get_int_value(ostreamstr4.str().c_str());
+            gt_cloud_[ii].get<3>() = computingPowerIndex;
+
+            GDEBUG_CONDITION_STREAM(verboseMode_, "Cloud Node " << ii << " : " << gt_cloud_[ii]);
+        }
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GtPlusReconJob2DTGadgetCloud::readParameters() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+int GtPlusReconJob2DTGadgetCloud::process_config(ACE_Message_Block* mb)
+{
+    // [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+    //   0  1  2   3    4   5    6     7  8   9
+
+    verboseMode_ = this->get_bool_value("verboseMode");
+
+    // read in parameters from the xml
+    GADGET_CHECK_RETURN(this->readParameters(), GADGET_FAIL);
+
+    // generate the destination folder
+    if ( !debugFolder_.empty() )
+    {
+        getDebugFolderPath(debugFolder_, debugFolder_fullPath_, verboseMode_);
+    }
+    else
+    {
+        GDEBUG_STREAM("GtPlusRecon, debugFolder is not set ...");
+    }
+
+    if ( !debugFolder2_.empty() )
+    {
+        getDebugFolderPath(debugFolder2_, debugFolder2_fullPath_, verboseMode_);
+    }
+    else
+    {
+        GDEBUG_STREAM("GtPlusRecon, debugFolder2 is not set ...");
+    }
+
+    return GADGET_OK;
+}
+
+bool GtPlusReconJob2DTGadgetCloud::setWorkOrder2DTParameters(GtPlusRecon2DTPara& para, WorkOrder2DTType* workOrder)
+{
+    workOrder->recon_kspace_needed_ = para.recon_kspace_needed_;
+
+    if ( para.workOrderPara_.coil_compression_thres_>0 || para.workOrderPara_.coil_compression_num_modesKept_>0 )
+    {
+        workOrder->coil_compression_ = true;
+    }
+    else
+    {
+        workOrder->coil_compression_ = false;
+    }
+
+    workOrder->same_coil_compression_coeff_allS_ = para.same_coil_compression_coeff_allS_;
+
+    workOrder->embedded_averageall_ref_ = para.embedded_averageall_ref_;
+    workOrder->embedded_ref_numOfModes_ = para.embedded_ref_numOfModes_;
+    workOrder->embedded_fullres_coilmap_ = para.embedded_fullres_coilmap_;
+    workOrder->embedded_fullres_coilmap_useHighestSignal_ = para.embedded_fullres_coilmap_useHighestSignal_;
+    workOrder->embedded_same_combinationcoeff_allS_ = para.embedded_same_combinationcoeff_allS_;
+    workOrder->embedded_whichS_combinationcoeff_ = para.embedded_whichS_combinationcoeff_;
+    workOrder->embedded_ref_fillback_ = para.embedded_ref_fillback_;
+
+    workOrder->separate_averageall_ref_ = para.separate_averageall_ref_;
+    workOrder->separate_ref_numOfModes_ = para.separate_ref_numOfModes_;
+    workOrder->separate_fullres_coilmap_ = para.separate_fullres_coilmap_;
+    workOrder->separate_same_combinationcoeff_allS_ = para.separate_same_combinationcoeff_allS_;
+    workOrder->separate_whichS_combinationcoeff_ = para.separate_whichS_combinationcoeff_;
+
+    workOrder->interleaved_same_combinationcoeff_allS_ = para.interleaved_same_combinationcoeff_allS_;
+    workOrder->interleaved_whichS_combinationcoeff_ = para.interleaved_whichS_combinationcoeff_;
+    workOrder->interleaved_ref_numOfModes_ = para.interleaved_ref_numOfModes_;
+
+    workOrder->no_acceleration_averageall_ref_ = para.no_acceleration_averageall_ref_;
+    workOrder->no_acceleration_ref_numOfModes_ = para.no_acceleration_ref_numOfModes_;
+    workOrder->no_acceleration_same_combinationcoeff_allS_ = para.no_acceleration_same_combinationcoeff_allS_;
+    workOrder->no_acceleration_whichS_combinationcoeff_ = para.no_acceleration_whichS_combinationcoeff_;
+
+    return true;
+}
+
+bool GtPlusReconJob2DTGadgetCloud::parseGTCloudNodeFile(const std::string& filename, CloudType& gtCloud)
+{
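+    // The node file is plain text, whitespace separated, read in this order:
+    //   <controlNodeHost> <controlNodePort>
+    //   <numberOfGadgetNodes>
+    //   <host> <port> <xmlConfigName> <computingPowerIndex>   (one entry per gadget node)
+    // hypothetical example: "gt-control 9002\n2\nnode1 9002 GT_2DT_Cartesian.xml 4\nnode2 9002 GT_2DT_Cartesian.xml 2"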
+    std::string nodeFileName = get_gadgetron_home();
+    nodeFileName.append("/config/gtCloud/");
+    nodeFileName.append(filename);
+    GDEBUG_CONDITION_STREAM(verboseMode_, "Cloud node file name is " << nodeFileName);
+
+    std::ifstream fs(nodeFileName.c_str(), std::ios::in);
+    if (!fs.is_open()) 
+    {
+        GWARN_STREAM("Cannot open GT CloudNodeFile; using the local setting instead ... ");
+        return false;
+    }
+
+    // control node hostname
+    std::string controlNode;
+    fs >> controlNode;
+
+    std::string portControlNode;
+    fs >> portControlNode;
+
+    // number of GadgetLevel nodes
+    unsigned int num;
+    fs >> num;
+
+    gtCloud.resize(num);
+
+    unsigned int n;
+    for ( n=0; n<num; n++ )
+    {
+        std::string gadgetNode;
+        fs >> gadgetNode;
+
+        std::string portGadgetNode;
+        fs >> portGadgetNode;
+
+        std::string xmlGadgetNode;
+        fs >> xmlGadgetNode;
+
+        unsigned int computingPowerIndex;
+        fs >> computingPowerIndex;
+
+        gtCloud[n].get<0>() = gadgetNode;
+        gtCloud[n].get<1>() = portGadgetNode;
+        gtCloud[n].get<2>() = xmlGadgetNode;
+        gtCloud[n].get<3>() = computingPowerIndex;
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "Gadget Node " << n << " : " << gt_cloud_[n]);
+    }
+
+    fs.close();
+
+    return true;
+}
+
+int GtPlusReconJob2DTGadgetCloud::process(Gadgetron::GadgetContainerMessage< int >* m1, Gadgetron::GadgetContainerMessage< GtPlusRecon2DTCloudPackageCPFL > * m2)
+{
+    // because the parameter configuration will not be sent, we need to call process_config explicitly
+    if ( !process_config_called_ )
+    {
+        GADGET_CHECK_RETURN( (this->process_config(m1)==0), GADGET_FAIL);
+        process_config_called_ = true;
+
+        if ( CloudComputing_ )
+        {
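+            // prefer the node list from the cloud node file; if it cannot be read, keep the nodes already read from the gadget parameters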
+            bool parseSuccess = this->parseGTCloudNodeFile(cloud_node_file_, gt_cloud_);
+            if ( parseSuccess )
+            {
+                CloudComputing_ = true;
+                CloudSize_ = (int)gt_cloud_.size();
+
+                if ( CloudSize_ == 0 )
+                {
+                    CloudComputing_ = false;
+                    GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusReconJob2DTGadgetCloud : cannot find algorithm nodes ... ");
+                }
+            }
+        }
+    }
+    GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusReconJob2DTGadgetCloud::process(...) starts ... ");
+
+    int* jobID = m1->getObjectPtr();
+    GDEBUG_CONDITION_STREAM(verboseMode_, "--> arriving job : " << *jobID << " ... ");
+
+    GtPlusRecon2DTCloudPackageCPFL* job = m2->getObjectPtr();
+
+    boost::shared_ptr< std::vector<size_t> > dims = job->kspace.get_dimensions();
+
+    GDEBUG_CONDITION_STREAM(verboseMode_, "job array size : [Ro E1 Cha Slice E2 Con Phase Rep Set Seg] = [" 
+        << (*dims)[0] << " " << (*dims)[1] << " " << (*dims)[2] << " " << (*dims)[3] << " " << (*dims)[4] 
+        << " " << (*dims)[5] << " " << (*dims)[6] << " " << (*dims)[7] << " " << (*dims)[8] << " " << (*dims)[9] << "]");
+
+    GtPlusRecon2DTPara& para = job->para;
+
+    // ---------------------------------------------------------
+    // set the work flow
+    // ---------------------------------------------------------
+    workflow_.reconSizeRO_ = para.reconSizeRO_;
+    workflow_.reconSizeE1_ = para.reconSizeE1_;
+    workflow_.reconSizeE2_ = para.reconSizeE2_;
+    workflow_.encodingFOV_RO_ = para.encodingFOV_RO_;
+    workflow_.encodingFOV_E1_ = para.encodingFOV_E1_;
+    workflow_.encodingFOV_E2_ = para.encodingFOV_E2_;
+    workflow_.reconFOV_RO_ = para.reconFOV_RO_;
+    workflow_.reconFOV_E1_ = para.reconFOV_E1_;
+    workflow_.reconFOV_E2_ = para.reconFOV_E2_;
+
+    // workflow_.dataDimStartingIndexes_ = workOrder->dataDimStartingIndexes_;
+    workflow_.dim4th_ = para.dim_4th_;
+    workflow_.dim5th_ = para.dim_5th_;
+    workflow_.WorkOrderShareDim_ = para.workOrder_ShareDim_;
+    workflow_.performTiming_ = performTiming_;
+
+    // ---------------------------------------------------------
+    // set work order
+    // ---------------------------------------------------------
+    WorkOrder2DTType workOrder;
+
+    workOrder.copyFromPara(para.workOrderPara_);
+
+    workOrder.job_split_by_S_ = job_split_by_S_;
+    workOrder.job_num_of_N_ = job_num_of_N_;
+    workOrder.job_max_Megabytes_ = job_max_Megabytes_;
+    workOrder.job_overlap_ = job_overlap_;
+
+    workOrder.CloudComputing_ = CloudComputing_;
+    workOrder.CloudSize_ = CloudSize_;
+    workOrder.gt_cloud_ = gt_cloud_;
+
+    if ( workOrder.acceFactorE1_>1 && workOrder.CalibMode_==Gadgetron::ISMRMRD_interleaved )
+    {
+        Gadgetron::fillSampledLinesUpTo11DArray(job->kspace, workOrder.data_, job->timeStamp);
+    }
+    else
+    {
+        workOrder.data_ = job->kspace;
+    }
+
+    workOrder.time_stamp_ = job->timeStamp;
+    workOrder.physio_time_stamp_ = job->physioTimeStamp;
+    workOrder.ref_ = job->ref;
+
+    // ---------------------------------------------------------
+    // set the worker
+    // ---------------------------------------------------------
+    worker_grappa_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_grappa_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_noacceleration_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_noacceleration_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_spirit_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_spirit_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_spirit_L1_ncg_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_spirit_L1_ncg_.debugFolder_ = debugFolder_fullPath_;
+
+    if ( !debugFolder_fullPath_.empty() ) workflow_.debugFolder_ = debugFolder_fullPath_;
+
+    // perform the recon
+    if ( performTiming_ ) { gt_timer1_.start("Recon 2DT workorder on cloud node ... "); }
+
+    GADGET_CHECK_RETURN(this->generateKSpaceFilter(workOrder), GADGET_FAIL);
+
+    workOrder.duplicate(workOrder_recon_);
+    setWorkOrder2DTParameters(para, &workOrder_recon_);
+
+    workflow_.workOrder_ = &workOrder_recon_;
+
+    if ( verboseMode_ )
+    {
+        workflow_.workOrder_->print(std::cout);
+    }
+
+    workflow_.setDataArray(workOrder.data_, workOrder.time_stamp_, workOrder.physio_time_stamp_);
+
+    if ( workOrder.ref_.get_number_of_elements() > 0 )
+    {
+        workflow_.setRefArray(workOrder.ref_);
+    }
+    else if ( para.workOrderPara_.CalibMode_==Gadgetron::ISMRMRD_interleaved )
+    {
+        workOrder.ref_ = workOrder.data_;
+        workflow_.setRefArray(workOrder.ref_);
+    }
+
+    // select the worker for the workflow based on the acceleration factor and recon algorithm
+    if ( workOrder.acceFactorE1_ > 1 )
+    {
+        if ( para.workOrderPara_.recon_algorithm_ == Gadgetron::ISMRMRD_SPIRIT )
+        {
+            workflow_.worker_ = &worker_spirit_;
+        }
+        else if ( para.workOrderPara_.recon_algorithm_ == Gadgetron::ISMRMRD_L1SPIRIT )
+        {
+            workflow_.worker_ = &worker_spirit_L1_ncg_;
+        }
+        else
+        {
+            workflow_.worker_ = &worker_grappa_;
+        }
+    }
+    else
+    {
+        workflow_.worker_ = &worker_noacceleration_;
+    }
+
+    bool succeed = true;
+    succeed = workflow_.preProcessing();
+    if ( succeed )
+    {
+        succeed = workflow_.recon();
+        if ( succeed )
+        {
+            succeed = workflow_.postProcessing();
+        }
+    }
+
+    if ( !succeed )
+    {
+        GERROR_STREAM("GtPlusReconJob2DTGadgetCloud::process(...) failed... ");
+    }
+
+    if ( performTiming_ ) { gt_timer1_.stop(); }
+
+    if ( !debugFolder2_fullPath_.empty() )
+    {
+        std::ostringstream ostr;
+        ostr << "Node_Recon2DT_" << *jobID;
+
+        hoNDArray< std::complex<float> > res = workflow_.res_;
+        res.squeeze();
+        if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder2_fullPath_+ostr.str()); }
+
+        if ( workflow_.res_second_.get_number_of_elements() > 0 )
+        {
+            hoNDArray< std::complex<float> > res = workflow_.res_second_;
+            res.squeeze();
+
+            std::ostringstream ostr;
+            ostr << "Node_Recon2DT_second_" << *jobID;
+
+            if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder2_fullPath_+ostr.str()); }
+        }
+    }
+
+    // clean the kspace, time stamps and reference data to free memory
+    job->kspace.clear();
+    job->timeStamp.clear();
+    job->physioTimeStamp.clear();
+    job->ref.clear();
+
+    if ( succeed )
+    {
+        job->complexIm = workflow_.res_;
+        job->complexImSecond = workflow_.res_second_;
+        job->resTimeStampSecond = workflow_.res_time_stamp_second_;
+        job->resPhysioTimeStampSecond = workflow_.res_physio_time_stamp_second_;
+    }
+    else
+    {
+        job->complexIm.clear();
+        job->res.clear();
+
+        job->complexImSecond.clear();
+        job->resTimeStampSecond.clear();
+        job->resPhysioTimeStampSecond.clear();
+    }
+
+    // send out the results
+    GADGET_CHECK_RETURN(this->sendOutJob(*jobID, job), GADGET_FAIL);
+
+    GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusReconJob2DTGadgetCloud::process(...) ends ... ");
+
+    // reset the status
+    workflow_.data_ = NULL;
+    workflow_.time_stamp_ = NULL;
+    workflow_.physio_time_stamp_ = NULL;
+    workflow_.ref_ = NULL;
+    workflow_.noise_ = NULL;
+    workflow_.workOrder_ = NULL;
+    // Gadgetron::clear(&workflow_.res_);
+
+    m1->release();
+
+    if ( this->verboseMode_ )
+    {
+        std::string procTime;
+        gtPlus_util_.getCurrentMoment(procTime);
+
+        GDEBUG_STREAM("* ============================================================================== *");
+        GDEBUG_STREAM("---> MR recon 2DT gadget cloud, Currnt processing time : " << procTime << " <---");
+        GDEBUG_STREAM("* ============================================================================== *");
+    }
+
+    return GADGET_OK;
+}
+
+bool GtPlusReconJob2DTGadgetCloud::
+sendOutJob(int jobID, GtPlusRecon2DTCloudPackageCPFL* job)
+{
+    try
+    {
+      GDEBUG("GtPlusReconJob2DTGadgetCloud sendOutJob ...\n");
+
+        if (!this->controller_)
+        {
+	  GERROR("Cannot return result to controller, no controller set\n");
+	  return false;
+        }
+
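+        // build the chained message: [GadgetMessageIdentifier] -> [job ID] -> [job package]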
+        GadgetContainerMessage<GadgetMessageIdentifier>* mb =
+            new GadgetContainerMessage<GadgetMessageIdentifier>();
+
+        mb->getObjectPtr()->id = GADGET_MESSAGE_GADGETCLOUD_JOB;
+
+        GadgetContainerMessage<int>* m1 = new GadgetContainerMessage<int>();
+        *(m1->getObjectPtr()) = jobID;
+
+        GadgetContainerMessage<GtPlusRecon2DTCloudPackageCPFL>* m2 = new GadgetContainerMessage<GtPlusRecon2DTCloudPackageCPFL>();
+
+        *(m2->getObjectPtr()) = *job;
+
+        m1->cont(m2);
+        mb->cont(m1);
+
+        int ret =  this->controller_->output_ready(mb);
+        if (ret < 0)
+        {
+            GDEBUG("Failed to return GtPlusReconJob2DTGadgetCloud job massage to controller\n");
+            return false;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GtPlusReconJob2DTGadgetCloud::sendOutJob(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusReconJob2DTGadgetCloud::
+generateKSpaceFilter(WorkOrderType& workOrder)
+{
+    try
+    {
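+        // array dimensions follow [Ro E1 Cha Slice E2 Con Phase Rep Set Seg], so E2 is dimension 4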
+        size_t RO = workOrder.data_.get_size(0);
+        size_t E1 = workOrder.data_.get_size(1);
+        size_t E2 = workOrder.data_.get_size(4);
+
+        size_t RO_ref = workOrder.ref_.get_size(0);
+        size_t E1_ref = workOrder.ref_.get_size(1);
+        size_t E2_ref = workOrder.ref_.get_size(4);
+
+        if ( workOrder.CalibMode_ == Gadgetron::ISMRMRD_interleaved )
+        {
+            RO_ref = RO;
+            E1_ref = E1;
+            E2_ref = E2;
+        }
+
+        // image data filter
+        if ( RO>1 && filterRO_type_ != ISMRMRD_FILTER_NONE )
+        {
+            workOrder.filterRO_.create(RO);
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilter(RO, workOrder.start_RO_, workOrder.end_RO_, workOrder.filterRO_, filterRO_type_, filterRO_sigma_, (size_t)std::ceil(filterRO_width_*RO)));
+            if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(workOrder.filterRO_, debugFolder_fullPath_+"filterRO"); }
+        }
+
+        if ( E1>1 && filterE1_type_ != ISMRMRD_FILTER_NONE )
+        {
+            workOrder.filterE1_.create(E1);
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilter(E1, workOrder.start_E1_, workOrder.end_E1_, workOrder.filterE1_, filterE1_type_, filterE1_sigma_, (size_t)std::ceil(filterE1_width_*E1)));
+            if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(workOrder.filterE1_, debugFolder_fullPath_+"filterE1"); }
+        }
+
+        if ( E2>1 && filterE2_type_ != ISMRMRD_FILTER_NONE )
+        {
+            workOrder.filterE2_.create(E2);
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilter(E2, workOrder.start_E2_, workOrder.end_E2_, workOrder.filterE2_, filterE2_type_, filterE2_sigma_, (size_t)std::ceil(filterE2_width_*E2)));
+            if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(workOrder.filterE2_, debugFolder_fullPath_+"filterE2"); }
+        }
+
+        // ref data filter
+        if ( workOrder.ref_.get_number_of_elements() > 0 )
+        {
+            size_t startRO(0), endRO(0), startE1(0), endE1(0), startE2(0), endE2(0);
+            if ( E2_ref == 1 )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_complex_.detectSampledRegion2D(workOrder.ref_, startRO, endRO, startE1, endE1));
+            }
+            else
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_complex_.detectSampledRegion3D(workOrder.ref_, startRO, endRO, startE1, endE1, startE2, endE2));
+            }
+
+            if ( (workOrder.CalibMode_ == ISMRMRD_interleaved) || (workOrder.CalibMode_ == ISMRMRD_embedded) )
+            {
+                // use the image data sample range
+                startRO = workOrder.start_RO_; if ( startRO < 0 ) startRO=0;
+                endRO = workOrder.end_RO_; if ( endRO < 0 ) endRO = RO_ref-1;
+            }
+
+            if ( RO_ref > 1 && filterRO_ref_type_ != ISMRMRD_FILTER_NONE )
+            {
+                workOrder.filterRO_ref_.create(RO_ref);
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(RO_ref, startRO, endRO, workOrder.filterRO_ref_, filterRO_ref_type_, filterRO_ref_sigma_, (size_t)std::ceil(filterRO_ref_width_*RO_ref)));
+                if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(workOrder.filterRO_ref_, debugFolder_fullPath_+"filterRO_ref"); }
+            }
+
+            if ( (workOrder.CalibMode_ == ISMRMRD_separate) || (workOrder.CalibMode_ == ISMRMRD_external) )
+            {
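+                // for separate/external calibration, the ref filter only spans the sampled length (end-start+1)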
+                if ( E1_ref > 1 && filterE1_ref_type_ != ISMRMRD_FILTER_NONE )
+                {
+                    size_t len = endE1-startE1+1;
+                    workOrder.filterE1_ref_.create(len);
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilter(len, 0, len-1, workOrder.filterE1_ref_, filterE1_ref_type_, filterE1_ref_sigma_, (size_t)std::ceil(filterE1_ref_width_*len)));
+                    if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(workOrder.filterE1_ref_, debugFolder_fullPath_+"filterE1_ref"); }
+                }
+
+                if ( E2_ref > 1 && filterE2_ref_type_ != ISMRMRD_FILTER_NONE )
+                {
+                    size_t len = endE2-startE2+1;
+                    workOrder.filterE2_ref_.create(len);
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilter(len, 0, len-1, workOrder.filterE2_ref_, filterE2_ref_type_, filterE2_ref_sigma_, (size_t)std::ceil(filterE2_ref_width_*len)));
+                    if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(workOrder.filterE2_ref_, debugFolder_fullPath_+"filterE2_ref"); }
+                }
+            }
+            else
+            {
+                // for interleaved and embedded modes, this makes sure the kspace filter is applied to the correct lines
+                if ( E1_ref > 1 && filterE1_ref_type_ != ISMRMRD_FILTER_NONE )
+                {
+                    size_t len = E1_ref;
+                    workOrder.filterE1_ref_.create(len);
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(len, startE1, endE1, workOrder.filterE1_ref_, filterE1_ref_type_, filterE1_ref_sigma_, (size_t)std::ceil(filterE1_ref_width_*len)));
+                    if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(workOrder.filterE1_ref_, debugFolder_fullPath_+"filterE1_ref"); }
+                }
+
+                if ( E2_ref > 1 && filterE2_ref_type_ != ISMRMRD_FILTER_NONE )
+                {
+                    size_t len = E2_ref;
+                    workOrder.filterE2_ref_.create(len);
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(len, startE2, endE2, workOrder.filterE2_ref_, filterE2_ref_type_, filterE2_ref_sigma_, (size_t)std::ceil(filterE2_ref_width_*len)));
+                    if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(workOrder.filterE2_ref_, debugFolder_fullPath_+"filterE2_ref"); }
+                }
+            }
+        }
+
+        // partial fourier handling filter
+        if ( RO>1 && workOrder.start_RO_>=0 && workOrder.end_RO_>0 )
+        {
+            workOrder.filterRO_partialfourier_.create(RO);
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateAsymmetricFilter(RO, workOrder.start_RO_, workOrder.end_RO_, workOrder.filterRO_partialfourier_, filterRO_pf_type_, (size_t)std::ceil(filterRO_pf_width_*RO), filterRO_pf_densityComp_));
+            if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(workOrder.filterRO_partialfourier_, debugFolder_fullPath_+"filterRO_partialfourier"); }
+        }
+
+        if ( E1>1 && workOrder.start_E1_>=0 && workOrder.end_E1_>0 )
+        {
+            workOrder.filterE1_partialfourier_.create(E1);
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateAsymmetricFilter(E1, workOrder.start_E1_, workOrder.end_E1_, workOrder.filterE1_partialfourier_, filterE1_pf_type_, (size_t)std::ceil(filterE1_pf_width_*E1), filterE1_pf_densityComp_));
+            if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(workOrder.filterE1_partialfourier_, debugFolder_fullPath_+"filterE1_partialfourier"); }
+        }
+
+        if ( E2>1 && workOrder.start_E2_>=0 && workOrder.end_E2_>0 )
+        {
+            workOrder.filterE2_partialfourier_.create(E2);
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateAsymmetricFilter(E2, workOrder.start_E2_, workOrder.end_E2_, workOrder.filterE2_partialfourier_, filterE2_pf_type_, (size_t)std::ceil(filterE2_pf_width_*E2), filterE2_pf_densityComp_));
+            if ( !debugFolder_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(workOrder.filterE2_partialfourier_, debugFolder_fullPath_+"filterE2_partialfourier"); }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GtPlusReconJob2DTGadgetCloud::generateKSpaceFilter(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+GADGET_FACTORY_DECLARE(GtPlusReconJob2DTGadgetCloud)
+
+}
diff --git a/gadgets/gtPlus/GtPlusReconJob2DTGadgetCloud.h b/gadgets/gtPlus/GtPlusReconJob2DTGadgetCloud.h
new file mode 100644
index 0000000..e6dc06e
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusReconJob2DTGadgetCloud.h
@@ -0,0 +1,181 @@
+/** \file   GtPlusReconJob2DTGadgetCloud.h
+    \brief  This gadget serves as the first layer gadget for the dual layer cloud.
+
+            Ref to: 
+
+            Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen. 
+            Distributed MRI Reconstruction using Gadgetron based Cloud Computing. 
+            Magnetic Resonance in Medicine, doi: 10.1002/mrm.25213.
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include <complex>
+#include "GtPlusGadgetExport.h"
+#include "Gadget.h"
+#include "GadgetStreamController.h"
+#include "GadgetCloudJobMessageReadWrite.h"
+#include "GadgetronTimer.h"
+
+#include "hoNDArray_utils.h"
+
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorkOrder.h"
+#include "gtPlusISMRMRDReconWorkFlowCartesian2DT.h"
+#include "gtPlusISMRMRDReconWorker2DTGRAPPA.h"
+#include "gtPlusISMRMRDReconWorker2DTNoAcceleration.h"
+#include "gtPlusISMRMRDReconWorker2DTSPIRIT.h"
+#include "gtPlusISMRMRDReconWorker2DTL1SPIRITNCG.h"
+
+#include "GtPlusRecon2DTCloudPackage.h"
+#include "GtPlusReconGadgetUtil.h"
+
+#ifdef USE_OMP
+    #include "omp.h"
+#endif // USE_OMP
+
+namespace Gadgetron
+{
+
+class EXPORTGTPLUSGADGET GtPlusReconJob2DTGadgetCloud : public Gadgetron::Gadget2< int, GtPlusRecon2DTCloudPackageCPFL >
+{
+public:
+    GADGET_DECLARE(GtPlusReconJob2DTGadgetCloud);
+
+    typedef std::complex<float> ValueType;
+    typedef Gadgetron::Gadget2< int, GtPlusRecon2DTCloudPackageCPFL > BaseClass;
+
+    typedef Gadgetron::gtPlus::gtPlusReconWorkOrder2DT<ValueType> WorkOrderType;
+    typedef WorkOrderType WorkOrder2DTType;
+
+    GtPlusReconJob2DTGadgetCloud();
+    ~GtPlusReconJob2DTGadgetCloud();
+
+    // kspace filter
+    Gadgetron::ISMRMRDKSPACEFILTER filterRO_type_;
+    double filterRO_sigma_;
+    double filterRO_width_;
+
+    Gadgetron::ISMRMRDKSPACEFILTER filterE1_type_;
+    double filterE1_sigma_;
+    double filterE1_width_;
+
+    Gadgetron::ISMRMRDKSPACEFILTER filterE2_type_;
+    double filterE2_sigma_;
+    double filterE2_width_;
+
+    // ref data filter
+    Gadgetron::ISMRMRDKSPACEFILTER filterRO_ref_type_;
+    double filterRO_ref_sigma_;
+    double filterRO_ref_width_;
+
+    Gadgetron::ISMRMRDKSPACEFILTER filterE1_ref_type_;
+    double filterE1_ref_sigma_;
+    double filterE1_ref_width_;
+
+    Gadgetron::ISMRMRDKSPACEFILTER filterE2_ref_type_;
+    double filterE2_ref_sigma_;
+    double filterE2_ref_width_;
+
+    // partial fourier filter
+    Gadgetron::ISMRMRDKSPACEFILTER filterRO_pf_type_;
+    double filterRO_pf_sigma_;
+    double filterRO_pf_width_;
+    bool filterRO_pf_densityComp_;
+
+    Gadgetron::ISMRMRDKSPACEFILTER filterE1_pf_type_;
+    double filterE1_pf_sigma_;
+    double filterE1_pf_width_;
+    bool filterE1_pf_densityComp_;
+
+    Gadgetron::ISMRMRDKSPACEFILTER filterE2_pf_type_;
+    double filterE2_pf_sigma_;
+    double filterE2_pf_width_;
+    bool filterE2_pf_densityComp_;
+
+    bool job_split_by_S_;
+    size_t job_num_of_N_;
+    size_t job_max_Megabytes_;
+    size_t job_overlap_;
+
+    /// cloud related definition
+    bool CloudComputing_;
+    unsigned int CloudSize_;
+
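+    // each cloud node is described by <IP/hostname, port, xml configuration name, computing power index>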
+    typedef boost::tuple<std::string, std::string, std::string, unsigned int> CloudNodeType;
+    typedef std::vector<CloudNodeType> CloudType;
+
+    CloudType gt_cloud_;
+
+    // debug folder
+    std::string debugFolder_;
+    std::string debugFolder_fullPath_;
+
+    // debug folder 2
+    std::string debugFolder2_;
+    std::string debugFolder2_fullPath_;
+
+    // cloud node file
+    std::string cloud_node_file_;
+
+    // whether to perform timing
+    bool performTiming_;
+
+protected:
+
+    // --------------------------------------------------
+    // functional functions
+    // --------------------------------------------------
+
+    // default interface function
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(Gadgetron::GadgetContainerMessage< int >* m1, Gadgetron::GadgetContainerMessage< GtPlusRecon2DTCloudPackageCPFL > * m2);
+
+    bool parseGTCloudNodeFile(const std::string& filename, CloudType& gtCloud);
+
+    // process config is only to be called once
+    bool process_config_called_;
+
+    // read in parameters
+    virtual bool readParameters();
+
+    // send the completed job
+    bool sendOutJob(int jobID, GtPlusRecon2DTCloudPackageCPFL* job);
+
+    // set 2DT specific work order parameters
+    bool setWorkOrder2DTParameters(GtPlusRecon2DTPara& para, WorkOrder2DTType* workOrder);
+
+    // compute the kspace filter
+    bool generateKSpaceFilter(WorkOrderType& workOrder);
+
+    // work flow
+    Gadgetron::gtPlus::gtPlusISMRMRDReconWorkFlowCartesian2DT<ValueType> workflow_;
+
+    // worker
+    Gadgetron::gtPlus::gtPlusReconWorker2DTGRAPPA<ValueType> worker_grappa_;
+    Gadgetron::gtPlus::gtPlusReconWorker2DTNoAcceleration<ValueType> worker_noacceleration_;
+    Gadgetron::gtPlus::gtPlusReconWorker2DTSPIRIT<ValueType> worker_spirit_;
+    Gadgetron::gtPlus::gtPlusReconWorker2DTL1SPIRITNCG<ValueType> worker_spirit_L1_ncg_;
+
+    // workOrder for recon
+    WorkOrder2DTType workOrder_recon_;
+
+    Gadgetron::gtPlus::gtPlusISMRMRDReconUtil<ValueType> gtPlus_util_;
+    Gadgetron::gtPlus::gtPlusISMRMRDReconUtilComplex<ValueType> gtPlus_util_complex_;
+
+    // clock for timing
+    Gadgetron::GadgetronTimer gt_timer1_;
+    Gadgetron::GadgetronTimer gt_timer2_;
+    Gadgetron::GadgetronTimer gt_timer3_;
+
+    // exporter
+    Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+    // in verbose mode, more info is printed out
+    bool verboseMode_;
+};
+
+}
diff --git a/gadgets/gtPlus/GtPlusReconJob3DTGadget.cpp b/gadgets/gtPlus/GtPlusReconJob3DTGadget.cpp
new file mode 100644
index 0000000..478ae68
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusReconJob3DTGadget.cpp
@@ -0,0 +1,229 @@
+
+#include "GtPlusReconJob3DTGadget.h"
+#include "GtPlusGadgetOpenMP.h"
+
+using namespace Gadgetron::gtPlus;
+
+namespace Gadgetron
+{
+
+GtPlusReconJob3DTGadget::GtPlusReconJob3DTGadget()
+{
+    debugFolder_ = "DebugOutput";
+
+    performTiming_ = true;
+
+    verboseMode_ = false;
+
+    gt_timer1_.set_timing_in_destruction(false);
+    gt_timer2_.set_timing_in_destruction(false);
+    gt_timer3_.set_timing_in_destruction(false);
+
+    process_config_called_ = false;
+
+    Gadgetron::prepOpenMP();
+}
+
+GtPlusReconJob3DTGadget::~GtPlusReconJob3DTGadget()
+{
+
+}
+
+bool GtPlusReconJob3DTGadget::readParameters()
+{
+    try
+    {
+        GDEBUG_CONDITION_STREAM(verboseMode_, "------> GtPlusReconJob3DTGadget parameters <------");
+
+        boost::shared_ptr<std::string> str = this->get_string_value("debugFolder");
+        debugFolder_ = *str;
+        GDEBUG_CONDITION_STREAM(verboseMode_, "debugFolder_ is " << debugFolder_);
+
+        str = this->get_string_value("debugFolder2");
+        debugFolder2_ = *str;
+        GDEBUG_CONDITION_STREAM(verboseMode_, "debugFolder2_ is " << debugFolder2_);
+
+        performTiming_ = this->get_bool_value("performTiming");
+        GDEBUG_CONDITION_STREAM(verboseMode_, "performTiming_ is " << performTiming_);
+
+        GDEBUG_CONDITION_STREAM(verboseMode_, "-----------------------------------------------");
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GtPlusReconJob3DTGadget::readParameters() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+int GtPlusReconJob3DTGadget::process_config(ACE_Message_Block* mb)
+{
+    // [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+    //   0  1  2   3    4   5    6     7  8   9
+
+    verboseMode_ = this->get_bool_value("verboseMode");
+
+    // read in parameters from the xml
+    GADGET_CHECK_RETURN(this->readParameters(), GADGET_FAIL);
+
+    // generate the destination folder
+    if ( !debugFolder_.empty() )
+    {
+        getDebugFolderPath(debugFolder_, debugFolder_fullPath_, verboseMode_);
+    }
+    else
+    {
+        GDEBUG_STREAM("GtPlusRecon, debugFolder is not set ...");
+    }
+
+    if ( !debugFolder2_.empty() )
+    {
+        getDebugFolderPath(debugFolder2_, debugFolder2_fullPath_, verboseMode_);
+    }
+    else
+    {
+        GDEBUG_STREAM("GtPlusRecon, debugFolder2 is not set ...");
+    }
+
+    return GADGET_OK;
+}
+
+int GtPlusReconJob3DTGadget::process(Gadgetron::GadgetContainerMessage< int >* m1, Gadgetron::GadgetContainerMessage< GtPlusReconJobTypeCPFL > * m2)
+{
+    // because the parameter configuration will not be sent, we need to call process_config explicitly
+    if ( !process_config_called_ )
+    {
+        GADGET_CHECK_RETURN( (this->process_config(m1)==0), GADGET_FAIL);
+        process_config_called_ = true;
+    }
+    GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusReconJob3DTGadget::process(...) starts ... ");
+
+    int* jobID = m1->getObjectPtr();
+    GDEBUG_CONDITION_STREAM(verboseMode_, "--> arriving job : " << *jobID << " ... ");
+
+    GtPlusReconJobTypeCPFL* job = m2->getObjectPtr();
+    GDEBUG_CONDITION_STREAM(verboseMode_, "    job array size : [ " << job->kspace.get_size(0) << " " 
+                                                                 << job->kspace.get_size(1) << " " 
+                                                                 << job->kspace.get_size(2) << " " 
+                                                                 << job->kspace.get_size(3) << " ] ... ");
+
+    // set the worker
+    worker_grappa_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_grappa_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_noacceleration_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_noacceleration_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_spirit_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_spirit_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_spirit_L1_ncg_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_spirit_L1_ncg_.debugFolder_ = debugFolder_fullPath_;
+
+    if ( verboseMode_ )
+    {
+        job->workOrder2DT.print(std::cout);
+    }
+
+    bool succeed = true;
+    if ( performTiming_ ) { gt_timer1_.start("Recon 2DT job ... "); }
+
+    succeed = worker_spirit_L1_ncg_.performUnwarppingImpl(*job);
+
+    if ( performTiming_ ) { gt_timer1_.stop(); }
+
+    // export the results
+    if ( !debugFolder2_fullPath_.empty() )
+    {
+        std::ostringstream ostr;
+        ostr << "ReconJob2DT_ID" << *jobID;
+
+        hoNDArray< std::complex<float> > res = job->res;
+        res.squeeze();
+        if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder2_fullPath_+ostr.str()); }
+
+        std::ostringstream ostr2;
+        ostr2 << "Job2DT_kspace_ID" << *jobID;
+        if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(job->kspace, debugFolder2_fullPath_+ostr2.str()); }
+
+        std::ostringstream ostr3;
+        ostr3 << "Job2DT_ker_ID" << *jobID;
+        if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(job->ker, debugFolder2_fullPath_+ostr3.str()); }
+
+        if ( job->workOrder2DT.coilMap_->get_number_of_elements() > 0 )
+        {
+            std::ostringstream ostr4;
+            ostr4 << "Job2DT_coilmap_ID" << *jobID;
+            if ( !debugFolder2_fullPath_.empty() ) { gt_exporter_.exportArrayComplex(*job->workOrder2DT.coilMap_, debugFolder2_fullPath_+ostr4.str()); }
+        }
+    }
+
+    // clean the kspace and ker and coil map
+    job->kspace.clear();
+    job->ker.clear();
+    if ( job->workOrder2DT.coilMap_ ) job->workOrder2DT.coilMap_->clear();
+
+    if ( !succeed )
+    {
+        job->complexIm.clear();
+        job->res.clear();
+    }
+
+    // send out the results
+    GADGET_CHECK_RETURN(this->sendOutJob(*jobID, job), GADGET_FAIL);
+
+    GDEBUG_CONDITION_STREAM(verboseMode_, "GtPlusReconJob3DTGadget::process(...) ends ... ");
+
+    m1->release();
+
+    return GADGET_OK;
+}
+
+bool GtPlusReconJob3DTGadget::
+sendOutJob(int jobID, GtPlusReconJobTypeCPFL* job)
+{
+    try
+    {
+      GDEBUG("GtPlusReconJob3DTGadget sendOutJob ...\n");
+
+        if (!this->controller_)
+        {
+	  GERROR("Cannot return result to controller, no controller set\n");
+	  return false;
+        }
+
+        GadgetContainerMessage<GadgetMessageIdentifier>* mb =
+            new GadgetContainerMessage<GadgetMessageIdentifier>();
+
+        mb->getObjectPtr()->id = GADGET_MESSAGE_CLOUD_JOB;
+
+        GadgetContainerMessage<int>* m1 = new GadgetContainerMessage<int>();
+        *(m1->getObjectPtr()) = jobID;
+
+        GadgetContainerMessage<GtPlusReconJobTypeCPFL>* m2 = new GadgetContainerMessage<GtPlusReconJobTypeCPFL>();
+
+        *(m2->getObjectPtr()) = *job;
+
+        m1->cont(m2);
+        mb->cont(m1);
+
+        int ret =  this->controller_->output_ready(mb);
+        if (ret < 0)
+        {
+            GDEBUG("Failed to return GtPlusReconJob3DTGadget job massage to controller\n");
+            return false;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GtPlusReconJob3DTGadget::sendOutJob(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+GADGET_FACTORY_DECLARE(GtPlusReconJob3DTGadget)
+
+}
diff --git a/gadgets/gtPlus/GtPlusReconJob3DTGadget.h b/gadgets/gtPlus/GtPlusReconJob3DTGadget.h
new file mode 100644
index 0000000..fb83044
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusReconJob3DTGadget.h
@@ -0,0 +1,103 @@
+/** \file   GtPlusReconJob3DTGadget.h
+    \brief  This gadget serves as the working gadget for the single layer cloud for 3DT reconstruction.
+
+            Ref to: 
+
+            Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen. 
+            Distributed MRI Reconstruction using Gadgetron based Cloud Computing. 
+            Magnetic Resonance in Medicine, doi: 10.1002/mrm.25213.
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include <complex>
+#include "GtPlusGadgetExport.h"
+#include "Gadget.h"
+#include "GadgetStreamController.h"
+#include "GadgetCloudJobMessageReadWrite.h"
+#include "GadgetronTimer.h"
+
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorkOrder.h"
+#include "gtPlusISMRMRDReconWorkFlowCartesian3DT.h"
+#include "gtPlusISMRMRDReconWorker3DTGRAPPA.h"
+#include "gtPlusISMRMRDReconWorker3DTNoAcceleration.h"
+#include "gtPlusISMRMRDReconWorker3DTSPIRIT.h"
+#include "gtPlusISMRMRDReconWorker3DTL1SPIRITNCG.h"
+#include "GtPlusReconGadgetUtil.h"
+
+#ifdef USE_OMP
+    #include "omp.h"
+#endif // USE_OMP
+
+namespace Gadgetron
+{
+
+class EXPORTGTPLUSGADGET GtPlusReconJob3DTGadget : public Gadgetron::Gadget2< int, GtPlusReconJobTypeCPFL >
+{
+public:
+    GADGET_DECLARE(GtPlusReconJob3DTGadget);
+
+    typedef std::complex<float> ValueType;
+
+    typedef Gadgetron::gtPlus::gtPlusReconWorkOrder<ValueType> WorkOrderType;
+
+    typedef Gadget2< int, GtPlusReconJobTypeCPFL > BaseClass;
+
+    GtPlusReconJob3DTGadget();
+    ~GtPlusReconJob3DTGadget();
+
+    // debug folder
+    std::string debugFolder_;
+    std::string debugFolder_fullPath_;
+
+    std::string debugFolder2_;
+    std::string debugFolder2_fullPath_;
+
+    // whether to perform timing
+    bool performTiming_;
+
+protected:
+
+    // --------------------------------------------------
+    // functional functions
+    // --------------------------------------------------
+
+    // default interface function
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(Gadgetron::GadgetContainerMessage< int >* m1, Gadgetron::GadgetContainerMessage< GtPlusReconJobTypeCPFL > * m2);
+
+    // process config is only to be called once
+    bool process_config_called_;
+
+    // read in parameters
+    virtual bool readParameters();
+
+    // send the completed job
+    bool sendOutJob(int jobID, GtPlusReconJobTypeCPFL* job);
+
+    // worker
+    Gadgetron::gtPlus::gtPlusReconWorker3DTGRAPPA<ValueType> worker_grappa_;
+    Gadgetron::gtPlus::gtPlusReconWorker3DTNoAcceleration<ValueType> worker_noacceleration_;
+    Gadgetron::gtPlus::gtPlusReconWorker3DTSPIRIT<ValueType> worker_spirit_;
+    Gadgetron::gtPlus::gtPlusReconWorker3DTL1SPIRITNCG<ValueType> worker_spirit_L1_ncg_;
+
+    Gadgetron::gtPlus::gtPlusISMRMRDReconUtil<ValueType> gtPlus_util_;
+    Gadgetron::gtPlus::gtPlusISMRMRDReconUtilComplex<ValueType> gtPlus_util_complex_;
+
+    // clock for timing
+    Gadgetron::GadgetronTimer gt_timer1_;
+    Gadgetron::GadgetronTimer gt_timer2_;
+    Gadgetron::GadgetronTimer gt_timer3_;
+
+    // exporter
+    Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+    // in verbose mode, more info is printed out
+    bool verboseMode_;
+};
+
+}
diff --git a/gadgets/gtPlus/config/GT_2DT_Cartesian.xml b/gadgets/gtPlus/config/GT_2DT_Cartesian.xml
new file mode 100644
index 0000000..1cf1e73
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_Cartesian.xml
@@ -0,0 +1,798 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for general 2D or 2D+T Cartesian reconstruction
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!--Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_NONE</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>8</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
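+        <!-- values handled by the recon worker selection include e.g. ISMRMRD_GRAPPA, ISMRMRD_SPIRIT and ISMRMRD_L1SPIRIT -->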
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>gfactor_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>wrap_around_map_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_solve_symmetric</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>90</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>0.0015</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.0025</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>10</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.0001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_2D_scale_per_chunk</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
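+        <!-- Illustrative sketch (not the upstream default): selecting the POCS handling
+             instead would look like
+                 <property>
+                     <name>partialFourier_algo</name>
+                     <value>ISMRMRD_PF_POCS</value>
+                 </property>
+             in which case the partialFourier_POCS_* properties below would apply. -->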
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_2DT.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>32</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>10240</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>2</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation
+             Each cloud node X is defined by properties named CloudNodeX_IP/Port/XMLConfiguration etc.
+             (an illustrative sketch for a second node follows node 0 below)
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GtProg_2DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
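+        <!-- Illustrative sketch (not part of the upstream default, where CloudSize is 1):
+             a hypothetical second node would follow the same naming pattern, e.g.
+
+                 <property><name>CloudNode1_IP</name><value>gt-node-1</value></property>
+                 <property><name>CloudNode1_Port</name><value>9003</value></property>
+                 <property><name>CloudNode1_XMLConfiguration</name><value>GtProg_2DT_Cartesian_CloudNode.xml</value></property>
+                 <property><name>CloudNode1_ComputingPowerIndex</name><value>1</value></property>
+
+             with CloudSize raised to 2 accordingly. The host name gt-node-1 is a placeholder. -->
+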
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/gtPlus/config/GT_2DT_Cartesian_CloudNode.xml b/gadgets/gtPlus/config/GT_2DT_Cartesian_CloudNode.xml
new file mode 100644
index 0000000..d555c4d
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_Cartesian_CloudNode.xml
@@ -0,0 +1,77 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for 2DT cartesian reconstruction on the GtPlus Cloud.
+        This file configures a single gadget that performs the reconstruction of
+        2DT job packages.
+
+        Depending on the incoming algorithm parameters, both linear and non-linear
+        reconstruction can be performed.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1013</slot>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusCloudJobMessageReaderCPFL</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1013</slot>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusCloudJobMessageWriterCPFL</classname>
+    </writer>
+
+    <!--
+    Recon computation for 2DT/3DT cases; processes one job at a time.
+    A gtPlusReconJob2DT job consists of kspace, kernel and parameters.
+    kspace: [RO E1 CHA E2/PHS]
+    -->
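+    <!-- Illustrative example (all numbers hypothetical): a 2DT cine slice might arrive as
+         a kspace array of size [256 120 32 30], i.e. 256 readout points, 120 E1 phase
+         encoding lines, 32 receive channels and 30 phases. -->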
+
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusReconJob2DTGadget</classname>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/gtPlus/config/GT_2DT_Cartesian_Dicom.xml b/gadgets/gtPlus/config/GT_2DT_Cartesian_Dicom.xml
new file mode 100644
index 0000000..4a2dbd6
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_Cartesian_Dicom.xml
@@ -0,0 +1,804 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for general 2D or 2D+T cartesian reconstruction
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <writer>
+        <slot>1018</slot>
+        <dll>gadgetron_dicom</dll>
+        <classname>DicomImageWriter</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_NONE</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>8</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>gfactor_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>wrap_around_map_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_solve_symmetric</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>90</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>0.0015</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.0025</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>10</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.0001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_2D_scale_per_chunk</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_2DT.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>32</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>10240</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>2</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation
+             Each cloud node X is defined by properties named CloudNodeX_IP/Port/XMLConfiguration etc.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GtProg_2DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>DicomFinish</name>
+        <dll>gadgetron_dicom</dll>
+        <classname>DicomFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/gtPlus/config/GT_2DT_Cartesian_DualLayer_Gateway_L1SPIRIT.xml b/gadgets/gtPlus/config/GT_2DT_Cartesian_DualLayer_Gateway_L1SPIRIT.xml
new file mode 100644
index 0000000..2c2f222
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_Cartesian_DualLayer_Gateway_L1SPIRIT.xml
@@ -0,0 +1,804 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for general 2D cartesian reconstruction using L1 SPIRIT.
+        GtPlus cloud computing is turned on by default in this configuration file.
+        The dual-layer cloud topology is used here: every incoming SLICE is sent to one
+        first-layer GtPlus cloud node, which can further split the job and process the
+        SLICE with one or more second-layer nodes.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+
+        Ref to: 
+
+        Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen. 
+        Distributed MRI Reconstruction using Gadgetron based Cloud Computing. 
+        Magnetic Resonance in Medicine, doi: 10.1002/mrm.25213.
+    -->
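+
+    <!-- Illustrative (hypothetical) topology for the dual-layer setup described above:
+
+             gateway  ->  first-layer node 0  ->  second-layer nodes 0.0, 0.1, ...
+                      ->  first-layer node 1  ->  second-layer nodes 1.0, 1.1, ...
+
+         Each first-layer node runs its own cloud-node configuration and may split its
+         SLICE further across one or more second-layer nodes. -->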
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadgetCloud</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Phase</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.002</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_L1SPIRIT</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_solve_symmetric</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>90</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>0.0015</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>10</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.0001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>20.0</value>
+        </property>
+        <property>
+            <name>spirit_2D_scale_per_chunk</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_ZEROFILLING_FILTER</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_2DT_DualLayer.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>48</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>2048</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation.
+             Each cloud node is defined by a CloudNodeX_IP/Port/XMLConfiguration property set
+             (see the sketch after this file for adding further nodes). -->
+
+        <property>
+            <name>CloudNodeXMLConfiguration</name>
+            <value>GT_2DT_Cartesian_FirstLayer_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudComputing</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GtProg_2DT_Cartesian_FirstLayer_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
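For reference, the CloudNodeX_* pattern used in the gadget above generalizes to more than one node. A minimal sketch (not part of the upstream file; the host name below is a placeholder) of registering a second first-layer node would raise CloudSize and add a CloudNode1_* block alongside node 0:

    <property><name>CloudSize</name><value>2</value></property>

    <!-- node 1 (hypothetical host) -->
    <property><name>CloudNode1_IP</name><value>gt-node-1</value></property>
    <property><name>CloudNode1_Port</name><value>9003</value></property>
    <property><name>CloudNode1_XMLConfiguration</name><value>GtProg_2DT_Cartesian_FirstLayer_CloudNode.xml</value></property>
    <property><name>CloudNode1_ComputingPowerIndex</name><value>1</value></property>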
diff --git a/gadgets/gtPlus/config/GT_2DT_Cartesian_DualLayer_Gateway_SPIRIT.xml b/gadgets/gtPlus/config/GT_2DT_Cartesian_DualLayer_Gateway_SPIRIT.xml
new file mode 100644
index 0000000..79e843b
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_Cartesian_DualLayer_Gateway_SPIRIT.xml
@@ -0,0 +1,798 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for general 2D cartesian reconstruction using linear SPIRIT.
+        GtPlus cloud computing is turned on by default in this configuration file.
+        The dual-layer cloud topology is used here; therefore every incoming SLICE is sent
+        to one first-layer GtPlus cloud node. This first-layer node can further split the job and
+        process the SLICE with one or more second-layer nodes.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+
+        Ref to: 
+
+        Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen. 
+        Distributed MRI Reconstruction using Gadgetron based Cloud Computing. 
+        Magnetic Resonance in Medicine, doi: 10.1002/mrm.25213.
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadgetCloud</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Phase</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_solve_symmetric</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>90</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>0.0015</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.0025</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>10</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.0001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_2D_scale_per_chunk</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_2DT_DualLayer.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>48</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>2048</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>2</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation.
+             Each cloud node is defined by a CloudNodeX_IP/Port/XMLConfiguration property set. -->
+        <property>
+            <name>CloudComputing</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GtProg_2DT_Cartesian_FirstLayer_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
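A note on the partialFourier_algo switch in the gadget above: the file carries a parameter block for each of the listed handlers (homodyne, POCS, FengHuang, zero-filling), and only the block matching the selected algorithm is expected to take effect. A sketch (an assumption about usage, reusing the default values already present in this file) of selecting the POCS handler instead of FengHuang:

    <property><name>partialFourier_algo</name><value>ISMRMRD_PF_POCS</value></property>

    <!-- with POCS selected, these are the relevant tuning knobs -->
    <property><name>partialFourier_POCS_iters</name><value>6</value></property>
    <property><name>partialFourier_POCS_thres</name><value>0.01</value></property>
    <property><name>partialFourier_POCS_transitBand</name><value>24</value></property>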
diff --git a/gadgets/gtPlus/config/GT_2DT_Cartesian_FirstLayer_CloudNode.xml b/gadgets/gtPlus/config/GT_2DT_Cartesian_FirstLayer_CloudNode.xml
new file mode 100644
index 0000000..e35e3ea
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_Cartesian_FirstLayer_CloudNode.xml
@@ -0,0 +1,279 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for general 2D cartesian reconstruction using linear or non-linear SPIRIT.
+        The dual-layer cloud topology is used here; therefore every incoming SLICE is sent
+        to one first-layer GtPlus cloud node. This first-layer node can further split the job and
+        process the SLICE with one or more second-layer nodes.
+
+        This configuration file is for the first-layer GtPlus cloud node.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+
+        Ref to: 
+
+        Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen. 
+        Distributed MRI Reconstruction using Gadgetron based Cloud Computing. 
+        Magnetic Resonance in Medicine, doi: 10.1002/mrm.25213.
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1014</slot>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlus2DTGadgetCloudJobMessageReaderCPFL</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1014</slot>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlus2DTGadgetCloudJobMessageWriterCPFL</classname>
+    </writer>
+
+    <!--
+    Recon computation for 2DT cases
+    -->
+
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusReconJob2DTGadgetCloud</classname>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>48</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>2048</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation.
+             Each cloud node is defined by a CloudNodeX_IP/Port/XMLConfiguration property set. -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9004</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GtProg_2DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_2DT_DualLayer_FirstLayer.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+
+    </gadget>
+
+</gadgetronStreamConfiguration>
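Reading the gateway and first-layer files together, the dual-layer chain as configured here is:

    gateway      GT_2DT_Cartesian_DualLayer_Gateway_SPIRIT.xml
                 CloudNode0 -> localhost:9003 running GtProg_2DT_Cartesian_FirstLayer_CloudNode.xml
    first layer  GT_2DT_Cartesian_FirstLayer_CloudNode.xml
                 CloudNode0 -> localhost:9004 running GtProg_2DT_Cartesian_CloudNode.xml
                 (CloudComputing is false here, so the second layer stays idle until it is enabled)

The first-layer node exchanges whole reconstruction jobs rather than raw acquisitions, which is why it uses the slot-1014 GtPlus2DTGadgetCloudJobMessageReaderCPFL/WriterCPFL pair instead of the acquisition reader and image writer used by the gateway.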
diff --git a/gadgets/gtPlus/config/GT_2DT_Cartesian_GFactor.xml b/gadgets/gtPlus/config/GT_2DT_Cartesian_GFactor.xml
new file mode 100644
index 0000000..ff1a86f
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_Cartesian_GFactor.xml
@@ -0,0 +1,799 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for general 2D or 2D+T cartesian reconstruction
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+
+        <!-- File prefix for stored noise prewhitener matrix -->
+        <property>
+            <name>noise_dependency_prefix</name>
+            <value>GadgetronNoiseCovarianceMatrix</value>
+        </property>
+    </gadget>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_NONE</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>false</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_thres</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <!-- parameters for coil map estimation, ISMRMRD_SOUHEIL, ISMRMRD_SOUHEIL_ITER -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>gfactor_needed</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>wrap_around_map_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>9</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>9</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_solve_symmetric</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>90</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>0.0015</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>10</value>
+        </property>
+
+        <property>
+            <name>scalingFactor_gfactor</name>
+            <value>100</value>
+        </property>
+
+        <property>
+            <name>scalingFactor_snr_image</name>
+            <value>10</value>
+        </property>
+
+        <property>
+            <name>scalingFactor_std_map</name>
+            <value>1000</value>
+        </property>
+
+        <property>
+            <name>start_frame_for_std_map</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <!-- 
+             ISMRMRD_FILTER_GAUSSIAN,
+             ISMRMRD_FILTER_HANNING,
+             ISMRMRD_FILTER_TUKEY,
+             ISMRMRD_FILTER_TAPERED_HANNING,
+             ISMRMRD_FILTER_NONE 
+        -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_NONE</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_NONE</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_NONE</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <!-- 
+             ISMRMRD_FILTER_GAUSSIAN,
+             ISMRMRD_FILTER_HANNING,
+             ISMRMRD_FILTER_TUKEY,
+             ISMRMRD_FILTER_TAPERED_HANNING,
+             ISMRMRD_FILTER_NONE 
+        -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <!-- only ISMRMRD_FILTER_TAPERED_HANNING is available for the moment -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_ZEROFILLING_FILTER</value>
+        </property>
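+        <!-- Illustration only (an assumption about behaviour, not a shipped default):
+             each partialFourier_* parameter group below appears to be consumed only when
+             the matching algorithm is selected above. Switching to POCS, for example,
+             would presumably look like:
+
+             <property><name>partialFourier_algo</name><value>ISMRMRD_PF_POCS</value></property>
+             <property><name>partialFourier_POCS_iters</name><value>6</value></property>
+             <property><name>partialFourier_POCS_thres</name><value>0.01</value></property>
+             <property><name>partialFourier_POCS_transitBand</name><value>24</value></property>
+        -->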
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_2DT.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>32</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>10240</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>2</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation.
+             Each cloud node X is described by the properties CloudNodeX_IP, CloudNodeX_Port,
+             CloudNodeX_XMLConfiguration and CloudNodeX_ComputingPowerIndex; an illustrative,
+             commented-out second node is sketched after node 0 below.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GtProg_2DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
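+        <!-- Illustration only (hypothetical values, not shipped defaults): a second node
+             would be added by raising CloudSize to 2 and declaring a matching CloudNode1_*
+             group alongside node 0, e.g.:
+
+             <property><name>CloudNode1_IP</name><value>192.168.1.12</value></property>
+             <property><name>CloudNode1_Port</name><value>9003</value></property>
+             <property><name>CloudNode1_XMLConfiguration</name><value>GtProg_2DT_Cartesian_CloudNode.xml</value></property>
+             <property><name>CloudNode1_ComputingPowerIndex</name><value>1</value></property>
+        -->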
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+
+        <property>
+            <name>max_intensity</name>
+            <value>32767</value>
+        </property>
+
+        <property>
+            <name>intensity_offset</name>
+            <value>16384</value>
+        </property>
+    </gadget>
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+        <name>DicomFinish</name>
+        <dll>gadgetron_dicom</dll>
+        <classname>DicomFinishGadget</classname>
+    </gadget>
+    -->
+</gadgetronStreamConfiguration>
diff --git a/gadgets/gtPlus/config/GT_2DT_Cartesian_ImageTrigger_Dicom.xml b/gadgets/gtPlus/config/GT_2DT_Cartesian_ImageTrigger_Dicom.xml
new file mode 100644
index 0000000..0fe34a8
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_Cartesian_ImageTrigger_Dicom.xml
@@ -0,0 +1,826 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for general 2D or 2D+T Cartesian reconstruction
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <writer>
+        <slot>1018</slot>
+        <dll>gadgetron_dicom</dll>
+        <classname>DicomImageWriter</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_NONE</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>8</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>gfactor_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>wrap_around_map_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_solve_symmetric</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>90</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>0.0015</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.0025</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>10</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.0001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_2D_scale_per_chunk</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_2DT.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>32</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>10240</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>2</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation.
+             Each cloud node X is described by the properties CloudNodeX_IP, CloudNodeX_Port,
+             CloudNodeX_XMLConfiguration and CloudNodeX_ComputingPowerIndex.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GtProg_2DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+    </gadget>
+
+    <!-- Image recon processing -->
+    <gadget>
+        <name>ImageAcc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorImageTriggerGadget</classname>
+
+        <!-- triggered dimensions -->
+        <property> <name>TriggerChannel</name>      <value>false</value> </property>
+        <property> <name>TriggerSlice</name>        <value>false</value> </property>
+        <property> <name>TriggerE2</name>           <value>false</value> </property>
+        <property> <name>TriggerContrast</name>     <value>false</value> </property>
+        <property> <name>TriggerPhase</name>        <value>true</value> </property>
+        <property> <name>TriggerRepetition</name>   <value>false</value> </property>
+        <property> <name>TriggerSet</name>          <value>false</value> </property>
+
+        <!-- work flow -->
+        <property> <name>PassImageImmediately</name> <value>false</value> </property>
+
+        <!-- debug and info mode -->
+        <property> <name>verboseMode</name> <value>true</value> </property>
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>DicomFinish</name>
+        <dll>gadgetron_dicom</dll>
+        <classname>DicomFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/gtPlus/config/GT_2DT_Cartesian_L1SPIRIT.xml b/gadgets/gtPlus/config/GT_2DT_Cartesian_L1SPIRIT.xml
new file mode 100644
index 0000000..1735390
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_Cartesian_L1SPIRIT.xml
@@ -0,0 +1,789 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for general 2D Cartesian reconstruction using L1 SPIRIT.
+        GtPlus cloud computing can be turned on in this configuration file.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>8</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_NONE</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_L1SPIRIT</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_solve_symmetric</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>90</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>0.0015</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.0025</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>10</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.0001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>20.0</value>
+        </property>
+        <property>
+            <name>spirit_2D_scale_per_chunk</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>0.5</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>0.5</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>0.5</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_ZEROFILLING_FILTER</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_2DT.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>64</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>8192</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>2</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation.
+             Each cloud node X is described by the properties CloudNodeX_IP, CloudNodeX_Port,
+             CloudNodeX_XMLConfiguration and CloudNodeX_ComputingPowerIndex.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GtProg_2DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/gtPlus/config/GT_2DT_Cartesian_PseudoReplica_SNRUnitRecon.xml b/gadgets/gtPlus/config/GT_2DT_Cartesian_PseudoReplica_SNRUnitRecon.xml
new file mode 100644
index 0000000..f501f2d
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_Cartesian_PseudoReplica_SNRUnitRecon.xml
@@ -0,0 +1,766 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GtProg_2DT_Cartesian_PseudoReplica_SNRUnitRecon.xml
+
+        GT Plus configuration file for general 2D or 2D+T cartesian reconstruction with pseudo-replica noise add-on
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+
+        This workflow is the second step of pseudo-replica SNR unit reconstruction.
+
+        The input to this workflow is kspace data that has already been scaled to SNR units.
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <writer>
+        <slot>1012</slot>
+        <dll>gadgetron_dicom</dll>
+        <classname>DicomImageWriter</classname>
+    </writer>
+
+    <writer>
+        <slot>1018</slot>
+        <dll>gadgetron_dicom</dll>
+        <classname>DicomImageAttribWriter</classname>
+    </writer>
+
+    <!-- add pseudo white noise -->
+    <gadget>
+        <name>WhiteNoiseAdd</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>WhiteNoiseInjectorGadget</classname>
+
+        <!-- mean and standard deviation for the added white noise -->
+        <property>
+            <name>noise_mean</name>
+            <value>0.0</value>
+        </property>
+
+        <property>
+            <name>noise_std</name>
+            <value>1.0</value>
+        </property>
+
+        <!-- whether to add noise to separate or external reference acquisitions -->
+        <property>
+            <name>add_noise_ref</name>
+            <value>false</value>
+        </property>
+    </gadget>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_NONE</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>false</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_thres</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <!-- parameters for coil map estimation, ISMRMRD_SOUHEIL, ISMRMRD_SOUHEIL_ITER -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>gfactor_needed</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>wrap_around_map_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>100</value>
+        </property>
+
+        <property>
+            <name>scalingFactor_gfactor</name>
+            <value>1000</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <!-- 
+             ISMRMRD_FILTER_GAUSSIAN,
+             ISMRMRD_FILTER_HANNING,
+             ISMRMRD_FILTER_TUKEY,
+             ISMRMRD_FILTER_TAPERED_HANNING,
+             ISMRMRD_FILTER_NONE 
+        -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_NONE</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_NONE</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_NONE</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <!-- 
+             ISMRMRD_FILTER_GAUSSIAN,
+             ISMRMRD_FILTER_HANNING,
+             ISMRMRD_FILTER_TUKEY,
+             ISMRMRD_FILTER_TAPERED_HANNING,
+             ISMRMRD_FILTER_NONE 
+        -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <!-- only ISMRMRD_FILTER_TAPERED_HANNING is available for the moment -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_ZEROFILLING_FILTER</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_2DT.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>32</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>10240</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>2</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation
+             Each cloud node is defined by properties named CloudNodeX_IP/Port/XMLConfiguration, etc.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GtProg_2DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <!--
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+
+        <property>
+            <name>max_intensity</name>
+            <value>32767</value>
+        </property>
+
+        <property>
+            <name>intensity_offset</name>
+            <value>16384</value>
+        </property>
+    </gadget>
+    -->
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
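
In the pseudo-replica configuration above, partialFourier_algo selects among the handlers listed in its comment, and the homodyne, POCS and FengHuang parameter groups that follow it only take effect when the matching value is chosen. A minimal sketch of switching to the POCS handler, reusing the POCS parameters already defined in the file (no claim that POCS is the better choice for this workflow):

    <property>
        <name>partialFourier_algo</name>
        <value>ISMRMRD_PF_POCS</value>
    </property>
    <!-- the partialFourier_POCS_iters, partialFourier_POCS_thres and
         partialFourier_POCS_transitBand properties defined earlier then apply -->
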
diff --git a/gadgets/gtPlus/config/GT_2DT_Cartesian_SPIRIT.xml b/gadgets/gtPlus/config/GT_2DT_Cartesian_SPIRIT.xml
new file mode 100644
index 0000000..b862d41
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_Cartesian_SPIRIT.xml
@@ -0,0 +1,789 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for general 2D cartesian reconstruction using linear SPIRIT
+        The GtPlus cloud computing can be turned on in this configuration file
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>8</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_NONE</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_SPIRIT</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_solve_symmetric</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>90</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>0.0015</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.0025</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>10</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.0001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_2D_scale_per_chunk</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_2DT.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>64</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>8192</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation
+             Each cloud node is defined by properties named CloudNodeX_IP/Port/XMLConfiguration, etc.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GtProg_2DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
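
The SPIRIT configuration above runs the linear solver (recon_algorithm ISMRMRD_SPIRIT, spirit_perform_linear true, spirit_perform_nonlinear false). A sketch of switching to the L1-regularized path configured by the "parameters for ISMRMRD_L1SPIRIT" block; the ISMRMRD_L1SPIRIT value for recon_algorithm is assumed from that comment rather than confirmed here, and the NCG iteration and regularization weights above would likely need tuning:

    <property>
        <name>recon_algorithm</name>
        <!-- assumed enum value, taken from the "parameters for ISMRMRD_L1SPIRIT" comment -->
        <value>ISMRMRD_L1SPIRIT</value>
    </property>
    <property>
        <name>spirit_perform_nonlinear</name>
        <value>true</value>
    </property>
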
diff --git a/gadgets/gtPlus/config/GT_2DT_Cartesian_SingleLayer_CloudNode.xml b/gadgets/gtPlus/config/GT_2DT_Cartesian_SingleLayer_CloudNode.xml
new file mode 100644
index 0000000..e35e3ea
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_Cartesian_SingleLayer_CloudNode.xml
@@ -0,0 +1,279 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for general 2D cartesian reconstruction using linear or non-linear SPIRIT
+        The dual-layer cloud topology is used here; every incoming SLICE is therefore sent
+        to one first-layer GtPlus cloud node. This first-layer node can further split the job and
+        process the SLICE with one or more second-layer nodes.
+
+        This configuration file is for the first-layer GtPlus cloud node.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+
+        Ref to: 
+
+        Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen. 
+        Distributed MRI Reconstruction using Gadgetron based Cloud Computing. 
+        Magnetic Resonance in Medicine, doi: 10.1002/mrm.25213.
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1014</slot>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlus2DTGadgetCloudJobMessageReaderCPFL</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1014</slot>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlus2DTGadgetCloudJobMessageWriterCPFL</classname>
+    </writer>
+
+    <!--
+    Recon computation for 2DT cases
+    -->
+
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusReconJob2DTGadgetCloud</classname>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>48</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>2048</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation
+             Each cloud node is defined by properties named CloudNodeX_IP/Port/XMLConfiguration, etc.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9004</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GtProg_2DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_2DT_DualLayer_FirstLayer.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+
+    </gadget>
+
+</gadgetronStreamConfiguration>
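
As the header comment explains, this first-layer node only fans an incoming SLICE out to second-layer nodes once cloud computing is switched on; the job_split_by_S, job_num_of_N, job_max_Megabytes and job_overlap properties then appear to control how the SLICE is chunked into sub-jobs (a reading inferred from the property names, not stated in the file). A minimal sketch of the toggle, with the existing CloudNode0_* entries describing the second-layer node:

    <!-- fan each incoming SLICE out to the CloudNodeX_* nodes defined above -->
    <property>
        <name>CloudComputing</name>
        <value>true</value>
    </property>
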
diff --git a/gadgets/gtPlus/config/GT_2DT_FatWater.xml b/gadgets/gtPlus/config/GT_2DT_FatWater.xml
new file mode 100644
index 0000000..4b7529b
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_FatWater.xml
@@ -0,0 +1,649 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for 2DT cartesian reconstruction for the fat-water multi-contrast application
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Repetition</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Contrast</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_Set</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>8</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>gfactor_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>70</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>1e-5</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial Fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial Fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
+
+        <!-- parameters for partial Fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial Fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial Fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
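The Recon gadget in the file above leaves scalingFactor at -1.0 with use_constant_scalingFactor set to false, which reads as letting the gadget determine the output image scaling automatically. A minimal sketch of pinning a constant scaling factor instead, mirroring the pattern used by the HASTE configuration later in this import (the value 10 is illustrative only):

        <!-- use a fixed scaling factor instead of automatic scaling (illustrative values) -->
        <property>
            <name>scalingFactor</name>
            <value>10</value>
        </property>

        <property>
            <name>use_constant_scalingFactor</name>
            <value>true</value>
        </property>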
diff --git a/gadgets/gtPlus/config/GT_2DT_HASTE.xml b/gadgets/gtPlus/config/GT_2DT_HASTE.xml
new file mode 100644
index 0000000..e00ac95
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_HASTE.xml
@@ -0,0 +1,757 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for general 2D or 2D+T Cartesian reconstruction
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!--
+    <writer>
+        <slot>1012</slot>
+        <dll>gadgetron_dicom</dll>
+        <classname>DicomImageWriter</classname>
+    </writer>
+
+    <writer>
+        <slot>1018</slot>
+        <dll>gadgetron_dicom</dll>
+        <classname>DicomImageAttribWriter</classname>
+    </writer>
+    -->
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+
+        <!-- File prefix for stored noise prewhitener matrix -->
+        <property>
+            <name>noise_dependency_prefix</name>
+            <value>GadgetronNoiseCovarianceMatrix</value>
+        </property>
+
+        <!-- Preset noise dwell time; for noise dependency measurements -->
+	<!--
+        <property>
+            <name>noise_dwell_time_us_preset</name>
+            <value>5.0</value>
+        </property>
+	-->
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Repetition</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_thres</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>8</value>
+        </property>
+
+        <!-- parameters for coil map estimation, ISMRMRD_SOUHEIL, ISMRMRD_SOUHEIL_ITER -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>gfactor_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>10</value>
+        </property>
+
+        <property>
+            <name>scalingFactor_gfactor</name>
+            <value>100</value>
+        </property>
+
+        <property>
+            <name>scalingFactor_snr_image</name>
+            <value>10</value>
+        </property>
+
+        <property>
+            <name>scalingFactor_std_map</name>
+            <value>1000</value>
+        </property>
+
+        <property>
+            <name>start_frame_for_std_map</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <!-- 
+             ISMRMRD_FILTER_GAUSSIAN,
+             ISMRMRD_FILTER_HANNING,
+             ISMRMRD_FILTER_TUKEY,
+             ISMRMRD_FILTER_TAPERED_HANNING,
+             ISMRMRD_FILTER_NONE 
+        -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <!-- 
+             ISMRMRD_FILTER_GAUSSIAN,
+             ISMRMRD_FILTER_HANNING,
+             ISMRMRD_FILTER_TUKEY,
+             ISMRMRD_FILTER_TAPERED_HANNING,
+             ISMRMRD_FILTER_NONE 
+        -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial Fourier/asymmetric echo filter -->
+        <!-- only ISMRMRD_FILTER_TAPERED_HANNING is available for the moment -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial Fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_POCS</value>
+        </property>
+
+        <!-- parameters for partial Fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial Fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>6</value>
+        </property>
+
+        <!-- parameters for partial Fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>6</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_2DT.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>32</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>10240</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>2</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation 
+             Each cloud node is defined by properties of the form CloudNodeX_IP, CloudNodeX_Port, CloudNodeX_XMLConfiguration, etc.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GtProg_2DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/gtPlus/config/GT_2DT_HASTE_MOCO_AVE.xml b/gadgets/gtPlus/config/GT_2DT_HASTE_MOCO_AVE.xml
new file mode 100644
index 0000000..b8ea817
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_HASTE_MOCO_AVE.xml
@@ -0,0 +1,1033 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for HASTE imaging with MOCO+AVE
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!--
+    <writer>
+        <slot>1012</slot>
+        <dll>gadgetron_dicom</dll>
+        <classname>DicomImageWriter</classname>
+    </writer>
+
+    <writer>
+        <slot>1018</slot>
+        <dll>gadgetron_dicom</dll>
+        <classname>DicomImageAttribWriter</classname>
+    </writer>
+    -->
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+
+        <!-- File prefix for stored noise prewhitener matrix -->
+        <property>
+            <name>noise_dependency_prefix</name>
+            <value>GadgetronNoiseCovarianceMatrix</value>
+        </property>
+
+        <!-- Preset noise dwell time; for noise dependency measurements -->
+	<!--
+        <property>
+            <name>noise_dwell_time_us_preset</name>
+            <value>5.0</value>
+        </property>
+	-->
+
+        <!-- Whether to perform timing -->
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Repetition</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_thres</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>8</value>
+        </property>
+
+        <!-- parameters for coil map estimation, ISMRMRD_SOUHEIL, ISMRMRD_SOUHEIL_ITER -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>gfactor_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>10</value>
+        </property>
+
+        <property>
+            <name>scalingFactor_gfactor</name>
+            <value>100</value>
+        </property>
+
+        <property>
+            <name>scalingFactor_snr_image</name>
+            <value>10</value>
+        </property>
+
+        <property>
+            <name>scalingFactor_std_map</name>
+            <value>1000</value>
+        </property>
+
+        <property>
+            <name>start_frame_for_std_map</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <!-- 
+             ISMRMRD_FILTER_GAUSSIAN,
+             ISMRMRD_FILTER_HANNING,
+             ISMRMRD_FILTER_TUKEY,
+             ISMRMRD_FILTER_TAPERED_HANNING,
+             ISMRMRD_FILTER_NONE 
+        -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <!-- 
+             ISMRMRD_FILTER_GAUSSIAN,
+             ISMRMRD_FILTER_HANNING,
+             ISMRMRD_FILTER_TUKEY,
+             ISMRMRD_FILTER_TAPERED_HANNING,
+             ISMRMRD_FILTER_NONE 
+        -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial Fourier/asymmetric echo filter -->
+        <!-- only ISMRMRD_FILTER_TAPERED_HANNING is available for the moment -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_POCS</value>
+        </property>
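+        <!-- Illustrative note: partialFourier_algo accepts any of the modes listed above,
+             e.g. <value>ISMRMRD_PF_ZEROFILLING</value> would presumably fall back to plain
+             zero-filling instead of the POCS handling chosen here. -->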
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>6</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>6</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_2DT.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>32</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>10240</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>2</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation
+             Each cloud node X is declared by the properties CloudNodeX_IP, CloudNodeX_Port,
+             CloudNodeX_XMLConfiguration and CloudNodeX_ComputingPowerIndex;
+             see the commented sketch after node 0 below.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GtProg_2DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
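+
+        <!-- Illustrative sketch, not part of the shipped defaults: a second node would follow
+             the same CloudNodeX_* naming convention (hostname, port and power index below are
+             placeholders, and CloudSize would need to be raised to 2):
+
+        <property>
+            <name>CloudNode1_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode1_Port</name>
+            <value>9004</value>
+        </property>
+
+        <property>
+            <name>CloudNode1_XMLConfiguration</name>
+            <value>GtProg_2DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode1_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+        -->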
+
+    </gadget>
+
+    <!-- Image recon accumulator -->
+    <gadget>
+        <name>ImageAcc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorImageTriggerGadget</classname>
+
+        <!-- triggered dimensions -->
+        <property>
+            <name>TriggerChannel</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>TriggerSlice</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>TriggerE2</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>TriggerContrast</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>TriggerPhase</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>TriggerRepetition</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>TriggerSet</name>
+            <value>false</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>PassImageImmediately</name>
+            <value>false</value>
+        </property>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+    </gadget>
+
+    <!-- MOCO AVE, PSIR recon -->
+    <gadget>
+        <name>MoCoAve</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusImageMoCoAveGadget</classname>
+
+        <!-- ===================================================================================== -->
+        <!-- dimension to perform MOCO -->
+        <property>
+            <name>moco_dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <!-- dimension to perform cross-row MOCO, e.g. MOCO between PD and IR images -->
+        <property>
+            <name>moco_cross_row_dim</name>
+            <value>DIM_Set</value>
+        </property>
+
+        <!-- whether to perform averaging -->
+        <property>
+            <name>moco_ave</name>
+            <value>true</value>
+        </property>
+
+        <!-- whether to perform cross-row MOCO -->
+        <property>
+            <name>moco_cross_row</name>
+            <value>false</value>
+        </property>
+
+        <!-- whether all rows have the same reference -->
+        <property>
+            <name>cross_row_same_reference</name>
+            <value>false</value>
+        </property>
+
+        <!-- whether to perform 3D MOCO -->
+        <property>
+            <name>moco_ave_3D</name>
+            <value>false</value>
+        </property>
+
+        <!-- If cross-row MOCO is performed, which row is selected as the reference -->
+        <property>
+            <name>ref_moco_cross_row</name>
+            <value>0</value>
+        </property>
+
+        <!-- Strategy to pick the reference for rows: "SSD" or "Deformation" -->
+        <property>
+            <name>row_ref_pick_strategy</name>
+            <value>SSD</value>
+        </property>
+
+        <!-- ===================================================================================== -->
+        <!-- Parameters for image sending -->
+        <!-- Whether to send original images -->
+        <property>
+            <name>send_ori</name>
+            <value>true</value>
+        </property>
+
+        <!-- Whether to send MOCO images -->
+        <property>
+            <name>send_moco</name>
+            <value>true</value>
+        </property>
+
+        <!-- Whether to send averaged images -->
+        <property>
+            <name>send_moco_ave</name>
+            <value>true</value>
+        </property>
+
+        <!-- Whether to keep original image number -->
+        <property>
+            <name>moco_ave_keep_origial_image_number</name>
+            <value>false</value>
+        </property>
+
+        <!-- ===================================================================================== -->
+        <!-- Parameters for MOCO -->
+        <!-- MOCO strategy: 'FixedReference' or 'Progressive' -->
+        <property>
+            <name>strategy</name>
+            <value>FixedReference</value>
+        </property>
+        <!-- Image dissimilarity measure: 'SSD', 'LocalCCR' or 'MutualInformation' -->
+        <property>
+            <name>dissimilarity</name>
+            <value>LocalCCR</value>
+        </property>
+        <!-- Number of levels for the multi-resolution pyramid -->
+        <property>
+            <name>level</name>
+            <value>4</value>
+        </property>
+        <!-- Number of iterations for each pyramid level; level 0 is the highest resolution -->
+        <property>
+            <name>iter_0</name>
+            <value>16</value>
+        </property>
+        <property>
+            <name>iter_1</name>
+            <value>32</value>
+        </property>
+        <property>
+            <name>iter_2</name>
+            <value>64</value>
+        </property>
+        <property>
+            <name>iter_3</name>
+            <value>64</value>
+        </property>
+        <!-- Strength of the registration regularization, in units of pixels -->
+        <property>
+            <name>regularization_hilbert_strength</name>
+            <value>12.0</value>
+        </property>
+        <!-- Whether to perform bidirectional MOCO -->
+        <property>
+            <name>bidirectional_moco</name>
+            <value>false</value>
+        </property>
+
+        <!-- ========================================= -->
+        <!-- Image dissimilarity measure for the cross-row MOCO: 'SSD', 'LocalCCR' or 'MutualInformation' -->
+        <property>
+            <name>dissimilarity_cross_row</name>
+            <value>LocalCCR</value>
+        </property>
+        <!-- Number of levels for the multi-resolution pyramid for the cross-row MOCO -->
+        <property>
+            <name>level_cross_row</name>
+            <value>3</value>
+        </property>
+        <!-- Number of iterations for each pyramid level of the cross-row MOCO; level 0 is the highest resolution -->
+        <property>
+            <name>iter_cross_row_0</name>
+            <value>8</value>
+        </property>
+        <property>
+            <name>iter_cross_row_1</name>
+            <value>32</value>
+        </property>
+        <property>
+            <name>iter_cross_row_2</name>
+            <value>64</value>
+        </property>
+        <!-- Strength of the registration regularization for the cross-row MOCO, in units of pixels -->
+        <property>
+            <name>regularization_hilbert_strength_cross_row</name>
+            <value>32.0</value>
+        </property>
+        <!-- Whether to perform bidirectional MOCO for the cross-row MOCO -->
+        <property>
+            <name>bidirectional_moco_cross_row</name>
+            <value>true</value>
+        </property>
+
+        <!-- ========================================= -->
+
+        <!-- Threshold for image dissimilarity minimization -->
+        <property>
+            <name>dissimilarity_thres</name>
+            <value>1e-5</value>
+        </property>
+        <!-- Number of sub-division searches in the minimization -->
+        <property>
+            <name>div_num</name>
+            <value>2</value>
+        </property>
+        <!-- For the bidirectional MOCO, the number of bidirectional iterations -->
+        <property>
+            <name>inverse_deform_enforce_iter</name>
+            <value>10</value>
+        </property>
+        <!-- For the bidirectional MOCO, the weight between forward and inverse MOCO -->
+        <property>
+            <name>inverse_deform_enforce_weight</name>
+            <value>0.5</value>
+        </property>
+
+        <!-- ===================================================================================== -->
+
+        <!-- Fraction of images kept for averaging -->
+        <property>
+            <name>percentage_kept_for_averaging</name>
+            <value>0.5</value>
+        </property>
+
+        <!-- Whether to perform soft averaging -->
+        <property>
+            <name>soft_averaging</name>
+            <value>true</value>
+        </property>
+
+        <!-- ===================================================================================== -->
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseModeMOCO</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/gtPlus/config/GT_2DT_LGE.xml b/gadgets/gtPlus/config/GT_2DT_LGE.xml
new file mode 100644
index 0000000..708cb68
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_LGE.xml
@@ -0,0 +1,654 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for 2DT Cartesian reconstruction for cardiac LGE imaging
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Repetition</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_Set</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>8</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>gfactor_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>wrap_around_map_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>70</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>1e-5</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/gtPlus/config/GT_2DT_MOLLI.xml b/gadgets/gtPlus/config/GT_2DT_MOLLI.xml
new file mode 100644
index 0000000..1964a65
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_MOLLI.xml
@@ -0,0 +1,649 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for 2DT Cartesian reconstruction for cardiac MOLLI T1 mapping
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Set</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>5</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Set</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>3</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>3</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>3</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>8</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>gfactor_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>70</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>1e-5</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/gtPlus/config/GT_2DT_MOLLI_Offline.xml b/gadgets/gtPlus/config/GT_2DT_MOLLI_Offline.xml
new file mode 100644
index 0000000..bb8a5e1
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_MOLLI_Offline.xml
@@ -0,0 +1,652 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for 2DT Cartesian reconstruction for cardiac MOLLI T1 mapping
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Set</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>5</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Set</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>8</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>gfactor_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>70</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>1e-5</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial Fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial Fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_POCS</value>
+        </property>
+
+        <!-- parameters for partial Fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial Fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial Fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+    </gadget>
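The filterRO/filterE1/filterE2 properties of the Recon gadget above select Gaussian apodization of the image k-space, parameterized per axis by a sigma and a width. How GtPlus maps those two numbers onto a window is not spelled out in this file; the sketch below is only a generic Gaussian edge taper under an assumed reading of the parameters, not the GtPlus filter:

    import numpy as np

    def gaussian_kspace_window(n, sigma=1.0, width=0.15):
        # Assumed reading: 'width' is the fraction of samples tapered at each
        # edge of the axis, and 'sigma' controls how fast the Gaussian falls
        # off across that edge region.  The window is flat (== 1) in the
        # central passband.
        k = np.abs(np.arange(n) - n / 2.0)
        edge = max(1.0, width * n)                    # tapered region, in samples
        dist = np.maximum(k - (n / 2.0 - edge), 0.0)  # 0 inside the passband
        return np.exp(-0.5 * (dist / (sigma * edge / 2.0)) ** 2)

    # usage: apply along the readout axis of k-space shaped (coil, e1, ro)
    # kspace = kspace * gaussian_kspace_window(kspace.shape[-1])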
+
+    <!-- after recon processing 
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+    -->
+
+    <!--
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+    -->
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
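This MOLLI chain selects partialFourier_algo = ISMRMRD_PF_POCS with 6 iterations and a 0.01 stopping threshold. Those properties parameterize the standard POCS partial-Fourier scheme: estimate a low-resolution phase from the symmetrically sampled k-space centre, then alternate a phase constraint in image space with data consistency on the acquired samples. A minimal 1D numpy sketch of that textbook scheme follows; the sampling pattern and helper names are illustrative assumptions, not GtPlus code:

    import numpy as np

    def pocs_partial_fourier(kspace, sampled, center, iters=6, thres=0.01):
        # kspace  : zero-filled measured k-space (complex, length n, DC centred)
        # sampled : boolean mask of acquired samples
        # center  : boolean mask of the symmetric low-frequency region
        ift = lambda k: np.fft.ifft(np.fft.ifftshift(k))
        ft = lambda x: np.fft.fftshift(np.fft.fft(x))

        phase = np.exp(1j * np.angle(ift(kspace * center)))   # low-res phase estimate
        img = ift(kspace)
        for _ in range(iters):
            img_c = np.real(img * np.conj(phase)) * phase     # enforce estimated phase
            k_new = ft(img_c)
            k_new[sampled] = kspace[sampled]                  # data consistency
            img_new = ift(k_new)
            done = np.linalg.norm(img_new - img) < thres * np.linalg.norm(img)
            img = img_new
            if done:
                break
        return img

    # toy usage: asymmetric acquisition covering 5/8 of k-space plus a symmetric centre
    n = 256
    x = np.linspace(-1, 1, n)
    truth = (np.abs(x) < 0.5) * np.exp(1j * 0.5 * np.pi * x)
    full_k = np.fft.fftshift(np.fft.fft(truth))
    sampled = np.zeros(n, dtype=bool)
    sampled[: int(0.625 * n)] = True
    center = np.zeros(n, dtype=bool)
    center[n // 2 - 16 : n // 2 + 16] = True
    recon = pocs_partial_fourier(full_k * sampled, sampled, center)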
diff --git a/gadgets/gtPlus/config/GT_2DT_Perfusion.xml b/gadgets/gtPlus/config/GT_2DT_Perfusion.xml
new file mode 100644
index 0000000..1e20aa0
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_Perfusion.xml
@@ -0,0 +1,655 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for 2DT Cartesian reconstruction of cardiac perfusion mapping.
+        Support for AIF acquisition is included.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Repetition</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>8</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>gfactor_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>wrap_around_map_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>70</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>1e-5</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial Fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial Fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
+
+        <!-- parameters for partial Fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial Fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial Fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
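The perfusion chain above turns on downstream_coil_compression with coil_compression_thres = 0.001 and coil_compression_num_modesKept = 8. A common way to realize such channel compression is an SVD across the coil dimension, keeping either a fixed number of virtual coils or all modes whose singular values stay above a relative threshold. The numpy sketch below shows that generic approach under an assumed interpretation of the two properties; it is not the GtPlus implementation:

    import numpy as np

    def compress_coils(data, num_modes_kept=-1, thres=1e-3):
        # SVD coil compression of k-space data shaped (coil, ...).
        # Keeps 'num_modes_kept' virtual coils if positive, otherwise keeps all
        # modes whose singular value exceeds thres * (largest singular value).
        nc = data.shape[0]
        X = data.reshape(nc, -1)                  # coils x samples
        U, s, _ = np.linalg.svd(X, full_matrices=False)
        keep = min(num_modes_kept, nc) if num_modes_kept > 0 else int(np.sum(s > thres * s[0]))
        A = U[:, :keep].conj().T                  # (keep x coil) compression matrix
        return (A @ X).reshape((keep,) + data.shape[1:]), A

    # usage: 32-channel k-space compressed to at most 8 virtual channels
    kspace = np.random.randn(32, 128, 128) + 1j * np.random.randn(32, 128, 128)
    compressed, A = compress_coils(kspace, num_modes_kept=8)
    print(compressed.shape)   # (8, 128, 128)

In practice the compression matrix would be computed once from calibration data and then applied to all acquisitions of the same slice/set, which appears to be what the same_coil_compression_coeff_allS property controls.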
diff --git a/gadgets/gtPlus/config/GT_2DT_PseudoReplica_SNRUnitRecon_DataExport.xml b/gadgets/gtPlus/config/GT_2DT_PseudoReplica_SNRUnitRecon_DataExport.xml
new file mode 100644
index 0000000..f4b0d2b
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_PseudoReplica_SNRUnitRecon_DataExport.xml
@@ -0,0 +1,68 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GtProg_PseudoRplica_SNRUnitRecon_DataExport.xml
+
+        Hui Xue
+        hui.xue at nih.gov
+
+        Exports the SNR-unit-scaled ISMRMRD data.
+        This is the first step of a pseudo-replica SNR measurement.
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- Noise prewhitening 
+         SNR-unit scaling is performed here
+    -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+
+        <!-- File prefix for stored noise prewhitener matrix -->
+        <property>
+            <name>noise_dependency_prefix</name>
+            <value>GadgetronNoiseCovarianceMatrix</value>
+        </property>
+    </gadget>
+
+    <!-- Dump the noise-scaled data to an ISMRMRD dataset
+    -->
+    <gadget>
+        <name>IsmrmrdDump</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>IsmrmrdDumpGadget</classname>
+
+        <!-- File prefix for the data set -->
+        <property>
+            <name>file_prefix</name>
+            <value>ISMRMRD_PseudoReplica</value>
+        </property>
+
+        <!-- Whether to append a time stamp to the file name -->
+        <property>
+            <name>append_timestamp</name>
+            <value>false</value>
+        </property>
+    </gadget>
+
+</gadgetronStreamConfiguration>
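This chain runs only NoiseAdjust before an IsmrmrdDump, storing the SNR-unit-scaled acquisitions; in the pseudo-replica method those exported data are then reconstructed many times with synthetic unit-variance noise added, and the pixel-wise standard deviation over replicas gives the noise level from which SNR maps are computed. The core of the SNR-unit step is noise prewhitening: estimate the channel noise covariance from a noise acquisition, Cholesky-factor it, and apply the inverse factor to every acquisition. A minimal numpy sketch of that standard operation (receiver-bandwidth and dwell-time scaling are omitted; the real gadget's conventions may differ):

    import numpy as np

    def prewhitening_matrix(noise):
        # noise : complex array shaped (coil, samples) of noise-only data.
        # Returns the inverse Cholesky factor of the channel noise covariance.
        nc, ns = noise.shape
        psi = (noise @ noise.conj().T) / (ns - 1)   # coil x coil covariance
        L = np.linalg.cholesky(psi)                 # psi = L L^H
        return np.linalg.inv(L)

    def apply_prewhitening(data, W):
        # Apply the prewhitening matrix to data shaped (coil, ...).
        nc = data.shape[0]
        return (W @ data.reshape(nc, -1)).reshape(data.shape)

    # usage: correlated 8-channel noise has unit covariance after prewhitening
    rng = np.random.default_rng(0)
    mix = rng.standard_normal((8, 8))
    noise = mix @ (rng.standard_normal((8, 4096)) + 1j * rng.standard_normal((8, 4096)))
    W = prewhitening_matrix(noise)
    white = apply_prewhitening(noise, W)
    print(np.allclose(white @ white.conj().T / 4095, np.eye(8)))   # True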
diff --git a/gadgets/gtPlus/config/GT_2DT_RTCine_L1SPIRIT_PhysioInterp.xml b/gadgets/gtPlus/config/GT_2DT_RTCine_L1SPIRIT_PhysioInterp.xml
new file mode 100644
index 0000000..74c7c27
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_RTCine_L1SPIRIT_PhysioInterp.xml
@@ -0,0 +1,819 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for general 2D Cartesian reconstruction using L1-SPIRIT.
+        GtPlus cloud computing can be turned on in this configuration file.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>8</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_NONE</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.002</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm, ISMRMRD_GRAPPA, ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_L1SPIRIT</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_solve_symmetric</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>90</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>0.0015</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>10</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.0001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>20.0</value>
+        </property>
+        <property>
+            <name>spirit_2D_scale_per_chunk</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial Fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_ZEROFILLING_FILTER</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_2DT.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>64</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>8192</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>2</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation.
+             The cloud is defined by per-node properties of the form CloudNodeX_IP, CloudNodeX_Port,
+             CloudNodeX_XMLConfiguration and CloudNodeX_ComputingPowerIndex (a parsing sketch follows this gadget block).
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GtProg_2DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+    </gadget>
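The CloudNodeX_* property convention used in the gadget above can be read back with any XML library. Below is a minimal sketch, not part of the upstream sources, using Python's standard xml.etree.ElementTree; the function name and the example file name are placeholders chosen for illustration only.

    import xml.etree.ElementTree as ET

    NS = {"g": "http://gadgetron.sf.net/gadgetron"}

    def read_cloud_nodes(config_path):
        """Collect CloudSize and CloudNodeX_* properties from a gadgetron stream config."""
        root = ET.parse(config_path).getroot()
        props = {}
        for gadget in root.findall("g:gadget", NS):
            for prop in gadget.findall("g:property", NS):
                name = prop.findtext("g:name", default="", namespaces=NS)
                value = prop.findtext("g:value", default="", namespaces=NS)
                props[name] = value          # only CloudSize / CloudNodeX_* are used below
        nodes = []
        for idx in range(int(props.get("CloudSize", "0"))):
            prefix = "CloudNode%d_" % idx
            nodes.append({
                "ip": props.get(prefix + "IP"),
                "port": props.get(prefix + "Port"),
                "xml_config": props.get(prefix + "XMLConfiguration"),
                "power_index": props.get(prefix + "ComputingPowerIndex"),
            })
        return nodes

    # e.g. read_cloud_nodes("some_gtplus_config.xml")  # placeholder file name

Applied to the Recon gadget above (CloudSize 1, node 0 at localhost:9003 with GtProg_2DT_Cartesian_CloudNode.xml), this would return that single node entry.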
+
+    <!-- Physio interpolation -->
+    <gadget>
+        <name>PhysioInterpolation</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>PhysioInterpolationGadget</classname>
+
+        <property>
+            <name>phases</name>
+            <value>30</value>
+        </property>
+
+        <!-- 0=separate series for each complete RR -->
+        <!-- 1=First complete RR interval only -->
+        <property>
+            <name>mode</name>
+            <value>1</value>
+        </property>
+
+        <property>
+            <name>first_beat_on_trigger</name>
+            <value>true</value>
+        </property>
+
+        <!-- "Spline" or "BSpline" -->
+        <property>
+            <name>interp_method</name>
+            <value>BSpline</value>
+        </property>
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/gtPlus/config/GT_2DT_RTCine_L1SPIRIT_PhysioInterp_DualLayer_Gateway.xml b/gadgets/gtPlus/config/GT_2DT_RTCine_L1SPIRIT_PhysioInterp_DualLayer_Gateway.xml
new file mode 100644
index 0000000..51aeac3
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_RTCine_L1SPIRIT_PhysioInterp_DualLayer_Gateway.xml
@@ -0,0 +1,828 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for general 2D cartesian reconstruction using L1 SPIRIT.
+        GtPlus cloud computing is turned on by default in this configuration file.
+        The dual-layer cloud topology is used here: every incoming SLICE is sent to one
+        first-layer GtPlus cloud node, which can further split the job and process the
+        SLICE with one or more second-layer nodes.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+
+        Ref to: 
+
+        Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen. 
+        Distributed MRI Reconstruction using Gadgetron based Cloud Computing. 
+        Magnetic Resonance in Medicine, doi: 10.1002/mrm.25213.
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadgetCloud</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Phase</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.002</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_L1SPIRIT</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_solve_symmetric</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>90</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>0.0015</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>10</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.0001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>20.0</value>
+        </property>
+        <property>
+            <name>spirit_2D_scale_per_chunk</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_ZEROFILLING_FILTER</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>NHLBI_Cloud_2DT_DualLayer.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>48</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>2048</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation.
+             The cloud is defined by per-node properties of the form CloudNodeX_IP, CloudNodeX_Port,
+             CloudNodeX_XMLConfiguration and CloudNodeX_ComputingPowerIndex.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GtProg_2DT_Cartesian_FirstLayer_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+    </gadget>
+
+    <!-- Physio interpolation -->
+    <gadget>
+        <name>PhysioInterpolation</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>PhysioInterpolationGadget</classname>
+
+        <property>
+            <name>phases</name>
+            <value>30</value>
+        </property>
+
+        <!-- 0=separate series for each complete RR -->
+        <!-- 1=First complete RR interval only -->
+        <property>
+            <name>mode</name>
+            <value>1</value>
+        </property>
+
+        <property>
+            <name>first_beat_on_trigger</name>
+            <value>true</value>
+        </property>
+
+        <!-- "Spline" or "BSpline" -->
+        <property>
+            <name>interp_method</name>
+            <value>BSpline</value>
+        </property>
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/gtPlus/config/GT_2DT_RealTimeCine.xml b/gadgets/gtPlus/config/GT_2DT_RealTimeCine.xml
new file mode 100644
index 0000000..8416afa
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_RealTimeCine.xml
@@ -0,0 +1,736 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for 2DT cartesian reconstruction for real-time cine imaging.
+        GtPlus supports on-the-fly reconstruction, so the reconstruction starts as soon as
+        sufficient data has been received. The reconstructed images are sent out once the
+        computation is finished.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Phase</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_Phase</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>8</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_Phase</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_Phase</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Phase</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>8</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>gfactor_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>100</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>1e-5</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.001</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>10</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>5.0</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>20480</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>2</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_POCS</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/gtPlus/config/GT_2DT_RealTimeFlow.xml b/gadgets/gtPlus/config/GT_2DT_RealTimeFlow.xml
new file mode 100644
index 0000000..27cfd12
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_RealTimeFlow.xml
@@ -0,0 +1,718 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for 2DT cartesian reconstruction for real-time flow imaging.
+        GtPlus supports on-the-fly reconstruction, so the reconstruction starts as soon as
+        sufficient data has been received. The reconstructed images are sent out once the
+        computation is finished.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_None</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_None</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_None</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_None</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!--
+    Recon computation for 2DT cases
+
+    kspace_: [RO E1 CHA N S], for 2D recon, N can be 1
+    ref_: [RO E1 CHA M S], where M can equal N, 1, or another value
+    fullkspace_: [RO E1 CHA N S]
+    complexIm_: [RO E1 N S], after coil combination
+    coilMap_: [RO E1 CHA 1 or N S]
+    gfactor_: [RO E1 CHA 1 or N S]
+
+    the 4th and 5th dimensions (N and S) need to be specified. For example,
+    for real-time cine, N = DIM_Phase and S = DIM_Slice
+
+    default behavior
+    a) the coil compression coefficients are computed once across all S
+    b) the kernel or coil sensitivity is estimated for every S
+
+    embedded mode
+    a) perform recon and estimate kernel/coil sensitivity for every 2D kspace [RO E1 CHA]
+    b) coil combination uses different coil maps for every S
+    c) if the kspace recon is performed, the coil combination map is re-estimated on the full kspace for every 2D image
+    d) the ref lines are filled back to fullkspace_
+
+    separate mode
+    a) perform recon and estimate kernel/coil sensitivity for every 2D kspace [RO E1 CHA] if M==N
+    b) if M==1, the kernel is only estimated once for every S
+    c) coil combination uses different coil maps for every S
+    d) if the kspace recon is performed, the coil combination map is re-estimated on the full kspace for every 2D image
+
+    interleaved mode
+    a) the average-all ref is used
+    b) kernel/coil sensitivity is estimated once for every S
+    -->
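+
+    <!--
+    Illustrative example (a sketch inferred from the dim_4th/dim_5th settings of the Recon
+    gadget below, not an extra requirement of this config): with dim_4th = DIM_Phase and
+    dim_5th = DIM_Set, kspace_ is ordered as [RO E1 CHA Phase Set], i.e. N counts the
+    real-time cardiac phases and S counts the flow-encoding sets of this acquisition.
+    -->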
+
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Phase</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Set</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>gfactor_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>70</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>1e-5</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+    </gadget>
+
+    <!-- flow processing -->
+    <gadget>
+        <name>PhaseSubtraction</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FlowPhaseSubtractionGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>MaxwellCorrection</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>MaxwellCorrectionGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>Extract</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ExtractGadget</classname>
+        <property>
+            <name>extract_mask</name>
+            <value>9</value>
+        </property>
+        <property>
+            <name>scaling_factor_angle</name>
+            <value>1.0</value>
+        </property>
+    </gadget>
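+
+    <!--
+        Note (assuming the usual Gadgetron ExtractGadget bit assignment of 1 = magnitude,
+        2 = real, 4 = imaginary, 8 = phase): extract_mask = 9 above requests magnitude plus
+        phase images, with scaling_factor_angle applied to the phase output, so both the
+        anatomical magnitude image and the phase-difference (velocity) image continue
+        downstream.
+    -->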
+
+    <!-- after recon processing -->
+    <!--
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+    -->
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+
+        <property>
+            <name>intensity_offset</name>
+            <value>2048</value>
+        </property>
+    </gadget>
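+
+    <!--
+        Note (one interpretation of the intensity_offset value above, not an additional
+        setting): the float-to-unsigned-short conversion adds an offset of 2048 before
+        rounding, centering signed phase/velocity values at 2048 so they fit within the
+        unsigned short image range used here (0..4095).
+    -->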
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/gtPlus/config/GT_2DT_T2W.xml b/gadgets/gtPlus/config/GT_2DT_T2W.xml
new file mode 100644
index 0000000..8d481a3
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_2DT_T2W.xml
@@ -0,0 +1,654 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for 2DT Cartesian reconstruction of T2-weighted cardiac imaging
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>5</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Repetition</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Set</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>8</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>gfactor_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>wrap_around_map_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>70</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>1e-5</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/gtPlus/config/GT_3DT_Cartesian.xml b/gadgets/gtPlus/config/GT_3DT_Cartesian.xml
new file mode 100644
index 0000000..56751d6
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_3DT_Cartesian.xml
@@ -0,0 +1,802 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for general 3D or 3D+T Cartesian reconstruction
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Encoding2</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 3DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon3DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allN</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allN</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allN</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_thres</name>
+            <value>0.01</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL_ITER</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>gfactor_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>45</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_calib_over_determine_ratio</name>
+            <value>15</value>
+        </property>
+        <property>
+            <name>spirit_solve_symmetric</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>70</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>0.0015</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.001</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>10</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_2D_scale_per_chunk</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_3D_scale_per_chunk</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>scalingFactor_gfactor</name>
+            <value>1000</value>
+        </property>
+
+        <property>
+            <name>scalingFactor_snr_image</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>scalingFactor_std_map</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>start_frame_for_std_map</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the partial Fourier handling algorithm: ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_POCS</value>
+        </property>
+
+        <!-- parameters for the partial Fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand_E2</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for the partial Fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand_E2</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_3DT.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>64</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>13000</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>2</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation.
+             Each cloud node X is declared through CloudNodeX_IP/Port/XMLConfiguration etc. properties;
+             node 0 below is followed by a sketch of a two-node setup.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GtProg_3DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
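+        <!-- Example (a sketch only; the cloud is disabled by default above): to run a
+             two-node cloud, set CloudComputing to true, CloudSize to 2, and add a
+             CloudNode1_* group mirroring node 0, e.g.
+
+             <property><name>CloudSize</name><value>2</value></property>
+             <property><name>CloudNode1_IP</name><value>localhost</value></property>
+             <property><name>CloudNode1_Port</name><value>9004</value></property>
+             <property><name>CloudNode1_XMLConfiguration</name><value>GtProg_3DT_Cartesian_CloudNode.xml</value></property>
+             <property><name>CloudNode1_ComputingPowerIndex</name><value>1</value></property>
+
+             localhost and port 9004 are placeholder values for a second node.
+        -->
+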
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/gtPlus/config/GT_3DT_Cartesian_CloudNode.xml b/gadgets/gtPlus/config/GT_3DT_Cartesian_CloudNode.xml
new file mode 100644
index 0000000..a59fd8b
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_3DT_Cartesian_CloudNode.xml
@@ -0,0 +1,82 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for 3D Cartesian reconstruction on the GtPlus Cloud.
+        This configuration sets up a single gadget that performs the reconstruction of
+        3DT job packages.
+
+        Depending on the incoming algorithm parameters, either linear or non-linear
+        reconstruction can be performed.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1013</slot>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusCloudJobMessageReaderCPFL</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1013</slot>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusCloudJobMessageWriterCPFL</classname>
+    </writer>
+
+    <!--
+    Recon computation for 2DT/3DT cases; each invocation processes one job.
+    A gtPlusReconJob2DT job consists of kspace, kernel and parameters;
+    the kspace array is ordered [RO E1 CHA E2/PHS].
+    -->
+
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusReconJob3DTGadget</classname>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/gtPlus/config/GT_3DT_Cartesian_GFactor.xml b/gadgets/gtPlus/config/GT_3DT_Cartesian_GFactor.xml
new file mode 100644
index 0000000..8ca1653
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_3DT_Cartesian_GFactor.xml
@@ -0,0 +1,682 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for general 3D or 3D+T Cartesian reconstruction
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+
+        <!-- File prefix for stored noise prewhitener matrix -->
+        <property>
+            <name>noise_dependency_prefix</name>
+            <value>GadgetronNoiseCovarianceMatrix</value>
+        </property>
+
+        <!-- Whether to perform timing -->
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+    </gadget>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Encoding2</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 3DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon3DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allN</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allN</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allN</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_thres</name>
+            <value>0.01</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>8</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL_ITER</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>gfactor_needed</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>45</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>10</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_NONE</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_NONE</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_NONE</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial Fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the partial Fourier handling algorithm: ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_POCS</value>
+        </property>
+
+        <!-- parameters for the partial Fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand_E2</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for the partial Fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand_E2</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_3DT.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>32</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>13000</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>2</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation.
+             Each cloud node X is declared through CloudNodeX_IP/Port/XMLConfiguration etc. properties.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GtProg_3DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/gtPlus/config/GT_3DT_Cartesian_L1SPIRIT.xml b/gadgets/gtPlus/config/GT_3DT_Cartesian_L1SPIRIT.xml
new file mode 100644
index 0000000..44039e4
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_3DT_Cartesian_L1SPIRIT.xml
@@ -0,0 +1,806 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for general 3D or 3D+T Cartesian reconstruction using L1 SPIRIT.
+        GtPlus cloud computing can be turned on in this configuration file;
+        the single-layer cloud topology is used here.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+
+        Reference:
+
+        Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen. 
+        Distributed MRI Reconstruction using Gadgetron based Cloud Computing. 
+        Magnetic Resonance in Medicine, doi: 10.1002/mrm.25213.
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Encoding2</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>8</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 3DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon3DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allN</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allN</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allN</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_thres</name>
+            <value>0.01</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_L1SPIRIT</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>45</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_calib_over_determine_ratio</name>
+            <value>15</value>
+        </property>
+        <property>
+            <name>spirit_solve_symmetric</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>100</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>0.0015</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.002</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_2D_scale_per_chunk</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_3D_scale_per_chunk</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>0.5</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>0.5</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>0.5</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial Fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the partial Fourier handling algorithm: ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_POCS</value>
+        </property>
+
+        <!-- parameters for the partial Fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand_E2</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for the partial Fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand_E2</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_3DT.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>32</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>2499</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation.
+             Each cloud node X is declared through CloudNodeX_IP/Port/XMLConfiguration etc. properties.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>2</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GtProg_3DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 1 -->
+        <property>
+            <name>CloudNode1_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode1_Port</name>
+            <value>9004</value>
+        </property>
+
+        <property>
+            <name>CloudNode1_XMLConfiguration</name>
+            <value>GtProg_3DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode1_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/gtPlus/config/GT_3DT_Cartesian_SPIRIT.xml b/gadgets/gtPlus/config/GT_3DT_Cartesian_SPIRIT.xml
new file mode 100644
index 0000000..abbe8ee
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_3DT_Cartesian_SPIRIT.xml
@@ -0,0 +1,828 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for general 3D or 3D+T Cartesian reconstruction using linear SPIRIT.
+        GtPlus cloud computing can be turned on in this configuration file;
+        the single-layer cloud topology is used here.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+
+        Reference:
+
+        Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen. 
+        Distributed MRI Reconstruction using Gadgetron based Cloud Computing. 
+        Magnetic Resonance in Medicine, doi: 10.1002/mrm.25213.
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+    <writer>
+        <slot>1005</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+        <slot>1006</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <writer>
+        <slot>1015</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageAttribWriterCPLX</classname>
+    </writer>
+    <writer>
+        <slot>1016</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageAttribWriterFLOAT</classname>
+    </writer>
+    <writer>
+        <slot>1017</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageAttribWriterUSHORT</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+
+        <property>
+            <name>constant_noise_variance</name>
+            <value>true</value>
+        </property>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Encoding2</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>8</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 3DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon3DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allN</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allN</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allN</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_thres</name>
+            <value>0.005</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_SPIRIT</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>45</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_oSize_RO</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>spirit_oSize_E1</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>spirit_oSize_E2</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_calib_over_determine_ratio</name>
+            <value>15</value>
+        </property>
+        <property>
+            <name>spirit_solve_symmetric</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>100</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>0.0015</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.001</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>10</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_2D_scale_per_chunk</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_3D_scale_per_chunk</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>0.5</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>0.5</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>0.5</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_POCS</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand_E2</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand_E2</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_3DT.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>32</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>2499</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>2</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation 
+             Each cloud node is defined by a set of CloudNodeX_IP/Port/XMLConfiguration/ComputingPowerIndex properties;
+             an illustrative sketch of a second node follows the node 0 block below.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GadgetronProgram_gtPlus_3DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
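+
+        <!-- Illustrative sketch (editorial, not part of the original configuration): additional nodes follow
+             the same CloudNodeX_IP/Port/XMLConfiguration/ComputingPowerIndex pattern, with CloudSize raised
+             accordingly. The port 9004 below is only an example value.
+
+        <property><name>CloudNode1_IP</name><value>localhost</value></property>
+        <property><name>CloudNode1_Port</name><value>9004</value></property>
+        <property><name>CloudNode1_XMLConfiguration</name><value>GadgetronProgram_gtPlus_3DT_Cartesian_CloudNode.xml</value></property>
+        <property><name>CloudNode1_ComputingPowerIndex</name><value>1</value></property>
+        -->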
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatAttribGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortAttribGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/gtPlus/config/GT_3DT_Cartesian_SingleLayer_L1SPIRIT.xml b/gadgets/gtPlus/config/GT_3DT_Cartesian_SingleLayer_L1SPIRIT.xml
new file mode 100644
index 0000000..bdb178a
--- /dev/null
+++ b/gadgets/gtPlus/config/GT_3DT_Cartesian_SingleLayer_L1SPIRIT.xml
@@ -0,0 +1,806 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!--
+       _____              _____     _____   ______   _______   _____     ____    _   _ 
+      / ____|     /\     |  __ \   / ____| |  ____| |__   __| |  __ \   / __ \  | \ | |
+     | |  __     /  \    | |  | | | |  __  | |__       | |    | |__) | | |  | | |  \| |
+     | | |_ |   / /\ \   | |  | | | | |_ | |  __|      | |    |  _  /  | |  | | | . ` |
+     | |__| |  / ____ \  | |__| | | |__| | | |____     | |    | | \ \  | |__| | | |\  |
+      \_____| /_/    \_\ |_____/   \_____| |______|    |_|    |_|  \_\  \____/  |_| \_|
+                                                                                       
+    -->
+
+    <!-- 
+        GT Plus configuration file for general 3D or 3D+T Cartesian reconstruction using L1 SPIRIT.
+        GtPlus cloud computing can be turned on in this configuration file;
+        the single-layer cloud topology is used here.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+
+        Ref to: 
+
+        Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen. 
+        Distributed MRI Reconstruction using Gadgetron based Cloud Computing. 
+        Magnetic Resonance in Medicine, doi: 10.1002/mrm.25213.
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1022</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriter</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>AsymmetricEcho</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AsymmetricEchoAdjustROGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Encoding2</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>8</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimensions match the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 3DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon3DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allN</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allN</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allN</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_thres</name>
+            <value>0.01</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_L1SPIRIT</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>45</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_calib_over_determine_ratio</name>
+            <value>15</value>
+        </property>
+        <property>
+            <name>spirit_solve_symmetric</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>100</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>0.0015</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.002</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_2D_scale_per_chunk</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_3D_scale_per_chunk</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>0.5</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>0.5</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>0.5</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_POCS</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand_E2</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand_E2</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>NHLBI_Cloud_3DT.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>32</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>2499</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation 
+             Each cloud node is defined by a set of CloudNodeX_IP/Port/XMLConfiguration/ComputingPowerIndex properties.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>2</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GtProg_3DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 1 -->
+        <property>
+            <name>CloudNode1_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode1_Port</name>
+            <value>9004</value>
+        </property>
+
+        <property>
+            <name>CloudNode1_XMLConfiguration</name>
+            <value>GtProg_3DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode1_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>ComplexToFloatAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ComplexToFloatGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FloatToShortAttrib</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>ImageFinish</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/gtPlus/config/gtCloud/myCloud_2DT.txt b/gadgets/gtPlus/config/gtCloud/myCloud_2DT.txt
new file mode 100644
index 0000000..191eb7a
--- /dev/null
+++ b/gadgets/gtPlus/config/gtCloud/myCloud_2DT.txt
@@ -0,0 +1,8 @@
+localhost
+9002
+1
+localhost
+9003
+GT_2DT_Cartesian_CloudNode.xml
+1
+0
\ No newline at end of file
diff --git a/gadgets/gtPlus/config/gtCloud/myCloud_2DT_DualLayer.txt b/gadgets/gtPlus/config/gtCloud/myCloud_2DT_DualLayer.txt
new file mode 100644
index 0000000..5edc0a6
--- /dev/null
+++ b/gadgets/gtPlus/config/gtCloud/myCloud_2DT_DualLayer.txt
@@ -0,0 +1,8 @@
+localhost
+9002
+1
+localhost
+9003
+GT_2DT_Cartesian_FirstLayer_CloudNode.xml
+1
+0
diff --git a/gadgets/gtPlus/config/gtCloud/myCloud_2DT_DualLayer_FirstLayer.txt b/gadgets/gtPlus/config/gtCloud/myCloud_2DT_DualLayer_FirstLayer.txt
new file mode 100644
index 0000000..2121e88
--- /dev/null
+++ b/gadgets/gtPlus/config/gtCloud/myCloud_2DT_DualLayer_FirstLayer.txt
@@ -0,0 +1,8 @@
+localhost
+9003
+1
+localhost
+9004
+GT_2DT_Cartesian_CloudNode.xml
+1
+0
diff --git a/gadgets/gtPlus/config/gtCloud/myCloud_3DT.txt b/gadgets/gtPlus/config/gtCloud/myCloud_3DT.txt
new file mode 100644
index 0000000..4a6a3be
--- /dev/null
+++ b/gadgets/gtPlus/config/gtCloud/myCloud_3DT.txt
@@ -0,0 +1,12 @@
+localhost
+9002
+2
+localhost
+9003
+GT_3DT_Cartesian_CloudNode.xml
+1
+localhost
+9004
+GT_3DT_Cartesian_CloudNode.xml
+1
+0
\ No newline at end of file
diff --git a/gadgets/hyper/CMRT.xml b/gadgets/hyper/CMRT.xml
new file mode 100644
index 0000000..298d065
--- /dev/null
+++ b/gadgets/hyper/CMRT.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+        
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+    <gadget>
+      <name>CMRT</name>
+      <dll>gadgetron_hyper</dll>
+      <classname>CMRTGadget</classname>
+      <property><name>projections_per_recon</name><value>128</value></property>
+      <property><name>golden_ratio</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>  
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/hyper/CMRT3D.xml b/gadgets/hyper/CMRT3D.xml
new file mode 100644
index 0000000..5045101
--- /dev/null
+++ b/gadgets/hyper/CMRT3D.xml
@@ -0,0 +1,44 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+        
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NFFT</name>
+      <dll>gadgetron_hyper</dll>
+      <classname>NFFT2DGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CMRT3S</name>
+      <dll>gadgetron_hyper</dll>
+      <classname>CMRT3DGadget</classname>
+      <property><name>projections_per_recon</name><value>128</value></property>
+      <property><name>projections_percentage</name><value>100</value></property>
+      <property><name>golden_ratio</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>  
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/hyper/CMRT3DGadget.cpp b/gadgets/hyper/CMRT3DGadget.cpp
new file mode 100644
index 0000000..1eb8e12
--- /dev/null
+++ b/gadgets/hyper/CMRT3DGadget.cpp
@@ -0,0 +1,265 @@
+#include "CMRT3DGadget.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "hoNDFFT.h"
+#include "hoNDArray_utils.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_math.h"
+#include "radial_utilities.h"
+#include "vector_td_operators.h"
+#include <ismrmrd/xml.h>
+
+static const float alpha_ = 2.0f; // oversampling for radial NFFT. If to be changed, also change setup arguments
+static const float W_ = 5.5f; // Kaiser-Bessel Window size for the radial NFFT
+static const float readout_oversampling_factor_ = 1.0f; // There is no "readout" oversampling for the radial NFFT
+
+namespace Gadgetron {
+
+/**
+ *   Expects ISMRMRD XML configuration
+ *
+ */
+
+int CMRT3DGadget::process_config(ACE_Message_Block* mb)
+{
+	ISMRMRD::IsmrmrdHeader h;
+	ISMRMRD::deserialize(mb->rd_ptr(),h);
+
+
+	if (h.encoding.size() != 1) {
+		GDEBUG("This Gadget only supports one encoding space\n");
+		return GADGET_FAIL;
+	}
+
+	ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+	ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+	ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+
+
+	// Matrix size x is the oversampled readout size: size FH*2 -- not for the hyperpolarization?
+	// Matrix size y is the AP/RL size
+
+	image_space_dimensions_3D_.push_back(e_space.matrixSize.y);
+	image_space_dimensions_3D_.push_back(e_space.matrixSize.y);
+	image_space_dimensions_3D_.push_back(e_space.matrixSize.x/*/2*/);
+
+	GDEBUG("Matrix size: %d, %d, %d\n",
+			image_space_dimensions_3D_[0],
+			image_space_dimensions_3D_[1],
+			image_space_dimensions_3D_[2] );
+
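+	// Editorial note: with projections_percentage_ = 100 every incoming image is used; a value of 50 makes
+	// the integer division 100/projections_percentage_ equal 2, so every second image is kept (see the
+	// images_received_ % (100/projections_percentage_) check in process()). The value is assumed to be non-zero.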
+	num_projections_expected_ = get_int_value("projections_per_recon");
+	projections_percentage_ = get_int_value("projections_percentage");
+	num_projections_to_use_ = num_projections_expected_/(100/projections_percentage_);
+
+	golden_ratio_ = get_bool_value("golden_ratio");
+	GDEBUG("Number of projections (expected/utilization percentage): %d/%d\n", num_projections_expected_, projections_percentage_ );
+	GDEBUG("I.e. using %d projections for the reconstruction\n", num_projections_to_use_ );
+
+	std::vector<size_t> dims;
+	dims.push_back(image_space_dimensions_3D_[0]); // number of samples per radial profile
+	dims.push_back(num_projections_to_use_);       // number of radial profiles
+	dims.push_back(image_space_dimensions_3D_[2]); // number of slices
+
+	buffer_ = boost::shared_ptr< cuNDArray< complext<float> > >( new cuNDArray< complext<float> >(&dims) );
+
+	// Calculate trajectories and dcw for the radial NFFTs
+	//
+
+	boost::shared_ptr< cuNDArray<floatd2> > traj = calculate_trajectory();
+	boost::shared_ptr< cuNDArray<float> > dcw = calculate_density_compensation();
+
+
+	if( !traj.get() || !dcw.get() ){
+		GDEBUG("Failed to initialize radial trajectory/dcw\n");
+		return GADGET_FAIL;
+	}
+
+	// Setup radial NFFT encoding operator
+	//
+
+	E_ = boost::shared_ptr< cuNFFTOperator<float,2> >( new cuNFFTOperator<float,2>() );
+
+	E_->set_dcw( dcw );
+
+	E_->setup( uint64d2(image_space_dimensions_3D_[0], image_space_dimensions_3D_[1]),
+			uint64d2(image_space_dimensions_3D_[0], image_space_dimensions_3D_[1])<<1, // !! <-- alpha_
+			W_ );
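+	// Editorial note: the "<<1" above doubles the gridding matrix size, matching the alpha_ = 2.0
+	// oversampling factor declared at the top of this file.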
+
+
+	return GADGET_OK;
+}
+
+int CMRT3DGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+		GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+	// Check if we should ignore this image
+	// - to simulate undersampling in the number of slices
+
+	if( (images_received_%(100/projections_percentage_)) != 0 ){
+		// Ignore this image
+		images_received_++;
+		return GADGET_OK;
+	}
+
+	// We will not pass along m2, so we can modify its array safely
+	//
+
+	hoNDArray< std::complex<float> > *host_image = m2->getObjectPtr();
+
+	// Some validity checks
+	//
+
+	if( !(host_image->get_number_of_dimensions() == 2 || ( host_image->get_number_of_dimensions()==3 && host_image->get_size(2)==1 ))) {
+		GDEBUG("The input image has an unexpected number of dimensions: %d\n", host_image->get_number_of_dimensions());
+		return GADGET_FAIL;
+	}
+
+	if( host_image->get_size(0) != image_space_dimensions_3D_[0] ||
+			host_image->get_size(1) != image_space_dimensions_3D_[2] ){
+		GDEBUG("The input image has unexpected dimensions: %d %d (expected %d %d)\n", host_image->get_size(0), host_image->get_size(1), image_space_dimensions_3D_[0], image_space_dimensions_3D_[2] );
+		return GADGET_FAIL;
+	}
+
+	// Perform batched 1D FFTs along the phase encoding direction (dimension 0) of the input image
+	// (the earlier permute step is left commented out below; the FFT is applied along dimension 0 directly)
+
+
+	//*host_image = *permute( host_image, &order );
+	hoNDFFT<float>::instance()->fft( host_image, 0 );
+
+	// Next copy each line into the buffer
+	//
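+	// Editorial note: buffer_ was allocated in process_config() as [samples per profile, num_projections_to_use_, slices],
+	// so each incoming 2D image contributes one radial profile (at profile index images_used_) for every slice.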
+
+	GDEBUG("Received image #%d\n", images_received_);
+
+	for( size_t row=0;row<host_image->get_size(1); row++ ){
+
+		size_t offset_in =
+				row*host_image->get_size(0);
+
+		size_t offset_out =
+				row*host_image->get_size(0)*num_projections_to_use_+
+				images_used_*host_image->get_size(0);
+
+		if( cudaMemcpy( buffer_->get_data_ptr()+offset_out,
+				host_image->get_data_ptr()+offset_in,
+				host_image->get_size(0)*sizeof(complext<float>),
+				cudaMemcpyHostToDevice ) != cudaSuccess ){
+			GDEBUG("Upload to device for line %d failed\n", row);
+			return GADGET_FAIL;
+		}
+	}
+
+	// Another image has been received and uploaded...
+	//
+
+	images_received_++;
+	images_used_++;
+
+	// When we are ready to perform reconstruction, do it...
+	//
+
+	if( images_used_ == num_projections_to_use_ ){
+
+		auto traj = calculate_trajectory(tot_images_);
+		E_->preprocess( traj.get() );
+		GDEBUG("\n\nPerforming reconstruction\n");
+
+		std::vector<size_t> dims;
+		dims.push_back(image_space_dimensions_3D_[0]);
+		dims.push_back(image_space_dimensions_3D_[1]);
+		dims.push_back(image_space_dimensions_3D_[2]);
+
+		cuNDArray< complext<float> > result(&dims);
+
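+		// Editorial note: mult_MH applies the adjoint of the radial NFFT encoding operator (a gridding-type
+		// reconstruction), mapping the accumulated profiles in buffer_ into the 3D image volume "result".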
+		E_->mult_MH( buffer_.get(), &result );
+
+		/*
+        boost::shared_ptr< hoNDArray<complext<float> > > host_result = result.to_host();
+        write_nd_array<complext<float> >(host_result.get(), "result.cplx");   
+
+        boost::shared_ptr< hoNDArray<float> > host_norm = abs(&result)->to_host();
+        write_nd_array<float>( host_norm.get(), "result.real" );*/
+
+		// Create new image header/image to pass along
+		//
+
+		GadgetContainerMessage<ISMRMRD::ImageHeader> *m =
+				new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+
+		GadgetContainerMessage< hoNDArray< std::complex<float> > > *cm =
+				new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+
+		*m->getObjectPtr() = *m1->getObjectPtr();
+		m->cont(cm);
+
+		// std::complex<float> and Gadgetron::complext<float> are binary compatible
+		boost::shared_ptr< hoNDArray< complext<float> > > host_result = result.to_host();
+		*cm->getObjectPtr() = *((hoNDArray< std::complex<float> >*) host_result.get());
+
+		m->getObjectPtr()->matrix_size[0] = dims[0];
+		m->getObjectPtr()->matrix_size[1] = dims[1];
+		m->getObjectPtr()->matrix_size[2] = dims[2];
+		m->getObjectPtr()->channels       = 1;
+		m->getObjectPtr()->image_index    = 1;
+
+		if (this->next()->putq(m) < 0) {
+			GDEBUG("Failed to put result image on to queue\n");
+			m->release();
+			return GADGET_FAIL;
+		}
+		tot_images_ += images_used_;
+		images_used_ = 0;
+	}
+
+	m1->release();
+	return GADGET_OK;
+}
+
+boost::shared_ptr< cuNDArray<floatd2> >
+CMRT3DGadget::calculate_trajectory(unsigned int offset)
+{
+	// Define trajectories
+
+	boost::shared_ptr< cuNDArray<floatd2> > traj;
+	if (golden_ratio_)
+		traj =	compute_radial_trajectory_golden_ratio_2d<float>
+			( image_space_dimensions_3D_[0], num_projections_to_use_, 1 /*number of frames*/, offset, GR_ORIGINAL );
+	else
+		traj =	compute_radial_trajectory_fixed_angle_2d<float>
+			( image_space_dimensions_3D_[0], num_projections_to_use_, 1 /*number of frames*/ );
+
+	if (!traj.get()) {
+		GDEBUG("Failed to compute radial trajectory\n");
+		return boost::shared_ptr< cuNDArray<floatd2> >();
+	}
+
+	return traj;
+}
+
+boost::shared_ptr< cuNDArray<float> >
+CMRT3DGadget::calculate_density_compensation()
+{
+	// Compute density compensation weights
+	boost::shared_ptr< cuNDArray<float> > dcw;
+
+	if (golden_ratio_)
+		dcw =compute_radial_dcw_golden_ratio_2d
+			( image_space_dimensions_3D_[0], num_projections_to_use_, alpha_, 1.0f/readout_oversampling_factor_, GR_ORIGINAL );
+	else
+		dcw =compute_radial_dcw_fixed_angle_2d
+			( image_space_dimensions_3D_[0], num_projections_to_use_, alpha_, 1.0f/readout_oversampling_factor_ );
+
+	if (!dcw.get()) {
+		GDEBUG("Failed to compute density compensation weights\n");
+		return boost::shared_ptr< cuNDArray<float> >();
+	}
+
+	return dcw;
+}
+
+GADGET_FACTORY_DECLARE(CMRT3DGadget)
+}
diff --git a/gadgets/hyper/CMRT3DGadget.h b/gadgets/hyper/CMRT3DGadget.h
new file mode 100644
index 0000000..1380681
--- /dev/null
+++ b/gadgets/hyper/CMRT3DGadget.h
@@ -0,0 +1,43 @@
+#pragma once
+
+#include "gadgetron_hyper_export.h"
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "cuNDArray.h"
+#include "complext.h"
+#include "cuNFFTOperator.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron {
+
+  class EXPORTGADGETSHYPER CMRT3DGadget : 
+    public Gadget2< ISMRMRD::ImageHeader, hoNDArray< std::complex<float> > >
+  {  
+  public:
+    CMRT3DGadget() : images_received_(0), images_used_(0), tot_images_(0) {}
+    ~CMRT3DGadget() {}
+    GADGET_DECLARE(CMRT3DGadget);
+
+  protected:
+
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(GadgetContainerMessage< ISMRMRD::ImageHeader > *m1,
+                        GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2);
+
+    virtual boost::shared_ptr< cuNDArray<floatd2> > calculate_trajectory(unsigned int offset=0);
+    virtual boost::shared_ptr< cuNDArray<float> > calculate_density_compensation();
+
+    boost::shared_ptr< cuNDArray< complext<float> > > buffer_;
+    boost::shared_ptr< cuNFFTOperator<float,2> > E_;
+    std::vector<size_t> image_space_dimensions_3D_;
+    unsigned int num_projections_expected_;
+    unsigned int num_projections_to_use_;
+    unsigned int projections_percentage_;
+    unsigned int images_received_;
+    unsigned int images_used_;
+    unsigned int tot_images_;
+    bool golden_ratio_;
+  };
+}
diff --git a/gadgets/hyper/CMRTGadget.cpp b/gadgets/hyper/CMRTGadget.cpp
new file mode 100644
index 0000000..6fa68fe
--- /dev/null
+++ b/gadgets/hyper/CMRTGadget.cpp
@@ -0,0 +1,501 @@
+#include "CMRTGadget.h"
+#include "cuNFFT.h"
+#include "vector_td_utilities.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "permutationOperator.h"
+#include "hoNDArray_utils.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_operators.h"
+#include "radial_utilities.h"
+#include "vector_td_operators.h"
+#include "cuNFFTOperator.h"
+#include "multiplicationOperatorContainer.h"
+#include "cuCgSolver.h"
+#include "cuTvOperator.h"
+#include "lbfgsSolver.h"
+#include "cuSbcCgSolver.h"
+#include "cuPartialDerivativeOperator.h"
+#include "cuPartialDerivativeOperator2.h"
+#include <numeric>
+#include <functional>
+#include "cuNlcgSolver.h"
+#include <boost/make_shared.hpp>
+
+#include <ismrmrd/xml.h>
+#include <cmath>
+
+namespace Gadgetron{
+
+
+int CMRTGadget::process_config(ACE_Message_Block* mb)
+{
+	ISMRMRD::IsmrmrdHeader h;
+	ISMRMRD::deserialize(mb->rd_ptr(),h);
+
+
+	if (h.encoding.size() != 1) {
+		GDEBUG("This Gadget only supports one encoding space\n");
+		return GADGET_FAIL;
+	}
+
+	ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+	ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+	ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+
+
+	// Matrix size x is the oversampled readout size: FH*2 -- though possibly not for the hyperpolarization case?
+	// Matrix size y is the AP/RL size
+
+	image_space_dimensions_3D_.push_back(e_space.matrixSize.y);
+	image_space_dimensions_3D_.push_back(e_space.matrixSize.y);
+	image_space_dimensions_3D_.push_back(e_space.matrixSize.x/*/2*/);
+
+
+	GDEBUG("Matrix size: %d, %d, %d\n",
+			image_space_dimensions_3D_[0],
+			image_space_dimensions_3D_[1],
+			image_space_dimensions_3D_[2] );
+
+	GDEBUG("Matrix size: %d, %d, %d\n", e_space.matrixSize.x, e_space.matrixSize.y, e_space.matrixSize.z);
+	dimensions_.push_back(r_space.matrixSize.x);
+	dimensions_.push_back(r_space.matrixSize.y);
+
+	field_of_view_.push_back(e_space.fieldOfView_mm.x);
+	field_of_view_.push_back(e_space.fieldOfView_mm.y);
+	GDEBUG("FOV: %f, %f\n", r_space.fieldOfView_mm.x, r_space.fieldOfView_mm.y);
+
+	repetitions_ = e_limits.repetition.is_present() ? e_limits.repetition.get().maximum + 1 : 1;
+	GDEBUG("#Repetitions: %d\n", repetitions_);
+
+
+	// Allocate readout and trajectory/dcw queues
+	//
+
+	golden_ratio_ = get_bool_value("golden_ratio");
+	use_TV_ = get_bool_value("use_TV");
+	projections_per_recon_ = get_int_value("projections_per_recon");
+	iterations_ = get_int_value("iterations");
+	frame_readout_queue_ = boost::shared_ptr< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>());
+	frame_traj_queue_ = boost::shared_ptr< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>());
+
+	size_t bsize = sizeof(GadgetContainerMessage< hoNDArray< std::complex<float> > >)*dimensions_[0]*10;
+
+	frame_readout_queue_->high_water_mark(bsize);
+	frame_readout_queue_->low_water_mark(bsize);
+	frame_traj_queue_->high_water_mark(bsize);
+	frame_traj_queue_->low_water_mark(bsize);
+
+	return GADGET_OK;
+}
+
+int CMRTGadget::process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader > *m1,        // header
+		GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2,  // data
+		GadgetContainerMessage< hoNDArray<float> > *m3 )                 // traj/dcw
+{
+	// Throw away any noise samples if they have been allowed to pass this far down the chain...
+	//
+
+	bool is_noise = m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_IS_NOISE_MEASUREMENT);
+	if (is_noise) {
+		m1->release();
+		return GADGET_OK;
+	}
+
+	// First pass initialization
+	//
+
+	if (frame_readout_queue_->message_count() == 0 ) {
+		samples_per_readout_ = m1->getObjectPtr()->number_of_samples;
+		num_coils_ = m1->getObjectPtr()->active_channels;
+		dimensions_.push_back(m1->getObjectPtr()->active_channels);
+		dimensions_.push_back(repetitions_);
+		num_trajectory_dims_ = m3->getObjectPtr()->get_size(0); // 2 for trajectories only, 3 for both trajectories + dcw
+	}
+
+	int samples = m1->getObjectPtr()->number_of_samples;
+	int readout = m1->getObjectPtr()->idx.kspace_encode_step_1;
+	int repetition = m1->getObjectPtr()->idx.kspace_encode_step_2;
+
+	// Enqueue incoming readouts and trajectories
+	//
+
+	frame_readout_queue_->enqueue_tail(duplicate_array(m2));
+
+	//Only extract trajectories for first frame. Assume next frames are equal
+	if (frames.size() == 0 )
+		frame_traj_queue_->enqueue_tail(duplicate_array(m3));
+
+	// If the last readout for a slice has arrived then perform a reconstruction
+	//
+
+	bool is_last_scan_in_repetition =
+			m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_LAST_IN_REPETITION);
+
+	if (is_last_scan_in_repetition) {
+		num_frames++;
+		GDEBUG("FRAME # %d \n",num_frames);
+		// Get samples for frame
+		//
+		GDEBUG("Extracting samples \n");
+		frames.push_back(extract_samples_from_queue( frame_readout_queue_.get()));
+		// Get trajectories/dcw for frame - Only for first frame
+		//
+		if (frames.size() == 1 ){
+			extract_trajectory_and_dcw_from_queue( frame_traj_queue_.get(), this->traj, this->dcw);
+			GDEBUG("Extracting trajectory \n");
+		}
+
+		bool is_last_scan_in_slice= m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_LAST_IN_SLICE);
+		GDEBUG("Last scan in slice %i \n",is_last_scan_in_slice);
+		//If we have enough projections, get the show on the road
+		if (is_last_scan_in_slice ){
+			GDEBUG("Framing %i frames \n",num_frames);
+
+			if (num_frames%projections_per_recon_ != 0) {
+				GDEBUG("Number of frames must be divisible by the number of projections\n");
+				return GADGET_FAIL;
+			}
+			boost::shared_ptr<cuNDArray<float_complext> > data = get_combined_frames();
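+			// Split the trailing frame dimension into
+			// (projections per reconstruction) x (number of time frames).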
+			auto data_dims = data->get_dimensions();
+			size_t ntimeframes = data_dims->back()/projections_per_recon_;
+			data_dims->back() = projections_per_recon_;
+			data_dims->push_back(ntimeframes);
+			data->reshape(data_dims);
+			// Initialize plan
+			//
+			GDEBUG("Data size: %i %i %i",data->get_size(0),data->get_size(1),data->get_size(2));
+			boost::shared_ptr<cuNDArray<floatd2> >cu_traj(new cuNDArray<floatd2>(*traj));
+
+			std::vector<size_t> projection_dims;
+			projection_dims.push_back(dimensions_[0]*2);
+			projection_dims.push_back(dimensions_[1]);
+
+			projection_dims.push_back(projections_per_recon_);
+			projection_dims.push_back(ntimeframes);
+
+
+			//cuNDArray<float_complext> result(&image_space_dimensions_3D_);
+			boost::shared_ptr<CMRTOperator<float> > E(new CMRTOperator<float>);
+			E->setup(cu_traj,image_space_dimensions_3D_,projection_dims,0,golden_ratio_);
+
+			auto image_space_dimensions_4D = image_space_dimensions_3D_;
+			image_space_dimensions_4D.push_back(ntimeframes);
+			E->set_domain_dimensions(&image_space_dimensions_4D);
+			E->set_codomain_dimensions(data->get_dimensions().get());
+
+
+			boost::shared_ptr<cuNDArray<float_complext> > result;
+			//cuCgSolver<float_complext> solver;
+			//cuNlcgSolver<float_complext> solver;
+
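+			// With TV regularization enabled, run a split-Bregman solver with spatial
+			// (x, y, z) and temporal partial-derivative operators grouped as a TV term;
+			// otherwise fall back to the plain conjugate-gradient solve below.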
+			if (use_TV_){
+				cuSbcCgSolver<float_complext> solver;
+				solver.set_encoding_operator(E);
+				//solver.set_max_iterations(20);
+				solver.set_max_outer_iterations(iterations_);
+				solver.get_inner_solver()->set_max_iterations(10);
+				solver.set_tc_tolerance(1e-8f);
+				auto Rx1_ = boost::make_shared< cuPartialDerivativeOperator<float_complext,4> >(0);
+
+				auto Ry1_ = boost::make_shared< cuPartialDerivativeOperator<float_complext,4> >(1);
+				auto Rz1_ = boost::make_shared< cuPartialDerivativeOperator<float_complext,4> >(2);
+
+				auto Rt1_ = boost::make_shared< cuPartialDerivativeOperator2<float_complext,4> >();
+
+				Rx1_->set_domain_dimensions(&image_space_dimensions_4D);
+				Rx1_->set_codomain_dimensions(&image_space_dimensions_4D);
+
+				Ry1_->set_domain_dimensions(&image_space_dimensions_4D);
+				Ry1_->set_codomain_dimensions(&image_space_dimensions_4D);
+
+				Rz1_->set_domain_dimensions(&image_space_dimensions_4D);
+				Rz1_->set_codomain_dimensions(&image_space_dimensions_4D);
+
+				Rt1_->set_domain_dimensions(&image_space_dimensions_4D);
+				Rt1_->set_codomain_dimensions(&image_space_dimensions_4D);
+				float lambda = 2000;
+				float mu = 1000;
+				Rx1_ ->set_weight(lambda);
+				Ry1_ ->set_weight(lambda);
+				Rz1_ ->set_weight(lambda);
+				Rt1_->set_weight(lambda);
+				E->set_weight(mu);
+				solver.add_regularization_group_operator(Rx1_);
+				solver.add_regularization_group_operator(Ry1_);
+				solver.add_regularization_group_operator(Rz1_);
+				solver.add_regularization_group_operator(Rt1_);
+				solver.add_group();
+
+				solver.set_output_mode(cuCgSolver<float_complext>::OUTPUT_VERBOSE);
+
+				//*data *= *cu_dcw;
+
+				result = solver.solve(data.get());
+			} else
+			{
+				cuCgSolver<float_complext> solver;
+				//cuNlcgSolver<float_complext> solver;
+				solver.set_encoding_operator(E);
+				solver.set_max_iterations(iterations_);
+				solver.set_tc_tolerance(1e-8f);
+				solver.set_output_mode(cuCgSolver<float_complext>::OUTPUT_VERBOSE);
+
+				result = solver.solve(data.get());
+			}
+			//boost::shared_ptr<cuNDArray<float_complext> > result(new cuNDArray<float_complext>(&image_space_dimensions_3D_));
+			//E->mult_MH(data.get(),result.get());
+			GDEBUG(" Penguins report mission accomplished \n");
+
+
+			size_t nelements3d = std::accumulate(image_space_dimensions_3D_.begin(),image_space_dimensions_3D_.end(),1,std::multiplies<size_t>());
+
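+			// Emit one image message per time frame: alias the corresponding 3D chunk
+			// of the device-side solution, download it, and pass it down the chain.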
+			for (size_t i = 0; i < ntimeframes; i++){
+
+				// Define the image header
+				//
+
+				GadgetContainerMessage<ISMRMRD::ImageHeader> *cm1 =
+						new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+
+				GadgetContainerMessage< hoNDArray< std::complex<float> > > *cm2 =
+						new GadgetContainerMessage<hoNDArray< std::complex<float> > >();
+
+				cm1->getObjectPtr()->flags = 0;
+				cm1->cont(cm2);
+
+
+				cm1->getObjectPtr()->field_of_view[0]   = field_of_view_[0];
+				cm1->getObjectPtr()->field_of_view[1]   = field_of_view_[1];
+				cm1->getObjectPtr()->channels           = num_coils_;
+				cm1->getObjectPtr()->repetition         = m1->getObjectPtr()->idx.repetition;
+
+
+
+				memcpy(cm1->getObjectPtr()->patient_table_position,
+						m1->getObjectPtr()->patient_table_position, sizeof(float)*3);
+
+				cm1->getObjectPtr()->data_type = ISMRMRD::ISMRMRD_CXFLOAT;
+				cm1->getObjectPtr()->image_index = 0;
+				cm1->getObjectPtr()->image_series_index = 0;
+
+				// std::complex<float> and Gadgetron::complext<float> are binary compatible
+				cuNDArray<complext<float> > cuView(image_space_dimensions_3D_,result->get_data_ptr()+i*nelements3d);
+				boost::shared_ptr< hoNDArray< complext<float> > > host_result = cuView.to_host();
+				*cm2->getObjectPtr() = *((hoNDArray< std::complex<float> >*) host_result.get());
+
+				cm1->getObjectPtr()->matrix_size[0] = image_space_dimensions_3D_[0];
+				cm1->getObjectPtr()->matrix_size[1] = image_space_dimensions_3D_[1];
+				cm1->getObjectPtr()->matrix_size[2] = image_space_dimensions_3D_[2];
+				cm1->getObjectPtr()->channels       = 1;
+				cm1->getObjectPtr()->image_index    = i+1;
+
+				if (this->next()->putq(cm1) < 0) {
+					GDEBUG("Failed to put result image on to queue\n");
+					cm1->release();
+					return GADGET_FAIL;
+				}
+			}
+
+			num_frames = 0;
+		}
+
+	}
+
+	m1->release();
+	return GADGET_OK;
+}
+
+template<class T> GadgetContainerMessage< hoNDArray<T> >*
+CMRTGadget::duplicate_array( GadgetContainerMessage< hoNDArray<T> > *array )
+{
+	GadgetContainerMessage< hoNDArray<T> > *copy = new GadgetContainerMessage< hoNDArray<T> >();
+	*(copy->getObjectPtr()) = *(array->getObjectPtr());
+	return copy;
+}
+
+boost::shared_ptr< hoNDArray<float_complext> >
+CMRTGadget::extract_samples_from_queue ( ACE_Message_Queue<ACE_MT_SYNCH> *queue )
+{
+	if(!queue) {
+		GDEBUG("Illegal queue pointer, cannot extract samples\n");
+		throw std::runtime_error("CMRTGadget::extract_samples_from_queue: illegal queue pointer");
+	}
+
+	unsigned int readouts_buffered = queue->message_count();
+
+	std::vector<size_t> dims;
+	dims.push_back(samples_per_readout_);
+	dims.push_back(readouts_buffered);
+	dims.push_back(num_coils_);
+
+	boost::shared_ptr< hoNDArray<float_complext> > host_samples(new hoNDArray<float_complext>(dims));
+
+	for (unsigned int p=0; p<readouts_buffered; p++) {
+
+		ACE_Message_Block* mbq;
+		if (queue->dequeue_head(mbq) < 0) {
+			GDEBUG("Message dequeue failed\n");
+			throw std::runtime_error("CMRTGadget::extract_samples_from_queue: dequeuing failed");
+		}
+
+		GadgetContainerMessage< hoNDArray< std::complex<float> > > *daq = AsContainerMessage<hoNDArray< std::complex<float> > >(mbq);
+
+		if (!daq) {
+			GDEBUG("Unable to interpret data on message queue\n");
+			throw std::runtime_error("CMRTGadget::extract_samples_from_queue: failed to interpret data");
+		}
+
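+		// Copy this readout's samples coil by coil; the output array is ordered
+		// (sample, readout, coil), so coil c of readout p starts at
+		// c*samples_per_readout_*readouts_buffered + p*samples_per_readout_.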
+		for (unsigned int c = 0; c < num_coils_; c++) {
+
+			float_complext *data_ptr = host_samples->get_data_ptr();
+			data_ptr += c*samples_per_readout_*readouts_buffered+p*samples_per_readout_;
+
+			std::complex<float> *r_ptr = daq->getObjectPtr()->get_data_ptr();
+			r_ptr += c*daq->getObjectPtr()->get_size(0);
+
+			memcpy(data_ptr, r_ptr, samples_per_readout_*sizeof(float_complext));
+		}
+
+		mbq->release();
+	}
+
+	return host_samples;
+}
+
+boost::shared_ptr< hoNDArray<float> >
+CMRTGadget::extract_trajectory_from_queue ( ACE_Message_Queue<ACE_MT_SYNCH> *queue )
+{
+	if(!queue) {
+		GDEBUG("Illegal queue pointer, cannot extract trajectory\n");
+		throw std::runtime_error("CMRTGadget::extract_trajectory_from_queue: illegal queue pointer");
+	}
+
+	unsigned int readouts_buffered = queue->message_count();
+
+	std::vector<size_t> dims;
+	dims.push_back(num_trajectory_dims_); // 2 for trajectories only, 3 for both trajectories + dcw
+	dims.push_back(samples_per_readout_);
+	dims.push_back(readouts_buffered);
+
+	boost::shared_ptr< hoNDArray<float> > host_traj(new hoNDArray<float>(&dims));
+
+	for (unsigned int p=0; p<readouts_buffered; p++) {
+		ACE_Message_Block* mbq;
+		if (queue->dequeue_head(mbq) < 0) {
+			GDEBUG("Message dequeue failed\n");
+			throw std::runtime_error("CMRTGadget::extract_trajectory_from_queue: dequeuing failed");
+		}
+
+		GadgetContainerMessage< hoNDArray<float> > *daq = AsContainerMessage<hoNDArray<float> >(mbq);
+
+		if (!daq) {
+			GDEBUG("Unable to interpret data on message queue\n");
+			throw std::runtime_error("CMRTGadget::extract_trajectory_from_queue: failed to interpret data");
+		}
+
+		float *data_ptr = host_traj->get_data_ptr();
+		data_ptr += num_trajectory_dims_*samples_per_readout_*p;
+
+		float *r_ptr = daq->getObjectPtr()->get_data_ptr();
+
+		memcpy(data_ptr, r_ptr, num_trajectory_dims_*samples_per_readout_*sizeof(float));
+
+		mbq->release();
+	}
+
+	return host_traj;
+}
+
+void CMRTGadget::extract_trajectory_and_dcw_from_queue
+( ACE_Message_Queue<ACE_MT_SYNCH> *queue, boost::shared_ptr< hoNDArray<floatd2> > & traj, boost::shared_ptr< hoNDArray<float> > & dcw )
+{
+	// Extract trajectory and (if present) density compensation weights.
+	// They are stored as a float array of dimensions: {2,3} x #samples_per_readout x #readouts.
+	// We need
+	// - a floatd2 trajectory array
+	// - a float dcw array
+	//
+
+	if( num_trajectory_dims_ == 2 ){
+		//This is an evil evil hack to get the trajectories out. Ohh the horror.
+		boost::shared_ptr<hoNDArray<float> > tmp_traj = extract_trajectory_from_queue( queue );
+		std::vector<size_t> dims_1d; dims_1d.push_back(tmp_traj->get_size(1)*tmp_traj->get_size(2));
+		traj = boost::shared_ptr<hoNDArray<floatd2> >(new hoNDArray<floatd2>(&dims_1d));
+		memcpy(traj->get_data_ptr(),tmp_traj->get_data_ptr(),tmp_traj->get_number_of_elements()*sizeof(float));
+
+
+	}
+	else{
+
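+		// The queued array holds {kx, ky, dcw} triples per sample. Permute the
+		// component dimension to the end, alias the third component as the dcw
+		// array, and re-interleave the first two components into floatd2 points.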
+		boost::shared_ptr< hoNDArray<float> > host_traj_dcw = extract_trajectory_from_queue( queue );
+
+		std::vector<size_t> order;
+		order.push_back(1); order.push_back(2); order.push_back(0);
+
+		boost::shared_ptr< hoNDArray<float> > host_traj_dcw_shifted = permute( host_traj_dcw.get(), &order );
+
+		std::vector<size_t> dims_1d;
+		dims_1d.push_back(host_traj_dcw_shifted->get_size(0)*host_traj_dcw_shifted->get_size(1));
+
+		dcw = boost::shared_ptr<hoNDArray<float> > (new hoNDArray<float>(&dims_1d, host_traj_dcw_shifted->get_data_ptr()+2*dims_1d[0]));
+
+
+		std::vector<size_t> dims_2d = dims_1d; dims_2d.push_back(2);
+		order.clear(); order.push_back(1); order.push_back(0);
+
+
+		hoNDArray<float> tmp(&dims_2d, host_traj_dcw_shifted->get_data_ptr());
+
+		boost::shared_ptr< hoNDArray<float> > _traj = permute( &tmp, &order );
+
+		traj = boost::shared_ptr<hoNDArray<floatd2> > (new hoNDArray<floatd2>(&dims_1d, (floatd2*)_traj->get_data_ptr()));
+	}
+
+	std::vector<size_t >dims_2d;
+	dims_2d.push_back(traj->get_number_of_elements());
+	dims_2d.push_back(1); // Number of frames
+
+	traj->reshape(&dims_2d);
+	if( num_trajectory_dims_ == 3 ) dcw->reshape(&dims_2d);
+}
+
+boost::shared_ptr<cuNDArray<float_complext> > CMRTGadget::get_combined_frames(){
+	if (frames.size() == 0)
+		throw std::runtime_error("No frames received. This should not be possible. Your RAM might be replaced with live salmon, or you may have set the expected number of frames to 0");
+
+	for (unsigned int i = 1; i < frames.size(); i++){
+		if (!frames[0]->dimensions_equal(frames[i].get()))
+			throw std::runtime_error("CMRTGadget: Frames received do not have equal size");
+	}
+	//Get data dimensions. Assume all frames have the same dimensions
+	std::vector<size_t> dims = *frames[0]->get_dimensions();
+	dims.push_back(frames.size());
+
+	boost::shared_ptr<cuNDArray<float_complext> > combined(new cuNDArray<float_complext>(dims));
+
+	//Copy data into 1 array on device
+	size_t offset = 0;
+	for (unsigned int i = 0; i < frames.size(); i++){
+		cudaMemcpy(combined->get_data_ptr()+offset,frames[i]->get_data_ptr(),frames[i]->get_number_of_elements()*sizeof(float_complext),cudaMemcpyHostToDevice);
+		offset += frames[i]->get_number_of_elements();
+	}
+
+	frames.clear();
+	return combined;
+}
+
+GADGET_FACTORY_DECLARE(CMRTGadget)
+}
diff --git a/gadgets/hyper/CMRTGadget.h b/gadgets/hyper/CMRTGadget.h
new file mode 100644
index 0000000..a7d6211
--- /dev/null
+++ b/gadgets/hyper/CMRTGadget.h
@@ -0,0 +1,79 @@
+#pragma once
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "cuNDArray.h"
+#include "gadgetron_hyper_export.h"
+
+#include "CMRTOperator.h"
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETSHYPER CMRTGadget :
+    public Gadget3< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> >, hoNDArray<float> >
+  {
+    
+  public:
+    
+    CMRTGadget(): num_frames(0) {
+    	set_parameter("golden_ratio","false");
+    	set_parameter("use_TV","false");
+    	set_parameter("projections_per_recon","0");
+    	set_parameter("iterations","30");
+    }
+    ~CMRTGadget() {}
+    
+  protected:
+    
+    virtual int process_config(ACE_Message_Block* mb);
+
+    virtual int process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader > *m1,        // header
+                        GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2,  // data
+                        GadgetContainerMessage< hoNDArray<float> > *m3 );                // traj/dcw
+
+  protected:
+        
+    template<class T> GadgetContainerMessage< hoNDArray<T> >* 
+      duplicate_array( GadgetContainerMessage< hoNDArray<T> > *array );        
+    
+    boost::shared_ptr< hoNDArray<float_complext> > 
+      extract_samples_from_queue ( ACE_Message_Queue<ACE_MT_SYNCH> *queue );
+    
+    boost::shared_ptr< hoNDArray<float> >
+      extract_trajectory_from_queue ( ACE_Message_Queue<ACE_MT_SYNCH> *queue );
+    
+    void extract_trajectory_and_dcw_from_queue
+      ( ACE_Message_Queue<ACE_MT_SYNCH> *queue, boost::shared_ptr< hoNDArray<floatd2> > & traj, boost::shared_ptr< hoNDArray<float> > & dcw  );
+
+    /***
+     * Combines all stored frames and resets the frame buffer
+     */
+    boost::shared_ptr<cuNDArray<float_complext> > get_combined_frames();
+
+  protected:
+
+    std::vector<size_t> image_space_dimensions_3D_;
+    unsigned int projections_per_recon_;
+
+
+    
+    boost::shared_ptr< ACE_Message_Queue<ACE_MT_SYNCH> > frame_readout_queue_;
+    boost::shared_ptr< ACE_Message_Queue<ACE_MT_SYNCH> > frame_traj_queue_;
+    std::vector<size_t> dimensions_;
+    std::vector<float> field_of_view_;
+    size_t repetitions_;
+    size_t samples_per_readout_;
+    size_t num_coils_;
+    size_t num_trajectory_dims_; // 2 for trajectories only, 3 for both trajectories + dcw
+
+    std::vector<boost::shared_ptr<hoNDArray<float_complext> > > frames;
+    boost::shared_ptr<hoNDArray<float> > dcw;
+    boost::shared_ptr<hoNDArray<floatd2> > traj;
+    unsigned int num_frames;
+    unsigned int iterations_;
+    bool golden_ratio_;
+    bool use_TV_;
+  };
+}
diff --git a/gadgets/hyper/CMakeLists.txt b/gadgets/hyper/CMakeLists.txt
new file mode 100644
index 0000000..68207c8
--- /dev/null
+++ b/gadgets/hyper/CMakeLists.txt
@@ -0,0 +1,70 @@
+IF (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_HYPER__)
+ENDIF (WIN32)
+
+find_package(Ismrmrd REQUIRED)
+
+include_directories(
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/fft/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/fft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/arma_math
+  ${CMAKE_SOURCE_DIR}/toolboxes/mri/hyper
+  ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/mri_core
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  ${CMAKE_SOURCE_DIR}/toolboxes/dwt/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools
+  ${CMAKE_SOURCE_DIR}/gadgets/mri_core
+  ${CMAKE_SOURCE_DIR}/gadgets/pmri
+  ${HDF5_INCLUDE_DIR}
+  ${HDF5_INCLUDE_DIR}/cpp
+  ${ARMADILLO_INCLUDE_DIRS}
+)
+
+add_library(gadgetron_hyper SHARED 
+  NFFT2DGadget.h NFFT2DGadget.cpp
+  CMRT3DGadget.h CMRT3DGadget.cpp
+  CMRTGadget.h CMRTGadget.cpp
+  CSIGadget.h CSIGadget.cpp
+  gpuCSICoilEstimationGadget.cpp
+  
+  )
+
+set_target_properties(gadgetron_hyper PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(gadgetron_hyper
+  gadgetron_gadgetbase
+  gadgetron_toolbox_gpunfft
+  gadgetron_toolbox_gpucore
+  gadgetron_toolbox_cpucore
+  gadgetron_toolbox_cpufft
+  gadgetron_toolbox_gpudwt
+  gadgetron_toolbox_cpucore_math
+  gadgetron_toolbox_gpusolvers
+  gadgetron_toolbox_gpuoperators
+  gadgetron_toolbox_gpuparallelmri
+  gadgetron_toolbox_hyper
+  ${ISMRMRD_LIBRARIES}  
+  ${FFTW3_LIBRARIES} 
+  optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} 
+  ${BOOST_LIBRARIES}
+  )
+
+install(FILES 
+  NFFT2DGadget.h
+  CMRTGadget.h
+  CMRT3DGadget.h
+  CSIGadget.h
+  gadgetron_hyper_export.h
+  DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH})
+
+install(FILES NFFT2D.xml CMRT3D.xml CMRT.xml DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH})
+
+install(TARGETS gadgetron_hyper DESTINATION lib COMPONENT main)
diff --git a/gadgets/hyper/CSIGadget.cpp b/gadgets/hyper/CSIGadget.cpp
new file mode 100644
index 0000000..a19c3cc
--- /dev/null
+++ b/gadgets/hyper/CSIGadget.cpp
@@ -0,0 +1,330 @@
+/*
+ * CSIGadget.cpp
+ *
+ *  Created on: Nov 11, 2014
+ *      Author: dch
+ */
+
+#include "CSIGadget.h"
+#include <ismrmrd/xml.h>
+#include "cudaDeviceManager.h"
+#include "cuNDArray_utils.h"
+#include "cuNlcgSolver.h"
+#include "eigenTester.h"
+#include "CSfreqOperator.h"
+#include "cuPartialDerivativeOperator.h"
+#include "cuDWTOperator.h"
+#include <boost/make_shared.hpp>
+namespace Gadgetron {
+
+CSIGadget::CSIGadget() {
+	// TODO Auto-generated constructor stub
+
+}
+
+CSIGadget::~CSIGadget() {
+	// TODO Auto-generated destructor stub
+}
+
+
+int CSIGadget::process_config(ACE_Message_Block *mb){
+	//GDEBUG("gpuCgSenseGadget::process_config\n");
+
+	device_number_ = get_int_value("deviceno");
+
+	int number_of_devices = 0;
+	if (cudaGetDeviceCount(&number_of_devices)!= cudaSuccess) {
+		GDEBUG( "Error: unable to query number of CUDA devices.\n" );
+		return GADGET_FAIL;
+	}
+
+	if (number_of_devices == 0) {
+		GDEBUG( "Error: No available CUDA devices.\n" );
+		return GADGET_FAIL;
+	}
+
+	if (device_number_ >= number_of_devices) {
+		GDEBUG("Adjusting device number from %d to %d\n", device_number_,  (device_number_%number_of_devices));
+		device_number_ = (device_number_%number_of_devices);
+	}
+
+	if (cudaSetDevice(device_number_)!= cudaSuccess) {
+		GDEBUG( "Error: unable to set CUDA device.\n" );
+		return GADGET_FAIL;
+	}
+
+	pass_on_undesired_data_ = get_bool_value("pass_on_undesired_data");
+	cg_limit_ = get_double_value("cg_limit");
+	oversampling_factor_ = get_double_value("oversampling_factor");
+	kernel_width_ = get_double_value("kernel_width");
+	kappa_ = get_double_value("kappa");
+	output_convergence_ = get_bool_value("output_convergence");
+	number_of_sb_iterations_ = get_int_value("number_of_sb_iterations");
+	number_of_cg_iterations_ = get_int_value("number_of_cg_iterations");
+
+
+	mu_ = get_double_value("mu");
+
+	// Get the Ismrmrd header
+	//
+	ISMRMRD::IsmrmrdHeader h;
+	ISMRMRD::deserialize(mb->rd_ptr(),h);
+
+
+
+	if (h.encoding.size() != 1) {
+		GDEBUG("This Gadget only supports one encoding space\n");
+		return GADGET_FAIL;
+	}
+
+	// Get the encoding space and trajectory description
+	ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+	ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+	ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+
+	img_dims_ = {r_space.matrixSize.x,r_space.matrixSize.y,r_space.matrixSize.z};
+
+	matrix_size_ = {r_space.matrixSize.x,r_space.matrixSize.y};
+
+	unsigned int warp_size = cudaDeviceManager::Instance()->warp_size(device_number_);
+
+	matrix_size_os_ =
+			uint64d2(((static_cast<unsigned int>(std::ceil(matrix_size_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size,
+					((static_cast<unsigned int>(std::ceil(matrix_size_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size);
+
+
+		if (h.acquisitionSystemInformation) {
+			channels_ = h.acquisitionSystemInformation->receiverChannels ? *h.acquisitionSystemInformation->receiverChannels : 1;
+		} else {
+			channels_ = 1;
+		}
+
+
+		/*if (~h.userParameters.is_present()){
+			GDEBUG("CSI gadget requires userparameters to be set to obtain timesteps.");
+			return GADGET_FAIL;
+		}*/
+
+
+		auto parameters = h.userParameters->userParameterDouble;
+		auto bw = std::find_if(parameters.begin(),parameters.end(), [](ISMRMRD::UserParameterDouble d) { return d.name=="bw";});
+
+		if (bw == parameters.end()){
+			GDEBUG("CSI gadget: User parameter bw is missing.\n");
+			return GADGET_FAIL;
+		}
+
+
+		auto dte = std::find_if(parameters.begin(),parameters.end(), [](ISMRMRD::UserParameterDouble d) { return d.name=="dte";});
+		if (dte == parameters.end()){
+			GDEBUG("CSI gadget: User parameter dte is missing.\n");
+			return GADGET_FAIL;
+		}
+
+		// Allocate encoding operator for non-Cartesian Sense
+		E_ = boost::make_shared< CSIOperator<float> >(1/bw->value, -dte->value);
+		E_->set_weight(mu_);
+
+		std::vector<float> freqs;
+		auto frequency_string = get_string_value("frequencies");
+
+		if (*frequency_string != ""){
+			std::stringstream stream(*frequency_string);
+			float freq;
+			while(stream >> freq)
+				freqs.push_back(freq);
+		} else {
+			float frequency_max = get_double_value("frequency_max");
+			float frequency_min = get_double_value("frequency_min");
+			float frequency_step = get_double_value("frequency_step");
+			for (float f = frequency_min; f <= frequency_max; f+= frequency_step)
+				freqs.push_back(f);
+		}
+
+		if (freqs.size() == 0)
+			throw std::runtime_error("CSIGadget: Frequencies not set!");
+		E_->set_frequencies(freqs);
+
+		img_dims_[2]=freqs.size();
+
+		S_ = boost::make_shared<cuNonCartesianSenseOperator<float,2>>();
+
+		E_->set_senseOp(S_);
+
+
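+		// Regularization terms: an identity operator, a spatial TV group over the
+		// x/y partial derivatives, and two (shifted) discrete wavelet transforms,
+		// each weighted by 2*mu.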
+		auto idOp = boost::make_shared<identityOperator<cuNDArray<float_complext>>>();
+		idOp->set_domain_dimensions(&img_dims_);
+		idOp->set_codomain_dimensions(&img_dims_);
+		idOp->set_weight(2*mu_);
+		solver_.add_regularization_operator(idOp);
+
+		auto dX = boost::make_shared<cuPartialDerivativeOperator<float_complext,3>>(0);
+		dX->set_domain_dimensions(&img_dims_);
+		dX->set_codomain_dimensions(&img_dims_);
+		dX->set_weight(2*mu_);
+		auto dY = boost::make_shared<cuPartialDerivativeOperator<float_complext,3>>(1);
+		dY->set_domain_dimensions(&img_dims_);
+		dY->set_codomain_dimensions(&img_dims_);
+		dY->set_weight(2*mu_);
+		auto dZ = boost::make_shared<cuPartialDerivativeOperator<float_complext,3>>(2);
+		dZ->set_domain_dimensions(&img_dims_);
+		dZ->set_codomain_dimensions(&img_dims_);
+		dZ->set_weight(2*mu_);
+
+
+		solver_.add_regularization_group_operator(dX);
+		solver_.add_regularization_group_operator(dY);
+
+		//solver_.add_regularization_group_operator(dZ);
+		solver_.add_group();
+
+		auto W = boost::make_shared<cuDWTOperator<float_complext,3>>();
+		W->set_domain_dimensions(&img_dims_);
+		W->set_codomain_dimensions(&img_dims_);
+		W->set_weight(2*mu_);
+
+
+		auto W2 = boost::make_shared<cuDWTOperator<float_complext,3>>();
+		W2->set_shift(2);
+		W2->set_domain_dimensions(&img_dims_);
+		W2->set_codomain_dimensions(&img_dims_);
+		W2->set_weight(2*mu_);
+		solver_.add_regularization_operator(W);
+		solver_.add_regularization_operator(W2);
+		// Setup solver
+		solver_.set_encoding_operator( E_ );        // encoding matrix
+		solver_.set_max_outer_iterations( number_of_sb_iterations_ );
+		solver_.set_max_inner_iterations(1);
+		solver_.get_inner_solver()->set_max_iterations(number_of_cg_iterations_);
+		solver_.get_inner_solver()->set_tc_tolerance( cg_limit_ );
+		solver_.set_output_mode( (output_convergence_) ? cuCgSolver<float_complext>::OUTPUT_VERBOSE : cuCgSolver<float_complext>::OUTPUT_SILENT);
+		is_configured_ = true;
+	return GADGET_OK;
+
+}
+
+int CSIGadget::process(GadgetContainerMessage<cuSenseData>* m1){
+
+
+	if (!is_configured_) {
+		GDEBUG("\nData received before configuration complete\n");
+		return GADGET_FAIL;
+	}
+
+
+	GDEBUG("CSI is on the job\n");
+
+
+
+	auto traj = m1->getObjectPtr()->traj;
+
+
+	auto data = m1->getObjectPtr()->data;
+	auto csm =m1->getObjectPtr()->csm;
+	auto dcw = m1->getObjectPtr()->dcw;
+	//dcw.reset();
+	auto permutations = std::vector<size_t>{0,2,1};
+	data= permute(data.get(),&permutations);
+
+	if (dcw)
+		sqrt_inplace(dcw.get());
+
+
+	E_->set_domain_dimensions(&img_dims_);
+	E_->set_codomain_dimensions(data->get_dimensions().get());
+
+	std::vector<size_t> sense_dims = *data->get_dimensions();
+	sense_dims[1] = img_dims_[2];
+
+
+	S_->set_domain_dimensions(&img_dims_);
+	S_->set_codomain_dimensions(&sense_dims);
+/*
+	{
+		GDEBUG("Removing CSM maps");
+		auto csm_dims = *csm->get_dimensions();
+		csm_dims.pop_back();
+		cuNDArray<float_complext> csm_view(csm_dims,csm->get_data_ptr());
+		fill(&csm_view,complext<float>(1,0));
+		size_t nelements = csm_view.get_number_of_elements();
+		for (int  i = 1; i< csm->get_size(2); i++){
+			cuNDArray<float_complext> csm_view2(csm_dims,csm->get_data_ptr()+i*nelements);
+			clear(&csm_view2);
+		}
+	}
+*/
+	S_->set_csm(csm);
+	S_->set_dcw(dcw);
+	S_->setup( matrix_size_, matrix_size_os_, kernel_width_ );
+	S_->preprocess(traj.get());
+
+	GDEBUG("Setup done, solving....\n");
+	/*
+	eigenTester<cuNDArray<float_complext>> tester;
+	std::vector<float> freqs{  -575.1223,-450.1223,-360.1223,  -183.1223,140.8777};
+	auto T_ = boost::make_shared<CSfreqOperator>(E_->get_pointtime(),E_->get_echotime());
+	T_->set_frequencies(freqs);
+	T_->set_codomain_dimensions(data->get_dimensions().get());
+
+	std::vector<size_t> tim_dims = *data->get_dimensions();
+	tim_dims[2] = freqs.size();
+	T_->set_domain_dimensions(&tim_dims);
+
+	tester.add_encoding_operator(E_);
+
+	float_complext eigVal = tester.get_smallest_eigenvalue();
+
+	GDEBUG("Smallest eigenvalue: %f %f /n",real(eigVal),imag(eigVal));
+*/
+	/*
+	cuNlcgSolver<float_complext> solv;
+	//cuCgSolver<float_complext> solv;
+	solv.set_output_mode(cuCgSolver<float_complext>::OUTPUT_VERBOSE);
+	solv.set_max_iterations(10);
+	solv.set_encoding_operator(E_);
+	solv.set_tc_tolerance(1e-8f);
+	*/
+	auto result = solver_.solve(data.get());
+	//auto result = solv.solve(data.get());
+
+	//E_->mult_MH(data.get(),result.get(),false);
+
+	GDEBUG("Image sum: %f \n",asum(result.get()));
+	m1->release();
+
+	GDEBUG("Solver done, next patient...");
+
+	GadgetContainerMessage< hoNDArray< std::complex<float> > > *cm =
+			new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+
+	GadgetContainerMessage<ISMRMRD::ImageHeader> *m =
+			new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+
+
+	m->cont(cm);
+
+
+
+	result->to_host((hoNDArray<float_complext>*)cm->getObjectPtr());
+
+	GDEBUG("Result size: %i %i %i \n",result->get_size(0),result->get_size(1),result->get_size(2));
+
+	m->getObjectPtr()->matrix_size[0] = img_dims_[0];
+	m->getObjectPtr()->matrix_size[1] = img_dims_[1];
+	m->getObjectPtr()->matrix_size[2] = img_dims_[2];
+	m->getObjectPtr()->channels       = 1;
+	m->getObjectPtr()->image_index    = 1;
+
+
+	if (this->next()->putq(m) < 0){
+		GDEBUG("Failed to put image on queue\n");
+		m->release();
+		return GADGET_FAIL;
+	}
+
+
+	return GADGET_OK;
+}
+
+  GADGET_FACTORY_DECLARE(CSIGadget)
+
+} /* namespace Gadgetron */
diff --git a/gadgets/hyper/CSIGadget.h b/gadgets/hyper/CSIGadget.h
new file mode 100644
index 0000000..2a7ef22
--- /dev/null
+++ b/gadgets/hyper/CSIGadget.h
@@ -0,0 +1,52 @@
+/*
+ * CSIGadget.h
+ *
+ *  Created on: Nov 11, 2014
+ *      Author: dch
+ */
+
+#ifndef CSIGADGET_H_
+#define CSIGADGET_H_
+
+#include "Gadget.h"
+#include <ismrmrd/ismrmrd.h>
+#include "GenericReconJob.h"
+#include "CSIOperator.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuSbcCgSolver.h"
+#include "gpuCSICoilEstimationGadget.h"
+namespace Gadgetron {
+
+class CSIGadget: public Gadgetron::Gadget1<cuSenseData>{
+public:
+	CSIGadget();
+	virtual ~CSIGadget();
+
+	virtual int process(GadgetContainerMessage<cuSenseData>* m1);
+    virtual int process_config( ACE_Message_Block* mb );
+
+    int device_number_;
+    unsigned int number_of_cg_iterations_;
+    unsigned int number_of_sb_iterations_;
+    float cg_limit_;
+    float oversampling_factor_;
+    float kernel_width_;
+    float kappa_;
+    float mu_;
+    float lambda_;
+    bool output_convergence_;
+
+    std::vector<size_t> img_dims_;
+
+    uint64d2 matrix_size_;
+    uint64d2 matrix_size_os_;
+
+    boost::shared_ptr<CSIOperator<float> > E_;
+    boost::shared_ptr<cuNonCartesianSenseOperator<float,2> > S_;
+    cuSbcCgSolver<float_complext> solver_;
+    bool is_configured_;
+    unsigned int channels_;
+};
+
+} /* namespace Gadgetron */
+#endif /* CSIGADGET_H_ */
diff --git a/gadgets/hyper/NFFT2D.xml b/gadgets/hyper/NFFT2D.xml
new file mode 100644
index 0000000..6c96c72
--- /dev/null
+++ b/gadgets/hyper/NFFT2D.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+        
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+  
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+  
+    <gadget>
+      <name>NFFT</name>
+      <dll>gadgetron_hyper</dll>
+      <classname>NFFT2DGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>  
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/hyper/NFFT2DGadget.cpp b/gadgets/hyper/NFFT2DGadget.cpp
new file mode 100644
index 0000000..5b439b7
--- /dev/null
+++ b/gadgets/hyper/NFFT2DGadget.cpp
@@ -0,0 +1,365 @@
+#include "NFFT2DGadget.h"
+#include "cuNFFT.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_utils.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_utils.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "ismrmrd/xml.h"
+#include "radial_utilities.h"
+#include <cmath>
+
+namespace Gadgetron{
+
+  int NFFT2DGadget::process_config(ACE_Message_Block* mb)
+  {
+
+  	ISMRMRD::IsmrmrdHeader h;
+  	ISMRMRD::deserialize(mb->rd_ptr(),h);
+
+
+    if (h.encoding.size() != 1) {
+      GDEBUG("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+
+    ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+    ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+    ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+
+
+    GDEBUG("Matrix size: %d, %d, %d\n", e_space.matrixSize.x, e_space.matrixSize.y, e_space.matrixSize.z);
+    dimensions_.push_back(r_space.matrixSize.x);
+    dimensions_.push_back(r_space.matrixSize.y);
+
+    field_of_view_.push_back(e_space.fieldOfView_mm.x);
+    field_of_view_.push_back(e_space.fieldOfView_mm.y);
+    GDEBUG("FOV: %f, %f\n", r_space.fieldOfView_mm.x, r_space.fieldOfView_mm.y);
+
+    repetitions_ = e_limits.repetition.is_present() ? e_limits.repetition.get().maximum + 1 : 1;
+    GDEBUG("#Repetitions: %d\n", repetitions_);
+
+    // Allocate readout and trajectory/dcw queues
+    //
+
+    frame_readout_queue_ = boost::shared_ptr< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>());
+    frame_traj_queue_ = boost::shared_ptr< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>());
+    
+    size_t bsize = sizeof(GadgetContainerMessage< hoNDArray< std::complex<float> > >)*dimensions_[0]*10;
+    
+    frame_readout_queue_->high_water_mark(bsize);
+    frame_readout_queue_->low_water_mark(bsize);
+    frame_traj_queue_->high_water_mark(bsize);
+    frame_traj_queue_->low_water_mark(bsize);
+
+    return GADGET_OK;
+  }
+
+  int NFFT2DGadget::process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader > *m1,        // header
+                            GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2,  // data
+                            GadgetContainerMessage< hoNDArray<float> > *m3 )                 // traj/dcw
+  {    
+    // Throw away any noise samples if they have been allowed to pass this far down the chain...
+    //
+    
+  	bool is_noise = m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_IS_NOISE_MEASUREMENT);
+    if (is_noise) { 
+      m1->release();
+      return GADGET_OK;
+    }
+    
+    // First pass initialization
+    //
+    
+    if (frame_readout_queue_->message_count() == 0 ) {      
+      samples_per_readout_ = m1->getObjectPtr()->number_of_samples;
+      num_coils_ = m1->getObjectPtr()->active_channels;      
+      dimensions_.push_back(m1->getObjectPtr()->active_channels);
+      dimensions_.push_back(repetitions_);
+      num_trajectory_dims_ = m3->getObjectPtr()->get_size(0); // 2 for trajectories only, 3 for both trajectories + dcw
+    }
+
+    int samples = m1->getObjectPtr()->number_of_samples;
+    int readout = m1->getObjectPtr()->idx.kspace_encode_step_1;
+    int repetition = m1->getObjectPtr()->idx.kspace_encode_step_2;
+
+    // Enqueue incoming readouts and trajectories
+    //
+
+    frame_readout_queue_->enqueue_tail(duplicate_array(m2));
+    frame_traj_queue_->enqueue_tail(duplicate_array(m3));
+    
+    // If the last readout for a slice has arrived then perform a reconstruction
+    //
+
+    bool is_last_scan_in_repetition = 
+    		m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_LAST_IN_REPETITION);
+
+    if (is_last_scan_in_repetition) {
+
+
+      // Define the image header
+      //
+
+      GadgetContainerMessage<ISMRMRD::ImageHeader> *cm1 = 
+        new GadgetContainerMessage<ISMRMRD::ImageHeader>();      
+      
+      GadgetContainerMessage< hoNDArray< std::complex<float> > > *cm2 = 
+        new GadgetContainerMessage<hoNDArray< std::complex<float> > >();
+      
+      cm1->getObjectPtr()->flags = 0;
+      cm1->cont(cm2);
+    
+      cm1->getObjectPtr()->matrix_size[0]     = dimensions_[0];
+      cm1->getObjectPtr()->matrix_size[1]     = dimensions_[1];
+      cm1->getObjectPtr()->matrix_size[2]     = 1;
+      cm1->getObjectPtr()->field_of_view[0]   = field_of_view_[0];
+      cm1->getObjectPtr()->field_of_view[1]   = field_of_view_[1];
+      cm1->getObjectPtr()->channels           = num_coils_;
+      cm1->getObjectPtr()->repetition         = m1->getObjectPtr()->idx.repetition;
+
+      memcpy(cm1->getObjectPtr()->position,
+             m1->getObjectPtr()->position,
+             sizeof(float)*3);
+
+      memcpy(cm1->getObjectPtr()->read_dir,
+             m1->getObjectPtr()->read_dir,
+             sizeof(float)*3);
+
+      memcpy(cm1->getObjectPtr()->phase_dir,
+             m1->getObjectPtr()->phase_dir,
+             sizeof(float)*3);
+
+      memcpy(cm1->getObjectPtr()->slice_dir,
+             m1->getObjectPtr()->slice_dir,
+             sizeof(float)*3);
+
+      memcpy(cm1->getObjectPtr()->patient_table_position,
+             m1->getObjectPtr()->patient_table_position, sizeof(float)*3);
+
+      cm1->getObjectPtr()->data_type = ISMRMRD::ISMRMRD_CXFLOAT;
+      cm1->getObjectPtr()->image_index = 0;
+      cm1->getObjectPtr()->image_series_index = 0;
+
+      //
+      // Perform reconstruction of repetition
+      //
+      
+      // Get samples for frame
+      //
+
+      cuNDArray<float_complext> samples( extract_samples_from_queue( frame_readout_queue_.get()).get() );
+
+      // Get trajectories/dcw for frame
+      //
+      
+      boost::shared_ptr< cuNDArray<floatd2> > traj(new cuNDArray<floatd2>);
+      boost::shared_ptr<cuNDArray<float> > dcw(new cuNDArray<float>);
+
+      extract_trajectory_and_dcw_from_queue( frame_traj_queue_.get(), traj.get(), dcw.get() );
+      //traj = compute_radial_trajectory_golden_ratio_2d<float>(samples_per_readout_,dimensions_[1],1,0,GR_ORIGINAL);
+
+      unsigned int num_profiles = samples.get_number_of_elements()/samples_per_readout_;
+      dcw = compute_radial_dcw_golden_ratio_2d<float>(samples_per_readout_,num_profiles,1.0,1.0f/samples_per_readout_/num_profiles,0,GR_ORIGINAL);
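+      // Note: density compensation weights are recomputed analytically for a
+      // golden-ratio radial trajectory here, overriding any dcw extracted above.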
+      // Create output array
+      //
+
+
+      std::vector<size_t> img_dims(2);
+      img_dims[0] = dimensions_[0];
+      img_dims[1] = dimensions_[1];
+      cm2->getObjectPtr()->create(&img_dims);
+      cuNDArray<float_complext> image(&img_dims);
+      
+      // Initialize plan
+      //
+      
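+      // The NFFT plan grids onto a matrix oversampled by a factor of two
+      // (the size vector is shifted left by one bit) with a convolution
+      // kernel width of 5.5 samples.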
+      const float kernel_width = 5.5f;
+      cuNFFT_plan<float,2> plan( from_std_vector<size_t,2>(img_dims), from_std_vector<size_t,2>(img_dims)<<1, kernel_width );
+      plan.preprocess( traj.get(), cuNFFT_plan<float,2>::NFFT_PREP_NC2C );
+/*
+      if( dcw->get_number_of_elements() == 0 ){
+        std::vector<size_t> dcw_dims; dcw_dims.push_back(samples_per_readout_);
+        hoNDArray<float> host_dcw( dcw_dims );
+        for( int i=0; i<(int)dcw_dims[0]; i++ )
+          host_dcw.get_data_ptr()[i]=abs(i-(int)dcw_dims[0]/2);
+        host_dcw.get_data_ptr()[dcw_dims[0]/2] = 0.25f; // ad hoc value (we do not want a DC component of 0)        
+        dcw = expand(&host_dcw, traj->get_number_of_elements()/samples_per_readout_);
+      }
+*/
+      // Gridder
+      //
+      
+      plan.compute( &samples, &image,  
+                    (dcw->get_number_of_elements()>0) ? dcw.get() : 0x0,
+                    cuNFFT_plan<float,2>::NFFT_BACKWARDS_NC2C );
+
+
+      // Download to host
+      //
+
+      image.to_host( (hoNDArray<float_complext>*)cm2->getObjectPtr() );
+      // Pass on data down the gadget chain
+      //
+
+      if (this->next()->putq(cm1) < 0) {
+        return GADGET_FAIL;
+      }
+    }
+
+    m1->release();
+    return GADGET_OK;
+  }
+  
+  template<class T> GadgetContainerMessage< hoNDArray<T> >*
+  NFFT2DGadget::duplicate_array( GadgetContainerMessage< hoNDArray<T> > *array )
+  {
+    GadgetContainerMessage< hoNDArray<T> > *copy = new GadgetContainerMessage< hoNDArray<T> >();   
+    *(copy->getObjectPtr()) = *(array->getObjectPtr());
+    return copy;
+  }
+  
+  boost::shared_ptr< hoNDArray<float_complext> > 
+  NFFT2DGadget::extract_samples_from_queue ( ACE_Message_Queue<ACE_MT_SYNCH> *queue )                                             
+  {    
+    if(!queue) {
+      GDEBUG("Illegal queue pointer, cannot extract samples\n");
+      throw std::runtime_error("NFFT2DGadget::extract_samples_from_queue: illegal queue pointer");	
+    }
+
+    unsigned int readouts_buffered = queue->message_count();
+    
+    std::vector<size_t> dims;
+    dims.push_back(samples_per_readout_*readouts_buffered);
+    dims.push_back(num_coils_);
+    
+    boost::shared_ptr< hoNDArray<float_complext> > host_samples(new hoNDArray<float_complext>(dims));
+    
+    for (unsigned int p=0; p<readouts_buffered; p++) {
+      
+      ACE_Message_Block* mbq;
+      if (queue->dequeue_head(mbq) < 0) {
+        GDEBUG("Message dequeue failed\n");
+        throw std::runtime_error("NFFT2DGadget::extract_samples_from_queue: dequeuing failed");
+      }
+      
+      GadgetContainerMessage< hoNDArray< std::complex<float> > > *daq = AsContainerMessage<hoNDArray< std::complex<float> > >(mbq);
+	
+      if (!daq) {
+        GDEBUG("Unable to interpret data on message queue\n");
+        throw std::runtime_error("NFFT2DGadget::extract_samples_from_queue: failed to interpret data");	
+      }
+	
+      for (unsigned int c = 0; c < num_coils_; c++) {
+	
+        float_complext *data_ptr = host_samples->get_data_ptr();
+        data_ptr += c*samples_per_readout_*readouts_buffered+p*samples_per_readout_;
+	    
+        std::complex<float> *r_ptr = daq->getObjectPtr()->get_data_ptr();
+        r_ptr += c*daq->getObjectPtr()->get_size(0);
+	  
+        memcpy(data_ptr, r_ptr, samples_per_readout_*sizeof(float_complext));
+      }
+
+      mbq->release();
+    }
+    
+    return host_samples;
+  }
+
+  boost::shared_ptr< hoNDArray<float> > 
+  NFFT2DGadget::extract_trajectory_from_queue ( ACE_Message_Queue<ACE_MT_SYNCH> *queue )
+  {    
+    if(!queue) {
+      GDEBUG("Illegal queue pointer, cannot extract trajectory\n");
+      throw std::runtime_error("NFFT2DGadget::extract_trajectory_from_queue: illegal queue pointer");	
+    }
+
+    unsigned int readouts_buffered = queue->message_count();
+    
+    std::vector<size_t> dims;
+    dims.push_back(num_trajectory_dims_); // 2 for trajectories only, 3 for both trajectories + dcw
+    dims.push_back(samples_per_readout_);
+    dims.push_back(readouts_buffered);
+    
+    boost::shared_ptr< hoNDArray<float> > host_traj(new hoNDArray<float>(&dims));
+    
+    for (unsigned int p=0; p<readouts_buffered; p++) {      
+      ACE_Message_Block* mbq;
+      if (queue->dequeue_head(mbq) < 0) {
+        GDEBUG("Message dequeue failed\n");
+        throw std::runtime_error("NFFT2DGadget::extract_trajectory_from_queue: dequeuing failed");
+      }
+      
+      GadgetContainerMessage< hoNDArray<float> > *daq = AsContainerMessage<hoNDArray<float> >(mbq);
+	
+      if (!daq) {
+        GDEBUG("Unable to interpret data on message queue\n");
+        throw std::runtime_error("NFFT2DGadget::extract_trajectory_from_queue: failed to interpret data");	
+      }
+
+      float *data_ptr = host_traj->get_data_ptr();
+      data_ptr += num_trajectory_dims_*samples_per_readout_*p;
+      
+      float *r_ptr = daq->getObjectPtr()->get_data_ptr();
+      
+      memcpy(data_ptr, r_ptr, num_trajectory_dims_*samples_per_readout_*sizeof(float));
+      
+      mbq->release();
+    }
+    
+    return host_traj;
+  }
+
+  void NFFT2DGadget::extract_trajectory_and_dcw_from_queue
+  ( ACE_Message_Queue<ACE_MT_SYNCH> *queue, cuNDArray<floatd2> *traj, cuNDArray<float> *dcw )
+  {
+    // Extract trajectory and (if present) density compensation weights.
+    // They are stored as a float array of dimensions: {2,3} x #samples_per_readout x #readouts.
+    // We need
+    // - a floatd2 trajectory array 
+    // - a float dcw array 
+    //
+        
+    if( num_trajectory_dims_ == 2 ){
+      boost::shared_ptr< hoNDArray<float> > host_traj = extract_trajectory_from_queue( queue );
+      std::vector<size_t> dims_1d; dims_1d.push_back(host_traj->get_size(1)*host_traj->get_size(2));
+      hoNDArray<floatd2> host_traj2(&dims_1d,(floatd2*)host_traj->get_data_ptr());
+      *traj = cuNDArray<floatd2>(host_traj2);
+
+    }
+    else{
+
+      boost::shared_ptr< hoNDArray<float> > host_traj_dcw = extract_trajectory_from_queue( queue );
+
+      std::vector<size_t> order;
+      order.push_back(1); order.push_back(2); order.push_back(0);
+      
+      boost::shared_ptr< hoNDArray<float> > host_traj_dcw_shifted = permute( host_traj_dcw.get(), &order );
+      
+      std::vector<size_t> dims_1d;
+      dims_1d.push_back(host_traj_dcw_shifted->get_size(0)*host_traj_dcw_shifted->get_size(1));
+      
+      hoNDArray<float> tmp(&dims_1d, host_traj_dcw_shifted->get_data_ptr()+2*dims_1d[0]);
+      *dcw = tmp;
+      
+      std::vector<size_t> dims_2d = dims_1d; dims_2d.push_back(2);
+      order.clear(); order.push_back(1); order.push_back(0);
+      
+      tmp.create(&dims_2d, host_traj_dcw_shifted->get_data_ptr());
+      auto _traj = permute( &tmp, &order );
+      hoNDArray<floatd2> tmp2(&dims_1d,(floatd2*)_traj->get_data_ptr());
+      
+      *traj = cuNDArray<floatd2>(tmp2);
+    }
+
+    std::vector<size_t >dims_2d;
+    dims_2d.push_back(traj->get_number_of_elements());
+    dims_2d.push_back(1); // Number of frames
+
+    traj->reshape(&dims_2d);
+    if( num_trajectory_dims_ == 3 ) dcw->reshape(&dims_2d);
+  }
+
+  GADGET_FACTORY_DECLARE(NFFT2DGadget)
+}
diff --git a/gadgets/hyper/NFFT2DGadget.h b/gadgets/hyper/NFFT2DGadget.h
new file mode 100644
index 0000000..25632e9
--- /dev/null
+++ b/gadgets/hyper/NFFT2DGadget.h
@@ -0,0 +1,56 @@
+#pragma once
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "cuNDArray.h"
+#include "gadgetron_hyper_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETSHYPER NFFT2DGadget :
+    public Gadget3< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> >, hoNDArray<float> >
+  {
+    
+  public:
+    //GADGET_DECLARE(NFFT2DGadget);
+    
+    NFFT2DGadget() {}
+    ~NFFT2DGadget() {}
+    
+  protected:
+    
+    virtual int process_config(ACE_Message_Block* mb);
+
+    virtual int process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader > *m1,        // header
+                        GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2,  // data
+                        GadgetContainerMessage< hoNDArray<float> > *m3 );                // traj/dcw
+
+  protected:
+        
+    template<class T> GadgetContainerMessage< hoNDArray<T> >* 
+      duplicate_array( GadgetContainerMessage< hoNDArray<T> > *array );        
+    
+    boost::shared_ptr< hoNDArray<float_complext> > 
+      extract_samples_from_queue ( ACE_Message_Queue<ACE_MT_SYNCH> *queue );
+    
+    boost::shared_ptr< hoNDArray<float> > 
+      extract_trajectory_from_queue ( ACE_Message_Queue<ACE_MT_SYNCH> *queue );
+    
+    void extract_trajectory_and_dcw_from_queue
+      ( ACE_Message_Queue<ACE_MT_SYNCH> *queue, cuNDArray<floatd2> *traj, cuNDArray<float> *dcw );
+
+  protected:
+    
+    boost::shared_ptr< ACE_Message_Queue<ACE_MT_SYNCH> > frame_readout_queue_;
+    boost::shared_ptr< ACE_Message_Queue<ACE_MT_SYNCH> > frame_traj_queue_;
+    std::vector<size_t> dimensions_;
+    std::vector<float> field_of_view_;
+    size_t repetitions_;
+    size_t samples_per_readout_;
+    size_t num_coils_;
+    size_t num_trajectory_dims_; // 2 for trajectories only, 3 for both trajectories + dcw
+  };
+}
diff --git a/gadgets/hyper/gadgetron_hyper_export.h b/gadgets/hyper/gadgetron_hyper_export.h
new file mode 100644
index 0000000..60a7053
--- /dev/null
+++ b/gadgets/hyper/gadgetron_hyper_export.h
@@ -0,0 +1,14 @@
+#ifndef GADGETRON_HYPER_EXPORT_H_
+#define GADGETRON_HYPER_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_HYPER__)
+#define EXPORTGADGETSHYPER __declspec(dllexport)
+#else
+#define EXPORTGADGETSHYPER __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETSHYPER
+#endif
+
+#endif /* GADGETRON_HYPER_EXPORT_H_ */
diff --git a/gadgets/hyper/gpuCSICoilEstimationGadget.cpp b/gadgets/hyper/gpuCSICoilEstimationGadget.cpp
new file mode 100644
index 0000000..3c97cca
--- /dev/null
+++ b/gadgets/hyper/gpuCSICoilEstimationGadget.cpp
@@ -0,0 +1,271 @@
+/*
+ * gpuCSICoilEstimationGadget.cpp
+ *
+ *  Created on: Nov 12, 2014
+ *      Author: dch
+ */
+
+#include "gpuCSICoilEstimationGadget.h"
+#include <utility>
+#include <ismrmrd/xml.h>
+#include "cuNDArray.h"
+#include "cuNFFT.h"
+#include "b1_map.h"
+#include "vector_td_utilities.h"
+#include "cuNFFTOperator.h"
+#include "cuCgSolver.h"
+#include "cuNDArray_fileio.h"
+#include "trajectory_utils.h"
+#include <boost/make_shared.hpp>
+
+namespace Gadgetron {
+
+gpuCSICoilEstimationGadget::gpuCSICoilEstimationGadget() {
+	set_parameter("kernel_width","5.5");
+	set_parameter("skip_lines","0");
+
+}
+
+gpuCSICoilEstimationGadget::~gpuCSICoilEstimationGadget() {
+	// TODO Auto-generated destructor stub
+}
+
+int gpuCSICoilEstimationGadget::process_config(ACE_Message_Block* mb) {
+	ISMRMRD::IsmrmrdHeader h;
+	ISMRMRD::deserialize(mb->rd_ptr(),h);
+
+	if (h.encoding.size() != 1){
+		GDEBUG("Coil estimation gadget only supports encoding spaces of size 1\n");
+		return GADGET_FAIL;
+	}
+	img_size = {h.encoding[0].reconSpace.matrixSize.x,h.encoding[0].reconSpace.matrixSize.y};
+	kernel_width = get_double_value("kernel_width");
+
+	coils = h.acquisitionSystemInformation->receiverChannels;
+	skip_lines = get_int_value("skip_lines");
+	return GADGET_OK;
+}
+
+int gpuCSICoilEstimationGadget::process(
+		GadgetContainerMessage<IsmrmrdAcquisitionBucket>* m1) {
+	IsmrmrdAcquisitionBucket* bucket = m1->getObjectPtr();
+
+	auto cm1 = new GadgetContainerMessage<cuSenseData>();
+	auto senseData = cm1->getObjectPtr();
+
+	coils = bucket->data_.front().head_->getObjectPtr()->active_channels;
+	//GDEBUG("Active channels %i \n",coils);
+
+
+	{
+
+		hoNDArray<std::complex<float>> * ho_data;
+		hoNDArray<float>* ho_traj;
+
+		std::tie(ho_data,ho_traj) = combine_data(bucket->data_);
+
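+		// The first skip_lines readouts form a frequency calibration block that is split off from the imaging data below.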
+		if (skip_lines > 0){
+			auto cal_dims = *ho_data->get_dimensions();
+			cal_dims.back() = skip_lines;
+			auto data_dims = *ho_data->get_dimensions();
+			data_dims.back() -= skip_lines;
+
+
+			hoNDArray<float_complext> cal_view(cal_dims,(float_complext*) ho_data->get_data_ptr());
+			senseData->freq_calibration = boost::make_shared<cuNDArray<float_complext>>(cal_view);
+			hoNDArray<float_complext> data_view(data_dims,(float_complext*)ho_data->get_data_ptr()+cal_view.get_number_of_elements());
+			senseData->data = boost::make_shared<cuNDArray<float_complext>>(data_view);
+		} else {
+
+			senseData->data = boost::make_shared<cuNDArray<float_complext>>(reinterpret_cast<hoNDArray<float_complext>*>(ho_data));
+		}
+
+
+		if (ho_traj->get_size(0) > 2){ //We have dcw
+			auto traj_dcw = separate_traj_and_dcw(ho_traj);
+			senseData->traj = boost::make_shared<cuNDArray<floatd2>>(*std::get<0>(traj_dcw));
+			senseData->dcw = boost::make_shared<cuNDArray<float>>(*std::get<1>(traj_dcw));
+		} else {
+			std::vector<size_t> tdims = *ho_traj->get_dimensions();
+			std::vector<size_t> tmp_dim(tdims.begin()+1,tdims.end());
+			hoNDArray<floatd2> tmp(tmp_dim,reinterpret_cast<floatd2*>(ho_traj->get_data_ptr()));
+			senseData->traj = boost::make_shared<cuNDArray<floatd2>>(tmp);
+		}
+
+		delete ho_data;
+		delete ho_traj;
+	}
+
+
+	//Remove Initial Spirals
+
+	boost::shared_ptr< cuNDArray<float_complext> >  ref_data;
+	boost::shared_ptr< cuNDArray<floatd2> > ref_traj;
+	boost::shared_ptr<cuNDArray<float> > ref_dcw;
+
+
+	if (bucket->ref_.empty()){
+		ref_data = senseData->data;
+		ref_traj = senseData->traj;
+		ref_dcw = senseData->dcw;
+	} else {
+
+		hoNDArray<std::complex<float>> * ho_data;
+		hoNDArray<float>* ho_traj;
+		std::tie(ho_data,ho_traj) = combine_data(bucket->ref_);
+
+		ref_data = boost::make_shared<cuNDArray<float_complext>>(reinterpret_cast<hoNDArray<float_complext>*>(ho_data));
+		if (ho_traj->get_size(0) > 2){
+			auto traj_dcw = separate_traj_and_dcw(ho_traj);
+			ref_traj =boost::make_shared<cuNDArray<floatd2>>(*std::get<0>(traj_dcw));
+			ref_dcw = boost::make_shared<cuNDArray<float>>(*std::get<1>(traj_dcw));
+		} else {
+			std::vector<size_t> tdims = *ho_traj->get_dimensions();
+			std::vector<size_t> tmp_dim(tdims.begin()+1,tdims.end());
+			hoNDArray<floatd2> tmp(tmp_dim,reinterpret_cast<floatd2*>(ho_traj->get_data_ptr()));
+			ref_traj = boost::make_shared<cuNDArray<floatd2>>(tmp);
+		}
+		delete ho_data;
+		delete ho_traj;
+
+
+	}
+
+	senseData->csm = calculate_CSM(ref_data.get(),ref_traj.get(),ref_dcw.get());
+
+	this->next()->putq(cm1);
+	//All important stuff has been taken from the bucket. Free it.
+	m1->release();
+
+
+
+
+	return GADGET_OK;
+}
+
+std::tuple<hoNDArray<std::complex<float>>*, hoNDArray<float>*> gpuCSICoilEstimationGadget::combine_data(
+		std::vector<IsmrmrdAcquisitionData>& acquisitions) {
+
+
+	std::vector<size_t> data_dims = *acquisitions.front().data_->getObjectPtr()->get_dimensions();
+	std::vector<size_t> traj_dims = *acquisitions.front().traj_->getObjectPtr()->get_dimensions();
+	std::vector<size_t> base_dim = data_dims;
+	data_dims.push_back(acquisitions.size());
+	if (acquisitions.size() == 1 ||  acquisitions[2].traj_) //Trajectory present on all acquisitions
+		traj_dims.push_back(acquisitions.size());
+
+
+	auto result = new hoNDArray<std::complex<float> >(data_dims);
+	auto traj = new hoNDArray<float>(traj_dims);
+
+	std::complex<float>* ptr = result->get_data_ptr();
+	float* traj_ptr = traj->get_data_ptr();
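+	// Concatenate the acquisitions along a new last dimension; every acquisition must match the dimensions of the first one.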
+	for (const IsmrmrdAcquisitionData & data : acquisitions){
+		hoNDArray<std::complex<float>>* array = data.data_->getObjectPtr();
+		if (data.traj_) { //Only copy if trajectory is present
+			hoNDArray<float>* array_traj = data.traj_->getObjectPtr();
+			memcpy(traj_ptr,array_traj->get_data_ptr(),array_traj->get_number_of_bytes());
+			traj_ptr += array_traj->get_number_of_elements();
+		}
+		if (!array->dimensions_equal(&base_dim)){
+			return std::tuple<hoNDArray<std::complex<float>>*, hoNDArray<float>*>(nullptr,nullptr);
+		}
+		memcpy(ptr,array->get_data_ptr(),array->get_number_of_bytes());
+		ptr += array->get_number_of_elements();
+
+	}
+
+	return std::make_tuple(result,traj);
+
+}
+
+boost::shared_ptr<cuNDArray<float_complext> > gpuCSICoilEstimationGadget::calculate_CSM(
+		cuNDArray<float_complext>* data, cuNDArray<floatd2>* traj, cuNDArray<float>* dcw ) {
+
+
+	if (dcw) { //We have density compensation, so we can get away with gridding
+
+		cuNFFT_plan<float,2> plan(from_std_vector<size_t,2>(img_size),from_std_vector<size_t,2>(img_size)*size_t(2),kernel_width);
+		std::vector<size_t> csm_dims = img_size;
+		csm_dims.push_back(coils);
+		cuNDArray<float_complext> tmp(csm_dims);
+		GDEBUG("Coils %i \n\n",tmp.get_size(2));
+		std::vector<size_t> flat_dims = {traj->get_number_of_elements()};
+		cuNDArray<floatd2> flat_traj(flat_dims,traj->get_data_ptr());
+
+		std::vector<size_t> spiral_dims{data->get_size(0),data->get_size(1)}; //Trajectories, coils
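+		// The *0 offsets select the first spiral readout; a different multiplier would pick another readout.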
+		cuNDArray<complext<float>> second_spiral(spiral_dims,data->get_data_ptr()+spiral_dims[0]*spiral_dims[1]*0);
+		std::vector<size_t> spiral_traj_dims{spiral_dims[0]};
+		cuNDArray<floatd2> spiral_traj(spiral_traj_dims,traj->get_data_ptr()+spiral_dims[0]*0);
+		cuNDArray<float> spiral_dcw(spiral_traj_dims,dcw->get_data_ptr()+spiral_dims[0]*0);
+
+		GDEBUG("Preprocessing\n\n");
+		plan.preprocess(&spiral_traj,cuNFFT_plan<float,2>::NFFT_PREP_NC2C);
+		GDEBUG("Computing\n\n");
+		plan.compute(&second_spiral,&tmp,&spiral_dcw,cuNFFT_plan<float,2>::NFFT_BACKWARDS_NC2C);
+		auto tmp_abs = abs(&tmp);
+		write_nd_array(tmp_abs.get(),"images.real");
+
+		return estimate_b1_map<float,2>(&tmp);
+
+	} else { //No density compensation, we have to do iterative reconstruction.
+		std::vector<size_t> csm_dims = img_size;
+		csm_dims.push_back(coils);
+
+		auto E = boost::make_shared<cuNFFTOperator<float,2>>();
+
+		E->setup(from_std_vector<size_t,2>(img_size),from_std_vector<size_t,2>(img_size)*size_t(2),kernel_width);
+		std::vector<size_t> flat_dims = {traj->get_number_of_elements()};
+		cuNDArray<floatd2> flat_traj(flat_dims,traj->get_data_ptr());
+
+		E->set_domain_dimensions(&csm_dims);
+		cuCgSolver<float_complext> solver;
+		solver.set_max_iterations(200);
+		solver.set_encoding_operator(E);
+		std::vector<size_t> spiral_dims{data->get_size(0),data->get_size(1)}; //Trajectories, coils
+		cuNDArray<complext<float>> second_spiral(spiral_dims,data->get_data_ptr()+spiral_dims[0]*spiral_dims[1]*0);
+		E->set_codomain_dimensions(&spiral_dims);
+		std::vector<size_t> spiral_traj_dims{spiral_dims[0]};
+		cuNDArray<floatd2> spiral_traj(spiral_traj_dims,traj->get_data_ptr()+spiral_dims[0]*0);
+		E->preprocess(&spiral_traj);
+		auto tmp = solver.solve(&second_spiral);
+		auto tmp_abs = abs(tmp.get());
+
+		write_nd_array(tmp_abs.get(),"images2.real");
+
+		auto res = estimate_b1_map<float,2>(tmp.get());
+		//fill(res.get(),float_complext(1,0));
+		//auto res= boost::make_shared<cuNDArray<float_complext>>(csm_dims);
+		//fill(res.get(),float_complext(1,0));
+		return res;
+
+	}
+
+}
+
+std::tuple<boost::shared_ptr<hoNDArray<floatd2 > >, boost::shared_ptr<hoNDArray<float >>> gpuCSICoilEstimationGadget::separate_traj_and_dcw(
+		hoNDArray<float >* traj_dcw) {
+	std::vector<size_t> dims = *traj_dcw->get_dimensions();
+	std::vector<size_t> reduced_dims(dims.begin()+1,dims.end()); //Copy vector, but leave out first dim
+	auto  dcw = boost::make_shared<hoNDArray<float>>(reduced_dims);
+
+	auto traj = boost::make_shared<hoNDArray<floatd2>>(reduced_dims);
+
+	auto dcw_ptr = dcw->get_data_ptr();
+	auto traj_ptr = traj->get_data_ptr();
+	auto ptr = traj_dcw->get_data_ptr();
+
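+	// Samples are stored as interleaved (kx, ky, dcw) triplets; split them into a floatd2 trajectory and a scalar dcw array.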
+	for (size_t i = 0; i < traj_dcw->get_number_of_elements()/3; i++){
+		traj_ptr[i][0] = ptr[i*3];
+		traj_ptr[i][1] = ptr[i*3+1];
+		dcw_ptr[i] = ptr[i*3+2];
+	}
+
+	return std::make_tuple(traj,dcw);
+
+
+}
+
+
+
+GADGET_FACTORY_DECLARE(gpuCSICoilEstimationGadget)
+
+} /* namespace Gadgetron */
diff --git a/gadgets/hyper/gpuCSICoilEstimationGadget.h b/gadgets/hyper/gpuCSICoilEstimationGadget.h
new file mode 100644
index 0000000..39accbb
--- /dev/null
+++ b/gadgets/hyper/gpuCSICoilEstimationGadget.h
@@ -0,0 +1,56 @@
+/*
+ * gpuCSICoilEstimationGadget.h
+ *
+ *  Created on: Nov 12, 2014
+ *      Author: dch
+ */
+
+#ifndef gpuCSICoilESTIMATIONGADGET_H_
+#define gpuCSICoilESTIMATIONGADGET_H_
+
+#include "Gadget.h"
+#include <tuple>
+#include "mri_core_data.h"
+#include "cuNDArray.h"
+
+namespace Gadgetron {
+
+class gpuCSICoilEstimationGadget: public Gadgetron::Gadget1<IsmrmrdAcquisitionBucket> {
+public:
+	gpuCSICoilEstimationGadget();
+	virtual ~gpuCSICoilEstimationGadget();
+	virtual int process_config(ACE_Message_Block* mb);
+
+    virtual int process(GadgetContainerMessage<IsmrmrdAcquisitionBucket>* m1);
+
+protected:
+
+    static  std::tuple<hoNDArray<std::complex<float>>*, hoNDArray<float>*> combine_data(std::vector<IsmrmrdAcquisitionData>& acquisitions);
+
+    boost::shared_ptr<cuNDArray<float_complext>> calculate_CSM(cuNDArray<float_complext>* data,cuNDArray<floatd2>* traj, cuNDArray<float>* dcw );
+/**
+ * Separates trajectory and dcw
+ * @param array containing trajectory and dcw, so that the first dimension has size 3
+ * @return tuple containing trajectory and dcw
+ */
+    static std::tuple<boost::shared_ptr<hoNDArray<floatd2 > >, boost::shared_ptr<hoNDArray<float >>> separate_traj_and_dcw(hoNDArray<float>*);
+    std::vector<size_t> img_size;
+    size_t coils;
+    float kernel_width;
+
+    unsigned int skip_lines;
+
+};
+
+
+struct cuSenseData {
+	boost::shared_ptr<cuNDArray<float_complext>> data;
+	boost::shared_ptr<cuNDArray<floatd2>> traj;
+	boost::shared_ptr<cuNDArray<float_complext>> csm;
+	boost::shared_ptr<cuNDArray<float>> dcw;
+	boost::shared_ptr<cuNDArray<float_complext>> freq_calibration;
+
+};
+
+} /* namespace Gadgetron */
+#endif /* gpuCSICoilESTIMATIONGADGET_H_ */
diff --git a/gadgets/interventional_mri/CMakeLists.txt b/gadgets/interventional_mri/CMakeLists.txt
new file mode 100644
index 0000000..bfc090c
--- /dev/null
+++ b/gadgets/interventional_mri/CMakeLists.txt
@@ -0,0 +1,55 @@
+IF (WIN32)
+    ADD_DEFINITIONS(-D__BUILD_GADGETRON_INTERVENTIONAL_MRI__)
+ENDIF (WIN32)
+
+find_package(Ismrmrd REQUIRED)
+
+include_directories(
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+    ${CMAKE_SOURCE_DIR}/toolboxes/core
+    ${CMAKE_SOURCE_DIR}/toolboxes/mri_core
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/algorithm
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/fft/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/fft/gpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus
+    ${CMAKE_SOURCE_DIR}/gadgets/mri_core
+    ${HDF5_INCLUDE_DIR}
+    ${HDF5_INCLUDE_DIR}/cpp
+    ${ARMADILLO_INCLUDE_DIRS}
+    ${MKL_INCLUDE_DIR}
+    ${ISMRMRD_INCLUDE_DIR}
+)
+
+
+add_library(gadgetron_interventional_mri SHARED 
+    gadgetron_interventional_mri_export.h 
+    DeviceChannelSplitterGadget.h
+    DeviceChannelSplitterGadget.cpp
+)
+
+set_target_properties(gadgetron_interventional_mri PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})                                                                                                                                                                                                      
+
+target_link_libraries(gadgetron_interventional_mri
+    gadgetron_gadgetbase
+    gadgetron_toolbox_log
+    gadgetron_toolbox_cpucore
+    gadgetron_toolbox_gadgettools
+    ${ISMRMRD_LIBRARIES} 
+    ${ACE_LIBRARIES}
+)
+
+install(FILES 
+    gadgetron_interventional_mri_export.h
+    DeviceChannelSplitterGadget.h
+    DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
+install(FILES grappa_device.xml DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH} COMPONENT main)
+
+install(TARGETS gadgetron_interventional_mri DESTINATION lib COMPONENT main)
diff --git a/gadgets/interventional_mri/DeviceChannelSplitterGadget.cpp b/gadgets/interventional_mri/DeviceChannelSplitterGadget.cpp
new file mode 100644
index 0000000..2643339
--- /dev/null
+++ b/gadgets/interventional_mri/DeviceChannelSplitterGadget.cpp
@@ -0,0 +1,92 @@
+#include "DeviceChannelSplitterGadget.h"
+#include "ismrmrd/meta.h"
+
+ //This is needed for things such as data role, which should NOT be defined in gtPlus
+#include "mri_core_def.h"
+
+namespace Gadgetron{
+
+template <typename T>
+int DeviceChannelSplitterGadget<T>
+::process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+	  GadgetContainerMessage< hoNDArray< T > >* m2)
+{
+  
+  //Some consistency checking
+  unsigned int header_channels = m1->getObjectPtr()->channels;
+  unsigned int array_channels = m2->getObjectPtr()->get_size(m2->getObjectPtr()->get_number_of_dimensions()-1);
+  unsigned int dim_x = m2->getObjectPtr()->get_size(0);
+  unsigned int dim_y = m2->getObjectPtr()->get_size(1);
+  unsigned int dim_z = m2->getObjectPtr()->get_size(2);
+  size_t image_elements = dim_x*dim_y*dim_z;
+
+  if (header_channels != array_channels) {
+    GDEBUG("Inconsistent number of header channels (%d) and array channels (%d)\n", header_channels, array_channels);
+    m1->release();
+    return GADGET_FAIL;
+  }
+  
+
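+  // Emit one single-channel image per input channel: channel 0 is tagged as the imaging channel, the remaining channels as device channels.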
+  for (int i = 0; i < array_channels; i++) {
+    
+
+    GadgetContainerMessage<ISMRMRD::ImageHeader>* im1 = new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+    *(im1->getObjectPtr()) = *(m1->getObjectPtr());
+    im1->getObjectPtr()->channels = 1;
+    
+    /*
+    GDEBUG("Image with matrix (cha=%d): %d, %d, %d and fov %f, %f, %f\n", 
+		  i,
+		  im1->getObjectPtr()->matrix_size[0], 
+		  im1->getObjectPtr()->matrix_size[1], 
+		  im1->getObjectPtr()->matrix_size[2],
+		  im1->getObjectPtr()->field_of_view[0],
+		  im1->getObjectPtr()->field_of_view[1],
+		  im1->getObjectPtr()->field_of_view[2]);
+
+    */
+
+    GadgetContainerMessage< hoNDArray< T > >* im2 = new GadgetContainerMessage< hoNDArray< T > >();
+    im2->getObjectPtr()->create(dim_x,dim_y,dim_z,1);
+    memcpy(im2->getObjectPtr()->get_data_ptr(), m2->getObjectPtr()->get_data_ptr() + i*image_elements, sizeof(T)*image_elements);
+    
+    im1->cont(im2);
+    
+    Gadgetron::GadgetContainerMessage<ISMRMRD::MetaContainer>* im3 = new Gadgetron::GadgetContainerMessage<ISMRMRD::MetaContainer>();
+    if (i == 0) {
+      im3->getObjectPtr()->set(GADGETRON_DATA_ROLE, GADGETRON_IMAGE_IRT_IMAGE);
+    } else {
+      im3->getObjectPtr()->set(GADGETRON_DATA_ROLE, GADGETRON_IMAGE_IRT_DEVICE);
+      im3->getObjectPtr()->set(GADGETRON_IMAGE_CUR_DEVICE_CHA, (long)i);
+
+    }
+    im3->getObjectPtr()->append(GADGETRON_DATA_ROLE, GADGETRON_IMAGE_INTENSITY_UNCHANGED);
+
+    if (array_channels > 1) {
+      im3->getObjectPtr()->set(GADGETRON_IMAGE_NUM_DEVICE_CHA, (long)(array_channels-1));
+    } else {
+      im3->getObjectPtr()->set(GADGETRON_IMAGE_NUM_DEVICE_CHA, (long)(-1));
+    }
+
+    im2->cont(im3);
+
+    if (this->next()->putq(im1) == -1) {
+      m1->release();
+      GERROR("DeviceChannelSplitterGadget::process, failed to pass data on to the next gadget\n");
+      return -1;
+    }
+  }
+
+  //We are done with the original data
+  m1->release();
+
+
+  return GADGET_OK;
+}
+
+//Declare factories for the various template instances
+GADGET_FACTORY_DECLARE(DeviceChannelSplitterGadgetFLOAT);
+GADGET_FACTORY_DECLARE(DeviceChannelSplitterGadgetUSHORT);
+GADGET_FACTORY_DECLARE(DeviceChannelSplitterGadgetCPLX);
+
+}
diff --git a/gadgets/interventional_mri/DeviceChannelSplitterGadget.h b/gadgets/interventional_mri/DeviceChannelSplitterGadget.h
new file mode 100644
index 0000000..6a1a2c9
--- /dev/null
+++ b/gadgets/interventional_mri/DeviceChannelSplitterGadget.h
@@ -0,0 +1,45 @@
+#ifndef DEVICECHANNELSPLITTERGADGET_H
+#define DEVICECHANNELSPLITTERGADGET_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "GadgetMRIHeaders.h"
+#include "GadgetStreamController.h"
+#include "gadgetron_interventional_mri_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+  template <typename T> class EXPORTGADGETSINTERVENTIONAL_MRI DeviceChannelSplitterGadget : 
+  public Gadget2<ISMRMRD::ImageHeader,hoNDArray< T > >
+  {
+  protected:
+    virtual int process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1, 
+			GadgetContainerMessage< hoNDArray< T > >* m2);
+  };
+  
+  class EXPORTGADGETSINTERVENTIONAL_MRI DeviceChannelSplitterGadgetUSHORT :
+  public DeviceChannelSplitterGadget<ACE_UINT16>
+  {
+  public:
+    GADGET_DECLARE(DeviceChannelSplitterGadgetUSHORT);
+  };
+
+  class EXPORTGADGETSINTERVENTIONAL_MRI DeviceChannelSplitterGadgetFLOAT :
+  public DeviceChannelSplitterGadget<float>
+  {
+  public:
+    GADGET_DECLARE(DeviceChannelSplitterGadgetFLOAT);
+  };
+
+  class EXPORTGADGETSINTERVENTIONAL_MRI DeviceChannelSplitterGadgetCPLX :
+  public DeviceChannelSplitterGadget< std::complex<float> >
+  {
+  public:
+    GADGET_DECLARE(DeviceChannelSplitterGadgetCPLX);
+  };
+}
+
+#endif //DEVICECHANNELSPLITTERGADGET_H
diff --git a/gadgets/interventional_mri/gadgetron_interventional_mri_export.h b/gadgets/interventional_mri/gadgetron_interventional_mri_export.h
new file mode 100644
index 0000000..43f3f30
--- /dev/null
+++ b/gadgets/interventional_mri/gadgetron_interventional_mri_export.h
@@ -0,0 +1,14 @@
+#ifndef GADGETRON_INTERVENTIONAL_MRI_EXPORT_H_
+#define GADGETRON_INTERVENTIONAL_MRI_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_INTERVENTIONAL_MRI__)
+#define EXPORTGADGETSINTERVENTIONAL_MRI __declspec(dllexport)
+#else
+#define EXPORTGADGETSINTERVENTIONAL_MRI __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETSINTERVENTIONAL_MRI
+#endif
+
+#endif /* GADGETRON_INTERVENTIONAL_MRI_EXPORT_H_ */
diff --git a/gadgets/interventional_mri/grappa_device.xml b/gadgets/interventional_mri/grappa_device.xml
new file mode 100644
index 0000000..facb3c7
--- /dev/null
+++ b/gadgets/interventional_mri/grappa_device.xml
@@ -0,0 +1,104 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+ 
+     <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+      <property><name>scale_only_channels_by_name</name><value>uncombined_channels_by_name@PCA</value></property>
+    </gadget>    
+
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+      <property><name>uncombined_channels_by_name</name><value>Loop_7:L7</value></property>
+
+      <!-- present_uncombined_channels will get updated by the gadget based on the attached coils -->
+      <property><name>present_uncombined_channels</name><value>0</value></property>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+
+    <gadget>
+      <name>RemoveROOversampling</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>Grappa</name>
+      <dll>gadgetron_grappa</dll>
+      <classname>GrappaGadget</classname>
+      <!-- After the PCA gadget, the device channel will be the first channel -->
+      <!--
+      <property><name>uncombined_channels</name><value>0</value></property>
+      -->
+      <property><name>device_channels</name><value>present_uncombined_channels@PCA</value></property>
+      <property><name>use_gpu</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>GrappaUnmixing</name>
+      <dll>gadgetron_grappa</dll>
+      <classname>GrappaUnmixingGadget</classname>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+    
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>DeviceChannelSplitter</name>
+      <dll>gadgetron_interventional_mri</dll>
+      <classname>DeviceChannelSplitterGadgetUSHORT</classname>
+    </gadget>
+
+     <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+</gadgetronStreamConfiguration>
diff --git a/gadgets/matlab/BaseBufferGadget.m b/gadgets/matlab/BaseBufferGadget.m
new file mode 100644
index 0000000..4409e12
--- /dev/null
+++ b/gadgets/matlab/BaseBufferGadget.m
@@ -0,0 +1,69 @@
+classdef BaseBufferGadget < handle
+    
+  properties
+
+        imageQ = [];
+        bufferQ = struct([]);
+        xml = [];
+
+    end
+
+    methods
+
+        % Constructor
+        function g = BaseBufferGadget()
+            
+        end
+
+        % Init function
+        function init(g, xmlstr)
+            % Convert the xml config string to an IsmrmrdHeader object
+            g.xml = ismrmrd.xml.deserialize(xmlstr);
+            g.emptyQ();
+        end
+        
+           % Process function
+        function [imageQ,bufferQ] = run_process(g, recon_data)
+            %% Convert headers
+            for n = 1:numel(recon_data)
+                recon_data(n).data.headers = ismrmrd.AcquisitionHeader(recon_data(n).data.headers);
+                if isfield(recon_data(n),'reference')
+                    if isstruct(recon_data(n).reference)
+                        recon_data(n).reference.headers = ismrmrd.AcquisitionHeader(recon_data(n).reference.headers);
+                    end
+                end
+            end
+            
+            %% Process data
+            g.process(recon_data);
+            
+            imageQ = g.imageQ;
+            bufferQ = g.bufferQ;
+        end
+        
+          % Q related functions
+        function emptyQ(g)
+           g.bufferQ = struct([]);
+           g.imageQ = [];
+        end
+
+        
+         function putImageQ(g, header, image)
+             disp('Putting Image on Q')
+             idx = length(g.imageQ) + 1;
+             header.check(); 
+             g.imageQ(idx).bytes = header.toBytes();
+             g.imageQ(idx).image = single(image);             
+         end
+        
+         function putBufferQ(g,data,reference)
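+             % Headers are serialized to raw bytes here so they can be copied straight back into ISMRMRD structs on the C++ side.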
+             idx = length(g.bufferQ)+1;
+             g.bufferQ(idx).data = data;
+             g.bufferQ(idx).data.headers = g.bufferQ(idx).data.headers.toBytes();
+             if (exist('reference','var'))
+                 g.bufferQ(idx).reference = reference;
+                 g.bufferQ(idx).reference.headers = reference.headers.toBytes();
+             end
+        end
+    end
+end
\ No newline at end of file
diff --git a/gadgets/matlab/BaseGadget.m b/gadgets/matlab/BaseGadget.m
new file mode 100644
index 0000000..9f09f48
--- /dev/null
+++ b/gadgets/matlab/BaseGadget.m
@@ -0,0 +1,74 @@
+classdef BaseGadget < handle
+
+    properties
+
+        Q = [];
+        xml = [];
+
+    end
+
+    methods
+
+        % Constructor
+        function g = BaseGadget()
+        end
+
+        % Init function
+        function init(g, xmlstr)
+            % Convert the xml config string to an IsmrmrdHeader object
+            g.xml = ismrmrd.xml.deserialize(xmlstr);
+            g.emptyQ();
+        end
+
+        % Process function
+        function [Q] = run_process(g, htype, hdr_bytes, data)
+            if (htype == 1)
+                head = ismrmrd.AcquisitionHeader(hdr_bytes);
+            elseif (htype == 2)
+                head = ismrmrd.ImageHeader(hdr_bytes);
+            else
+                error('Unknown header type.');
+            end
+            g.process(head, data);
+            Q = g.Q;
+        end
+
+        % Config function
+        function config(g)
+            fprintf('%s\n',char(serialize(g.xml)));
+        end
+        
+        % Process function
+        function process(g, head, data)
+            g.putQ(head,data);
+        end
+
+        % Q related functions
+        function emptyQ(g)
+           g.Q = [];
+        end
+
+        function putQ(g, head, data)
+            % find the end of the queue
+	        idx = length(g.Q) + 1;
+            % put the type of the header and the bytes for the header on the queue
+            if isa(head, 'ismrmrd.AcquisitionHeader')
+                g.Q(idx).type = int32(1);
+                head.check(); % fix the types
+                g.Q(idx).bytes = head.toBytes();
+            elseif isa(head, 'ismrmrd.ImageHeader')
+                g.Q(idx).type = int32(2);
+                head.check(); % fix the types
+                g.Q(idx).bytes = head.toBytes();
+            else
+                % TODO: do we throw an error here?
+                g.Q(idx).type = int32(0);
+		disp('Illegal header type found')
+            end
+            % put the data on the queue
+            % make sure the data is single precision
+            g.Q(idx).data = single(data);
+        end
+
+    end
+end
diff --git a/gadgets/matlab/CMakeLists.txt b/gadgets/matlab/CMakeLists.txt
new file mode 100644
index 0000000..225ae9b
--- /dev/null
+++ b/gadgets/matlab/CMakeLists.txt
@@ -0,0 +1,57 @@
+find_package(Ismrmrd REQUIRED)
+
+include_directories(${MATLAB_INCLUDE_DIR} ${CMAKE_SOURCE_DIR}/toolboxes/mri_core ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math)
+
+if (UNIX)
+    if (APPLE)
+        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+        SET(MATLAB_SUFFIX ".mexmaci64")
+    else(APPLE)
+        SET(MATLAB_SUFFIX ".mexglnxa64")
+    endif(APPLE)
+else(UNIX)
+    SET(MATLAB_SUFFIX ".dll")
+endif(UNIX)
+
+if (UNIX)
+    add_library(gadgetron_matlab SHARED gadgetron_matlab_export.h MatlabGadget.h MatlabGadget.cpp MatlabBufferGadget.h MatlabBufferGadget.cpp MatlabUtils.h MatlabUtils.cpp)
+else(UNIX)
+    add_library(gadgetron_matlab SHARED gadgetron_matlab_export.h MatlabGadget.h MatlabGadget.cpp)
+endif(UNIX)
+set_target_properties(gadgetron_matlab PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(
+    gadgetron_matlab
+    gadgetron_toolbox_log
+    gadgetron_gadgetbase
+    gadgetron_toolbox_cpucore
+    gadgetron_toolbox_cpucore_math
+    ${MATLAB_LIBRARIES}
+    ${ISMRMRD_LIBRARIES}
+    optimized ${ACE_LIBRARIES}
+    debug ${ACE_DEBUG_LIBRARY}
+)
+
+if (UNIX)
+    set(JAVA_MATLAB_SERVER_SRC "MatlabCommandServer.java")
+    string(REPLACE "java" "class" JAVA_MATLAB_SERVER_CLASS ${JAVA_MATLAB_SERVER_SRC})
+    set(JAVA_MATLAB_SERVER_CLASS "${CMAKE_CURRENT_BINARY_DIR}/${JAVA_MATLAB_SERVER_CLASS}")
+
+    string(REPLACE ";" ":" MATLAB_UNIX_JARS "${MATLAB_JARS}")
+
+    add_custom_command(
+        OUTPUT ${JAVA_MATLAB_SERVER_CLASS}
+        DEPENDS ${JAVA_MATLAB_SERVER_SRC}
+        COMMAND javac -source 1.5 -target 1.5 -d ${CMAKE_CURRENT_BINARY_DIR} -cp "${MATLAB_UNIX_JARS}" ${CMAKE_CURRENT_SOURCE_DIR}/${JAVA_MATLAB_SERVER_SRC}
+        COMMENT "Generating Matlab Command Server class" VERBATIM
+    )
+    add_custom_target(matlab_command_server ALL DEPENDS ${JAVA_MATLAB_SERVER_CLASS})
+    install(FILES ${JAVA_MATLAB_SERVER_CLASS} DESTINATION ${GADGETRON_INSTALL_MATLAB_PATH} COMPONENT main)
+else(UNIX)
+        MESSAGE( "Don't know how to build the Matlab Command Server class on Windows" )
+endif(UNIX)
+
+install(TARGETS gadgetron_matlab DESTINATION lib COMPONENT main)
+install(FILES MatlabGadget.h gadgetron_matlab_export.h DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+install(FILES BaseGadget.m BaseBufferGadget.m bufferRecon.m scale.m accumulate_and_recon.m mask_image.m recon.m trajectoryScale.m DESTINATION ${GADGETRON_INSTALL_MATLAB_PATH} COMPONENT main)
+install(FILES matlab.xml matlabbuffer.xml matlabnoncartesian.xml DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH} COMPONENT main)
diff --git a/gadgets/matlab/MatlabBufferGadget.cpp b/gadgets/matlab/MatlabBufferGadget.cpp
new file mode 100644
index 0000000..0abb1f8
--- /dev/null
+++ b/gadgets/matlab/MatlabBufferGadget.cpp
@@ -0,0 +1,114 @@
+#include "MatlabBufferGadget.h"
+#include "MatlabUtils.h"
+
+namespace Gadgetron{
+
+int MatlabBufferGadget::process(GadgetContainerMessage<IsmrmrdReconData>* m1)
+{
+	// Initialize a string for matlab commands
+	std::string cmd;
+
+	auto recon_data = m1->getObjectPtr();
+	mwSize nencoding_spaces = recon_data->rbit_.size();
+
+	const char* fieldnames[2] = {"data","reference"};
+	auto reconArray = mxCreateStructArray(1,&nencoding_spaces,2,fieldnames);
+//auto reconArray = mxCreateCellArray(1,&nencoding_spaces);
+
+	for (int i = 0; i <  recon_data->rbit_.size(); i++){
+		auto mxrecon = BufferToMatlabStruct(&recon_data->rbit_[i].data_);
+		mxSetField(reconArray,i,"data",mxrecon);
+		if (recon_data->rbit_[i].ref_.data_.get_number_of_elements()){
+			auto mxref = BufferToMatlabStruct(&recon_data->rbit_[i].ref_.data_);
+			mxSetField(reconArray,i,"reference",mxref);
+		}
+
+	}
+	engPutVariable(engine_, "recon_data", reconArray);
+
+	cmd = "[imageQ,bufferQ] = matgadget.run_process(recon_data); matgadget.emptyQ(); whos()";
+	send_matlab_command(cmd);
+
+	// Get the size of the gadget's queue
+
+	mxArray *imageQ = engGetVariable(engine_, "imageQ");
+	if (imageQ == NULL) {
+		GERROR("Failed to get the imageQ from matgadget\n");
+		return GADGET_FAIL;
+	}
+
+	size_t qlen = mxGetNumberOfElements(imageQ);
+	GDEBUG("Image Queue size: %d \n", qlen);
+
+	const mwSize* dims = mxGetDimensions(imageQ);
+	mwSize ndims = mxGetNumberOfDimensions(imageQ);
+
+
+
+	//Read all Image bytes
+	for (mwIndex idx = 0; idx < qlen; idx++) {
+		mxArray *res_hdr  = mxGetField(imageQ, idx, "bytes");
+		mxArray *res_data = mxGetField(imageQ, idx, "image");
+
+		GadgetContainerMessage<ISMRMRD::ImageHeader>* m3 =
+				new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+		ISMRMRD::ImageHeader *hdr_new = m3->getObjectPtr();
+		memcpy(hdr_new, mxGetData(res_hdr), sizeof(ISMRMRD::ImageHeader));
+
+		auto image= MatlabToHoNDArray<std::complex<float>>(res_data);
+		auto m4 = new GadgetContainerMessage< hoNDArray< std::complex<float> > >(image);
+		auto dims = *image->get_dimensions();
+
+		delete image;
+		m3->cont(m4);
+		if (this->next()->putq(m3) < 0) {
+			GDEBUG("Failed to put Image message on queue\n");
+			return GADGET_FAIL;
+		}
+
+	}
+	//Match engGetVariable with mxDestroy___s
+
+
+	mxArray* bufferQ = engGetVariable(engine_,"bufferQ");
+
+	qlen = mxGetNumberOfElements(bufferQ);
+	GDEBUG("Buffer Queue size: %d \n", qlen);
+
+	IsmrmrdReconData output_data;
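+	// Each bufferQ entry becomes one IsmrmrdReconBit: the "data" field is required, "reference" is optional.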
+	for (mwIndex idx = 0; idx <qlen; idx++){
+
+		IsmrmrdReconBit bit;
+		bit.data_ = MatlabStructToBuffer(mxGetField(bufferQ,idx,"data"));
+
+		auto ref = mxGetField(bufferQ,idx,"reference");
+		if (ref){
+			GDEBUG("Adding reference");
+			bit.ref_ = MatlabStructToBuffer(ref);
+			GDEBUG("Number of elements %i \n",bit.ref_.data_.get_number_of_elements());
+		}
+		output_data.rbit_.push_back(bit);
+	}
+
+
+
+
+	if (!output_data.rbit_.empty()){
+		auto m3 = new GadgetContainerMessage<IsmrmrdReconData>(output_data.rbit_);
+		if (this->next()->putq(m3) < 0){
+			GDEBUG("Failed to put Buffer message on queue\n");
+			return GADGET_FAIL;
+		}
+	}
+	mxDestroyArray(bufferQ);
+	mxDestroyArray(imageQ);
+	//mxDestroyArray(reconArray); //We're not supposed to delete this?
+
+	// We are finished with the incoming messages m1 and m2
+	m1->release();
+
+	return GADGET_OK;
+}
+}
+
+
diff --git a/gadgets/matlab/MatlabBufferGadget.h b/gadgets/matlab/MatlabBufferGadget.h
new file mode 100644
index 0000000..737af91
--- /dev/null
+++ b/gadgets/matlab/MatlabBufferGadget.h
@@ -0,0 +1,185 @@
+#pragma once
+
+#include "gadgetron_matlab_export.h"
+#include "Gadget.h"
+#include "gadgetron_paths.h"
+#include "hoNDArray.h"
+#include "ismrmrd/ismrmrd.h"
+#include "engine.h"     // Matlab Engine header
+
+#include "ace/Synch.h"  // For the MatlabCommandServer
+#include "ace/SOCK_Connector.h"
+#include "ace/INET_Addr.h"
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <complex>
+#include <boost/lexical_cast.hpp>
+#include "mri_core_data.h"
+// TODO:
+//Make the port option work so that we can have multiple matlabs running, each with its own command server.
+//Create a debug option to use evalstring and get back the matlab output on every function call.
+//Finish the image stuff
+//Is there a better way to kill the command server?
+//Test on windows
+
+
+namespace Gadgetron{
+
+class MatlabBufferGadget:
+    public Gadget1<IsmrmrdReconData >
+{
+public:
+    MatlabBufferGadget(): Gadget1<IsmrmrdReconData >()
+    {
+        // Open the Matlab Engine on the current host
+        GDEBUG("Starting MATLAB engine\n");
+        if (!(engine_ = engOpen("matlab -nosplash -nodesktop"))) {
+            // TODO: error checking!
+            GDEBUG("Can't start MATLAB engine\n");
+        } else {
+            // Add ISMRMRD Java bindings jar to Matlab's path
+            // TODO: this should be in user's Matlab path NOT HERE
+
+            // Prepare a buffer for collecting Matlab's output
+            char matlab_buffer_[2049] = "\0";
+            engOutputBuffer(engine_, matlab_buffer_, 2048);
+
+	    // Add the necessary paths to the matlab environment
+	    // Java matlab command server
+	    std::string gadgetron_matlab_path = get_gadgetron_home() + "/share/gadgetron/matlab";
+	    std::string java_add_path_cmd = std::string("javaaddpath('") + gadgetron_matlab_path + std::string("');");
+	    std::string add_path_cmd = std::string("addpath('") + gadgetron_matlab_path + std::string("');");
+            // Gadgetron matlab scripts
+	    engEvalString(engine_, java_add_path_cmd.c_str());
+	    engEvalString(engine_, add_path_cmd.c_str());
+            // ISMRMRD matlab library
+            engEvalString(engine_, "addpath(fullfile(getenv('ISMRMRD_HOME'), '/share/ismrmrd/matlab'));");
+
+	    GDEBUG("%s", matlab_buffer_);
+        }
+    }
+
+    ~MatlabBufferGadget()
+    {
+        char matlab_buffer_[2049] = "\0";
+        engOutputBuffer(engine_, matlab_buffer_, 2048);
+	// Stop the Java Command server
+        // send the stop signal to the command server and
+        //  wait a bit for it to shut down cleanly.
+        GDEBUG("Closing down the Matlab Command Server\n");
+	engEvalString(engine_, "M.notifyEnd(); pause(1);");
+        engEvalString(engine_, "clear java;");
+        GDEBUG("%s", matlab_buffer_);
+        // Close the Matlab engine
+        GDEBUG("Closing down Matlab\n");
+        engClose(engine_);
+    }
+
+    virtual int process(GadgetContainerMessage<IsmrmrdReconData> *);
+
+protected:
+    GADGET_PROPERTY(debug_mode, bool, "Debug mode", false);
+    GADGET_PROPERTY(matlab_path, std::string, "Path to Matlab code", "");
+    GADGET_PROPERTY(matlab_classname, std::string, "Name of Matlab gadget class", "");
+    GADGET_PROPERTY(matlab_port, int, "Port on which to run Matlab command server", 3000);
+
+    int process_config(ACE_Message_Block* mb)
+    {
+        std::string cmd;
+
+        debug_mode_  = debug_mode.value();
+        path_        = matlab_path.value();
+        classname_   = matlab_classname.value();
+        if (classname_.empty()) {
+            GERROR("Missing Matlab Gadget classname in config!");
+            return GADGET_FAIL;
+        }
+        command_server_port_ = matlab_port.value();
+
+        GDEBUG("MATLAB Class Name : %s\n", classname_.c_str());
+
+        //char matlab_buffer_[2049] = "\0";
+        char matlab_buffer_[20481] = "\0";
+        engOutputBuffer(engine_, matlab_buffer_, 20480);
+
+   	// Instantiate the Java Command server
+        // TODO: we HAVE to pause in Matlab to allow the java command server thread to start
+        cmd = "M = MatlabCommandServer(" + boost::lexical_cast<std::string>(command_server_port_) +
+                "); M.start(); pause(1);";
+	engEvalString(engine_, cmd.c_str());
+        GDEBUG("%s", matlab_buffer_);
+
+        // add user specified path for this gadget
+        if (!path_.empty()) {
+            cmd = "addpath('" + path_ + "');";
+            send_matlab_command(cmd);
+        }
+
+        // Put the XML Header into the matlab workspace
+        std::string xmlConfig = std::string(mb->rd_ptr());
+        mxArray *xmlstring = mxCreateString(xmlConfig.c_str());
+        engPutVariable(engine_, "xmlstring", xmlstring);
+
+        // Instantiate the Matlab gadget object from the user specified class
+        // Call matlab gadget's init method with the XML Header
+        // and the user defined config method
+        cmd = "matgadget = " + classname_ + "();";
+        cmd += "matgadget.init(xmlstring); matgadget.config();";
+        if (send_matlab_command(cmd) != GADGET_OK) {
+            GDEBUG("Failed to send matlab command.\n");
+            return GADGET_FAIL;
+        }
+
+	mxDestroyArray(xmlstring);
+
+        return GADGET_OK;
+    }
+
+    int send_matlab_command(std::string& command)
+    {
+
+        if (debug_mode_) {
+            char matlab_buffer_[8193] = "\0";
+            engOutputBuffer(engine_, matlab_buffer_, 8192);
+            engEvalString(engine_, command.c_str());
+            GDEBUG("%s\n", matlab_buffer_);
+            return GADGET_OK;
+        }
+        else {
+            ACE_SOCK_Stream client_stream;
+            ACE_INET_Addr remote_addr(command_server_port_, "localhost");
+            ACE_SOCK_Connector connector;
+
+            if (connector.connect(client_stream, remote_addr) == -1) {
+                GDEBUG("Connection failed\n");
+                return GADGET_FAIL;
+            }
+
+            ACE_Time_Value timeout(10);
+            if (client_stream.send_n(command.c_str(), command.size(), &timeout) == -1) {
+                GDEBUG("Error in send_n\n");
+                client_stream.close();
+                return GADGET_FAIL;
+            }
+
+            if (client_stream.close () == -1){
+                GDEBUG("Error in close\n");
+                return GADGET_FAIL;
+            }
+            return GADGET_OK;
+        }
+    }
+
+
+    std::string path_;
+    std::string classname_;
+    int command_server_port_;
+    int debug_mode_;
+
+    Engine *engine_;
+};
+
+GADGET_FACTORY_DECLARE(MatlabBufferGadget);
+}
diff --git a/gadgets/matlab/MatlabCommandServer.java b/gadgets/matlab/MatlabCommandServer.java
new file mode 100644
index 0000000..0a7bdb9
--- /dev/null
+++ b/gadgets/matlab/MatlabCommandServer.java
@@ -0,0 +1,129 @@
+import java.lang.Thread;
+import java.net.ServerSocket;
+import java.net.Socket;
+import java.io.*;
+import com.mathworks.jmi.*;
+
+class MatlabCommandServer extends Thread {
+
+    ServerSocket socket = null;
+    Matlab matlab = null;
+
+    int port;
+
+    boolean stop_signal_received = false;
+    boolean socket_is_open = false;
+
+    public MatlabCommandServer(int port) {
+	try {
+	    this.matlab = new Matlab();
+	} catch (Exception e) {
+	    System.err.println("Failed to create new Matlab");
+            System.err.println(e.getMessage());
+	}
+        this.port = port;
+    }
+
+    public int getLocalPort() {
+        if (!socket_is_open) {
+            System.err.println("Socket isn't open!");
+            return -1;
+        }
+        return this.socket.getLocalPort();
+    }
+
+    private boolean openSocket() {
+        if (socket_is_open) {
+            return false;
+        }
+
+        try {
+            this.socket = new ServerSocket(this.port);
+            this.socket.setSoTimeout(1000); //1000ms time out. We will check if shutdown has occurred every 1000ms.
+        } catch (Exception e) {
+            // Socket creation has failed, we should do something
+            System.err.println("Socket failed to open");
+            System.err.println(e.getMessage());
+            return false;
+        }
+        socket_is_open = true;
+        return true;
+    }
+
+
+    private boolean closeSocket() {
+        if (!socket_is_open) {
+            return false;
+        }
+
+        try {
+            socket.close();
+        } catch (Exception e) {
+            // Socket close has failed, we should do something
+            System.err.println("Socket failed to close");
+            System.err.println(e.getMessage());
+            return false;
+        }
+        socket_is_open = false;
+        return true;
+    }
+
+    private boolean receiveCommand() {
+	try {
+	    Socket sock = socket.accept();
+	    BufferedReader in = new BufferedReader(new InputStreamReader(sock.getInputStream()));
+
+	    //System.out.println("Waiting for command");
+	    while (!in.ready()) ;
+
+	    String command = in.readLine();
+
+	    //System.out.println(command);
+	    matlab.evalConsoleOutput(command);
+
+	    in.close();
+	    sock.close();
+
+
+	} catch (java.io.InterruptedIOException e) {
+             // This means that we have waited for connection but so far nothing.
+             // We should check if the thread has been notified to stop,
+             // if so, stop the loop and otherwise continue.
+	    if (stop_signal_received) {
+		return false;
+	    }
+	} catch (Exception e) {
+	    System.err.println("Something unexpected has happened!!");
+	    System.err.println(e.getMessage());
+	    return false;
+	}
+	return true;
+
+    }
+
+    public void notifyEnd() {
+	stop_signal_received = true;
+    }
+
+    public void run() {
+	if (!openSocket()) {
+            return;
+        }
+
+        System.err.format("Matlab Command Server is running on port %d%n", this.getLocalPort());
+
+	while (true) {
+	    if (!receiveCommand()) break;
+	}
+	closeSocket();
+	stop_signal_received = false;
+    }
+
+    protected void finalize() throws Throwable {
+	System.out.println("MatlabCommandServer finalize() called");
+	stop_signal_received  = true;
+	closeSocket();
+	super.finalize();
+    }
+
+}
diff --git a/gadgets/matlab/MatlabGadget.cpp b/gadgets/matlab/MatlabGadget.cpp
new file mode 100644
index 0000000..99f8baa
--- /dev/null
+++ b/gadgets/matlab/MatlabGadget.cpp
@@ -0,0 +1,302 @@
+#include "MatlabGadget.h"
+
+namespace Gadgetron{
+
+int AcquisitionMatlabGadget::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+        GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+    // Initialize a string for matlab commands
+    std::string cmd;
+
+    ISMRMRD::AcquisitionHeader *acq = m1->getObjectPtr();
+
+    mwSize acq_hdr_dims[2] = {sizeof(ISMRMRD::AcquisitionHeader), 1};
+    mxArray *acq_hdr_bytes = mxCreateNumericArray(2, acq_hdr_dims, mxUINT8_CLASS, mxREAL);
+    memcpy(mxGetData(acq_hdr_bytes), acq, sizeof(ISMRMRD::AcquisitionHeader));
+
+    // Copy the data
+    std::complex<float> *raw_data = m2->getObjectPtr()->get_data_ptr();
+    if (!raw_data) {
+        GDEBUG("Broken raw_data pointer\n");
+        return GADGET_FAIL;
+    }
+
+    unsigned long num_elements = m2->getObjectPtr()->get_number_of_elements();
+
+    float *real_data = (float *)mxCalloc(num_elements, sizeof(float));
+    if (!real_data) {
+        GDEBUG("Failed to allocate float* for real_data\n");
+        return GADGET_FAIL;
+    }
+    float *imag_data = (float *)mxCalloc(num_elements, sizeof(float));
+    if (!imag_data) {
+        GDEBUG("Failed to allocate float* for imag_data\n");
+        return GADGET_FAIL;
+    }
+
+    for (int i = 0; i < num_elements; i++) {
+        //GDEBUG_STREAM(i << ": " << raw_data[i].real() << ", " << raw_data[i].imag() << endl);
+        real_data[i] = raw_data[i].real();
+        imag_data[i] = raw_data[i].imag();
+    }
+
+    mxArray *acq_data = mxCreateNumericMatrix(0, 0, mxSINGLE_CLASS, mxCOMPLEX);
+    mxSetData(acq_data, real_data);
+    mxSetImagData(acq_data, imag_data);
+    mxSetM(acq_data, m1->getObjectPtr()->number_of_samples);
+    mxSetN(acq_data, m1->getObjectPtr()->active_channels);
+
+    // Logic:
+    // send AcquisitionHeader as a byte array
+    // send AcquisitionData as a complex float array
+    // Call the run_process function in the BaseGadget
+    // Empty the gadget's queue.
+    // This puts a copy of the queue on the workspace.
+    // The queue is a structure array and we read it back
+    // TODO put this in a readme file somewhere useful
+    engPutVariable(engine_, "hdr_bytes", acq_hdr_bytes);
+
+    engPutVariable(engine_, "data", acq_data);
+    cmd = "Q = matgadget.run_process(1, hdr_bytes, data); matgadget.emptyQ();";
+    send_matlab_command(cmd);
+
+    // Get the size of the gadget's queue
+    mxArray *Q = engGetVariable(engine_, "Q");
+    if (Q == NULL) {
+        GDEBUG("Failed to get the Queue from matgadget\n");
+        return GADGET_FAIL;
+    }
+
+    size_t qlen = mxGetNumberOfElements(Q);
+   
+
+    // Loop over the elements of the Q, reading one entry at a time
+    // to get a structure with type, headerbytes, and data
+    mwIndex idx;
+    for (idx = 0; idx < qlen; idx++) {
+        mxArray *res_type = mxGetField(Q, idx, "type");
+        mxArray *res_hdr  = mxGetField(Q, idx, "bytes");
+        mxArray *res_data = mxGetField(Q, idx, "data");
+
+        // determine the type of the object on the queue (i.e. acquisition or image)
+        int tp = *((int *)mxGetData(res_type));
+        switch (tp) {
+        case 1:     // AcquisitionHeader
+        {
+            // grab the modified AcquisitionHeader and convert it back to C++
+            GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m3 =
+                    new GadgetContainerMessage<ISMRMRD::AcquisitionHeader>();
+            ISMRMRD::AcquisitionHeader *hdr_new = m3->getObjectPtr();
+            memcpy(hdr_new, mxGetData(res_hdr), sizeof(ISMRMRD::AcquisitionHeader));
+
+            size_t number_of_samples = mxGetM(res_data);
+            size_t active_channels = mxGetN(res_data);
+
+            GadgetContainerMessage<hoNDArray< std::complex<float> > >* m4 =
+                    new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+
+            m3->cont(m4);
+            std::vector<size_t> dims;
+            dims.push_back(number_of_samples);
+            dims.push_back(active_channels);
+            try {
+                m4->getObjectPtr()->create(&dims);
+            } catch (std::bad_alloc& err) {
+                GDEBUG("Failed to create new hoNDArray\n");
+                return GADGET_FAIL;
+            }
+
+            float *real_data = (float *)mxGetData(res_data);
+            float *imag_data = (float *)mxGetImagData(res_data);
+            for (int i = 0; i < number_of_samples*active_channels; i++) {
+                m4->getObjectPtr()->get_data_ptr()[i] = std::complex<float>(real_data[i],imag_data[i]);
+            }
+
+            if (this->next()->putq(m3) < 0) {
+                GDEBUG("Failed to put Acquisition message on queue\n");
+                return GADGET_FAIL;
+            }
+
+            break;
+        }
+        case 2:     // ImageHeader
+        {
+            // grab the modified AcquisitionHeader and convert it back to C++
+            GadgetContainerMessage<ISMRMRD::ImageHeader>* m3 =
+                    new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+            ISMRMRD::ImageHeader *hdr_new = m3->getObjectPtr();
+            memcpy(hdr_new, mxGetData(res_hdr), sizeof(ISMRMRD::ImageHeader));
+
+            GadgetContainerMessage<hoNDArray< std::complex<float> > >* m4 =
+                    new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+
+            m3->cont(m4);
+            std::vector<size_t> dims;
+            dims.push_back(hdr_new->matrix_size[0]);
+            dims.push_back(hdr_new->matrix_size[1]);
+            dims.push_back(hdr_new->matrix_size[2]);
+            dims.push_back(hdr_new->channels);
+            try {
+                m4->getObjectPtr()->create(&dims);
+            } catch (std::bad_alloc& err) {
+                GDEBUG("Failed to create new hoNDArray\n");
+                return GADGET_FAIL;
+            }
+
+            float *real_data = (float *)mxGetData(res_data);
+            float *imag_data = (float *)mxGetImagData(res_data);
+            for (int i = 0; i < m4->getObjectPtr()->get_number_of_elements(); i++) {
+                m4->getObjectPtr()->get_data_ptr()[i] = std::complex<float>(real_data[i],imag_data[i]);
+            }
+
+            if (this->next()->putq(m3) < 0) {
+                GDEBUG("Failed to put Image message on queue\n");
+                return GADGET_FAIL;
+            }
+
+            break;
+        }
+        default:
+            GDEBUG("Matlab gadget returned undefined header type\n");
+            return GADGET_FAIL;
+        }
+    }
+
+    // Match all mxCreate___s with mxDestroy___s
+    mxDestroyArray(acq_hdr_bytes);
+    mxDestroyArray(acq_data);
+
+    //Match engGetVariable with mxDestroy___s
+    mxDestroyArray(Q);
+
+    // We are finished with the incoming messages m1 and m2
+    m1->release();
+
+    return GADGET_OK;
+}
+
+
+// TODO: The ImageMatlabGadget is not currently templated
+//      It only works for images of type std::complex<float>
+int ImageMatlabGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+        GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+    // Initialize a string for matlab commands
+    std::string cmd;
+
+    ISMRMRD::ImageHeader *img = m1->getObjectPtr();
+
+    // Create a mxArray of bytes for the ISMRMRD::ImageHeader
+    mwSize img_hdr_dims[2] = {sizeof(ISMRMRD::ImageHeader), 1};
+    mxArray *img_hdr_bytes = mxCreateNumericArray(2, img_hdr_dims, mxUINT8_CLASS, mxREAL);
+    memcpy(mxGetData(img_hdr_bytes), img, sizeof(ISMRMRD::ImageHeader));
+
+    // Create a mxArray for the Image data
+    std::complex<float> *raw_data = m2->getObjectPtr()->get_data_ptr();
+    if (!raw_data) {
+        GDEBUG("Broken raw_data pointer\n");
+        return GADGET_FAIL;
+    }
+
+    if (img->matrix_size[0] == 0) img->matrix_size[0] = 1;
+    if (img->matrix_size[1] == 0) img->matrix_size[1] = 1;
+    if (img->matrix_size[2] == 0) img->matrix_size[2] = 1;
+    if (img->channels == 0) img->channels = 1;
+
+    mwSize ndim = 4;
+    mwSize dims[4] = {img->matrix_size[0], img->matrix_size[1], img->matrix_size[2], img->channels};
+    mxArray *img_data = mxCreateNumericArray(ndim, dims, mxSINGLE_CLASS, mxCOMPLEX);
+
+    float *real_data = (float *)mxGetData(img_data);
+    float *imag_data = (float *)mxGetImagData(img_data);
+    unsigned long num_elements = m2->getObjectPtr()->get_number_of_elements();
+    for (int i = 0; i < num_elements; i++) {
+        real_data[i] = raw_data[i].real();
+        imag_data[i] = raw_data[i].imag();
+    }
+
+    engPutVariable(engine_, "hdr_bytes", img_hdr_bytes);
+    engPutVariable(engine_, "data", img_data);
+    cmd = "Q = matgadget.run_process(2, hdr_bytes, data); matgadget.emptyQ();";
+    send_matlab_command(cmd);
+
+    // Get the size of the gadget's queue
+    mxArray *Q = engGetVariable(engine_, "Q");
+    if (Q == NULL) {
+        GDEBUG("Failed to get the Queue from matgadget\n");
+        return GADGET_FAIL;
+    }
+    size_t qlen = mxGetNumberOfElements(Q);
+
+    // Loop over the elements of the Q, reading one entry at a time
+    // to get a structure with type, headerbytes, and data
+    mwIndex idx;
+    for (idx = 0; idx < qlen; idx++) {
+        mxArray *res_type = mxGetField(Q, idx, "type");
+        mxArray *res_hdr  = mxGetField(Q, idx, "bytes");
+        mxArray *res_data = mxGetField(Q, idx, "data");
+
+        // determine the type of the object on the queue (i.e. acquisition or image)
+        // although, since this is an Image gadget, it better be an image
+        int tp = *((int *)mxGetData(res_type));
+        switch (tp) {
+        case 2:     // ImageHeader
+        {
+            // grab the modified AcquisitionHeader and convert it back to C++
+            GadgetContainerMessage<ISMRMRD::ImageHeader>* m3 =
+                    new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+            ISMRMRD::ImageHeader *hdr_new = m3->getObjectPtr();
+            memcpy(hdr_new, mxGetData(res_hdr), sizeof(ISMRMRD::ImageHeader));
+
+            GadgetContainerMessage<hoNDArray< std::complex<float> > >* m4 =
+                    new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+
+            m3->cont(m4);
+            std::vector<size_t> dims;
+            dims.push_back(hdr_new->matrix_size[0]);
+            dims.push_back(hdr_new->matrix_size[1]);
+            dims.push_back(hdr_new->matrix_size[2]);
+            dims.push_back(hdr_new->channels);
+            try {
+                m4->getObjectPtr()->create(&dims);
+            } catch (std::bad_alloc& err) {
+                GDEBUG("Failed to create new hoNDArray\n");
+                return GADGET_FAIL;
+            }
+
+            float *real_data = (float *)mxGetData(res_data);
+            float *imag_data = (float *)mxGetImagData(res_data);
+            for (int i = 0; i < m4->getObjectPtr()->get_number_of_elements(); i++) {
+                m4->getObjectPtr()->get_data_ptr()[i] = std::complex<float>(real_data[i],imag_data[i]);
+            }
+
+            if (this->next()->putq(m3) < 0) {
+                GDEBUG("Failed to put Image message on queue\n");
+                return GADGET_FAIL;
+            }
+
+            break;
+        }
+        default:
+            GDEBUG("Matlab gadget returned undefined header type\n");
+            return GADGET_FAIL;
+        }
+    }
+
+    // Match all mxCreate___s with mxDestroy___s
+    mxDestroyArray(img_hdr_bytes);
+    mxDestroyArray(img_data);
+
+    // Match engGetVariable with mxDestroy___s
+    mxDestroyArray(Q);
+
+    // We are finished with the incoming messages m1 and m2
+    m1->release();
+
+    return GADGET_OK;
+}
+
+
+GADGET_FACTORY_DECLARE(AcquisitionMatlabGadget)
+GADGET_FACTORY_DECLARE(ImageMatlabGadget)
+}
diff --git a/gadgets/matlab/MatlabGadget.h b/gadgets/matlab/MatlabGadget.h
new file mode 100644
index 0000000..55655c6
--- /dev/null
+++ b/gadgets/matlab/MatlabGadget.h
@@ -0,0 +1,203 @@
+#pragma once
+
+#include "gadgetron_matlab_export.h"
+#include "Gadget.h"
+#include "gadgetron_paths.h"
+#include "hoNDArray.h"
+#include "ismrmrd/ismrmrd.h"
+#include "log.h"
+#include "engine.h"     // Matlab Engine header
+
+#include "ace/Synch.h"  // For the MatlabCommandServer
+#include "ace/SOCK_Connector.h"
+#include "ace/INET_Addr.h"
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <complex>
+#include <boost/lexical_cast.hpp>
+
+// TODO:
+//Make the port option work so that we can have multiple matlabs running, each with its own command server.
+//Create a debug option to use evalstring and get back the matlab output on every function call.
+//Finish the image stuff
+//Is there a better way to kill the command server?
+//Test on windows
+
+
+namespace Gadgetron{
+
+template <class T> class MatlabGadget :
+    public Gadget2<T, hoNDArray< std::complex<float> > >
+{
+public:
+    MatlabGadget(): Gadget2<T, hoNDArray< std::complex<float> > >()
+    {
+        // Open the Matlab Engine on the current host
+        GDEBUG("Starting MATLAB engine\n");
+        if (!(engine_ = engOpen("matlab -nosplash -nodesktop"))) {
+            // TODO: error checking!
+            GDEBUG("Can't start MATLAB engine\n");
+        } else {
+            // Add ISMRMRD Java bindings jar to Matlab's path
+            // TODO: this should be in user's Matlab path NOT HERE
+
+            // Prepare a buffer for collecting Matlab's output
+            char matlab_buffer_[2049] = "\0";
+            engOutputBuffer(engine_, matlab_buffer_, 2048);
+
+	    // Add the necessary paths to the matlab environment
+	    // Java matlab command server
+	    std::string gadgetron_matlab_path = get_gadgetron_home() + "/share/gadgetron/matlab";
+	    std::string java_add_path_cmd = std::string("javaaddpath('") + gadgetron_matlab_path + std::string("');");
+	    std::string add_path_cmd = std::string("addpath('") + gadgetron_matlab_path + std::string("');");
+            // Gadgetron matlab scripts
+	    engEvalString(engine_, java_add_path_cmd.c_str());
+	    engEvalString(engine_, add_path_cmd.c_str());
+            // ISMRMRD matlab library
+            engEvalString(engine_, "addpath(fullfile(getenv('ISMRMRD_HOME'), '/share/ismrmrd/matlab'));");
+
+	    GDEBUG("%s", matlab_buffer_);
+        }
+    }
+
+    ~MatlabGadget()
+    {
+        char matlab_buffer_[2049] = "\0";
+        engOutputBuffer(engine_, matlab_buffer_, 2048);
+	// Stop the Java Command server
+        // send the stop signal to the command server and
+        //  wait a bit for it to shut down cleanly.
+        GDEBUG("Closing down the Matlab Command Server\n");
+	engEvalString(engine_, "M.notifyEnd(); pause(1);");
+        engEvalString(engine_, "clear java;");
+        GDEBUG("%s", matlab_buffer_);
+        // Close the Matlab engine
+        GDEBUG("Closing down Matlab\n");
+        engClose(engine_);
+    }
+
+protected:
+    GADGET_PROPERTY(debug_mode, bool, "Debug mode", false);
+    GADGET_PROPERTY(matlab_path, std::string, "Path to Matlab code", "");
+    GADGET_PROPERTY(matlab_classname, std::string, "Name of Matlab gadget class", "");
+    GADGET_PROPERTY(matlab_port, int, "Port on which to run Matlab command server", 3000);
+
+    int process_config(ACE_Message_Block* mb)
+    {
+        std::string cmd;
+
+        debug_mode_  = debug_mode.value();
+        path_        = matlab_path.value();
+        classname_   = matlab_classname.value();
+        if (classname_.empty()) {
+            GERROR("Missing Matlab Gadget classname in config!");
+            return GADGET_FAIL;
+        }
+        command_server_port_ = matlab_port.value();
+
+        GDEBUG("MATLAB Class Name : %s\n", classname_.c_str());
+
+        //char matlab_buffer_[2049] = "\0";
+        char matlab_buffer_[20481] = "\0";
+        engOutputBuffer(engine_, matlab_buffer_, 20480);
+
+   	// Instantiate the Java Command server
+        // TODO: we HAVE to pause in Matlab to allow the java command server thread to start
+        cmd = "M = MatlabCommandServer(" + boost::lexical_cast<std::string>(command_server_port_) +
+                "); M.start(); pause(1);";
+	engEvalString(engine_, cmd.c_str());
+        GDEBUG("%s", matlab_buffer_);
+
+        // add user specified path for this gadget
+        if (!path_.empty()) {
+            cmd = "addpath('" + path_ + "');";
+            send_matlab_command(cmd);
+        }
+
+        // Put the XML Header into the matlab workspace
+        std::string xmlConfig = std::string(mb->rd_ptr());
+        mxArray *xmlstring = mxCreateString(xmlConfig.c_str());
+        engPutVariable(engine_, "xmlstring", xmlstring);
+
+        // Instantiate the Matlab gadget object from the user specified class
+        // Call matlab gadget's init method with the XML Header
+        // and the user defined config method
+        cmd = "matgadget = " + classname_ + "();";
+        cmd += "matgadget.init(xmlstring); matgadget.config();";
+        if (send_matlab_command(cmd) != GADGET_OK) {
+            GDEBUG("Failed to send matlab command.\n");
+            return GADGET_FAIL;
+        }
+
+	mxDestroyArray(xmlstring);
+
+        return GADGET_OK;
+    }
+
+    int send_matlab_command(std::string& command)
+    {
+
+        if (debug_mode_) {
+            char matlab_buffer_[2049] = "\0";
+            engOutputBuffer(engine_, matlab_buffer_, 2048);
+            engEvalString(engine_, command.c_str());
+            GDEBUG("%s\n", matlab_buffer_);
+            return GADGET_OK;
+        }
+        else {
+            ACE_SOCK_Stream client_stream;
+            ACE_INET_Addr remote_addr(command_server_port_, "localhost");
+            ACE_SOCK_Connector connector;
+
+            if (connector.connect(client_stream, remote_addr) == -1) {
+                GDEBUG("Connection failed\n");
+                return GADGET_FAIL;
+            }
+
+            ACE_Time_Value timeout(10);
+            if (client_stream.send_n(command.c_str(), command.size(), &timeout) == -1) {
+                GDEBUG("Error in send_n\n");
+                client_stream.close();
+                return GADGET_FAIL;
+            }
+
+            if (client_stream.close () == -1){
+                GDEBUG("Error in close\n");
+                return GADGET_FAIL;
+            }
+            return GADGET_OK;
+        }
+    }
+
+    std::string path_;
+    std::string classname_;
+    int command_server_port_;
+    int debug_mode_;
+
+    Engine *engine_;
+};
+
+
+
+class EXPORTGADGETSMATLAB AcquisitionMatlabGadget :
+    public MatlabGadget<ISMRMRD::AcquisitionHeader>
+{
+    public:
+        GADGET_DECLARE(AcquisitionMatlabGadget);
+        int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+                GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+
+};
+
+class EXPORTGADGETSMATLAB ImageMatlabGadget :
+    public MatlabGadget<ISMRMRD::ImageHeader>
+{
+    public:
+        GADGET_DECLARE(ImageMatlabGadget);
+        int process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+                GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+
+};
+}
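
The four GADGET_PROPERTY entries declared above (debug_mode, matlab_path, matlab_classname, matlab_port) correspond to the <property> elements set for AcquisitionMatlabGadget and ImageMatlabGadget in gadgets/matlab/matlab.xml further down in this patch. The MATLAB classes named by matlab_classname (scale.m, accumulate_and_recon.m and mask_image.m, also part of this patch) supply the config and process methods, and the result queue read back as Q, that the C++ side drives through process_config() and process().
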
diff --git a/gadgets/matlab/MatlabUtils.cpp b/gadgets/matlab/MatlabUtils.cpp
new file mode 100644
index 0000000..2a5cdbb
--- /dev/null
+++ b/gadgets/matlab/MatlabUtils.cpp
@@ -0,0 +1,376 @@
+#include "MatlabUtils.h"
+
+#include "hoNDArray_math.h"
+
+using namespace Gadgetron;
+
+template<class T> struct isComplex { static constexpr mxComplexity value = mxREAL;};
+template<class REAL> struct isComplex<complext<REAL>> { static constexpr mxComplexity value = mxCOMPLEX;};
+template<class REAL> struct isComplex<std::complex<REAL>>{ static constexpr mxComplexity value = mxCOMPLEX;};
+
+
+
+template<class T>  struct  MatlabClassID {};
+
+template<>	struct MatlabClassID<double>{ static constexpr mxClassID value =  mxDOUBLE_CLASS;};
+template<>	struct MatlabClassID<float>{ static constexpr mxClassID value =  mxSINGLE_CLASS;};
+
+template<class REAL> struct MatlabClassID<complext<REAL>>{ static constexpr mxClassID value =  MatlabClassID<REAL>::value;};
+template<class REAL> struct MatlabClassID<std::complex<REAL>>{ static constexpr mxClassID value =  MatlabClassID<REAL>::value;};
+
+template<>	struct MatlabClassID<int8_t>{ static constexpr mxClassID value =  mxINT8_CLASS;};
+template<>	struct MatlabClassID<uint8_t>{ static constexpr mxClassID value =  mxUINT8_CLASS;};
+template<>	struct MatlabClassID<int16_t>{ static constexpr mxClassID value =  mxINT16_CLASS;};
+template<>	struct MatlabClassID<uint16_t>{ static constexpr mxClassID value =  mxUINT16_CLASS;};
+template<>	struct MatlabClassID<int32_t>{ static constexpr mxClassID value =  mxINT32_CLASS;};
+template<>	struct MatlabClassID<uint32_t>{ static constexpr mxClassID value =  mxUINT32_CLASS;};
+template<>	struct MatlabClassID<int64_t>{ static constexpr mxClassID value =  mxINT64_CLASS;};
+template<>	struct MatlabClassID<uint64_t>{ static constexpr mxClassID value =  mxUINT64_CLASS;};
+
+template<class T> struct MatlabConverter {
+	static mxArray* convert(hoNDArray<T>* input){
+
+		mwSize ndim = input->get_number_of_dimensions();
+		mwSize* dims = new mwSize[ndim];
+		for (size_t i = 0; i < ndim; i++)
+			dims[i] = input->get_size(i);
+
+		T* raw_data = (T*) mxCalloc(input->get_number_of_elements(),sizeof(T));
+		memcpy(raw_data,input->get_data_ptr(),input->get_number_of_bytes());
+		auto result =  mxCreateNumericArray(ndim,dims,MatlabClassID<T>::value,isComplex<T>::value);
+		mxSetData(result,raw_data);
+		return result;
+
+	}
+
+	static hoNDArray<T>* convert(mxArray* input) {
+		auto ndims = mxGetNumberOfDimensions(input);
+		auto dims = mxGetDimensions(input);
+		std::vector<size_t> dimensions(ndims);
+		for (size_t i = 0; i <ndims; i++) dimensions[i] = dims[i];
+
+		auto result =  new hoNDArray<T>(dimensions);
+
+		if (mxGetImagData(input)) //This is for REAL data only
+			throw std::runtime_error("Trying to convert complex matlab data to non-complex c++ type");
+		if (mxGetClassID(input) == MatlabClassID<T>::value ){ //Same type, so we can just memcpy
+			T* raw_data = (T*) mxGetData(input);
+			memcpy(result->get_data_ptr(),raw_data,result->get_number_of_elements()*sizeof(T));
+		} else {
+			switch (mxGetClassID(input)){ // Have to do runtime type conversion, which means cases en-masse.
+			case MatlabClassID<double>::value :
+			copyMatlabdata<double>(input,result->get_data_ptr(),result->get_number_of_elements());
+			break;
+
+			case MatlabClassID<float>::value:
+			copyMatlabdata<float>(input,result->get_data_ptr(),result->get_number_of_elements());
+			break;
+
+			case MatlabClassID<int8_t>::value:
+			copyMatlabdata<int8_t>(input,result->get_data_ptr(),result->get_number_of_elements());
+			break;
+
+			case MatlabClassID<uint8_t>::value:
+			copyMatlabdata<uint8_t>(input,result->get_data_ptr(),result->get_number_of_elements());
+			break;
+
+			case MatlabClassID<int16_t>::value:
+			copyMatlabdata<int16_t>(input,result->get_data_ptr(),result->get_number_of_elements());
+			break;
+
+			case MatlabClassID<uint16_t>::value:
+			copyMatlabdata<uint16_t>(input,result->get_data_ptr(),result->get_number_of_elements());
+			break;
+
+			case MatlabClassID<int32_t>::value:
+			copyMatlabdata<int32_t>(input,result->get_data_ptr(),result->get_number_of_elements());
+			break;
+
+			case MatlabClassID<uint32_t>::value:
+			copyMatlabdata<uint32_t>(input,result->get_data_ptr(),result->get_number_of_elements());
+			break;
+			case MatlabClassID<int64_t>::value:
+			copyMatlabdata<int64_t>(input,result->get_data_ptr(),result->get_number_of_elements());
+			break;
+
+			case MatlabClassID<uint64_t>::value:
+			copyMatlabdata<uint64_t>(input,result->get_data_ptr(),result->get_number_of_elements());
+			break;
+
+			default:
+				throw std::runtime_error("Trying to convert from unsupported data type");
+				break;
+
+
+			}
+
+		}
+		return result;
+	}
+
+	template<class R> static void copyMatlabdata(mxArray* input, T* output,size_t len){
+		R* raw_ptr = (R*) mxGetData(input);
+		for (size_t i = 0; i < len; i++)
+			output[i] = T(raw_ptr[i]);
+	}
+};
+template<class REAL> struct MatlabConverter<complext<REAL>> {
+	static mxArray* convert(hoNDArray<complext<REAL>>* input){
+
+		size_t ndim = input->get_number_of_dimensions();
+
+		//Matlab does not support the creation of 7D arrays, but 8, 6 and 9 work just fine.
+		//If you're on a train that's running Matlab as its control system, you should be very very scared.
+
+		mwSize* dims = new mwSize[ndim];
+		for (size_t i = 0; i < ndim; i++)
+			dims[i] = input->get_size(i);
+
+		REAL* real_data = (REAL*) mxCalloc(input->get_number_of_elements(),sizeof(REAL));
+		REAL* imag_data = (REAL*) mxCalloc(input->get_number_of_elements(),sizeof(REAL));
+
+		complext<REAL>* raw_data = input->get_data_ptr();
+		for (size_t i = 0; i < input->get_number_of_elements(); i++){
+			real_data[i] = real(raw_data[i]);
+			imag_data[i] = imag(raw_data[i]);
+		}
+
+		auto result  =  mxCreateNumericArray(ndim,dims,MatlabClassID<REAL>::value,isComplex<complext<REAL>>::value);
+		mxSetData(result,real_data);
+		mxSetImagData(result,imag_data);
+
+		auto ndims_test = mxGetNumberOfDimensions(result);
+
+		return result;
+	}
+	static hoNDArray<complext<REAL> >* convert(mxArray* input) {
+		auto ndims = mxGetNumberOfDimensions(input);
+		auto dims = mxGetDimensions(input);
+		std::vector<size_t> dimensions(ndims);
+		for (size_t i = 0; i <ndims; i++) dimensions[i] = dims[i];
+		auto result = new hoNDArray<complext<REAL>>(dimensions);
+		switch (mxGetClassID(input)){ // Have to do runtime type conversion, which means cases en-masse.
+		case MatlabClassID<double>::value :
+		copyMatlabdata<double>(input,result->get_data_ptr(),result->get_number_of_elements());
+		break;
+
+		case MatlabClassID<float>::value:
+		copyMatlabdata<float>(input,result->get_data_ptr(),result->get_number_of_elements());
+		break;
+
+		case MatlabClassID<int8_t>::value:
+		copyMatlabdata<int8_t>(input,result->get_data_ptr(),result->get_number_of_elements());
+		break;
+
+		case MatlabClassID<uint8_t>::value:
+		copyMatlabdata<uint8_t>(input,result->get_data_ptr(),result->get_number_of_elements());
+		break;
+
+		case MatlabClassID<int16_t>::value:
+		copyMatlabdata<int16_t>(input,result->get_data_ptr(),result->get_number_of_elements());
+		break;
+
+		case MatlabClassID<uint16_t>::value:
+		copyMatlabdata<uint16_t>(input,result->get_data_ptr(),result->get_number_of_elements());
+		break;
+
+		case MatlabClassID<int32_t>::value:
+		copyMatlabdata<int32_t>(input,result->get_data_ptr(),result->get_number_of_elements());
+		break;
+
+		case MatlabClassID<uint32_t>::value:
+		copyMatlabdata<uint32_t>(input,result->get_data_ptr(),result->get_number_of_elements());
+		break;
+		case MatlabClassID<int64_t>::value:
+		copyMatlabdata<int64_t>(input,result->get_data_ptr(),result->get_number_of_elements());
+		break;
+
+		case MatlabClassID<uint64_t>::value:
+		copyMatlabdata<uint64_t>(input,result->get_data_ptr(),result->get_number_of_elements());
+		break;
+
+		default:
+			throw std::runtime_error("Trying to convert from unsupported data type");
+			break;
+
+		}
+
+		return result;
+
+	}
+
+	template<class R> static void copyMatlabdata(mxArray* input, complext<REAL>* output,size_t len){
+		R* real_ptr = (R*) mxGetData(input);
+		R* imag_ptr = (R*) mxGetImagData(input);
+		if (imag_ptr) {
+			for (size_t i = 0; i < len; i++)
+				output[i] = complext<REAL>(REAL(real_ptr[i]),REAL(imag_ptr[i]));
+		} else{
+			for (size_t i = 0; i < len; i++)
+				output[i] = complext<REAL>(REAL(real_ptr[i]),0);
+		}
+	}
+
+};
+
+template<class REAL> struct MatlabConverter<std::complex<REAL>> {
+	static mxArray* convert(hoNDArray<std::complex<REAL>>* input){
+		return MatlabConverter<complext<REAL>>::convert((hoNDArray<complext<REAL>>*) input);
+	}
+
+	static hoNDArray<std::complex<REAL>>* convert(mxArray* input){
+		return (hoNDArray<std::complex<REAL>>*) MatlabConverter<complext<REAL>>::convert(input);
+	}
+};
+
+template<class T> mxArray* Gadgetron::hoNDArrayToMatlab(hoNDArray<T> * input){
+	return MatlabConverter<T>::convert(input);
+
+}
+
+
+template<class T> hoNDArray<T>* Gadgetron::MatlabToHoNDArray(mxArray* data){
+	return MatlabConverter<T>::convert(data);
+}
+
+mxArray* Gadgetron::BufferToMatlabStruct(IsmrmrdDataBuffered* buffer){
+
+	const char * field_names[] = {"data","trajectory","headers","samplingdescription"};
+	mwSize one = 1;
+	auto mxstruct = mxCreateStructArray(1,&one,4,field_names);
+
+
+	if (!mxstruct) throw std::runtime_error("Failed to allocate Matlab struct");
+
+	auto mxdata = hoNDArrayToMatlab(&buffer->data_);
+	mxSetField(mxstruct,0,"data",mxdata);
+	//Add trajectory if available
+	if (buffer->trajectory_.get_number_of_elements() > 0){
+		int traj_fieldnumber = mxAddField(mxstruct,"trajectory");
+		auto mxtraj = hoNDArrayToMatlab(&buffer->trajectory_);
+		mxSetFieldByNumber(mxstruct,0,traj_fieldnumber,mxtraj);
+	}
+
+	//Add headers
+	std::cout << "Adding headers " << std::endl;
+	mwSize num_headers = buffer->headers_.get_number_of_elements();
+	auto mxheaders = mxCreateNumericMatrix(sizeof(ISMRMRD::AcquisitionHeader),num_headers,mxUINT8_CLASS,mxREAL);
+	memcpy(mxGetData(mxheaders),buffer->headers_.get_data_ptr(),sizeof(ISMRMRD::AcquisitionHeader)*num_headers);
+	mxSetField(mxstruct,0,"headers",mxheaders);
+
+	auto samplingdescription = samplingdescriptionToMatlabStruct(&buffer->sampling_);
+	mxSetField(mxstruct,0,"samplingdescription",samplingdescription);
+
+	return mxstruct;
+
+
+}
+static SamplingDescription MatlabStructToSamplingdescription(mxArray* mxstruct){
+
+	SamplingDescription samp;
+	auto encFOV = mxGetField(mxstruct,0,"encoded_FOV");
+	std::cout << "FOV PTR " << encFOV << " " << mxGetData(encFOV)<< std::endl;
+	memcpy(samp.encoded_FOV_,mxGetData(encFOV),sizeof(samp.encoded_FOV_));
+	auto recFOV = mxGetField(mxstruct,0,"recon_FOV");
+	memcpy(samp.recon_FOV_,mxGetData(recFOV),sizeof(samp.recon_FOV_));
+	auto encoded_matrix = mxGetField(mxstruct,0,"encoded_matrix");
+	memcpy(samp.encoded_matrix_,mxGetData(encoded_matrix),sizeof(samp.encoded_matrix_));
+	auto recon_matrix = mxGetField(mxstruct,0,"recon_matrix");
+	memcpy(samp.recon_matrix_,mxGetData(recon_matrix),sizeof(samp.recon_matrix_));
+	auto sampling_limit = mxGetField(mxstruct,0,"sampling_limits");
+	memcpy(samp.sampling_limits_,mxGetData(sampling_limit),sizeof(samp.sampling_limits_));
+
+	return samp;
+
+}
+
+
+IsmrmrdDataBuffered Gadgetron::MatlabStructToBuffer(mxArray* mxstruct){
+	IsmrmrdDataBuffered buffer;
+
+	auto data = mxGetField(mxstruct,0,"data");
+	buffer.data_ = *MatlabToHoNDArray<std::complex<float>>(data);
+	if (buffer.data_.get_number_of_dimensions() != 7){ //Someone (Matlab) got rid of our dimensions. Gee, thanks.
+		std::vector<size_t> newdims = *buffer.data_.get_dimensions();
+		for (int i = buffer.data_.get_number_of_dimensions(); i<7; i++)
+			newdims.push_back(1);
+		buffer.data_.reshape(&newdims);
+	}
+	auto traj = mxGetField(mxstruct,0,"trajectory");
+	if (traj){
+		buffer.trajectory_ = *MatlabToHoNDArray<float>(traj);
+		if (buffer.trajectory_.get_number_of_dimensions() != 7){
+			std::vector<size_t> newdims = *buffer.trajectory_.get_dimensions();
+			for (int i = buffer.trajectory_.get_number_of_dimensions(); i<7; i++)
+				newdims.push_back(1);
+			buffer.trajectory_.reshape(&newdims);
+		}
+	}
+	auto headers = mxGetField(mxstruct,0,"headers");
+
+	auto nmat_headers = mxGetN(headers);
+	std::vector<size_t> header_dim = {buffer.data_.get_size(1),buffer.data_.get_size(2),buffer.data_.get_size(4),buffer.data_.get_size(5),buffer.data_.get_size(6)};
+
+	buffer.headers_ = hoNDArray<ISMRMRD::AcquisitionHeader>(header_dim);
+
+	std::cout << "Number of headers: " << nmat_headers << " Expected: " << buffer.headers_.get_number_of_elements() << std::endl;
+	if (nmat_headers != buffer.headers_.get_number_of_elements())
+		throw std::runtime_error("Number of headers does not match number of kspace acquisitions");
+
+	memcpy(buffer.headers_.get_data_ptr(),mxGetData(headers),sizeof(ISMRMRD::AcquisitionHeader)*buffer.headers_.get_number_of_elements());
+
+	auto samplingdescription = mxGetField(mxstruct,0,"samplingdescription");
+	buffer.sampling_ = MatlabStructToSamplingdescription(samplingdescription);
+	return buffer;
+
+
+}
+
+
+mxArray* Gadgetron::samplingdescriptionToMatlabStruct(SamplingDescription* samp){
+
+	const char* fieldnames[5] = {"encoded_FOV","recon_FOV","encoded_matrix","recon_matrix","sampling_limits"};
+	mwSize one_dim  = 1;
+	auto sampStruct = mxCreateStructArray(1,&one_dim,5,fieldnames);
+	//Encoded FOV
+	mwSize dims = 3;
+	auto encFOV = mxCreateNumericArray(1,&dims,MatlabClassID<float>::value,mxComplexity(0));
+	memcpy(mxGetData(encFOV),samp->encoded_FOV_,sizeof(samp->encoded_FOV_));
+	mxSetField(sampStruct,0,"encoded_FOV",encFOV);
+	//Recon FOV
+	auto recFOV = mxCreateNumericArray(1,&dims,MatlabClassID<float>::value,mxComplexity(0));
+	memcpy(mxGetData(recFOV),samp->recon_FOV_,sizeof(samp->recon_FOV_));
+	mxSetField(sampStruct,0,"recon_FOV",recFOV);
+	//Encoded Matrix
+	auto encoded_matrix = mxCreateNumericArray(1,&dims,MatlabClassID<uint16_t>::value,mxComplexity(0));
+	memcpy(mxGetData(encoded_matrix),samp->encoded_matrix_,sizeof(samp->encoded_matrix_));
+	mxSetField(sampStruct,0,"encoded_matrix",encoded_matrix);
+	//Recon matrix
+	auto recon_matrix = mxCreateNumericArray(1,&dims,MatlabClassID<uint16_t>::value,mxComplexity(0));
+	memcpy(mxGetData(recon_matrix),samp->recon_matrix_,sizeof(samp->recon_matrix_));
+	mxSetField(sampStruct,0,"recon_matrix",recon_matrix);
+	//Sampling Limit
+	mwSize twodims[] = {3,3};
+	auto sampling_limit = mxCreateNumericArray(2,twodims,MatlabClassID<uint16_t>::value,mxComplexity(0));
+	memcpy(mxGetData(sampling_limit),samp->sampling_limits_,sizeof(samp->sampling_limits_));
+	mxSetField(sampStruct,0,"sampling_limits",sampling_limit);
+	return sampStruct;
+}
+
+
+
+template mxArray* Gadgetron::hoNDArrayToMatlab<float>(hoNDArray<float> *);
+template mxArray* Gadgetron::hoNDArrayToMatlab<double>(hoNDArray<double> *);
+template mxArray* Gadgetron::hoNDArrayToMatlab<float_complext>(hoNDArray<float_complext> *);
+template mxArray* Gadgetron::hoNDArrayToMatlab<double_complext>(hoNDArray<double_complext> *);
+template mxArray* Gadgetron::hoNDArrayToMatlab<std::complex<double>>(hoNDArray<std::complex<double>> *);
+template mxArray* Gadgetron::hoNDArrayToMatlab<std::complex<float>>(hoNDArray<std::complex<float>> *);
+
+
+template hoNDArray<float>* Gadgetron::MatlabToHoNDArray<float>(mxArray *);
+template hoNDArray<double>* Gadgetron::MatlabToHoNDArray<double>(mxArray *);
+template hoNDArray<float_complext>* Gadgetron::MatlabToHoNDArray<float_complext>(mxArray *);
+template hoNDArray<double_complext>* Gadgetron::MatlabToHoNDArray<double_complext>(mxArray *);
+
+template hoNDArray<std::complex<double>>* Gadgetron::MatlabToHoNDArray<std::complex<double>>(mxArray *);
+template hoNDArray<std::complex<float>>* Gadgetron::MatlabToHoNDArray<std::complex<float>>(mxArray *);
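
Note that the converters above are only explicitly instantiated for the listed element types; calling, say, MatlabToHoNDArray<int32_t>() from another translation unit would fail at link time until matching instantiations are appended here. As a sketch (int32_t already has a MatlabClassID specialization above, so only the instantiation lines would be missing), the two hypothetical additions are:

    template mxArray* Gadgetron::hoNDArrayToMatlab<int32_t>(hoNDArray<int32_t> *);
    template hoNDArray<int32_t>* Gadgetron::MatlabToHoNDArray<int32_t>(mxArray *);
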
diff --git a/gadgets/matlab/MatlabUtils.h b/gadgets/matlab/MatlabUtils.h
new file mode 100644
index 0000000..1ce250f
--- /dev/null
+++ b/gadgets/matlab/MatlabUtils.h
@@ -0,0 +1,47 @@
+/*
+ * MatlabUtils.h
+ *
+ *  Created on: Dec 5, 2014
+ *      Author: dch
+ */
+
+#ifndef MATLABUTILS_H_
+#define MATLABUTILS_H_
+
+#include "engine.h"
+#include "hoNDArray.h"
+#include "mri_core_data.h"
+namespace Gadgetron{
+
+/**
+ * Creates a Matlab array from an hoNDArray
+ * @param
+ * @return
+ */
+template<class T> mxArray* hoNDArrayToMatlab(hoNDArray<T>* );
+
+/**
+ * Creates an hoNDArray from a Matlab array. Will attempt type conversion.
+ * @param
+ * @return
+ */
+template<class T> hoNDArray<T>* MatlabToHoNDArray(mxArray*);
+
+/**
+ * Creates a Matlab struct from an IsmrmrdDataBuffered
+ * @param buffer
+ * @return
+ */
+mxArray* BufferToMatlabStruct(IsmrmrdDataBuffered* buffer);
+
+
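+/**
+ * Creates an IsmrmrdDataBuffered from a Matlab struct of the form produced by BufferToMatlabStruct
+ * @param mxstruct
+ * @return
+ */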
+IsmrmrdDataBuffered MatlabStructToBuffer(mxArray* mxstruct);
+
+/**
+ * Create a Matlab struct from a SamplingDescription
+ * @param samp
+ * @return
+ */
+mxArray* samplingdescriptionToMatlabStruct(SamplingDescription* samp);
+}
+#endif /* MATLABUTILS_H_ */
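
Together with the Engine API pulled in through engine.h, these helpers support a simple round trip between Gadgetron arrays and a MATLAB workspace. The sketch below illustrates the intended usage; the function name roundtrip_example, the workspace variable name "img" and the doubling command are arbitrary choices for this example rather than anything mandated by the upstream code, and error handling is omitted for brevity.

    #include <vector>
    #include "engine.h"       // MATLAB Engine API: engOpen, engPutVariable, engGetVariable, ...
    #include "hoNDArray.h"
    #include "MatlabUtils.h"

    void roundtrip_example()
    {
        using namespace Gadgetron;

        Engine* eng = engOpen("matlab -nosplash -nodesktop");
        if (!eng) return;

        // Build a small single-precision test array.
        std::vector<size_t> dims = {64, 64};
        hoNDArray<float> img(dims);
        for (size_t i = 0; i < img.get_number_of_elements(); i++)
            img.get_data_ptr()[i] = static_cast<float>(i);

        mxArray* mximg = hoNDArrayToMatlab(&img);        // hoNDArray -> mxArray
        engPutVariable(eng, "img", mximg);
        engEvalString(eng, "img = 2 * img;");
        mxArray* mxres = engGetVariable(eng, "img");

        hoNDArray<float>* res = MatlabToHoNDArray<float>(mxres);  // mxArray -> hoNDArray

        // Match each mxCreate/engGetVariable with mxDestroyArray, as MatlabGadget.cpp does above.
        mxDestroyArray(mximg);
        mxDestroyArray(mxres);
        delete res;
        engClose(eng);
    }
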
diff --git a/gadgets/matlab/accumulate_and_recon.m b/gadgets/matlab/accumulate_and_recon.m
new file mode 100644
index 0000000..fe8ec16
--- /dev/null
+++ b/gadgets/matlab/accumulate_and_recon.m
@@ -0,0 +1,94 @@
+classdef accumulate_and_recon < handle & BaseGadget
+
+    properties
+
+        image_num;
+        series_num;
+        center_line;
+        accumulation;
+        
+    end
+
+    methods
+
+        function g = config(g)
+            
+            fprintf('The resonance frequency is %d\n', g.xml.experimentalConditions.H1resonanceFrequency_Hz);
+            nx = g.xml.encoding.encodedSpace.matrixSize.x;
+            ny = g.xml.encoding.encodedSpace.matrixSize.y;
+            nz = g.xml.encoding.encodedSpace.matrixSize.z;
+            % for 2D sequences, reading the z matrix size (getZ) can break
+            %try
+            %catch
+            
+	     % nz =1
+            %end
+            % the number of receiver channels is optional
+            try
+                % this is the only cast from java.lang.Integer that works in Matlab
+                nc = g.xml.acquisitionSystemInformation.receiverChannels;
+            catch
+	        nc = 1;
+            end
+            % the number of slices is optional
+            try
+                ns = g.xml.encoding.encodingLimits.slice.maximum + 1;
+            catch
+	        ns = 1;
+            end
+
+            g.center_line = g.xml.encoding.encodingLimits.kspace_encoding_step_1.center;
+            g.accumulation = zeros(nx, ny, nz, ns, nc);
+            g.image_num = 0;   % todo this needs to be static or global...
+            g.series_num = 0;  % todo this needs to be static or global...
+        end
+
+        function g = process(g, head, data)
+            %disp('Processing')
+            % stuff the line
+            line_offset = floor(size(g.accumulation,2)/2) - g.center_line;
+            kyind = head.idx.kspace_encode_step_1 + line_offset + 1;
+            kzind = head.idx.kspace_encode_step_2 + 1;
+            slind = head.idx.slice + 1;
+            %fprintf('  offset = %d, center = %d, index = %d\n', line_offset, g.center_line, kyind);
+            if (kyind == 1)
+		    disp(kzind)
+	    end
+            g.accumulation(:, kyind, kzind, slind, :) = data;
+
+            % At the end of the acquisition, reconstruct the slice
+            if (head.flagIsSet(head.FLAGS.ACQ_LAST_IN_SLICE))
+                disp('Found last in slice')
+                img_head = ismrmrd.ImageHeader;
+                img_head.channels = head.active_channels;
+                img_head.slice = head.idx.slice;
+                % set the matrix size
+                % set one element at a time to not break the type (uint16) of matrix_size
+     	        img_head.matrix_size(1) = size(g.accumulation,1); % nx
+     	        img_head.matrix_size(2) = size(g.accumulation,2); % ny
+     	        img_head.matrix_size(3) = size(g.accumulation,3); % nz
+
+                img_head.position = head.position;
+                img_head.read_dir = head.read_dir;
+                img_head.phase_dir = head.phase_dir;
+                img_head.slice_dir = head.slice_dir;
+                img_head.patient_table_position = head.patient_table_position;
+                img_head.acquisition_time_stamp = head.acquisition_time_stamp;
+                img_head.image_index = g.image_num;
+                img_head.image_series_index = g.series_num;
+
+		img_data = squeeze(g.accumulation(:,:,:,slind,:));
+                img_data = fftshift(ifftn(fftshift(img_data)));
+                imagesc(abs(img_data(:,:,1,1))); axis image; axis square;
+		pause(2)
+                close()
+
+                g.putQ(img_head, img_data);
+                %fprintf('Put on Queue %d, type = %d\n',length(g.Q),g.Q{1}.type);
+
+            end
+
+        end
+
+    end
+end
diff --git a/gadgets/matlab/bufferRecon.m b/gadgets/matlab/bufferRecon.m
new file mode 100644
index 0000000..1145f9d
--- /dev/null
+++ b/gadgets/matlab/bufferRecon.m
@@ -0,0 +1,88 @@
+classdef bufferRecon < handle & BaseBufferGadget
+    
+    properties
+        
+        image_num;
+        series_num;
+        center_line;
+        img_size;
+        
+    end
+    
+    methods
+        
+        function g = config(g)
+            
+            fprintf('The resonance frequency is %d\n', g.xml.experimentalConditions.H1resonanceFrequency_Hz);
+            nx = g.xml.encoding.encodedSpace.matrixSize.x;
+            ny = g.xml.encoding.encodedSpace.matrixSize.y;
+            nz = g.xml.encoding.encodedSpace.matrixSize.z;
+            % for 2D sequences, reading the z matrix size (getZ) can break
+            %try
+            %catch
+            
+            % nz =1
+            %end
+            % the number of receiver channels is optional
+            try
+                % this is the only cast from java.lang.Integer that works in Matlab
+                nc = g.xml.acquisitionSystemInformation.receiverChannels;
+            catch
+                nc = 1;
+            end
+            % the number of slices is optional
+            try
+                ns = g.xml.encoding.encodingLimits.slice.maximum + 1;
+            catch
+                ns = 1;
+            end
+            
+            g.center_line = g.xml.encoding.encodingLimits.kspace_encoding_step_1.center;
+            g.img_size = [nx ny nz];
+            g.image_num = 0;   % todo this needs to be static or global...
+            g.series_num = 0;  % todo this needs to be static or global...
+        end
+        
+        function g = process(g, recon_data)
+            disp('Processing')
+            % stuff the line
+            
+            data = recon_data{1}.data; %Assume only one encoding space
+            head = recon_data{1}.headers{1}; %Just get header from first trajectory
+            
+            
+            % At the end of the acquisition, reconstruct the slice
+            img_head = ismrmrd.ImageHeader;
+
+            %img_head.slice = head.idx.slice;
+            % set the matrix size
+            % set one element at a time to not break the type (uint16) of matrix_size
+            img_head.matrix_size(1) = g.img_size(1); % nx
+            img_head.matrix_size(2) = g.img_size(2); % ny
+            img_head.matrix_size(3) = g.img_size(3); % nz
+            
+            img_head.position = head.position;
+            img_head.read_dir = head.read_dir;
+            img_head.phase_dir = head.phase_dir;
+            img_head.slice_dir = head.slice_dir;
+            img_head.patient_table_position = head.patient_table_position;
+            img_head.acquisition_time_stamp = head.acquisition_time_stamp;
+            img_head.image_index = g.image_num;
+            img_head.image_series_index = g.series_num;
+            
+            img_data = squeeze(data);
+            img_data = fftshift(ifftn(fftshift(img_data)));
+            imagesc(abs(img_data(:,:,1,1))); axis image; axis square;
+            pause(2)
+            close()
+            
+            disp(size(img_data));
+            g.putImageQ(img_head, img_data);
+            %fprintf('Put on Queue %d, type = %d\n',length(g.Q),g.Q{1}.type);
+            
+            
+            
+        end
+        
+    end
+end
diff --git a/gadgets/matlab/gadgetron_matlab_export.h b/gadgets/matlab/gadgetron_matlab_export.h
new file mode 100644
index 0000000..7cf5c26
--- /dev/null
+++ b/gadgets/matlab/gadgetron_matlab_export.h
@@ -0,0 +1,23 @@
+/*
+ * gadgetroncore_export.h
+ *
+ *  Created on: Jan 28, 2013
+ *      Author: Michael S. Hansen
+ */
+
+#ifndef GADGETRONMATLAB_EXPORT_H_
+#define GADGETRONMATLAB_EXPORT_H_
+
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_MATLAB__) || defined (gadgetron_matlab_EXPORTS)
+#define EXPORTGADGETSMATLAB __declspec(dllexport)
+#else
+#define EXPORTGADGETSMATLAB __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETSMATLAB
+#endif
+
+
+#endif /* GADGETRONMATLAB_EXPORT_H_ */
diff --git a/gadgets/matlab/mask_image.m b/gadgets/matlab/mask_image.m
new file mode 100644
index 0000000..b78b6b5
--- /dev/null
+++ b/gadgets/matlab/mask_image.m
@@ -0,0 +1,27 @@
+classdef mask_image < handle & BaseGadget
+
+    properties
+    end
+
+    methods
+
+        function g = config(g)
+        end
+
+        function g = process(g, head, data)
+            % put the original data on the Q
+            g.putQ(head, data);
+
+            % modify the series number
+            head.image_series_index = head.image_series_index + 1;
+
+            % zero out a corner of the image
+            data(1:end/2,1:end/2,:) = 0;
+            
+            % put the modified header and image on the Q
+            g.putQ(head,data);
+
+        end
+
+    end
+end
diff --git a/gadgets/matlab/matlab.xml b/gadgets/matlab/matlab.xml
new file mode 100644
index 0000000..7a97a39
--- /dev/null
+++ b/gadgets/matlab/matlab.xml
@@ -0,0 +1,63 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+
+  <writer>
+    <slot>1022</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriter</classname>
+  </writer>
+
+  <gadget>
+    <name>MatlabAcquisition</name>
+    <dll>gadgetron_matlab</dll>
+    <classname>AcquisitionMatlabGadget</classname>
+    <property><name>debug_mode</name><value>false</value></property>
+    <property><name>matlab_path</name><value></value></property>
+    <property><name>matlab_classname</name><value>scale</value></property>
+    <property><name>matlab_port</name><value>3000</value></property>
+  </gadget>
+
+  <gadget>
+    <name>MatlabAcquisition</name>
+    <dll>gadgetron_matlab</dll>
+    <classname>AcquisitionMatlabGadget</classname>
+    <property><name>matlab_path</name><value></value></property>
+    <property><name>matlab_classname</name><value>accumulate_and_recon</value></property>
+    <property><name>matlab_port</name><value>3001</value></property>
+  </gadget>
+
+  <gadget>
+    <name>CropCombine</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CropAndCombineGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>MatlabImage</name>
+    <dll>gadgetron_matlab</dll>
+    <classname>ImageMatlabGadget</classname>
+    <property><name>matlab_path</name><value></value></property>
+    <property><name>matlab_classname</name><value>mask_image</value></property>
+    <property><name>matlab_port</name><value>3002</value></property>
+  </gadget>
+
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>ImageFinish</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadget</classname>
+  </gadget>
+</gadgetronStreamConfiguration>
diff --git a/gadgets/matlab/matlabbuffer.xml b/gadgets/matlab/matlabbuffer.xml
new file mode 100644
index 0000000..7c6696e
--- /dev/null
+++ b/gadgets/matlab/matlabbuffer.xml
@@ -0,0 +1,71 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+
+  <writer>
+    <slot>1022</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriter</classname>
+  </writer>
+
+    <gadget>
+        <name>AccTrig</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AcquisitionAccumulateTriggerGadget</classname>
+        <property>
+            <name>trigger_dimension</name>
+            <value>repetition</value>
+        </property>
+    </gadget>
+
+    <gadget>
+        <name>Buff</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>BucketToBufferGadget</classname>
+        <property>
+            <name>N_dimension</name>
+            <value></value>
+        </property>
+        <property>
+          <name>S_dimension</name>
+          <value></value>
+        </property>
+        <property>
+          <name>split_slices</name>
+          <value>true</value>
+        </property>
+    </gadget>
+
+
+  <gadget>
+    <name>MatlabBuffer</name>
+    <dll>gadgetron_matlab</dll>
+    <classname>MatlabBufferGadget</classname>
+    <property><name>debug_mode</name><value>true</value></property>
+    <property><name>matlab_path</name><value></value></property>
+    <property><name>matlab_classname</name><value>bufferRecon</value></property>
+    <property><name>matlab_port</name><value>3000</value></property>
+  </gadget>
+  <gadget>
+    <name>Combine</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CombineGadget</classname>
+  </gadget>
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+  </gadget>
+  <gadget>
+    <name>ImageFinish</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadget</classname>
+  </gadget>
+</gadgetronStreamConfiguration>
diff --git a/gadgets/matlab/matlabnoncartesian.xml b/gadgets/matlab/matlabnoncartesian.xml
new file mode 100644
index 0000000..444fc10
--- /dev/null
+++ b/gadgets/matlab/matlabnoncartesian.xml
@@ -0,0 +1,71 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+
+  <writer>
+    <slot>1022</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriter</classname>
+  </writer>
+
+    <gadget>
+        <name>AccTrig</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AcquisitionAccumulateTriggerGadget</classname>
+        <property>
+            <name>trigger_dimension</name>
+            <value>repetition</value>
+        </property>
+    </gadget>
+
+    <gadget>
+        <name>Buff</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>BucketToBufferGadget</classname>
+        <property>
+            <name>N_dimension</name>
+            <value></value>
+        </property>
+        <property>
+          <name>S_dimension</name>
+          <value></value>
+        </property>
+        <property>
+          <name>split_slices</name>
+          <value>true</value>
+        </property>
+    </gadget>
+
+
+  <gadget>
+    <name>MatlabBuffer</name>
+    <dll>gadgetron_matlab</dll>
+    <classname>MatlabBufferGadget</classname>
+    <property><name>debug_mode</name><value>true</value></property>
+    <property><name>matlab_path</name><value></value></property>
+    <property><name>matlab_classname</name><value>trajectoryScale</value></property>
+    <property><name>matlab_port</name><value>3000</value></property>
+  </gadget>
+  <gadget>
+    <name>Combine</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CombineGadget</classname>
+  </gadget>
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+  </gadget>
+  <gadget>
+    <name>ImageFinish</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadget</classname>
+  </gadget>
+</gadgetronStreamConfiguration>
diff --git a/gadgets/matlab/recon.m b/gadgets/matlab/recon.m
new file mode 100644
index 0000000..37632c6
--- /dev/null
+++ b/gadgets/matlab/recon.m
@@ -0,0 +1,87 @@
+classdef recon < handle & BaseGadget
+    
+    properties
+        
+        image_num;
+        series_num;
+        center_line;
+        img_size;
+        
+    end
+    
+    methods
+        
+        function g = config(g)
+            
+            fprintf('The resonance frequency is %d\n', g.xml.experimentalConditions.H1resonanceFrequency_Hz);
+            nx = g.xml.encoding.encodedSpace.matrixSize.x;
+            ny = g.xml.encoding.encodedSpace.matrixSize.y;
+            nz = g.xml.encoding.encodedSpace.matrixSize.z;
+            % for 2D sequences, reading the z matrix size (getZ) can break
+            %try
+            %catch
+            
+            % nz =1
+            %end
+            % the number of receiver channels is optional
+            try
+                % this is the only cast from java.lang.Integer that works in Matlab
+                nc = g.xml.acquisitionSystemInformation.receiverChannels;
+            catch
+                nc = 1;
+            end
+            % the number of slices is optional
+            try
+                ns = g.xml.encoding.encodingLimits.slice.maximum + 1;
+            catch
+                ns = 1;
+            end
+            
+            g.center_line = g.xml.encoding.encodingLimits.kspace_encoding_step_1.center;
+            g.img_size = [nx ny nz];
+            g.image_num = 0;   % todo this needs to be static or global...
+            g.series_num = 0;  % todo this needs to be static or global...
+        end
+        
+        function g = process(g, head, data)
+            %disp('Processing')
+            % stuff the line
+            
+            
+            
+            % At the end of the acquisition, reconstruct the slice
+            img_head = ismrmrd.ImageHeader;
+            %img_head.channels = head.active_channels;
+            img_head.channels = 1;
+            img_head.slice = 1;
+            %img_head.slice = head.idx.slice;
+            % set the matrix size
+            % set one element at a time to not break the type (uint16) of matrix_size
+            img_head.matrix_size(1) = g.img_size(1); % nx
+            img_head.matrix_size(2) = g.img_size(2); % ny
+            img_head.matrix_size(3) = g.img_size(3); % nz
+            
+            img_head.position = head.position;
+            img_head.read_dir = head.read_dir;
+            img_head.phase_dir = head.phase_dir;
+            img_head.slice_dir = head.slice_dir;
+            img_head.patient_table_position = head.patient_table_position;
+            img_head.acquisition_time_stamp = head.acquisition_time_stamp;
+            img_head.image_index = g.image_num;
+            img_head.image_series_index = g.series_num;
+            
+            img_data = squeeze(data);
+            img_data = fftshift(ifftn(fftshift(img_data)));
+            imagesc(abs(img_data(:,:,1,1))); axis image; axis square;
+            pause(2)
+            close()
+            
+            g.putQ(img_head, img_data);
+            %fprintf('Put on Queue %d, type = %d\n',length(g.Q),g.Q{1}.type);
+            
+            
+            
+        end
+        
+    end
+end
diff --git a/gadgets/matlab/scale.m b/gadgets/matlab/scale.m
new file mode 100644
index 0000000..0aa17a0
--- /dev/null
+++ b/gadgets/matlab/scale.m
@@ -0,0 +1,21 @@
+classdef scale < BaseGadget
+
+    properties
+        factor;
+    end
+
+    methods
+
+        function config(g)
+            g.factor = 2;
+        end
+
+        function process(g, head, data)
+    	    %fprintf('Processing line = %d\n', head.idx.kspace_encode_step_1);
+            reshdr = head;
+            resdata = g.factor * data;
+            g.putQ(reshdr, resdata);
+        end
+
+    end
+end
diff --git a/gadgets/matlab/trajectoryScale.m b/gadgets/matlab/trajectoryScale.m
new file mode 100644
index 0000000..421233f
--- /dev/null
+++ b/gadgets/matlab/trajectoryScale.m
@@ -0,0 +1,69 @@
+classdef trajectoryScale < handle & BaseBufferGadget
+    
+    properties
+        
+        image_num;
+        series_num;
+        center_line;
+        img_size;
+        
+    end
+    
+    methods
+        
+        function g = config(g)
+            
+            fprintf('The resonance frequency is %d\n', g.xml.experimentalConditions.H1resonanceFrequency_Hz);
+            nx = g.xml.encoding.encodedSpace.matrixSize.x;
+            ny = g.xml.encoding.encodedSpace.matrixSize.y;
+            nz = g.xml.encoding.encodedSpace.matrixSize.z;
+            % for 2D sequences, reading the z matrix size (getZ) can break
+            %try
+            %catch
+            
+            % nz =1
+            %end
+            % the number of receiver channels is optional
+            try
+                % this is the only cast from java.lang.Integer that works in Matlab
+                nc = g.xml.acquisitionSystemInformation.receiverChannels;
+            catch
+                nc = 1;
+            end
+            % the number of slices is optional
+            try
+                ns = g.xml.encoding.encodingLimits.slice.maximum + 1;
+            catch
+                ns = 1;
+            end
+            
+            g.center_line = g.xml.encoding.encodingLimits.kspace_encoding_step_1.center;
+            g.img_size = [nx ny nz];
+            g.image_num = 0;   % todo this needs to be static or global...
+            g.series_num = 0;  % todo this needs to be static or global...
+        end
+        
+        function g = process(g, recon_data)
+            disp('Processing')
+            % stuff the line
+            
+            
+           for n = 1:numel(recon_data) 
+                buffer =recon_data{n};
+                size(buffer.data)
+                buffer.trajectory = buffer.trajectory*0.5;
+                size(buffer.data)
+               g.putBufferQ(buffer) 
+           end
+           
+           
+           
+            
+            %fprintf('Put on Queue %d, type = %d\n',length(g.Q),g.Q{1}.type);
+            
+            
+            
+        end
+        
+    end
+end
diff --git a/gadgets/moco/CMakeLists.txt b/gadgets/moco/CMakeLists.txt
new file mode 100644
index 0000000..caa39ec
--- /dev/null
+++ b/gadgets/moco/CMakeLists.txt
@@ -0,0 +1,94 @@
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_MOCO__)
+endif (WIN32)
+
+find_package(Ismrmrd REQUIRED)
+
+set( CPU_REG 0)
+set( GPU_REG 0)
+
+if(ARMADILLO_FOUND)
+  if(ARMADILLO_VERSION_STRING VERSION_GREATER "3.819" )
+    
+    set(CPU_REG 1)
+    list(APPEND CPU_GADGETS cpuRegistrationAveragingGadget.cpp)
+    list(APPEND CPU_LIBS gadgetron_toolbox_cpucore_math gadgetron_toolbox_cpureg)
+    
+  else(ARMADILLO_VERSION_STRING VERSION_GREATER "3.819" )
+    MESSAGE("Armadillo of at least version 3.820 not found, not compiling cpu-based registration gadgets")
+  endif(ARMADILLO_VERSION_STRING VERSION_GREATER "3.819" )
+else (ARMADILLO_FOUND)
+  MESSAGE("Armadillo not found, not compiling cpu-based registration gadgets")
+endif (ARMADILLO_FOUND)
+
+if(CUDA_FOUND)
+  set(GPU_REG 1)
+  list(APPEND GPU_GADGETS gpuRegistrationAveragingGadget.cpp gpuRegistrationScatteringGadget.cpp)
+  list(APPEND GPU_LIBS gadgetron_toolbox_gpucore gadgetron_toolbox_gpureg ${CUDA_LIBRARIES})
+else (CUDA_FOUND)
+  MESSAGE("Cuda not found, not compiling gpu-based registration gadgets")
+endif (CUDA_FOUND)
+
+include_directories(   
+  ${ACE_INCLUDE_DIR} 
+  ${CMAKE_SOURCE_DIR}/gadgets/mri_core
+  ${CMAKE_SOURCE_DIR}/gadgets/moco
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+  ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/
+  ${HDF5_INCLUDE_DIR}
+  ${HDF5_INCLUDE_DIR}/cpp
+)
+
+if(CPU_REG)
+  include_directories(   
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+    ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu
+    ${ARMADILLO_INCLUDE_DIRS}
+    )
+endif(CPU_REG)
+
+if(GPU_REG)
+  include_directories(   
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/gpu
+    ${CUDA_INCLUDE_DIRS}
+    )
+endif(GPU_REG)
+
+if(CUDA_FOUND)
+    add_library(gadgetron_moco SHARED
+        cpuRegistrationAveragingGadget.h
+        gadgetron_moco_export.h
+        gpuRegistrationAveragingGadget.h
+        gpuRegistrationScatteringGadget.h
+        RegistrationAveragingGadget.h
+        RegistrationScatteringGadget.h
+        ${CPU_GADGETS}
+        ${GPU_GADGETS}
+      )
+
+    set_target_properties(gadgetron_moco PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})                                                                                                                                                                                                         
+
+    target_link_libraries(gadgetron_moco
+      gadgetron_gadgetbase
+      gadgetron_toolbox_cpucore gadgetron_mricore ${CPU_LIBS} ${GPU_LIBS}
+      ${Boost_LIBRARIES} ${ISMRMRD_LIBRARIES}
+      optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} 
+      )
+
+    install (TARGETS gadgetron_moco DESTINATION lib COMPONENT main)
+endif(CUDA_FOUND)
+
+install(FILES   cpuRegistrationAveragingGadget.h
+                gadgetron_moco_export.h
+                gpuRegistrationAveragingGadget.h
+                gpuRegistrationScatteringGadget.h
+                RegistrationAveragingGadget.h
+                RegistrationScatteringGadget.h
+                DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
+add_subdirectory(config)
diff --git a/gadgets/moco/RegistrationAveragingGadget.h b/gadgets/moco/RegistrationAveragingGadget.h
new file mode 100644
index 0000000..76325fb
--- /dev/null
+++ b/gadgets/moco/RegistrationAveragingGadget.h
@@ -0,0 +1,328 @@
+#ifndef RegistrationAveragingGadget_H
+#define RegistrationAveragingGadget_H
+
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+#include "complext.h"
+#include "PhysioInterpolationGadget.h"
+#include "GadgetStreamInterface.h"
+#include "GadgetronTimer.h"
+#include "gadgetron_moco_export.h"
+#include "hoNDArray_fileio.h"
+
+#ifdef USE_CUDA
+#include "cuNDArray_reductions.h"
+#endif // USE_CUDA
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+#include <boost/shared_ptr.hpp>
+#include <boost/shared_array.hpp>
+
+namespace Gadgetron{  
+
+  template<class ARRAY_TYPE, unsigned int D> class opticalFlowSolver;
+  
+  /**
+     This is an abstract gadget class and consequently should not be included in any xml configuration file.
+     Instead, instantiate the cpuRegistrationAveragingGadget or the gpuRegistrationAveragingGadget.
+  */
+  template<class ARRAY_TYPE, unsigned int D> class EXPORTGADGETS_MOCO RegistrationAveragingGadget 
+    : public Gadget2<ISMRMRD::ImageHeader, hoNDArray< typename ARRAY_TYPE::element_type > > // see note below
+  {
+    //
+    // We use hoNDArray to interface the gadget chain, even if ARRAY_TYPE is a cuNDArray
+    // Instead of hard coding the interface to use single precision (float), 
+    // "typename ARRAY_TYPE::element_type" could in principle denote a double precision type (double) as well.
+    // Registration of complex images is, however, not currently supported...
+    //
+    
+  public:
+    
+    RegistrationAveragingGadget() {
+      this->of_solver_ = 0x0;
+      this->number_of_phases_ = 0; // This is a property queried from the PhysioInterpolationGadget
+      this->set_parameter(std::string("alpha").c_str(), "0.05");
+      this->set_parameter(std::string("beta").c_str(), "1.0");
+      this->set_parameter(std::string("limit").c_str(), "0.01");
+      this->set_parameter(std::string("num_multiresolution_levels").c_str(), "3");
+      this->set_parameter(std::string("max_iterations_per_level").c_str(), "500");    
+      this->set_parameter(std::string("output_convergence").c_str(), "false");
+    }
+
+    virtual ~RegistrationAveragingGadget() {
+      if( this->of_solver_ ) delete this->of_solver_;
+    }
+
+  protected:
+
+    virtual int process_config(ACE_Message_Block *mb)
+    {
+      this->alpha_ = (typename ARRAY_TYPE::element_type)this->get_double_value("alpha");
+      this->beta_  = (typename ARRAY_TYPE::element_type)this->get_double_value("beta");
+      this->limit_ = (typename ARRAY_TYPE::element_type)this->get_double_value("limit");
+      this->output_convergence_ = this->get_bool_value(std::string("output_convergence").c_str());
+      this->num_multires_levels_ = this->get_int_value(std::string("num_multiresolution_levels").c_str());
+      this->max_iterations_per_level_ = this->get_int_value(std::string("max_iterations_per_level").c_str());
+      
+      // For now we require the existence of a gadget named "PhysioInterpolationGadget" upstream,
+      // to determine the number of incoming phases.
+      //
+      
+      GadgetStreamInterface *controller = this->get_controller();
+    
+      if( controller == 0x0 ){
+        GDEBUG("Failed to get controller\n");
+        return GADGET_FAIL;
+      }
+      
+      PhysioInterpolationGadget *physio = 
+        dynamic_cast<PhysioInterpolationGadget*>( controller->find_gadget(std::string("PhysioInterpolationGadget")) );
+      
+      if( physio == 0x0 ){
+        GDEBUG("Could not find (or cast) PhysioInterpolationGadget in gadget stream\n");
+        return GADGET_FAIL;
+      }
+      
+      this->number_of_phases_ = physio->get_number_of_phases();      
+      
+      GDEBUG("Configured for %d phases\n", this->number_of_phases_); 
+      return GADGET_OK;
+    }
+    
+    virtual int process( GadgetContainerMessage< ISMRMRD::ImageHeader > *m1,
+                         GadgetContainerMessage< hoNDArray< typename ARRAY_TYPE::element_type > > *m2 )
+    {
+
+      //GDEBUG("\nSERIES: %d, PHASE: %d", m1->getObjectPtr()->image_series_index, m1->getObjectPtr()->phase );
+
+      // If this image header corresponds to series 0, it is not part of the sorted phases.
+      // Just pass those images along...
+      //
+
+      if( m1->getObjectPtr()->image_series_index < 9 ){
+        return this->next()->putq(m1);
+      }
+      
+      // At first pass allocate the image buffer array.
+      //
+      
+      if( this->phase_images_.get() == 0x0 ){
+      
+        this->image_dimensions_ = *m2->getObjectPtr()->get_dimensions();
+        this->phase_images_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >
+          (new ACE_Message_Queue<ACE_MT_SYNCH>[this->number_of_phases_]);      
+	
+        size_t bsize = sizeof(GadgetContainerMessage<ISMRMRD::ImageHeader>)*100*this->number_of_phases_;
+	
+        for( unsigned int i=0; i<this->number_of_phases_; i++ ){
+          this->phase_images_[i].high_water_mark(bsize);
+          this->phase_images_[i].low_water_mark(bsize);      
+        }
+	
+        // Setup the optical flow solver
+        //
+	
+        if( this->setup_solver() != GADGET_OK ){
+          GDEBUG("Failed to set up optical flow solver\n");
+          return GADGET_FAIL;
+        }
+      }
+      
+      //
+      // Put the incoming images on the appropriate queue (based on the phase index).
+      // 
+      
+      unsigned int phase = m1->getObjectPtr()->phase;
+      
+      if( this->phase_images_[phase].enqueue_tail(m1) < 0 ) {
+        GDEBUG("Failed to add image to buffer\n");
+        return GADGET_FAIL;
+      }
+
+      return GADGET_OK;
+    }
+
+    // All the work is done here in the close method
+    //
+
+    virtual int close(unsigned long flags)
+    {
+      if( this->phase_images_.get() ){
+      
+        GDEBUG("RegistrationAveragingGadget::close (performing registration and averaging images)\n");
+      
+        // Make sure we have the same number of images on all phase queues
+        // (It doesn't really matter, but if not the case something probably went wrong upstream)
+        //
+
+        unsigned int num_images = this->phase_images_[0].message_count();
+
+        GDEBUG("Number of images for phase 0: %d", num_images );
+        
+        for( unsigned int phase = 0; phase< this->number_of_phases_; phase++ ){
+
+          unsigned int num_images_phase = this->phase_images_[phase].message_count();
+          GDEBUG("Number of images for phase %d: %d", phase, num_images_phase );
+
+          if( num_images != num_images_phase ){
+            GDEBUG("Failed to set up registration, a different number of images received for each phase\n");
+            return Gadget::close(flags);
+          }
+        }
+      
+        if( num_images == 0 ){
+          GDEBUG("No images to register\n");
+          return Gadget::close(flags);
+        }
+
+        for( unsigned int phase=0; phase < this->number_of_phases_; phase++ ){
+	
+          unsigned int num_image_elements = this->image_dimensions_[0]*image_dimensions_[1];
+          std::vector<size_t> moving_dims = this->image_dimensions_;
+          moving_dims.push_back(num_images-1);
+	
+          GadgetContainerMessage<ISMRMRD::ImageHeader> *header;
+
+          ARRAY_TYPE fixed_image;
+          ARRAY_TYPE moving_image(&moving_dims);
+	
+          for( unsigned int image=0; image<num_images; image++ ){
+	  
+            ACE_Message_Block *mbq;
+	  
+            if( this->phase_images_[phase].dequeue_head(mbq) < 0 ) {
+              GDEBUG("Image header dequeue failed\n");
+              return Gadget::close(flags);
+            }
+	  
+            GadgetContainerMessage<ISMRMRD::ImageHeader> *m1 = AsContainerMessage<ISMRMRD::ImageHeader>(mbq);
+	  
+            if( m1 == 0x0 ) {
+              GDEBUG("Unexpected image type on queue\n");
+              return Gadget::close(flags);
+            }
+	  
+            GadgetContainerMessage< hoNDArray<typename ARRAY_TYPE::element_type> > *m2 = 
+              AsContainerMessage< hoNDArray<typename ARRAY_TYPE::element_type> >(m1->cont());
+	  
+            if( m2 == 0x0 ) {
+              GDEBUG("Unexpected continuation on queue\n");
+              m1->release();
+              return Gadget::close(flags);
+            }
+	  
+            if( image == 0 ){
+
+              // Setup the fixed image.
+              // If ARRAY_TYPE is a cuNDArray the following assignment uploads the array to the device;
+              // for an 'hoNDArray' it merely copies the array.
+              fixed_image = *m2->getObjectPtr();
+
+              // We are going to pass on the averaged image using this header
+              header = m1; 
+
+              // The continuation will be a new array (set after registration).
+              // No registration is performed, however, if we received only one image;
+              // in that case we keep the current continuation.
+              if( num_images > 1 ){	      
+                m1->cont(0x0); 
+                m2->release();
+              }
+            }
+            else{
+
+              // Assign this image as the 'image-1'th frame in the moving image
+              ARRAY_TYPE tmp_moving(&image_dimensions_, moving_image.get_data_ptr()+(image-1)*num_image_elements);
+              tmp_moving = *m2->getObjectPtr(); // Copy as for the fixed image
+              m1->release();	    
+            }
+          }
+	
+          if( num_images > 1 ){
+	  
+            // Perform registration for the current phase
+            //
+	  
+            boost::shared_ptr<ARRAY_TYPE> deformations;
+            {
+              GadgetronTimer timer("Running registration");
+              deformations = this->of_solver_->solve( &fixed_image, &moving_image );
+            }
+
+            // Deform moving images based on the registration
+            //
+	  
+            boost::shared_ptr<ARRAY_TYPE> deformed_moving;
+            {
+              GadgetronTimer timer("Applying deformation");
+              deformed_moving = this->of_solver_->deform( &moving_image, deformations );
+            }
+	  
+            /*{
+            // The debug code below only compiles for cuNDArrays.
+            // To use (temporarily) comment out
+            // list(APPEND CPU_GADGETS cpuRegistrationAveragingGadget.cpp)
+            // in the CMakeLists.txt
+            //
+            char filename[256];
+            sprintf((char*)filename, "fixed_%d.real", phase);
+            write_nd_array<float>( fixed_image.to_host().get(), filename );
+            sprintf((char*)filename, "moving_%d.real", phase);
+            write_nd_array<float>( moving_image.to_host().get(), filename );
+            sprintf((char*)filename, "deformed_moving_%d.real", phase);
+            write_nd_array<float>( deformed_moving->to_host().get(), filename );
+            sprintf((char*)filename, "deformation_%d.real", phase);
+            write_nd_array<float>( deformations->to_host().get(), filename );
+            } */
+
+	 
+            // Accumulate the deformed moving images (into one image) and add this image to the fixed image. 
+            // Then divide by the number of images to get the average.
+            //	  
+	  
+            fixed_image += ((deformed_moving->get_number_of_dimensions() == 3) ? *sum(deformed_moving.get(), 2) : *deformed_moving);
+            fixed_image /= ((typename ARRAY_TYPE::element_type)num_images);
+	  
+            // Pass along averaged image
+            //
+	  
+            if( set_continuation( header, &fixed_image ) < 0 ) {
+              GDEBUG("Failed to set continuation\n");
+              header->release();
+              return Gadget::close(flags);
+            }
+          }
+
+          if( this->next()->putq(header) < 0 ) {
+            GDEBUG("Failed to put registered image on queue\n");
+            header->release();
+            return Gadget::close(flags);
+          }
+        }
+      }
+    
+      return Gadget::close(flags);
+    }
+
+    virtual int setup_solver() = 0;
+    virtual int set_continuation( GadgetContainerMessage<ISMRMRD::ImageHeader>* m1, ARRAY_TYPE *continuation ) = 0;
+
+  protected:
+    opticalFlowSolver<ARRAY_TYPE,D> *of_solver_;
+    typename ARRAY_TYPE::element_type alpha_;
+    typename ARRAY_TYPE::element_type beta_;
+    typename ARRAY_TYPE::element_type limit_;
+    bool output_convergence_;
+    unsigned int num_multires_levels_;
+    unsigned int max_iterations_per_level_;
+
+  private:
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > phase_images_;
+    std::vector<size_t> image_dimensions_;
+    unsigned short number_of_phases_;    
+  };
+}
+
+#endif //RegistrationAveragingGadget_H
diff --git a/gadgets/moco/RegistrationScatteringGadget.h b/gadgets/moco/RegistrationScatteringGadget.h
new file mode 100644
index 0000000..bc18ac9
--- /dev/null
+++ b/gadgets/moco/RegistrationScatteringGadget.h
@@ -0,0 +1,375 @@
+#ifndef RegistrationScatteringGadget_H
+#define RegistrationScatteringGadget_H
+
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+#include "complext.h"
+#include "PhysioInterpolationGadget.h"
+#include "GadgetStreamInterface.h"
+#include "GadgetronTimer.h"
+#include "gadgetron_moco_export.h"
+#include "hoNDArray_fileio.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+#include <boost/shared_ptr.hpp>
+#include <boost/shared_array.hpp>
+
+namespace Gadgetron{  
+
+  template<class ARRAY_TYPE, unsigned int D> class opticalFlowSolver;
+  
+  /**
+     This is an abstract gadget class and consequently should not be included in any xml configuration file.
+     Use instead the gpuRegistrationScatteringGadget.
+  */
+  template<class ARRAY_TYPE, unsigned int D> class EXPORTGADGETS_MOCO RegistrationScatteringGadget 
+    : public Gadget2<ISMRMRD::ImageHeader, hoNDArray< typename ARRAY_TYPE::element_type > > // see note below
+  {
+    //
+    // We use hoNDArray to interface with the gadget chain, even if ARRAY_TYPE is a cuNDArray.
+    // Instead of hard coding the interface to use single precision (float),
+    // "typename ARRAY_TYPE::element_type" could in principle denote a double precision type (double) as well.
+    // Registration of complex images is, however, not currently supported.
+    //
+    
+  public:
+    
+    RegistrationScatteringGadget() {
+      this->of_solver_ = 0x0;
+      this->number_of_phases_ = 0; // This is a property queried from the PhysioInterpolationGadget
+      this->set_parameter(std::string("alpha").c_str(), "0.05");
+      this->set_parameter(std::string("beta").c_str(), "1.0");
+      this->set_parameter(std::string("limit").c_str(), "0.01");
+      this->set_parameter(std::string("num_multiresolution_levels").c_str(), "3");
+      this->set_parameter(std::string("max_iterations_per_level").c_str(), "500");    
+      this->set_parameter(std::string("output_convergence").c_str(), "false");
+    }
+
+    virtual ~RegistrationScatteringGadget() {
+      if( this->of_solver_ ) delete this->of_solver_;
+    }
+
+  protected:
+
+    virtual int process_config(ACE_Message_Block *mb)
+    {
+      this->alpha_ = (typename ARRAY_TYPE::element_type)this->get_double_value("alpha");
+      this->beta_  = (typename ARRAY_TYPE::element_type)this->get_double_value("beta");
+      this->limit_ = (typename ARRAY_TYPE::element_type)this->get_double_value("limit");
+      this->output_convergence_ = this->get_bool_value(std::string("output_convergence").c_str());
+      this->num_multires_levels_ = this->get_int_value(std::string("num_multiresolution_levels").c_str());
+      this->max_iterations_per_level_ = this->get_int_value(std::string("max_iterations_per_level").c_str());
+      
+      // For now we require the existence of a gadget named "PhysioInterpolationGadget" upstream,
+      // to determine the number of incoming phases.
+      //
+      
+      GadgetStreamInterface *controller = this->get_controller();
+    
+      if( controller == 0x0 ){
+        GDEBUG("Failed to get controller\n");
+        return GADGET_FAIL;
+      }
+      
+      PhysioInterpolationGadget *physio = 
+        dynamic_cast<PhysioInterpolationGadget*>( controller->find_gadget(std::string("PhysioInterpolationGadget")) );
+      
+      if( physio == 0x0 ){
+        GDEBUG("Could not find (or cast) PhysioInterpolationGadget in gadget stream\n");
+        return GADGET_FAIL;
+      }
+      
+      this->number_of_phases_ = physio->get_number_of_phases();      
+      
+      GDEBUG("Configured for %d phases\n", this->number_of_phases_); 
+      return GADGET_OK;
+    }
+    
+    virtual int process( GadgetContainerMessage< ISMRMRD::ImageHeader > *m1,
+                         GadgetContainerMessage< hoNDArray< typename ARRAY_TYPE::element_type > > *m2 )
+    {
+
+      //GDEBUG("\nSERIES: %d, PHASE: %d", m1->getObjectPtr()->image_series_index, m1->getObjectPtr()->phase );
+
+      // If this image header corresponds to series 0, it is not part of the sorted phases.
+      // Just pass those images along...
+      //
+
+      if( m1->getObjectPtr()->image_series_index < 9 ){
+        return this->next()->putq(m1);
+      }
+      
+      // At first pass allocate the image buffer array.
+      //
+      
+      if( this->phase_images_.get() == 0x0 ){
+      
+        this->image_dimensions_ = *m2->getObjectPtr()->get_dimensions();
+        this->phase_images_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >
+          (new ACE_Message_Queue<ACE_MT_SYNCH>[this->number_of_phases_]);      
+        
+        size_t bsize = sizeof(GadgetContainerMessage<ISMRMRD::ImageHeader>)*100*this->number_of_phases_;
+        
+        for( unsigned int i=0; i<this->number_of_phases_; i++ ){
+          this->phase_images_[i].high_water_mark(bsize);
+          this->phase_images_[i].low_water_mark(bsize);      
+        }
+        
+        // Setup the optical flow solver
+        //
+        
+        if( this->setup_solver() != GADGET_OK ){
+          GDEBUG("Failed to set up optical flow solver\n");
+          return GADGET_FAIL;
+        }
+      }
+      
+      //
+      // Put the incoming images on the appropriate queue (based on the phase index).
+      // 
+      
+      unsigned int phase = m1->getObjectPtr()->phase;
+      
+      if( this->phase_images_[phase].enqueue_tail(m1) < 0 ) {
+        GDEBUG("Failed to add image to buffer\n");
+        return GADGET_FAIL;
+      }
+      
+      return GADGET_OK;
+    }
+    
+    // All the work is done here in the close method
+    //
+    virtual int close(unsigned long flags)
+    {
+      if( this->phase_images_.get() ){
+      
+        GDEBUG("RegistrationScatteringGadget::close (performing registration and scattering images)\n");
+      
+        // Make sure we have the same number of images on all phase queues
+        // (It doesn't really matter, but if that is not the case, something probably went wrong upstream)
+        //
+
+        unsigned int num_images = this->phase_images_[0].message_count();
+
+        GDEBUG("Number of images for phase 0: %d", num_images );
+
+        for( unsigned int phase = 0; phase< this->number_of_phases_; phase++ ){
+
+          unsigned int num_images_phase = this->phase_images_[phase].message_count();
+          GDEBUG("Number of images for phase %d: %d", phase, num_images_phase );
+
+          if( num_images != num_images_phase ){
+            GDEBUG("Failed to set up registration, a different number of images received for each phase\n");
+            return Gadget::close(flags);
+          }
+        }
+      
+        if( num_images == 0 ){
+          GDEBUG("No images to register\n");
+          return Gadget::close(flags);
+        }
+
+        // These are the dimensions of the vector field written out
+        // - just a plain 'write_nd_array' below for now...
+        //
+
+        std::vector<size_t> reg_dims = this->image_dimensions_; // x,y
+        reg_dims.push_back(num_images-1); // this many registrations 
+        reg_dims.push_back(2); // 2d flow vectors
+        reg_dims.push_back(this->number_of_phases_);
+        ARRAY_TYPE reg_field(&reg_dims);
+        unsigned int num_reg_elements_phase = reg_dims[0]*reg_dims[1]*reg_dims[2]*reg_dims[3];
+
+        for( unsigned int phase=0; phase < this->number_of_phases_; phase++ ){
+	
+          unsigned int num_image_elements = this->image_dimensions_[0]*image_dimensions_[1];
+          std::vector<size_t> fixed_dims = this->image_dimensions_;
+          fixed_dims.push_back(num_images-1);
+	
+          std::vector< GadgetContainerMessage<ISMRMRD::ImageHeader>*> headers;
+
+          ARRAY_TYPE fixed_image(&fixed_dims);
+          ARRAY_TYPE moving_image;
+	
+          for( unsigned int image=0; image<num_images; image++ ){
+	  
+            ACE_Message_Block *mbq;
+	  
+            if( this->phase_images_[phase].dequeue_head(mbq) < 0 ) {
+              GDEBUG("Image header dequeue failed\n");
+              return Gadget::close(flags);
+            }
+	  
+            GadgetContainerMessage<ISMRMRD::ImageHeader> *m1 = AsContainerMessage<ISMRMRD::ImageHeader>(mbq);
+	  
+            if( m1 == 0x0 ) {
+              GDEBUG("Unexpected image type on queue\n");
+              return Gadget::close(flags);
+            }
+	  
+            GadgetContainerMessage< hoNDArray<typename ARRAY_TYPE::element_type> > *m2 = 
+              AsContainerMessage< hoNDArray<typename ARRAY_TYPE::element_type> >(m1->cont());
+	  
+            if( m2 == 0x0 ) {
+              GDEBUG("Unexpected continuation on queue\n");
+              m1->release();
+              return Gadget::close(flags);
+            }
+	  
+            if( image == 0 ){
+
+              // Setup the moving image.
+              // If ARRAY_TYPE is a cuNDArray the following assignment uploads the array to the device;
+              // for an 'hoNDArray' it merely copies the array.
+              //
+
+              moving_image = *m2->getObjectPtr();
+              headers.push_back(m1);
+            }
+            else{
+
+              // Assign this image as the 'image-1'th frame in the moving image
+              //
+
+              ARRAY_TYPE tmp_fixed(&image_dimensions_, fixed_image.get_data_ptr()+(image-1)*num_image_elements);
+              tmp_fixed = *m2->getObjectPtr(); // Copy as for the moving image
+              headers.push_back(m1);
+
+              // The continuation will be a new array (set after registration).
+              // No registration is performed, however, if we received only one image;
+              // in that case we keep the current continuation.
+              //
+
+              if( num_images > 1 ){
+                m1->cont(0x0);
+                m2->release();
+              }             
+            }
+          }
+	
+          if( num_images > 1 ){
+	  
+            // Perform registration for the current phase
+            //
+	  
+            boost::shared_ptr<ARRAY_TYPE> deformations;
+            {
+              GadgetronTimer timer("Running registration");
+              deformations = this->of_solver_->solve( &fixed_image, &moving_image );
+            }
+
+            // Copy displacement field to vector field array
+            //
+            
+            {              
+              std::vector<size_t> phase_reg_dims = reg_dims; phase_reg_dims.pop_back();
+              ARRAY_TYPE tmp_in( &phase_reg_dims, deformations->get_data_ptr() ); // the vector field has an extra dimension for CK (to be discarded)
+              ARRAY_TYPE tmp_out( &phase_reg_dims, reg_field.get_data_ptr()+phase*num_reg_elements_phase );
+              tmp_out = tmp_in;
+            }
+
+            // Deform moving images based on the registration
+            //
+	  
+            boost::shared_ptr<ARRAY_TYPE> deformed_moving;
+            {
+              GadgetronTimer timer("Applying deformation");
+              deformed_moving = this->of_solver_->deform( &moving_image, deformations );
+            }
+	  
+            /*{
+            // The debug code below only compiles for cuNDArrays.
+            // To use (temporarily) comment out
+            // list(APPEND CPU_GADGETS cpuRegistrationScatteringGadget.cpp)
+            // in the CMakeLists.txt
+            //
+            char filename[256];
+            sprintf((char*)filename, "fixed_%d.real", phase);
+            write_nd_array<float>( fixed_image.to_host().get(), filename );
+            sprintf((char*)filename, "moving_%d.real", phase);
+            write_nd_array<float>( moving_image.to_host().get(), filename );
+            sprintf((char*)filename, "deformed_moving_%d.real", phase);
+            write_nd_array<float>( deformed_moving->to_host().get(), filename );
+            sprintf((char*)filename, "deformation_%d.real", phase);
+            write_nd_array<float>( deformations->to_host().get(), filename );
+            } */
+
+
+            // Pass along the deformed moving images
+            //	  
+	  
+            for( unsigned int i=0; i<headers.size(); i++ ){
+              
+              if( i==0 ){
+                GDEBUG("Putting image %d on queue\n", i);
+                
+                if( this->next()->putq(headers[i]) < 0 ) {
+                  GDEBUG("Failed to put registered image on queue\n");
+                  headers[i]->release();
+                  return Gadget::close(flags);
+                }
+              }
+              else{                
+                std::vector<size_t> moving_dims = *moving_image.get_dimensions();
+                cuNDArray<float> subimage( &moving_dims, deformed_moving->get_data_ptr()+(i-1)*num_image_elements);
+                
+                if( set_continuation( headers[i], &subimage ) < 0 ) {
+                  GDEBUG("Failed to set continuation\n");
+                  headers[i]->release();
+                  return Gadget::close(flags);
+                }
+                
+                GDEBUG("Putting image %d on queue\n", i);
+                
+                if( this->next()->putq(headers[i]) < 0 ) {
+                  GDEBUG("Failed to put registered image on queue\n");
+                  headers[i]->release();
+                  return Gadget::close(flags);
+                }
+              }
+            }
+          }
+        }
+        
+        // Write out the result after permutation to the data order
+        // - to be better suited for a subsequent reconstruction pass
+        //
+        
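+        // The permutation below reorders the field from (x, y, registrations, vector components, phases)
+        // to (x, y, phases, registrations, vector components).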
+        std::vector<size_t> order;
+        order.push_back(0); 
+        order.push_back(1);
+        order.push_back(4);
+        order.push_back(2);
+        order.push_back(3);
+        
+        GDEBUG("Writing out displacement field with dimensions: %d %d %d %d %d\n", order[0], order[1], order[2], order[3], order[4]);
+        write_displacement_field( permute(&reg_field, &order).get() );
+      }
+      
+      return Gadget::close(flags);
+    }
+    
+    virtual int setup_solver() = 0;
+    virtual int set_continuation( GadgetContainerMessage<ISMRMRD::ImageHeader>* m1, ARRAY_TYPE *continuation ) = 0;
+    virtual int write_displacement_field( ARRAY_TYPE *vec_field ) = 0;
+    
+  protected:
+    opticalFlowSolver<ARRAY_TYPE,D> *of_solver_;
+    typename ARRAY_TYPE::element_type alpha_;
+    typename ARRAY_TYPE::element_type beta_;
+    typename ARRAY_TYPE::element_type limit_;
+    bool output_convergence_;
+    unsigned int num_multires_levels_;
+    unsigned int max_iterations_per_level_;
+    
+  private:
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > phase_images_;
+    std::vector<size_t> image_dimensions_;
+    unsigned short number_of_phases_;    
+  };
+}
+
+#endif //RegistrationScatteringGadget_H
diff --git a/gadgets/moco/config/CMakeLists.txt b/gadgets/moco/config/CMakeLists.txt
new file mode 100644
index 0000000..7c18f7c
--- /dev/null
+++ b/gadgets/moco/config/CMakeLists.txt
@@ -0,0 +1,13 @@
+if(CUDA_FOUND)
+  if(CPU_REG)
+    install (FILES 
+      cpureg_cartesian_averaging.xml
+      DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH} COMPONENT main)
+  endif(CPU_REG)
+  
+  if(GPU_REG)
+    install (FILES 
+      gpureg_cartesian_averaging.xml
+      DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH} COMPONENT main)
+  endif(GPU_REG)
+endif(CUDA_FOUND)
diff --git a/gadgets/moco/config/cpureg_cartesian_averaging.xml b/gadgets/moco/config/cpureg_cartesian_averaging.xml
new file mode 100644
index 0000000..f997916
--- /dev/null
+++ b/gadgets/moco/config/cpureg_cartesian_averaging.xml
@@ -0,0 +1,120 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1022</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriter</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>CartesianToGenericGadget</name>
+    <dll>gadgetron_cartesian</dll>
+    <classname>CartesianToGenericGadget</classname>
+    <!-- Property 'matrix_size_as_a_multiple_of' is required for the gpu nfft 
+	 to enforce the matrix size to be a multiple of the gpu warp size (32) -->
+    <property><name>matrix_size_as_a_multiple_of</name><value>32</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuGenericSensePrepGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuGenericSensePrepGadget</classname>
+    <property><name>deviceno</name><value>0</value></property>
+    <property><name>rotations_per_reconstruction</name><value>8</value></property>
+    <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+    <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>0</value></property>
+    <property><name>number_of_iterations</name>    <value>20</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.05</value></property>
+    <property><name>output_convergence</name><value>true</value></property>
+  </gadget>
+
+  <gadget>
+    <name>PhysioInterpolationGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PhysioInterpolationGadget</classname>
+    <property><name>mode</name><value>0</value></property>
+  </gadget>
+
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>cpuRegistrationAveragingGadget2D</name>
+    <dll>gadgetron_moco</dll>
+    <classname>cpuRegistrationAveragingGadget2D</classname>
+    <property><name>alpha</name><value>0.05</value></property>
+    <property><name>beta</name><value>1.0</value></property>
+    <property><name>num_multiresolution_levels</name><value>3</value></property>
+    <property><name>output_convergence</name><value>true</value></property>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageFinishFloat</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+      </gadget>    
+  -->
+
+  <gadget>
+    <name>AutoScale</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>ImageFinish</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadget</classname>
+  </gadget>
+  
+</gadgetronStreamConfiguration>
diff --git a/gadgets/moco/config/gpureg_cartesian_averaging.xml b/gadgets/moco/config/gpureg_cartesian_averaging.xml
new file mode 100644
index 0000000..b81f2c8
--- /dev/null
+++ b/gadgets/moco/config/gpureg_cartesian_averaging.xml
@@ -0,0 +1,120 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1022</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriter</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>CartesianToGenericGadget</name>
+    <dll>gadgetron_cartesian</dll>
+    <classname>CartesianToGenericGadget</classname>
+    <!-- Property 'matrix_size_as_a_multiple_of' is required for the gpu nfft 
+	 to enforce the matrix size to be a multiple of the gpu warp size (32) -->
+    <property><name>matrix_size_as_a_multiple_of</name><value>32</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuGenericSensePrepGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuGenericSensePrepGadget</classname>
+    <property><name>deviceno</name><value>0</value></property>
+    <property><name>rotations_per_reconstruction</name><value>8</value></property>
+    <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+    <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>0</value></property>
+    <property><name>number_of_iterations</name>    <value>20</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.05</value></property>
+    <property><name>output_convergence</name><value>true</value></property>
+  </gadget>
+
+  <gadget>
+    <name>PhysioInterpolationGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PhysioInterpolationGadget</classname>
+    <property><name>mode</name><value>0</value></property>
+  </gadget>
+
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>gpuRegistrationAveragingGadget2D</name>
+    <dll>gadgetron_moco</dll>
+    <classname>gpuRegistrationAveragingGadget2D</classname>
+    <property><name>alpha</name><value>0.05</value></property>
+    <property><name>beta</name><value>1.0</value></property>
+    <property><name>num_multiresolution_levels</name><value>3</value></property>
+    <property><name>output_convergence</name><value>true</value></property>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageFinishFloat</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+      </gadget>    
+  -->
+  
+  <gadget>
+    <name>AutoScale</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>ImageFinish</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadget</classname>
+  </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/moco/cpuRegistrationAveragingGadget.cpp b/gadgets/moco/cpuRegistrationAveragingGadget.cpp
new file mode 100644
index 0000000..e3eed47
--- /dev/null
+++ b/gadgets/moco/cpuRegistrationAveragingGadget.cpp
@@ -0,0 +1,44 @@
+#include "cpuRegistrationAveragingGadget.h"
+#include "hoLinearResampleOperator.h"
+#include "hoCKOpticalFlowSolver.h"
+
+namespace Gadgetron{
+
+  int cpuRegistrationAveragingGadget2D::setup_solver()
+  {
+    // Allocate solver
+    hoCKOpticalFlowSolver<float,2> *solver = new hoCKOpticalFlowSolver<float,2>();
+    this->of_solver_ = solver;
+
+    // Use bilinear resampling for interpolation
+    solver->set_interpolator( boost::shared_ptr< hoLinearResampleOperator<float,2> >(new hoLinearResampleOperator<float,2>()) );
+    
+    // Configurable settings from the xml properties
+    //
+    
+    if( this->output_convergence_ )
+      solver->set_output_mode( hoCKOpticalFlowSolver<float,2>::OUTPUT_VERBOSE );
+    else
+      solver->set_output_mode( hoCKOpticalFlowSolver<float,2>::OUTPUT_SILENT );
+    
+    solver->set_num_multires_levels(this->num_multires_levels_);
+    solver->set_max_num_iterations_per_level(this->max_iterations_per_level_);
+    solver->set_alpha(this->alpha_);
+    solver->set_beta(this->beta_);
+    solver->set_limit(this->limit_);
+
+    return GADGET_OK;
+  }
+
+  int cpuRegistrationAveragingGadget2D::set_continuation
+  ( GadgetContainerMessage<ISMRMRD::ImageHeader>* m1, hoNDArray<float> *continuation )
+  {
+    GadgetContainerMessage< hoNDArray<float> > *m2 = new GadgetContainerMessage< hoNDArray<float> >();      
+    *m2->getObjectPtr() = *continuation;
+    m1->cont(m2);
+
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(cpuRegistrationAveragingGadget2D)
+}
diff --git a/gadgets/moco/cpuRegistrationAveragingGadget.h b/gadgets/moco/cpuRegistrationAveragingGadget.h
new file mode 100644
index 0000000..162acac
--- /dev/null
+++ b/gadgets/moco/cpuRegistrationAveragingGadget.h
@@ -0,0 +1,25 @@
+#ifndef cpuRegistrationAveragingGadget_H
+#define cpuRegistrationAveragingGadget_H
+
+#include "hoNDArray_math.h"
+#include "hoNDArray_utils.h"
+#include "hoCKOpticalFlowSolver.h"
+#include "RegistrationAveragingGadget.h"
+
+namespace Gadgetron{  
+
+  class EXPORTGADGETS_MOCO cpuRegistrationAveragingGadget2D :
+    public RegistrationAveragingGadget< hoNDArray<float>, 2 >
+  {    
+  public:
+    GADGET_DECLARE(cpuRegistrationAveragingGadget2D);
+    cpuRegistrationAveragingGadget2D() : RegistrationAveragingGadget< hoNDArray<float>, 2 >() {}
+    virtual ~cpuRegistrationAveragingGadget2D() {}
+
+  protected:
+    virtual int setup_solver();
+    virtual int set_continuation( GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, hoNDArray<float> *continuation );
+  };
+}
+
+#endif //cpuRegistrationAveragingGadget_H
diff --git a/gadgets/moco/gadgetron_moco_export.h b/gadgets/moco/gadgetron_moco_export.h
new file mode 100644
index 0000000..95bc7c5
--- /dev/null
+++ b/gadgets/moco/gadgetron_moco_export.h
@@ -0,0 +1,14 @@
+#ifndef GADGETRON_MOCO_EXPORT_H_
+#define GADGETRON_MOCO_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_MOCO__)
+#define EXPORTGADGETS_MOCO __declspec(dllexport)
+#else
+#define EXPORTGADGETS_MOCO __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETS_MOCO
+#endif
+
+#endif /* GADGETRON_MOCO_EXPORT_H_ */
diff --git a/gadgets/moco/gpuRegistrationAveragingGadget.cpp b/gadgets/moco/gpuRegistrationAveragingGadget.cpp
new file mode 100644
index 0000000..4c3c8e8
--- /dev/null
+++ b/gadgets/moco/gpuRegistrationAveragingGadget.cpp
@@ -0,0 +1,50 @@
+#include "gpuRegistrationAveragingGadget.h"
+#include "cuLinearResampleOperator.h"
+#include "cuCKOpticalFlowSolver.h"
+
+namespace Gadgetron{
+
+  int gpuRegistrationAveragingGadget2D::setup_solver()
+  {
+    // Allocate solver
+    cuCKOpticalFlowSolver<float,2> *solver = new cuCKOpticalFlowSolver<float,2>();
+    this->of_solver_ = solver;
+
+    // Use bilinear resampling for interpolation
+    solver->set_interpolator( boost::shared_ptr< cuLinearResampleOperator<float,2> >(new cuLinearResampleOperator<float,2>()) );
+    
+    // Configurable settings from the xml properties
+    //
+    
+    if( this->output_convergence_ )
+      solver->set_output_mode( cuCKOpticalFlowSolver<float,2>::OUTPUT_VERBOSE );
+    else
+      solver->set_output_mode( cuCKOpticalFlowSolver<float,2>::OUTPUT_SILENT );
+    
+    solver->set_num_multires_levels(this->num_multires_levels_);
+    solver->set_max_num_iterations_per_level(this->max_iterations_per_level_);
+    solver->set_alpha(this->alpha_);
+    solver->set_beta(this->beta_);
+    solver->set_limit(this->limit_);
+
+    return GADGET_OK;
+  }
+
+  int gpuRegistrationAveragingGadget2D::set_continuation
+  ( GadgetContainerMessage<ISMRMRD::ImageHeader>* m1, cuNDArray<float> *continuation )
+  {
+    GadgetContainerMessage< hoNDArray<float> > *m2 = new GadgetContainerMessage< hoNDArray<float> >();      
+    m2->getObjectPtr()->create(continuation->get_dimensions());
+    
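+    // Copy the result from device memory into the newly created host array;
+    // the host array is attached below as the image continuation for downstream (CPU-based) gadgets.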
+    if( cudaMemcpy( m2->getObjectPtr()->get_data_ptr(), continuation->get_data_ptr(), 
+		    continuation->get_number_of_elements()*sizeof(float), cudaMemcpyDeviceToHost) != cudaSuccess) {
+      throw cuda_error("gpuRegistrationAveragingGadget::set_continuation(): failed to copy memory from device");
+    }
+
+    m1->cont(m2);
+
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(gpuRegistrationAveragingGadget2D)
+}
diff --git a/gadgets/moco/gpuRegistrationAveragingGadget.h b/gadgets/moco/gpuRegistrationAveragingGadget.h
new file mode 100644
index 0000000..6f50b35
--- /dev/null
+++ b/gadgets/moco/gpuRegistrationAveragingGadget.h
@@ -0,0 +1,26 @@
+#ifndef gpuRegistrationAveragingGadget_H
+#define gpuRegistrationAveragingGadget_H
+
+#include "cuNDArray_operators.h"
+#include "cuNDArray_utils.h"
+#include "cuCKOpticalFlowSolver.h"
+#include "RegistrationAveragingGadget.h"
+
+namespace Gadgetron{  
+
+  class EXPORTGADGETS_MOCO gpuRegistrationAveragingGadget2D :
+    public RegistrationAveragingGadget< cuNDArray<float>, 2 >
+  {    
+  public:
+    GADGET_DECLARE(gpuRegistrationAveragingGadget2D);
+
+    gpuRegistrationAveragingGadget2D() : RegistrationAveragingGadget< cuNDArray<float>, 2 >() {}
+    virtual ~gpuRegistrationAveragingGadget2D() {}
+
+  protected:
+    virtual int setup_solver();
+    virtual int set_continuation( GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, cuNDArray<float> *continuation );
+  };
+}
+
+#endif //gpuRegistrationAveragingGadget_H
diff --git a/gadgets/moco/gpuRegistrationScatteringGadget.cpp b/gadgets/moco/gpuRegistrationScatteringGadget.cpp
new file mode 100644
index 0000000..eeadc0d
--- /dev/null
+++ b/gadgets/moco/gpuRegistrationScatteringGadget.cpp
@@ -0,0 +1,57 @@
+#include "gpuRegistrationScatteringGadget.h"
+#include "cuLinearResampleOperator.h"
+#include "cuCKOpticalFlowSolver.h"
+#include "hoNDArray_fileio.h"
+
+namespace Gadgetron{
+
+  int gpuRegistrationScatteringGadget2D::setup_solver()
+  {
+    // Allocate solver
+    cuCKOpticalFlowSolver<float,2> *solver = new cuCKOpticalFlowSolver<float,2>();
+    this->of_solver_ = solver;
+
+    // Use bilinear resampling for interpolation
+    solver->set_interpolator( boost::shared_ptr< cuLinearResampleOperator<float,2> >(new cuLinearResampleOperator<float,2>()) );
+    
+    // Configurable settings from the xml properties
+    //
+    
+    if( this->output_convergence_ )
+      solver->set_output_mode( cuCKOpticalFlowSolver<float,2>::OUTPUT_VERBOSE );
+    else
+      solver->set_output_mode( cuCKOpticalFlowSolver<float,2>::OUTPUT_SILENT );
+    
+    solver->set_num_multires_levels(this->num_multires_levels_);
+    solver->set_max_num_iterations_per_level(this->max_iterations_per_level_);
+    solver->set_alpha(this->alpha_);
+    solver->set_beta(this->beta_);
+    solver->set_limit(this->limit_);
+
+    return GADGET_OK;
+  }
+
+  int gpuRegistrationScatteringGadget2D::set_continuation
+  ( GadgetContainerMessage<ISMRMRD::ImageHeader>* m1, cuNDArray<float> *continuation )
+  {
+    GadgetContainerMessage< hoNDArray<float> > *m2 = new GadgetContainerMessage< hoNDArray<float> >();      
+    m2->getObjectPtr()->create(continuation->get_dimensions());
+    
+    if( cudaMemcpy( m2->getObjectPtr()->get_data_ptr(), continuation->get_data_ptr(), 
+		    continuation->get_number_of_elements()*sizeof(float), cudaMemcpyDeviceToHost) != cudaSuccess) {
+      throw cuda_error("gpuRegistrationScatteringGadget::set_continuation(): failed to copy memory from device");
+    }
+
+    m1->cont(m2);
+
+    return GADGET_OK;
+  }
+  
+  int gpuRegistrationScatteringGadget2D::write_displacement_field( cuNDArray<float> *displacements )
+  {
+    write_nd_array<float>(displacements->to_host().get(), "displacement_field_from_scattering_gadget.real");
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(gpuRegistrationScatteringGadget2D)
+}
diff --git a/gadgets/moco/gpuRegistrationScatteringGadget.h b/gadgets/moco/gpuRegistrationScatteringGadget.h
new file mode 100644
index 0000000..4c0f388
--- /dev/null
+++ b/gadgets/moco/gpuRegistrationScatteringGadget.h
@@ -0,0 +1,26 @@
+#ifndef gpuRegistrationScatteringGadget_H
+#define gpuRegistrationScatteringGadget_H
+
+#include "cuNDArray_operators.h"
+#include "cuNDArray_utils.h"
+#include "cuCKOpticalFlowSolver.h"
+#include "RegistrationScatteringGadget.h"
+
+namespace Gadgetron{  
+
+  class EXPORTGADGETS_MOCO gpuRegistrationScatteringGadget2D :
+    public RegistrationScatteringGadget< cuNDArray<float>, 2 >
+  {    
+  public:
+    GADGET_DECLARE(gpuRegistrationScatteringGadget2D);
+    gpuRegistrationScatteringGadget2D() : RegistrationScatteringGadget< cuNDArray<float>, 2 >() {}
+    virtual ~gpuRegistrationScatteringGadget2D() {}
+
+  protected:
+    virtual int setup_solver();
+    virtual int set_continuation( GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, cuNDArray<float> *continuation );
+    virtual int write_displacement_field( cuNDArray<float> *displacements );
+  };
+}
+
+#endif //gpuRegistrationScatteringGadget_H
diff --git a/gadgets/mri_core/AccumulatorGadget.cpp b/gadgets/mri_core/AccumulatorGadget.cpp
new file mode 100644
index 0000000..32f303e
--- /dev/null
+++ b/gadgets/mri_core/AccumulatorGadget.cpp
@@ -0,0 +1,187 @@
+#include "AccumulatorGadget.h"
+#include "ismrmrd/xml.h"
+
+namespace Gadgetron{
+AccumulatorGadget::AccumulatorGadget()
+  :buffer_(0)
+  , image_counter_(0)
+  , image_series_(0)
+{
+
+}
+ 
+AccumulatorGadget::~AccumulatorGadget()
+{
+  if (buffer_) delete buffer_;
+}
+
+/**
+ *   Expects ISMRMRD XML configuration
+ *
+ */
+int AccumulatorGadget::process_config(ACE_Message_Block* mb)
+{
+  ISMRMRD::IsmrmrdHeader h;
+  ISMRMRD::deserialize(mb->rd_ptr(),h);
+
+  if (h.encoding.size() != 1) {
+    GDEBUG("Number of encoding spaces: %d\n", h.encoding.size());
+    GDEBUG("This simple AccumulatorGadget only supports one encoding space\n");
+    return GADGET_FAIL;
+  }
+
+
+  ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+  ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+  ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+  
+  GDEBUG("Matrix size: %d, %d, %d\n", r_space.matrixSize.x, e_space.matrixSize.y, e_space.matrixSize.z);
+  dimensions_.push_back(r_space.matrixSize.x);
+  dimensions_.push_back(e_space.matrixSize.y);
+  dimensions_.push_back(e_space.matrixSize.z);
+  
+  field_of_view_.push_back(r_space.fieldOfView_mm.x);
+  field_of_view_.push_back(e_space.fieldOfView_mm.y);
+  field_of_view_.push_back(e_space.fieldOfView_mm.z);
+  GDEBUG("FOV: %f, %f, %f\n", r_space.fieldOfView_mm.x, e_space.fieldOfView_mm.y, e_space.fieldOfView_mm.z);
+  
+  slices_ = e_limits.slice? e_limits.slice->maximum+1 : 1;
+
+  return GADGET_OK;
+}
+
+int AccumulatorGadget::
+process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+	GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+
+  if (!buffer_) {
+	  dimensions_.push_back(m1->getObjectPtr()->active_channels);
+	  dimensions_.push_back(slices_);
+
+	  if (!(buffer_ = new hoNDArray< std::complex<float> >())) {
+		  GDEBUG("Failed to create buffer\n");
+		  return GADGET_FAIL;
+	  }
+
+	  try {buffer_->create(&dimensions_);}
+	  catch (std::runtime_error &err){
+		  GEXCEPTION(err,"Failed to allocate buffer array\n");
+		  return GADGET_FAIL;
+	  }
+
+	  image_series_ = image_series.value();
+
+  }
+
+
+  std::complex<float>* b =
+		  buffer_->get_data_ptr();
+
+  std::complex<float>* d =
+		  m2->getObjectPtr()->get_data_ptr();
+
+  int samples =  m1->getObjectPtr()->number_of_samples;
+  int line = m1->getObjectPtr()->idx.kspace_encode_step_1;
+  int partition = m1->getObjectPtr()->idx.kspace_encode_step_2;
+  int slice = m1->getObjectPtr()->idx.slice;
+
+  if (samples > static_cast<int>(dimensions_[0])) {
+	  GDEBUG("Wrong number of samples received\n");
+	  return GADGET_FAIL;
+  }
+
+  size_t offset= 0;
+  //Copy the data for all the channels
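+  //buffer_ is laid out as [readout, E1, E2, channels, slices]; the readout is positioned so that
+  //center_sample lands at the middle column (dimensions_[0]>>1) of the buffer.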
+  for (int c = 0; c < m1->getObjectPtr()->active_channels; c++) {
+    offset = 
+      slice*dimensions_[0]*dimensions_[1]*dimensions_[2]*dimensions_[3] +
+      c*dimensions_[0]*dimensions_[1]*dimensions_[2] +
+      partition*dimensions_[0]*dimensions_[1] +
+      line*dimensions_[0] + (dimensions_[0]>>1)-m1->getObjectPtr()->center_sample;
+    
+    memcpy(b+offset,
+    	d+c*samples,
+    	sizeof(std::complex<float>)*samples);
+  }
+  
+  bool is_last_scan_in_slice = m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_LAST_IN_SLICE);
+  
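+  //When the last line of a slice has arrived, copy the accumulated k-space for that slice out of
+  //the buffer and send it downstream as a single ImageHeader + data message.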
+  if (is_last_scan_in_slice) {
+    GadgetContainerMessage<ISMRMRD::ImageHeader>* cm1 = 
+      new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+
+    // On some platforms, it is necessary to initialize the image header
+    memset(cm1->getObjectPtr(),0,sizeof(ISMRMRD::ImageHeader));
+    
+    cm1->getObjectPtr()->clearAllFlags();
+
+    GadgetContainerMessage< hoNDArray< std::complex<float> > >* cm2 = 
+      new GadgetContainerMessage<hoNDArray< std::complex<float> > >();
+    
+    cm1->cont(cm2);
+    
+    std::vector<size_t> img_dims(4);
+    img_dims[0] = dimensions_[0];
+    img_dims[1] = dimensions_[1];
+    img_dims[2] = dimensions_[2];
+    img_dims[3] = dimensions_[3];
+    
+    try{cm2->getObjectPtr()->create(&img_dims);}
+    catch (std::runtime_error &err){
+      GEXCEPTION(err,"Unable to allocate new image array\n");
+      cm1->release();
+      return -1;
+    }
+    
+    size_t data_length = dimensions_[0]*dimensions_[1]*
+    		dimensions_[2]*dimensions_[3];
+    
+    offset = slice*data_length;
+    
+    memcpy(cm2->getObjectPtr()->get_data_ptr(),b+offset,
+	   sizeof(std::complex<float>)*data_length);
+    
+    cm1->getObjectPtr()->matrix_size[0]     = (uint16_t)img_dims[0];
+    cm1->getObjectPtr()->matrix_size[1]     = (uint16_t)img_dims[1];
+    cm1->getObjectPtr()->matrix_size[2]     = (uint16_t)img_dims[2];
+    cm1->getObjectPtr()->field_of_view[0]   = field_of_view_[0];
+    cm1->getObjectPtr()->field_of_view[1]   = field_of_view_[1];
+    cm1->getObjectPtr()->field_of_view[2]   = field_of_view_[2];
+    cm1->getObjectPtr()->channels           = (uint16_t)img_dims[3];
+    cm1->getObjectPtr()->slice   = m1->getObjectPtr()->idx.slice;
+
+    memcpy(cm1->getObjectPtr()->position,
+    		m1->getObjectPtr()->position,
+	   sizeof(float)*3);
+
+    memcpy(cm1->getObjectPtr()->read_dir,
+                m1->getObjectPtr()->read_dir,
+           sizeof(float)*3);
+
+    memcpy(cm1->getObjectPtr()->phase_dir,
+                m1->getObjectPtr()->phase_dir,
+           sizeof(float)*3);
+
+    memcpy(cm1->getObjectPtr()->slice_dir,
+                m1->getObjectPtr()->slice_dir,
+           sizeof(float)*3);
+
+    memcpy(cm1->getObjectPtr()->patient_table_position,
+    		m1->getObjectPtr()->patient_table_position, sizeof(float)*3);
+
+    cm1->getObjectPtr()->data_type = ISMRMRD::ISMRMRD_CXFLOAT;
+    cm1->getObjectPtr()->image_index = (uint16_t)(++image_counter_);
+    cm1->getObjectPtr()->image_series_index = (uint16_t)image_series_;
+
+    if (this->next()->putq(cm1) < 0) {
+    	return GADGET_FAIL;
+    }
+  } 
+
+  m1->release();
+  return GADGET_OK;
+}
+
+GADGET_FACTORY_DECLARE(AccumulatorGadget)
+}
diff --git a/gadgets/mri_core/AccumulatorGadget.h b/gadgets/mri_core/AccumulatorGadget.h
new file mode 100644
index 0000000..26f7c34
--- /dev/null
+++ b/gadgets/mri_core/AccumulatorGadget.h
@@ -0,0 +1,38 @@
+#ifndef ACCUMULATORGADGET_H
+#define ACCUMULATORGADGET_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+  
+  class EXPORTGADGETSMRICORE AccumulatorGadget : 
+  public Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+    {
+      
+    public:
+      GADGET_DECLARE(AccumulatorGadget);
+      
+      AccumulatorGadget();
+      ~AccumulatorGadget();
+      
+    protected:
+      GADGET_PROPERTY(image_series, int, "Image series", 0);
+
+      virtual int process_config(ACE_Message_Block* mb);
+      virtual int process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1,
+			  GadgetContainerMessage< hoNDArray< std::complex<float> > > * m2);
+      
+      hoNDArray< std::complex<float> >* buffer_;
+      std::vector<size_t> dimensions_;
+      std::vector<float> field_of_view_;
+      size_t slices_;
+      long long image_counter_;
+      long long image_series_;
+    };
+}
+#endif //ACCUMULATORGADGET_H
diff --git a/gadgets/mri_core/AcquisitionAccumulateTriggerGadget.cpp b/gadgets/mri_core/AcquisitionAccumulateTriggerGadget.cpp
new file mode 100644
index 0000000..2950ca6
--- /dev/null
+++ b/gadgets/mri_core/AcquisitionAccumulateTriggerGadget.cpp
@@ -0,0 +1,403 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "AcquisitionAccumulateTriggerGadget.h"
+#include "mri_core_data.h"
+#include "log.h"
+
+namespace Gadgetron{
+
+  AcquisitionAccumulateTriggerGadget::~AcquisitionAccumulateTriggerGadget()
+  {
+    //The buckets map should be empty, but just in case, let's make sure everything it holds is released.
+    for (map_type_::iterator it = buckets_.begin(); it != buckets_.end(); it++) {
+      if (it->second) {
+	it->second->release();
+      }
+    }
+  }
+
+  int AcquisitionAccumulateTriggerGadget
+  ::process_config(ACE_Message_Block* mb)
+  {
+
+    std::string trigger_dimension_local = trigger_dimension.value();
+    std::string sorting_dimension_local = sorting_dimension.value();
+    
+    if (trigger_dimension_local.size() == 0) {
+      trigger_ = NONE;
+    } else if (trigger_dimension_local.compare("kspace_encode_step_1") == 0) {
+      trigger_ = KSPACE_ENCODE_STEP_1;
+    } else if (trigger_dimension_local.compare("kspace_encode_step_2") == 0) {
+      trigger_ = KSPACE_ENCODE_STEP_2;
+    } else if (trigger_dimension_local.compare("average") == 0) {
+      trigger_ = AVERAGE;
+    } else if (trigger_dimension_local.compare("slice") == 0) {
+      trigger_ = SLICE;
+    } else if (trigger_dimension_local.compare("contrast") == 0) {
+      trigger_ = CONTRAST;
+    } else if (trigger_dimension_local.compare("phase") == 0) {
+      trigger_ = PHASE;
+    } else if (trigger_dimension_local.compare("repetition") == 0) {
+      trigger_ = REPETITION;
+    } else if (trigger_dimension_local.compare("set") == 0) {
+      trigger_ = SET;
+    } else if (trigger_dimension_local.compare("segment") == 0) {
+      trigger_ = SEGMENT;
+    } else if (trigger_dimension_local.compare("user_0") == 0) {
+      trigger_ = USER_0;
+    } else if (trigger_dimension_local.compare("user_1") == 0) {
+      trigger_ = USER_1;
+    } else if (trigger_dimension_local.compare("user_2") == 0) {
+      trigger_ = USER_2;
+    } else if (trigger_dimension_local.compare("user_3") == 0) {
+      trigger_ = USER_3;
+    } else if (trigger_dimension_local.compare("user_4") == 0) {
+      trigger_ = USER_4;
+    } else if (trigger_dimension_local.compare("user_5") == 0) {
+      trigger_ = USER_5;
+    } else if (trigger_dimension_local.compare("user_6") == 0) {
+      trigger_ = USER_6;
+    } else if (trigger_dimension_local.compare("user_7") == 0) {
+      trigger_ = USER_7;
+    } else {
+      GDEBUG("WARNING: Unknown trigger dimension (%s), trigger condition set to NONE (end of scan)", trigger_dimension_local.c_str());
+      trigger_ = NONE;
+    }
+  
+    GDEBUG("TRIGGER DIMENSION IS: %s (%d)\n", trigger_dimension_local.c_str(), trigger_);
+
+    if (sorting_dimension_local.size() == 0) {
+      sort_ = NONE;
+    } else if (sorting_dimension_local.compare("kspace_encode_step_1") == 0) {
+      sort_ = KSPACE_ENCODE_STEP_1;
+    } else if (sorting_dimension_local.compare("kspace_encode_step_2") == 0) {
+      sort_ = KSPACE_ENCODE_STEP_2;
+    } else if (sorting_dimension_local.compare("average") == 0) {
+      sort_ = AVERAGE;
+    } else if (sorting_dimension_local.compare("slice") == 0) {
+      sort_ = SLICE;
+    } else if (sorting_dimension_local.compare("contrast") == 0) {
+      sort_ = CONTRAST;
+    } else if (sorting_dimension_local.compare("phase") == 0) {
+      sort_ = PHASE;
+    } else if (sorting_dimension_local.compare("repetition") == 0) {
+      sort_ = REPETITION;
+    } else if (sorting_dimension_local.compare("set") == 0) {
+      sort_ = SET;
+    } else if (sorting_dimension_local.compare("segment") == 0) {
+      sort_ = SEGMENT;
+    } else if (sorting_dimension_local.compare("user_0") == 0) {
+      sort_ = USER_0;
+    } else if (sorting_dimension_local.compare("user_1") == 0) {
+      sort_ = USER_1;
+    } else if (sorting_dimension_local.compare("user_2") == 0) {
+      sort_ = USER_2;
+    } else if (sorting_dimension_local.compare("user_3") == 0) {
+      sort_ = USER_3;
+    } else if (sorting_dimension_local.compare("user_4") == 0) {
+      sort_ = USER_4;
+    } else if (sorting_dimension_local.compare("user_5") == 0) {
+      sort_ = USER_5;
+    } else if (sorting_dimension_local.compare("user_6") == 0) {
+      sort_ = USER_6;
+    } else if (sorting_dimension_local.compare("user_7") == 0) {
+      sort_ = USER_7;
+    } else {
+      GDEBUG("WARNING: Unknown sort dimension (%s), sorting set to NONE\n", sorting_dimension_local.c_str());
+      sort_ = NONE;
+    }
+  
+    GDEBUG("SORTING DIMENSION IS: %s (%d)\n", sorting_dimension_local.c_str(), sort_);
+
+    trigger_events_ = 0;
+
+    return GADGET_OK;
+  }
+
+  int AcquisitionAccumulateTriggerGadget
+  ::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+	    GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+  {
+
+    //Ignore noise scans
+    if (m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_IS_NOISE_MEASUREMENT)) {
+        m1->release();
+        return GADGET_OK;
+    }
+                
+    //It is enough to put the first one, since they are linked
+    unsigned short sorting_index = 0;
+    switch (sort_) {
+    case KSPACE_ENCODE_STEP_1:
+      sorting_index = m1->getObjectPtr()->idx.kspace_encode_step_1;
+      break;
+    case KSPACE_ENCODE_STEP_2:
+      sorting_index = m1->getObjectPtr()->idx.kspace_encode_step_2;
+      break;
+    case AVERAGE:
+      sorting_index = m1->getObjectPtr()->idx.average;
+      break;
+    case SLICE:
+      sorting_index = m1->getObjectPtr()->idx.slice;
+      break;
+    case CONTRAST:
+      sorting_index = m1->getObjectPtr()->idx.contrast;
+      break;
+    case PHASE:
+      sorting_index = m1->getObjectPtr()->idx.phase;
+      break;
+    case REPETITION:
+      sorting_index = m1->getObjectPtr()->idx.repetition;
+      break;
+    case SET:
+      sorting_index = m1->getObjectPtr()->idx.set;
+      break;
+    case SEGMENT:
+      sorting_index = m1->getObjectPtr()->idx.segment;	
+      break;
+    case USER_0:
+      sorting_index = m1->getObjectPtr()->idx.user[0];
+      break;
+    case USER_1:
+      sorting_index = m1->getObjectPtr()->idx.user[1];	
+      break;
+    case USER_2:
+      sorting_index = m1->getObjectPtr()->idx.user[2];
+      break;
+    case USER_3:
+      sorting_index = m1->getObjectPtr()->idx.user[3];
+      break;
+    case USER_4:
+      sorting_index = m1->getObjectPtr()->idx.user[4];	
+      break;
+    case USER_5:
+      sorting_index = m1->getObjectPtr()->idx.user[5];
+      break;
+    case USER_6:
+      sorting_index = m1->getObjectPtr()->idx.user[6];	
+      break;
+    case USER_7:
+      sorting_index = m1->getObjectPtr()->idx.user[7];
+      break;
+    case NONE:
+      sorting_index = 0;
+      break;	
+    default:
+      GDEBUG("Unknown sorting condition %d\n", sort_);
+      m1->release();
+      return GADGET_FAIL;
+    }
+    
+    //Create the data structure that will go in the bucket
+    IsmrmrdAcquisitionData d(m1,m2,AsContainerMessage< hoNDArray<float> >(m2->cont()));
+
+    //Now let's figure out if a trigger condition has occurred.
+    if (prev_.head_) { //Make sure this is not the first acquisition we are receiving
+      switch (trigger_) {
+      case KSPACE_ENCODE_STEP_1:
+ 	if (prev_.head_->getObjectPtr()->idx.kspace_encode_step_1 !=
+	    d.head_->getObjectPtr()->idx.kspace_encode_step_1) {
+	  trigger();
+	}
+	break;
+      case KSPACE_ENCODE_STEP_2:
+ 	if (prev_.head_->getObjectPtr()->idx.kspace_encode_step_2 !=
+	    d.head_->getObjectPtr()->idx.kspace_encode_step_2) {
+	  trigger();
+	}
+	break;
+      case AVERAGE:
+ 	if (prev_.head_->getObjectPtr()->idx.average !=
+	    d.head_->getObjectPtr()->idx.average) {
+	  trigger();
+	}
+	break;
+      case SLICE:
+	if (prev_.head_->getObjectPtr()->idx.slice !=
+	    d.head_->getObjectPtr()->idx.slice) {
+	  trigger();
+	}
+	break;
+      case CONTRAST:
+	if (prev_.head_->getObjectPtr()->idx.contrast !=
+	    d.head_->getObjectPtr()->idx.contrast) {
+	  trigger();
+	}
+	break;
+      case PHASE:
+	if (prev_.head_->getObjectPtr()->idx.phase !=
+	    d.head_->getObjectPtr()->idx.phase) {
+	  trigger();
+	}
+	break;
+      case REPETITION:
+	if (prev_.head_->getObjectPtr()->idx.repetition !=
+	    d.head_->getObjectPtr()->idx.repetition) {
+	  trigger();
+	}
+	break;
+      case SET:
+	if (prev_.head_->getObjectPtr()->idx.set !=
+	    d.head_->getObjectPtr()->idx.set) {
+	  trigger();
+	}      
+	break;
+      case SEGMENT:
+	if (prev_.head_->getObjectPtr()->idx.segment !=
+	    d.head_->getObjectPtr()->idx.segment) {
+	  trigger();
+	}
+	break;
+      case USER_0:
+	if (prev_.head_->getObjectPtr()->idx.user[0] !=
+	    d.head_->getObjectPtr()->idx.user[0]) {
+	  trigger();
+	}
+	break;
+      case USER_1:
+	if (prev_.head_->getObjectPtr()->idx.user[1] !=
+	    d.head_->getObjectPtr()->idx.user[1]) {
+	  trigger();
+	}
+	break;
+      case USER_2:
+	if (prev_.head_->getObjectPtr()->idx.user[2] !=
+	    d.head_->getObjectPtr()->idx.user[2]) {
+	  trigger();
+	}
+	break;
+      case USER_3:
+	if (prev_.head_->getObjectPtr()->idx.user[3] !=
+	    d.head_->getObjectPtr()->idx.user[3]) {
+	  trigger();
+	}
+	break;
+      case USER_4:
+	if (prev_.head_->getObjectPtr()->idx.user[4] !=
+	    d.head_->getObjectPtr()->idx.user[4]) {
+	  trigger();
+	}
+	break;
+      case USER_5:
+	if (prev_.head_->getObjectPtr()->idx.user[5] !=
+	    d.head_->getObjectPtr()->idx.user[5]) {
+	  trigger();
+	}
+	break;
+      case USER_6:
+	if (prev_.head_->getObjectPtr()->idx.user[6] !=
+	    d.head_->getObjectPtr()->idx.user[6]) {
+	  trigger();
+	}
+	break;
+      case USER_7:
+	if (prev_.head_->getObjectPtr()->idx.user[7] !=
+	    d.head_->getObjectPtr()->idx.user[7]) {
+	  trigger();
+	}
+	break;
+      case NONE:
+	break;	
+      default:
+	GDEBUG("Unknown trigger condition %d\n", trigger_);
+	m1->release();
+	return GADGET_FAIL;
+      }
+    }
+    
+    //Now we can update the previous data item that we store for the
+    //purpose of determining whether a trigger condition has occurred.
+    prev_ = d;
+    
+    //Find the bucket the data should go in
+    map_type_::iterator it = buckets_.find(sorting_index);
+    if (it == buckets_.end()) {
+      //Bucket does not exist, create it
+      buckets_[sorting_index] = new GadgetContainerMessage<IsmrmrdAcquisitionBucket>;
+    }
+    IsmrmrdAcquisitionBucket* bucket = buckets_[sorting_index]->getObjectPtr();
+
+    uint16_t espace = m1->getObjectPtr()->encoding_space_ref;
+
+    if (!ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_PARALLEL_CALIBRATION).isSet(m1->getObjectPtr()->flags))
+      {
+	bucket->data_.push_back(d);
+        if (bucket->datastats_.size() < (espace+1)) {
+            bucket->datastats_.resize(espace+1);
+        }
+        bucket->datastats_[espace].kspace_encode_step_1.insert(m1->getObjectPtr()->idx.kspace_encode_step_1);
+        bucket->datastats_[espace].kspace_encode_step_2.insert(m1->getObjectPtr()->idx.kspace_encode_step_2);
+        bucket->datastats_[espace].slice.insert(m1->getObjectPtr()->idx.slice);
+        bucket->datastats_[espace].phase.insert(m1->getObjectPtr()->idx.phase);
+        bucket->datastats_[espace].contrast.insert(m1->getObjectPtr()->idx.contrast);
+        bucket->datastats_[espace].set.insert(m1->getObjectPtr()->idx.set);
+        bucket->datastats_[espace].segment.insert(m1->getObjectPtr()->idx.segment);
+        bucket->datastats_[espace].average.insert(m1->getObjectPtr()->idx.average);
+        bucket->datastats_[espace].repetition.insert(m1->getObjectPtr()->idx.repetition);
+      }
+
+    if ( ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_PARALLEL_CALIBRATION).isSet(m1->getObjectPtr()->flags) ||
+	 ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_PARALLEL_CALIBRATION_AND_IMAGING).isSet(m1->getObjectPtr()->flags) )
+      {
+	bucket->ref_.push_back(d);
+        if (bucket->refstats_.size() < (espace+1)) {
+            bucket->refstats_.resize(espace+1);
+        }
+        bucket->refstats_[espace].kspace_encode_step_1.insert(m1->getObjectPtr()->idx.kspace_encode_step_1);
+        bucket->refstats_[espace].kspace_encode_step_2.insert(m1->getObjectPtr()->idx.kspace_encode_step_2);
+        bucket->refstats_[espace].slice.insert(m1->getObjectPtr()->idx.slice);
+        bucket->refstats_[espace].phase.insert(m1->getObjectPtr()->idx.phase);
+        bucket->refstats_[espace].contrast.insert(m1->getObjectPtr()->idx.contrast);
+        bucket->refstats_[espace].set.insert(m1->getObjectPtr()->idx.set);
+        bucket->refstats_[espace].segment.insert(m1->getObjectPtr()->idx.segment);
+        bucket->refstats_[espace].average.insert(m1->getObjectPtr()->idx.average);
+        bucket->refstats_[espace].repetition.insert(m1->getObjectPtr()->idx.repetition);
+      }
+
+    //We can release the data now. It is reference counted and the counters have been incremented by the operations above.
+    m1->release();
+
+    //TODO: 
+    // At this point it would make sense to check the data flags for trigger conditions. 
+    
+    return GADGET_OK;
+  }
+
+  int AcquisitionAccumulateTriggerGadget::trigger() 
+  {
+    //We will keep track of the triggers we encounter
+    trigger_events_++;
+
+    GDEBUG("Trigger (%d) occurred, sending out %d buckets\n", trigger_events_, buckets_.size());
+    //Pass all buckets down the chain
+    for (map_type_::iterator it = buckets_.begin(); it != buckets_.end(); it++) {
+      if (it->second) {
+	  if (this->next()->putq(it->second) == -1) {
+	    it->second->release();
+	    GDEBUG("Failed to pass bucket down the chain\n");
+	    return GADGET_FAIL;
+	  }
+      }
+    }
+
+    buckets_.clear();
+    prev_ = IsmrmrdAcquisitionData(); //Reset previous so that we don't end up triggering again
+    return GADGET_OK;
+  }
+
+  int AcquisitionAccumulateTriggerGadget::close(unsigned long flags)
+  {
+    
+    int ret = Gadget::close(flags);
+    
+    if ( flags != 0 ) {
+      GDEBUG("AcquisitionAccumulateTriggerGadget::close\n");
+      trigger();
+    }
+    return ret;
+  }
+
+
+  GADGET_FACTORY_DECLARE(AcquisitionAccumulateTriggerGadget)
+
+}
+
+
diff --git a/gadgets/mri_core/AcquisitionAccumulateTriggerGadget.h b/gadgets/mri_core/AcquisitionAccumulateTriggerGadget.h
new file mode 100644
index 0000000..86fe4c9
--- /dev/null
+++ b/gadgets/mri_core/AcquisitionAccumulateTriggerGadget.h
@@ -0,0 +1,88 @@
+#ifndef ACQUISITIONACCUMULATETRIGGERGADGET_H
+#define ACQUISITIONACCUMULATETRIGGERGADGET_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+#include <map>
+#include "mri_core_data.h"
+
+namespace Gadgetron{
+
+
+  class EXPORTGADGETSMRICORE AcquisitionAccumulateTriggerGadget : 
+  public Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > >
+    {
+    public:
+      GADGET_DECLARE(AcquisitionAccumulateTriggerGadget);
+
+      typedef std::map< unsigned short int, GadgetContainerMessage<IsmrmrdAcquisitionBucket>* > map_type_;
+
+      virtual ~AcquisitionAccumulateTriggerGadget();
+
+      int close(unsigned long flags);
+
+
+    protected:
+      GADGET_PROPERTY_LIMITS(trigger_dimension, std::string, "Dimension to trigger on", "",
+			     GadgetPropertyLimitsEnumeration, 
+			     "kspace_encode_step_1",
+			     "kspace_encode_step_2",
+			     "average",
+			     "slice",
+			     "contrast",
+			     "phase",
+			     "repetition",
+			     "set",
+			     "segment",
+			     "user_0",
+			     "user_1",
+			     "user_2",
+			     "user_3",
+			     "user_4",
+			     "user_5",
+			     "user_6",
+			     "user_7",
+			     "");
+
+      GADGET_PROPERTY_LIMITS(sorting_dimension, std::string, "Dimension to sort by", "", 
+			     GadgetPropertyLimitsEnumeration, 
+			     "kspace_encode_step_1",
+			     "kspace_encode_step_2",
+			     "average",
+			     "slice",
+			     "contrast",
+			     "phase",
+			     "repetition",
+			     "set",
+			     "segment",
+			     "user_0",
+			     "user_1",
+			     "user_2",
+			     "user_3",
+			     "user_4",
+			     "user_5",
+			     "user_6",
+			     "user_7",
+			     "");
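+      // Editor's note: trigger_dimension names the encoding counter whose change
+      // flushes all accumulated buckets downstream via trigger(), while
+      // sorting_dimension names the counter used as the key of the bucket map below.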
+      IsmrmrdCONDITION trigger_;
+      IsmrmrdCONDITION sort_;
+      map_type_  buckets_;
+      IsmrmrdAcquisitionData prev_;
+      unsigned long trigger_events_;
+
+      virtual int process_config(ACE_Message_Block* mb);
+
+      virtual int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+			  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+
+      virtual int trigger();
+
+    };
+
+  
+}
+#endif //ACQUISITIONACCUMULATETRIGGERGADGET_H
diff --git a/gadgets/mri_core/AcquisitionFinishGadget.cpp b/gadgets/mri_core/AcquisitionFinishGadget.cpp
new file mode 100644
index 0000000..2e1da85
--- /dev/null
+++ b/gadgets/mri_core/AcquisitionFinishGadget.cpp
@@ -0,0 +1,25 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "GadgetMessageInterface.h"
+#include "AcquisitionFinishGadget.h"
+#include "GadgetStreamController.h"
+
+using namespace Gadgetron;
+
+int AcquisitionFinishGadget::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+				 GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+  if (!controller_) {
+    GERROR("Cannot return result to controller, no controller set\n");
+    return -1;
+  }
+
+  GadgetContainerMessage<GadgetMessageIdentifier>* mb =
+    new GadgetContainerMessage<GadgetMessageIdentifier>();
+
+  mb->getObjectPtr()->id = GADGET_MESSAGE_ACQUISITION;
+
+  mb->cont(m1);
+  return controller_->output_ready(mb);
+}
+
+GADGET_FACTORY_DECLARE(AcquisitionFinishGadget)
diff --git a/gadgets/mri_core/AcquisitionFinishGadget.h b/gadgets/mri_core/AcquisitionFinishGadget.h
new file mode 100644
index 0000000..b12ec44
--- /dev/null
+++ b/gadgets/mri_core/AcquisitionFinishGadget.h
@@ -0,0 +1,26 @@
+#ifndef ACQUISITIONFINISHGADGET_H
+#define ACQUISITIONFINISHGADGET_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "GadgetMRIHeaders.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETSMRICORE AcquisitionFinishGadget : 
+  public Gadget2<ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+    {
+    public:
+      GADGET_DECLARE(AcquisitionFinishGadget);
+      
+    protected:
+      virtual int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+			  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+    };
+}
+
+#endif //ACQUISITIONFINISHGADGET_H
diff --git a/gadgets/mri_core/AcquisitionPassthroughGadget.cpp b/gadgets/mri_core/AcquisitionPassthroughGadget.cpp
new file mode 100644
index 0000000..870d562
--- /dev/null
+++ b/gadgets/mri_core/AcquisitionPassthroughGadget.cpp
@@ -0,0 +1,21 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "AcquisitionPassthroughGadget.h"
+
+namespace Gadgetron{
+int AcquisitionPassthroughGadget
+::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+	  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+  //It is enough to put the first one, since they are linked
+  if (this->next()->putq(m1) == -1) {
+    m1->release();
+    GERROR("AcquisitionPassthroughGadget::process, passing data on to next gadget");
+    return -1;
+  }
+
+  return 0;
+}
+GADGET_FACTORY_DECLARE(AcquisitionPassthroughGadget)
+}
+
+
diff --git a/gadgets/mri_core/AcquisitionPassthroughGadget.h b/gadgets/mri_core/AcquisitionPassthroughGadget.h
new file mode 100644
index 0000000..9708de5
--- /dev/null
+++ b/gadgets/mri_core/AcquisitionPassthroughGadget.h
@@ -0,0 +1,24 @@
+#ifndef ACQUISITIONPASSTHROUGHGADGET_H
+#define ACQUISITIONPASSTHROUGHGADGET_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETSMRICORE AcquisitionPassthroughGadget : 
+  public Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > >
+    {
+    public:
+      GADGET_DECLARE(AcquisitionPassthroughGadget);
+      
+    protected:
+      virtual int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+			  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+    };
+}
+#endif //ACQUISITIONPASSTHROUGHGADGET_H
diff --git a/gadgets/mri_core/AsymmetricEchoAdjustROGadget.cpp b/gadgets/mri_core/AsymmetricEchoAdjustROGadget.cpp
new file mode 100644
index 0000000..0dac5aa
--- /dev/null
+++ b/gadgets/mri_core/AsymmetricEchoAdjustROGadget.cpp
@@ -0,0 +1,138 @@
+#include "AsymmetricEchoAdjustROGadget.h"
+#include "ismrmrd/xml.h"
+
+namespace Gadgetron
+{
+
+AsymmetricEchoAdjustROGadget::AsymmetricEchoAdjustROGadget() : maxRO_(0)
+{
+
+}
+
+int AsymmetricEchoAdjustROGadget::process_config(ACE_Message_Block* mb)
+{
+  ISMRMRD::IsmrmrdHeader h;
+  deserialize(mb->rd_ptr(),h);
+
+  if (h.encoding.size() != 1) {
+    GDEBUG("Number of encoding spaces: %d\n", h.encoding.size());
+    GDEBUG("This partial fourier gadget only supports one encoding space\n");
+    return GADGET_FAIL;
+  }
+
+  ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+  maxRO_ = e_space.matrixSize.x;
+  GDEBUG_STREAM("max RO : " << maxRO_);
+  return GADGET_OK;
+}
+
+int addPrePostZeros(size_t centre_column, size_t samples)
+{
+    // 1 : pre zeros
+    // 2 : post zeros
+    // 0 : no zeros
+    if ( 2*centre_column == samples )
+    {
+        return 0;
+    }
+
+    if ( 2*centre_column < samples )
+    {
+        return 1;
+    }
+
+    if ( 2*centre_column > samples )
+    {
+        return 2;
+    }
+
+    return 0;
+}
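+
+// Illustrative example (editor's note, not upstream code): with samples = 256 and
+// centre_column = 96, 2*96 = 192 < 256 and addPrePostZeros() returns 1 (pre zeros);
+// with centre_column = 160, 2*160 = 320 > 256 and it returns 2 (post zeros).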
+
+int AsymmetricEchoAdjustROGadget
+::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+        GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+
+    bool is_noise = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_NOISE_MEASUREMENT).isSet(m1->getObjectPtr()->flags);
+    long long channels = (long long)m1->getObjectPtr()->active_channels;
+    size_t samples = m1->getObjectPtr()->number_of_samples;
+    size_t centre_column = m1->getObjectPtr()->center_sample;
+
+    if (!is_noise) 
+    {
+        // adjust the center echo
+        int az = addPrePostZeros(centre_column, samples);
+
+        if ( az!= 0 && samples < maxRO_ )
+        {
+            GadgetContainerMessage< hoNDArray< std::complex<float> > >* m3 = new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+            if (!m3)
+            {
+                return GADGET_FAIL;
+            }
+
+            std::vector<size_t> data_out_dims = *m2->getObjectPtr()->get_dimensions();
+            data_out_dims[0] = maxRO_;
+            try
+            {
+                m3->getObjectPtr()->create(&data_out_dims);
+            }
+            catch(...)
+            {
+                GDEBUG("Unable to create new data array for downsampled data\n");
+                return GADGET_FAIL;
+            }
+            m3->getObjectPtr()->fill(0);
+
+            std::complex<float>* pM3 = m3->getObjectPtr()->get_data_ptr();
+            std::complex<float>* pM2 = m2->getObjectPtr()->get_data_ptr();
+
+            long long c;
+            size_t numOfBytes = sizeof( std::complex<float> )*samples;
+
+            if ( az == 1 ) // pre zeros
+            {
+                //#pragma omp parallel for default(none) private(c) shared(channels, pM3, pM2, samples, numOfBytes)
+                for ( c=0; c<channels; c++ )
+                {
+                    memcpy(pM3+c*maxRO_+maxRO_-samples, pM2+c*samples, numOfBytes);
+                }
+            }
+
+            if ( az == 2 ) // post zeros
+            {
+                //#pragma omp parallel for default(none) private(c) shared(channels, pM3, pM2, samples, numOfBytes)
+                for ( c=0; c<channels; c++ )
+                {
+                    memcpy(pM3+c*maxRO_, pM2+c*samples, numOfBytes);
+                }
+            }
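+
+            // Editor's note: in either case the acquired samples keep their order and are
+            // only shifted within the maxRO_-long readout so that, assuming the acquired
+            // window reaches the corresponding edge of k-space, the echo centre ends up in
+            // the middle of the zero-filled array.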
+
+            m2->release(); //We are done with this data
+
+            m1->cont(m3);
+            m1->getObjectPtr()->number_of_samples = data_out_dims[0];
+        }
+
+        if (this->next()->putq(m1) == -1) 
+        {
+	  GERROR("AsymmetricEchoAdjustROGadget::process, failed to pass data on to next gadget\n");
+	  return -1;
+        }
+    }
+    else
+    {
+        if (this->next()->putq(m1) == -1) 
+        {
+	  GERROR("AsymmetricEchoAdjustROGadget::process, failed to pass data on to next gadget\n");
+	  return -1;
+        }
+    }
+
+    return GADGET_OK;
+}
+
+GADGET_FACTORY_DECLARE(AsymmetricEchoAdjustROGadget)
+
+}
diff --git a/gadgets/mri_core/AsymmetricEchoAdjustROGadget.h b/gadgets/mri_core/AsymmetricEchoAdjustROGadget.h
new file mode 100644
index 0000000..1c97492
--- /dev/null
+++ b/gadgets/mri_core/AsymmetricEchoAdjustROGadget.h
@@ -0,0 +1,32 @@
+
+#pragma once
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "ismrmrd/ismrmrd.h"
+#include "gadgetron_mricore_export.h"
+
+namespace Gadgetron
+{
+
+/// For every incoming readout: if it is not a noise scan and partial Fourier
+/// along the readout direction is detected, the readout data are realigned so
+/// that the centre of the echo sits at the centre of the incoming 1D array.
+class EXPORTGADGETSMRICORE AsymmetricEchoAdjustROGadget : public Gadgetron::Gadget2<ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+{
+public:
+
+    GADGET_DECLARE(AsymmetricEchoAdjustROGadget);
+
+    AsymmetricEchoAdjustROGadget();
+
+protected:
+
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(Gadgetron::GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+        Gadgetron::GadgetContainerMessage< Gadgetron::hoNDArray< std::complex<float> > >* m2);
+
+    unsigned int maxRO_;
+};
+
+}
diff --git a/gadgets/mri_core/AutoScaleGadget.cpp b/gadgets/mri_core/AutoScaleGadget.cpp
new file mode 100644
index 0000000..5ab93a9
--- /dev/null
+++ b/gadgets/mri_core/AutoScaleGadget.cpp
@@ -0,0 +1,81 @@
+/*
+ * AutoScaleGadget.cpp
+ *
+ *  Created on: Dec 19, 2011
+ *      Author: Michael S. Hansen
+ */
+
+#include "GadgetIsmrmrdReadWrite.h"
+#include "AutoScaleGadget.h"
+
+namespace Gadgetron{
+
+AutoScaleGadget::AutoScaleGadget()
+	: histogram_bins_(100)
+	, current_scale_(1.0)
+	, max_value_(2048)
+{
+}
+
+AutoScaleGadget::~AutoScaleGadget() {
+	// TODO Auto-generated destructor stub
+}
+
+int AutoScaleGadget::process_config(ACE_Message_Block* mb) {
+        max_value_ = max_value.value();
+	return GADGET_OK;
+}
+
+
+int AutoScaleGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, GadgetContainerMessage<hoNDArray<float> > *m2)
+{
+	if (m1->getObjectPtr()->image_type == ISMRMRD::ISMRMRD_IMTYPE_MAGNITUDE) { //Only scale magnitude images for now
+		float max = 0.0f;
+		float* d = m2->getObjectPtr()->get_data_ptr();
+		for (unsigned long int i = 0; i < m2->getObjectPtr()->get_number_of_elements(); i++) {
+			if (d[i] > max) max = d[i];
+		}
+
+		if (histogram_.size() != histogram_bins_) {
+			histogram_ = std::vector<size_t>(histogram_bins_);
+		}
+
+		for (size_t i = 0; i < histogram_bins_; i++) {
+			histogram_[i] = 0;
+		}
+
+		for (unsigned long int i = 0; i < m2->getObjectPtr()->get_number_of_elements(); i++) {
+			size_t bin = static_cast<size_t>(floor((d[i]/max)*histogram_bins_));
+			if (bin >= histogram_bins_) {
+				bin = histogram_bins_-1;
+			}
+			histogram_[bin]++;
+		}
+
+		//Find 99th percentile
+		long long cumsum = 0;
+		size_t counter = 0;
+		while (cumsum < (0.99*m2->getObjectPtr()->get_number_of_elements())) {
+			cumsum += (long long)(histogram_[counter++]);
+		}
+		max = (counter+1)*(max/histogram_bins_);
+		GDEBUG("Max: %f\n",max);
+
+		current_scale_ = max_value_/max;
+
+		for (unsigned long int i = 0; i < m2->getObjectPtr()->get_number_of_elements(); i++) {
+			d[i] *= current_scale_;
+		}
+	}
+
+	if (this->next()->putq(m1) < 0) {
+		GDEBUG("Failed to pass on data to next Gadget\n");
+		return GADGET_FAIL;
+	}
+
+	return GADGET_OK;
+}
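+
+// Illustrative example (editor's note): with the default max_value of 2048 and a
+// 99th-percentile estimate of 512 for a magnitude image, current_scale_ becomes 4.0,
+// so every pixel value is multiplied by 4 before the image is passed downstream.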
+
+GADGET_FACTORY_DECLARE(AutoScaleGadget)
+
+}
diff --git a/gadgets/mri_core/AutoScaleGadget.h b/gadgets/mri_core/AutoScaleGadget.h
new file mode 100644
index 0000000..eeeedc3
--- /dev/null
+++ b/gadgets/mri_core/AutoScaleGadget.h
@@ -0,0 +1,35 @@
+#ifndef AUTOSCALEGADGET_H_
+#define AUTOSCALEGADGET_H_
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETSMRICORE AutoScaleGadget:
+    public Gadget2<ISMRMRD::ImageHeader,hoNDArray< float > >
+  {
+  public:
+    GADGET_DECLARE(AutoScaleGadget);
+
+    AutoScaleGadget();
+    virtual ~AutoScaleGadget();
+
+  protected:
+    GADGET_PROPERTY(max_value, float, "Maximum value (after scaling)", 2048);
+
+    virtual int process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+			GadgetContainerMessage< hoNDArray< float > >* m2);
+    virtual int process_config(ACE_Message_Block *mb);
+
+    unsigned int histogram_bins_;
+    std::vector<size_t> histogram_;
+    float current_scale_;
+    float max_value_;
+  };
+}
+
+#endif /* AUTOSCALEGADGET_H_ */
diff --git a/gadgets/mri_core/BucketToBufferGadget.cpp b/gadgets/mri_core/BucketToBufferGadget.cpp
new file mode 100644
index 0000000..5da7276
--- /dev/null
+++ b/gadgets/mri_core/BucketToBufferGadget.cpp
@@ -0,0 +1,622 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "BucketToBufferGadget.h"
+#include "mri_core_data.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDArray_reductions.h"
+namespace Gadgetron{
+
+  BucketToBufferGadget::~BucketToBufferGadget()
+  {
+    //Any buffers should already have been passed on or released by the time we get here; nothing to clean up.
+  }
+
+  int BucketToBufferGadget
+  ::process_config(ACE_Message_Block* mb)
+  {
+    if (N_dimension.value().size() == 0) {
+      N_ = NONE;
+    } else if (N_dimension.value().compare("average") == 0) {
+      N_ = AVERAGE;
+    } else if (N_dimension.value().compare("contrast") == 0) {
+      N_ = CONTRAST;
+    } else if (N_dimension.value().compare("phase") == 0) {
+      N_ = PHASE;
+    } else if (N_dimension.value().compare("repetition") == 0) {
+      N_ = REPETITION;
+    } else if (N_dimension.value().compare("set") == 0) {
+      N_ = SET;
+    } else if (N_dimension.value().compare("segment") == 0) {
+      N_ = SEGMENT;
+    } else if (N_dimension.value().compare("slice") == 0){
+      N_ = SLICE;
+    } else {
+      GDEBUG("WARNING: Unknown N dimension (%s), N set to NONE", N_dimension.value().c_str());
+      N_ = NONE;
+    }
+
+    GDEBUG("N DIMENSION IS: %s (%d)\n", N_dimension.value().c_str(), N_);
+
+    if (S_dimension.value().size() == 0) {
+        S_ = NONE;
+    } else if (S_dimension.value().compare("average") == 0) {
+        S_ = AVERAGE;
+    } else if (S_dimension.value().compare("contrast") == 0) {
+        S_ = CONTRAST;
+    } else if (S_dimension.value().compare("phase") == 0) {
+        S_ = PHASE;
+    } else if (S_dimension.value().compare("repetition") == 0) {
+        S_ = REPETITION;
+    } else if (S_dimension.value().compare("set") == 0) {
+        S_ = SET;
+    } else if (S_dimension.value().compare("segment") == 0) {
+        S_ = SEGMENT;
+    } else if (N_dimension.value().compare("slice") == 0){
+        S_ = SLICE;
+    } else {
+        GDEBUG("WARNING: Unknown sort dimension (%s), sorting set to NONE\n", S_dimension.value().c_str());
+        S_ = NONE;
+    }
+
+    GDEBUG("S DIMENSION IS: %s (%d)\n", S_dimension.value().c_str(), S_);
+
+    split_slices_  = split_slices.value();
+    GDEBUG("SPLIT SLICES IS: %b\n", split_slices_);
+
+    ignore_segment_  = ignore_segment.value();
+    GDEBUG("IGNORE SEGMENT IS: %b\n", ignore_segment_);
+
+    // keep a copy of the deserialized ismrmrd xml header for runtime
+    ISMRMRD::deserialize(mb->rd_ptr(), hdr_);
+
+    return GADGET_OK;
+  }
+
+  int BucketToBufferGadget
+  ::process(GadgetContainerMessage<IsmrmrdAcquisitionBucket>* m1)
+  {
+
+    size_t key;
+    std::map<size_t, GadgetContainerMessage<IsmrmrdReconData>* > recon_data_buffers;
+
+    //GDEBUG("BucketToBufferGadget::process\n");
+
+    //Some information about the bucket
+    //GDEBUG_STREAM("The Reference part: " << m1->getObjectPtr()->refstats_.size() << std::endl);
+    //GDEBUG_STREAM("   nslices: " << m1->getObjectPtr()->refstats_[0].slice.size() << std::endl);
+    //for (int e=0; e<m1->getObjectPtr()->refstats_.size() ; e++) {
+    //    for (std::set<uint16_t>::iterator it = m1->getObjectPtr()->refstats_[e].kspace_encode_step_1.begin();
+    //         it != m1->getObjectPtr()->refstats_[e].kspace_encode_step_1.end(); ++it) {
+    //        GDEBUG_STREAM("   K1: " <<  *it << std::endl);
+    //    }
+    //}
+    //GDEBUG_STREAM("The data part: " << m1->getObjectPtr()->datastats_.size() << std::endl);
+    //GDEBUG_STREAM("   nslices: " << m1->getObjectPtr()->datastats_[0].slice.size() << std::endl);
+    //for (int e=0; e<m1->getObjectPtr()->datastats_.size() ; e++) {
+    //    for (std::set<uint16_t>::iterator it = m1->getObjectPtr()->datastats_[e].kspace_encode_step_1.begin();
+    //         it != m1->getObjectPtr()->datastats_[e].kspace_encode_step_1.end(); ++it) {
+    //        GDEBUG_STREAM("   K1: " <<  *it << std::endl);
+    //    }
+    //}
+
+    //Iterate over the reference data of the bucket
+    for(std::vector<IsmrmrdAcquisitionData>::iterator it = m1->getObjectPtr()->ref_.begin();
+        it != m1->getObjectPtr()->ref_.end(); ++it)
+      {
+        //Get a reference to the header for this acquisition
+        ISMRMRD::AcquisitionHeader & acqhdr = *it->head_->getObjectPtr();
+
+        //Generate the key to the corresponding ReconData buffer
+        key = getKey(acqhdr.idx);
+
+        //The storage is based on the encoding space
+        uint16_t espace = acqhdr.encoding_space_ref;
+
+        //GDEBUG_STREAM("espace: " << acqhdr.encoding_space_ref << std::endl);
+        //GDEBUG_STREAM("slice: " << acqhdr.idx.slice << std::endl);
+        //GDEBUG_STREAM("rep: " << acqhdr.idx.repetition << std::endl);
+        //GDEBUG_STREAM("k1: " << acqhdr.idx.kspace_encode_step_1 << std::endl);
+        //GDEBUG_STREAM("k2: " << acqhdr.idx.kspace_encode_step_2 << std::endl);
+        //GDEBUG_STREAM("seg: " << acqhdr.idx.segment << std::endl);
+        //GDEBUG_STREAM("key: " << key << std::endl);
+
+        //Get some references to simplify the notation
+        //the reconstruction bit corresponding to this ReconDataBuffer and encoding space
+        IsmrmrdReconBit & rbit = getRBit(recon_data_buffers, key, espace);
+        //and the corresponding data buffer for the reference data
+        IsmrmrdDataBuffered & dataBuffer = rbit.ref_;
+        //this encoding space's xml header info
+        ISMRMRD::Encoding & encoding = hdr_.encoding[espace];
+        //this bucket's reference stats
+        IsmrmrdAcquisitionBucketStats & stats = m1->getObjectPtr()->refstats_[espace];
+
+        //Fill the sampling description for this data buffer
+        fillSamplingDescription(dataBuffer.sampling_, encoding, stats);
+
+        //Make sure that the data storage for this data buffer has been allocated
+        //TODO should this check the limits, or should that be done in the stuff function?
+        allocateDataArrays(dataBuffer, acqhdr, encoding, stats);
+
+        // Stuff the data, header and trajectory into this data buffer
+        stuff(it, dataBuffer, encoding);
+      }
+
+
+    //Iterate over the imaging data of the bucket
+    // this is exactly the same code as for the reference data except for
+    // the chunk of the data buffer.
+    for(std::vector<IsmrmrdAcquisitionData>::iterator it = m1->getObjectPtr()->data_.begin();
+        it != m1->getObjectPtr()->data_.end(); ++it)
+      {
+        //Get a reference to the header for this acquisition
+        ISMRMRD::AcquisitionHeader & acqhdr = *it->head_->getObjectPtr();
+
+        //Generate the key to the corresponding ReconData buffer
+        key = getKey(acqhdr.idx);
+
+        //The storage is based on the encoding space
+        uint16_t espace = acqhdr.encoding_space_ref;
+
+        //GDEBUG_STREAM("espace: " << acqhdr.encoding_space_ref << std::endl);
+        //GDEBUG_STREAM("slice: " << acqhdr.idx.slice << std::endl);
+        //GDEBUG_STREAM("rep: " << acqhdr.idx.repetition << std::endl);
+        //GDEBUG_STREAM("k1: " << acqhdr.idx.kspace_encode_step_1 << std::endl);
+        //GDEBUG_STREAM("k2: " << acqhdr.idx.kspace_encode_step_2 << std::endl);
+        //GDEBUG_STREAM("seg: " << acqhdr.idx.segment << std::endl);
+        //GDEBUG_STREAM("key: " << key << std::endl);
+
+        //Get some references to simplify the notation
+        //the reconstruction bit corresponding to this ReconDataBuffer and encoding space
+        IsmrmrdReconBit & rbit = getRBit(recon_data_buffers, key, espace);
+        //and the corresponding data buffer for the imaging data
+        IsmrmrdDataBuffered & dataBuffer = rbit.data_;
+        //this encoding space's xml header info
+        ISMRMRD::Encoding & encoding = hdr_.encoding[espace];
+        //this bucket's imaging data stats
+        IsmrmrdAcquisitionBucketStats & stats = m1->getObjectPtr()->datastats_[espace];
+
+        //Fill the sampling description for this data buffer
+        fillSamplingDescription(dataBuffer.sampling_, encoding, stats);
+
+        //Make sure that the data storage for this data buffer has been allocated
+        //TODO should this check the limits, or should that be done in the stuff function?
+        allocateDataArrays(dataBuffer, acqhdr, encoding, stats);
+
+        // Stuff the data, header and trajectory into this data buffer
+        stuff(it, dataBuffer, encoding);
+      }
+
+
+    //Send all the ReconData messages
+    GDEBUG("End of bucket reached, sending out %d ReconData buffers\n", recon_data_buffers.size());
+    for(std::map<size_t, GadgetContainerMessage<IsmrmrdReconData>* >::iterator it = recon_data_buffers.begin(); it != recon_data_buffers.end(); it++)
+      {
+        //GDEBUG_STREAM("Sending: " << it->first << std::endl);
+        if (it->second) {
+            if (this->next()->putq(it->second) == -1) {
+                it->second->release();
+                throw std::runtime_error("Failed to pass bucket down the chain\n");
+            }
+        }
+      }
+
+    //Clear the recondata buffer map
+    recon_data_buffers.clear();  // is this necessary?
+
+    //We can release the incoming bucket now. This will release all of the data it contains.
+    m1->release();
+
+    return GADGET_OK;
+  }
+
+  int BucketToBufferGadget::close(unsigned long flags)
+  {
+
+    int ret = Gadget::close(flags);
+    GDEBUG("BucketToBufferGadget::close\n");
+
+    return ret;
+  }
+
+  size_t BucketToBufferGadget::getSlice(ISMRMRD::ISMRMRD_EncodingCounters idx)
+  {
+    size_t index;
+
+    if( split_slices_ ) {
+        index = idx.slice;
+    } else {
+        index = 0;
+    }
+
+    return index;
+  }
+
+  size_t BucketToBufferGadget::getN(ISMRMRD::ISMRMRD_EncodingCounters idx)
+  {
+    size_t index;
+
+    if (N_ == AVERAGE) {
+        index = idx.average;
+    } else if (N_ == CONTRAST) {
+        index = idx.contrast;
+    } else if (N_ == PHASE) {
+        index = idx.phase;
+    } else if (N_ == REPETITION) {
+        index = idx.repetition;
+    } else if (N_ == SET) {
+        index = idx.set;
+    } else if (N_ == SEGMENT) {
+        index = idx.segment;
+    } else {
+        index = 0;
+    }
+
+    return index;
+  }
+
+  size_t BucketToBufferGadget::getS(ISMRMRD::ISMRMRD_EncodingCounters idx)
+  {
+    size_t index;
+
+    if (S_ == AVERAGE) {
+        index = idx.average;
+    } else if (S_ == CONTRAST) {
+        index = idx.contrast;
+    } else if (S_ == PHASE) {
+        index = idx.phase;
+    } else if (S_ == REPETITION) {
+        index = idx.repetition;
+    } else if (S_ == SET) {
+        index = idx.set;
+    } else if (S_ == SEGMENT) {
+        index = idx.segment;
+    } else {
+        index = 0;
+    }
+
+    return index;
+  }
+
+  size_t BucketToBufferGadget::getKey(ISMRMRD::ISMRMRD_EncodingCounters idx)
+  {
+    //[SLC, PHS, CON, REP, SET, SEG, AVE]
+    //collapse across two of them (N and S)
+
+    size_t slice, phase, contrast, repetition, set, segment, average;
+
+    if (split_slices_) {
+        slice = idx.slice;
+    } else {
+        slice = 0;
+    }
+
+    if ((N_ == PHASE) || (S_ == PHASE)) {
+        phase = 0;
+    } else {
+        phase = idx.phase;
+    }
+
+    if ((N_ == CONTRAST) || (S_ == CONTRAST)) {
+        contrast = 0;
+    } else {
+        contrast = idx.contrast;
+    }
+
+    if ((N_ == REPETITION) || (S_ == REPETITION)) {
+        repetition = 0;
+    } else {
+        repetition = idx.repetition;
+    }
+
+    if ((N_ == SET) || (S_ == SET)) {
+        set = 0;
+    } else {
+        set = idx.set;
+    }
+
+    if ((N_ == SEGMENT) || (S_ == SEGMENT) || ignore_segment_) {
+        segment = 0;
+    } else {
+        segment = idx.segment;
+    }
+
+    if ((S_ == AVERAGE) || (N_ == AVERAGE)) {
+        average = 0;
+    } else {
+        average = idx.average;
+    }
+
+    size_t key = 0;
+    key += slice      * 0x1;
+    key += phase      * 0x100;
+    key += contrast   * 0x10000;
+    key += repetition * 0x1000000;
+    key += set        * 0x100000000;
+    key += segment    * 0x10000000000;
+    key += average    * 0x1000000000000;
+
+    return key;
+  }
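+
+  // Illustrative example (editor's note, not upstream code): with slice = 2, phase = 3
+  // and the remaining counters collapsed to zero, the key is 2*0x1 + 3*0x100 = 0x302;
+  // each counter occupies one byte of the 64-bit key, so each counter is assumed to
+  // stay below 256.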
+
+  IsmrmrdReconBit & BucketToBufferGadget::getRBit(std::map<size_t, GadgetContainerMessage<IsmrmrdReconData>* > & recon_data_buffers, size_t key, uint16_t espace)
+  {
+    //Look up the corresponding ReconData buffer
+    if (recon_data_buffers.find(key) == recon_data_buffers.end())
+      {
+        //ReconData buffer does not exist, create it
+        recon_data_buffers[key] = new GadgetContainerMessage<IsmrmrdReconData>;
+      }
+
+    //Look up the DataBuffered entry corresponding to this encoding space
+    // create if needed and set the fields of view and matrix size
+    if ( recon_data_buffers[key]->getObjectPtr()->rbit_.size() < (espace+1) )
+      {
+        recon_data_buffers[key]->getObjectPtr()->rbit_.resize(espace+1);
+      }
+
+    return recon_data_buffers[key]->getObjectPtr()->rbit_[espace];
+
+  }
+
+  void BucketToBufferGadget::allocateDataArrays(IsmrmrdDataBuffered & dataBuffer, ISMRMRD::AcquisitionHeader & acqhdr, ISMRMRD::Encoding encoding, IsmrmrdAcquisitionBucketStats & stats)
+  {
+    if (dataBuffer.data_.get_number_of_elements() == 0)
+      {
+        //Allocate the reference data array
+        //7D,  fixed order [E0, E1, E2, CHA, N, S, LOC]
+        //11D, fixed order [E0, E1, E2, CHA, SLC, PHS, CON, REP, SET, SEG, AVE]
+        uint16_t NE0;
+        if (encoding.trajectory.compare("cartesian") == 0) {
+            NE0 = encoding.reconSpace.matrixSize.x;
+        } else {
+            NE0 = acqhdr.number_of_samples - acqhdr.discard_pre - acqhdr.discard_post;
+        }
+
+        uint16_t NE1;
+        if (encoding.trajectory.compare("cartesian") == 0) {
+            NE1 = encoding.encodedSpace.matrixSize.y;
+        } else {
+            if (encoding.encodingLimits.kspace_encoding_step_1.is_present()) {
+                NE1 = encoding.encodingLimits.kspace_encoding_step_1->maximum - encoding.encodingLimits.kspace_encoding_step_1->minimum + 1;
+            } else {
+                NE1 = *stats.kspace_encode_step_1.rbegin() - *stats.kspace_encode_step_1.begin() + 1;
+            }
+        }
+
+        uint16_t NE2;
+        if (encoding.trajectory.compare("cartesian") == 0) {
+            NE2 = encoding.encodedSpace.matrixSize.z;
+        } else {
+            if (encoding.encodingLimits.kspace_encoding_step_2.is_present()) {
+                NE2 = encoding.encodingLimits.kspace_encoding_step_2->maximum - encoding.encodingLimits.kspace_encoding_step_2->minimum + 1;
+            } else {
+                NE2 = *stats.kspace_encode_step_2.rbegin() - *stats.kspace_encode_step_2.begin() + 1;
+            }
+        }
+
+        uint16_t NCHA = acqhdr.active_channels;
+
+        uint16_t NLOC;
+        if (split_slices_) {
+            NLOC = 1;
+        } else {
+            if (encoding.encodingLimits.slice.is_present()) {
+                NLOC = encoding.encodingLimits.slice->maximum - encoding.encodingLimits.slice->minimum + 1;
+            } else {
+                NLOC = *stats.slice.rbegin() - *stats.slice.begin() + 1;
+            }
+        }
+
+        uint16_t NN;
+        switch (N_) {
+        case PHASE:
+          NN = *stats.phase.rbegin() - *stats.phase.begin() + 1;
+          break;
+        case CONTRAST:
+          NN = *stats.contrast.rbegin() - *stats.contrast.begin() + 1;
+          break;
+        case REPETITION:
+          NN = *stats.repetition.rbegin() - *stats.repetition.begin() + 1;
+          break;
+        case SET:
+          NN = *stats.set.rbegin() - *stats.set.begin() + 1;
+          break;
+        case SEGMENT:
+          NN = *stats.segment.rbegin() - *stats.segment.begin() + 1;
+          break;
+        case AVERAGE:
+          NN = *stats.average.rbegin() - *stats.average.begin() + 1;
+          break;
+        case SLICE:
+          NN =  *stats.slice.rbegin() - *stats.slice.begin() + 1;
+          break;
+        default:
+          NN = 1;
+        }
+
+        uint16_t NS;
+        switch (S_) {
+        case PHASE:
+          NS = *stats.phase.rbegin() - *stats.phase.begin() + 1;
+          break;
+        case CONTRAST:
+          NS = *stats.contrast.rbegin() - *stats.contrast.begin() + 1;
+          break;
+        case REPETITION:
+          NS = *stats.repetition.rbegin() - *stats.repetition.begin() + 1;
+          break;
+        case SET:
+          NS = *stats.set.rbegin() - *stats.set.begin() + 1;
+          break;
+        case SEGMENT:
+          NS = *stats.segment.rbegin() - *stats.segment.begin() + 1;
+          break;
+        case AVERAGE:
+          NS = *stats.average.rbegin() - *stats.average.begin() + 1;
+          break;
+        case SLICE:
+          NS =  *stats.slice.rbegin() - *stats.slice.begin() + 1;
+          break;
+        default:
+          NS = 1;
+        }
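+
+        // Illustrative example (editor's note): for a 2D Cartesian multi-slice scan with
+        // N_ = REPETITION, S_ = NONE and split_slices off, the buffer below is created as
+        // [recon_x, encoded_y, 1, channels, repetitions, 1, slices].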
+
+        //GDEBUG_STREAM("Data dimensions:" << std::endl);
+        //GDEBUG_STREAM("   NE0:  " << NE0  << std::endl);
+        //GDEBUG_STREAM("   NE1:  " << NE1  << std::endl);
+        //GDEBUG_STREAM("   NE2:  " << NE2  << std::endl);
+        //GDEBUG_STREAM("   NLOC: " << NLOC << std::endl);
+        //GDEBUG_STREAM("   NCHA: " << NCHA << std::endl);
+        //GDEBUG_STREAM("   NN:   " << NN   << std::endl);
+        //GDEBUG_STREAM("   NS:   " << NS   << std::endl);
+
+        //Allocate the array for the data
+        dataBuffer.data_.create(NE0, NE1, NE2, NCHA, NN, NS, NLOC);
+        clear(&dataBuffer.data_);
+
+        //Allocate the array for the headers
+        dataBuffer.headers_.create(NE1, NE2, NN, NS, NLOC);
+
+        //Allocate the array for the trajectories
+        uint16_t TRAJDIM = acqhdr.trajectory_dimensions;
+        if (TRAJDIM > 0)
+          {
+            dataBuffer.trajectory_.create(TRAJDIM, NE0, NE1, NE2, NN, NS, NLOC);
+            clear(&dataBuffer.trajectory_);
+          }
+
+        //boost::shared_ptr< std::vector<size_t> > dims =  dataBuffer.data_.get_dimensions();
+        //GDEBUG_STREAM("NDArray dims: ");
+        //for( std::vector<size_t>::const_iterator i = dims->begin(); i != dims->end(); ++i) {
+        //    GDEBUG_STREAM(*i << ' ');
+        //}
+        //GDEBUG_STREAM(std::endl);
+      }
+
+  }
+
+  void BucketToBufferGadget::fillSamplingDescription(SamplingDescription & sampling, ISMRMRD::Encoding & encoding, IsmrmrdAcquisitionBucketStats & stats)
+  {
+    // For cartesian trajectories, assume that any oversampling has been removed.
+    if (encoding.trajectory.compare("cartesian") == 0) {
+        sampling.encoded_FOV_[0] = encoding.reconSpace.fieldOfView_mm.x;
+        sampling.encoded_matrix_[0] = encoding.reconSpace.matrixSize.x;
+    } else {
+        sampling.encoded_FOV_[0] = encoding.encodedSpace.fieldOfView_mm.x;
+        sampling.encoded_matrix_[0] = encoding.encodedSpace.matrixSize.x;
+    }
+
+    sampling.encoded_FOV_[1] = encoding.encodedSpace.fieldOfView_mm.y;
+    sampling.encoded_FOV_[2] = encoding.encodedSpace.fieldOfView_mm.z;
+
+    sampling.encoded_matrix_[1] = encoding.encodedSpace.matrixSize.y;
+    sampling.encoded_matrix_[2] = encoding.encodedSpace.matrixSize.z;
+
+    sampling.recon_FOV_[0] = encoding.reconSpace.fieldOfView_mm.x;
+    sampling.recon_FOV_[1] = encoding.reconSpace.fieldOfView_mm.y;
+    sampling.recon_FOV_[2] = encoding.reconSpace.fieldOfView_mm.z;
+
+    sampling.recon_matrix_[0] = encoding.reconSpace.matrixSize.x;
+    sampling.recon_matrix_[1] = encoding.reconSpace.matrixSize.y;
+    sampling.recon_matrix_[2] = encoding.reconSpace.matrixSize.z;
+
+    // For cartesian trajectories, assume that any oversampling has been removed.
+    if (encoding.trajectory.compare("cartesian") == 0) {
+        sampling.sampling_limits_[0].min_ = 0;
+        sampling.sampling_limits_[0].max_ = encoding.reconSpace.matrixSize.x - 1;
+        sampling.sampling_limits_[0].center_ = encoding.reconSpace.matrixSize.x / 2;
+    } else {
+        sampling.sampling_limits_[0].min_ = 0;
+        sampling.sampling_limits_[0].max_ = encoding.encodedSpace.matrixSize.x - 1;
+        sampling.sampling_limits_[0].center_ = encoding.encodedSpace.matrixSize.x / 2;
+    }
+
+    sampling.sampling_limits_[1].min_ =
+        encoding.encodingLimits.kspace_encoding_step_1->minimum;
+    sampling.sampling_limits_[1].max_ =
+        encoding.encodingLimits.kspace_encoding_step_1->maximum;
+    sampling.sampling_limits_[1].center_ =
+        encoding.encodingLimits.kspace_encoding_step_1->center;
+
+    sampling.sampling_limits_[2].min_ =
+        encoding.encodingLimits.kspace_encoding_step_2->minimum;
+    sampling.sampling_limits_[2].max_ =
+        encoding.encodingLimits.kspace_encoding_step_2->maximum;
+    sampling.sampling_limits_[2].center_ =
+        encoding.encodingLimits.kspace_encoding_step_2->center;
+  }
+
+  void BucketToBufferGadget::stuff(std::vector<IsmrmrdAcquisitionData>::iterator it, IsmrmrdDataBuffered & dataBuffer, ISMRMRD::Encoding encoding)
+  {
+
+    // The acquisition header and data
+    ISMRMRD::AcquisitionHeader & acqhdr = *it->head_->getObjectPtr();
+    hoNDArray< std::complex<float> > & acqdata = *it->data_->getObjectPtr();
+    // we make one for the trajectory down below if we need it
+
+    size_t slice_loc;
+    if (split_slices_)
+      {
+        slice_loc = 0;
+      }
+    else
+      {
+        slice_loc = acqhdr.idx.slice;
+      }
+
+    //Stuff the data
+    uint16_t npts_to_copy = acqhdr.number_of_samples - acqhdr.discard_pre - acqhdr.discard_post;
+    long long offset;
+    if (encoding.trajectory.compare("cartesian") == 0) {
+        offset  = (long long) dataBuffer.sampling_.sampling_limits_[0].center_ - (long long) acqhdr.center_sample;
+    } else {
+        //TODO what about EPI with asymmetric readouts?
+        //TODO any other sort of trajectory?
+        offset = 0;
+    }
+    long long roffset = (long long) dataBuffer.data_.get_size(0) - npts_to_copy - offset;
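+
+    // Illustrative example (editor's note): a Cartesian readout of 128 samples (no
+    // discarded samples) with center_sample = 64 landing in a 256-point buffer whose
+    // centre is 128 gives offset = 128 - 64 = 64 and roffset = 256 - 128 - 64 = 64,
+    // i.e. the acquired samples are centred with 64 zero samples on either side.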
+
+    //GDEBUG_STREAM("Num_samp: "<< acqhdr.number_of_samples << ", pre: " << acqhdr.discard_pre << ", post" << acqhdr.discard_post << std::endl);
+    //std::cout << "Sampling limits: "
+    //    << "  min: " << dataBuffer.sampling_.sampling_limits_[0].min_
+    //    << "  max: " << dataBuffer.sampling_.sampling_limits_[0].max_
+    //    << "  center: " << dataBuffer.sampling_.sampling_limits_[0].center_
+    //    << std::endl;
+    //GDEBUG_STREAM("npts_to_copy = " << npts_to_copy  << std::endl);
+    //GDEBUG_STREAM("offset = " << offset  << std::endl);
+    //GDEBUG_STREAM("loffset = " << roffset << std::endl);
+
+    if ((offset < 0) | (roffset < 0) )
+      {
+        throw std::runtime_error("Acquired reference data does not fit into the reference data buffer.\n");
+      }
+
+    std::complex<float> *dataptr;
+    uint16_t NCHA = dataBuffer.data_.get_size(3);
+    for (uint16_t cha = 0; cha < NCHA; cha++)
+      {
+        dataptr = & dataBuffer.data_(
+            offset, acqhdr.idx.kspace_encode_step_1, acqhdr.idx.kspace_encode_step_2, cha, getN(acqhdr.idx),  getS(acqhdr.idx), slice_loc);
+
+
+        memcpy(dataptr, &acqdata(acqhdr.discard_pre, cha), sizeof(std::complex<float>)*npts_to_copy);
+      }
+
+    //Stuff the header
+    dataBuffer.headers_(acqhdr.idx.kspace_encode_step_1,
+        acqhdr.idx.kspace_encode_step_2, getN(acqhdr.idx),  getS(acqhdr.idx), slice_loc) = acqhdr;
+
+    //Stuff the trajectory
+    if (acqhdr.trajectory_dimensions > 0) {
+        hoNDArray< float > & acqtraj = *it->traj_->getObjectPtr();  // TODO do we need to check this?
+
+        float * trajptr;
+        trajptr = &dataBuffer.trajectory_(0,
+            offset, acqhdr.idx.kspace_encode_step_1, acqhdr.idx.kspace_encode_step_2, getN(acqhdr.idx),  getS(acqhdr.idx), slice_loc);
+        memcpy(trajptr, & acqtraj(0,acqhdr.discard_pre ), sizeof(float)*npts_to_copy*acqhdr.trajectory_dimensions);
+
+    }
+  }
+
+  GADGET_FACTORY_DECLARE(BucketToBufferGadget)
+
+}
diff --git a/gadgets/mri_core/BucketToBufferGadget.h b/gadgets/mri_core/BucketToBufferGadget.h
new file mode 100644
index 0000000..06e23e1
--- /dev/null
+++ b/gadgets/mri_core/BucketToBufferGadget.h
@@ -0,0 +1,77 @@
+#ifndef BUCKETTOBUFFER_H
+#define BUCKETTOBUFFER_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <ismrmrd/xml.h>
+#include <complex>
+#include <map>
+#include "mri_core_data.h"
+
+namespace Gadgetron{
+
+    // TODO the ignore_segment_ flag is a hack for some EPI sequences
+    // should be fixed on the converter side.
+
+  class EXPORTGADGETSMRICORE BucketToBufferGadget : 
+  public Gadget1<IsmrmrdAcquisitionBucket>
+    {
+    public:
+      GADGET_DECLARE(BucketToBufferGadget);
+
+      virtual ~BucketToBufferGadget();
+
+      int close(unsigned long flags);
+      
+    protected:
+      GADGET_PROPERTY_LIMITS(N_dimension, std::string, "N-Dimensions", "", 
+			     GadgetPropertyLimitsEnumeration,
+			     "average",
+			     "contrast",
+			     "phase",
+			     "repetition",
+			     "set",
+			     "segment",
+			     "slice",
+			     "");
+
+      GADGET_PROPERTY_LIMITS(S_dimension, std::string, "S-Dimensions", "", 
+			     GadgetPropertyLimitsEnumeration,
+			     "average",
+			     "contrast",
+			     "phase",
+			     "repetition",
+			     "set",
+			     "segment",
+			     "slice",
+			     "");
+
+      GADGET_PROPERTY(split_slices, bool, "Split slices", false);
+      GADGET_PROPERTY(ignore_segment, bool, "Ignore segment", false);
+
+      IsmrmrdCONDITION N_;
+      IsmrmrdCONDITION S_;
+      bool split_slices_;
+      bool ignore_segment_;
+      ISMRMRD::IsmrmrdHeader hdr_;
+      
+      virtual int process_config(ACE_Message_Block* mb);
+      virtual int process(GadgetContainerMessage<IsmrmrdAcquisitionBucket>* m1);
+      size_t getKey(ISMRMRD::ISMRMRD_EncodingCounters idx);
+      size_t getSlice(ISMRMRD::ISMRMRD_EncodingCounters idx);
+      size_t getN(ISMRMRD::ISMRMRD_EncodingCounters idx);
+      size_t getS(ISMRMRD::ISMRMRD_EncodingCounters idx);
+
+      IsmrmrdReconBit & getRBit(std::map<size_t, GadgetContainerMessage<IsmrmrdReconData>* > & recon_data_buffers, size_t key, uint16_t espace);
+      void allocateDataArrays(IsmrmrdDataBuffered &  dataBuffer, ISMRMRD::AcquisitionHeader & acqhdr, ISMRMRD::Encoding encoding, IsmrmrdAcquisitionBucketStats & stats);
+      void fillSamplingDescription(SamplingDescription & sampling, ISMRMRD::Encoding & encoding, IsmrmrdAcquisitionBucketStats & stats);
+      void stuff(std::vector<IsmrmrdAcquisitionData>::iterator it, IsmrmrdDataBuffered & dataBuffer, ISMRMRD::Encoding encoding);
+
+    };
+
+  
+}
+#endif //BUCKETTOBUFFER_H
diff --git a/gadgets/mri_core/CMakeLists.txt b/gadgets/mri_core/CMakeLists.txt
new file mode 100644
index 0000000..ea4bf3c
--- /dev/null
+++ b/gadgets/mri_core/CMakeLists.txt
@@ -0,0 +1,163 @@
+IF (WIN32)
+    ADD_DEFINITIONS(-D__BUILD_GADGETRON_MRICORE__)
+ENDIF (WIN32)
+
+find_package(Ismrmrd REQUIRED)
+
+if (MKL_FOUND)
+    # This is a fix for the bug in SVD when MKL is multi-threaded
+    MESSAGE("MKL Found, enabling MKL for mri_core gadgets.")
+    add_definitions(-DHAVE_MKL)
+    # These are needed to get the linking to work properly when
+    # MKL is installed, but Armadillo is NOT using it.
+    list(APPEND EXTRA_MKL_LIBRARIES mkl_core)
+    list(APPEND EXTRA_MKL_LIBRARIES mkl_intel_thread)
+    INCLUDE_DIRECTORIES( ${MKL_INCLUDE_DIR} )
+    LINK_DIRECTORIES( ${MKL_LIB_DIR} ${MKL_COMPILER_LIB_DIR} )
+endif (MKL_FOUND)
+
+include_directories(
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+    ${CMAKE_SOURCE_DIR}/toolboxes/core
+    ${CMAKE_SOURCE_DIR}/toolboxes/log
+    ${CMAKE_SOURCE_DIR}/toolboxes/mri_core
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/algorithm
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/mri_core
+    ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/fft/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/fft/gpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/util
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/workflow
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/algorithm
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/solver
+    ${ARMADILLO_INCLUDE_DIRS}
+    ${MKL_INCLUDE_DIR}
+    ${ISMRMRD_INCLUDE_DIR}
+)
+
+if (ARMADILLO_FOUND)
+    list(APPEND OPTIMIZED_GADGETS NoiseAdjustGadget.cpp)
+    list(APPEND OPTIMIZED_GADGETS PCACoilGadget.cpp)
+    list(APPEND OPTIMIZED_GADGET_HEADERS NoiseAdjustGadget.h)
+    list(APPEND OPTIMIZED_GADGET_HEADERS PCACoilGadget.h)
+    list(APPEND OPTIMIZED_GADGET_LIBS gadgetron_toolbox_cpucore_math ${ARMADILLO_LIBRARIES})
+else (ARMADILLO_FOUND)
+    MESSAGE("Armadillo not found, i.e. not compiling BLAS/LAPACK optimized MRI Gadgets")
+endif (ARMADILLO_FOUND)
+
+set( gadgetron_mricore_header_files GadgetMRIHeaders.h 
+                                    AcquisitionPassthroughGadget.h 
+                                    AcquisitionFinishGadget.h 
+                                    AccumulatorGadget.h 
+                                    FFTGadget.h 
+                                    ImageFinishGadget.h 
+                                    CombineGadget.h
+                                    CropAndCombineGadget.h  
+                                    ImageWriterGadget.h 
+                                    MRIImageWriter.h 
+                                    NoiseAdjustGadget_unoptimized.h 
+                                    ExtractGadget.h 
+                                    FloatToFixPointGadget.h 
+                                    RemoveROOversamplingGadget.h 
+                                    CoilReductionGadget.h 
+                                    AutoScaleGadget.h 
+                                    FlowPhaseSubtractionGadget.h 
+                                    GadgetIsmrmrdReadWrite.h 
+                                    PhysioInterpolationGadget.h 
+                                    IsmrmrdDumpGadget.h 
+                                    AsymmetricEchoAdjustROGadget.h 
+                                    MaxwellCorrectionGadget.h 
+                                    CplxDumpGadget.h 
+                                    DependencyQueryGadget.h 
+                                    DependencyQueryWriter.h 
+                                    ComplexToFloatGadget.h
+                                    AcquisitionAccumulateTriggerGadget.h
+                                    BucketToBufferGadget.h 
+                                    ImageArraySplitGadget.h
+                                    SimpleReconGadget.h)
+
+set( gadgetron_mricore_src_files AcquisitionPassthroughGadget.cpp 
+                                AcquisitionFinishGadget.cpp 
+                                AccumulatorGadget.cpp 
+                                FFTGadget.cpp 
+                                ImageFinishGadget.cpp 
+                                CombineGadget.cpp 
+                                CropAndCombineGadget.cpp 
+                                ImageWriterGadget.cpp 
+                                MRIImageWriter.cpp 
+                                NoiseAdjustGadget_unoptimized.cpp 
+                                ExtractGadget.cpp 
+                                FloatToFixPointGadget.cpp 
+                                RemoveROOversamplingGadget.cpp
+                                CoilReductionGadget.cpp
+                                AutoScaleGadget.cpp
+                                FlowPhaseSubtractionGadget.cpp
+                                GadgetIsmrmrdReadWrite.cpp
+                                PhysioInterpolationGadget.cpp
+                                IsmrmrdDumpGadget.cpp
+                                AsymmetricEchoAdjustROGadget.cpp
+                                MaxwellCorrectionGadget.cpp
+                                CplxDumpGadget.cpp 
+                                DependencyQueryGadget.cpp 
+                                DependencyQueryWriter.cpp 
+                                ComplexToFloatGadget.cpp 
+                                AcquisitionAccumulateTriggerGadget.cpp
+                                BucketToBufferGadget.cpp
+                                ImageArraySplitGadget.cpp
+                                SimpleReconGadget.cpp  )
+
+if (WIN32)
+    set( gadgetron_mricore_header_files ${gadgetron_mricore_header_files} WhiteNoiseInjectorGadget.h )
+    set( gadgetron_mricore_src_files ${gadgetron_mricore_src_files} WhiteNoiseInjectorGadget.cpp )
+endif (WIN32)
+
+set( gadgetron_mricore_config_files
+    default.xml
+    default_short.xml
+    default_optimized.xml
+    default_measurement_dependencies.xml
+)
+
+add_library(gadgetron_mricore SHARED 
+    gadgetron_mricore_export.h 
+    ${gadgetron_mricore_header_files} 
+    ${gadgetron_mricore_src_files}
+    ${gadgetron_mricore_config_files}
+    ${OPTIMIZED_GADGETS}
+    ${OPTIMIZED_GADGET_HEADERS}
+)
+
+set_target_properties(gadgetron_mricore PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(gadgetron_mricore 
+    gadgetron_gadgetbase
+    gadgetron_toolbox_log
+    gadgetron_toolbox_cpucore
+    gadgetron_toolbox_cpufft
+    ${ISMRMRD_LIBRARIES} 
+    ${FFTW3_LIBRARIES} 
+    optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} 
+    ${BOOST_LIBRARIES}
+    ${OPTIMIZED_GADGET_LIBS}
+)
+
+install(FILES 
+    gadgetron_mricore_export.h
+    ${gadgetron_mricore_header_files}
+    ${OPTIMIZED_GADGET_HEADERS}
+    DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
+if (ARMADILLO_FOUND)
+    install(FILES ${gadgetron_mricore_config_files} DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH} COMPONENT main)
+endif (ARMADILLO_FOUND)
+
+install(FILES ismrmrd_dump.xml DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH} COMPONENT main)
+
+install(TARGETS gadgetron_mricore DESTINATION lib COMPONENT main)
diff --git a/gadgets/mri_core/CoilReductionGadget.cpp b/gadgets/mri_core/CoilReductionGadget.cpp
new file mode 100644
index 0000000..117eace
--- /dev/null
+++ b/gadgets/mri_core/CoilReductionGadget.cpp
@@ -0,0 +1,122 @@
+/*
+* CoilReductionGadget.cpp
+*
+*  Created on: Dec 5, 2011
+*      Author: hansenms
+*/
+
+#include "CoilReductionGadget.h"
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/split.hpp>
+#include "ismrmrd/xml.h"
+
+namespace Gadgetron{
+
+    CoilReductionGadget::CoilReductionGadget() {
+    }
+
+    CoilReductionGadget::~CoilReductionGadget() {
+    }
+
+    int CoilReductionGadget::process_config(ACE_Message_Block *mb)
+    {
+      ISMRMRD::IsmrmrdHeader h;
+      ISMRMRD::deserialize(mb->rd_ptr(),h);
+      
+      coils_in_ = h.acquisitionSystemInformation->receiverChannels ? *h.acquisitionSystemInformation->receiverChannels : 128;
+
+      std::string coil_mask_int = coil_mask.value();
+
+      if (coil_mask_int.compare(std::string("")) == 0) {
+	if (coils_out.value() <= 0) {
+	  GDEBUG("Invalid number of output coils %d\n", coils_out.value());
+	  return GADGET_FAIL;
+	}
+	coil_mask_ = std::vector<unsigned short>(coils_out.value(),1);
+      } else {
+	std::vector<std::string> chm;
+	boost::split(chm, coil_mask_int, boost::is_any_of(" "));
+	for (size_t i = 0; i < chm.size(); i++) {
+	  std::string ch = boost::algorithm::trim_copy(chm[i]);
+	  if (ch.size() > 0) {
+	    size_t mv = static_cast<size_t>(ACE_OS::atoi(ch.c_str()));
+	    //GDEBUG("Coil mask value: %d\n", mv);
+	    if (mv > 0) {
+	      coil_mask_.push_back(1);
+	    } else {
+	      coil_mask_.push_back(0);
+	    }
+	  }
+	}
+      }
+      
+      while (coil_mask_.size() < coils_in_) coil_mask_.push_back(0);
+      while (coil_mask_.size() > coils_in_) coil_mask_.pop_back();
+      
+      if (coil_mask_.size() != coils_in_) {
+	GDEBUG("Error configuring coils for coil reduction\n");
+	return GADGET_FAIL;
+      }
+      
+      coils_out_ = 0;
+      for (size_t i = 0; i < coil_mask_.size(); i++) {
+	if (coil_mask_[i]) coils_out_++;
+      }
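+
+      // Illustrative example (editor's note): with coil_mask = "1 1 0 1" and 8 receive
+      // channels, coil_mask_ becomes {1,1,0,1,0,0,0,0}, coils_out_ = 3 and only
+      // channels 0, 1 and 3 are kept by process().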
+      
+      GDEBUG("Coil reduction from %d to %d\n", coils_in_, coils_out_);
+      
+      return GADGET_OK;
+    }
+
+
+    int CoilReductionGadget::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *m1, GadgetContainerMessage<hoNDArray<std::complex<float> > > *m2)
+    {
+        std::vector<size_t> dims_out(2);
+        dims_out[0] = m1->getObjectPtr()->number_of_samples;
+        dims_out[1] = coils_out_;
+
+        GadgetContainerMessage< hoNDArray<std::complex<float> > >* m3 =
+            new GadgetContainerMessage< hoNDArray<std::complex<float> > >();
+
+        try{ m3->getObjectPtr()->create(&dims_out);}
+        catch (std::runtime_error &err){
+            GEXCEPTION(err,"Unable to create storage for reduced dataset size\n");
+            return GADGET_FAIL;
+        }
+
+        std::complex<float>* s = m2->getObjectPtr()->get_data_ptr();
+        std::complex<float>* d = m3->getObjectPtr()->get_data_ptr();
+        size_t samples =  m1->getObjectPtr()->number_of_samples;
+        size_t coils_copied = 0;
+        for (int c = 0; c < m1->getObjectPtr()->active_channels; c++) {
+            if (static_cast<size_t>(c) >= coil_mask_.size()) {
+                GDEBUG("Fatal error, too many coils for coil mask\n");
+                m3->release();
+                return GADGET_FAIL;
+            }
+            if (coil_mask_[c]) {
+                memcpy(d+coils_copied*samples,s+c*samples,sizeof(std::complex<float>)*samples);
+                coils_copied++;
+            }
+        }
+
+        m1->cont(m3);
+	
+	//In case trajectories are attached
+	m3->cont(m2->cont());
+	m2->cont(0);
+
+        m2->release();
+
+        m1->getObjectPtr()->active_channels = coils_out_;
+	
+        if( this->next()->putq(m1) < 0 ){
+	  GDEBUG("Failed to put message on queue\n");
+	  return GADGET_FAIL;
+	}
+	
+	return GADGET_OK;
+    }
+
+    GADGET_FACTORY_DECLARE(CoilReductionGadget)
+}
diff --git a/gadgets/mri_core/CoilReductionGadget.h b/gadgets/mri_core/CoilReductionGadget.h
new file mode 100644
index 0000000..cdae85a
--- /dev/null
+++ b/gadgets/mri_core/CoilReductionGadget.h
@@ -0,0 +1,35 @@
+#ifndef COILREDUCTIONGADGET_H_
+#define COILREDUCTIONGADGET_H_
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+class EXPORTGADGETSMRICORE CoilReductionGadget :
+  public Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+    {
+    public:
+      GADGET_DECLARE(CoilReductionGadget);
+      
+      CoilReductionGadget();
+      virtual ~CoilReductionGadget();
+      
+      virtual int process_config(ACE_Message_Block* mb);
+      virtual int process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1,
+			  GadgetContainerMessage< hoNDArray< std::complex<float> > > * m2);
+      
+    protected:
+      GADGET_PROPERTY(coil_mask, std::string, "Space-separated mask of zeros and ones, e.g. 0 0 0 1 1 1 0 0 0, indicating which coils to keep", "");
+      GADGET_PROPERTY_LIMITS(coils_out, int, "Number of coils to keep, coils with higher indices will be discarded", 128,
+			     GadgetPropertyLimitsRange, 1, 1024);
+      std::vector<unsigned short> coil_mask_;
+      unsigned int coils_in_;
+      unsigned int coils_out_;      
+    };
+}
+#endif /* COILREDUCTIONGADGET_H_ */
diff --git a/gadgets/mri_core/CombineGadget.cpp b/gadgets/mri_core/CombineGadget.cpp
new file mode 100644
index 0000000..031510f
--- /dev/null
+++ b/gadgets/mri_core/CombineGadget.cpp
@@ -0,0 +1,69 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "CombineGadget.h"
+
+namespace Gadgetron{
+
+  CombineGadget::CombineGadget() {}
+  CombineGadget::~CombineGadget() {}
+
+int CombineGadget::
+process( GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+	 GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+
+  // Get the dimensions
+  size_t nx = m2->getObjectPtr()->get_size(0);
+  size_t ny = m2->getObjectPtr()->get_size(1);
+  size_t nz = m2->getObjectPtr()->get_size(2);
+  size_t nc = m2->getObjectPtr()->get_size(3);
+
+  // Create a new message with an hoNDArray for the combined image
+  GadgetContainerMessage< hoNDArray<std::complex<float> > >* m3 = 
+    new GadgetContainerMessage< hoNDArray<std::complex<float> > >();
+
+  std::vector<size_t> dimensions(3);
+  dimensions[0] = nx;
+  dimensions[1] = ny; 
+  dimensions[2] = nz;
+
+  try{m3->getObjectPtr()->create(&dimensions);}
+  catch (std::runtime_error &err){
+  	GEXCEPTION(err,"CombineGadget, failed to allocate new array\n");
+    return -1;
+  }
+
+  std::complex<float>* d1 = m2->getObjectPtr()->get_data_ptr();
+  std::complex<float>* d2 = m3->getObjectPtr()->get_data_ptr();
+
+  size_t img_block = nx*ny*nz;
+
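+  // Channel combination: the output magnitude is the root-sum-of-squares of
+  // the per-channel magnitudes, and the output phase is a |s_c|^2-weighted
+  // sum of the per-channel phases (the weights are not normalized here).
+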
+  for (size_t z = 0; z < nz; z++) {
+    for (size_t y = 0; y < ny; y++) {
+      for (size_t x = 0; x < nx; x++) {
+	float mag = 0;
+	float phase = 0;
+	size_t offset = z*ny*nx+y*nx+x;
+	for (size_t c = 0; c < nc; c++) {
+	  float mag_tmp = norm(d1[offset + c*img_block]);
+	  phase += mag_tmp*arg(d1[offset + c*img_block]);
+	  mag += mag_tmp;
+	}
+	d2[offset] = std::polar(std::sqrt(mag),phase);
+      }
+    }
+  }
+
+  // Modify header to reflect that the channels have been combined into one
+  m1->getObjectPtr()->channels = 1;
+
+  // Now add the new array to the outgoing message
+  m1->cont(m3);
+
+  // Release the old data
+  m2->release();
+
+  return this->next()->putq(m1);
+}
+
+GADGET_FACTORY_DECLARE(CombineGadget)
+}
diff --git a/gadgets/mri_core/CombineGadget.h b/gadgets/mri_core/CombineGadget.h
new file mode 100644
index 0000000..fc3e89b
--- /dev/null
+++ b/gadgets/mri_core/CombineGadget.h
@@ -0,0 +1,27 @@
+#ifndef COMBINEGADGET_H
+#define COMBINEGADGET_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "hoArmadillo.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+  
+  class  EXPORTGADGETSMRICORE CombineGadget : 
+  public Gadget2<ISMRMRD::ImageHeader, hoNDArray< std::complex<float> > >
+    {
+    public:
+      CombineGadget();
+      virtual ~CombineGadget();
+      
+    protected:
+      virtual int process( GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+			   GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);     
+    };
+}
+
+#endif //COMBINEGADGET_H
diff --git a/gadgets/mri_core/ComplexToFloatGadget.cpp b/gadgets/mri_core/ComplexToFloatGadget.cpp
new file mode 100644
index 0000000..0cac825
--- /dev/null
+++ b/gadgets/mri_core/ComplexToFloatGadget.cpp
@@ -0,0 +1,90 @@
+/*
+*       ComplexToFloatGadget.cpp
+*       Author: Hui Xue
+*/
+
+#include "GadgetIsmrmrdReadWrite.h"
+#include "ComplexToFloatGadget.h"
+#include "hoNDArray_elemwise.h"
+
+namespace Gadgetron
+{
+    ComplexToFloatGadget::ComplexToFloatGadget()
+    {
+    }
+
+    ComplexToFloatGadget::~ComplexToFloatGadget()
+    {
+    }
+
+    int ComplexToFloatGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1, GadgetContainerMessage< hoNDArray< ValueType > >* m2)
+    {
+        GadgetContainerMessage<hoNDArray< float > > *cm2 = new GadgetContainerMessage<hoNDArray< float > >();
+
+        boost::shared_ptr< std::vector<size_t> > dims = m2->getObjectPtr()->get_dimensions();
+
+        try
+        {
+            cm2->getObjectPtr()->create(dims);
+        }
+        catch (std::runtime_error &err)
+        {
+            GEXCEPTION(err,"Unable to create float storage in ComplexToFloatGadget");
+            return GADGET_FAIL;
+        }
+
+        switch (m1->getObjectPtr()->image_type)
+        {
+            case ISMRMRD::ISMRMRD_IMTYPE_MAGNITUDE:
+            {
+                GADGET_CHECK_EXCEPTION_RETURN(Gadgetron::abs(*m2->getObjectPtr(), *cm2->getObjectPtr()), GADGET_FAIL);
+            }
+            break;
+
+            case ISMRMRD::ISMRMRD_IMTYPE_REAL:
+            {
+                GADGET_CHECK_EXCEPTION_RETURN(Gadgetron::complex_to_real(*m2->getObjectPtr(), *cm2->getObjectPtr()), GADGET_FAIL);
+            }
+            break;
+
+            case ISMRMRD::ISMRMRD_IMTYPE_IMAG:
+            {
+                GADGET_CHECK_EXCEPTION_RETURN(Gadgetron::complex_to_imag(*m2->getObjectPtr(), *cm2->getObjectPtr()), GADGET_FAIL);
+            }
+            break;
+
+            case ISMRMRD::ISMRMRD_IMTYPE_PHASE:
+            {
+                GADGET_CHECK_EXCEPTION_RETURN(Gadgetron::argument(*m2->getObjectPtr(), *cm2->getObjectPtr()), GADGET_FAIL);
+            }
+            break;
+
+            default:
+                GDEBUG("Unknown image type %d, bailing out\n",m1->getObjectPtr()->image_type);
+                m1->release();
+                cm2->release();
+                return GADGET_FAIL;
+        }
+
+        GadgetContainerMessage<ISMRMRD::MetaContainer>* m3 = AsContainerMessage<ISMRMRD::MetaContainer>(m2->cont());
+
+        m1->cont(cm2);
+        if(m3) cm2->cont(m3);
+
+        m2->cont(NULL);
+        m2->release();
+
+        m1->getObjectPtr()->data_type = ISMRMRD::ISMRMRD_FLOAT;
+
+        if (this->next()->putq(m1) == -1)
+        {
+            m1->release();
+            GDEBUG("Unable to put converted float image on next gadget's queue");
+            return GADGET_FAIL;
+        }
+
+        return GADGET_OK;
+    }
+
+    GADGET_FACTORY_DECLARE(ComplexToFloatGadget)
+}
diff --git a/gadgets/mri_core/ComplexToFloatGadget.h b/gadgets/mri_core/ComplexToFloatGadget.h
new file mode 100644
index 0000000..175a257
--- /dev/null
+++ b/gadgets/mri_core/ComplexToFloatGadget.h
@@ -0,0 +1,34 @@
+/** \file   ComplexToFloatGadget.h
+    \brief  This Gadget converts complex-float images to real-valued float images (magnitude, real, imaginary, or phase, depending on the image type).
+    \author Hui Xue
+*/
+
+#ifndef ComplexToFloatGadget_H_
+#define ComplexToFloatGadget_H_
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "ismrmrd/meta.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+
+namespace Gadgetron
+{
+    class EXPORTGADGETSMRICORE ComplexToFloatGadget:public Gadget2<ISMRMRD::ImageHeader, hoNDArray< std::complex<float> > >
+    {
+    public:
+
+        GADGET_DECLARE(ComplexToFloatGadget);
+
+        typedef std::complex<float> ValueType;
+
+        ComplexToFloatGadget();
+        virtual ~ComplexToFloatGadget();
+
+    protected:
+        virtual int process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1, GadgetContainerMessage< hoNDArray< ValueType > >* m2);
+    };
+}
+
+#endif // ComplexToFloatGadget_H_
diff --git a/gadgets/mri_core/CplxDumpGadget.cpp b/gadgets/mri_core/CplxDumpGadget.cpp
new file mode 100644
index 0000000..a17f49e
--- /dev/null
+++ b/gadgets/mri_core/CplxDumpGadget.cpp
@@ -0,0 +1,137 @@
+#include "CplxDumpGadget.h"
+#include "hoNDArray_fileio.h"
+#include "hoNDArray_utils.h"
+
+namespace Gadgetron{
+
+  CplxDumpGadget::CplxDumpGadget() 
+    : Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > >()
+    , buffer_(ACE_Message_Queue_Base::DEFAULT_HWM * 10000, ACE_Message_Queue_Base::DEFAULT_LWM * 10000)
+  {
+  }
+
+  CplxDumpGadget::~CplxDumpGadget() {}
+
+  int CplxDumpGadget::process_config(ACE_Message_Block* mb)
+  {
+    filename_ = filename.value();
+    return GADGET_OK;
+  }
+
+  int CplxDumpGadget::close(unsigned long flags) {
+    
+    GDEBUG("CplxDumpGadget::close...\n");
+    GDEBUG("Number of items on Q: %d\n", buffer_.message_count());
+
+    int ret = Gadget::close(flags);
+    unsigned int readouts_buffered = buffer_.message_count();
+
+    if( readouts_buffered == 0 )
+      return GADGET_OK;
+    
+    // Get the array size from the dimensions of the first buffer entry
+    //
+
+    ACE_Message_Block* mbq;
+    if (buffer_.dequeue_head(mbq) < 0) {
+      GDEBUG("Message dequeue failed\n");
+      return GADGET_FAIL;
+    }
+
+    GadgetContainerMessage< hoNDArray< std::complex<float> > > *daq = AsContainerMessage<hoNDArray< std::complex<float> > >(mbq);
+    
+    if (!daq) {
+      GDEBUG("Unable to interpret data on message queue\n");
+      return GADGET_FAIL;
+    }
+
+    hoNDArray< std::complex<float> > *entry = daq->getObjectPtr();
+    std::vector<size_t> dims_profile = *entry->get_dimensions();
+    std::vector<size_t> dims = dims_profile;
+    dims.push_back(readouts_buffered);
+
+    // Allocate array for result
+    //
+
+    hoNDArray< std::complex<float> > result( &dims );
+
+    // And copy over the first profile
+    //
+
+    {
+      hoNDArray< std::complex<float> > tmp( &dims_profile, result.get_data_ptr() );
+      tmp = *entry;
+    }
+
+    mbq->release();
+    
+    // Copy the remaining profiles to the array
+    //
+    
+    for (unsigned int i = 1; i < readouts_buffered; i++) {
+      
+      if (buffer_.dequeue_head(mbq) < 0) {
+        GDEBUG("Message dequeue failed\n");
+        return GADGET_FAIL;
+      }
+      
+      daq = AsContainerMessage<hoNDArray< std::complex<float> > >(mbq);
+      
+      if (!daq) {
+        GDEBUG("Unable to interpret data on message queue\n");
+        return GADGET_FAIL;
+      }
+      
+      entry = daq->getObjectPtr();
+      hoNDArray< std::complex<float> > tmp( &dims_profile, result.get_data_ptr()+i*entry->get_number_of_elements() );
+      tmp = *entry;
+      mbq->release();
+    }      
+  
+    // Reshape to get the coil dimension as the last
+    //
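+    // (each buffered profile is typically [samples, channels], so the stacked
+    // buffer is [samples, channels, readouts]; the {0,2,1} permutation below
+    // yields [samples, readouts, channels])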
+  
+    std::vector<size_t> order; order.push_back(0); order.push_back(2); order.push_back(1);
+    result = *permute( &result, &order);
+
+    // Write out the result
+    //
+  
+    write_nd_array< std::complex<float> >( &result, filename_.c_str() );
+  
+    return GADGET_OK;
+  }
+  
+  int CplxDumpGadget::
+  process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+          GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+  {
+    
+    // Noise should have been consumed by the noise adjust, but just in case...
+    //
+    
+    bool is_noise = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_NOISE_MEASUREMENT).isSet(m1->getObjectPtr()->flags);
+    if (is_noise) {
+      m1->release();
+      return GADGET_OK;
+    }
+    
+    GadgetContainerMessage< hoNDArray< std::complex<float> > >* copy = new GadgetContainerMessage< hoNDArray< std::complex<float> > >;
+    *copy->getObjectPtr() = *m2->getObjectPtr();
+    
+    if (buffer_.enqueue_tail(copy) < 0) {
+      GDEBUG("Failed to add profile to buffer\n");
+      copy->release();
+      return GADGET_FAIL;
+    }
+    
+    if (this->next()->putq(m1) < 0) {
+      GDEBUG("Unable to put data on queue\n");
+      return GADGET_FAIL;
+    }
+    
+    return GADGET_OK;
+  }
+  
+  GADGET_FACTORY_DECLARE(CplxDumpGadget)
+}
diff --git a/gadgets/mri_core/CplxDumpGadget.h b/gadgets/mri_core/CplxDumpGadget.h
new file mode 100644
index 0000000..f0e7bcf
--- /dev/null
+++ b/gadgets/mri_core/CplxDumpGadget.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETSMRICORE CplxDumpGadget : 
+  public Gadget2<ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+    {
+    public:
+      GADGET_DECLARE(CplxDumpGadget);
+
+      CplxDumpGadget();
+      ~CplxDumpGadget();
+
+    protected:
+      GADGET_PROPERTY(filename, std::string, "Filename of dumpfile", "profiles.cplx");
+
+      virtual int process_config(ACE_Message_Block* mb);
+
+      virtual int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+			  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+
+      virtual int close(unsigned long flags); //All the work is done here in this Gadget
+
+    private:
+      std::string filename_;
+      ACE_Message_Queue<ACE_MT_SYNCH> buffer_;
+    };
+}
diff --git a/gadgets/mri_core/CropAndCombineGadget.cpp b/gadgets/mri_core/CropAndCombineGadget.cpp
new file mode 100644
index 0000000..a384b02
--- /dev/null
+++ b/gadgets/mri_core/CropAndCombineGadget.cpp
@@ -0,0 +1,70 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "CropAndCombineGadget.h"
+
+namespace Gadgetron{
+int CropAndCombineGadget::
+process( GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+	 GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+
+
+  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m3 = 
+    new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+
+  std::vector<size_t> new_dimensions(3);
+  new_dimensions[0] = m2->getObjectPtr()->get_size(0)>>1;
+  new_dimensions[1] = m2->getObjectPtr()->get_size(1);
+  new_dimensions[2] = m2->getObjectPtr()->get_size(2);
+
+  try{m3->getObjectPtr()->create(&new_dimensions);}
+  catch (std::runtime_error &err){
+  	GEXCEPTION(err,"CropAndCombineGadget, failed to allocate new array\n");
+    return -1;
+  }
+
+  size_t dimx     = m3->getObjectPtr()->get_size(0);
+  size_t dimx_old = m2->getObjectPtr()->get_size(0);
+
+  size_t dimy = m3->getObjectPtr()->get_size(1);
+  size_t dimz = m3->getObjectPtr()->get_size(2);
+
+  size_t channels = m2->getObjectPtr()->get_size(3);
+
+  std::complex<float>* d1 = m2->getObjectPtr()->get_data_ptr();
+  std::complex<float>* d2 = m3->getObjectPtr()->get_data_ptr();
+
+  size_t img_block_old = dimx_old*dimy*dimz;
+
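+  // Keep only the central half of the readout (x) dimension, typically to
+  // remove 2x readout oversampling, and combine channels as in CombineGadget.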
+  for (size_t z = 0; z < dimz; z++) {
+    for (size_t y = 0; y < dimy; y++) {
+      for (size_t x = 0; x < dimx; x++) {
+	float mag = 0;
+	float phase = 0;
+	size_t offset_1 = z*dimy*dimx_old+y*dimx_old+x+((dimx_old-dimx)>>1);
+	size_t offset_2 = z*dimy*dimx+y*dimx+x;
+	for (size_t c = 0; c < channels; c++) {
+	  float mag_tmp = norm(d1[offset_1 + c*img_block_old]);
+	  phase += mag_tmp*arg(d1[offset_1 + c*img_block_old]);
+	  mag += mag_tmp;
+	}
+
+	d2[offset_2] = std::polar(std::sqrt(mag),phase);
+      }
+    }
+  }
+
+  //Now add the new array to the outgoing message
+  m1->cont(m3);
+  m2->release();
+
+  //Modify header to match
+  m1->getObjectPtr()->matrix_size[0] = m1->getObjectPtr()->matrix_size[0]>>1;
+  m1->getObjectPtr()->channels = 1;
+
+  m1->getObjectPtr()->field_of_view[0] = m1->getObjectPtr()->field_of_view[0]/2;
+
+  return this->next()->putq(m1);
+}
+
+GADGET_FACTORY_DECLARE(CropAndCombineGadget)
+}
diff --git a/gadgets/mri_core/CropAndCombineGadget.h b/gadgets/mri_core/CropAndCombineGadget.h
new file mode 100644
index 0000000..14bcac6
--- /dev/null
+++ b/gadgets/mri_core/CropAndCombineGadget.h
@@ -0,0 +1,25 @@
+#ifndef CROPANDCOMBINEGADGET_H
+#define CROPANDCOMBINEGADGET_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+  
+  class EXPORTGADGETSMRICORE CropAndCombineGadget : 
+  public Gadget2<ISMRMRD::ImageHeader, hoNDArray< std::complex<float> > >
+    {
+    public:
+      GADGET_DECLARE(CropAndCombineGadget);
+      
+    protected:
+      virtual int process( GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+			   GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);     
+    };
+}
+
+#endif //CROPANDCOMBINEGADGET_H
diff --git a/gadgets/mri_core/DependencyQueryGadget.cpp b/gadgets/mri_core/DependencyQueryGadget.cpp
new file mode 100644
index 0000000..692e85f
--- /dev/null
+++ b/gadgets/mri_core/DependencyQueryGadget.cpp
@@ -0,0 +1,194 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "DependencyQueryGadget.h"
+#include "gtPlusUtil.h"
+
+#include <boost/version.hpp>
+#include <boost/filesystem.hpp>
+using namespace boost::filesystem;
+
+namespace Gadgetron
+{
+    DependencyQueryGadget::DependencyQueryGadget()
+    {
+        processed_in_close_ = false;
+
+        noise_dependency_prefix_ = "GadgetronNoiseCovarianceMatrix";
+
+        noise_dependency_attrib_name_ = "NoiseDependencies";
+
+        clean_storage_while_query_ = true;
+        time_limit_in_storage_ = 24.0;
+
+        // get current time
+        std::time(&curr_time_UTC_);
+        struct tm* currTm = std::gmtime(&curr_time_UTC_);
+        curr_time_UTC_ = std::mktime(currTm);
+    }
+
+    DependencyQueryGadget::~DependencyQueryGadget()
+    {
+    }
+
+    int DependencyQueryGadget::process_config(ACE_Message_Block* mb)
+    {
+        return GADGET_OK;
+    }
+
+    int DependencyQueryGadget::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GadgetContainerMessage< hoNDArray< ValueType > >* m2)
+    {
+        return GADGET_OK;
+    }
+
+    int DependencyQueryGadget::close(unsigned long flags)
+    {
+        typedef unsigned long long size_t_type;
+
+        if ( BaseClass::close(flags) != GADGET_OK ) return GADGET_FAIL;
+
+        if ( !processed_in_close_ )
+        {
+            processed_in_close_ = true;
+
+            if ( !workingDirectory.value().empty() )
+            {
+	      noise_dependency_folder_ = workingDirectory.value();
+            }
+            else
+            {
+	      //This is an error, we should not be writing dependencies without having a working directory
+	      return GADGET_FAIL;
+            }
+            GDEBUG_STREAM("Folder to store noise dependencies is " << noise_dependency_folder_);
+
+            if ( !noise_dependency_prefix.value().empty() )
+            {
+	      noise_dependency_prefix_ = noise_dependency_prefix.value();
+            }
+
+            if ( !noise_dependency_attrib_name.value().empty() )
+            {
+	      noise_dependency_attrib_name_ = noise_dependency_attrib_name.value();
+            }
+
+            clean_storage_while_query_ = clean_storage_while_query.value();
+            GDEBUG_STREAM( "clean_storage_while_query_ is " << clean_storage_while_query_);
+
+            time_limit_in_storage_ = time_limit_in_storage.value();
+            if ( time_limit_in_storage_ < 0 )
+            {
+                time_limit_in_storage_ = 24.0;
+            }
+            GDEBUG_STREAM( "time_limit_in_storage_ is " << time_limit_in_storage_);
+
+            // list the content in the noise dependency folder
+            path p (noise_dependency_folder_);
+
+            try
+            {
+                if ( exists(p) )
+                {
+                    if ( is_directory(p) )
+                    {
+                        typedef std::vector<path> vec;
+                        vec v;
+                        v.reserve(100);
+
+                        copy(directory_iterator(p), directory_iterator(), back_inserter(v));
+                        sort(v.begin(), v.end());
+
+                        GDEBUG_STREAM( "A total of " << v.size() << " dependency measurements are found ... ");
+
+                        // if needed, clean the storage first
+                        std::string filename;
+
+                        if ( clean_storage_while_query_ )
+                        {
+                            Gadgetron::gtPlus::gtPlusUtil<ValueType> gt_util;
+
+                            for (vec::const_iterator it (v.begin()); it != v.end(); ++it)
+                            {
+                                filename = it->string();
+
+                                // find the file creation/modification time
+                                std::time_t lastWriteTime = last_write_time(*it);
+                                struct tm* lastWriteTm = std::gmtime(&lastWriteTime);
+                                lastWriteTime = std::mktime(lastWriteTm);
+
+                                if ( std::abs( (double)lastWriteTime - (double)curr_time_UTC_ ) > time_limit_in_storage_*3600.0 )
+                                {
+                                    remove(*it);
+                                }
+                            }
+
+                            // update the file list
+                            v.clear();
+                            copy(directory_iterator(p), directory_iterator(), back_inserter(v));
+                            sort(v.begin(), v.end());
+
+                            GDEBUG_STREAM( "A total of " << v.size() << " dependency measurements are found after cleaning ... ");
+                        }
+
+                        // declare the attributes
+                        Gadgetron::GadgetContainerMessage<ISMRMRD::MetaContainer>* m1 = new Gadgetron::GadgetContainerMessage<ISMRMRD::MetaContainer>();
+
+                        size_t count = 0;
+                        size_t ind;
+
+                        for (vec::const_iterator it (v.begin()); it != v.end(); ++it)
+                        {
+#                       if BOOST_VERSION < 104600
+                            filename = it->filename();
+#                       else
+                            filename = it->filename().string();
+#                       endif
+                            ind = filename.find(noise_dependency_prefix_);
+
+                            if ( ind != std::string::npos )
+                            {
+                                m1->getObjectPtr()->append(noise_dependency_attrib_name_.c_str(), filename.c_str());
+                                count++;
+                            }
+                        }
+
+                        GDEBUG_STREAM( "A total of " << count << " noise dependency measurements are found ... ");
+
+                        if ( count == 0 )
+                        {
+                            // put into a dummy item
+                            m1->getObjectPtr()->set(noise_dependency_attrib_name_.c_str(), "Dummy");
+                        }
+
+                        // send the found dependencies
+                        GadgetContainerMessage<GadgetMessageIdentifier>* mb = new GadgetContainerMessage<GadgetMessageIdentifier>();
+                        mb->getObjectPtr()->id = GADGET_MESSAGE_DEPENDENCY_QUERY;
+                        mb->cont(m1);
+
+                        int ret =  this->controller_->output_ready(mb);
+                        if ( (ret < 0) )
+                        {
+                            GDEBUG("Failed to return message to controller\n");
+                            return GADGET_FAIL;
+                        }
+                    }
+                    else
+                    {
+                        GERROR_STREAM( noise_dependency_folder_ << " is not a valid folder ... ");
+                    }
+                }
+                else
+                {
+                    GERROR_STREAM("Cannot find dependency folder : " << noise_dependency_folder_);
+                }
+            }
+            catch (const filesystem_error& ex)
+            {
+                GERROR_STREAM( ex.what() );
+            }
+        }
+
+        return GADGET_OK;
+    }
+
+    GADGET_FACTORY_DECLARE(DependencyQueryGadget)
+
+} // namespace Gadgetron
diff --git a/gadgets/mri_core/DependencyQueryGadget.h b/gadgets/mri_core/DependencyQueryGadget.h
new file mode 100644
index 0000000..98ae493
--- /dev/null
+++ b/gadgets/mri_core/DependencyQueryGadget.h
@@ -0,0 +1,57 @@
+#pragma once
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+#include <ctime>
+#include "GadgetMRIHeaders.h"
+#include "ismrmrd/meta.h"
+#include "GadgetStreamController.h"
+
+namespace Gadgetron
+{
+    class EXPORTGADGETSMRICORE DependencyQueryGadget : public Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > >
+    {
+    public:
+        GADGET_DECLARE(DependencyQueryGadget);
+
+        typedef std::complex<float> ValueType;
+
+        typedef Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< ValueType > > BaseClass;
+
+        DependencyQueryGadget();
+        virtual ~DependencyQueryGadget();
+
+        virtual int close(unsigned long flags);
+
+    protected:
+	GADGET_PROPERTY(noise_dependency_prefix, std::string, "Prefix on noise dependency file", "");
+	GADGET_PROPERTY(noise_dependency_attrib_name, std::string, "Noise dependency attribute name", "");
+	GADGET_PROPERTY(clean_storage_while_query, bool, "Clean storage while querying", false);
+	GADGET_PROPERTY(time_limit_in_storage, float, "Time limit for storing noise dependency", 0);
+
+        // if true, the old stored file will be deleted while querying
+        bool clean_storage_while_query_;
+
+        // in the unit of hours, how long a file is allowed to be in the storage
+        double time_limit_in_storage_;
+
+        // current time, year/month/day/hour/min/second
+        std::time_t curr_time_UTC_;
+
+        bool processed_in_close_;
+
+        std::string noise_dependency_folder_;
+        std::string noise_dependency_prefix_;
+
+        std::string noise_dependency_attrib_name_;
+
+        virtual int process_config(ACE_Message_Block* mb);
+
+        virtual int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+            GadgetContainerMessage< hoNDArray< ValueType > >* m2);
+    };
+}
diff --git a/gadgets/mri_core/DependencyQueryWriter.cpp b/gadgets/mri_core/DependencyQueryWriter.cpp
new file mode 100644
index 0000000..d6759f6
--- /dev/null
+++ b/gadgets/mri_core/DependencyQueryWriter.cpp
@@ -0,0 +1,70 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "DependencyQueryWriter.h"
+#include "GadgetContainerMessage.h"
+
+namespace Gadgetron{
+
+  int DependencyQueryWriter::write(ACE_SOCK_Stream* sock, ACE_Message_Block* mb)
+  {
+    typedef unsigned long long size_t_type;
+
+    GadgetContainerMessage<ISMRMRD::MetaContainer>* attribmb = AsContainerMessage<ISMRMRD::MetaContainer>(mb);
+    if (!attribmb)
+      {
+	GERROR("DependencyQueryWriter::write, invalid meta attribute message objects\n");
+        return -1;
+      }
+
+    ssize_t send_cnt = 0;
+    GadgetMessageIdentifier id;
+    id.id = GADGET_MESSAGE_DEPENDENCY_QUERY;
+
+    if ((send_cnt = sock->send_n (&id, sizeof(GadgetMessageIdentifier))) <= 0)
+      {
+	GERROR("Unable to send image message identifier\n");
+	return -1;
+      }
+
+    char* buf = NULL;
+    size_t_type len(0);
+
+    try
+      {
+        std::stringstream str;
+        ISMRMRD::serialize( *attribmb->getObjectPtr(), str);
+        std::string attribContent = str.str();
+        len = attribContent.length()+1;
+
+        buf = new char[len];
+        GADGET_CHECK_THROW(buf != NULL);
+
+        memset(buf, '\0', sizeof(char)*len);
+        memcpy(buf, attribContent.c_str(), len-1);
+      }
+    catch(...)
+      {
+	GERROR("Unable to serialize image meta attributes\n");
+        return -1;
+      }
+
+    if ( (send_cnt = sock->send_n (&len, sizeof(size_t_type))) <= 0 )
+      {
+	GERROR("Unable to send image meta attributes length \n");
+	if ( buf != NULL ) delete [] buf;
+	return -1;
+      }
+
+    if ( (send_cnt = sock->send_n (buf, len)) <= 0 )
+      {
+	GERROR("Unable to send image meta attributes\n");
+	if ( buf != NULL ) delete [] buf;
+	return -1;
+      }
+
+    if ( buf != NULL ) delete [] buf;
+
+    return 0;
+  }
+
+  GADGETRON_WRITER_FACTORY_DECLARE(DependencyQueryWriter)
+}
diff --git a/gadgets/mri_core/DependencyQueryWriter.h b/gadgets/mri_core/DependencyQueryWriter.h
new file mode 100644
index 0000000..6e31abe
--- /dev/null
+++ b/gadgets/mri_core/DependencyQueryWriter.h
@@ -0,0 +1,28 @@
+/** \file   DependencyQueryWriter.h
+    \brief  Writer that returns dependency query results (meta attributes) to the client.
+    \author Hui Xue
+*/
+
+#ifndef DependencyQueryWriter_H
+#define DependencyQueryWriter_H
+
+#include "GadgetMessageInterface.h"
+#include "GadgetMRIHeaders.h"
+#include "ismrmrd/meta.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron
+{
+
+    class EXPORTGADGETSMRICORE DependencyQueryWriter : public GadgetMessageWriter
+    {
+    public:
+        GADGETRON_WRITER_DECLARE(DependencyQueryWriter)
+        virtual int write(ACE_SOCK_Stream* sock, ACE_Message_Block* mb);
+    };
+
+}
+#endif
diff --git a/gadgets/mri_core/ExtractGadget.cpp b/gadgets/mri_core/ExtractGadget.cpp
new file mode 100644
index 0000000..054dd1f
--- /dev/null
+++ b/gadgets/mri_core/ExtractGadget.cpp
@@ -0,0 +1,117 @@
+/*
+ * ExtractGadget.cpp
+ *
+ *  Created on: Nov 8, 2011
+ *      Author: Michael S. Hansen
+ */
+
+#include "GadgetIsmrmrdReadWrite.h"
+#include "ExtractGadget.h"
+
+
+namespace Gadgetron{
+ExtractGadget::ExtractGadget()
+: extract_mask_(GADGET_EXTRACT_MAGNITUDE)
+{
+
+}
+
+ExtractGadget::~ExtractGadget()
+{
+
+}
+
+int ExtractGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, GadgetContainerMessage<hoNDArray<std::complex<float> > > *m2)
+{
+        int em = extract_mask.value();
+	if (em > 0) {
+		if (em < GADGET_EXTRACT_MAX ) {
+			extract_mask_ = static_cast<unsigned short>(em);
+		}
+	}
+
+	static int counter = 0;
+	for (size_t m = GADGET_EXTRACT_MAGNITUDE; m < GADGET_EXTRACT_MAX; m = m<<1) {
+		if (extract_mask_ & m) {
+			GadgetContainerMessage<ISMRMRD::ImageHeader>* cm1 =
+					new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+
+			//Copy the header
+			*cm1->getObjectPtr() = *m1->getObjectPtr();
+
+			GadgetContainerMessage<hoNDArray< float > > *cm2 =
+					new GadgetContainerMessage<hoNDArray< float > >();
+
+			boost::shared_ptr< std::vector<size_t> > dims = m2->getObjectPtr()->get_dimensions();
+
+			try{cm2->getObjectPtr()->create(dims.get());}
+			catch (std::runtime_error &err){
+				GEXCEPTION(err,"Unable to create unsigned short storage in Extract Magnitude Gadget");
+				return GADGET_FAIL;
+			}
+
+			std::complex<float>* src = m2->getObjectPtr()->get_data_ptr();
+			float* dst = cm2->getObjectPtr()->get_data_ptr();
+
+			float pix_val;
+			for (unsigned long i = 0; i < cm2->getObjectPtr()->get_number_of_elements(); i++) {
+				switch (m) {
+				case GADGET_EXTRACT_MAGNITUDE:
+					pix_val = abs(src[i]);
+					break;
+				case GADGET_EXTRACT_REAL:
+					pix_val = real(src[i]);
+					break;
+				case GADGET_EXTRACT_IMAG:
+					pix_val = imag(src[i]);
+					break;
+				case GADGET_EXTRACT_PHASE:
+					pix_val = arg(src[i]);
+					break;
+				default:
+					GDEBUG("Unexpected extract mask %d, bailing out\n", m);
+					return GADGET_FAIL;
+				}
+				dst[i] = pix_val;
+			}
+
+			cm1->cont(cm2);
+			cm1->getObjectPtr()->data_type = ISMRMRD::ISMRMRD_FLOAT;//GADGET_IMAGE_REAL_FLOAT;
+
+			switch (m) {
+			case GADGET_EXTRACT_MAGNITUDE:
+				cm1->getObjectPtr()->image_type = ISMRMRD::ISMRMRD_IMTYPE_MAGNITUDE;//GADGET_IMAGE_MAGNITUDE;
+				break;
+			case GADGET_EXTRACT_REAL:
+				cm1->getObjectPtr()->image_type = ISMRMRD::ISMRMRD_IMTYPE_REAL;
+				cm1->getObjectPtr()->image_series_index += 1000; //Ensure that this will go in a different series
+				break;
+			case GADGET_EXTRACT_IMAG:
+				cm1->getObjectPtr()->image_type = ISMRMRD::ISMRMRD_IMTYPE_IMAG;
+				cm1->getObjectPtr()->image_series_index += 2000; //Ensure that this will go in a different series
+				break;
+			case GADGET_EXTRACT_PHASE:
+				cm1->getObjectPtr()->image_type = ISMRMRD::ISMRMRD_IMTYPE_PHASE;
+				cm1->getObjectPtr()->image_series_index += 3000; //Ensure that this will go in a different series
+				break;
+			default:
+				GDEBUG("Unexpected extract mask %d, bailing out\n", m);
+				break;
+			}
+
+
+			if (this->next()->putq(cm1) == -1) {
+				m1->release();
+				GDEBUG("Unable to put extracted images on next gadgets queue");
+				return GADGET_FAIL;
+			}
+		}
+	}
+
+	m1->release(); //We have copied all the data in this case
+	return GADGET_OK;
+}
+
+
+GADGET_FACTORY_DECLARE(ExtractGadget)
+}
diff --git a/gadgets/mri_core/ExtractGadget.h b/gadgets/mri_core/ExtractGadget.h
new file mode 100644
index 0000000..90df071
--- /dev/null
+++ b/gadgets/mri_core/ExtractGadget.h
@@ -0,0 +1,64 @@
+#ifndef EXTRACTGADGET_H_
+#define EXTRACTGADGET_H_
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "GadgetMRIHeaders.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+#define MAX_UNSIGNED_SHORT_IMAGE_VALUE
+
+//Extract flags
+#define GADGET_EXTRACT_NONE                   (0)      //0
+#define GADGET_EXTRACT_MAGNITUDE              (1 << 0) //1
+#define GADGET_EXTRACT_REAL                   (1 << 1) //2
+#define GADGET_EXTRACT_IMAG                   (1 << 2) //4
+#define GADGET_EXTRACT_PHASE                  (1 << 3) //8
+#define GADGET_EXTRACT_MAX                    (1 << 4) //16
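+//e.g. extract_mask = GADGET_EXTRACT_MAGNITUDE | GADGET_EXTRACT_PHASE (= 9)
+//emits both a magnitude image and a phase image for each input image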
+
+namespace Gadgetron{
+
+  class EXPORTGADGETSMRICORE ExtractGadget:
+  public Gadget2<ISMRMRD::ImageHeader,hoNDArray< std::complex<float> > >
+    {
+
+    public:
+      GADGET_DECLARE(ExtractGadget);
+
+      ExtractGadget();
+      virtual ~ExtractGadget();
+
+      void set_extract_mask(unsigned short mask) {
+	extract_mask_ = mask;
+      }
+
+      bool extract_magnitude() {
+	return (extract_mask_ & GADGET_EXTRACT_MAGNITUDE);
+      }
+
+      bool extract_real() {
+	return (extract_mask_ & GADGET_EXTRACT_REAL);
+      }
+
+      bool extract_imag() {
+	return (extract_mask_ & GADGET_EXTRACT_IMAG);
+      }
+
+      bool extract_phase() {
+	return (extract_mask_ & GADGET_EXTRACT_PHASE);
+      }
+
+    protected:
+      GADGET_PROPERTY(extract_mask, int, "Extract mask, bitmask MAG=1, REAL=2, IMAG=4, PHASE=8", 1);
+
+      virtual int process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+			  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+
+      unsigned short extract_mask_;
+    };
+}
+
+#endif /* EXTRACTGADGET_H_ */
diff --git a/gadgets/mri_core/FFTGadget.cpp b/gadgets/mri_core/FFTGadget.cpp
new file mode 100644
index 0000000..55e104f
--- /dev/null
+++ b/gadgets/mri_core/FFTGadget.cpp
@@ -0,0 +1,112 @@
+#include "FFTGadget.h"
+#include "hoNDFFT.h"
+
+namespace Gadgetron{
+
+FFTGadget::FFTGadget()
+  : image_counter_(0)
+{
+
+}
+
+
+int FFTGadget::process( GadgetContainerMessage<IsmrmrdReconData>* m1)
+{
+    
+    //Iterate over all the recon bits
+    for(std::vector<IsmrmrdReconBit>::iterator it = m1->getObjectPtr()->rbit_.begin();
+        it != m1->getObjectPtr()->rbit_.end(); ++it)
+    {
+        //Grab a reference to the buffer containing the imaging data
+        IsmrmrdDataBuffered & dbuff = it->data_;
+
+        //7D, fixed order [E0, E1, E2, CHA, N, S, LOC]
+        uint16_t E0 = dbuff.data_.get_size(0);
+        uint16_t E1 = dbuff.data_.get_size(1);
+        uint16_t E2 = dbuff.data_.get_size(2);
+        uint16_t CHA = dbuff.data_.get_size(3);
+        uint16_t N = dbuff.data_.get_size(4);
+        uint16_t S = dbuff.data_.get_size(5);
+        uint16_t LOC = dbuff.data_.get_size(6);
+
+        //Each image will be [E0,E1,E2,CHA] big
+        std::vector<size_t> img_dims(4);
+        img_dims[0] = E0;
+        img_dims[1] = E1;
+        img_dims[2] = E2;
+        img_dims[3] = CHA;
+
+        //Loop over S and N and LOC
+        for (uint16_t loc=0; loc < LOC; loc++) {
+            for (uint16_t s=0; s < S; s++) {                
+                for (uint16_t n=0; n < N; n++) {
+                    
+                    //Create a new image
+                    GadgetContainerMessage<ISMRMRD::ImageHeader>* cm1 = 
+                            new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+                    GadgetContainerMessage< hoNDArray< std::complex<float> > >* cm2 = 
+                            new GadgetContainerMessage<hoNDArray< std::complex<float> > >();
+                    cm1->cont(cm2);
+                    //TODO do we want an image attribute string?  
+                    try{cm2->getObjectPtr()->create(&img_dims);}
+                    catch (std::runtime_error &err){
+                        GEXCEPTION(err,"Unable to allocate new image array\n");
+                        cm1->release();
+                        return GADGET_FAIL;
+                    }
+
+                    //Set some information into the image header
+                    //Use the middle header for some info
+                    //[E1, E2, N, S, LOC]
+                    ISMRMRD::AcquisitionHeader & acqhdr = dbuff.headers_(dbuff.sampling_.sampling_limits_[1].center_,
+                                                                         dbuff.sampling_.sampling_limits_[2].center_,
+                                                                         n, s, loc);
+                    
+                    cm1->getObjectPtr()->matrix_size[0]     = E0;
+                    cm1->getObjectPtr()->matrix_size[1]     = E1;
+                    cm1->getObjectPtr()->matrix_size[2]     = E2;
+                    cm1->getObjectPtr()->field_of_view[0]   = dbuff.sampling_.recon_FOV_[0];
+                    cm1->getObjectPtr()->field_of_view[1]   = dbuff.sampling_.recon_FOV_[1];
+                    cm1->getObjectPtr()->field_of_view[2]   = dbuff.sampling_.recon_FOV_[2];
+                    cm1->getObjectPtr()->channels           = CHA;
+                    
+                    cm1->getObjectPtr()->average = acqhdr.idx.average;
+                    cm1->getObjectPtr()->slice = acqhdr.idx.slice;
+                    cm1->getObjectPtr()->contrast = acqhdr.idx.contrast;
+                    cm1->getObjectPtr()->phase = acqhdr.idx.phase;
+                    cm1->getObjectPtr()->repetition = acqhdr.idx.repetition;
+                    cm1->getObjectPtr()->set = acqhdr.idx.set;
+                    cm1->getObjectPtr()->acquisition_time_stamp = acqhdr.acquisition_time_stamp;
+
+                    memcpy(cm1->getObjectPtr()->position, acqhdr.position, sizeof(float)*3);
+                    memcpy(cm1->getObjectPtr()->read_dir, acqhdr.read_dir, sizeof(float)*3);
+                    memcpy(cm1->getObjectPtr()->phase_dir, acqhdr.phase_dir, sizeof(float)*3);
+                    memcpy(cm1->getObjectPtr()->slice_dir, acqhdr.slice_dir, sizeof(float)*3);
+                    memcpy(cm1->getObjectPtr()->patient_table_position, acqhdr.patient_table_position, sizeof(float)*3);
+                    cm1->getObjectPtr()->data_type = ISMRMRD::ISMRMRD_CXFLOAT;
+                    cm1->getObjectPtr()->image_index = ++image_counter_;
+
+                    //Copy the 4D data block [E0,E1,E2,CHA] for this loc, n, and s into the output image
+                    memcpy(cm2->getObjectPtr()->get_data_ptr(), &dbuff.data_(0,0,0,0,n,s,loc), E0*E1*E2*CHA*sizeof(std::complex<float>));
+
+                    //Do the FFTs in place
+                    hoNDFFT<float>::instance()->ifft(cm2->getObjectPtr(),0);
+                    hoNDFFT<float>::instance()->ifft(cm2->getObjectPtr(),1);
+                    if (E2>1) {
+                        hoNDFFT<float>::instance()->ifft(cm2->getObjectPtr(),2);
+                    }
+
+                    //Pass the image down the chain
+                    if (this->next()->putq(cm1) < 0) {
+                        return GADGET_FAIL;
+                    }
+                }
+            }
+        }
+    }
+    return GADGET_OK;  
+
+}
+
+GADGET_FACTORY_DECLARE(FFTGadget)
+}
diff --git a/gadgets/mri_core/FFTGadget.h b/gadgets/mri_core/FFTGadget.h
new file mode 100644
index 0000000..5e703c6
--- /dev/null
+++ b/gadgets/mri_core/FFTGadget.h
@@ -0,0 +1,26 @@
+#ifndef FFTGADGET_H
+#define FFTGADGET_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <ismrmrd/xml.h>
+#include "mri_core_data.h"
+
+namespace Gadgetron{
+
+  class EXPORTGADGETSMRICORE FFTGadget : 
+  public Gadget1<IsmrmrdReconData>
+    {
+    public:
+      GADGET_DECLARE(FFTGadget)
+      FFTGadget();
+	
+    protected:
+      virtual int process(GadgetContainerMessage<IsmrmrdReconData>* m1);
+      long long image_counter_;      
+    };
+}
+#endif //FFTGADGET_H
diff --git a/gadgets/mri_core/FloatToFixPointGadget.cpp b/gadgets/mri_core/FloatToFixPointGadget.cpp
new file mode 100644
index 0000000..fad4f1a
--- /dev/null
+++ b/gadgets/mri_core/FloatToFixPointGadget.cpp
@@ -0,0 +1,206 @@
+/*
+*       FloatToFixPointGadget.cpp
+*
+*       Created on: March 10, 2014
+*       Author: Hui Xue
+*/
+
+#include "GadgetIsmrmrdReadWrite.h"
+#include "FloatToFixPointGadget.h"
+#include "mri_core_def.h"
+
+namespace Gadgetron
+{
+    template <typename T> 
+    FloatToFixPointGadget<T>::FloatToFixPointGadget() 
+        : max_intensity_value_(std::numeric_limits<T>::max()), 
+          min_intensity_value_(std::numeric_limits<T>::min()), 
+          intensity_offset_value_(0)
+    {
+    }
+
+    template <typename T> 
+    FloatToFixPointGadget<T>::~FloatToFixPointGadget()
+    {
+    }
+
+    template <typename T> 
+    int FloatToFixPointGadget<T>::process_config(ACE_Message_Block* mb)
+    {
+        // gadget parameters
+        max_intensity_value_ = max_intensity.value();
+        min_intensity_value_ = min_intensity.value();
+        intensity_offset_value_ = intensity_offset.value();
+
+        return GADGET_OK;
+    }
+
+    template <typename T> 
+    int FloatToFixPointGadget<T>::process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1, GadgetContainerMessage< hoNDArray< float > >* m2)
+    {
+        GadgetContainerMessage<hoNDArray< T > > *cm2 =
+            new GadgetContainerMessage<hoNDArray< T > >();
+
+        boost::shared_ptr< std::vector<size_t> > dims = m2->getObjectPtr()->get_dimensions();
+
+        try {cm2->getObjectPtr()->create(dims);}
+        catch (std::runtime_error &err){
+            GEXCEPTION(err,"Unable to create fixed point storage in FloatToFixPointGadget");
+            return GADGET_FAIL;
+        }
+
+        float* src = m2->getObjectPtr()->get_data_ptr();
+        T* dst = cm2->getObjectPtr()->get_data_ptr();
+
+        long long i;
+        long long numOfPixels = (long long)cm2->getObjectPtr()->get_number_of_elements();
+
+        GadgetContainerMessage<ISMRMRD::MetaContainer>* m3 = AsContainerMessage<ISMRMRD::MetaContainer>(m2->cont());
+
+        switch (m1->getObjectPtr()->image_type)
+        {
+            case ISMRMRD::ISMRMRD_IMTYPE_MAGNITUDE:
+            {
+                #pragma omp parallel for default(none) private(i) shared(numOfPixels, src, dst)
+                for (i=0; i<numOfPixels; i++)
+                {
+                    float pix_val = src[i];
+                    pix_val = std::abs(pix_val);
+                    if (pix_val < (float)min_intensity_value_) pix_val = (float)min_intensity_value_;
+                    if (pix_val > (float)max_intensity_value_) pix_val = (float)max_intensity_value_;
+                    dst[i] = static_cast<T>(pix_val+0.5);
+                }
+            }
+            break;
+
+            case ISMRMRD::ISMRMRD_IMTYPE_REAL:
+            case ISMRMRD::ISMRMRD_IMTYPE_IMAG:
+            {
+                #pragma omp parallel for default(none) private(i) shared(numOfPixels, src, dst)
+                for (i=0; i<numOfPixels; i++)
+                {
+                    float pix_val = src[i];
+                    pix_val = pix_val + intensity_offset_value_;
+                    if (pix_val < (float)min_intensity_value_) pix_val = (float)min_intensity_value_;
+                    if (pix_val > (float)max_intensity_value_) pix_val = (float)max_intensity_value_;
+                    dst[i] = static_cast<T>(pix_val+0.5);
+                }
+
+                if (m3)
+                {
+                    if (m3->getObjectPtr()->length(GADGETRON_IMAGE_WINDOWCENTER) > 0)
+                    {
+                        long windowCenter;
+                        windowCenter = m3->getObjectPtr()->as_long(GADGETRON_IMAGE_WINDOWCENTER, 0);
+                        m3->getObjectPtr()->set(GADGETRON_IMAGE_WINDOWCENTER, windowCenter + (long)intensity_offset_value_);
+                    }
+                }
+            }
+            break;
+
+            case ISMRMRD::ISMRMRD_IMTYPE_PHASE:
+            {
+                #pragma omp parallel for default(none) private(i) shared(numOfPixels, src, dst)
+                for (i=0; i<numOfPixels; i++)
+                {
+                    float pix_val = src[i];
+                    pix_val *= (float)(intensity_offset_value_/3.14159265);
+                    pix_val += intensity_offset_value_;
+                    if (pix_val < (float)min_intensity_value_) pix_val = (float)min_intensity_value_;
+                    if (pix_val > (float)max_intensity_value_) pix_val = (float)max_intensity_value_;
+                    dst[i] = static_cast<T>(pix_val);
+                }
+            }
+            break;
+
+            default:
+                GDEBUG("Unknown image type %d, bailing out\n",m1->getObjectPtr()->image_type);
+                m1->release();
+                cm2->release();
+                return GADGET_FAIL;
+        }
+
+        m1->cont(cm2);
+        if(m3) cm2->cont(m3);
+
+        m2->cont(NULL);
+        m2->release();
+
+        if (typeid(T) == typeid(unsigned short))
+        {
+            m1->getObjectPtr()->data_type = ISMRMRD::ISMRMRD_USHORT;
+        }
+        else if (typeid(T) == typeid(short))
+        {
+            m1->getObjectPtr()->data_type = ISMRMRD::ISMRMRD_SHORT;
+        }
+        else if (typeid(T) == typeid(unsigned int))
+        {
+            m1->getObjectPtr()->data_type = ISMRMRD::ISMRMRD_UINT;
+        }
+        else if (typeid(T) == typeid(int))
+        {
+            m1->getObjectPtr()->data_type = ISMRMRD::ISMRMRD_INT;
+        }
+        else
+        {
+            GDEBUG("Unknown data type, bailing out\n");
+            m1->release();
+            cm2->release();
+            return GADGET_FAIL;
+        }
+
+        if (this->next()->putq(m1) == -1)
+        {
+            m1->release();
+            GDEBUG("Unable to put fixed point image on next gadget's queue");
+            return GADGET_FAIL;
+        }
+
+        return GADGET_OK;
+    }
+
+    FloatToUShortGadget::FloatToUShortGadget()
+    {
+        max_intensity.value(4095);
+        min_intensity.value(0);
+        intensity_offset.value(2048);
+
+        max_intensity_value_ = 4095;
+        min_intensity_value_ = 0;
+        intensity_offset_value_ = 2048;
+    }
+
+    FloatToUShortGadget::~FloatToUShortGadget()
+    {
+    }
+
+    FloatToShortGadget::FloatToShortGadget()
+    {
+    }
+
+    FloatToShortGadget::~FloatToShortGadget()
+    {
+    }
+
+    FloatToUIntGadget::FloatToUIntGadget()
+    {
+    }
+
+    FloatToUIntGadget::~FloatToUIntGadget()
+    {
+    }
+
+    FloatToIntGadget::FloatToIntGadget()
+    {
+    }
+
+    FloatToIntGadget::~FloatToIntGadget()
+    {
+    }
+
+    GADGET_FACTORY_DECLARE(FloatToUShortGadget)
+    GADGET_FACTORY_DECLARE(FloatToShortGadget)
+    GADGET_FACTORY_DECLARE(FloatToIntGadget)
+    GADGET_FACTORY_DECLARE(FloatToUIntGadget)
+}
diff --git a/gadgets/mri_core/FloatToFixPointGadget.h b/gadgets/mri_core/FloatToFixPointGadget.h
new file mode 100644
index 0000000..3e991f0
--- /dev/null
+++ b/gadgets/mri_core/FloatToFixPointGadget.h
@@ -0,0 +1,84 @@
+#ifndef FloatToFixPointGadget_H_
+#define FloatToFixPointGadget_H_
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "ismrmrd/meta.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+
+namespace Gadgetron
+{
+
+    /**
+    * This Gadget converts float values to fixed-point integer format.
+    *
+    * How the conversion is done depends on the image type; the values below
+    * assume the FloatToUShortGadget defaults (offset 2048, range 0..4095):
+    * Magnitude images: values above 4095 will be clamped.
+    * Real or Imag: values below -2048 and above 2047 will be clamped; zero maps to 2048.
+    * Phase: -pi maps to 0, +pi maps to 4095.
+    *
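+    * For example, with the FloatToUShortGadget defaults: a real value of
+    * -100.2 maps to round(-100.2 + 2048) = 1948, and a phase of +pi/2 maps
+    * to 2048*0.5 + 2048 = 3072.
+    *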
+    */
+
+    template <typename T> 
+    class EXPORTGADGETSMRICORE FloatToFixPointGadget:public Gadget2<ISMRMRD::ImageHeader, hoNDArray< float > >
+    {
+    public:
+
+        GADGET_DECLARE(FloatToFixPointGadget);
+
+        FloatToFixPointGadget();
+        virtual ~FloatToFixPointGadget();
+
+    protected:
+        GADGET_PROPERTY(max_intensity, T, "Maximum intensity value", std::numeric_limits<T>::max() );
+        GADGET_PROPERTY(min_intensity, T, "Minimal intensity value", std::numeric_limits<T>::min());
+        GADGET_PROPERTY(intensity_offset, T, "Intensity offset", 0);
+
+        T max_intensity_value_;
+        T min_intensity_value_;
+        T intensity_offset_value_;
+
+        virtual int process_config(ACE_Message_Block* mb);
+        virtual int process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1, GadgetContainerMessage< hoNDArray< float > >* m2);
+    };
+
+    class EXPORTGADGETSMRICORE FloatToShortGadget :public FloatToFixPointGadget < short > 
+    {
+    public:
+        GADGET_DECLARE(FloatToShortGadget);
+
+        FloatToShortGadget();
+        virtual ~FloatToShortGadget();
+    };
+
+    class EXPORTGADGETSMRICORE FloatToUShortGadget :public FloatToFixPointGadget < unsigned short >
+    {
+    public:
+        GADGET_DECLARE(FloatToUShortGadget);
+
+        FloatToUShortGadget();
+        virtual ~FloatToUShortGadget();
+    };
+
+    class EXPORTGADGETSMRICORE FloatToIntGadget :public FloatToFixPointGadget < int >
+    {
+    public:
+        GADGET_DECLARE(FloatToIntGadget);
+
+        FloatToIntGadget();
+        virtual ~FloatToIntGadget();
+    };
+
+    class EXPORTGADGETSMRICORE FloatToUIntGadget :public FloatToFixPointGadget < unsigned int >
+    {
+    public:
+        GADGET_DECLARE(FloatToUIntGadget);
+
+        FloatToUIntGadget();
+        virtual ~FloatToUIntGadget();
+    };
+}
+
+#endif /* FloatToFixPointGadget_H_ */
diff --git a/gadgets/mri_core/FlowPhaseSubtractionGadget.cpp b/gadgets/mri_core/FlowPhaseSubtractionGadget.cpp
new file mode 100644
index 0000000..35072b5
--- /dev/null
+++ b/gadgets/mri_core/FlowPhaseSubtractionGadget.cpp
@@ -0,0 +1,149 @@
+#include "FlowPhaseSubtractionGadget.h"
+#include "ismrmrd/xml.h"
+
+#ifdef USE_OMP
+#include <omp.h>
+#endif 
+
+namespace Gadgetron{
+
+  FlowPhaseSubtractionGadget::FlowPhaseSubtractionGadget() {}
+
+  FlowPhaseSubtractionGadget::~FlowPhaseSubtractionGadget() {}
+
+  int FlowPhaseSubtractionGadget::process_config(ACE_Message_Block* mb)
+  {
+
+    ISMRMRD::IsmrmrdHeader h;
+    ISMRMRD::deserialize(mb->rd_ptr(),h);
+    
+    if (h.encoding.size() != 1) {
+      GDEBUG("Number of encoding spaces: %d\n", h.encoding.size());
+      GDEBUG("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+
+  ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+  ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+  ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+  
+  sets_ = e_limits.set ? e_limits.set->maximum + 1 : 1;
+  
+  if (sets_ > 2) {
+    GDEBUG("Phase subtraction only implemented for two sets for now\n");
+    GDEBUG("Number of sets detected: %d, bailing out.\n", sets_);
+    return GADGET_FAIL;
+  }
+  
+  buffer_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[sets_]); 
+  
+  size_t bsize = sizeof(GadgetContainerMessage<ISMRMRD::ImageHeader>)*10000;
+  
+  for( size_t i=0; i<sets_; i++ ){
+    buffer_[i].high_water_mark(bsize);
+    buffer_[i].low_water_mark(bsize);
+  }
+  
+  return GADGET_OK;
+  }
+
+  int FlowPhaseSubtractionGadget::
+  process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+	  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+  {
+
+    // We need two sets to make a phase subtraction
+    if (sets_ < 2) {
+      if (this->next()->putq(m1) < 0) {
+	return GADGET_FAIL;
+      }
+      return GADGET_OK;
+    }
+
+    size_t set = m1->getObjectPtr()->set;
+
+    // Enqueue until we have images from both sets
+    //
+
+    if( buffer_[set].enqueue_tail(m1) < 0 ){
+      GDEBUG("Message enqueue failed\n");
+      return GADGET_FAIL;
+    };
+
+    // Phase subtract 
+    //
+
+    while( buffer_[0].message_count()>0 && buffer_[1].message_count()>0 ) {
+
+      ACE_Message_Block *mbq1, *mbq2;
+
+      if( buffer_[0].dequeue_head(mbq1) < 0 || buffer_[1].dequeue_head(mbq2) < 0 ) {
+	GDEBUG("Message dequeue failed\n");
+	if( buffer_[set].message_count() > 0 ) 
+	  buffer_[set].dequeue_tail(mbq1); // or m1 will be attempted deleted twice
+	return GADGET_FAIL;
+      }
+	
+      GadgetContainerMessage<ISMRMRD::ImageHeader> *pm1 = 
+	AsContainerMessage<ISMRMRD::ImageHeader>(mbq1);
+
+      GadgetContainerMessage< hoNDArray< std::complex<float> > > *cpm1 = 
+	AsContainerMessage<hoNDArray< std::complex<float> > >(mbq1->cont());
+
+      GadgetContainerMessage<ISMRMRD::ImageHeader> *pm2 = 
+	AsContainerMessage<ISMRMRD::ImageHeader>(mbq2);
+
+      GadgetContainerMessage< hoNDArray< std::complex<float> > > *cpm2 = 
+	AsContainerMessage<hoNDArray< std::complex<float> > >(mbq2->cont());
+	
+      // Some validity checks
+      //
+
+      if( pm1->getObjectPtr()->image_index != pm2->getObjectPtr()->image_index ) {
+	GDEBUG("Mismatch in image indices detected (%d, %d). Bailing out.\n", 
+		      pm1->getObjectPtr()->image_index, pm2->getObjectPtr()->image_index);
+	pm1->release();
+	if( buffer_[set].message_count() > 0 ){
+	  pm2->release();		
+	  buffer_[set].dequeue_tail(mbq1); // or m1 will be attempted deleted twice
+	}
+	return GADGET_FAIL;
+      }
+      
+      if (cpm1->getObjectPtr()->get_number_of_elements() != cpm2->getObjectPtr()->get_number_of_elements()) {
+	GDEBUG("Mismatch in number of elements detected. Bailing out.\n");
+	pm1->release();
+	if( buffer_[set].message_count() > 0 ){
+	  pm2->release();
+	  buffer_[set].dequeue_tail(mbq1); // or m1 will be attempted deleted twice
+	}
+	return GADGET_FAIL;
+      }
+
+      std::complex<float> *p1 = cpm1->getObjectPtr()->get_data_ptr();
+      std::complex<float> *p2 = cpm2->getObjectPtr()->get_data_ptr();
+
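+      // Output pixel: the magnitude is the mean of the two sets' magnitudes
+      // and the phase is the difference arg(p2[i]) - arg(p1[i]).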
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+      for( long i = 0; i < (long)cpm2->getObjectPtr()->get_number_of_elements(); i++ ) {
+	std::complex<float> tmp = std::polar((std::abs(p1[i])+std::abs(p2[i]))/2.0f, std::arg(p2[i])-std::arg(p1[i]));
+	p2[i] = tmp;
+      }
+      
+      pm1->release();	
+      pm2->getObjectPtr()->set = 0;
+
+      if (this->next()->putq(pm2) < 0) {
+	if( buffer_[set].message_count() > 0 ) {
+	  pm2->release();
+	  buffer_[set].dequeue_tail(mbq1); // or m1 will be attempted deleted twice
+	}
+	return GADGET_FAIL;
+      }
+    }
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(FlowPhaseSubtractionGadget)
+}
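For reference, the per-pixel operation in the loop above combines the two flow-encoded sets by averaging their magnitudes and taking their phase difference: out = polar((|p1|+|p2|)/2, arg(p2)-arg(p1)). A minimal standalone sketch of that arithmetic (illustration only; the sample values are made up):

    #include <complex>
    #include <cstdio>
    #include <vector>

    int main()
    {
        // Two "sets" of one pixel each, given as magnitude/phase pairs.
        std::vector< std::complex<float> > set0(1, std::polar(2.0f, 0.3f));
        std::vector< std::complex<float> > set1(1, std::polar(4.0f, 1.1f));

        for (size_t i = 0; i < set0.size(); i++) {
            float mag = (std::abs(set0[i]) + std::abs(set1[i])) / 2.0f; // average magnitude
            float phs = std::arg(set1[i]) - std::arg(set0[i]);          // phase difference
            std::complex<float> out = std::polar(mag, phs);
            std::printf("out[%zu] = (%f, %f)\n", i, out.real(), out.imag());
        }
        return 0;
    }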
diff --git a/gadgets/mri_core/FlowPhaseSubtractionGadget.h b/gadgets/mri_core/FlowPhaseSubtractionGadget.h
new file mode 100644
index 0000000..69c884a
--- /dev/null
+++ b/gadgets/mri_core/FlowPhaseSubtractionGadget.h
@@ -0,0 +1,38 @@
+#ifndef FlowPhaseSubtractionGadget_H
+#define FlowPhaseSubtractionGadget_H
+
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+#include <boost/shared_ptr.hpp>
+#include <boost/shared_array.hpp>
+
+namespace Gadgetron{
+  
+    class EXPORTGADGETSMRICORE FlowPhaseSubtractionGadget :
+        public Gadget2< ISMRMRD::ImageHeader, hoNDArray< std::complex<float> > >
+    {
+
+    public:
+        GADGET_DECLARE(FlowPhaseSubtractionGadget);
+
+        FlowPhaseSubtractionGadget();
+        virtual ~FlowPhaseSubtractionGadget();
+
+    protected:
+        virtual int process_config(ACE_Message_Block* mb);
+
+        virtual int process(GadgetContainerMessage< ISMRMRD::ImageHeader >* m1,
+            GadgetContainerMessage< hoNDArray< std::complex<float> > > * m2);
+
+    private:
+        unsigned int sets_;
+	boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > buffer_;
+    };
+}
+
+#endif //FlowPhaseSubtractionGadget_H
diff --git a/gadgets/mri_core/GadgetIsmrmrdReadWrite.cpp b/gadgets/mri_core/GadgetIsmrmrdReadWrite.cpp
new file mode 100644
index 0000000..ae8d80c
--- /dev/null
+++ b/gadgets/mri_core/GadgetIsmrmrdReadWrite.cpp
@@ -0,0 +1,6 @@
+#include "GadgetIsmrmrdReadWrite.h"
+
+namespace Gadgetron{
+
+GADGETRON_READER_FACTORY_DECLARE(GadgetIsmrmrdAcquisitionMessageReader)
+}
diff --git a/gadgets/mri_core/GadgetIsmrmrdReadWrite.h b/gadgets/mri_core/GadgetIsmrmrdReadWrite.h
new file mode 100644
index 0000000..abfdd96
--- /dev/null
+++ b/gadgets/mri_core/GadgetIsmrmrdReadWrite.h
@@ -0,0 +1,155 @@
+#ifndef GADGETISMRMRDREADWRITE_H
+#define GADGETISMRMRDREADWRITE_H
+
+#include "GadgetMRIHeaders.h"
+#include "GadgetContainerMessage.h"
+#include "GadgetMessageInterface.h"
+#include "hoNDArray.h"
+#include "url_encode.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+
+#include <ace/SOCK_Stream.h>
+#include <ace/Task.h>
+#include <complex>
+
+namespace Gadgetron{
+
+    class EXPORTGADGETSMRICORE GadgetIsmrmrdAcquisitionMessageWriter : public GadgetMessageWriter
+    {
+
+    public:
+        virtual int write(ACE_SOCK_Stream* sock, ACE_Message_Block* mb)
+        {
+            GadgetContainerMessage<ISMRMRD::Acquisition>* acqmb =
+                dynamic_cast< GadgetContainerMessage<ISMRMRD::Acquisition>* >(mb);
+
+            if (!acqmb) {
+	      GERROR("GadgetAcquisitionMessageWriter, invalid acquisition message objects");
+	      return -1;
+            }
+
+            ssize_t send_cnt = 0;
+
+            GadgetMessageIdentifier id;
+            id.id = GADGET_MESSAGE_ISMRMRD_ACQUISITION;
+
+            if ((send_cnt = sock->send_n (&id, sizeof(GadgetMessageIdentifier))) <= 0) {
+	      GERROR("Unable to send acquisition message identifier\n");
+	      return -1;
+            }
+
+            ISMRMRD::ISMRMRD_AcquisitionHeader acqHead = acqmb->getObjectPtr()->getHead();
+            if ((send_cnt = sock->send_n (&acqHead, sizeof(ISMRMRD::ISMRMRD_AcquisitionHeader))) <= 0) {
+	      GERROR("Unable to send acquisition header\n");
+	      return -1;
+            }
+
+            unsigned long trajectory_elements = acqmb->getObjectPtr()->getHead().trajectory_dimensions*acqmb->getObjectPtr()->getHead().number_of_samples;
+            unsigned long data_elements = acqmb->getObjectPtr()->getHead().active_channels*acqmb->getObjectPtr()->getHead().number_of_samples;
+
+            if (trajectory_elements) {
+                if ((send_cnt = sock->send_n (&acqmb->getObjectPtr()->getTrajPtr()[0], sizeof(float)*trajectory_elements)) <= 0) {
+		  GERROR("Unable to send acquisition trajectory elements\n");
+		  return -1;
+                }
+            }
+
+            if (data_elements) {
+                if ((send_cnt = sock->send_n (&acqmb->getObjectPtr()->getDataPtr()[0], 2*sizeof(float)*data_elements)) <= 0) {
+		  GERROR("Unable to send acquisition data elements\n");
+		  return -1;
+                }
+            }
+
+            return 0;
+        }
+    };
+
+    /**
+    Default implementation of GadgetMessageReader for IsmrmrdAcquisition messages
+    */
+    class EXPORTGADGETSMRICORE GadgetIsmrmrdAcquisitionMessageReader : public GadgetMessageReader
+    {
+
+    public:
+        GADGETRON_READER_DECLARE(GadgetIsmrmrdAcquisitionMessageReader);
+
+        virtual ACE_Message_Block* read(ACE_SOCK_Stream* stream)
+        {
+
+            GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1 =
+                new GadgetContainerMessage<ISMRMRD::AcquisitionHeader>();
+
+            GadgetContainerMessage<hoNDArray< std::complex<float> > >* m2 =
+                new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+
+            m1->cont(m2);
+
+            ssize_t recv_count = 0;
+
+            if ((recv_count = stream->recv_n(m1->getObjectPtr(), sizeof(ISMRMRD::AcquisitionHeader))) <= 0) {
+	      GERROR("GadgetIsmrmrdAcquisitionMessageReader, failed to read ISMRMRDACQ Header\n");
+	      return 0;
+            }
+
+            if (m1->getObjectPtr()->trajectory_dimensions) {
+                GadgetContainerMessage<hoNDArray< float > >* m3 =
+                    new GadgetContainerMessage< hoNDArray< float > >();
+
+                m2->cont(m3);
+
+                std::vector<size_t> tdims;
+                tdims.push_back(m1->getObjectPtr()->trajectory_dimensions);
+                tdims.push_back(m1->getObjectPtr()->number_of_samples);
+
+                try { m3->getObjectPtr()->create(&tdims);}
+                catch (std::runtime_error &err){
+                    GEXCEPTION(err,"(%P|%t) Allocate trajectory data\n");
+                    m1->release();
+
+                    return 0;
+                }
+
+                if ((recv_count =
+		     stream->recv_n
+		     (m3->getObjectPtr()->get_data_ptr(),
+		     sizeof(float)*tdims[0]*tdims[1])) <= 0) {
+		  
+		        GERROR("Unable to read trajectory data\n");
+		        m1->release();
+                        return 0;
+                }
+
+            }
+
+            std::vector<size_t> adims;
+            adims.push_back(m1->getObjectPtr()->number_of_samples);
+            adims.push_back(m1->getObjectPtr()->active_channels);
+
+            try{ m2->getObjectPtr()->create(&adims); }
+            catch (std::runtime_error &err ){
+                GEXCEPTION(err,"(%P|%t) Allocate sample data\n");
+                m1->release();
+
+                return 0;
+            }
+
+            if ((recv_count =
+                stream->recv_n
+                (m2->getObjectPtr()->get_data_ptr(),
+                sizeof(std::complex<float>)*adims[0]*adims[1])) <= 0) {
+ 
+	            GERROR("Unable to read Acq data\n");
+                    m1->release();
+                    return 0;
+            }
+
+            return m1;
+        }
+
+    };
+
+}
+#endif //GADGETISMRMRDREADWRITE_H
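The writer and reader above define a simple wire format per acquisition: a GadgetMessageIdentifier, the fixed-size acquisition header, an optional trajectory block of trajectory_dimensions*number_of_samples floats, and active_channels*number_of_samples complex-float samples. A sketch of a client-side reader for that layout, assuming a connected blocking POSIX socket and that GadgetMessageIdentifier is the single 16-bit id declared in GadgetMessageInterface.h:

    #include <ismrmrd/ismrmrd.h>
    #include <unistd.h>
    #include <complex>
    #include <cstdint>

    // Read exactly len bytes or fail.
    static bool read_exact(int fd, void* buf, size_t len)
    {
        char* p = static_cast<char*>(buf);
        while (len > 0) {
            ssize_t n = ::read(fd, p, len);
            if (n <= 0) return false; // error or connection closed
            p += n;
            len -= static_cast<size_t>(n);
        }
        return true;
    }

    // Read one acquisition message in the layout produced by the writer above.
    bool read_acquisition(int fd, ISMRMRD::Acquisition& acq)
    {
        uint16_t id = 0; // assumes GadgetMessageIdentifier holds a single uint16_t id
        if (!read_exact(fd, &id, sizeof(id))) return false;

        ISMRMRD::AcquisitionHeader head;
        if (!read_exact(fd, &head, sizeof(head))) return false;
        acq.setHead(head); // resizes the trajectory and data buffers

        size_t traj = size_t(head.trajectory_dimensions) * head.number_of_samples;
        if (traj && !read_exact(fd, acq.getTrajPtr(), traj * sizeof(float))) return false;

        size_t data = size_t(head.active_channels) * head.number_of_samples;
        return read_exact(fd, acq.getDataPtr(), data * sizeof(std::complex<float>));
    }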
diff --git a/gadgets/mri_core/GadgetMRIHeaders.h b/gadgets/mri_core/GadgetMRIHeaders.h
new file mode 100644
index 0000000..deb1003
--- /dev/null
+++ b/gadgets/mri_core/GadgetMRIHeaders.h
@@ -0,0 +1,136 @@
+#ifndef GADGETMRIHEADERS_H
+#define GADGETMRIHEADERS_H
+
+#include <ace/Basic_Types.h>
+
+//Data flags
+/*
+#define GADGET_FLAG_ACQ_END                   (1 << 0)
+#define GADGET_FLAG_LAST_ACQ_IN_SLICE         (1 << 1)
+#define GADGET_FLAG_LAST_ACQ_IN_MEAS          (1 << 2)
+#define GADGET_FLAG_LAST_ACQ_IN_CONCAT        (1 << 3)
+#define GADGET_FLAG_FIRST_ACQ_IN_SLICE        (1 << 4)
+#define GADGET_FLAG_FIRST_ACQ_IN_MEAS         (1 << 5)
+#define GADGET_FLAG_FIRST_ACQ_IN_CONCAT       (1 << 6)
+#define GADGET_FLAG_IS_NOISE_SCAN             (1 << 7)
+#define GADGET_FLAG_IS_PATREF_SCAN            (1 << 8)
+#define GADGET_FLAG_IS_PATREFANDIMA_SCAN      (1 << 9)
+
+#define GADGET_FLAG_LAST_IMAGE                (1 << 0)
+
+enum GadgetImageFormats {
+	GADGET_IMAGE_COMPLEX_FLOAT = 0,
+	GADGET_IMAGE_REAL_FLOAT,
+	GADGET_IMAGE_REAL_UNSIGNED_SHORT
+};
+
+enum GadgetImageTypes
+{
+	GADGET_IMAGE_MAGNITUDE = 0,
+	GADGET_IMAGE_PHASE,
+	GADGET_IMAGE_REAL,
+	GADGET_IMAGE_IMAG
+};
+*/
+
+namespace Gadgetron{
+
+enum GadgetMessageID {
+  GADGET_MESSAGE_EXT_ID_MIN                             = 1000,
+  GADGET_MESSAGE_ACQUISITION                            = 1001, /**< DEPRECATED */
+  GADGET_MESSAGE_NEW_MEASUREMENT                        = 1002, /**< DEPRECATED */
+  GADGET_MESSAGE_END_OF_SCAN                            = 1003, /**< DEPRECATED */
+  GADGET_MESSAGE_IMAGE_CPLX_FLOAT                       = 1004, /**< DEPRECATED */
+  GADGET_MESSAGE_IMAGE_REAL_FLOAT                       = 1005, /**< DEPRECATED */
+  GADGET_MESSAGE_IMAGE_REAL_USHORT                      = 1006, /**< DEPRECATED */
+  GADGET_MESSAGE_EMPTY                                  = 1007, /**< DEPRECATED */
+  GADGET_MESSAGE_ISMRMRD_ACQUISITION                    = 1008,
+  GADGET_MESSAGE_ISMRMRD_IMAGE_CPLX_FLOAT               = 1009, /**< DEPRECATED */
+  GADGET_MESSAGE_ISMRMRD_IMAGE_REAL_FLOAT               = 1010, /**< DEPRECATED */
+  GADGET_MESSAGE_ISMRMRD_IMAGE_REAL_USHORT              = 1011, /**< DEPRECATED */
+  GADGET_MESSAGE_DICOM                                  = 1012, /**< DEPRECATED */
+  GADGET_MESSAGE_CLOUD_JOB                              = 1013,
+  GADGET_MESSAGE_GADGETCLOUD_JOB                        = 1014,
+  GADGET_MESSAGE_ISMRMRD_IMAGEWITHATTRIB_CPLX_FLOAT     = 1015, /**< DEPRECATED */
+  GADGET_MESSAGE_ISMRMRD_IMAGEWITHATTRIB_REAL_FLOAT     = 1016, /**< DEPRECATED */
+  GADGET_MESSAGE_ISMRMRD_IMAGEWITHATTRIB_REAL_USHORT    = 1017, /**< DEPRECATED */
+  GADGET_MESSAGE_DICOM_WITHNAME                         = 1018,
+  GADGET_MESSAGE_DEPENDENCY_QUERY                       = 1019,
+  GADGET_MESSAGE_ISMRMRD_IMAGE_REAL_SHORT               = 1020, /**< DEPRECATED */
+  GADGET_MESSAGE_ISMRMRD_IMAGEWITHATTRIB_REAL_SHORT     = 1021, /**< DEPRECATED */
+  GADGET_MESSAGE_ISMRMRD_IMAGE                          = 1022,
+  GADGET_MESSAGE_EXT_ID_MAX                             = 4096
+};
+
+
+/*
+struct ISMRMRD::ImageHeader
+{
+  ACE_UINT32     flags;
+  ACE_UINT16     matrix_size[3];
+  ACE_UINT16     channels;
+  float          position[3];
+  float          quaternion[4];
+  float			 table_position;
+  ACE_UINT16     slice;
+  ACE_UINT16     contrast;
+  ACE_UINT16     set;
+  ACE_UINT16     phase;
+  ACE_UINT16     average;
+  ACE_UINT16     repetition;
+  ACE_UINT32     time_stamp;
+  ACE_UINT32     pmu_time_stamp;
+  ACE_UINT16     image_format;
+  ACE_UINT16     image_type;
+  ACE_UINT16     image_index;
+  ACE_UINT16	 image_series_index;
+
+  ACE_UINT16 get_matrix_size(unsigned int index) {
+    if (index < 3) {
+      return matrix_size[index];
+    } else {
+      return 0;
+    }
+  }
+
+  void set_matrix_size(unsigned int index, ACE_UINT16 size) {
+    if (index < 3) {
+      matrix_size[index] = size;
+    }
+  }
+
+  float get_position(unsigned int index) {
+    if (index < 3) {
+      return position[index];
+    } else {
+      return 0.0f;
+    }
+  }
+
+  void set_position(unsigned int index, float pos)
+  {
+    if (index < 3) {
+      position[index] = pos;
+    }
+  }
+
+  float get_quaternion(unsigned int index) {
+    if (index < 4) {
+      return quaternion[index];
+    } else {
+      return 0.0f;
+    }
+  }
+
+  void set_quaternion(unsigned int index, float quar)
+  {
+    if (index < 4) {
+      quaternion[index] = quar;
+    }
+  }
+
+}; 
+*/
+}
+
+#endif  //GADGETMRIHEADERS_H
diff --git a/gadgets/mri_core/ImageArraySplitGadget.cpp b/gadgets/mri_core/ImageArraySplitGadget.cpp
new file mode 100644
index 0000000..2717b27
--- /dev/null
+++ b/gadgets/mri_core/ImageArraySplitGadget.cpp
@@ -0,0 +1,81 @@
+#include "ImageArraySplitGadget.h"
+
+namespace Gadgetron{
+
+ImageArraySplitGadget::ImageArraySplitGadget()
+{
+
+}
+
+
+int ImageArraySplitGadget::process( GadgetContainerMessage<IsmrmrdImageArray>* m1)
+{
+    
+    //Grab a reference to the buffer containing the imaging data
+    IsmrmrdImageArray & imagearr = *m1->getObjectPtr();
+
+    //7D, fixed order [X, Y, Z, CHA, N, S, LOC]
+    uint16_t X = imagearr.data_.get_size(0);
+    uint16_t Y = imagearr.data_.get_size(1);
+    uint16_t Z = imagearr.data_.get_size(2);
+    uint16_t CHA = imagearr.data_.get_size(3);
+    uint16_t N = imagearr.data_.get_size(4);
+    uint16_t S = imagearr.data_.get_size(5);
+    uint16_t LOC = imagearr.data_.get_size(6);
+
+    //Each image will be [X,Y,Z,CHA] big
+    std::vector<size_t> img_dims(4);
+    img_dims[0] = X;
+    img_dims[1] = Y;
+    img_dims[2] = Z;
+    img_dims[3] = CHA;
+
+    //Loop over N, S and LOC
+    for (uint16_t loc=0; loc < LOC; loc++) {
+        for (uint16_t s=0; s < S; s++) {                
+            for (uint16_t n=0; n < N; n++) {
+        
+                //Create a new image header and copy the header for this n, s and loc
+                GadgetContainerMessage<ISMRMRD::ImageHeader>* cm1 = 
+                        new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+                memcpy(cm1->getObjectPtr(), &imagearr.headers_(n,s,loc), sizeof(ISMRMRD::ImageHeader));
+
+                //Create a new image data message holding
+                //the 4D data block [X,Y,Z,CHA] for this n, s and loc
+                GadgetContainerMessage< hoNDArray< std::complex<float> > >* cm2 = 
+                        new GadgetContainerMessage<hoNDArray< std::complex<float> > >();
+
+                try{cm2->getObjectPtr()->create(&img_dims);}
+                catch (std::runtime_error &err){
+                    GEXCEPTION(err,"Unable to allocate new image\n");
+                    cm1->release();
+                    cm2->release();
+                    return GADGET_FAIL;
+                }
+                memcpy(cm2->getObjectPtr()->get_data_ptr(), &imagearr.data_(0,0,0,0,n,s,loc), X*Y*Z*CHA*sizeof(std::complex<float>));
+                //Chain them
+                cm1->cont(cm2);
+
+                //Create a new meta container if needed and copy
+                if (imagearr.meta_.size()>0) {
+                    GadgetContainerMessage< ISMRMRD::MetaContainer >* cm3 = 
+                            new GadgetContainerMessage< ISMRMRD::MetaContainer >();
+                    size_t mindex = loc*N*S + s*N + n;
+                    *cm3->getObjectPtr() = imagearr.meta_[mindex];
+                    cm2->cont(cm3);
+                }
+
+                //Pass the image down the chain
+                if (this->next()->putq(cm1) < 0) {
+                    return GADGET_FAIL;
+                }
+            }
+        }
+    }
+    
+    return GADGET_OK;  
+
+}
+
+GADGET_FACTORY_DECLARE(ImageArraySplitGadget)
+}
diff --git a/gadgets/mri_core/ImageArraySplitGadget.h b/gadgets/mri_core/ImageArraySplitGadget.h
new file mode 100644
index 0000000..dd9aa40
--- /dev/null
+++ b/gadgets/mri_core/ImageArraySplitGadget.h
@@ -0,0 +1,23 @@
+#ifndef IMAGEARRAYSPLIT_H
+#define IMAGEARRAYSPLIT_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include "mri_core_data.h"
+
+namespace Gadgetron{
+
+  class EXPORTGADGETSMRICORE ImageArraySplitGadget : 
+  public Gadget1<IsmrmrdImageArray>
+    {
+    public:
+      GADGET_DECLARE(ImageArraySplitGadget)
+      ImageArraySplitGadget();
+	
+    protected:
+      virtual int process(GadgetContainerMessage<IsmrmrdImageArray>* m1);
+    };
+}
+#endif //IMAGEARRAYSPLIT_H
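The split above emits one [X,Y,Z,CHA] image per (n, s, loc) and looks up the matching meta container at the flat index loc*N*S + s*N + n, i.e. n varies fastest, then s, then loc. A small sketch of that indexing with made-up sizes:

    #include <cstdio>

    int main()
    {
        const int N = 2, S = 3, LOC = 2; // example sizes, hypothetical
        for (int loc = 0; loc < LOC; loc++)
            for (int s = 0; s < S; s++)
                for (int n = 0; n < N; n++)
                    std::printf("(n=%d, s=%d, loc=%d) -> meta index %d\n",
                                n, s, loc, loc*N*S + s*N + n);
        return 0;
    }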
diff --git a/gadgets/mri_core/ImageFinishGadget.cpp b/gadgets/mri_core/ImageFinishGadget.cpp
new file mode 100644
index 0000000..9b681ea
--- /dev/null
+++ b/gadgets/mri_core/ImageFinishGadget.cpp
@@ -0,0 +1,31 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "ImageFinishGadget.h"
+
+namespace Gadgetron{
+
+    int ImageFinishGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1)
+    {
+        if (!this->controller_)
+        {
+            GERROR("Cannot return result to controller, no controller set");
+            return -1;
+        }
+
+        GadgetContainerMessage<GadgetMessageIdentifier>* mb = new GadgetContainerMessage<GadgetMessageIdentifier>();
+
+        mb->getObjectPtr()->id = GADGET_MESSAGE_ISMRMRD_IMAGE;
+        mb->cont(m1);
+
+        int ret = this->controller_->output_ready(mb);
+
+        if (ret < 0)
+        {
+            GERROR("Failed to return message to controller\n");
+            return GADGET_FAIL;
+        }
+
+        return GADGET_OK;
+    }
+
+    GADGET_FACTORY_DECLARE(ImageFinishGadget);
+}
diff --git a/gadgets/mri_core/ImageFinishGadget.h b/gadgets/mri_core/ImageFinishGadget.h
new file mode 100644
index 0000000..9531db7
--- /dev/null
+++ b/gadgets/mri_core/ImageFinishGadget.h
@@ -0,0 +1,22 @@
+#ifndef IMAGEFINISHGADGET_H
+#define IMAGEFINISHGADGET_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "GadgetMRIHeaders.h"
+#include "GadgetStreamController.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+    class EXPORTGADGETSMRICORE ImageFinishGadget : public Gadget1 < ISMRMRD::ImageHeader >
+    {
+    protected:
+        virtual int process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1);
+    };
+}
+
+#endif //IMAGEFINISHGADGET_H
diff --git a/gadgets/mri_core/ImageWriterGadget.cpp b/gadgets/mri_core/ImageWriterGadget.cpp
new file mode 100644
index 0000000..5839fc5
--- /dev/null
+++ b/gadgets/mri_core/ImageWriterGadget.cpp
@@ -0,0 +1,52 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "ImageWriterGadget.h"
+
+#include <fstream>
+namespace Gadgetron{
+template<typename T>
+int ImageWriterGadget<T> ::
+process( GadgetContainerMessage< ISMRMRD::ImageHeader>* m1,
+	 GadgetContainerMessage< hoNDArray< T > >* m2)
+{
+    GDEBUG("Writing image\n");
+
+    char filename[1024];
+    switch (sizeof(T)) {
+    case (8): //Complex float
+        sprintf(filename, "out_%05d.cplx", (int)this->calls_);
+        break;
+    case (4): //Real float
+        sprintf(filename, "out_%05d.real", (int)this->calls_);
+        break;
+    case (2): //Unsigned short
+        sprintf(filename, "out_%05d.short", (int)this->calls_);
+        break;
+    default:
+        sprintf(filename, "out_%05d.cplx", (int)this->calls_);
+        break;
+    }
+
+    std::ofstream outfile;    
+    outfile.open (filename, std::ios::out|std::ios::binary);
+
+    int ndim = m2->getObjectPtr()->get_number_of_dimensions();
+    int* dims = new int[ndim];
+    size_t elements = 1;
+    for (int d = 0; d < ndim; d++) {
+      dims[d] = m2->getObjectPtr()->get_size(d);
+      elements *= dims[d];
+    }
+    outfile.write((char*)&ndim,sizeof(int));
+    outfile.write((char*)dims,sizeof(int)*ndim);
+    outfile.write((char*)m2->getObjectPtr()->get_data_ptr(),sizeof(T)*elements);
+    outfile.close();
+    delete [] dims;
+
+    this->calls_++;
+    return this->next()->putq(m1);
+}
+
+GADGET_FACTORY_DECLARE(ImageWriterGadgetUSHORT)
+GADGET_FACTORY_DECLARE(ImageWriterGadgetFLOAT)
+GADGET_FACTORY_DECLARE(ImageWriterGadgetCPLX)
+}
diff --git a/gadgets/mri_core/ImageWriterGadget.h b/gadgets/mri_core/ImageWriterGadget.h
new file mode 100644
index 0000000..f944fe8
--- /dev/null
+++ b/gadgets/mri_core/ImageWriterGadget.h
@@ -0,0 +1,50 @@
+#ifndef IMAGEWRITERGADGET_H
+#define IMAGEWRITERGADGET_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+  
+  template <typename T> class ImageWriterGadget :
+  public Gadget2<ISMRMRD::ImageHeader, hoNDArray< T > >
+  {
+    public:
+      
+    ImageWriterGadget()
+      : calls_(0)
+	{}
+      
+    protected:
+      virtual int process( GadgetContainerMessage< ISMRMRD::ImageHeader>* m1,
+			   GadgetContainerMessage< hoNDArray< T > >* m2);
+      
+      long calls_;      
+  };
+  
+  class EXPORTGADGETSMRICORE ImageWriterGadgetUSHORT :
+  public ImageWriterGadget<ACE_UINT16>
+  {
+  public:
+    GADGET_DECLARE(ImageWriterGadgetUSHORT)
+  };
+
+  class EXPORTGADGETSMRICORE ImageWriterGadgetFLOAT :
+  public ImageWriterGadget<float>
+  {
+  public:
+    GADGET_DECLARE(ImageWriterGadgetFLOAT)
+  };
+
+  class EXPORTGADGETSMRICORE ImageWriterGadgetCPLX :
+  public ImageWriterGadget< std::complex<float> >
+  {
+  public:
+    GADGET_DECLARE(ImageWriterGadgetCPLX)
+  };
+}
+#endif //IMAGEWRITERGADGET_H
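ImageWriterGadget dumps each image as a flat binary file: an int with the number of dimensions, that many int sizes, then the raw samples. A sketch of a matching reader, shown for the complex-float (.cplx) case; the element type has to match the file being read:

    #include <complex>
    #include <cstdio>
    #include <fstream>
    #include <vector>

    int main(int argc, char* argv[])
    {
        if (argc < 2) { std::fprintf(stderr, "usage: %s out_00000.cplx\n", argv[0]); return 1; }

        std::ifstream in(argv[1], std::ios::binary);
        int ndim = 0;
        in.read(reinterpret_cast<char*>(&ndim), sizeof(int));

        std::vector<int> dims(ndim);
        in.read(reinterpret_cast<char*>(dims.data()), sizeof(int) * ndim);

        size_t elements = 1;
        for (int d = 0; d < ndim; d++) elements *= dims[d];

        std::vector< std::complex<float> > data(elements);
        in.read(reinterpret_cast<char*>(data.data()), sizeof(std::complex<float>) * elements);

        std::printf("read %d dims, %zu elements\n", ndim, elements);
        return 0;
    }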
diff --git a/gadgets/mri_core/IsmrmrdDumpGadget.cpp b/gadgets/mri_core/IsmrmrdDumpGadget.cpp
new file mode 100644
index 0000000..069677c
--- /dev/null
+++ b/gadgets/mri_core/IsmrmrdDumpGadget.cpp
@@ -0,0 +1,136 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "IsmrmrdDumpGadget.h"
+#include <iomanip>
+
+namespace Gadgetron
+{
+    std::string get_date_time_string()
+    {
+        time_t rawtime;
+        struct tm * timeinfo;
+        time ( &rawtime );
+        timeinfo = localtime ( &rawtime );
+
+        std::stringstream str;
+        str << timeinfo->tm_year+1900
+            << std::setw(2) << std::setfill('0') << timeinfo->tm_mon+1
+            << std::setw(2) << std::setfill('0') << timeinfo->tm_mday
+            << "-"
+            << std::setw(2) << std::setfill('0') << timeinfo->tm_hour
+            << std::setw(2) << std::setfill('0') << timeinfo->tm_min
+            << std::setw(2) << std::setfill('0') << timeinfo->tm_sec;
+
+        std::string ret = str.str();
+
+        return ret;
+    }
+
+    IsmrmrdDumpGadget::IsmrmrdDumpGadget()
+                    : Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > >()
+                    , file_prefix_("ISMRMRD_DUMP")
+                    , ismrmrd_file_name_("ISMRMRD_DUMP.h5") //This will be reset during configuration
+                    , append_timestamp_(true)
+    {
+        file_prefix_ = "ISMRMRD_DUMP";
+        append_timestamp_ = true;
+    }
+
+    int IsmrmrdDumpGadget::process_config(ACE_Message_Block* mb)
+    {
+        file_prefix_ = file_prefix.value();
+        if ( file_prefix_.empty() )
+        {
+            file_prefix_ = "ISMRMRD_DUMP";
+        }
+
+        append_timestamp_ = append_timestamp.value();
+
+        //Generate filename
+        if (append_timestamp_)
+        {
+            ismrmrd_file_name_ = file_prefix_ + std::string("_") + get_date_time_string() + std::string(".h5");
+        }
+        else
+        {
+            ismrmrd_file_name_ = file_prefix_ + std::string(".h5");
+        }
+
+        ismrmrd_dataset_ = boost::shared_ptr<ISMRMRD::Dataset>(new ISMRMRD::Dataset(ismrmrd_file_name_.c_str(), "dataset"));
+
+        std::string xml_config(mb->rd_ptr());
+
+        try {
+            ismrmrd_dataset_->writeHeader(xml_config);
+        }
+        catch (...)
+        {
+            GDEBUG("Failed to write XML header to HDF file\n");
+            return GADGET_FAIL;
+        }
+
+        return GADGET_OK;
+    }
+
+    int IsmrmrdDumpGadget::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+    {
+        ISMRMRD::Acquisition ismrmrd_acq;
+
+        ismrmrd_acq.setHead(*m1->getObjectPtr());
+
+        memcpy((void *)ismrmrd_acq.getDataPtr(), m2->getObjectPtr()->get_data_ptr(), 
+               sizeof(float)*m2->getObjectPtr()->get_number_of_elements()*2);
+
+        if (m2->cont())
+        {
+            //Write trajectory
+            if (ismrmrd_acq.trajectory_dimensions() == 0)
+            {
+                GDEBUG("Malformed dataset. Trajectory attached but trajectory dimensions == 0\n");
+                return GADGET_FAIL;
+            }
+
+            GadgetContainerMessage< hoNDArray<float> >* m3 = AsContainerMessage< hoNDArray<float> >(m2->cont());
+
+            if (!m3)
+            {
+                GDEBUG("Error casting trajectory data package");
+                return GADGET_FAIL;
+            } 
+
+            memcpy((void *)ismrmrd_acq.getTrajPtr(), m3->getObjectPtr()->get_data_ptr(),
+		   sizeof(float)*m3->getObjectPtr()->get_number_of_elements());
+
+        }
+        else
+        {
+            if (ismrmrd_acq.trajectory_dimensions() != 0)
+            {
+                GDEBUG("Malformed dataset. Trajectory dimensions not zero but no trajectory attached\n");
+                return GADGET_FAIL;
+            }
+        }
+
+        {
+            try {
+                ismrmrd_dataset_->appendAcquisition(ismrmrd_acq);
+            }
+            catch (...)
+            {
+                GDEBUG("Error appending ISMRMRD Dataset\n");
+                return GADGET_FAIL;
+            }
+        }
+
+        //It is enough to put the first one, since they are linked
+        if (this->next()->putq(m1) == -1)
+        {
+            m1->release();
+	    GERROR("IsmrmrdDumpGadget::process, failed to pass data on to next gadget\n");
+	    return -1;
+        }
+
+        return 0;
+    }
+
+    GADGET_FACTORY_DECLARE(IsmrmrdDumpGadget)
+}
diff --git a/gadgets/mri_core/IsmrmrdDumpGadget.h b/gadgets/mri_core/IsmrmrdDumpGadget.h
new file mode 100644
index 0000000..8fe569f
--- /dev/null
+++ b/gadgets/mri_core/IsmrmrdDumpGadget.h
@@ -0,0 +1,39 @@
+#ifndef ISMRMRDDUMPGADGET_H
+#define ISMRMRDDUMPGADGET_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <ismrmrd/dataset.h>
+
+#include <complex>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETSMRICORE IsmrmrdDumpGadget : 
+  public Gadgetron::Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > >
+    {
+    public:
+      GADGET_DECLARE(IsmrmrdDumpGadget);
+
+      IsmrmrdDumpGadget();
+
+    protected:
+      GADGET_PROPERTY(file_prefix, std::string, "Prefix for dump file", "ISMRMRD_DUMP");
+      GADGET_PROPERTY(append_timestamp, bool, "Append timestamp to file name prefix", true);
+
+      virtual int process_config(ACE_Message_Block* mb);
+
+      virtual int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+			  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+
+    private:
+      std::string file_prefix_;
+      std::string ismrmrd_file_name_;
+      boost::shared_ptr<ISMRMRD::Dataset>  ismrmrd_dataset_;
+      bool append_timestamp_;
+    };
+}
+#endif //ISMRMRDDUMPGADGET_H
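A dump produced by this gadget can be reopened with the ISMRMRD dataset API, using the same group name ("dataset") as in process_config above. A sketch; the exact accessor names and signatures depend on the installed ISMRMRD version:

    #include <ismrmrd/ismrmrd.h>
    #include <ismrmrd/dataset.h>
    #include <cstdint>
    #include <iostream>
    #include <string>

    int main(int argc, char* argv[])
    {
        if (argc < 2) { std::cerr << "usage: " << argv[0] << " ISMRMRD_DUMP_<timestamp>.h5\n"; return 1; }

        ISMRMRD::Dataset d(argv[1], "dataset", false); // open existing file
        std::string xml = d.readHeader();              // XML header written by the gadget
        std::cout << "header length: " << xml.size() << "\n";

        uint32_t n = d.getNumberOfAcquisitions();
        std::cout << "acquisitions: " << n << "\n";

        if (n > 0) {
            ISMRMRD::Acquisition acq;
            d.readAcquisition(0, acq);
            std::cout << "first acquisition: " << acq.number_of_samples()
                      << " samples, " << acq.active_channels() << " channels\n";
        }
        return 0;
    }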
diff --git a/gadgets/mri_core/MRIImageAttribWriter.cpp b/gadgets/mri_core/MRIImageAttribWriter.cpp
new file mode 100644
index 0000000..ce1a0ad
--- /dev/null
+++ b/gadgets/mri_core/MRIImageAttribWriter.cpp
@@ -0,0 +1,149 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "MRIImageAttribWriter.h"
+#include "GadgetContainerMessage.h"
+#include "hoNDArray.h"
+
+#include <complex>
+
+namespace Gadgetron{
+
+int MRIImageAttribWriter::write(ACE_SOCK_Stream* sock, ACE_Message_Block* mb)
+{
+    GadgetContainerMessage<ISMRMRD::ImageHeader>* imagemb =
+        AsContainerMessage<ISMRMRD::ImageHeader>(mb);
+
+    if (!imagemb)
+    {
+        GERROR("MRIImageAttribWriter::write, invalid image message objects, 1\n");
+        return -1;
+    }
+
+    uint16_t data_type = imagemb->getObjectPtr()->data_type;
+
+    if (data_type == ISMRMRD::ISMRMRD_USHORT)
+    {
+        GadgetContainerMessage< hoNDArray< unsigned short > >* datamb = AsContainerMessage< hoNDArray< unsigned short > >(imagemb->cont());
+        if (!datamb)
+        {
+            GERROR("MRIImageAttribWriter::write, invalid image message objects\n");
+            return -1;
+        }
+
+        if (this->write_data_attrib(sock, imagemb, datamb) != 0)
+        {
+            GERROR("MRIImageAttribWriter::write_data_attrib failed for unsigned short ... \n");
+            return -1;
+        }
+    }
+    else if (data_type == ISMRMRD::ISMRMRD_SHORT)
+    {
+        GadgetContainerMessage< hoNDArray< short > >* datamb = AsContainerMessage< hoNDArray< short > >(imagemb->cont());
+        if (!datamb)
+        {
+            GERROR("MRIImageAttribWriter::write, invalid image message objects\n");
+            return -1;
+        }
+
+        if (this->write_data_attrib(sock, imagemb, datamb) != 0)
+        {
+            GERROR("MRIImageAttribWriter::write_data_attrib failed for short ... \n");
+            return -1;
+        }
+    }
+    else if (data_type == ISMRMRD::ISMRMRD_UINT)
+    {
+        GadgetContainerMessage< hoNDArray< unsigned int > >* datamb = AsContainerMessage< hoNDArray< unsigned int > >(imagemb->cont());
+        if (!datamb)
+        {
+            GERROR("MRIImageAttribWriter::write, invalid image message objects\n");
+            return -1;
+        }
+
+        if (this->write_data_attrib(sock, imagemb, datamb) != 0)
+        {
+            GERROR("MRIImageAttribWriter::write_data_attrib failed for unsigned int ... \n");
+            return -1;
+        }
+    }
+    else if (data_type == ISMRMRD::ISMRMRD_INT)
+    {
+        GadgetContainerMessage< hoNDArray< int > >* datamb = AsContainerMessage< hoNDArray< int > >(imagemb->cont());
+        if (!datamb)
+        {
+            GERROR("MRIImageAttribWriter::write, invalid image message objects\n");
+            return -1;
+        }
+
+        if (this->write_data_attrib(sock, imagemb, datamb) != 0)
+        {
+            GERROR("MRIImageAttribWriter::write_data_attrib failed for int ... \n");
+            return -1;
+        }
+    }
+    else if (data_type == ISMRMRD::ISMRMRD_FLOAT)
+    {
+        GadgetContainerMessage< hoNDArray< float > >* datamb = AsContainerMessage< hoNDArray< float > >(imagemb->cont());
+        if (!datamb)
+        {
+            GERROR("MRIImageAttribWriter::write, invalid image message objects\n");
+            return -1;
+        }
+
+        if (this->write_data_attrib(sock, imagemb, datamb) != 0)
+        {
+            GERROR("MRIImageAttribWriter::write_data_attrib failed for float ... \n");
+            return -1;
+        }
+    }
+    else if (data_type == ISMRMRD::ISMRMRD_DOUBLE)
+    {
+        GadgetContainerMessage< hoNDArray< double > >* datamb = AsContainerMessage< hoNDArray< double > >(imagemb->cont());
+        if (!datamb)
+        {
+            GERROR("MRIImageAttribWriter::write, invalid image message objects\n");
+            return -1;
+        }
+
+        if (this->write_data_attrib(sock, imagemb, datamb) != 0)
+        {
+            GERROR("MRIImageAttribWriter::write_data_attrib failed for double ... \n");
+            return -1;
+        }
+    }
+    else if (data_type == ISMRMRD::ISMRMRD_CXFLOAT)
+    {
+        GadgetContainerMessage< hoNDArray< std::complex<float> > >* datamb = AsContainerMessage< hoNDArray< std::complex<float> > >(imagemb->cont());
+        if (!datamb)
+        {
+            GERROR("MRIImageAttribWriter::write, invalid image message objects\n");
+            return -1;
+        }
+
+        if (this->write_data_attrib(sock, imagemb, datamb) != 0)
+        {
+            GERROR("MRIImageAttribWriter::write_data_attrib failed for std::complex<float> ... \n");
+            return -1;
+        }
+    }
+    else if (data_type == ISMRMRD::ISMRMRD_CXDOUBLE)
+    {
+        GadgetContainerMessage< hoNDArray< std::complex<double> > >* datamb = AsContainerMessage< hoNDArray< std::complex<double> > >(imagemb->cont());
+        if (!datamb)
+        {
+            GERROR("MRIImageAttribWriter::write, invalid image message objects\n");
+            return -1;
+        }
+
+        if (this->write_data_attrib(sock, imagemb, datamb) != 0)
+        {
+            GERROR("MRIImageAttribWriter::write_data_attrib failed for std::complex<double> ... \n");
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+GADGETRON_WRITER_FACTORY_DECLARE(MRIImageAttribWriter)
+
+}
diff --git a/gadgets/mri_core/MRIImageAttribWriter.h b/gadgets/mri_core/MRIImageAttribWriter.h
new file mode 100644
index 0000000..4b572cf
--- /dev/null
+++ b/gadgets/mri_core/MRIImageAttribWriter.h
@@ -0,0 +1,122 @@
+/** \file   MRIImageAttribWriter.h
+    \brief  MRI image writer with meta attributes.
+    \author Hui Xue
+*/
+
+#ifndef MRIImageAttribWriter_H
+#define MRIImageAttribWriter_H
+
+#include "GadgetMessageInterface.h"
+#include "GadgetMRIHeaders.h"
+#include "ismrmrd/meta.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+class MRIImageAttribWriter : public GadgetMessageWriter
+{
+public:
+    virtual int write(ACE_SOCK_Stream* sock, ACE_Message_Block* mb);
+
+    template <typename T> 
+    int write_data_attrib(ACE_SOCK_Stream* sock, GadgetContainerMessage<ISMRMRD::ImageHeader>* header, GadgetContainerMessage< hoNDArray<T> >* data)
+    {
+        typedef unsigned long long size_t_type;
+
+        uint16_t RO = header->getObjectPtr()->matrix_size[0];
+        uint16_t E1 = header->getObjectPtr()->matrix_size[1];
+        uint16_t E2 = header->getObjectPtr()->matrix_size[2];
+
+        unsigned long expected_elements = RO*E1*E2;
+
+        if (expected_elements != data->getObjectPtr()->get_number_of_elements())
+        {
+            GDEBUG("Number of header elements %d is inconsistent with number of elements in NDArray %d\n", expected_elements, data->getObjectPtr()->get_number_of_elements());
+            GDEBUG("Header dimensions: %d, %d, %d\n", RO, E1, E2);
+            GDEBUG("Number of array dimensions: %d:\n", data->getObjectPtr()->get_number_of_dimensions());
+            for (size_t i = 0; i < data->getObjectPtr()->get_number_of_dimensions(); i++)
+            {
+                GDEBUG("Dimensions %d: %d\n", i, data->getObjectPtr()->get_size(i));
+            }
+            return -1;
+        }
+
+        ssize_t send_cnt = 0;
+        GadgetMessageIdentifier id;
+        id.id = GADGET_MESSAGE_ISMRMRD_IMAGE;
+
+        if ((send_cnt = sock->send_n(&id, sizeof(GadgetMessageIdentifier))) <= 0)
+        {
+            GERROR("Unable to send image message identifier\n");
+            return -1;
+        }
+
+        GadgetContainerMessage<ISMRMRD::MetaContainer>* attribmb = AsContainerMessage<ISMRMRD::MetaContainer>(data->cont());
+
+        char* buf = NULL;
+        size_t_type len(0);
+
+        if (attribmb)
+        {
+            try
+            {
+                std::stringstream str;
+                ISMRMRD::serialize(*attribmb->getObjectPtr(), str);
+                std::string attribContent = str.str();
+                len = attribContent.length() + 1;
+
+                buf = new char[len];
+                GADGET_CHECK_THROW(buf != NULL);
+
+                memset(buf, '\0', sizeof(char)*len);
+                memcpy(buf, attribContent.c_str(), len - 1);
+            }
+            catch (...)
+            {
+                GERROR("Unable to serialize image meta attributes \n");
+                return -1;
+            }
+        }
+
+        header->getObjectPtr()->attribute_string_len = (uint32_t)len;
+
+        if ((send_cnt = sock->send_n(header->getObjectPtr(), sizeof(ISMRMRD::ImageHeader))) <= 0)
+        {
+            GERROR("Unable to send image header\n");
+            return -1;
+        }
+
+        if ((send_cnt = sock->send_n(&len, sizeof(size_t_type))) <= 0)
+        {
+            GERROR("Unable to send image meta attributes length\n");
+            if (buf != NULL) delete[] buf;
+            return -1;
+        }
+
+        if (len>0)
+        {
+            if ((send_cnt = sock->send_n(buf, len)) <= 0)
+            {
+                GERROR("Unable to send image meta attributes\n");
+                if (buf != NULL) delete[] buf;
+                return -1;
+            }
+        }
+
+        if (buf != NULL) delete[] buf;
+
+        if ((send_cnt = sock->send_n(data->getObjectPtr()->get_data_ptr(), sizeof(T)*data->getObjectPtr()->get_number_of_elements())) <= 0)
+        {
+            GERROR("Unable to send image data\n");
+            return -1;
+        }
+
+        return 0;
+    }
+};
+
+}
+#endif
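write_data_attrib above sends the image header (with attribute_string_len filled in), then the length of the serialized meta attributes as an unsigned long long, the serialized XML text itself, and finally the pixel data. A sketch of building and serializing such a meta container with the ISMRMRD meta API; the attribute names here are only examples, not prescribed by this writer:

    #include <ismrmrd/meta.h>
    #include <iostream>
    #include <sstream>

    int main()
    {
        ISMRMRD::MetaContainer meta;
        meta.set("GADGETRON_DataRole", "Image");        // example attribute, hypothetical name
        meta.append("GADGETRON_ImageComment", "MAG");   // example attribute, hypothetical name

        std::stringstream str;
        ISMRMRD::serialize(meta, str);                  // XML text, as sent on the wire above
        std::cout << str.str() << std::endl;
        return 0;
    }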
diff --git a/gadgets/mri_core/MRIImageWriter.cpp b/gadgets/mri_core/MRIImageWriter.cpp
new file mode 100644
index 0000000..ec2eeed
--- /dev/null
+++ b/gadgets/mri_core/MRIImageWriter.cpp
@@ -0,0 +1,149 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "MRIImageWriter.h"
+#include "GadgetContainerMessage.h"
+#include "hoNDArray.h"
+
+#include <complex>
+
+namespace Gadgetron{
+
+    int MRIImageWriter::write(ACE_SOCK_Stream* sock, ACE_Message_Block* mb)
+    {
+        GadgetContainerMessage<ISMRMRD::ImageHeader>* imagemb =
+            AsContainerMessage<ISMRMRD::ImageHeader>(mb);
+
+        if (!imagemb)
+        {
+            GERROR("MRIImageWriter::write, invalid image message objects, 1\n");
+            return -1;
+        }
+
+        uint16_t data_type = imagemb->getObjectPtr()->data_type;
+
+        if (data_type == ISMRMRD::ISMRMRD_USHORT)
+        {
+            GadgetContainerMessage< hoNDArray< unsigned short > >* datamb = AsContainerMessage< hoNDArray< unsigned short > >(imagemb->cont());
+            if (!datamb)
+            {
+                GERROR("MRIImageWriter::write, invalid image message objects\n");
+                return -1;
+            }
+
+            if (this->write_data_attrib(sock, imagemb, datamb) != 0)
+            {
+                GERROR("MRIImageWriter::write_data_attrib failed for unsigned short ... \n");
+                return -1;
+            }
+        }
+        else if (data_type == ISMRMRD::ISMRMRD_SHORT)
+        {
+            GadgetContainerMessage< hoNDArray< short > >* datamb = AsContainerMessage< hoNDArray< short > >(imagemb->cont());
+            if (!datamb)
+            {
+                GERROR("MRIImageWriter::write, invalid image message objects\n");
+                return -1;
+            }
+
+            if (this->write_data_attrib(sock, imagemb, datamb) != 0)
+            {
+                GERROR("MRIImageWriter::write_data_attrib failed for short ... \n");
+                return -1;
+            }
+        }
+        else if (data_type == ISMRMRD::ISMRMRD_UINT)
+        {
+            GadgetContainerMessage< hoNDArray< unsigned int > >* datamb = AsContainerMessage< hoNDArray< unsigned int > >(imagemb->cont());
+            if (!datamb)
+            {
+                GERROR("MRIImageWriter::write, invalid image message objects\n");
+                return -1;
+            }
+
+            if (this->write_data_attrib(sock, imagemb, datamb) != 0)
+            {
+                GERROR("MRIImageWriter::write_data_attrib failed for unsigned int ... \n");
+                return -1;
+            }
+        }
+        else if (data_type == ISMRMRD::ISMRMRD_INT)
+        {
+            GadgetContainerMessage< hoNDArray< int > >* datamb = AsContainerMessage< hoNDArray< int > >(imagemb->cont());
+            if (!datamb)
+            {
+                GERROR("MRIImageWriter::write, invalid image message objects\n");
+                return -1;
+            }
+
+            if (this->write_data_attrib(sock, imagemb, datamb) != 0)
+            {
+                GERROR("MRIImageWriter::write_data_attrib failed for int ... \n");
+                return -1;
+            }
+        }
+        else if (data_type == ISMRMRD::ISMRMRD_FLOAT)
+        {
+            GadgetContainerMessage< hoNDArray< float > >* datamb = AsContainerMessage< hoNDArray< float > >(imagemb->cont());
+            if (!datamb)
+            {
+                GERROR("MRIImageWriter::write, invalid image message objects\n");
+                return -1;
+            }
+
+            if (this->write_data_attrib(sock, imagemb, datamb) != 0)
+            {
+                GERROR("MRIImageWriter::write_data_attrib failed for float ... \n");
+                return -1;
+            }
+        }
+        else if (data_type == ISMRMRD::ISMRMRD_DOUBLE)
+        {
+            GadgetContainerMessage< hoNDArray< double > >* datamb = AsContainerMessage< hoNDArray< double > >(imagemb->cont());
+            if (!datamb)
+            {
+                GERROR("MRIImageWriter::write, invalid image message objects\n");
+                return -1;
+            }
+
+            if (this->write_data_attrib(sock, imagemb, datamb) != 0)
+            {
+                GERROR("MRIImageWriter::write_data_attrib failed for double ... \n");
+                return -1;
+            }
+        }
+        else if (data_type == ISMRMRD::ISMRMRD_CXFLOAT)
+        {
+            GadgetContainerMessage< hoNDArray< std::complex<float> > >* datamb = AsContainerMessage< hoNDArray< std::complex<float> > >(imagemb->cont());
+            if (!datamb)
+            {
+                GERROR("MRIImageWriter::write, invalid image message objects\n");
+                return -1;
+            }
+
+            if (this->write_data_attrib(sock, imagemb, datamb) != 0)
+            {
+                GERROR("MRIImageWriter::write_data_attrib failed for std::complex<float> ... \n");
+                return -1;
+            }
+        }
+        else if (data_type == ISMRMRD::ISMRMRD_CXDOUBLE)
+        {
+            GadgetContainerMessage< hoNDArray< std::complex<double> > >* datamb = AsContainerMessage< hoNDArray< std::complex<double> > >(imagemb->cont());
+            if (!datamb)
+            {
+                GERROR("MRIImageWriter::write, invalid image message objects\n");
+                return -1;
+            }
+
+            if (this->write_data_attrib(sock, imagemb, datamb) != 0)
+            {
+                GERROR("MRIImageWriter::write_data_attrib failed for std::complex<double> ... \n");
+                return -1;
+            }
+        }
+
+        return 0;
+    }
+
+    GADGETRON_WRITER_FACTORY_DECLARE(MRIImageWriter)
+
+}
diff --git a/gadgets/mri_core/MRIImageWriter.h b/gadgets/mri_core/MRIImageWriter.h
new file mode 100644
index 0000000..8b63763
--- /dev/null
+++ b/gadgets/mri_core/MRIImageWriter.h
@@ -0,0 +1,122 @@
+/** \file   MRIImageWriter.h
+\brief  MRI image writer with or without meta attributes.
+\author Hui Xue
+*/
+
+#ifndef MRIImageWriter_H
+#define MRIImageWriter_H
+
+#include "GadgetMessageInterface.h"
+#include "GadgetMRIHeaders.h"
+#include "ismrmrd/meta.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+    class MRIImageWriter : public GadgetMessageWriter
+    {
+    public:
+        virtual int write(ACE_SOCK_Stream* sock, ACE_Message_Block* mb);
+
+        template <typename T>
+        int write_data_attrib(ACE_SOCK_Stream* sock, GadgetContainerMessage<ISMRMRD::ImageHeader>* header, GadgetContainerMessage< hoNDArray<T> >* data)
+        {
+            typedef unsigned long long size_t_type;
+
+            uint16_t RO = header->getObjectPtr()->matrix_size[0];
+            uint16_t E1 = header->getObjectPtr()->matrix_size[1];
+            uint16_t E2 = header->getObjectPtr()->matrix_size[2];
+
+            unsigned long expected_elements = RO*E1*E2;
+
+            if (expected_elements != data->getObjectPtr()->get_number_of_elements())
+            {
+                GDEBUG("Number of header elements %d is inconsistent with number of elements in NDArray %d\n", expected_elements, data->getObjectPtr()->get_number_of_elements());
+                GDEBUG("Header dimensions: %d, %d, %d\n", RO, E1, E2);
+                GDEBUG("Number of array dimensions: %d:\n", data->getObjectPtr()->get_number_of_dimensions());
+                for (size_t i = 0; i < data->getObjectPtr()->get_number_of_dimensions(); i++)
+                {
+                    GDEBUG("Dimensions %d: %d\n", i, data->getObjectPtr()->get_size(i));
+                }
+                return -1;
+            }
+
+            ssize_t send_cnt = 0;
+            GadgetMessageIdentifier id;
+            id.id = GADGET_MESSAGE_ISMRMRD_IMAGE;
+
+            if ((send_cnt = sock->send_n(&id, sizeof(GadgetMessageIdentifier))) <= 0)
+            {
+                GERROR("Unable to send image message identifier\n");
+                return -1;
+            }
+
+            GadgetContainerMessage<ISMRMRD::MetaContainer>* attribmb = AsContainerMessage<ISMRMRD::MetaContainer>(data->cont());
+
+            char* buf = NULL;
+            size_t_type len(0);
+
+            if (attribmb)
+            {
+                try
+                {
+                    std::stringstream str;
+                    ISMRMRD::serialize(*attribmb->getObjectPtr(), str);
+                    std::string attribContent = str.str();
+                    len = attribContent.length() + 1;
+
+                    buf = new char[len];
+                    GADGET_CHECK_THROW(buf != NULL);
+
+                    memset(buf, '\0', sizeof(char)*len);
+                    memcpy(buf, attribContent.c_str(), len - 1);
+                }
+                catch (...)
+                {
+                    GERROR("Unable to serialize image meta attributes \n");
+                    return -1;
+                }
+            }
+
+            header->getObjectPtr()->attribute_string_len = (uint32_t)len;
+
+            if ((send_cnt = sock->send_n(header->getObjectPtr(), sizeof(ISMRMRD::ImageHeader))) <= 0)
+            {
+                GERROR("Unable to send image header\n");
+                return -1;
+            }
+
+            if ((send_cnt = sock->send_n(&len, sizeof(size_t_type))) <= 0)
+            {
+                GERROR("Unable to send image meta attributes length\n");
+                if (buf != NULL) delete[] buf;
+                return -1;
+            }
+
+            if (len>0)
+            {
+                if ((send_cnt = sock->send_n(buf, len)) <= 0)
+                {
+                    GERROR("Unable to send image meta attributes\n");
+                    if (buf != NULL) delete[] buf;
+                    return -1;
+                }
+            }
+
+            if (buf != NULL) delete[] buf;
+
+            if ((send_cnt = sock->send_n(data->getObjectPtr()->get_data_ptr(), sizeof(T)*data->getObjectPtr()->get_number_of_elements())) <= 0)
+            {
+                GERROR("Unable to send image data\n");
+                return -1;
+            }
+
+            return 0;
+        }
+    };
+
+}
+#endif
diff --git a/gadgets/mri_core/MaxwellCorrectionGadget.cpp b/gadgets/mri_core/MaxwellCorrectionGadget.cpp
new file mode 100644
index 0000000..0236397
--- /dev/null
+++ b/gadgets/mri_core/MaxwellCorrectionGadget.cpp
@@ -0,0 +1,139 @@
+#include "MaxwellCorrectionGadget.h"
+#include "GadgetronTimer.h"
+#include "Spline.h"
+#include "ismrmrd/xml.h"
+
+#include <numeric>
+#ifdef USE_OMP
+#include <omp.h>
+#endif 
+
+namespace Gadgetron{
+
+#ifdef M_PI
+#undef M_PI
+#endif // M_PI
+#define M_PI 3.14159265358979323846
+
+    MaxwellCorrectionGadget::MaxwellCorrectionGadget()
+        : maxwell_coefficients_present_(false)
+        , maxwell_coefficients_(4,0)
+    {
+    }
+
+    MaxwellCorrectionGadget::~MaxwellCorrectionGadget() {}
+
+    int MaxwellCorrectionGadget::process_config(ACE_Message_Block* mb)
+    {
+
+        ISMRMRD::IsmrmrdHeader h;
+        ISMRMRD::deserialize(mb->rd_ptr(),h);
+
+        if (h.userParameters)
+        {
+            for (std::vector<ISMRMRD::UserParameterDouble>::const_iterator i (h.userParameters->userParameterDouble.begin()); 
+                i != h.userParameters->userParameterDouble.end(); i++)
+            {
+                    if (i->name == "MaxwellCoefficient_0") {
+                        maxwell_coefficients_[0] = i->value;
+                    } else if (i->name == "MaxwellCoefficient_1") {
+                        maxwell_coefficients_[1] = i->value;
+                    } else if (i->name == "MaxwellCoefficient_2") {
+                        maxwell_coefficients_[2] = i->value;
+                    } else if (i->name == "MaxwellCoefficient_3") {
+                        maxwell_coefficients_[3] = i->value;
+                    } else {
+                        GDEBUG("WARNING: unused user parameter %s found\n", i->name.c_str());
+                    }
+            }
+        } else {
+            GDEBUG("MaxwellCorrection coefficients are supposed to be in the UserParameters. No user parameter section found\n");
+            return GADGET_OK;
+        }
+
+        maxwell_coefficients_present_ = true;
+
+        GDEBUG("Maxwell Coefficients: %f, %f, %f, %f\n", maxwell_coefficients_[0], maxwell_coefficients_[1], maxwell_coefficients_[2], maxwell_coefficients_[3]);
+
+        return GADGET_OK;
+    }
+
+    int MaxwellCorrectionGadget::
+        process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+        GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+    {
+        if (maxwell_coefficients_present_) {
+            //GDEBUG("Got coefficients\n");
+
+            int Nx = m2->getObjectPtr()->get_size(0);
+            int Ny = m2->getObjectPtr()->get_size(1);
+            int Nz = m2->getObjectPtr()->get_size(2);
+
+            float dx = m1->getObjectPtr()->field_of_view[0] / Nx;
+            float dy = m1->getObjectPtr()->field_of_view[1] / Ny;
+            float dz = m1->getObjectPtr()->field_of_view[2] / Nz;
+
+            /*
+            GDEBUG("Nx = %d, Ny = %d, Nz = %d\n", Nx, Ny, Nz);
+            GDEBUG("dx = %f, dy = %f, dz = %f\n", dx, dy, dz);
+            GDEBUG("img_pos_x = %f, img_pos_y = %f, img_pos_z = %f\n", m1->getObjectPtr()->position[0], m1->getObjectPtr()->position[1], m1->getObjectPtr()->position[2]);
+            */
+
+            std::vector<float> dR(3,0);
+            std::vector<float> dP(3,0);
+            std::vector<float> dS(3,0);
+            std::vector<float> p(3,0);
+
+            for (int z = 0; z < Nz; z++) {
+                for (int y = 0; y < Ny; y++) {
+                    for (int x = 0; x < Nx; x++) {
+
+                        dR[0] = (x-Nx/2+0.5) * dx * m1->getObjectPtr()->read_dir[0];
+                        dR[1] = (x-Nx/2+0.5) * dx * m1->getObjectPtr()->read_dir[1];
+                        dR[2] = (x-Nx/2+0.5) * dx * m1->getObjectPtr()->read_dir[2];
+
+                        dP[0] = (y-Ny/2+0.5) * dy * m1->getObjectPtr()->phase_dir[0];
+                        dP[1] = (y-Ny/2+0.5) * dy * m1->getObjectPtr()->phase_dir[1];
+                        dP[2] = (y-Ny/2+0.5) * dy * m1->getObjectPtr()->phase_dir[2];
+
+                        if (Nz > 1) {
+                            dS[0] = (z-Nz/2+0.5) * dz * m1->getObjectPtr()->slice_dir[0];
+                            dS[1] = (z-Nz/2+0.5) * dz * m1->getObjectPtr()->slice_dir[1];
+                            dS[2] = (z-Nz/2+0.5) * dz * m1->getObjectPtr()->slice_dir[2];
+                        }
+
+                        p[0] = m1->getObjectPtr()->position[0] + dP[0] + dR[0] + dS[0];
+                        p[1] = m1->getObjectPtr()->position[1] + dP[1] + dR[1] + dS[1];
+                        p[2] = m1->getObjectPtr()->position[2] + dP[2] + dR[2] + dS[2];
+
+                        //Convert position from mm to meters
+                        p[0] = p[0]/1000.0;
+                        p[1] = p[1]/1000.0;
+                        p[2] = p[2]/1000.0;
+
+                        float delta_phi = maxwell_coefficients_[0]*p[2]*p[2] +
+                            maxwell_coefficients_[1]*(p[0]*p[0] + p[1]*p[1]) + 
+                            maxwell_coefficients_[2]*p[0]*p[2] + 
+                            maxwell_coefficients_[3]*p[1]*p[2];
+
+                        long index = z*Ny*Nx+y*Nx+x;
+                        std::complex<float>* data_ptr = m2->getObjectPtr()->get_data_ptr();
+
+                        std::complex<float> correction = std::polar(1.0f,static_cast<float>(2*M_PI*delta_phi));
+
+                        data_ptr[index] *= correction;
+                    }
+                }
+            }
+
+        }
+
+        if (this->next()->putq(m1) < 0) {
+            GDEBUG("Unable to put data on next Gadgets Q\n");
+            return GADGET_FAIL;
+        }
+        return GADGET_OK;
+    }
+
+    GADGET_FACTORY_DECLARE(MaxwellCorrectionGadget)
+}
diff --git a/gadgets/mri_core/MaxwellCorrectionGadget.h b/gadgets/mri_core/MaxwellCorrectionGadget.h
new file mode 100644
index 0000000..5121715
--- /dev/null
+++ b/gadgets/mri_core/MaxwellCorrectionGadget.h
@@ -0,0 +1,35 @@
+#ifndef MaxwellCorrectionGadget_H
+#define MaxwellCorrectionGadget_H
+
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{  
+
+    class EXPORTGADGETSMRICORE MaxwellCorrectionGadget :
+        public Gadget2< ISMRMRD::ImageHeader, hoNDArray< std::complex<float> > >
+    {
+
+    public:
+        GADGET_DECLARE(MaxwellCorrectionGadget);
+        MaxwellCorrectionGadget();
+        virtual ~MaxwellCorrectionGadget();
+
+
+    protected:
+        virtual int process_config(ACE_Message_Block* mb);
+        virtual int process(GadgetContainerMessage< ISMRMRD::ImageHeader >* m1,
+            GadgetContainerMessage< hoNDArray< std::complex<float> > > * m2);
+	
+    private:
+	std::vector<double> maxwell_coefficients_;
+	bool maxwell_coefficients_present_;
+    };
+}
+
+#endif //MaxwellCorrectionGadget_H
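The correction applied in process above evaluates delta_phi = c0*z^2 + c1*(x^2 + y^2) + c2*x*z + c3*y*z at each voxel position (converted to meters) and multiplies the pixel by exp(i*2*pi*delta_phi). A single-voxel sketch of that computation with made-up coefficients and position:

    #include <complex>
    #include <cstdio>

    int main()
    {
        const double c[4] = { 0.05, 0.02, 0.0, 0.0 }; // hypothetical MaxwellCoefficient_0..3
        const double p[3] = { 0.10, -0.05, 0.20 };    // voxel position in meters

        double delta_phi = c[0]*p[2]*p[2]
                         + c[1]*(p[0]*p[0] + p[1]*p[1])
                         + c[2]*p[0]*p[2]
                         + c[3]*p[1]*p[2];

        std::complex<float> correction =
            std::polar(1.0f, static_cast<float>(2.0 * 3.14159265358979323846 * delta_phi));

        std::printf("delta_phi = %f, correction = (%f, %f)\n",
                    delta_phi, correction.real(), correction.imag());
        return 0;
    }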
diff --git a/gadgets/mri_core/NoiseAdjustGadget.cpp b/gadgets/mri_core/NoiseAdjustGadget.cpp
new file mode 100644
index 0000000..8cc2911
--- /dev/null
+++ b/gadgets/mri_core/NoiseAdjustGadget.cpp
@@ -0,0 +1,468 @@
+#include "NoiseAdjustGadget.h"
+#include "hoArmadillo.h"
+#include "hoNDArray_elemwise.h"
+#include "hoMatrix.h"
+#include "hoNDArray_linalg.h"
+#include "hoNDArray_elemwise.h"
+
+#ifdef USE_OMP
+#include "omp.h"
+#endif // USE_OMP
+
+#ifndef _WIN32
+#include <sys/types.h>
+#include <sys/stat.h>
+#endif // _WIN32
+
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/split.hpp>
+
+namespace Gadgetron{
+
+  NoiseAdjustGadget::NoiseAdjustGadget()
+    : noise_decorrelation_calculated_(false)
+    , number_of_noise_samples_(0)
+    , number_of_noise_samples_per_acquisition_(0)
+    , noise_bw_scale_factor_(1.0f)
+    , noise_dwell_time_us_(-1.0f)
+    , noiseCovarianceLoaded_(false)
+    , saved_(false)
+  {
+    noise_dependency_prefix_ = "GadgetronNoiseCovarianceMatrix";
+    measurement_id_.clear();
+    measurement_id_of_noise_dependency_.clear();
+    noise_dwell_time_us_preset_ = 0.0;
+    perform_noise_adjust_ = true;
+    pass_nonconformant_data_ = false;
+  }
+
+  NoiseAdjustGadget::~NoiseAdjustGadget()
+  {
+
+  }
+
+  int NoiseAdjustGadget::process_config(ACE_Message_Block* mb)
+  {
+    if ( !workingDirectory.value().empty() ) {
+      noise_dependency_folder_ = workingDirectory.value();
+    }
+    else {
+#ifdef _WIN32
+      noise_dependency_folder_ = std::string("c:\\temp\\gadgetron\\");
+#else
+      noise_dependency_folder_ =  std::string("/tmp/gadgetron/");
+#endif // _WIN32
+    }
+
+    GDEBUG("Folder to store noise dependencies is %s\n", noise_dependency_folder_.c_str());
+
+    if ( !noise_dependency_prefix.value().empty() ) noise_dependency_prefix_ = noise_dependency_prefix.value();
+
+    perform_noise_adjust_ = perform_noise_adjust.value();
+    GDEBUG("NoiseAdjustGadget::perform_noise_adjust_ is %d\n", perform_noise_adjust_);
+
+    pass_nonconformant_data_ = pass_nonconformant_data.value();
+    GDEBUG("NoiseAdjustGadget::pass_nonconformant_data_ is %d\n", pass_nonconformant_data_);
+
+    noise_dwell_time_us_preset_ = noise_dwell_time_us_preset.value();
+
+    ISMRMRD::deserialize(mb->rd_ptr(),current_ismrmrd_header_);
+    
+    if ( current_ismrmrd_header_.acquisitionSystemInformation ) {
+      receiver_noise_bandwidth_ = (float)(current_ismrmrd_header_.acquisitionSystemInformation->relativeReceiverNoiseBandwidth ?
+					  *current_ismrmrd_header_.acquisitionSystemInformation->relativeReceiverNoiseBandwidth : 0.793f);
+      
+      GDEBUG("receiver_noise_bandwidth_ is %f\n", receiver_noise_bandwidth_);
+    }
+
+    // find the measurementID of this scan
+    if ( current_ismrmrd_header_.measurementInformation )
+      {
+	if ( current_ismrmrd_header_.measurementInformation->measurementID )
+	  {
+	    measurement_id_ = *current_ismrmrd_header_.measurementInformation->measurementID;
+	    GDEBUG("Measurement ID is %s\n", measurement_id_.c_str());
+	  }
+
+	// find the noise dependencies if any
+	if ( current_ismrmrd_header_.measurementInformation->measurementDependency.size() > 0 )
+	  {
+	    measurement_id_of_noise_dependency_.clear();
+
+	    std::vector<ISMRMRD::MeasurementDependency>::const_iterator iter = current_ismrmrd_header_.measurementInformation->measurementDependency.begin();
+	    for ( ; iter!= current_ismrmrd_header_.measurementInformation->measurementDependency.end(); iter++ )
+	      {
+		std::string dependencyType = iter->dependencyType;
+		std::string dependencyID = iter->measurementID;
+
+		GDEBUG("Found dependency measurement : %s with ID %s\n", dependencyType.c_str(), dependencyID.c_str());
+            
+		if ( dependencyType=="Noise" || dependencyType=="noise" ) {
+		  measurement_id_of_noise_dependency_ = dependencyID;
+		}
+	      }
+        
+	    if ( !measurement_id_of_noise_dependency_.empty() ) {
+	      GDEBUG("Measurement ID of noise dependency is %s\n", measurement_id_of_noise_dependency_.c_str());
+		  
+	      full_name_stored_noise_dependency_ = this->generateNoiseDependencyFilename(generateMeasurementIdOfNoiseDependency(measurement_id_of_noise_dependency_));
+	      GDEBUG("Stored noise dependency is %s\n", full_name_stored_noise_dependency_.c_str());
+		  
+	      // try to load the precomputed noise prewhitener
+	      if ( !this->loadNoiseCovariance() ) {
+		GDEBUG("Stored noise dependency is NOT found : %s\n", full_name_stored_noise_dependency_.c_str());
+		noiseCovarianceLoaded_ = false;
+		noise_dwell_time_us_ = -1;
+		noise_covariance_matrixf_.clear();
+	      } else {
+		GDEBUG("Stored noise dependency is found : %s\n", full_name_stored_noise_dependency_.c_str());
+		GDEBUG("Stored noise dwell time in us is %f\n", noise_dwell_time_us_);
+		GDEBUG("Stored noise channel number is %d\n", noise_covariance_matrixf_.get_size(0));
+		
+		if (noise_ismrmrd_header_.acquisitionSystemInformation) {
+		  if (noise_ismrmrd_header_.acquisitionSystemInformation->coilLabel.size() != 
+		      current_ismrmrd_header_.acquisitionSystemInformation->coilLabel.size()) {
+		    GDEBUG("Lengths of coil label arrays do not match\n");
+		    return GADGET_FAIL;
+		  }
+		  
+		  bool labels_match = true;
+		  for (size_t l = 0; l < noise_ismrmrd_header_.acquisitionSystemInformation->coilLabel.size(); l++) {
+		    if (noise_ismrmrd_header_.acquisitionSystemInformation->coilLabel[l].coilNumber != 
+			current_ismrmrd_header_.acquisitionSystemInformation->coilLabel[l].coilNumber) {
+		      labels_match = false; break;
+		    }
+		    if (noise_ismrmrd_header_.acquisitionSystemInformation->coilLabel[l].coilName != 
+			current_ismrmrd_header_.acquisitionSystemInformation->coilLabel[l].coilName) {
+		      labels_match = false; break;
+		    }
+		  }
+		  if (!labels_match) {
+		    GDEBUG("Noise and measurement coil labels don't match\n");
+		    return GADGET_FAIL;
+		  }
+		} else if (current_ismrmrd_header_.acquisitionSystemInformation) {
+		  GDEBUG("Noise ismrmrd header does not have acquisition system information but current header does\n");
+		  return GADGET_FAIL;
+		}
+
+		noiseCovarianceLoaded_ = true;
+		number_of_noise_samples_ = 1; //When we load the matrix, it is already scaled.
+	      }
+	    }
+	  }
+      }
+
+
+    //Let's figure out if some channels are "scale_only"
+    std::string uncomb_str = scale_only_channels_by_name.value();
+    std::vector<std::string> uncomb;
+    if (uncomb_str.size()) {
+      GDEBUG("SCALE ONLY: %s\n",  uncomb_str.c_str());
+      boost::split(uncomb, uncomb_str, boost::is_any_of(","));
+      for (unsigned int i = 0; i < uncomb.size(); i++) {
+	std::string ch = boost::algorithm::trim_copy(uncomb[i]);
+	if (current_ismrmrd_header_.acquisitionSystemInformation) {
+	  for (size_t i = 0; i < current_ismrmrd_header_.acquisitionSystemInformation->coilLabel.size(); i++) {
+	    if (ch == current_ismrmrd_header_.acquisitionSystemInformation->coilLabel[i].coilName) {
+	      scale_only_channels_.push_back(i);//This assumes that the channels are sorted in the header
+	      break;
+	    }
+	  }
+	}
+      }
+    }
+
+#ifdef USE_OMP
+    omp_set_num_threads(1);
+#endif // USE_OMP
+
+    return GADGET_OK;
+  }
+
+  std::string NoiseAdjustGadget::generateMeasurementIdOfNoiseDependency(const std::string& noise_id)
+  {
+    // find the scan prefix
+    std::string measurementStr = measurement_id_;
+    size_t ind  = measurement_id_.find_last_of ("_");
+    if ( ind != std::string::npos ) {
+      measurementStr = measurement_id_.substr(0, ind);
+      measurementStr.append("_");
+      measurementStr.append(noise_id);
+    }
+   
+    return measurementStr;
+  }
+
+  std::string NoiseAdjustGadget::generateNoiseDependencyFilename(const std::string& measurement_id)
+  {
+    std::string full_name_stored_noise_dependency;
+
+    full_name_stored_noise_dependency = noise_dependency_folder_;
+    full_name_stored_noise_dependency.append("/");
+    full_name_stored_noise_dependency.append(noise_dependency_prefix_);
+    full_name_stored_noise_dependency.append("_");
+    full_name_stored_noise_dependency.append(measurement_id);
+
+    return full_name_stored_noise_dependency;
+  }
+
+  bool NoiseAdjustGadget::loadNoiseCovariance()
+  {
+    std::ifstream infile;
+    infile.open (full_name_stored_noise_dependency_.c_str(), std::ios::in|std::ios::binary);
+
+    if (infile.good()) {
+      //Read the XML header of the noise scan
+      uint32_t xml_length;
+      infile.read( reinterpret_cast<char*>(&xml_length), 4);
+      std::string xml_str(xml_length,'\0');
+      infile.read(const_cast<char*>(xml_str.c_str()), xml_length);
+      ISMRMRD::deserialize(xml_str.c_str(), noise_ismrmrd_header_);
+	
+      infile.read( reinterpret_cast<char*>(&noise_dwell_time_us_), sizeof(float));
+
+      size_t len;
+      infile.read( reinterpret_cast<char*>(&len), sizeof(size_t));
+
+      char* buf = new char[len];
+      if ( buf == NULL ) return false;
+
+      infile.read(buf, len);
+
+      if ( !noise_covariance_matrixf_.deserialize(buf, len) )
+	{
+	  delete [] buf;
+	  return false;
+	}
+
+      delete [] buf;
+      infile.close();
+    } else {
+      GDEBUG("Noise prewhitener file is not found. Proceeding without stored noise\n");
+      return false;
+    }
+
+    return true;
+  }
+
+  bool NoiseAdjustGadget::saveNoiseCovariance()
+  {
+    char* buf = NULL;
+    size_t len(0);
+    
+    //Do we have any noise?
+    if (noise_covariance_matrixf_.get_number_of_elements() == 0) {
+      return true;
+    }
+
+    //Scale the covariance matrix before saving
+    hoNDArray< std::complex<float> > covf(noise_covariance_matrixf_);
+
+    if (number_of_noise_samples_ > 1) {
+      covf *= std::complex<float>(1.0/(float)(number_of_noise_samples_-1),0.0);
+    }
+
+    if ( !covf.serialize(buf, len) ) {
+      GDEBUG("Noise covariance serialization failed ...\n");
+      return false;
+    }
+
+    std::stringstream xml_ss;
+    ISMRMRD::serialize(current_ismrmrd_header_, xml_ss);
+    std::string xml_str = xml_ss.str();
+    uint32_t xml_length = static_cast<uint32_t>(xml_str.size());
+
+    std::ofstream outfile;
+    std::string filename  = this->generateNoiseDependencyFilename(measurement_id_);
+    outfile.open (filename.c_str(), std::ios::out|std::ios::binary);
+
+    if (outfile.good())
+      {
+	GDEBUG("write out the noise dependency file : %s\n", filename.c_str());
+	outfile.write( reinterpret_cast<char*>(&xml_length), 4);
+	outfile.write( xml_str.c_str(), xml_length );
+	outfile.write( reinterpret_cast<char*>(&noise_dwell_time_us_), sizeof(float));
+	outfile.write( reinterpret_cast<char*>(&len), sizeof(size_t));
+	outfile.write(buf, len);
+	outfile.close();
+
+	// set the permission for the noise file to be rewritable
+#ifndef _WIN32
+	int res = chmod(filename.c_str(), S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IWGRP|S_IXGRP|S_IROTH|S_IWOTH|S_IXOTH);
+	if ( res != 0 ) {
+	  GDEBUG("Changing noise prewhitener file permission failed ...\n");
+	}
+#endif // _WIN32
+      } else {
+      delete [] buf;
+      GERROR_STREAM("Noise prewhitener file is not good for writing");
+      return false;
+    }
+
+    delete [] buf;
+    return true;
+  }
+
+  void NoiseAdjustGadget::computeNoisePrewhitener()
+  {
+    GDEBUG("Noise dwell time: %f\n", noise_dwell_time_us_);
+    GDEBUG("receiver_noise_bandwidth: %f\n", receiver_noise_bandwidth_);
+    
+    if (!noise_decorrelation_calculated_) {
+      
+      if (number_of_noise_samples_ > 0 ) {
+	GDEBUG("Calculating noise decorrelation\n");
+	
+	noise_prewhitener_matrixf_ = noise_covariance_matrixf_;
+	
+	//Mask out scale-only channels
+	size_t c = noise_prewhitener_matrixf_.get_size(0);
+	std::complex<float>* dptr = noise_prewhitener_matrixf_.get_data_ptr(); 
+	for (unsigned int ch = 0; ch < scale_only_channels_.size(); ch++) {
+	  for (size_t i = 0; i <  c; i++) {
+	    for (size_t j = 0; j < c; j++) {
+	      if ((i == scale_only_channels_[ch] || (j == scale_only_channels_[ch])) && (i != j)) { //zero if scale only and not on diagonal
+		dptr[i*c+j] = std::complex<float>(0.0,0.0);
+	      }
+	    }
+	  }
+	}
+
+	//Cholesky factorization, then invert the triangular factor
+	arma::cx_fmat noise_covf = as_arma_matrix(&noise_prewhitener_matrixf_);      
+	noise_covf = arma::inv(arma::trimatu(arma::chol(noise_covf)));
+      
+	noise_decorrelation_calculated_ = true;
+      } else {
+	noise_decorrelation_calculated_ = false;
+      }
+    }
+  }
+
+  int NoiseAdjustGadget::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+  {
+    bool is_noise = m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_IS_NOISE_MEASUREMENT);
+    unsigned int channels = m1->getObjectPtr()->active_channels;
+    unsigned int samples = m1->getObjectPtr()->number_of_samples;
+
+    //TODO: Remove this
+    if ( measurement_id_.empty() ) {
+      unsigned int muid = m1->getObjectPtr()->measurement_uid;
+      std::ostringstream ostr;
+      ostr << muid;
+      measurement_id_ = ostr.str();
+    }
+
+    if ( is_noise ) {
+      if (noiseCovarianceLoaded_) {
+	m1->release(); //Do not accumulate noise when we have a loaded noise covariance
+	return GADGET_OK;
+      }
+      
+      // this noise can be from a noise scan or it can be from the built-in noise
+      if ( number_of_noise_samples_per_acquisition_ == 0 ) {
+	number_of_noise_samples_per_acquisition_ = samples;
+      }
+
+      if ( noise_dwell_time_us_ < 0 ) {
+	if (noise_dwell_time_us_preset_ > 0.0) {
+	  noise_dwell_time_us_ = noise_dwell_time_us_preset_;
+	} else {
+	  noise_dwell_time_us_ = m1->getObjectPtr()->sample_time_us;
+	}
+      }
+
+      //If noise covariance matrix is not allocated
+      if (noise_covariance_matrixf_.get_number_of_elements() != channels*channels) {
+	std::vector<size_t> dims(2, channels);
+	try {
+	  noise_covariance_matrixf_.create(&dims);
+	  noise_covariance_matrixf_once_.create(&dims);
+	} catch (std::runtime_error& err) {
+	  GEXCEPTION(err, "Unable to allocate storage for noise covariance matrix\n" );
+	  return GADGET_FAIL;
+	}
+
+	Gadgetron::clear(noise_covariance_matrixf_);
+	Gadgetron::clear(noise_covariance_matrixf_once_);
+	number_of_noise_samples_ = 0;
+      }
+
+      std::complex<float>* cc_ptr = noise_covariance_matrixf_.get_data_ptr();
+      std::complex<float>* data_ptr = m2->getObjectPtr()->get_data_ptr();
+      
+      hoNDArray< std::complex<float> > readout(*m2->getObjectPtr());
+      gemm(noise_covariance_matrixf_once_, readout, true, *m2->getObjectPtr(), false);
+      Gadgetron::add(noise_covariance_matrixf_once_, noise_covariance_matrixf_, noise_covariance_matrixf_);
+      
+      number_of_noise_samples_ += samples;
+      m1->release();
+      return GADGET_OK;
+    }
+
+
+    //We should only reach this code if this data is not noise.
+    if ( perform_noise_adjust_ ) {
+      //Calculate the prewhitener if it has not been done
+      if (!noise_decorrelation_calculated_ && (number_of_noise_samples_ > 0)) {
+	if (number_of_noise_samples_ > 1) {
+	  //Scale
+	  noise_covariance_matrixf_ *= std::complex<float>(1.0/(float)(number_of_noise_samples_-1));
+	  number_of_noise_samples_ = 1; //Scaling has been done
+	}
+	computeNoisePrewhitener();
+	acquisition_dwell_time_us_ = m1->getObjectPtr()->sample_time_us;
+	if ((noise_dwell_time_us_ == 0.0f) || (acquisition_dwell_time_us_ == 0.0f)) {
+	  noise_bw_scale_factor_ = 1.0f;
+	} else {
+	  noise_bw_scale_factor_ = (float)std::sqrt(2.0*acquisition_dwell_time_us_/noise_dwell_time_us_*receiver_noise_bandwidth_);
+	}
+
+	noise_prewhitener_matrixf_ *= std::complex<float>(noise_bw_scale_factor_,0.0);
+
+	GDEBUG("Noise dwell time: %f\n", noise_dwell_time_us_);
+	GDEBUG("Acquisition dwell time: %f\n", acquisition_dwell_time_us_);
+	GDEBUG("receiver_noise_bandwidth: %f\n", receiver_noise_bandwidth_);
+	GDEBUG("noise_bw_scale_factor: %f\n", noise_bw_scale_factor_);
+      }
+
+      if (noise_decorrelation_calculated_) {
+          //Apply prewhitener
+          if ( noise_prewhitener_matrixf_.get_size(0) == m2->getObjectPtr()->get_size(1) ) {
+               hoNDArray<std::complex<float> > tmp(*m2->getObjectPtr());
+               gemm(*m2->getObjectPtr(), tmp, noise_prewhitener_matrixf_);
+          } else {
+               if (!pass_nonconformant_data_) {
+                     m1->release();
+                     GERROR("Number of channels in noise prewhitener %d is incompatible with incoming data %d\n", noise_prewhitener_matrixf_.get_size(0), m2->getObjectPtr()->get_size(1));
+                     return GADGET_FAIL;
+               }
+          }
+      }
+    }
+
+    if (this->next()->putq(m1) == -1) {
+      GDEBUG("Error passing on data to next gadget\n");
+      return GADGET_FAIL;
+    }
+    
+    return GADGET_OK;
+
+  }
+
+  int NoiseAdjustGadget::close(unsigned long flags)
+  {
+    if ( BaseClass::close(flags) != GADGET_OK ) return GADGET_FAIL;
+
+    if ( !noiseCovarianceLoaded_  && !saved_ ){
+      saveNoiseCovariance();
+      saved_ = true;
+    }  
+
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(NoiseAdjustGadget)
+
+} // namespace Gadgetron
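For orientation, the prewhitener computed above is the inverse of a triangular Cholesky factor of the (scaled) noise covariance, multiplied by a dwell-time/bandwidth correction before it is applied to each readout. Below is a minimal standalone sketch of that calculation, assuming plain Armadillo types and a channels x samples readout layout; the helper name compute_prewhitener is hypothetical and this is not part of the upstream sources.

    #include <armadillo>
    #include <complex>
    #include <cmath>

    // Illustrative sketch only; Psi is the channels x channels noise covariance
    // accumulated as sum_k x_k x_k^H over nsamples noise samples. The returned W
    // satisfies W * Psi_scaled * W^H = I, scaled for the dwell-time/bandwidth
    // difference between the noise and imaging readouts.
    arma::cx_fmat compute_prewhitener(arma::cx_fmat Psi, size_t nsamples,
                                      float acq_dwell_us, float noise_dwell_us,
                                      float receiver_bw)
    {
        if (nsamples > 1) Psi /= float(nsamples - 1);   // unbiased covariance estimate
        arma::cx_fmat L = arma::chol(Psi, "lower");     // Psi = L * L^H
        arma::cx_fmat W = arma::inv(arma::trimatl(L));  // W = L^{-1}
        float scale = (acq_dwell_us > 0.0f && noise_dwell_us > 0.0f)
            ? std::sqrt(2.0f * acq_dwell_us / noise_dwell_us * receiver_bw)
            : 1.0f;
        return W * std::complex<float>(scale, 0.0f);    // bandwidth/dwell-time correction
    }

    // Usage (illustrative): whitened = compute_prewhitener(Psi, n, acq_dwell, noise_dwell, bw) * data;
    // where data holds one readout as a channels x samples matrix.
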
diff --git a/gadgets/mri_core/NoiseAdjustGadget.h b/gadgets/mri_core/NoiseAdjustGadget.h
new file mode 100644
index 0000000..53695cf
--- /dev/null
+++ b/gadgets/mri_core/NoiseAdjustGadget.h
@@ -0,0 +1,74 @@
+#pragma once
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+#include "GadgetronTimer.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <ismrmrd/xml.h>
+#include <complex>
+
+namespace Gadgetron {
+
+  class EXPORTGADGETSMRICORE NoiseAdjustGadget :
+    public Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > >
+    {
+    public:
+      GADGET_DECLARE(NoiseAdjustGadget);
+
+      typedef Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > > BaseClass;
+
+      NoiseAdjustGadget();
+      virtual ~NoiseAdjustGadget();
+
+      virtual int close(unsigned long flags);
+
+    protected:
+      GADGET_PROPERTY(noise_dependency_prefix, std::string, "Prefix of noise dependency file", "GadgetronNoiseCovarianceMatrix");
+      GADGET_PROPERTY(perform_noise_adjust, bool, "Whether to actually perform the noise adjust", true);
+      GADGET_PROPERTY(pass_nonconformant_data, bool, "Whether to pass data that does not conform", false);
+      GADGET_PROPERTY(noise_dwell_time_us_preset, float, "Preset dwell time for noise measurement", 0.0);
+      GADGET_PROPERTY(scale_only_channels_by_name, std::string, "List of named channels that should only be scaled", "");
+
+      bool noise_decorrelation_calculated_;
+      hoNDArray< std::complex<float> > noise_covariance_matrixf_;
+      hoNDArray< std::complex<float> > noise_prewhitener_matrixf_;
+      hoNDArray< std::complex<float> > noise_covariance_matrixf_once_;
+      std::vector<unsigned int> scale_only_channels_;
+
+      unsigned long long number_of_noise_samples_;
+      unsigned long long number_of_noise_samples_per_acquisition_;
+      float noise_dwell_time_us_;
+      float noise_dwell_time_us_preset_;
+      float acquisition_dwell_time_us_;
+      float noise_bw_scale_factor_;
+      float receiver_noise_bandwidth_;
+      bool noiseCovarianceLoaded_;
+      bool perform_noise_adjust_;
+      bool pass_nonconformant_data_;
+      bool saved_;
+
+      std::string noise_dependency_folder_;
+      std::string noise_dependency_prefix_;
+      std::string measurement_id_;
+      std::string measurement_id_of_noise_dependency_;
+      std::string full_name_stored_noise_dependency_;
+
+      virtual int process_config(ACE_Message_Block* mb);
+      virtual int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+			  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+
+      std::string generateNoiseDependencyFilename(const std::string& measurement_id);
+      std::string generateMeasurementIdOfNoiseDependency(const std::string& noise_id);
+
+      bool loadNoiseCovariance();
+      bool saveNoiseCovariance();
+      void computeNoisePrewhitener();
+
+      //We will store/load a copy of the noise scan's XML header so we can check the coil layout, etc.
+      ISMRMRD::IsmrmrdHeader current_ismrmrd_header_;
+      ISMRMRD::IsmrmrdHeader noise_ismrmrd_header_;
+
+    };
+}
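For reference, the noise-dependency file written by saveNoiseCovariance() and read back by loadNoiseCovariance() above is a small binary record with the following layout (native byte order):

    uint32_t          xml_length             length of the serialized ISMRMRD XML header of the noise scan
    char[xml_length]  xml                    the serialized XML header
    float             noise_dwell_time_us    noise dwell time in microseconds
    size_t            len                    length in bytes of the serialized covariance matrix
    char[len]         buf                    hoNDArray< std::complex<float> > serialized noise covariance

Note that the covariance is scaled by 1/(nsamples-1) before it is written, so a loaded matrix is treated as already scaled (number_of_noise_samples_ is set to 1 after loading).
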
diff --git a/gadgets/mri_core/NoiseAdjustGadget_unoptimized.cpp b/gadgets/mri_core/NoiseAdjustGadget_unoptimized.cpp
new file mode 100644
index 0000000..27121d1
--- /dev/null
+++ b/gadgets/mri_core/NoiseAdjustGadget_unoptimized.cpp
@@ -0,0 +1,219 @@
+#include "NoiseAdjustGadget_unoptimized.h"
+#include "hoNDArray_fileio.h"
+#include "ismrmrd/xml.h"
+
+namespace Gadgetron{
+
+void choldc(std::complex<double> *a, int n)
+{
+	int i,j,k;
+
+	for (k= 0; k < n; k++)
+	{
+		a[k*n+k] = std::complex<double>(std::sqrt(real(a[k*n+k])),0.0);
+
+		for (i = k+1; i < n; i++)
+		{
+			a[k*n+i] = a[k*n+i]/a[k*n+k];
+		}
+
+		for (j = k + 1; j < n; j++)
+		{
+			for (i = j; i < n; i++)
+			{
+				a[j*n+i] -= conj(a[k*n+j])*a[k*n+i];
+			}
+		}
+	}
+}
+
+void inv_L(std::complex<double> *a, int n)
+{
+	int i,j,k;
+
+	std::complex<double> sum;
+
+	for (i = 0; i < n; i++)
+	{
+
+		a[i*n+i] = std::complex<double>(1.0/real(a[i*n+i]),0.0);
+		for (j = i+1; j < n; j++)
+		{
+			sum = std::complex<double>(0.0,0.0);
+			for (k = i; k < j; k++)
+			{
+				sum -= a[k*n+j]*a[i*n+k];
+			}
+			a[i*n+j] = sum/a[j*n+j];
+		}
+	}
+}
+
+bool noise_decorrelation(std::complex<float>* data, int elements, int coils, std::complex<double>* inv_L_psi)
+{
+	int i,j,k;
+
+	/* We need some temporary storage to hold the data for one element before overwriting the original data */
+	std::complex<double>* tmp_data = new std::complex<double>[coils];
+
+	if (tmp_data == 0)
+	{
+		return false;
+	}
+
+	for (i = 0; i < elements; i++)
+	{
+		for (j = 0; j < coils; j++)
+		{
+			tmp_data[j] = std::complex<double>(0.0,0.0);
+		}
+
+		for (j = 0; j < coils; j++)
+		{
+			for (k = 0; k <= j; k++)
+			{
+				tmp_data[j] += inv_L_psi[k*coils+j] * static_cast< std::complex<double> >(data[k*elements+i]);
+			}
+		}
+
+		for (j = 0; j < coils; j++)
+		{
+			data[j*elements+i] = tmp_data[j];
+		}
+	}
+
+	/* Clean up */
+	delete [] tmp_data;
+
+	return true;
+}
+
+
+
+NoiseAdjustGadget_unoptimized::NoiseAdjustGadget_unoptimized()
+: noise_decorrelation_calculated_(false)
+, number_of_noise_samples_(0)
+, noise_bw_scale_factor_(1.0f)
+, is_configured_(false)
+{
+
+}
+
+
+int NoiseAdjustGadget_unoptimized::process_config(ACE_Message_Block* mb)
+{
+  ISMRMRD::IsmrmrdHeader h;
+  ISMRMRD::deserialize(mb->rd_ptr(),h);
+  
+  if ( h.acquisitionSystemInformation ) {
+    receiver_noise_bandwidth_ = (float)(h.acquisitionSystemInformation->relativeReceiverNoiseBandwidth ?
+					*h.acquisitionSystemInformation->relativeReceiverNoiseBandwidth : 1.0f);
+    
+    GDEBUG_STREAM("receiver_noise_bandwidth_ is " << receiver_noise_bandwidth_);
+  }
+  
+  return GADGET_OK;
+}
+
+int NoiseAdjustGadget_unoptimized
+::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+		GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+
+	bool is_noise = m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_IS_NOISE_MEASUREMENT);
+	unsigned int channels = m1->getObjectPtr()->active_channels;
+	unsigned int samples = m1->getObjectPtr()->number_of_samples;
+
+	if (is_noise) {
+		noise_dwell_time_us_ = m1->getObjectPtr()->sample_time_us;
+		//If noise covariance matrix is not allocated
+		if (noise_covariance_matrix_.get_number_of_elements() != channels*channels) {
+			std::vector<size_t> dims(2, channels);
+			try{ noise_covariance_matrix_.create(&dims);}
+			catch (std::runtime_error &err){
+				GEXCEPTION(err,"Unable to allocate storage for noise covariance matrix\n");
+				return GADGET_FAIL;
+			}
+			noise_covariance_matrix_.fill(std::complex<double>(0.0,0.0));
+
+			number_of_noise_samples_ = 0;
+		}
+
+		std::complex<double>* cc_ptr = noise_covariance_matrix_.get_data_ptr();
+		std::complex<float>* data_ptr = m2->getObjectPtr()->get_data_ptr();
+
+
+		for (unsigned int s = 0; s < samples; s++) {
+			for (unsigned int i = 0; i < channels; i++) {
+				for (unsigned int j = 0; j < channels; j++) {
+					cc_ptr[j*channels + i] += (data_ptr[i * samples + s] * conj(data_ptr[j * samples + s]));
+				}
+			}
+			number_of_noise_samples_++;
+		}
+	} else {
+		acquisition_dwell_time_us_ = m1->getObjectPtr()->sample_time_us;
+		if (!is_configured_) {
+			if ((noise_dwell_time_us_ == 0.0f) || (acquisition_dwell_time_us_ == 0.0f)) {
+				noise_bw_scale_factor_ = 1.0f;
+			} else {
+				noise_bw_scale_factor_ = std::sqrt(2*acquisition_dwell_time_us_/noise_dwell_time_us_*receiver_noise_bandwidth_);
+			}
+
+			GDEBUG("Noise dwell time: %f\n", noise_dwell_time_us_);
+			GDEBUG("Acquisition dwell time: %f\n", acquisition_dwell_time_us_);
+			GDEBUG("receiver_noise_bandwidth: %f\n", receiver_noise_bandwidth_);
+			GDEBUG("noise_bw_scale_factor: %f\n", noise_bw_scale_factor_);
+			is_configured_ = true;
+		}
+		if (number_of_noise_samples_ > 0) {
+			if (!noise_decorrelation_calculated_) {
+				GDEBUG("Calculating noise decorrelation\n");
+				//1. scale for number of samples
+				std::complex<double>* cc_ptr = noise_covariance_matrix_.get_data_ptr();
+				for (unsigned int i = 0; i < channels*channels; i++) {
+					cc_ptr[i] /= number_of_noise_samples_;
+				}
+
+				//write_nd_array(&noise_covariance_matrix_, "CC.cplx");
+
+				//2. Cholesky decomposition
+				choldc(cc_ptr, channels);
+
+				//write_nd_array(&noise_covariance_matrix_, "CC_chol.cplx");
+
+				//3. Invert lower triangular
+				inv_L(cc_ptr, channels);
+
+				//write_nd_array(&noise_covariance_matrix_, "CC_chol_inv_L.cplx");
+
+				//4. Scale for noise BW
+				for (unsigned int i = 0; i < channels*channels; i++) {
+					cc_ptr[i] *= noise_bw_scale_factor_;
+				}
+
+				noise_decorrelation_calculated_ = true;
+			}
+
+			if (noise_decorrelation_calculated_) {
+				//Noise decorrelate
+				if (!noise_decorrelation(m2->getObjectPtr()->get_data_ptr(), samples, channels, noise_covariance_matrix_.get_data_ptr())) {
+					GDEBUG("Noise Decorrelation Failed\n");
+					return GADGET_FAIL;
+				}
+			}
+		}
+		//It is enough to put the first one, since they are linked
+		if (this->next()->putq(m1) == -1) {
+		  GERROR("NoiseAdjustGadget_unoptimized::process, failed to pass data on to next gadget");
+		  return -1;
+		}
+
+	}
+
+	return GADGET_OK;
+}
+
+
+GADGET_FACTORY_DECLARE(NoiseAdjustGadget_unoptimized)
+}
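Both the optimized and the unoptimized gadget use the same dwell-time/bandwidth correction, noise_bw_scale_factor = sqrt(2 * acquisition_dwell_time / noise_dwell_time * receiver_noise_bandwidth). As a worked example with illustrative numbers only: for an acquisition dwell time of 5 us, a noise dwell time of 10 us and a relative receiver noise bandwidth of 0.793 (the default used in NoiseAdjustGadget above), the factor is sqrt(2 * 5 / 10 * 0.793) = sqrt(0.793) ≈ 0.89, which is applied to the prewhitening matrix before it is used on imaging data.
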
diff --git a/gadgets/mri_core/NoiseAdjustGadget_unoptimized.h b/gadgets/mri_core/NoiseAdjustGadget_unoptimized.h
new file mode 100644
index 0000000..26088da
--- /dev/null
+++ b/gadgets/mri_core/NoiseAdjustGadget_unoptimized.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETSMRICORE NoiseAdjustGadget_unoptimized : 
+  public Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > >
+    {
+    public:
+      GADGET_DECLARE(NoiseAdjustGadget_unoptimized);
+  
+      NoiseAdjustGadget_unoptimized();
+
+    protected:
+      bool noise_decorrelation_calculated_;
+      hoNDArray< std::complex<double> > noise_covariance_matrix_;
+      unsigned long int number_of_noise_samples_;
+      float noise_dwell_time_us_;
+      float acquisition_dwell_time_us_;
+      float noise_bw_scale_factor_;
+      float receiver_noise_bandwidth_;
+      bool is_configured_;
+
+      virtual int process_config(ACE_Message_Block* mb);
+      virtual int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+			  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+      
+    };
+}
diff --git a/gadgets/mri_core/PCACoilGadget.cpp b/gadgets/mri_core/PCACoilGadget.cpp
new file mode 100644
index 0000000..82c25cc
--- /dev/null
+++ b/gadgets/mri_core/PCACoilGadget.cpp
@@ -0,0 +1,300 @@
+/*
+* PCACoilGadget.cpp
+*
+*  Created on: Dec 13, 2011
+*      Author: Michael S. Hansen
+*/
+
+#include "PCACoilGadget.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "hoArmadillo.h"
+#include "hoNDArray_elemwise.h"
+#include "ismrmrd/xml.h"
+#include "hoNDArray_fileio.h"
+
+#include <ace/OS_NS_stdlib.h>
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/split.hpp>
+
+namespace Gadgetron {
+
+    PCACoilGadget::PCACoilGadget()
+        : max_buffered_profiles_(100)
+        , samples_to_use_(16)
+    {
+    }
+
+    PCACoilGadget::~PCACoilGadget()
+    {
+        std::map<int, hoNDArray<std::complex<float> >* >::iterator it;
+        it = pca_coefficients_.begin();
+        while (it != pca_coefficients_.end()) {
+            if (it->second) {
+                delete it->second;
+                it->second = 0;
+            }
+            it++;
+        }
+    }
+
+    int PCACoilGadget::process_config(ACE_Message_Block *mb)
+    {
+      ISMRMRD::IsmrmrdHeader h;
+      ISMRMRD::deserialize(mb->rd_ptr(),h);
+
+      std::string uncomb_str = uncombined_channels_by_name.value();
+      std::vector<std::string> uncomb;
+      if (uncomb_str.size()) {
+	GDEBUG("uncomb_str: %s\n",  uncomb_str.c_str());
+	boost::split(uncomb, uncomb_str, boost::is_any_of(","));
+	for (unsigned int i = 0; i < uncomb.size(); i++) {
+	  std::string ch = boost::algorithm::trim_copy(uncomb[i]);
+	  if (h.acquisitionSystemInformation) {
+	    for (size_t i = 0; i < h.acquisitionSystemInformation->coilLabel.size(); i++) {
+	      if (ch == h.acquisitionSystemInformation->coilLabel[i].coilName) {
+		uncombined_channels_.push_back(i);//This assumes that the channels are sorted in the header
+		break;
+	      }
+	    }
+	  }
+	}
+      }
+
+      present_uncombined_channels.value((int)uncombined_channels_.size());
+      GDEBUG("Number of uncombined channels (present_uncombined_channels) set to %d\n", uncombined_channels_.size());
+
+      return GADGET_OK;
+    }
+
+    int PCACoilGadget::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *m1, GadgetContainerMessage<hoNDArray<std::complex<float> > > *m2)
+    {
+      bool is_noise = m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_IS_NOISE_MEASUREMENT);
+
+      //We should not be receiving noise here
+      if (is_noise) {
+	m1->release();
+	return GADGET_OK;
+      }
+
+
+        std::map<int, bool>::iterator it;
+        int location = m1->getObjectPtr()->idx.slice;
+        bool is_last_scan_in_slice = m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_LAST_IN_SLICE);
+        int samples_per_profile = m1->getObjectPtr()->number_of_samples;
+        int channels = m1->getObjectPtr()->active_channels;
+
+        it = buffering_mode_.find(location);
+
+        bool is_buffering = true;
+        //Do we have an entry for this location
+        if (it != buffering_mode_.end()) {
+            is_buffering = it->second;
+        } else {
+            //else make an entry. We will always start in buffering mode for a given location.
+            buffering_mode_[location] = is_buffering;
+        }
+
+        if (is_buffering) {
+            buffer_[location].push_back(m1);
+            int profiles_available = buffer_[location].size();
+
+            //Are we ready for calculating PCA
+            if (is_last_scan_in_slice || (profiles_available >= max_buffered_profiles_)) {
+
+                //GDEBUG("Calculating PCA coefficients with %d profiles for %d coils\n", profiles_available, channels);
+                int samples_to_use = samples_per_profile > samples_to_use_ ? samples_to_use_ : samples_per_profile;
+
+                //For some sequences there is so little data that we should just use it all.
+                if (profiles_available < 16) {
+                    samples_to_use = samples_per_profile;
+                }
+
+                int total_samples = samples_to_use*profiles_available;
+
+                std::vector<size_t> dims(2);
+                dims[0] = channels;dims[1] = total_samples;
+
+                hoNDArray< std::complex<float> > A;
+                try{ A.create(&dims); }
+                catch (std::runtime_error & err){
+                    GDEBUG("Unable to create array for PCA calculation\n");
+                    return GADGET_FAIL;
+                }
+
+                std::complex<float>* A_ptr = A.get_data_ptr();
+                size_t sample_counter = 0;
+
+                size_t data_offset = 0;
+                if (m1->getObjectPtr()->center_sample >= (samples_to_use>>1)) {
+                    data_offset = m1->getObjectPtr()->center_sample - (samples_to_use>>1);
+                }
+
+                //GDEBUG("Data offset = %d\n", data_offset);
+
+                hoNDArray<std::complex<float> > means;
+                std::vector<size_t> means_dims; means_dims.push_back(channels);
+
+                try{means.create(&means_dims);}
+                catch (std::runtime_error& err){
+                    GDEBUG("Unable to create temporary storage for mean values\n");
+                    return GADGET_FAIL;
+                }
+
+                means.fill(std::complex<float>(0.0f,0.0f));
+
+                std::complex<float>* means_ptr = means.get_data_ptr();
+                for (size_t p = 0; p < profiles_available; p++) {
+                    GadgetContainerMessage<hoNDArray<std::complex<float> > >* m_tmp =
+                        AsContainerMessage<hoNDArray< std::complex<float> > >(buffer_[location][p]->cont());
+
+                    if (!m_tmp) {
+                        GDEBUG("Fatal error, unable to recover data from data buffer (%d,%d)\n", p, profiles_available);
+                        return GADGET_FAIL;
+                    }
+
+                    std::complex<float>* d = m_tmp->getObjectPtr()->get_data_ptr();
+
+		      for (unsigned s = 0; s < samples_to_use; s++) {
+			for (size_t c = 0; c < channels; c++) {
+			  bool uncombined_channel = std::find(uncombined_channels_.begin(),uncombined_channels_.end(), c) != uncombined_channels_.end();
+			  //We use the conjugate of the data so that the output VT of the SVD is the actual PCA coefficient matrix
+			  if (uncombined_channel) {
+			    A_ptr[c + sample_counter*channels] = std::complex<float>(0.0,0.0);
+			  } else {
+			    A_ptr[c + sample_counter*channels] = d[c*samples_per_profile + data_offset + s];
+			    means_ptr[c] += d[c*samples_per_profile + data_offset + s];
+			  }
+			}
+			
+			sample_counter++;
+			//GDEBUG("Sample counter = %d/%d\n", sample_counter, total_samples);
+		      }
+                }
+
+                //Subtract off mean
+                for (size_t c = 0; c < channels; c++) {
+                    for (size_t s = 0; s < total_samples; s++) {
+                        A_ptr[c + s*channels] -=  means_ptr[c]/std::complex<float>(total_samples,0);
+                    }
+                }
+
+                //Collected data for temp matrix, now let's calculate SVD coefficients
+
+                std::vector<size_t> VT_dims;
+                VT_dims.push_back(channels);
+                VT_dims.push_back(channels);
+                pca_coefficients_[location] = new hoNDArray< std::complex<float> >;
+                hoNDArray< std::complex<float> >* VT = pca_coefficients_[location];
+		
+                try {VT->create(&VT_dims);}
+                catch (std::runtime_error& err){
+                    GEXCEPTION(err,"Failed to create array for VT\n");
+                    return GADGET_FAIL;
+                }
+
+                arma::cx_fmat Am = as_arma_matrix(&A);
+                arma::cx_fmat Vm = as_arma_matrix(VT);
+                arma::cx_fmat Um;
+                arma::fvec Sv;
+
+
+                if( !arma::svd_econ(Um,Sv,Vm,Am.st(),'r') ){
+                    GDEBUG("Failed to compute SVD\n");
+                    return GADGET_FAIL;
+                }
+		
+		//We will create a new matrix that explicitly preserves the uncombined channels
+		if (uncombined_channels_.size()) {
+		  hoNDArray< std::complex<float> >* VT_new = new hoNDArray< std::complex<float> >;
+		  try {VT_new->create(&VT_dims);}
+		  catch (std::runtime_error& err){
+                    GEXCEPTION(err,"Failed to create array for VT (new)\n");
+                    return GADGET_FAIL;
+		  }
+
+		  arma::cx_fmat Vm_new = as_arma_matrix(VT_new);
+
+		  size_t uncomb_count = 0;
+		  size_t comb_count = 0;
+		  for (size_t c = 0; c < Vm_new.n_cols; c++) {
+		    bool uncombined_channel = std::find(uncombined_channels_.begin(),uncombined_channels_.end(), c) != uncombined_channels_.end();
+		    if (uncombined_channel) {
+		      for (size_t r = 0; r < Vm_new.n_rows; r++) {
+			if (r == c) {
+			  Vm_new(r,uncomb_count) = 1;
+			} else {
+			  Vm_new(r,uncomb_count) = 0;
+			}
+		      }
+		      uncomb_count++;
+		    } else {
+		      for (size_t r = 0; r < Vm_new.n_rows; r++) { 
+			bool uncombined_channel_row = std::find(uncombined_channels_.begin(),uncombined_channels_.end(), r) != uncombined_channels_.end();
+			if (uncombined_channel_row) {
+			  Vm_new(r,comb_count+uncombined_channels_.size()) = 0;
+			} else {
+			  Vm_new(r,comb_count+uncombined_channels_.size()) = Vm(r,c);
+			}
+		      }
+		      comb_count++;
+		    }
+		  } 
+		  GDEBUG("uncomb_count = %d, comb_count = %d\n", uncomb_count, comb_count);
+
+		  //Delete the old one and set the new one
+		  delete pca_coefficients_[location];
+		  pca_coefficients_[location] = VT_new;
+		}
+
+
+                //Switch off buffering for this slice
+                buffering_mode_[location] = false;
+
+                //Now we should pump all the profiles that we have buffered back through the system
+                for (size_t p = 0; p < profiles_available; p++) {
+                    ACE_Message_Block* mb = buffer_[location][p];
+                    if (inherited::process(mb) != GADGET_OK) {
+                        GDEBUG("Failed to reprocess buffered data\n");
+                        return GADGET_FAIL;
+                    }
+                }
+                //Remove references in this buffer
+                buffer_[location].clear();
+            }
+        } else {
+            //GDEBUG("Not buffering anymore\n");
+            GadgetContainerMessage< hoNDArray< std::complex<float> > >* m3 =
+                new GadgetContainerMessage< hoNDArray< std::complex<float> > >;
+
+            try{m3->getObjectPtr()->create(m2->getObjectPtr()->get_dimensions().get()); }
+            catch (std::runtime_error& err){
+                GEXCEPTION(err,"Unable to create storage for PCA coils\n");
+                m3->release();
+                return GADGET_FAIL;
+            }
+
+            if (pca_coefficients_[location] != 0) {	
+                arma::cx_fmat am3 = as_arma_matrix(m3->getObjectPtr());
+                arma::cx_fmat am2 = as_arma_matrix(m2->getObjectPtr());
+                arma::cx_fmat aPca = as_arma_matrix(pca_coefficients_[location]);
+                am3 = am2*aPca;
+            }
+
+            m1->cont(m3);
+
+            //In case there are trajectories attached. 
+            m3->cont(m2->cont());
+            m2->cont(0);
+
+            m2->release();
+
+            if (this->next()->putq(m1) < 0) {
+                GDEBUG("Unable to put message on Q\n");
+                return GADGET_FAIL;
+            }
+        }
+        return GADGET_OK;
+    }
+
+    GADGET_FACTORY_DECLARE(PCACoilGadget)
+}
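The PCA coefficients computed above are the right singular vectors of the buffered, mean-subtracted calibration data; applying them is a matrix multiply per profile, with any uncombined channels passed through unchanged. The following condensed sketch shows the core calculation only, assuming plain Armadillo types and at least as many samples as channels; the function name pca_coefficients is hypothetical, the uncombined-channel bookkeeping is omitted, and this is not part of the upstream sources.

    #include <armadillo>
    #include <stdexcept>

    // Illustrative sketch only. A is a samples x channels calibration matrix
    // (one row per buffered sample). The columns of the returned matrix are the
    // PCA coefficient vectors, ordered by decreasing variance.
    arma::cx_fmat pca_coefficients(arma::cx_fmat A)
    {
        arma::cx_frowvec mean = arma::mean(A, 0);   // per-channel mean
        A.each_row() -= mean;                       // remove the mean before the SVD
        arma::cx_fmat U, V;
        arma::fvec s;
        if (!arma::svd_econ(U, s, V, A))            // A = U * diagmat(s) * V^H
            throw std::runtime_error("SVD failed");
        return V;                                   // principal directions in channel space
    }

    // Usage (illustrative): virtual_profile = profile * pca_coefficients(A);
    // where profile is a 1 x channels row of readout data.
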
diff --git a/gadgets/mri_core/PCACoilGadget.h b/gadgets/mri_core/PCACoilGadget.h
new file mode 100644
index 0000000..e69f3de
--- /dev/null
+++ b/gadgets/mri_core/PCACoilGadget.h
@@ -0,0 +1,49 @@
+#ifndef PCACOILGADGET_H_
+#define PCACOILGADGET_H_
+
+#include "gadgetron_mricore_export.h"
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "ismrmrd/ismrmrd.h"
+
+#include <complex>
+#include <map>
+
+namespace Gadgetron {
+
+  class EXPORTGADGETSMRICORE PCACoilGadget :
+    public Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > >
+  {
+    typedef Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > > inherited;
+  public:
+    GADGET_DECLARE(PCACoilGadget);
+
+    PCACoilGadget();
+    virtual ~PCACoilGadget();
+
+  protected:
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+			GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+
+  private:
+    GADGET_PROPERTY(uncombined_channels_by_name, std::string, "List of comma separated channels by name", "");
+    GADGET_PROPERTY(present_uncombined_channels, int, "Number of uncombined channels found", 0);
+
+    std::vector<unsigned int> uncombined_channels_;
+    
+    //Map containing buffers, one for each location
+    std::map< int, std::vector< ACE_Message_Block* > > buffer_;
+
+    //Keep track of whether we are buffering for a particular location
+    std::map< int, bool> buffering_mode_;
+
+    //Map for storing PCA coefficients for each location
+    std::map<int, hoNDArray<std::complex<float> >* > pca_coefficients_;
+
+    int max_buffered_profiles_;
+    int samples_to_use_;
+  };
+}
+
+#endif /* PCACOILGADGET_H_ */
diff --git a/gadgets/mri_core/PartialFourierAdjustROGadget.cpp b/gadgets/mri_core/PartialFourierAdjustROGadget.cpp
new file mode 100644
index 0000000..b4a560d
--- /dev/null
+++ b/gadgets/mri_core/PartialFourierAdjustROGadget.cpp
@@ -0,0 +1,134 @@
+#include "PartialFourierAdjustROGadget.h"
+#include "ismrmrd/xml.h"
+
+namespace Gadgetron
+{
+
+PartialFourierAdjustROGadget::PartialFourierAdjustROGadget() : maxRO_(0)
+{
+
+}
+
+int PartialFourierAdjustROGadget::process_config(ACE_Message_Block* mb)
+{
+  ISMRMRD::IsmrmrdHeader h;
+  deserialize(mb->rd_ptr(),h);
+
+  if (h.encoding.size() != 1) {
+    GDEBUG("Number of encoding spaces: %d\n", h.encoding.size());
+    GDEBUG("This partial Fourier gadget only supports one encoding space\n");
+    return GADGET_FAIL;
+  }
+
+  ISMRMRD::EncodingSpaceType e_space = h.encoding[0].encodedSpace;
+  maxRO_ = e_space.matrixSize.x;
+  GDEBUG_STREAM("max RO : " << maxRO_);
+  return GADGET_OK;
+}
+
+int addPrePostZeros(size_t centre_column, size_t samples)
+{
+    // 1 : pre zeros
+    // 2 : post zeros
+    // 0 : no zeros
+    if ( 2*centre_column == samples )
+    {
+        return 0;
+    }
+
+    if ( 2*centre_column < samples )
+    {
+        return 1;
+    }
+
+    if ( 2*centre_column > samples )
+    {
+        return 2;
+    }
+
+    return 0;
+}
+
+int PartialFourierAdjustROGadget
+::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+        GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+
+    bool is_noise = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_NOISE_MEASUREMENT).isSet(m1->getObjectPtr()->flags);
+    size_t channels = m1->getObjectPtr()->active_channels;
+    size_t samples = m1->getObjectPtr()->number_of_samples;
+    size_t centre_column = m1->getObjectPtr()->center_sample;
+
+    if (!is_noise) 
+    {
+        // adjust the center echo
+        int az = addPrePostZeros(centre_column, samples);
+
+        if ( az!= 0 && samples < maxRO_ )
+        {
+            GadgetContainerMessage< hoNDArray< std::complex<float> > >* m3 = new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+            if (!m3)
+            {
+                return GADGET_FAIL;
+            }
+
+            std::vector<size_t> data_out_dims = *m2->getObjectPtr()->get_dimensions();
+            data_out_dims[0] = maxRO_;
+            try
+            {
+                m3->getObjectPtr()->create(&data_out_dims);
+            }
+            catch(...)
+            {
+                GDEBUG("Unable to create new data array for the zero-padded data\n");
+                return GADGET_FAIL;
+            }
+            m3->getObjectPtr()->fill(0);
+
+            std::complex<float>* pM3 = m3->getObjectPtr()->get_data_ptr();
+            std::complex<float>* pM2 = m2->getObjectPtr()->get_data_ptr();
+
+            size_t c;
+            if ( az == 1 ) // pre zeros
+            {
+                for ( c=0; c<channels; c++ )
+                {
+                    memcpy(pM3+c*maxRO_+maxRO_-samples, pM2+c*samples, sizeof( std::complex<float> )*samples);
+                }
+            }
+
+            if ( az == 2 ) // post zeros
+            {
+                for ( c=0; c<channels; c++ )
+                {
+                    memcpy(pM3+c*maxRO_, pM2+c*samples, sizeof( std::complex<float> )*samples);
+                }
+            }
+
+            m2->release(); //We are done with this data
+
+            m1->cont(m3);
+            m1->getObjectPtr()->number_of_samples = data_out_dims[0];
+        }
+
+        if (this->next()->putq(m1) == -1) 
+        {
+	  GERROR("PartialFourierAdjustROGadget::process, failed to pass data on to next gadget");
+	  return -1;
+        }
+    }
+    else
+    {
+        if (this->next()->putq(m1) == -1) 
+        {
+	  GERROR("PartialFourierAdjustROGadget::process, failed to pass data on to next gadget");
+	  return -1;
+        }
+    }
+
+    return GADGET_OK;
+}
+
+GADGET_FACTORY_DECLARE(PartialFourierAdjustROGadget)
+
+}
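A small worked example of the realignment above, with illustrative numbers only: for maxRO_ = 256, number_of_samples = 192 and center_sample = 64, addPrePostZeros() returns 1 (pre zeros) because 2*64 < 192, so the 192 acquired points are copied into the last 192 entries of a 256-point zero-filled array. The echo center then lands at index 256 - 192 + 64 = 128, i.e. exactly at the center of the full readout.
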
diff --git a/gadgets/mri_core/PartialFourierAdjustROGadget.h b/gadgets/mri_core/PartialFourierAdjustROGadget.h
new file mode 100644
index 0000000..a2b873c
--- /dev/null
+++ b/gadgets/mri_core/PartialFourierAdjustROGadget.h
@@ -0,0 +1,30 @@
+
+#pragma once
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "ismrmrd/ismrmrd.h"
+#include "gadgetron_mricore_export.h"
+
+namespace Gadgetron
+{
+
+/// For an incoming readout that is not a noise scan, if partial Fourier along the readout is detected,
+/// the data will be realigned so that the echo center sits at the centre of the incoming 1D array
+class EXPORTGADGETSMRICORE PartialFourierAdjustROGadget : public Gadgetron::Gadget2<ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+{
+public:
+
+    PartialFourierAdjustROGadget();
+
+protected:
+
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(Gadgetron::GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+        Gadgetron::GadgetContainerMessage< Gadgetron::hoNDArray< std::complex<float> > >* m2);
+
+    unsigned int maxRO_;
+};
+
+}
diff --git a/gadgets/mri_core/PhysioInterpolationGadget.cpp b/gadgets/mri_core/PhysioInterpolationGadget.cpp
new file mode 100644
index 0000000..f36cc95
--- /dev/null
+++ b/gadgets/mri_core/PhysioInterpolationGadget.cpp
@@ -0,0 +1,411 @@
+#include "PhysioInterpolationGadget.h"
+#include "GadgetronTimer.h"
+#include "Spline.h"
+#include "mri_core_def.h"
+#include "ismrmrd/meta.h"
+#include "hoNDBSpline.h"
+#include "ismrmrd/xml.h"
+
+#include <numeric>
+#ifdef USE_OMP
+#include <omp.h>
+#endif 
+
+namespace Gadgetron{
+
+    PhysioInterpolationGadget::PhysioInterpolationGadget() 
+        : phys_time_index_(0)
+        , phases_to_reconstruct_(30)
+        , image_with_attrib_(false)
+        , first_beat_on_trigger_(false)
+        , interp_method_("Spline")
+    {
+    }
+
+    PhysioInterpolationGadget::~PhysioInterpolationGadget() {}
+
+    int PhysioInterpolationGadget::process_config(ACE_Message_Block* mb)
+    {
+        phys_time_index_ = physiology_time_index.value();
+        phases_to_reconstruct_ = phases.value();
+        mode_ = mode.value();
+        first_beat_on_trigger_ = first_beat_on_trigger.value();
+
+        ISMRMRD::IsmrmrdHeader h;
+        ISMRMRD::deserialize(mb->rd_ptr(),h);
+
+        interp_method_ = interp_method.value();
+        if ( interp_method_.empty() ) interp_method_ = "Spline";
+
+        if (h.encoding.size() == 0) {
+            GDEBUG("Missing encoding section\n");
+            return GADGET_FAIL;
+        }
+
+        ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+        slc_limit_ = e_limits.slice ? e_limits.slice->maximum+1 : 1; 
+
+        buffer_.resize(slc_limit_);
+
+        size_t slc;
+        for ( slc=0; slc<slc_limit_; slc++ )
+        {
+            buffer_[slc] = boost::shared_ptr< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>(ACE_Message_Queue_Base::DEFAULT_HWM * 10, ACE_Message_Queue_Base::DEFAULT_LWM * 10) );
+        }
+
+        time_stamps_.resize(slc_limit_);
+
+        return GADGET_OK;
+    }
+
+    int PhysioInterpolationGadget::close(unsigned long flags)
+    {
+        int ret = Gadget::close(flags);
+
+        if ( flags != 0 )
+        {
+            GDEBUG("PhysioInterpolationGadget::close...\n");
+
+            size_t slc;
+            for ( slc=0; slc<slc_limit_; slc ++ )
+            {
+                GDEBUG("Processing slice: %d ... \n", slc);
+                GDEBUG("Number of items on Q: %d\n", buffer_[slc]->message_count());
+                GDEBUG("Image with attribute flag : %d\n", image_with_attrib_);
+
+                if (time_stamps_[slc].size() != buffer_[slc]->message_count())
+                {
+                    GDEBUG("Inconsistent number of messages and time stamps\n");
+                    buffer_[slc]->flush();
+                    return GADGET_FAIL;
+                }
+
+                float previous = -100.0;
+                float sum_int  = 0.0; 
+                std::vector<float> intervals;
+                float int_count = 0.0;
+                std::vector<size_t> cycle_starts;
+                for (size_t i = 0; i < time_stamps_[slc].size(); i++)
+                {
+                    if ( (time_stamps_[slc][i] < previous) || (first_beat_on_trigger_ && i==0) )
+                    {
+                        cycle_starts.push_back(i);
+                    }
+                    else if (i > 0 )
+                    {
+                        sum_int += time_stamps_[slc][i]-time_stamps_[slc][i-1];
+                        intervals.push_back(time_stamps_[slc][i]-time_stamps_[slc][i-1]);
+                        int_count += 1.0;
+                    }
+                    previous = time_stamps_[slc][i];
+                }
+
+                if ( intervals.empty() ) continue;
+
+                std::sort(intervals.begin(),intervals.end());
+
+                float mean_interval = sum_int/int_count;
+                float median_interval = intervals[(intervals.size()>>1)];
+
+                float average_cycle_length = 0.0f;
+                std::vector<float> cycle_lengths;
+                float count = 0;
+                for (size_t i = 1; i < cycle_starts.size(); i++)
+                {
+                    float clength = time_stamps_[slc][cycle_starts[i]-1] + median_interval - time_stamps_[slc][cycle_starts[i]];
+                    cycle_lengths.push_back(clength);
+                }
+
+                if ( cycle_lengths.empty() )
+                {
+                    size_t phs = time_stamps_[slc].size();
+                    float clength = time_stamps_[slc][phs-1];
+                    cycle_lengths.push_back(clength);
+                }
+
+                std::sort(cycle_lengths.begin(),cycle_lengths.end());
+                float mean_cycle_length = std::accumulate(cycle_lengths.begin(), cycle_lengths.end(), 0.0f)/cycle_lengths.size();
+                float median_cycle_length = cycle_lengths[(cycle_lengths.size()>>1)];
+
+                GDEBUG("We have %d full cycles, first one starting at %d\n", cycle_starts.size()-1, cycle_starts[0]);
+                GDEBUG("Mean/Median frame width %f/%f\n", mean_interval,median_interval);
+                GDEBUG("Mean/Median cycle_length %f/%f\n", mean_cycle_length,median_cycle_length);
+
+                //Correct the first cycle assuming it is of median length:
+                if ( !first_beat_on_trigger_ )
+                {
+                    float first_cycle_offset = (median_cycle_length-median_interval)+time_stamps_[slc][cycle_starts[0]]-time_stamps_[slc][cycle_starts[0]-1];
+                    for (size_t i = 0; i < cycle_starts[0]; i++)
+                    {
+                        time_stamps_[slc][i] += first_cycle_offset;
+                    }
+                }
+
+                //Calculate relative time stamps
+                size_t current_cycle = 0;
+                std::vector<float> relative_cycle_time;
+
+                //Make sure we have cycle lengths for all the cycles we have covered
+                cycle_lengths.insert(cycle_lengths.begin(),median_cycle_length);
+                cycle_lengths.push_back(median_cycle_length);
+
+                for (size_t i = 0; i < time_stamps_[slc].size(); i++)
+                {
+                    if ((current_cycle < cycle_starts.size()) && (i >= cycle_starts[current_cycle]))
+                    {
+                        current_cycle++;
+                    }
+                    relative_cycle_time.push_back(time_stamps_[slc][i]/cycle_lengths[current_cycle-1] + current_cycle);
+                }
+
+                //Make a temporary list of all the data pointers from the Q
+                std::vector< ISMRMRD::ImageHeader* > hptrs;
+                std::vector< hoNDArray< std::complex<float> > * > aptrs;
+                std::vector< ISMRMRD::MetaContainer* > attribptrs;
+
+                ACE_Message_Queue<ACE_MT_SYNCH>::ITERATOR it( *buffer_[slc] );
+                for (ACE_Message_Block* entry = 0;
+                    it.next (entry) != 0;
+                    it.advance ()) 
+                {
+                    GadgetContainerMessage< ISMRMRD::ImageHeader >* tmpm1 =
+                        AsContainerMessage< ISMRMRD::ImageHeader >(entry);
+
+                    GadgetContainerMessage< hoNDArray< std::complex<float> > > * tmpm2 = 
+                        AsContainerMessage< hoNDArray< std::complex<float> >  >(entry->cont());
+
+                    if (!tmpm1 || !tmpm2 )
+                    {
+                        GDEBUG("Failed to cast data on Q, bailing out\n");
+                        buffer_[slc]->flush();
+                        return GADGET_FAIL;
+                    }
+
+                    hptrs.push_back(tmpm1->getObjectPtr());
+                    aptrs.push_back(tmpm2->getObjectPtr());
+
+                    if ( image_with_attrib_ )
+                    {
+                        GadgetContainerMessage< ISMRMRD::MetaContainer > * tmpm3 = 
+                            AsContainerMessage< ISMRMRD::MetaContainer >(entry->cont()->cont());
+
+                        if ( !tmpm3 )
+                        {
+                            GDEBUG("Failed to cast data on Q, bailing out\n");
+                            buffer_[slc]->flush();
+                            return GADGET_FAIL;
+                        }
+
+                        attribptrs.push_back(tmpm3->getObjectPtr());
+                    }
+                }
+
+                //Let's figure out which time points we would like to interpolate on:
+                ///TODO: Deal with mode 1 and other future modes; only mode 0 is fully implemented at the moment
+                float phase_interval = 1.0f/static_cast<float>(phases_to_reconstruct_);
+                float max_time = floor(relative_cycle_time[relative_cycle_time.size()-1]);
+                std::vector<float> recon_cycle_time;
+                for (float t=1.0;t<(max_time-0.001);t+=phase_interval)
+                {
+                    recon_cycle_time.push_back(t);
+                }
+
+                if ( mode_ == 1 )
+                {
+                    std::vector<float> recon_cycle_time_first_beat(phases_to_reconstruct_);
+                    memcpy(&recon_cycle_time_first_beat[0], &recon_cycle_time[0], sizeof(float)*phases_to_reconstruct_);
+                    recon_cycle_time = recon_cycle_time_first_beat;
+                }
+
+                //Now we can loop over each pixel and estimate the new frames, but first we have to have somewhere to put the data
+                std::vector< GadgetContainerMessage< ISMRMRD::ImageHeader >* > out_heads;
+                std::vector< GadgetContainerMessage< hoNDArray< std::complex<float> > > * > out_data;
+                std::vector< GadgetContainerMessage< ISMRMRD::MetaContainer> * > out_attrib;
+
+                for (size_t i = 0; i < recon_cycle_time.size(); i++)
+                {
+                    GadgetContainerMessage<ISMRMRD::ImageHeader>* tmpm1 = new GadgetContainerMessage<ISMRMRD::ImageHeader>;
+                    GadgetContainerMessage< hoNDArray< std::complex<float> > >* tmpm2 = new GadgetContainerMessage< hoNDArray< std::complex<float> > >;
+
+                    tmpm1->cont(tmpm2);
+
+                    (*tmpm1->getObjectPtr()) = (*hptrs[0]);
+                    tmpm2->getObjectPtr()->create(aptrs[0]->get_dimensions());
+
+                    out_heads.push_back(tmpm1);
+                    out_data.push_back(tmpm2);
+
+                    unsigned short current_cycle = static_cast<unsigned short>(floor(recon_cycle_time[i] + 0.0001));
+                    unsigned short current_phase = static_cast<unsigned short>((recon_cycle_time[i]+0.0001-current_cycle)/(1.0/static_cast<float>(phases_to_reconstruct_)) + 0.0001);
+
+                    tmpm1->getObjectPtr()->physiology_time_stamp[phys_time_index_] = static_cast<unsigned>(floor((recon_cycle_time[i]+0.0001-current_cycle)*cycle_lengths[current_cycle])); 
+                    tmpm1->getObjectPtr()->phase = current_phase;
+                    tmpm1->getObjectPtr()->image_index = current_phase+1 + (uint16_t)slc*phases_to_reconstruct_;
+                    tmpm1->getObjectPtr()->image_series_index = current_cycle*10;
+
+                    // make sure the phase is within the acquisition limit
+                    if ( tmpm1->getObjectPtr()->phase+1 >= time_stamps_[slc].size() )
+                    {
+                        tmpm1->getObjectPtr()->phase = (uint16_t)(time_stamps_[slc].size()-1);
+                    }
+
+                    if ( image_with_attrib_ )
+                    {
+                        GadgetContainerMessage< ISMRMRD::MetaContainer >* tmpm3 = new GadgetContainerMessage< ISMRMRD::MetaContainer >;
+
+                        tmpm2->cont(tmpm3);
+                        (*tmpm3->getObjectPtr()) = (*attribptrs[0]);
+                        out_attrib.push_back(tmpm3);
+
+                        tmpm3->getObjectPtr()->set("PHS",      (long)tmpm1->getObjectPtr()->phase);
+                        tmpm3->getObjectPtr()->set(GADGETRON_IMAGENUMBER, (long)tmpm1->getObjectPtr()->image_index);
+
+                        tmpm3->getObjectPtr()->append(GADGETRON_DATA_ROLE, "PhysioInterp");
+                        tmpm3->getObjectPtr()->append(GADGETRON_IMAGECOMMENT, "PhysioInterp");
+                        tmpm3->getObjectPtr()->append(GADGETRON_SEQUENCEDESCRIPTION, "_PhysioInterp");
+
+                        tmpm3->getObjectPtr()->append(GADGETRON_IMAGEPROCESSINGHISTORY, "Interp");
+                    }
+                }
+
+                //Let's interpolate the images
+                size_t inelem = relative_cycle_time.size();
+                size_t outelem = recon_cycle_time.size();
+                size_t imageelem = aptrs[0]->get_number_of_elements();
+
+                if ( (interp_method_ == "Spline") || (mode_ != 1) )
+                {
+                    GadgetronTimer interptime("Interpolation Time");
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+                    for (long long p = 0; p < (long long)imageelem; p++)
+                    {
+                        std::vector< std::complex<float> > data_in(inelem);
+
+                        //Get the input data for this pixel
+                        for (size_t i = 0; i < inelem; i++) data_in[i] = aptrs[i]->get_data_ptr()[p];
+
+                        //Interpolate the data
+                        Spline<float, std::complex<float> > sp(relative_cycle_time, data_in);
+                        std::vector<std::complex<float> > data_out = sp[recon_cycle_time];
+
+                        //Copy it to the images
+                        for (size_t i = 0; i < outelem; i++) out_data[i]->getObjectPtr()->get_data_ptr()[p] = data_out[i];
+                    }
+                }
+                else
+                {
+                    GadgetronTimer interptime("Interpolation Time using BSpline");
+
+                    size_t SplineDegree = 5;
+
+                    long long p;
+#pragma omp parallel default(none) shared(SplineDegree, imageelem, inelem, outelem, aptrs, relative_cycle_time, recon_cycle_time, out_data) private(p)
+                    {
+                        hoNDArray< std::complex<float> > data_in(inelem);
+                        hoNDArray< std::complex<float> > data_out(outelem);
+
+                        hoNDArray< std::complex<float> > coeff(inelem);
+
+                        hoNDBSpline< std::complex<float>, 1 > interp;
+
+                        size_t i;
+
+                        size_t num = relative_cycle_time.size();
+
+#pragma omp for
+                        for (p = 0; p < (long long)imageelem; p++)
+                        {
+                            //Get the input data for this pixel
+                            for (i = 0; i < inelem; i++) data_in(i) = aptrs[i]->get_data_ptr()[p];
+
+                            // compute the coefficient
+                            interp.computeBSplineCoefficients(data_in, SplineDegree, coeff);
+
+                            //Interpolate the data
+                            for (i = 0; i < outelem; i++)
+                            {
+                                float x = (num-1)*(recon_cycle_time[i]-relative_cycle_time[0])/(relative_cycle_time[num-1] - relative_cycle_time[0]);
+                                data_out(i) = interp.evaluateBSpline(coeff.begin(), inelem, SplineDegree, 0, x);
+                            }
+
+                            //Copy it to the images
+                            for (i = 0; i < outelem; i++) out_data[i]->getObjectPtr()->get_data_ptr()[p] = data_out[i];
+                        }
+                    }
+                }
+
+                //Send out the images
+                for (size_t i = 0; i < out_heads.size(); i++)
+                {
+                    if (this->next()->putq(out_heads[i]) < 0)
+                    {
+                        GDEBUG("Unable to put data on next Gadget's queue\n");
+                        return GADGET_FAIL;
+                    }
+                }
+
+                //We can get rid of the buffered data now
+                buffer_[slc]->flush();
+            }
+        }
+
+        return ret;
+    }
+
+    int PhysioInterpolationGadget::
+        process(GadgetContainerMessage< ISMRMRD::ImageHeader >* m1, GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+    {
+        GadgetContainerMessage<ISMRMRD::ImageHeader>* header = new GadgetContainerMessage<ISMRMRD::ImageHeader>;
+        GadgetContainerMessage< hoNDArray< std::complex<float> > >* img = new GadgetContainerMessage< hoNDArray< std::complex<float> > >;
+
+        (*header->getObjectPtr()) = (*m1->getObjectPtr());
+        (*img->getObjectPtr()) = (*m2->getObjectPtr());
+        header->cont(img);
+
+        GadgetContainerMessage<ISMRMRD::MetaContainer>* m3 = 0;
+        if (m2)
+        {
+            m3 = AsContainerMessage<ISMRMRD::MetaContainer>(m2->cont());
+        }
+
+        if ( m3 )
+        {
+            image_with_attrib_ = true;
+        }
+        else
+        {
+            image_with_attrib_ = false;
+        }
+
+        if ( image_with_attrib_ )
+        {
+            GadgetContainerMessage< ISMRMRD::MetaContainer >* attrib = new GadgetContainerMessage< ISMRMRD::MetaContainer >;
+            (*attrib->getObjectPtr()) = *m3->getObjectPtr();
+            img->cont(attrib);
+        }
+
+        uint16_t slc = header->getObjectPtr()->slice;
+
+        if (buffer_[slc]->enqueue_tail(header) < 0)
+        {
+            GDEBUG("Failed to add image to buffer\n");
+            header->release();
+            return GADGET_FAIL;
+        }
+
+        time_stamps_[slc].push_back( (float)(m1->getObjectPtr()->physiology_time_stamp[phys_time_index_]) );
+
+        if (this->next()->putq(m1) < 0)
+        {
+            GDEBUG("Unable to put data on next Gadget's queue\n");
+            return GADGET_FAIL;
+        }
+
+        return GADGET_OK;
+    }
+
+    GADGET_FACTORY_DECLARE(PhysioInterpolationGadget)
+}
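
For reference, the phase and index bookkeeping in close() above compresses several
conversions into single expressions. The following is a minimal standalone sketch of that
arithmetic, not part of the patch, using hypothetical values (30 phases, slice 0, a readout
falling 40% of the way through cardiac cycle 1, and an assumed cycle length of 800
time-stamp units); it reuses the same rounding constants as the gadget.

    #include <cmath>
    #include <iostream>

    int main()
    {
        // Hypothetical inputs, not taken from real data.
        const unsigned short phases = 30;      // phases_to_reconstruct_
        const unsigned short slc    = 0;       // slice index
        const unsigned int   cycle  = 1;       // current_cycle
        const double recon_cycle_time = 1.4;   // 40% into cycle 1
        const double cycle_length     = 800.0; // assumed R-R length in time-stamp units

        unsigned short phase = static_cast<unsigned short>(
            (recon_cycle_time + 0.0001 - cycle) / (1.0 / phases) + 0.0001);
        unsigned int time_stamp = static_cast<unsigned int>(
            std::floor((recon_cycle_time + 0.0001 - cycle) * cycle_length));
        unsigned short image_index  = static_cast<unsigned short>(phase + 1 + slc * phases);
        unsigned int   series_index = cycle * 10;

        // Prints: phase=12 time_stamp=320 image_index=13 series_index=10
        std::cout << "phase=" << phase << " time_stamp=" << time_stamp
                  << " image_index=" << image_index
                  << " series_index=" << series_index << std::endl;
        return 0;
    }
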
diff --git a/gadgets/mri_core/PhysioInterpolationGadget.h b/gadgets/mri_core/PhysioInterpolationGadget.h
new file mode 100644
index 0000000..be95b40
--- /dev/null
+++ b/gadgets/mri_core/PhysioInterpolationGadget.h
@@ -0,0 +1,59 @@
+#ifndef PhysioInterpolationGadget_H
+#define PhysioInterpolationGadget_H
+
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{  
+
+    class EXPORTGADGETSMRICORE PhysioInterpolationGadget : public Gadget2<ISMRMRD::ImageHeader, hoNDArray< std::complex<float> > >
+    {
+    public:
+        GADGET_DECLARE(PhysioInterpolationGadget);
+
+        PhysioInterpolationGadget();
+        virtual ~PhysioInterpolationGadget();
+
+        inline unsigned short get_number_of_phases() { return phases_to_reconstruct_; }
+
+    protected:
+        GADGET_PROPERTY(physiology_time_index, int, "Physiology time index", 0);
+        GADGET_PROPERTY_LIMITS(mode, int, "Mode, 0=separate series for each RR, 1=First complete RR only", 0, GadgetPropertyLimitsEnumeration, 0, 1);
+        GADGET_PROPERTY(phases, int, "Number of cardiac phases", 30);
+        GADGET_PROPERTY(first_beat_on_trigger, bool, "Indicates that acquisition was started on trigger", false);
+        GADGET_PROPERTY_LIMITS(interp_method, std::string, "Interpolation method", "Spline", GadgetPropertyLimitsEnumeration, "Spline", "BSpline", "");
+        virtual int process_config(ACE_Message_Block* mb);
+
+        virtual int process(GadgetContainerMessage< ISMRMRD::ImageHeader >* m1, GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+
+        virtual int close(unsigned long flags); //All the work is done here in this Gadget
+
+        unsigned short phys_time_index_;
+        unsigned short phases_to_reconstruct_;
+        unsigned short mode_; //0=separate series for each complete RR,
+                              //1=First complete RR interval only
+
+        /// true: the first beat is on the trigger
+        /// false: the first beat will be ignored
+        bool first_beat_on_trigger_;
+
+        // interpolation method, "Spline" or "BSpline"
+        std::string interp_method_;
+
+    private:
+
+        std::vector< boost::shared_ptr< ACE_Message_Queue<ACE_MT_SYNCH> > > buffer_;
+        std::vector< std::vector<float> > time_stamps_;
+
+        size_t slc_limit_;
+
+        bool image_with_attrib_;
+    };
+}
+
+#endif //PhysioInterpolationGadget_H
diff --git a/gadgets/mri_core/RemoveROOversamplingGadget.cpp b/gadgets/mri_core/RemoveROOversamplingGadget.cpp
new file mode 100644
index 0000000..32530bb
--- /dev/null
+++ b/gadgets/mri_core/RemoveROOversamplingGadget.cpp
@@ -0,0 +1,141 @@
+#include "RemoveROOversamplingGadget.h"
+#include "hoNDFFT.h"
+#include "ismrmrd/xml.h"
+
+#ifdef USE_OMP
+    #include "omp.h"
+#endif // USE_OMP
+
+namespace Gadgetron{
+
+    RemoveROOversamplingGadget::RemoveROOversamplingGadget()
+    {
+    }
+
+    RemoveROOversamplingGadget::~RemoveROOversamplingGadget()
+    {
+    }
+
+    int RemoveROOversamplingGadget::process_config(ACE_Message_Block* mb)
+    {
+
+	ISMRMRD::IsmrmrdHeader h;
+	ISMRMRD::deserialize(mb->rd_ptr(),h);
+
+	if (h.encoding.size() == 0) {
+	  GDEBUG("Number of encoding spaces: %d\n", h.encoding.size());
+	  GDEBUG("This Gadget needs an encoding description\n");
+	  return GADGET_FAIL;
+	}
+
+	
+	ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+	ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+
+        encodeNx_  = e_space.matrixSize.x;
+        encodeFOV_ = e_space.fieldOfView_mm.x;
+        reconNx_   = r_space.matrixSize.x;
+        reconFOV_  = r_space.fieldOfView_mm.x;
+
+        // limit the number of OpenMP threads to 1
+#ifdef USE_OMP
+        omp_set_num_threads(1);
+        GDEBUG_STREAM("RemoveROOversamplingGadget:omp_set_num_threads(1) ... ");
+#endif // USE_OMP
+
+        // If the encoding and recon matrix size and FOV are the same
+        // then the data is not oversampled and we can safely pass
+        // the data on to the next gadget
+        if ( (encodeNx_ == reconNx_) && (encodeFOV_ == reconFOV_) )
+        {
+            dowork_ = false;
+        }
+        else {
+            dowork_ = true;
+        }
+
+        return GADGET_OK;
+    }
+
+    int RemoveROOversamplingGadget
+        ::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+        GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+    {
+
+      // If we have work to do, do it, otherwise do nothing
+      if (dowork_) {
+
+        GadgetContainerMessage< hoNDArray< std::complex<float> > >* m3 
+            = new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+
+        if (!m3)
+        {
+            return GADGET_FAIL;
+        }
+
+        std::vector<size_t> data_out_dims = *m2->getObjectPtr()->get_dimensions();
+        if ( !ifft_buf_.dimensions_equal(&data_out_dims) )
+        {
+            ifft_buf_.create(data_out_dims);
+            ifft_res_.create(data_out_dims);
+        }
+
+        float ratioFOV = encodeFOV_/reconFOV_;
+
+        data_out_dims[0] = (size_t)(data_out_dims[0]/ratioFOV);
+        if ( !fft_buf_.dimensions_equal(&data_out_dims) )
+        {
+            fft_buf_.create(data_out_dims);
+            fft_res_.create(data_out_dims);
+        }
+
+        try{ m3->getObjectPtr()->create(&data_out_dims);}
+        catch (std::runtime_error &err)
+        {
+            GEXCEPTION(err,"Unable to create new data array for downsampled data\n");
+            return GADGET_FAIL;
+        }
+
+        size_t sRO = m2->getObjectPtr()->get_size(0);
+        size_t start = (size_t)( (m2->getObjectPtr()->get_size(0)-data_out_dims[0])/ratioFOV );
+
+        size_t dRO = m3->getObjectPtr()->get_size(0);
+        size_t numOfBytes = data_out_dims[0]*sizeof(std::complex<float>);
+
+        int c;
+
+        int CHA = (int)(data_out_dims[1]);
+
+        std::complex<float>* data_in, *data_out;
+
+        hoNDFFT<float>::instance()->ifft(m2->getObjectPtr(), 0);
+        data_in  = m2->getObjectPtr()->get_data_ptr();
+        data_out = m3->getObjectPtr()->get_data_ptr();
+
+        for ( c=0; c<CHA; c++)
+        {
+            memcpy( data_out+c*dRO, data_in+c*sRO+start, numOfBytes );
+        }
+
+        hoNDFFT<float>::instance()->fft(m3->getObjectPtr(), 0);
+
+        m2->release(); //We are done with this data
+
+        m1->cont(m3);
+        m1->getObjectPtr()->number_of_samples = data_out_dims[0];
+        m1->getObjectPtr()->center_sample = (uint16_t)(m1->getObjectPtr()->center_sample/ratioFOV);
+
+      } // end if (dowork_)
+
+      if (this->next()->putq(m1) == -1)
+      {
+        GERROR("RemoveROOversamplingGadget::process, failed to pass data on to next gadget");
+        return GADGET_FAIL;
+      }
+
+      return GADGET_OK;
+    }
+
+
+    GADGET_FACTORY_DECLARE(RemoveROOversamplingGadget)
+}
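
For reference, the gadget above removes readout oversampling by transforming along the
readout dimension into image space with hoNDFFT, keeping only the central reconNx_ samples
(centred for the usual two-fold oversampling), and transforming back to k-space. The crop
is the only non-FFT step; a minimal sketch of it on a single channel, with the FFT calls
omitted and made-up sizes, follows.

    #include <complex>
    #include <cstring>
    #include <iostream>
    #include <vector>

    int main()
    {
        // Hypothetical sizes: 256 oversampled readout points, FOV ratio 2 -> keep 128.
        const size_t sRO = 256;
        const float ratioFOV = 2.0f;
        const size_t dRO = static_cast<size_t>(sRO / ratioFOV);
        const size_t start = static_cast<size_t>((sRO - dRO) / ratioFOV); // offset formula used by the gadget

        std::vector<std::complex<float> > in(sRO), out(dRO);
        for (size_t i = 0; i < sRO; i++) in[i] = std::complex<float>(static_cast<float>(i), 0.0f);

        // In the gadget this copy happens after the inverse FFT along dimension 0
        // and is followed by a forward FFT on the cropped array.
        std::memcpy(&out[0], &in[0] + start, dRO * sizeof(std::complex<float>));

        std::cout << "kept samples " << start << " .. " << (start + dRO - 1) << std::endl; // 64 .. 191
        return 0;
    }
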
diff --git a/gadgets/mri_core/RemoveROOversamplingGadget.h b/gadgets/mri_core/RemoveROOversamplingGadget.h
new file mode 100644
index 0000000..1286318
--- /dev/null
+++ b/gadgets/mri_core/RemoveROOversamplingGadget.h
@@ -0,0 +1,43 @@
+#pragma once
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+    class EXPORTGADGETSMRICORE RemoveROOversamplingGadget :
+        public Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > >
+    {
+    public:
+        GADGET_DECLARE(RemoveROOversamplingGadget);
+
+        RemoveROOversamplingGadget();
+        virtual ~RemoveROOversamplingGadget();
+
+    protected:
+
+        virtual int process_config(ACE_Message_Block* mb);
+
+        virtual int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+            GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+
+        hoNDArray< std::complex<float> > fft_res_;
+        hoNDArray< std::complex<float> > ifft_res_;
+
+        hoNDArray< std::complex<float> > fft_buf_;
+        hoNDArray< std::complex<float> > ifft_buf_;
+
+        int   encodeNx_;
+        float encodeFOV_;
+        int   reconNx_;
+        float reconFOV_;
+
+        // if true the gadget performs the operation
+        // otherwise, it just passes the data on
+        bool dowork_;
+    };
+}
diff --git a/gadgets/mri_core/SimpleReconGadget.cpp b/gadgets/mri_core/SimpleReconGadget.cpp
new file mode 100644
index 0000000..603e0a9
--- /dev/null
+++ b/gadgets/mri_core/SimpleReconGadget.cpp
@@ -0,0 +1,154 @@
+#include "SimpleReconGadget.h"
+#include "hoNDFFT.h"
+#include "hoNDArray_math.h"
+
+namespace Gadgetron{
+
+SimpleReconGadget::SimpleReconGadget()
+  : image_counter_(0)
+{
+}
+
+
+int SimpleReconGadget::process( GadgetContainerMessage<IsmrmrdReconData>* m1)
+{
+    
+    //Iterate over all the recon bits
+    for(std::vector<IsmrmrdReconBit>::iterator it = m1->getObjectPtr()->rbit_.begin();
+        it != m1->getObjectPtr()->rbit_.end(); ++it)
+    {
+        //Grab a reference to the buffer containing the imaging data
+        //We are ignoring the reference data
+        IsmrmrdDataBuffered & dbuff = it->data_;
+
+        //Data 7D, fixed order [E0, E1, E2, CHA, N, S, LOC]
+        uint16_t E0 = dbuff.data_.get_size(0);
+        uint16_t E1 = dbuff.data_.get_size(1);
+        uint16_t E2 = dbuff.data_.get_size(2);
+        uint16_t CHA = dbuff.data_.get_size(3);
+        uint16_t N = dbuff.data_.get_size(4);
+        uint16_t S = dbuff.data_.get_size(5);
+        uint16_t LOC = dbuff.data_.get_size(6);
+      
+        //Create an image array message
+        GadgetContainerMessage<IsmrmrdImageArray>* cm1 = 
+                new GadgetContainerMessage<IsmrmrdImageArray>();
+
+        //Grab references to the image array data and headers
+        IsmrmrdImageArray & imarray = *cm1->getObjectPtr();
+
+        //The image array data will be [E0,E1,E2,1,N,S,LOC] big
+        //Will collapse across coils at the end
+        std::vector<size_t> data_dims(7);
+        data_dims[0] = E0;
+        data_dims[1] = E1;
+        data_dims[2] = E2;
+        data_dims[3] = 1;
+        data_dims[4] = N;
+        data_dims[5] = S;
+        data_dims[6] = LOC;        
+        imarray.data_.create(&data_dims);
+        
+        //ImageHeaders will be [N, S, LOC]
+        std::vector<size_t> header_dims(3);
+        header_dims[0] = N;
+        header_dims[1] = S;
+        header_dims[2] = LOC;        
+        imarray.headers_.create(&header_dims);
+
+        //We will not add any meta data
+        //so skip the meta_ part
+        
+        //Loop over S and N and LOC
+        for (uint16_t loc=0; loc < LOC; loc++) {
+            for (uint16_t s=0; s < S; s++) {                
+                for (uint16_t n=0; n < N; n++) {
+                    
+                    //Set some information into the image header
+                    //Use the middle acquisition header for some info
+                    //[E1, E2, N, S, LOC]
+                    ISMRMRD::AcquisitionHeader & acqhdr = dbuff.headers_(dbuff.sampling_.sampling_limits_[1].center_,
+                                                                         dbuff.sampling_.sampling_limits_[2].center_,
+                                                                         n, s, loc);                    
+                    imarray.headers_(n,s,loc).matrix_size[0]     = E0;
+                    imarray.headers_(n,s,loc).matrix_size[1]     = E1;
+                    imarray.headers_(n,s,loc).matrix_size[2]     = E2;
+                    imarray.headers_(n,s,loc).field_of_view[0]   = dbuff.sampling_.recon_FOV_[0];
+                    imarray.headers_(n,s,loc).field_of_view[1]   = dbuff.sampling_.recon_FOV_[1];
+                    imarray.headers_(n,s,loc).field_of_view[2]   = dbuff.sampling_.recon_FOV_[2];
+                    imarray.headers_(n,s,loc).channels           = 1;                    
+                    imarray.headers_(n,s,loc).average = acqhdr.idx.average;
+                    imarray.headers_(n,s,loc).slice = acqhdr.idx.slice;
+                    imarray.headers_(n,s,loc).contrast = acqhdr.idx.contrast;
+                    imarray.headers_(n,s,loc).phase = acqhdr.idx.phase;
+                    imarray.headers_(n,s,loc).repetition = acqhdr.idx.repetition;
+                    imarray.headers_(n,s,loc).set = acqhdr.idx.set;
+                    imarray.headers_(n,s,loc).acquisition_time_stamp = acqhdr.acquisition_time_stamp;
+                    imarray.headers_(n,s,loc).position[0] = acqhdr.position[0];
+                    imarray.headers_(n,s,loc).position[1] = acqhdr.position[1];
+                    imarray.headers_(n,s,loc).position[2] = acqhdr.position[2];
+                    imarray.headers_(n,s,loc).read_dir[0] = acqhdr.read_dir[0];
+                    imarray.headers_(n,s,loc).read_dir[1] = acqhdr.read_dir[1];
+                    imarray.headers_(n,s,loc).read_dir[2] = acqhdr.read_dir[2];
+                    imarray.headers_(n,s,loc).phase_dir[0] = acqhdr.phase_dir[0];
+                    imarray.headers_(n,s,loc).phase_dir[1] = acqhdr.phase_dir[1];
+                    imarray.headers_(n,s,loc).phase_dir[2] = acqhdr.phase_dir[2];
+                    imarray.headers_(n,s,loc).slice_dir[0] = acqhdr.slice_dir[0];
+                    imarray.headers_(n,s,loc).slice_dir[1] = acqhdr.slice_dir[1];
+                    imarray.headers_(n,s,loc).slice_dir[2] = acqhdr.slice_dir[2];
+                    imarray.headers_(n,s,loc).patient_table_position[0] = acqhdr.patient_table_position[0];
+                    imarray.headers_(n,s,loc).patient_table_position[1] = acqhdr.patient_table_position[1];
+                    imarray.headers_(n,s,loc).patient_table_position[2] = acqhdr.patient_table_position[2];
+                    imarray.headers_(n,s,loc).data_type = ISMRMRD::ISMRMRD_CXFLOAT;
+                    imarray.headers_(n,s,loc).image_index = ++image_counter_;
+
+                    //Grab a wrapper around the relevant chunk of data [E0,E1,E2,CHA] for this loc, n, and s
+                    //Each chunk will be [E0,E1,E2,CHA] big
+                    std::vector<size_t> chunk_dims(4);
+                    chunk_dims[0] = E0;
+                    chunk_dims[1] = E1;
+                    chunk_dims[2] = E2;
+                    chunk_dims[3] = CHA;
+                    hoNDArray<std::complex<float> > chunk = hoNDArray<std::complex<float> >(chunk_dims, &dbuff.data_(0,0,0,0,n,s,loc));
+
+                    //Do the FFTs in place
+                    hoNDFFT<float>::instance()->ifft(&chunk,0);
+                    hoNDFFT<float>::instance()->ifft(&chunk,1);
+                    if (E2>1) {
+                        hoNDFFT<float>::instance()->ifft(&chunk,2);
+                    }
+
+                    //Square root of the sum of squares
+                    //Each image will be [E0,E1,E2,1] big
+                    std::vector<size_t> img_dims(3);
+                    img_dims[0] = E0;
+                    img_dims[1] = E1;
+                    img_dims[2] = E2;
+                    hoNDArray<std::complex<float> > output = hoNDArray<std::complex<float> >(img_dims, &imarray.data_(0,0,0,0,n,s,loc));
+                    //Zero out the output
+                    clear(output);
+
+                    //Compute d* d in place
+                    multiplyConj(chunk,chunk,chunk);                    
+                    //Add up
+                    for (size_t c = 0; c < CHA; c++) {
+                        output += hoNDArray<std::complex<float> >(img_dims, &chunk(0,0,0,c));
+                    }                    
+                    //Take the square root in place
+                    sqrt_inplace(&output);                    
+               }
+            }
+        }
+
+        //Pass the image array down the chain
+        if (this->next()->putq(cm1) < 0) {
+            return GADGET_FAIL;
+        }
+
+    }
+    return GADGET_OK;  
+
+}
+
+GADGET_FACTORY_DECLARE(SimpleReconGadget)
+}
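
For reference, the per-(n, s, loc) loop above is an inverse FFT followed by a
root-sum-of-squares coil combination. A minimal sketch of the combination step for a single
pixel across the coil channels, using plain std::vector instead of hoNDArray and made-up
values, follows (the gadget keeps the result as a complex array; only the magnitude
arithmetic is shown).

    #include <cmath>
    #include <complex>
    #include <iostream>
    #include <vector>

    int main()
    {
        // Hypothetical per-pixel coil values after the inverse FFTs.
        std::vector<std::complex<float> > coil_values;
        coil_values.push_back(std::complex<float>(3.0f, 0.0f));
        coil_values.push_back(std::complex<float>(0.0f, 4.0f));

        // Scalar analogue of multiplyConj, the channel sum and sqrt_inplace:
        // accumulate |x|^2 over channels and take the square root.
        float sum_sq = 0.0f;
        for (size_t c = 0; c < coil_values.size(); c++)
            sum_sq += std::norm(coil_values[c]);

        std::cout << "combined magnitude = " << std::sqrt(sum_sq) << std::endl; // 5
        return 0;
    }
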
diff --git a/gadgets/mri_core/SimpleReconGadget.h b/gadgets/mri_core/SimpleReconGadget.h
new file mode 100644
index 0000000..36ca42e
--- /dev/null
+++ b/gadgets/mri_core/SimpleReconGadget.h
@@ -0,0 +1,23 @@
+#ifndef SIMPLERECONGADGET_H
+#define SIMPLERECONGADGET_H
+
+#include "Gadget.h"
+#include "gadgetron_mricore_export.h"
+
+#include "mri_core_data.h"
+
+namespace Gadgetron{
+
+  class EXPORTGADGETSMRICORE SimpleReconGadget : 
+  public Gadget1<IsmrmrdReconData>
+    {
+    public:
+      GADGET_DECLARE(SimpleReconGadget)
+      SimpleReconGadget();
+	
+    protected:
+      virtual int process(GadgetContainerMessage<IsmrmrdReconData>* m1);
+      long long image_counter_;
+    };
+}
+#endif //SIMPLERECONGADGET_H
diff --git a/gadgets/mri_core/Spline.h b/gadgets/mri_core/Spline.h
new file mode 100644
index 0000000..3fbc058
--- /dev/null
+++ b/gadgets/mri_core/Spline.h
@@ -0,0 +1,130 @@
+/* "THE BEER-WARE LICENSE" (Revision 42): Devin Lane wrote this file. As long as you retain 
+ * this notice you can do whatever you want with this stuff. If we meet some day, and you
+ * think this stuff is worth it, you can buy me a beer in return. */
+
+#include <vector>
+#include <iostream>
+#include <algorithm> // for std::lower_bound, used by interpolate()
+
+/** Templated on the types X and Y. X and Y must support the operators +, -, * and /. Y must
+ * define a constructor that takes a scalar. */
+template <typename X, typename Y>
+  class Spline {
+ public:
+  /** An empty, invalid spline */
+  Spline() {}
+    
+  /** A spline with x and y values */
+  Spline(const std::vector<X>& x, const std::vector<Y>& y) {
+    if (x.size() != y.size()) {
+      std::cerr << "X and Y must be the same size " << std::endl;
+      return;
+    }
+        
+    if (x.size() < 3) {
+      std::cerr << "Must have at least three points for interpolation" << std::endl;
+      return;
+    }
+        
+    typedef typename std::vector<X>::difference_type size_type;
+        
+    size_type n = y.size() - 1;
+        
+    std::vector<Y> b(n), d(n), a(n), c(n+1), l(n+1), u(n+1), z(n+1);
+    std::vector<X> h(n+1);
+
+    l[0] = Y(1);
+    u[0] = Y(0);
+    z[0] = Y(0);
+    h[0] = x[1] - x[0];
+            
+    for (size_type i = 1; i < n; i++) {
+      h[i] = x[i+1] - x[i];
+      l[i] = Y(2 * (x[i+1] - x[i-1])) - Y(h[i-1]) * u[i-1];
+      u[i] = Y(h[i]) / l[i];
+      a[i] = (Y(3) / Y(h[i])) * (y[i+1] - y[i]) - (Y(3) / Y(h[i-1])) * (y[i] - y[i-1]);
+      z[i] = (a[i] - Y(h[i-1]) * z[i-1]) / l[i];
+    }
+            
+    l[n] = Y(1);
+    z[n] = c[n] = Y(0);
+        
+    for (size_type j = n-1; j >= 0; j--) {
+      c[j] = z[j] - u[j] * c[j+1];
+      b[j] = (y[j+1] - y[j]) / Y(h[j]) - (Y(h[j]) * (c[j+1] + Y(2) * c[j])) / Y(3);
+      d[j] = (c[j+1] - c[j]) / Y(3 * h[j]);
+    }
+        
+    for (size_type i = 0; i < n; i++) {
+      mElements.push_back(Element(x[i], y[i], b[i], c[i], d[i]));
+    }        
+  }
+  virtual ~Spline() {}
+    
+  Y operator[](const X& x) const {
+    return interpolate(x);
+  }
+    
+  Y interpolate(const X&x) const {
+    if (mElements.size() == 0) return Y();
+        
+    typename std::vector<element_type>::const_iterator it;
+    it = std::lower_bound(mElements.begin(), mElements.end(), element_type(x));
+    if (it != mElements.begin()) {
+      it--;
+    }   
+            
+    return it->eval(x);
+  }
+    
+  std::vector<Y> operator[](const std::vector<X>& xx) const {
+    return interpolate(xx);
+  }
+    
+  /* Evaluate at multiple locations, assuming xx is sorted ascending */
+  std::vector<Y> interpolate(const std::vector<X>& xx) const {
+    if (mElements.size() == 0) return std::vector<Y>(xx.size());
+        
+    typename std::vector<X>::const_iterator it;
+    typename std::vector<element_type>::const_iterator it2;
+    it2 = mElements.begin();
+    std::vector<Y> ys;
+    for (it = xx.begin(); it != xx.end(); it++) {
+      it2 = std::lower_bound(it2, mElements.end(), element_type(*it));
+      if (it2 != mElements.begin()) {
+	it2--;
+      }
+                
+      ys.push_back(it2->eval(*it));
+    }
+
+    return ys;
+  }
+
+ protected:
+    
+  class Element {
+  public:
+  Element(X _x) : x(_x) {}
+    Element(X _x, Y _a, Y _b, Y _c, Y _d)
+      : x(_x), a(_a), b(_b), c(_c), d(_d) {}
+        
+    Y eval(const X& xx) const {
+      X xix(xx - x);
+      return a + b * xix + c * (xix * xix) + d * (xix * xix * xix);
+    }
+        
+    bool operator<(const Element& e) const {
+      return x < e.x;
+    }
+    bool operator<(const X& xx) const {
+      return x < xx;
+    }
+        
+    X x;
+    Y a, b, c, d;
+  };
+            
+  typedef Element element_type;
+  std::vector<element_type> mElements;
+};
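
For reference, Spline implements a natural cubic spline whose coefficients come from the
tridiagonal solve in the constructor above. A minimal usage sketch follows; the include
path and the sample values are illustrative only.

    #include "Spline.h"

    #include <iostream>
    #include <vector>

    int main()
    {
        // At least three points are required by the constructor.
        std::vector<float> x, y;
        for (int i = 0; i <= 4; i++) { x.push_back((float)i); y.push_back((float)(i * i)); }

        Spline<float, float> sp(x, y);

        // Single-point evaluation.
        std::cout << "f(1.5) ~ " << sp[1.5f] << std::endl;

        // Vector evaluation; query points must be sorted ascending. This is the form
        // PhysioInterpolationGadget uses per pixel (with Y = std::complex<float>).
        std::vector<float> query;
        query.push_back(0.5f); query.push_back(2.5f); query.push_back(3.5f);
        std::vector<float> values = sp[query];
        for (size_t i = 0; i < values.size(); i++)
            std::cout << "f(" << query[i] << ") ~ " << values[i] << std::endl;
        return 0;
    }
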
diff --git a/gadgets/mri_core/WhiteNoiseInjectorGadget.cpp b/gadgets/mri_core/WhiteNoiseInjectorGadget.cpp
new file mode 100644
index 0000000..dc251c4
--- /dev/null
+++ b/gadgets/mri_core/WhiteNoiseInjectorGadget.cpp
@@ -0,0 +1,195 @@
+#include "WhiteNoiseInjectorGadget.h"
+#include "gtPlusUtil.h"
+#include <array>
+#include "ismrmrd/xml.h"
+
+namespace Gadgetron
+{
+
+WhiteNoiseInjectorGadget::WhiteNoiseInjectorGadget() : noise_mean_(0), noise_std_(1.0f)
+{
+    add_noise_ref_ = true;
+    randn_ = new RandGenType();
+
+    acceFactorE1_ = 1;
+    acceFactorE2_ = 1;
+
+    is_interleaved_ = false;
+    is_embeded_ = false;
+    is_seperate_ = false;
+    is_external_ = false;
+    is_other_ = false;
+    is_no_acceleration_ = false;
+}
+
+WhiteNoiseInjectorGadget::~WhiteNoiseInjectorGadget()
+{
+    delete randn_;
+}
+
+int WhiteNoiseInjectorGadget::process_config(ACE_Message_Block* mb)
+{
+    noise_mean_ = noise_mean.value();
+    noise_std_ = noise_std.value();
+    add_noise_ref_ = add_noise_ref.value();
+
+    GDEBUG_STREAM("noise mean is " << noise_mean_);
+    GDEBUG_STREAM("noise std is " << noise_std_);
+    GDEBUG_STREAM("add_noise_ref is " << add_noise_ref_);
+
+    randn_->setPara(noise_mean_, noise_std_);
+
+    // get the current time and generate a seed
+    time_t rawtime;
+    struct tm * timeinfo;
+    time ( &rawtime );
+    timeinfo = localtime ( &rawtime );
+
+    long long seed = (long long)(1e10*(timeinfo->tm_year+1900) + 1e8*(timeinfo->tm_mon+1) + 1e6*timeinfo->tm_mday + 1e4*timeinfo->tm_hour + 1e2*timeinfo->tm_min + timeinfo->tm_sec + std::rand());
+
+    std::array<unsigned int, 10> sequence;
+    sequence[0] = (unsigned int)(1e10*(timeinfo->tm_year+1900));
+    sequence[1] = (unsigned int)(1e8*(timeinfo->tm_mon+1));
+    sequence[2] = (unsigned int)(1e6*timeinfo->tm_mday);
+    sequence[3] = (unsigned int)(1e4*timeinfo->tm_hour);
+    sequence[4] = (unsigned int)(1e2*timeinfo->tm_min);
+    sequence[5] = (unsigned int)(timeinfo->tm_sec);
+
+    std::srand( (unsigned int)seed );
+    sequence[6] = (unsigned int)(std::rand());
+    sequence[7] = (unsigned int)(std::rand());
+    sequence[8] = (unsigned int)(std::rand());
+    sequence[9] = (unsigned int)(std::rand());
+
+    std::seed_seq seedSeq(sequence.begin(), sequence.end());
+    randn_->getRandomer().seed(seedSeq);
+
+    randn_->seed( (unsigned long)seed );
+
+// ---------------------------------------------------------------------------------------------------------
+    ISMRMRD::IsmrmrdHeader h;
+    try {
+      deserialize(mb->rd_ptr(),h);
+    } catch (...) {
+      GDEBUG("Error parsing ISMRMRD Header\n");
+      // rethrow so the caller sees the parse failure
+      throw;
+    }
+
+    if( h.encoding.size() != 1)
+    {
+      GDEBUG("Number of encoding spaces: %d\n", h.encoding.size());
+      GDEBUG("This simple WhiteNoiseInjectorGadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+    if (!h.encoding[0].parallelImaging) {
+      GDEBUG("Parallel Imaging section not found in header");
+      return GADGET_FAIL;
+    }
+
+    ISMRMRD::ParallelImaging p_imaging = *h.encoding[0].parallelImaging;
+
+    acceFactorE1_ = (double)(p_imaging.accelerationFactor.kspace_encoding_step_1);
+    acceFactorE2_ = (double)(p_imaging.accelerationFactor.kspace_encoding_step_2);
+
+    GDEBUG_STREAM("acceFactorE1_ is " << acceFactorE1_);
+    GDEBUG_STREAM("acceFactorE2_ is " << acceFactorE2_);
+
+    if ( !p_imaging.calibrationMode.is_present() )
+    {
+        GDEBUG("Parallel Imaging calibrationMode not found in header");
+        return GADGET_FAIL;
+    }
+
+    std::string calib = *p_imaging.calibrationMode;
+    if ( calib.compare("interleaved") == 0 )
+    {
+      is_interleaved_ = true;
+      GDEBUG_STREAM("Calibration mode is interleaved");
+    } else if ( calib.compare("embedded") == 0 ) {
+      is_embeded_ = true;
+      GDEBUG_STREAM("Calibration mode is embedded");
+    } else if ( calib.compare("separate") == 0 ) {
+      is_seperate_ = true;
+      GDEBUG_STREAM("Calibration mode is separate");
+    } else if ( calib.compare("external") == 0 ) {
+      is_external_ = true;
+      GDEBUG_STREAM("Calibration mode is external");
+    } else if ( (calib.compare("other") == 0)) {
+      is_other_ = true;
+      GDEBUG_STREAM("Calibration mode is other");
+    } else {
+      GDEBUG("Failed to process parallel imaging calibration mode");
+      return GADGET_FAIL;
+    }
+    
+    return GADGET_OK;
+}
+
+int WhiteNoiseInjectorGadget::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+    bool is_noise = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_NOISE_MEASUREMENT).isSet(m1->getObjectPtr()->flags);
+    bool is_scc_correction = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_SURFACECOILCORRECTIONSCAN_DATA).isSet(m1->getObjectPtr()->flags);
+
+    bool is_ref = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_PARALLEL_CALIBRATION).isSet(m1->getObjectPtr()->flags);
+    bool is_ref_kspace = ISMRMRD::FlagBit(ISMRMRD::ISMRMRD_ACQ_IS_PARALLEL_CALIBRATION_AND_IMAGING).isSet(m1->getObjectPtr()->flags);
+
+    size_t channels = m1->getObjectPtr()->active_channels;
+    size_t samples = m1->getObjectPtr()->number_of_samples;
+
+    if (!is_noise && !is_scc_correction )
+    {
+        bool add_noise = true;
+        if ( is_ref && !is_ref_kspace && (is_seperate_||is_external_) )
+        {
+            add_noise = add_noise_ref_;
+
+            if ( !add_noise )
+            {
+                GDEBUG_STREAM("WhiteNoiseInjectorGadget, noise is not added to the ref acquisitions ... ");
+            }
+        }
+
+        if ( add_noise )
+        {
+            if ( !noise_.dimensions_equal(m2->getObjectPtr()) )
+            {
+                noise_.create(m2->getObjectPtr()->get_dimensions());
+                noise_fl_.create(m2->getObjectPtr()->get_dimensions());
+            }
+
+            if ( !randn_->gen(noise_) )
+            {
+                GERROR_STREAM("WhiteNoiseInjectorGadget, randn_->gen(noise_) failed ... ");
+                return GADGET_FAIL;
+            }
+
+            if ( !noise_fl_.copyFrom(noise_) )
+            {
+                GERROR_STREAM("WhiteNoiseInjectorGadget, noise_fl_.copyFrom(noise_) failed ... ");
+                return GADGET_FAIL;
+            }
+
+            try
+            {
+                Gadgetron::add(*m2->getObjectPtr(), noise_fl_, *m2->getObjectPtr());
+            }
+            catch(...)
+            {
+                GERROR_STREAM("WhiteNoiseInjectorGadget, Gadgetron::add(*m2->getObjectPtr(), noise_, *m2->getObjectPtr()) failed ... ");
+                return GADGET_FAIL;
+            }
+        }
+    }
+
+    if (this->next()->putq(m1) == -1) 
+    {
+      GERROR("WhiteNoiseInjectorGadget::process, failed to pass data on to next gadget");
+      return -1;
+    }
+
+    return GADGET_OK;
+}
+
+GADGET_FACTORY_DECLARE(WhiteNoiseInjectorGadget)
+}
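
For reference, the gadget above draws zero-mean Gaussian noise with gtPlusRandNorm
(declared in gtPlusUtil.h, which is not part of this hunk) and adds it to every imaging
readout. A rough standalone sketch of the same idea using std::normal_distribution and
made-up data follows; it is only an approximation of the gadget's behaviour.

    #include <complex>
    #include <iostream>
    #include <random>
    #include <vector>

    int main()
    {
        // Made-up noise parameters and a hypothetical readout of 8 samples.
        const float noise_mean = 0.0f;
        const float noise_std  = 1.0f;

        std::vector<std::complex<float> > readout(8, std::complex<float>(100.0f, 0.0f));

        std::mt19937 engine(12345); // the gadget seeds from the wall clock and std::rand()
        std::normal_distribution<float> dist(noise_mean, noise_std);

        // Roughly the effect of randn_->gen(noise_) followed by Gadgetron::add(data, noise, data):
        // independent Gaussian noise is added to the real and imaginary parts of each sample.
        for (size_t i = 0; i < readout.size(); i++)
            readout[i] += std::complex<float>(dist(engine), dist(engine));

        std::cout << "first noisy sample: " << readout[0] << std::endl;
        return 0;
    }
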
diff --git a/gadgets/mri_core/WhiteNoiseInjectorGadget.h b/gadgets/mri_core/WhiteNoiseInjectorGadget.h
new file mode 100644
index 0000000..e2f8835
--- /dev/null
+++ b/gadgets/mri_core/WhiteNoiseInjectorGadget.h
@@ -0,0 +1,65 @@
+
+#pragma once
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "ismrmrd/ismrmrd.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "gadgetron_mricore_export.h"
+
+namespace Gadgetron { namespace gtPlus {
+    template <typename T> class gtPlusRandNorm;
+}}
+
+namespace Gadgetron
+{
+
+/// add white noise to the kspace data
+class EXPORTGADGETSMRICORE WhiteNoiseInjectorGadget : public Gadgetron::Gadget2<ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+{
+public:
+
+    GADGET_DECLARE(WhiteNoiseInjectorGadget);
+
+    typedef Gadgetron::gtPlus::gtPlusRandNorm<double> RandGenType;
+
+    WhiteNoiseInjectorGadget();
+    virtual ~WhiteNoiseInjectorGadget();
+
+protected:
+    GADGET_PROPERTY(noise_mean, float, "Noise mean", 0.0);
+    GADGET_PROPERTY(noise_std, float, "Noise standard deviation", 0.0);
+    GADGET_PROPERTY(add_noise_ref, bool, "Add noise to reference scans", false);
+
+    virtual int process_config(ACE_Message_Block* mb);
+
+    virtual int process(Gadgetron::GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+        Gadgetron::GadgetContainerMessage< Gadgetron::hoNDArray< std::complex<float> > >* m2);
+
+    /// whether to add noise to ref acquisition
+    bool add_noise_ref_;
+
+    /// noise mean and standard deviation
+    float noise_mean_;
+    float noise_std_;
+
+    /// random noise generator
+    RandGenType* randn_;
+
+    /// helper memory to store noise
+    hoNDArray< std::complex<double> > noise_;
+    hoNDArray< std::complex<float> > noise_fl_;
+
+    /// calibration mode and rate
+    size_t acceFactorE1_;
+    size_t acceFactorE2_;
+
+    bool is_interleaved_;
+    bool is_embeded_;
+    bool is_seperate_;
+    bool is_external_;
+    bool is_other_;
+    bool is_no_acceleration_;
+};
+
+}
diff --git a/gadgets/mri_core/default.xml b/gadgets/mri_core/default.xml
new file mode 100644
index 0000000..bfd3eef
--- /dev/null
+++ b/gadgets/mri_core/default.xml
@@ -0,0 +1,80 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+        
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+  
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>AccTrig</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AcquisitionAccumulateTriggerGadget</classname>
+        <property>
+            <name>trigger_dimension</name>
+            <value>repetition</value>
+        </property>
+        <property>
+          <name>sorting_dimension</name>
+          <value>slice</value>
+        </property>
+    </gadget>
+
+    <gadget>
+        <name>Buff</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>BucketToBufferGadget</classname>
+        <property>
+            <name>N_dimension</name>
+            <value></value>
+        </property>
+        <property>
+          <name>S_dimension</name>
+          <value></value>
+        </property>
+        <property>
+          <name>split_slices</name>
+          <value>true</value>
+        </property>
+    </gadget>
+
+     <gadget>
+      <name>SimpleRecon</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>SimpleReconGadget</classname>
+     </gadget>
+
+     <gadget>
+      <name>ImageArraySplit</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageArraySplitGadget</classname>
+     </gadget>
+
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>  
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/mri_core/default_measurement_dependencies.xml b/gadgets/mri_core/default_measurement_dependencies.xml
new file mode 100644
index 0000000..bfba2f9
--- /dev/null
+++ b/gadgets/mri_core/default_measurement_dependencies.xml
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+                  xmlns="http://gadgetron.sf.net/gadgetron"
+                  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- SNR unit noise adjust gadget 
+         If the scan has asymmetric readout, the center of the echo will be shifted to index 0
+         Zeros will be filled into the readout data
+    -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+
+        <!-- File prefix for stored noise prewhitener matrix -->
+        <property>
+            <name>noise_dependency_prefix</name>
+            <value>GadgetronNoiseCovarianceMatrix</value>
+        </property>
+
+        <!-- Whether to pass the nonconformant data on without applying the prewhitening -->
+        <property>
+            <name>pass_nonconformant_data</name>
+            <value>true</value>
+        </property>
+    </gadget>
+
+</gadgetronStreamConfiguration>
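
The NoiseAdjustGadget configured above is not part of this hunk. As a rough illustration of
what putting the data in SNR (unit noise) units means, the sketch below whitens one
two-channel sample with a made-up diagonal noise covariance; the gadget itself estimates
the full channel noise covariance from the noise scan and whitens the readouts with it.

    #include <cmath>
    #include <complex>
    #include <iostream>

    int main()
    {
        // Made-up noise variances for two uncorrelated channels (a diagonal covariance).
        const float var_ch0 = 2.0f;
        const float var_ch1 = 0.5f;

        // One k-space sample per channel.
        std::complex<float> s0(4.0f, 0.0f), s1(1.0f, 1.0f);

        // With a diagonal covariance, prewhitening reduces to scaling each channel
        // by 1/sqrt(variance), so the noise in every channel has unit variance.
        std::complex<float> w0 = s0 / std::sqrt(var_ch0);
        std::complex<float> w1 = s1 / std::sqrt(var_ch1);

        std::cout << "whitened sample: " << w0 << " " << w1 << std::endl;
        return 0;
    }
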
diff --git a/gadgets/mri_core/default_optimized.xml b/gadgets/mri_core/default_optimized.xml
new file mode 100644
index 0000000..5b56280
--- /dev/null
+++ b/gadgets/mri_core/default_optimized.xml
@@ -0,0 +1,123 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+
+  <writer>
+    <slot>1022</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriter</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>RemoveROOversampling</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>RemoveROOversamplingGadget</classname>
+  </gadget>
+
+    <gadget>
+        <name>AccTrig</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AcquisitionAccumulateTriggerGadget</classname>
+        <property>
+            <name>trigger_dimension</name>
+            <value>repetition</value>
+        </property>
+        <property>
+          <name>sorting_dimension</name>
+          <value>slice</value>
+        </property>
+    </gadget>
+
+    <gadget>
+        <name>Buff</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>BucketToBufferGadget</classname>
+        <property>
+            <name>N_dimension</name>
+            <value></value>
+        </property>
+        <property>
+          <name>S_dimension</name>
+          <value></value>
+        </property>
+        <property>
+          <name>split_slices</name>
+          <value>true</value>
+        </property>
+    </gadget>
+
+     <gadget>
+      <name>SimpleRecon</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>SimpleReconGadget</classname>
+     </gadget>
+
+     <gadget>
+      <name>ImageArraySplit</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageArraySplitGadget</classname>
+     </gadget>
+
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetCPLX</classname>
+      </gadget>
+  -->
+  
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+  </gadget>
+
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->
+  
+  <!--
+      <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+      </gadget>
+  -->
+
+  <gadget>
+    <name>ImageFinish</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadget</classname>
+  </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/mri_core/default_short.xml b/gadgets/mri_core/default_short.xml
new file mode 100644
index 0000000..62bd6c6
--- /dev/null
+++ b/gadgets/mri_core/default_short.xml
@@ -0,0 +1,92 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+        
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+ 
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>AccTrig</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>AcquisitionAccumulateTriggerGadget</classname>
+        <property>
+            <name>trigger_dimension</name>
+            <value>repetition</value>
+        </property>
+        <property>
+          <name>sorting_dimension</name>
+          <value>slice</value>
+        </property>
+    </gadget>
+
+    <gadget>
+        <name>Buff</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>BucketToBufferGadget</classname>
+        <property>
+            <name>N_dimension</name>
+            <value></value>
+        </property>
+        <property>
+          <name>S_dimension</name>
+          <value></value>
+        </property>
+        <property>
+          <name>split_slices</name>
+          <value>true</value>
+        </property>
+    </gadget>
+
+     <gadget>
+      <name>SimpleRecon</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>SimpleReconGadget</classname>
+     </gadget>
+
+     <gadget>
+      <name>ImageArraySplit</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageArraySplitGadget</classname>
+     </gadget>
+
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+     <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/mri_core/gadgetron_mricore_export.h b/gadgets/mri_core/gadgetron_mricore_export.h
new file mode 100644
index 0000000..68e869e
--- /dev/null
+++ b/gadgets/mri_core/gadgetron_mricore_export.h
@@ -0,0 +1,14 @@
+#ifndef GADGETRON_MRICORE_EXPORT_H_
+#define GADGETRON_MRICORE_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_MRICORE__)
+#define EXPORTGADGETSMRICORE __declspec(dllexport)
+#else
+#define EXPORTGADGETSMRICORE __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETSMRICORE
+#endif
+
+#endif /* GADGETRON_MRICORE_EXPORT_H_ */
diff --git a/gadgets/mri_core/ismrmrd_dump.xml b/gadgets/mri_core/ismrmrd_dump.xml
new file mode 100644
index 0000000..2ef2461
--- /dev/null
+++ b/gadgets/mri_core/ismrmrd_dump.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1022</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriter</classname>
+  </writer>
+
+  <gadget>
+    <name>IsmrmrdDump</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>IsmrmrdDumpGadget</classname>
+    <property><name>file_prefix</name><value>ISMRMRD_DUMP</value></property>
+    <property><name>append_timestamp</name><value>true</value></property>
+  </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/pmri/CMakeLists.txt b/gadgets/pmri/CMakeLists.txt
new file mode 100644
index 0000000..67dfb76
--- /dev/null
+++ b/gadgets/pmri/CMakeLists.txt
@@ -0,0 +1,76 @@
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_GADGET_GPUPMRI__)
+  # The two flags below are to fix Windows problems related to multiply defined constructors in our headers
+  SET (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /FORCE:MULTIPLE") 
+  SET (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE") 
+endif (WIN32)
+
+find_package(Ismrmrd REQUIRED)
+
+include_directories(   
+  ${CMAKE_SOURCE_DIR}/gadgets/mri_core
+  ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/mri_core
+  ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/dwt/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/fft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers/gpu
+  ${HDF5_INCLUDE_DIR}
+  ${HDF5_INCLUDE_DIR}/cpp
+)
+
+add_library(gadgetron_gpuparallelmri SHARED 
+    gadgetron_gpupmri_export.h
+    GenericReconJob.h
+    gpuCgKtSenseGadget.h
+    gpuCgSenseGadget.h
+    gpuCgSpiritGadget.h
+    gpuGenericSensePrepGadget.h
+    gpuSbSenseGadget.h
+    gpuCgSenseGadget.cpp 
+    gpuCgKtSenseGadget.cpp 
+    gpuSbSenseGadget.cpp 
+    gpuGenericSensePrepGadget.cpp
+    gpuCgSpiritGadget.cpp 
+    gpuSenseGadget.cpp
+    gpuBufferSensePrepGadget.cpp
+    gpuOsSenseGadget.cpp
+    gpuNlcgSenseGadget.cpp
+    gpuLALMSenseGadget.cpp
+  )
+
+set_target_properties(gadgetron_gpuparallelmri PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(gadgetron_gpuparallelmri 
+  gadgetron_gadgetbase
+  gadgetron_toolbox_log
+  gadgetron_toolbox_cpucore gadgetron_toolbox_gpudwt gadgetron_toolbox_gpucore gadgetron_toolbox_gpusolvers gadgetron_toolbox_gpuoperators
+  ${Boost_LIBRARIES} ${ISMRMRD_LIBRARIES} ${FFTW3_LIBRARIES} ${CUDA_LIBRARIES} 
+  optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY}
+  )
+
+target_link_libraries(gadgetron_gpuparallelmri gadgetron_toolbox_gpuparallelmri)
+
+install (TARGETS gadgetron_gpuparallelmri DESTINATION lib COMPONENT main)
+
+install (FILES  gadgetron_gpupmri_export.h
+                GenericReconJob.h
+                gpuCgKtSenseGadget.h
+                gpuCgSenseGadget.h
+                gpuCgSpiritGadget.h
+                gpuGenericSensePrepGadget.h
+                gpuSbSenseGadget.h
+                gpuSenseGadget.h
+                gpuBufferSensePrepGadget.h
+                gpuOsSenseGadget.h
+                gpuLALMSenseGadget.h
+                gpuNlcgSenseGadget.h
+                DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
+add_subdirectory(config)
diff --git a/gadgets/pmri/GenericReconJob.h b/gadgets/pmri/GenericReconJob.h
new file mode 100644
index 0000000..0e1cf82
--- /dev/null
+++ b/gadgets/pmri/GenericReconJob.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include "hoNDArray.h"
+#include "vector_td.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <boost/shared_ptr.hpp>
+#include <boost/shared_array.hpp>
+
+namespace Gadgetron{
+  
+  class GenericReconJob
+  {
+  public:
+    
+    GenericReconJob() {}
+    ~GenericReconJob() {}
+
+    boost::shared_array<ISMRMRD::ImageHeader> image_headers_;
+
+    boost::shared_ptr< hoNDArray<float_complext> >  dat_host_;
+    boost::shared_ptr< hoNDArray<floatd2>        >  tra_host_;
+    boost::shared_ptr< hoNDArray<float>          >  dcw_host_;
+    boost::shared_ptr< hoNDArray<float_complext> >  csm_host_;
+    boost::shared_ptr< hoNDArray<float_complext> >  reg_host_;
+  };
+}
diff --git a/gadgets/pmri/config/CMakeLists.txt b/gadgets/pmri/config/CMakeLists.txt
new file mode 100644
index 0000000..aa946c7
--- /dev/null
+++ b/gadgets/pmri/config/CMakeLists.txt
@@ -0,0 +1,16 @@
+if (ARMADILLO_FOUND)
+  install (FILES 
+    generic_gpusense_cg.xml 
+    generic_gpusense_cg_singleshot.xml 
+    generic_gpusense_sb_singleshot.xml 
+    generic_gpusense_nlcg_singleshot.xml
+    generic_gpu_ktsense_singleshot.xml 
+    DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH} COMPONENT main)
+else (ARMADILLO_FOUND)
+  MESSAGE("Armadillo not found, only unoptimized generic trajectory config files will be available")
+endif (ARMADILLO_FOUND)
+
+#install (FILES 
+#  generic_gpusense_cg_unoptimized.xml 
+#  generic_gpusense_sb_unoptimized.xml 
+#  DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH})
diff --git a/gadgets/pmri/config/generic_gpu_ktsense_singleshot.xml b/gadgets/pmri/config/generic_gpu_ktsense_singleshot.xml
new file mode 100644
index 0000000..3d96479
--- /dev/null
+++ b/gadgets/pmri/config/generic_gpu_ktsense_singleshot.xml
@@ -0,0 +1,97 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1022</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriter</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuGenericSensePrepGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuGenericSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>readouts_per_frame</name><value>1</value></property>
+      <property><name>frames_per_rotation</name><value>1</value></property>
+      <property><name>rotations_per_reconstruction</name><value>50</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>gpuCgKtSenseGadget_slice0</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuCgKtSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name><value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>number_of_iterations</name>    <value>50</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.1</value></property>
+    <property><name>output_convergence</name>      <value>true</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>ImageFinish</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadget</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/pmri/config/generic_gpusense_cg.xml b/gadgets/pmri/config/generic_gpusense_cg.xml
new file mode 100644
index 0000000..f6808de
--- /dev/null
+++ b/gadgets/pmri/config/generic_gpusense_cg.xml
@@ -0,0 +1,95 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1022</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriter</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuGenericSensePrepGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuGenericSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>rotations_per_reconstruction</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>number_of_iterations</name>    <value>30</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.1</value></property>
+    <property><name>output_convergence</name><value>true</value></property>
+  </gadget>
+
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>ImageFinish</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadget</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/pmri/config/generic_gpusense_cg_singleshot.xml b/gadgets/pmri/config/generic_gpusense_cg_singleshot.xml
new file mode 100644
index 0000000..4fb3082
--- /dev/null
+++ b/gadgets/pmri/config/generic_gpusense_cg_singleshot.xml
@@ -0,0 +1,97 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1022</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriter</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuGenericSensePrepGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuGenericSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>readouts_per_frame</name><value>1</value></property>
+      <property><name>frames_per_rotation</name><value>1</value></property>
+      <property><name>rotations_per_reconstruction</name><value>50</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>number_of_iterations</name>    <value>30</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.1</value></property>
+    <property><name>output_convergence</name><value>true</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>ImageFinish</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadget</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/pmri/config/generic_gpusense_nlcg_singleshot.xml b/gadgets/pmri/config/generic_gpusense_nlcg_singleshot.xml
new file mode 100644
index 0000000..a892106
--- /dev/null
+++ b/gadgets/pmri/config/generic_gpusense_nlcg_singleshot.xml
@@ -0,0 +1,98 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1022</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriter</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuGenericSensePrepGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuGenericSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>readouts_per_frame</name><value>1</value></property>
+      <property><name>frames_per_rotation</name><value>1</value></property>
+      <property><name>rotations_per_reconstruction</name><value>50</value></property>
+      <property><name>buffer_using_solver</name><value>true</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+  </gadget>
+  
+    <gadget>
+      <name>gpuNlcgSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuNlcgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>number_of_cg_iterations</name> <value>30</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>lambda</name>                  <value>0.01</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+    </gadget>
+  
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>ImageFinish</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadget</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/pmri/config/generic_gpusense_sb_singleshot.xml b/gadgets/pmri/config/generic_gpusense_sb_singleshot.xml
new file mode 100644
index 0000000..956bbd7
--- /dev/null
+++ b/gadgets/pmri/config/generic_gpusense_sb_singleshot.xml
@@ -0,0 +1,101 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1022</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriter</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuGenericSensePrepGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuGenericSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>readouts_per_frame</name><value>1</value></property>
+      <property><name>frames_per_rotation</name><value>1</value></property>
+      <property><name>rotations_per_reconstruction</name><value>50</value></property>
+      <property><name>buffer_using_solver</name><value>true</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+  </gadget>
+  
+    <gadget>
+      <name>gpuSbSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>0.1</value></property>
+      <property><name>lambda</name>                  <value>0.2</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+  
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>ImageFinish</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadget</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/pmri/gadgetron_gpupmri_export.h b/gadgets/pmri/gadgetron_gpupmri_export.h
new file mode 100644
index 0000000..bd71ffa
--- /dev/null
+++ b/gadgets/pmri/gadgetron_gpupmri_export.h
@@ -0,0 +1,14 @@
+#ifndef GADGETRON_GPUPMRI_EXPORT_H_
+#define GADGETRON_GPUPMRI_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_GADGET_GPUPMRI__)
+#define EXPORTGADGETS_GPUPMRI __declspec(dllexport)
+#else
+#define EXPORTGADGETS_GPUPMRI __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETS_GPUPMRI
+#endif
+
+#endif /* GADGETRON_GPUPMRI_EXPORT_H_ */
diff --git a/gadgets/pmri/gpuBufferSensePrepGadget.cpp b/gadgets/pmri/gpuBufferSensePrepGadget.cpp
new file mode 100644
index 0000000..f978c05
--- /dev/null
+++ b/gadgets/pmri/gpuBufferSensePrepGadget.cpp
@@ -0,0 +1,308 @@
+/*
+ * gpuBufferSensePrepGadget.cpp
+ *
+ *  Created on: Dec 10, 2014
+ *      Author: dch
+ */
+
+#include "gpuBufferSensePrepGadget.h"
+#include <ismrmrd/xml.h>
+#include "GenericReconJob.h"
+#include "cuNFFTOperator.h"
+#include "cuNFFT.h"
+#include "vector_td_utilities.h"
+#include <boost/shared_ptr.hpp>
+#include <boost/make_shared.hpp>
+#include "b1_map.h"
+#include "cuCgSolver.h"
+#include "cuNDArray_math.h"
+#include "hoNDArray_math.h"
+#include "hoNDArray_utils.h"
+#include "cuNDArray_fileio.h"
+#include "cudaDeviceManager.h"
+#include <numeric>
+
+namespace Gadgetron {
+
+gpuBufferSensePrepGadget::gpuBufferSensePrepGadget() {
+
+}
+
+gpuBufferSensePrepGadget::~gpuBufferSensePrepGadget() {
+
+}
+
+int gpuBufferSensePrepGadget::process_config(ACE_Message_Block* mb) {
+	ISMRMRD::IsmrmrdHeader h;
+	ISMRMRD::deserialize(mb->rd_ptr(),h);
+
+	auto matrixsize = h.encoding.front().encodedSpace.matrixSize;
+
+
+	profiles_per_frame_ = profiles_per_frame.value();
+	kernel_width_ = kernel_width.value();
+	oversampling_factor_ = buffer_convolution_oversampling_factor.value();
+
+	unsigned int warp_size = cudaDeviceManager::Instance()->warp_size();
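+	// Round the matrix dimensions up to the nearest multiple of the GPU warp size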
+	image_dims_.push_back(((matrixsize.x+warp_size-1)/warp_size)*warp_size);
+	image_dims_.push_back(((matrixsize.y+warp_size-1)/warp_size)*warp_size);
+
+	image_dims_recon_.push_back(((static_cast<size_t>(std::ceil(matrixsize.x*reconstruction_os_factor.value()))+warp_size-1)/warp_size)*warp_size);
+	image_dims_recon_.push_back(((static_cast<size_t>(std::ceil(matrixsize.y*reconstruction_os_factor.value()))+warp_size-1)/warp_size)*warp_size);
+
+	image_dims_recon_os_ = uint64d2
+			(((static_cast<size_t>(std::ceil(image_dims_recon_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size,
+					((static_cast<size_t>(std::ceil(image_dims_recon_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size);
+
+	// In case the warp_size constraint kicked in
+	oversampling_factor_ = float(image_dims_recon_os_[0])/float(image_dims_recon_[0]);
+
+	return GADGET_OK;
+
+}
+
+int gpuBufferSensePrepGadget::process(
+		GadgetContainerMessage<IsmrmrdReconData>* m1) {
+
+	IsmrmrdReconData* recondata= m1->getObjectPtr();
+
+	if (recondata->rbit_.size() != 1){
+		throw std::runtime_error("gpuBufferSensePrepGadget only supports a single encoding space");
+	}
+
+	IsmrmrdReconBit& reconbit = recondata->rbit_[0];
+
+	GenericReconJob job;
+
+	IsmrmrdDataBuffered* buffer = &reconbit.data_;
+
+	//Use reference data if available.
+	if (reconbit.ref_.data_.get_number_of_elements()){
+		GDEBUG("Using Reference data for CSM estimation\n");
+		buffer = &reconbit.ref_;
+	}
+
+	size_t ncoils = buffer->headers_[0].active_channels;
+
+
+	std::vector<size_t> new_order = {0,1,2,4,5,6,3};
+
+	boost::shared_ptr<cuNDArray<float>> dcw;
+	boost::shared_ptr<cuNDArray<floatd2>> traj;
+
+	if (buffer->headers_[0].trajectory_dimensions == 3){
+		auto traj_dcw = separate_traj_and_dcw(&buffer->trajectory_);
+		dcw = boost::make_shared<cuNDArray<float>>(std::get<1>(traj_dcw).get());
+		traj = boost::make_shared<cuNDArray<floatd2>>(std::get<0>(traj_dcw).get());
+	} else if (buffer->headers_[0].trajectory_dimensions == 2){
+		auto old_traj_dims = *buffer->trajectory_.get_dimensions();
+		std::vector<size_t> traj_dims (old_traj_dims.begin()+1,old_traj_dims.end()); //Remove first element
+		hoNDArray<floatd2> tmp_traj(traj_dims,(floatd2*)buffer->trajectory_.get_data_ptr());
+		traj = boost::make_shared<cuNDArray<floatd2>>(tmp_traj);
+	} else {
+		throw std::runtime_error("Unsupported number of trajectory dimensions");
+	}
+	{
+		std::cout << "Buffer dims: ";
+		auto tmpdim = *buffer->data_.get_dimensions();
+		for (auto dim : tmpdim)
+			std::cout << dim << " ";
+		std::cout << std::endl;
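+		// Move the coil dimension (dimension 3 of the buffer) to the end, as expected by the downstream Sense gadgets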
+		auto permuted = permute((hoNDArray<float_complext>*)&buffer->data_,&new_order);
+		cuNDArray<float_complext> data(*permuted);
+		if (dcw){
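+			// Scale the density compensation weights so that they sum to the number of pixels in the oversampled recon grid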
+			float scale_factor = float(prod(image_dims_recon_os_))/asum(dcw.get());
+			*dcw *= scale_factor;
+		}
+
+		auto reg_images = reconstruct_regularization(&data,traj.get(),dcw.get(),ncoils);
+		reg_images->squeeze();
+
+		auto csm = estimate_b1_map<float,2>(reg_images.get());
+
+		*reg_images *= *csm;
+		auto combined = sum(reg_images.get(),reg_images->get_number_of_dimensions()-1);
+
+		auto tmp_combined = abs(reg_images.get());
+		auto tmpcsm = abs(csm.get());
+		job.csm_host_ = csm->to_host();
+		job.reg_host_ = combined->to_host();
+	}
+
+
+	IsmrmrdDataBuffered* mainbuffer = &reconbit.data_;
+
+	//Permute as the Sense gadgets expect the last dimension to be coils. *Sigh*
+	job.dat_host_ =permute((hoNDArray<float_complext>*)&mainbuffer->data_,&new_order);
+
+	if (mainbuffer->headers_[0].trajectory_dimensions >2 ){
+		auto traj_dcw = separate_traj_and_dcw(&mainbuffer->trajectory_);
+		job.tra_host_ = std::get<0>(traj_dcw);
+		job.dcw_host_ = std::get<1>(traj_dcw);
+	} else if (mainbuffer->headers_[0].trajectory_dimensions == 2){
+		auto old_traj_dims = *buffer->trajectory_.get_dimensions();
+		std::vector<size_t> traj_dims (old_traj_dims.begin()+1,old_traj_dims.end()); //Remove first element
+		hoNDArray<floatd2> tmp_traj(traj_dims,(floatd2*)mainbuffer->trajectory_.get_data_ptr());
+		job.tra_host_ = boost::make_shared<hoNDArray<floatd2>>(tmp_traj);
+		auto host_dcw = boost::make_shared<hoNDArray<float>>(traj_dims);
+		fill(host_dcw.get(),1.0f);
+		job.dcw_host_ = host_dcw;
+
+	} else {
+		throw std::runtime_error("Unsupported number of trajectory dimensions");
+	}
+	{
+		float scale_factor = float(prod(image_dims_recon_os_))/asum(job.dcw_host_.get());
+		*job.dcw_host_  *= scale_factor;
+	}
+
+	auto data_dims = *job.dat_host_->get_dimensions();
+	//Sense gadgets expect a single encoded dimension, so collapse all dimensions except the coil dimension into one
+	size_t elements = std::accumulate(data_dims.begin(),data_dims.end()-1,1,std::multiplies<size_t>());
+	std::vector<size_t> new_data_dims = {elements,data_dims.back()};
+	job.dat_host_->reshape(&new_data_dims);
+
+	size_t traj_elements = job.tra_host_->get_number_of_elements();
+	auto traj_dims = *job.tra_host_->get_dimensions();
+	if (traj_elements%profiles_per_frame_)
+		throw std::runtime_error("Profiles per frame must be a divisor of the total number of profiles");
+	size_t kpoints_per_frame = traj_dims[0]*profiles_per_frame_;
+	std::vector<size_t> new_traj_dims ={kpoints_per_frame,traj_elements/kpoints_per_frame};
+
+	job.tra_host_->reshape(&new_traj_dims);
+	job.dcw_host_->reshape(&new_traj_dims);
+
+
+	//Let's invent some image headers!
+	size_t total_frames = profiles_per_frame_ > 0 ? mainbuffer->headers_.get_number_of_elements()/profiles_per_frame_ : 1 ;
+	job.image_headers_ = boost::shared_array<ISMRMRD::ImageHeader>(new ISMRMRD::ImageHeader[total_frames]);
+	for (size_t i = 0; i < total_frames; i++){
+		job.image_headers_[i] = create_image_header(mainbuffer->headers_[i*profiles_per_frame_],mainbuffer->sampling_,i,total_frames);
+	}
+
+
+	m1->release(); //We are done with everything now.
+
+	auto header_message = new GadgetContainerMessage<ISMRMRD::ImageHeader>(job.image_headers_[0]);
+
+	auto job_message = new GadgetContainerMessage<GenericReconJob>(job);
+
+	header_message->cont(job_message);
+
+	if (!this->next()->putq(header_message)){
+		GDEBUG("Failed to put message on queue");
+		return GADGET_FAIL;
+	} else
+		return GADGET_OK;
+
+
+
+	//cuNDArray<float_complext> reg_images = reconstruct_regularization(reconbit.data_);
+}
+
+boost::shared_ptr<cuNDArray<float_complext> > gpuBufferSensePrepGadget::reconstruct_regularization(
+		cuNDArray<float_complext>* data, cuNDArray<floatd2>* traj, cuNDArray<float>* dcw, size_t ncoils ) {
+
+	if (dcw) { //We have density compensation, so we can get away with gridding
+
+		cuNFFT_plan<float,2> plan(from_std_vector<size_t,2>(image_dims_recon_),image_dims_recon_os_,kernel_width_);
+		std::vector<size_t> csm_dims = image_dims_recon_;
+		csm_dims.push_back(ncoils);
+		auto result = new cuNDArray<float_complext>(csm_dims);
+		GDEBUG("Coils %i \n\n",ncoils);
+
+		std::vector<size_t> flat_dims = {traj->get_number_of_elements()};
+		cuNDArray<floatd2> flat_traj(flat_dims,traj->get_data_ptr());
+		GDEBUG("traj: %i data %i\n",traj->get_number_of_elements(),data->get_number_of_elements());
+		GDEBUG("Preprocessing\n\n");
+		plan.preprocess(&flat_traj,cuNFFT_plan<float,2>::NFFT_PREP_NC2C);
+		GDEBUG("Computing\n\n");
+		plan.compute(data,result,dcw,cuNFFT_plan<float,2>::NFFT_BACKWARDS_NC2C);
+
+		write_nd_array(abs(result).get(),"reg.real");
+		return boost::shared_ptr<cuNDArray<float_complext>>(result);
+
+	} else { //No density compensation, we have to do iterative reconstruction.
+		std::vector<size_t> csm_dims = image_dims_recon_;
+		csm_dims.push_back(ncoils);
+
+		auto E = boost::make_shared<cuNFFTOperator<float,2>>();
+
+		E->setup(from_std_vector<size_t,2>(image_dims_recon_),image_dims_recon_os_,kernel_width_);
+		std::vector<size_t> flat_dims = {traj->get_number_of_elements()};
+		cuNDArray<floatd2> flat_traj(flat_dims,traj->get_data_ptr());
+
+		E->set_domain_dimensions(&csm_dims);
+		cuCgSolver<float_complext> solver;
+		solver.set_max_iterations(200);
+		solver.set_encoding_operator(E);
+		E->set_codomain_dimensions(data->get_dimensions().get());
+		E->preprocess(&flat_traj);
+		auto res = solver.solve(data);
+		return res;
+	}
+}
+
+std::tuple<boost::shared_ptr<hoNDArray<floatd2 > >, boost::shared_ptr<hoNDArray<float >>> gpuBufferSensePrepGadget::separate_traj_and_dcw(
+		hoNDArray<float >* traj_dcw) {
+	std::vector<size_t> dims = *traj_dcw->get_dimensions();
+	std::vector<size_t> reduced_dims(dims.begin()+1,dims.end()); //Copy vector, but leave out first dim
+	auto  dcw = boost::make_shared<hoNDArray<float>>(reduced_dims);
+
+	auto traj = boost::make_shared<hoNDArray<floatd2>>(reduced_dims);
+
+	auto dcw_ptr = dcw->get_data_ptr();
+	auto traj_ptr = traj->get_data_ptr();
+	auto ptr = traj_dcw->get_data_ptr();
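+	// The input stores interleaved (kx, ky, dcw) triplets; split them into separate trajectory and dcw arrays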
+	for (size_t i = 0; i < traj_dcw->get_number_of_elements()/3; i++){
+		traj_ptr[i][0] = ptr[i*3];
+		traj_ptr[i][1] = ptr[i*3+1];
+		dcw_ptr[i] = ptr[i*3+2];
+	}
+
+	return std::make_tuple(traj,dcw);
+
+
+}
+
+ISMRMRD::ImageHeader gpuBufferSensePrepGadget::create_image_header(
+		ISMRMRD::AcquisitionHeader& base_head, const SamplingDescription& samp, size_t idx, size_t num_frames) {
+
+	ISMRMRD::ImageHeader header;
+	header.version = base_head.version;
+
+	header.matrix_size[0] = image_dims_recon_[0];
+	header.matrix_size[1] = image_dims_recon_[1];
+	header.matrix_size[2] = num_frames;
+
+
+	header.field_of_view[0] = samp.recon_FOV_[0];
+	header.field_of_view[1] = samp.recon_FOV_[1];
+	header.field_of_view[2] = samp.recon_FOV_[2];
+
+	header.channels = 1;
+	header.slice = base_head.idx.slice;
+	header.set = base_head.idx.set;
+
+	header.acquisition_time_stamp = base_head.acquisition_time_stamp;
+	memcpy(header.physiology_time_stamp, base_head.physiology_time_stamp, sizeof(uint32_t)*ISMRMRD::ISMRMRD_PHYS_STAMPS);
+
+	memcpy(header.position, base_head.position, sizeof(float)*3);
+	memcpy(header.read_dir, base_head.read_dir, sizeof(float)*3);
+	memcpy(header.phase_dir, base_head.phase_dir, sizeof(float)*3);
+	memcpy(header.slice_dir, base_head.slice_dir, sizeof(float)*3);
+	memcpy(header.patient_table_position, base_head.patient_table_position, sizeof(float)*3);
+
+	header.data_type = ISMRMRD::ISMRMRD_CXFLOAT;
+	header.image_index = idx;
+	header.image_series_index = 0;
+
+	return header;
+
+
+
+}
+
+GADGET_FACTORY_DECLARE(gpuBufferSensePrepGadget)
+
+} /* namespace Gadgetron */
diff --git a/gadgets/pmri/gpuBufferSensePrepGadget.h b/gadgets/pmri/gpuBufferSensePrepGadget.h
new file mode 100644
index 0000000..a279206
--- /dev/null
+++ b/gadgets/pmri/gpuBufferSensePrepGadget.h
@@ -0,0 +1,47 @@
+/*
+ * gpuBufferSensePrepGadget.h
+ *
+ *  Created on: Dec 10, 2014
+ *      Author: dch
+ */
+
+#ifndef GPUBUFFERSENSEPREPGADGET_H_
+#define GPUBUFFERSENSEPREPGADGET_H_
+
+#include "Gadget.h"
+#include "mri_core_data.h"
+#include "cuNDArray.h"
+#include "vector_td.h"
+#include "complext.h"
+
+namespace Gadgetron {
+
+class gpuBufferSensePrepGadget: public Gadgetron::Gadget1<IsmrmrdReconData> {
+public:
+	gpuBufferSensePrepGadget();
+	virtual ~gpuBufferSensePrepGadget();
+
+	virtual int process_config(ACE_Message_Block*mb);
+
+	virtual int process(GadgetContainerMessage<IsmrmrdReconData>* data);
+protected:
+	GADGET_PROPERTY(profiles_per_frame,int,"Number of profiles per frame", 0);
+	GADGET_PROPERTY(kernel_width,float,"Kernel width for NFFT", 5.5);
+	GADGET_PROPERTY(buffer_convolution_oversampling_factor,float,"Oversampling used in buffer NFFT", 1.5);
+	GADGET_PROPERTY(reconstruction_os_factor,float,"Oversampling for recon NFFT", 1.5);
+
+	size_t profiles_per_frame_;
+	float kernel_width_;
+	float oversampling_factor_;
+	int ncoils_;
+	std::vector<size_t> image_dims_;
+	std::vector<size_t> image_dims_recon_;
+	uint64d2 image_dims_recon_os_;
+	boost::shared_ptr<cuNDArray<float_complext>> reconstruct_regularization(cuNDArray<float_complext>* data, cuNDArray<floatd2>* traj, cuNDArray<float>* dcw, size_t coils );
+	static std::tuple<boost::shared_ptr<hoNDArray<floatd2 > >, boost::shared_ptr<hoNDArray<float >>> separate_traj_and_dcw(hoNDArray<float>*);
+	ISMRMRD::ImageHeader create_image_header(ISMRMRD::AcquisitionHeader& header,const SamplingDescription& samp,size_t idx, size_t num_frames);
+
+};
+
+} /* namespace Gadgetron */
+#endif /* GPUBUFFERSENSEPREPGADGET_H_ */
diff --git a/gadgets/pmri/gpuCgKtSenseGadget.cpp b/gadgets/pmri/gpuCgKtSenseGadget.cpp
new file mode 100644
index 0000000..923ecbf
--- /dev/null
+++ b/gadgets/pmri/gpuCgKtSenseGadget.cpp
@@ -0,0 +1,365 @@
+#include "gpuCgKtSenseGadget.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "cuNDFFT.h"
+#include "GadgetMRIHeaders.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+#include "vector_td_utilities.h"
+#include "ismrmrd/xml.h"
+
+namespace Gadgetron{
+
+  gpuCgKtSenseGadget::gpuCgKtSenseGadget()
+    : is_configured_(false)
+    , channels_(0)
+    , frame_counter_(0)
+  {
+    matrix_size_ = uint64d2(0,0);
+    matrix_size_os_ = uint64d2(0,0);
+    matrix_size_seq_ = uint64d2(0,0);
+  }
+
+  gpuCgKtSenseGadget::~gpuCgKtSenseGadget() {}
+
+  int gpuCgKtSenseGadget::process_config( ACE_Message_Block* mb )
+  {
+    //GDEBUG("gpuCgKtSenseGadget::process_config\n");
+
+    device_number_ = deviceno.value();
+
+    int number_of_devices = 0;
+    if (cudaGetDeviceCount(&number_of_devices)!= cudaSuccess) {
+      GDEBUG( "Error: unable to query number of CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    if (number_of_devices == 0) {
+      GDEBUG( "Error: No available CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    if (device_number_ >= number_of_devices) {
+      GDEBUG("Adjusting device number from %d to %d\n", device_number_,  (device_number_%number_of_devices));
+      device_number_ = (device_number_%number_of_devices);
+    }
+
+    if (cudaSetDevice(device_number_)!= cudaSuccess) {
+      GDEBUG( "Error: unable to set CUDA device.\n" );
+      return GADGET_FAIL;
+    }
+
+    pass_on_undesired_data_ = pass_on_undesired_data.value();
+    set_number_ = setno.value();
+    slice_number_ = sliceno.value();
+    number_of_iterations_ = number_of_iterations.value();
+    cg_limit_ = cg_limit.value();
+    oversampling_factor_ = oversampling_factor.value();
+    kernel_width_ = kernel_width.value();
+    kappa_ = kappa.value();
+    shutter_radius_ = training_data_shutter_radius.value();
+    rotations_to_discard_ = rotations_to_discard.value();
+    output_convergence_ = output_convergence.value();
+
+    if( (rotations_to_discard_%2) == 1 ){
+      GDEBUG("#rotations to discard must be even.\n");
+      return GADGET_FAIL;
+    }
+
+    // Get the Ismrmrd header
+    //
+    ISMRMRD::IsmrmrdHeader h;
+    ISMRMRD::deserialize(mb->rd_ptr(),h);
+    
+    
+    if (h.encoding.size() != 1) {
+      GDEBUG("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+    
+    // Get the encoding space and trajectory description
+    ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+    ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+    ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+
+    matrix_size_seq_ = uint64d2( r_space.matrixSize.x, r_space.matrixSize.y );
+
+    if (!is_configured_) {
+
+      if (h.acquisitionSystemInformation) {
+	channels_ = h.acquisitionSystemInformation->receiverChannels ? *h.acquisitionSystemInformation->receiverChannels : 1;
+      } else {
+	channels_ = 1;
+      }
+
+      // Allocate encoding operator for non-Cartesian Sense
+      E_ = boost::shared_ptr< cuNonCartesianKtSenseOperator<float,2> >( new cuNonCartesianKtSenseOperator<float,2>() );
+
+      // Allocate preconditioner
+      D_ = boost::shared_ptr< cuCgPreconditioner<float_complext> >( new cuCgPreconditioner<float_complext>() );
+
+      // Allocate regularization image operator
+      R_ = boost::shared_ptr< cuImageOperator<float_complext> >( new cuImageOperator<float_complext>() );
+      R_->set_weight( kappa_ );
+
+      // Setup solver
+      cg_.set_encoding_operator( E_ );        // encoding matrix
+      cg_.add_regularization_operator( R_ );  // regularization matrix
+      cg_.set_preconditioner( D_ );           // preconditioning matrix
+      cg_.set_max_iterations( number_of_iterations_ );
+      cg_.set_tc_tolerance( cg_limit_ );
+      cg_.set_output_mode( (output_convergence_) ? cuCgSolver<float_complext>::OUTPUT_VERBOSE : cuCgSolver<float_complext>::OUTPUT_SILENT );
+
+      is_configured_ = true;
+    }
+
+    return GADGET_OK;
+  }
+
+  int gpuCgKtSenseGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, GadgetContainerMessage<GenericReconJob> *m2)
+  {
+    // Is this data for this gadget's set/slice?
+    //
+    
+    if( m1->getObjectPtr()->set != set_number_ || m1->getObjectPtr()->slice != slice_number_ ) {      
+      // No, pass it downstream...
+      return this->next()->putq(m1);
+    }
+    
+    //GDEBUG("gpuCgKtSenseGadget::process\n");
+    //GPUTimer timer("gpuCgKtSenseGadget::process");
+
+    if (!is_configured_) {
+      GDEBUG("Data received before configuration was completed\n");
+      return GADGET_FAIL;
+    }
+
+    GenericReconJob* j = m2->getObjectPtr();
+
+    // Some basic validation of the incoming Sense job
+    if (!j->csm_host_.get() || !j->dat_host_.get() || !j->tra_host_.get() || !j->dcw_host_.get()) {
+      GDEBUG("Received an incomplete Sense job\n");
+      return GADGET_FAIL;
+    }
+
+    unsigned int samples = j->dat_host_->get_size(0);
+    unsigned int channels = j->dat_host_->get_size(1);
+    unsigned int rotations = samples / j->tra_host_->get_number_of_elements();
+    unsigned int frames = j->tra_host_->get_size(1)*rotations;
+
+    if( samples%j->tra_host_->get_number_of_elements() ) {
+      GDEBUG("Mismatch between number of samples (%d) and number of k-space coordinates (%d).\nThe first should be a multiple of the latter.\n", 
+		    samples, j->tra_host_->get_number_of_elements());
+      return GADGET_FAIL;
+    }
+
+    boost::shared_ptr< cuNDArray<floatd2> > traj(new cuNDArray<floatd2> (j->tra_host_.get()));
+    boost::shared_ptr< cuNDArray<float> > dcw(new cuNDArray<float> (j->dcw_host_.get()));
+    sqrt_inplace(dcw.get()); //Take square root to use for weighting
+    boost::shared_ptr< cuNDArray<float_complext> > csm(new cuNDArray<float_complext> (j->csm_host_.get()));
+    boost::shared_ptr< cuNDArray<float_complext> > device_samples(new cuNDArray<float_complext> (j->dat_host_.get()));
+
+    cudaDeviceProp deviceProp;
+    if( cudaGetDeviceProperties( &deviceProp, device_number_ ) != cudaSuccess) {
+      GDEBUG( "Error: unable to query device properties.\n" );
+      return GADGET_FAIL;
+    }
+    
+    unsigned int warp_size = deviceProp.warpSize;
+    
+    matrix_size_ = uint64d2( j->reg_host_->get_size(0), j->reg_host_->get_size(1) );    
+
+    matrix_size_os_ =
+      uint64d2(((static_cast<unsigned int>(std::ceil(matrix_size_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size,
+	     ((static_cast<unsigned int>(std::ceil(matrix_size_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size);
+    
+    GDEBUG("Matrix size    : [%d,%d] \n", matrix_size_[0], matrix_size_[1]);    
+    GDEBUG("Matrix size OS : [%d,%d] \n", matrix_size_os_[0], matrix_size_os_[1]);
+
+    std::vector<size_t> image_dims = to_std_vector(matrix_size_);
+    image_dims.push_back(frames);
+    
+    E_->set_domain_dimensions(&image_dims);
+    E_->set_codomain_dimensions(device_samples->get_dimensions().get());
+    E_->set_dcw(dcw);
+    E_->set_csm(csm);
+
+    E_->setup( matrix_size_, matrix_size_os_, static_cast<float>(kernel_width_) );
+    E_->preprocess(traj.get());
+        
+    R_->compute(compute_regularization_image(j).get());
+
+    // Define preconditioning weights
+    boost::shared_ptr< cuNDArray<float> > __precon_weights = sum(abs_square(csm.get()).get(), 2);
+    boost::shared_ptr< cuNDArray<float> > _precon_weights = expand<float>( __precon_weights.get(), frames );
+    boost::shared_ptr<cuNDArray<float> > R_diag = R_->get();
+    *R_diag *= float(kappa_);
+    *_precon_weights += *R_diag;
+    R_diag.reset();
+    reciprocal_sqrt_inplace(_precon_weights.get());	
+    boost::shared_ptr< cuNDArray<float_complext> > precon_weights = real_to_complex<float_complext>( _precon_weights.get() );
+    __precon_weights.reset(); _precon_weights.reset();
+    D_->set_weights( precon_weights );
+
+    *device_samples *= *dcw;
+    // Invoke solver
+    // 
+
+    boost::shared_ptr< cuNDArray<float_complext> > cgresult;
+    
+    {
+      GPUTimer timer("gpuCgKtSenseGadget::solve()");
+      cgresult = cg_.solve(device_samples.get());
+    }
+
+    if (!cgresult.get()) {
+      GDEBUG("Iterative_sense_compute failed\n");
+      return GADGET_FAIL;
+    }
+
+    // Go from x-f to x-t space
+    cuNDFFT<float>::instance()->fft( cgresult.get(), 2,true );
+
+    /*
+    static int counter = 0;
+    char filename[256];
+    sprintf((char*)filename, "recon_%d.real", counter);
+    write_nd_array<float>( abs(cgresult.get())->to_host().get(), filename );
+    counter++; */
+
+    // If the recon matrix size exceeds the sequence matrix size then crop
+    if( matrix_size_seq_ != matrix_size_ )
+      cgresult = crop<float_complext,2>( (matrix_size_-matrix_size_seq_)>>1, matrix_size_seq_, cgresult.get() );    
+    
+    // Now pass on the reconstructed images
+    //
+
+    unsigned int frames_per_rotation = frames/rotations;
+
+    if( rotations == 1 ){ // this is the case for golden ratio
+      rotations = frames;
+      frames_per_rotation = 1;
+    }
+
+    for( unsigned int frame=0; frame<frames; frame++ ){
+
+      unsigned int rotation_idx = frame/frames_per_rotation;
+
+      // Check if we should discard this frame
+      if( rotation_idx < (rotations_to_discard_>>1) || rotation_idx >= rotations-(rotations_to_discard_>>1) )
+	continue;
+            
+      GadgetContainerMessage<ISMRMRD::ImageHeader> *m = 
+	new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+
+      GadgetContainerMessage< hoNDArray< std::complex<float> > > *cm = 
+	new GadgetContainerMessage< hoNDArray< std::complex<float> > >();      
+
+      *m->getObjectPtr() = j->image_headers_[frame];
+      m->cont(cm);
+      
+      std::vector<size_t> img_dims(2);
+      img_dims[0] = matrix_size_seq_[0];
+      img_dims[1] = matrix_size_seq_[1];
+
+      cm->getObjectPtr()->create(&img_dims);
+
+      size_t data_length = prod(matrix_size_seq_);
+
+      cudaMemcpy(cm->getObjectPtr()->get_data_ptr(),
+		 cgresult->get_data_ptr()+frame*data_length,
+		 data_length*sizeof(std::complex<float>),
+		 cudaMemcpyDeviceToHost);
+
+      cudaError_t err = cudaGetLastError();
+      if( err != cudaSuccess ){
+	GDEBUG("Unable to copy result from device to host: %s\n", cudaGetErrorString(err));
+	m->release();
+	return GADGET_FAIL;
+      }
+
+      m->getObjectPtr()->matrix_size[0] = matrix_size_seq_[0];
+      m->getObjectPtr()->matrix_size[1] = matrix_size_seq_[1];
+      m->getObjectPtr()->matrix_size[2] = 1;
+      m->getObjectPtr()->channels       = 1;
+      m->getObjectPtr()->image_index    = frame_counter_ + frame;
+      
+      if (this->next()->putq(m) < 0) {
+	GDEBUG("Failed to put result image on to queue\n");
+	m->release();
+	return GADGET_FAIL;
+      }
+    }
+    
+    frame_counter_ += frames;
+
+    m1->release();
+    return GADGET_OK;
+  }
+
+  boost::shared_ptr< cuNDArray<float_complext> > gpuCgKtSenseGadget::
+  compute_regularization_image( GenericReconJob *job )
+  {
+    // 
+    // Estimate training data
+    // 
+
+    unsigned int num_samples = job->dat_host_->get_size(0);
+    unsigned int num_coils = job->dat_host_->get_size(1);
+    unsigned int num_rotations = num_samples / job->tra_host_->get_number_of_elements();
+    unsigned int frames_per_reconstruction = job->tra_host_->get_size(1)*num_rotations;
+
+    std::vector<size_t> dims = to_std_vector(matrix_size_os_);
+    dims.push_back(frames_per_reconstruction); 
+    dims.push_back(num_coils); 
+
+    cuNDArray<float_complext> image_os(&dims);    
+    cuNDArray<float_complext> data((job->dat_host_).get());
+    cuNDArray<float> dcw((job->dcw_host_).get());
+  
+    // Convolve to Cartesian k-space
+    //
+
+    E_->get_plan()->convolve( &data, &image_os, &dcw, cuNFFT_plan<float,2>::NFFT_CONV_NC2C );
+
+    // Apply shutter
+    //
+
+    if( shutter_radius_ < 0.0001 ){ // If not specified in the configuration then try to make an estimation
+
+      // #profiles/frame : this is just an estimate (we don't have the exact value at this stage)
+      unsigned int profiles_per_frame = num_samples / (frames_per_reconstruction*matrix_size_os_[0]);
+      shutter_radius_ = ((float)matrix_size_os_[0]/(float)matrix_size_[0])*(float)profiles_per_frame/(float)M_PI;
+      GDEBUG("Estimated training data shutter radius: %f\n", shutter_radius_);
+    }
+
+    fill_border<float_complext,2>( shutter_radius_, &image_os );
+    E_->get_plan()->fft( &image_os, cuNFFT_plan<float,2>::NFFT_BACKWARDS );
+    E_->get_plan()->deapodize( &image_os );
+
+    // Remove oversampling
+    //
+
+    dims = to_std_vector(matrix_size_);
+    dims.push_back(frames_per_reconstruction); 
+    dims.push_back(num_coils);
+    cuNDArray<float_complext> image(&dims);
+    crop<float_complext,2>( (matrix_size_os_-matrix_size_)>>1, &image_os, &image );
+
+    // Compute regularization image
+    //
+
+    dims.pop_back();
+    boost::shared_ptr< cuNDArray<float_complext> > reg_image( new cuNDArray<float_complext>(&dims) );
+
+    E_->mult_csm_conj_sum( &image, reg_image.get() );
+    cuNDFFT<float>::instance()->ifft( reg_image.get(), 2, true );
+
+    return reg_image;
+  }
+
+  GADGET_FACTORY_DECLARE(gpuCgKtSenseGadget)
+}
diff --git a/gadgets/pmri/gpuCgKtSenseGadget.h b/gadgets/pmri/gpuCgKtSenseGadget.h
new file mode 100644
index 0000000..f35192b
--- /dev/null
+++ b/gadgets/pmri/gpuCgKtSenseGadget.h
@@ -0,0 +1,82 @@
+#ifndef gpuCgKtSenseGadget_H
+#define gpuCgKtSenseGadget_H
+#pragma once
+
+#include "gadgetron_gpupmri_export.h"
+#include "Gadget.h"
+#include "GenericReconJob.h"
+#include "GadgetMRIHeaders.h"
+#include "cuCgSolver.h"
+#include "cuNonCartesianKtSenseOperator.h"
+#include "cuCgPreconditioner.h"
+#include "cuNFFT.h"
+#include "cuImageOperator.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_GPUPMRI gpuCgKtSenseGadget : public Gadget2<ISMRMRD::ImageHeader, GenericReconJob>
+  {
+
+  public:
+    GADGET_DECLARE(gpuCgKtSenseGadget);
+
+    gpuCgKtSenseGadget();
+    virtual ~gpuCgKtSenseGadget();
+
+  protected:
+    GADGET_PROPERTY(deviceno, int, "GPU device number", 0);
+    GADGET_PROPERTY(setno, int, "Set to process", 0);
+    GADGET_PROPERTY(sliceno, int, "Slice to process", 0);
+    GADGET_PROPERTY(number_of_iterations, int, "Number of iterations", 5);
+    GADGET_PROPERTY(cg_limit, float, "Convergence limit for CG solver", 1e-6);
+    GADGET_PROPERTY(oversampling_factor, float, "Recon oversampling factor for NFFT", 1.25);
+    GADGET_PROPERTY(kernel_width, float, "Kernel width for NFFT", 5.5);
+    GADGET_PROPERTY(kappa, float, "Kappa regularization factor", 0.3);
+    GADGET_PROPERTY(training_data_shutter_radius, float, "Shutter radius for training data", 0.0);
+    GADGET_PROPERTY(rotations_to_discard, int, "Number of rotations to discard", 0);
+    GADGET_PROPERTY(output_convergence, bool, "Print convergence information", false);
+
+    virtual int process( GadgetContainerMessage< ISMRMRD::ImageHeader > *m1, GadgetContainerMessage< GenericReconJob > *m2 );
+    virtual int process_config( ACE_Message_Block* mb );
+
+    boost::shared_ptr< cuNDArray<float_complext> > compute_regularization_image( GenericReconJob *job );
+
+    int channels_;
+    int device_number_;
+    int set_number_;
+    int slice_number_;
+
+    uint64d2 matrix_size_;
+    uint64d2 matrix_size_os_;
+    uint64d2 matrix_size_seq_;
+
+    unsigned int number_of_iterations_;
+    double cg_limit_;
+    double oversampling_factor_;
+    double kernel_width_;
+    double kappa_;
+    double shutter_radius_;
+    unsigned int rotations_to_discard_;
+
+    bool output_convergence_;
+    bool is_configured_;
+
+    // Define conjugate gradient solver
+    cuCgSolver<float_complext> cg_;
+
+    // Define non-Cartesian Sense Encoding operator
+    boost::shared_ptr< cuNonCartesianKtSenseOperator<float,2> > E_;
+
+    // Define preconditioner
+    boost::shared_ptr< cuCgPreconditioner<float_complext> > D_;
+
+    // Define regularization image operator
+    boost::shared_ptr< cuImageOperator<float_complext> > R_;
+
+    int frame_counter_;
+  };
+}
+#endif //gpuCgKtSenseGadget_H
diff --git a/gadgets/pmri/gpuCgSenseGadget.cpp b/gadgets/pmri/gpuCgSenseGadget.cpp
new file mode 100644
index 0000000..98c448b
--- /dev/null
+++ b/gadgets/pmri/gpuCgSenseGadget.cpp
@@ -0,0 +1,245 @@
+#include "gpuCgSenseGadget.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "GadgetMRIHeaders.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "ismrmrd/xml.h"
+
+namespace Gadgetron{
+
+  gpuCgSenseGadget::gpuCgSenseGadget()
+    : gpuSenseGadget()
+    , is_configured_(false)
+    , matrix_size_reported_(false)
+  {
+    matrix_size_ = uint64d2(0,0);
+    matrix_size_os_ = uint64d2(0,0);
+    matrix_size_seq_ = uint64d2(0,0);
+  }
+
+  gpuCgSenseGadget::~gpuCgSenseGadget() {}
+
+  int gpuCgSenseGadget::process_config( ACE_Message_Block* mb )
+  {
+  	gpuSenseGadget::process_config(mb);
+    //GDEBUG("gpuCgSenseGadget::process_config\n");
+    number_of_iterations_ = number_of_iterations.value();
+    kappa_ = kappa.value();
+
+
+
+    // Get the Ismrmrd header
+    //
+    ISMRMRD::IsmrmrdHeader h;
+    ISMRMRD::deserialize(mb->rd_ptr(),h);
+    
+    
+    if (h.encoding.size() != 1) {
+      GDEBUG("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+    
+    // Get the encoding space and trajectory description
+    ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+    ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+    ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+
+    matrix_size_seq_ = uint64d2( r_space.matrixSize.x, r_space.matrixSize.y );
+
+    if (!is_configured_) {
+
+      if (h.acquisitionSystemInformation) {
+	channels_ = h.acquisitionSystemInformation->receiverChannels ? *h.acquisitionSystemInformation->receiverChannels : 1;
+      } else {
+	channels_ = 1;
+      }
+
+     // Allocate encoding operator for non-Cartesian Sense
+      E_ = boost::shared_ptr< cuNonCartesianSenseOperator<float,2> >( new cuNonCartesianSenseOperator<float,2>() );
+
+      // Allocate preconditioner
+      D_ = boost::shared_ptr< cuCgPreconditioner<float_complext> >( new cuCgPreconditioner<float_complext>() );
+
+      // Allocate regularization image operator
+      R_ = boost::shared_ptr< cuImageOperator<float_complext> >( new cuImageOperator<float_complext>() );
+      R_->set_weight( kappa_ );
+
+      // Setup solver
+      cg_.set_encoding_operator( E_ );        // encoding matrix
+      cg_.add_regularization_operator( R_ );  // regularization matrix
+      cg_.set_preconditioner( D_ );           // preconditioning matrix
+      cg_.set_max_iterations( number_of_iterations_ );
+      cg_.set_tc_tolerance( cg_limit_ );
+      cg_.set_output_mode( (output_convergence_) ? cuCgSolver<float_complext>::OUTPUT_VERBOSE : cuCgSolver<float_complext>::OUTPUT_SILENT);
+
+      is_configured_ = true;
+    }
+
+    return GADGET_OK;
+  }
+
+  int gpuCgSenseGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, GadgetContainerMessage<GenericReconJob> *m2)
+  {
+    // Is this data for this gadget's set/slice?
+    //
+    
+    if( m1->getObjectPtr()->set != set_number_ || m1->getObjectPtr()->slice != slice_number_ ) {      
+      // No, pass it downstream...
+      return this->next()->putq(m1);
+    }
+    
+    //GDEBUG("gpuCgSenseGadget::process\n");
+
+    boost::shared_ptr<GPUTimer> process_timer;
+    if( output_timing_ )
+      process_timer = boost::shared_ptr<GPUTimer>( new GPUTimer("gpuCgSenseGadget::process()") );
+    
+    if (!is_configured_) {
+      GDEBUG("Data received before configuration was completed\n");
+      return GADGET_FAIL;
+    }
+
+    GenericReconJob* j = m2->getObjectPtr();
+
+    // Some basic validation of the incoming Sense job
+    if (!j->csm_host_.get() || !j->dat_host_.get() || !j->tra_host_.get() || !j->dcw_host_.get() || !j->reg_host_.get()) {
+      GDEBUG("Received an incomplete Sense job\n");
+      return GADGET_FAIL;
+    }
+
+    unsigned int samples = j->dat_host_->get_size(0);
+    unsigned int channels = j->dat_host_->get_size(1);
+    unsigned int rotations = samples / j->tra_host_->get_number_of_elements();
+    unsigned int frames = j->tra_host_->get_size(1)*rotations;
+
+    if( samples%j->tra_host_->get_number_of_elements() ) {
+      GDEBUG("Mismatch between number of samples (%d) and number of k-space coordinates (%d).\nThe first should be a multiple of the latter.\n",
+                    samples, j->tra_host_->get_number_of_elements());
+      return GADGET_FAIL;
+    }
+
+    boost::shared_ptr< cuNDArray<floatd2> > traj(new cuNDArray<floatd2> (j->tra_host_.get()));
+    boost::shared_ptr< cuNDArray<float> > dcw(new cuNDArray<float> (j->dcw_host_.get()));
+    sqrt_inplace(dcw.get()); //Take square root to use for weighting
+    boost::shared_ptr< cuNDArray<float_complext> > csm(new cuNDArray<float_complext> (j->csm_host_.get()));
+    boost::shared_ptr< cuNDArray<float_complext> > device_samples(new cuNDArray<float_complext> (j->dat_host_.get()));
+    
+    cudaDeviceProp deviceProp;
+    if( cudaGetDeviceProperties( &deviceProp, device_number_ ) != cudaSuccess) {
+      GDEBUG( "Error: unable to query device properties.\n" );
+      return GADGET_FAIL;
+    }
+    
+    unsigned int warp_size = deviceProp.warpSize;
+    
+    matrix_size_ = uint64d2( j->reg_host_->get_size(0), j->reg_host_->get_size(1) );    
+
+    matrix_size_os_ =
+      uint64d2(((static_cast<unsigned int>(std::ceil(matrix_size_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size,
+               ((static_cast<unsigned int>(std::ceil(matrix_size_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size);
+
+    if( !matrix_size_reported_ ) {
+      GDEBUG("Matrix size    : [%d,%d] \n", matrix_size_[0], matrix_size_[1]);
+      GDEBUG("Matrix size OS : [%d,%d] \n", matrix_size_os_[0], matrix_size_os_[1]);
+      matrix_size_reported_ = true;
+    }
+
+    std::vector<size_t> image_dims = to_std_vector(matrix_size_);
+    image_dims.push_back(frames);
+    
+    E_->set_domain_dimensions(&image_dims);
+    E_->set_codomain_dimensions(device_samples->get_dimensions().get());
+    E_->set_dcw(dcw);
+    E_->set_csm(csm);
+
+    E_->setup( matrix_size_, matrix_size_os_, static_cast<float>(kernel_width_) );
+    E_->preprocess(traj.get());
+
+    boost::shared_ptr< cuNDArray<float_complext> > reg_image(new cuNDArray<float_complext> (j->reg_host_.get()));
+    R_->compute(reg_image.get());
+
+    // Define preconditioning weights
+    boost::shared_ptr< cuNDArray<float> > _precon_weights = sum(abs_square(csm.get()).get(), 2);
+    boost::shared_ptr<cuNDArray<float> > R_diag = R_->get();
+    *R_diag *= float(kappa_);
+    *_precon_weights += *R_diag;
+    R_diag.reset();
+    reciprocal_sqrt_inplace(_precon_weights.get());	
+    boost::shared_ptr< cuNDArray<float_complext> > precon_weights = real_to_complex<float_complext>( _precon_weights.get() );
+    _precon_weights.reset();
+    D_->set_weights( precon_weights );
+    
+    //Apply dcw weights
+    *device_samples *= *dcw;
+
+
+    /*{
+      static int counter = 0;
+      char filename[256];
+      sprintf((char*)filename, "_traj_%d.real", counter);
+      write_nd_array<floatd2>( traj->to_host().get(), filename );
+      sprintf((char*)filename, "_dcw_%d.real", counter);
+      write_nd_array<float>( dcw->to_host().get(), filename );
+      sprintf((char*)filename, "_csm_%d.cplx", counter);
+      write_nd_array<float_complext>( csm->to_host().get(), filename );
+      sprintf((char*)filename, "_samples_%d.cplx", counter);
+      write_nd_array<float_complext>( device_samples->to_host().get(), filename );
+      sprintf((char*)filename, "_reg_%d.cplx", counter);
+      write_nd_array<float_complext>( reg_image->to_host().get(), filename );
+      counter++; 
+      }*/
+
+    // Invoke solver
+    // 
+
+    boost::shared_ptr< cuNDArray<float_complext> > cgresult;
+
+    {
+      boost::shared_ptr<GPUTimer> solve_timer;
+      if( output_timing_ )
+        solve_timer = boost::shared_ptr<GPUTimer>( new GPUTimer("gpuCgSenseGadget::solve()") );
+      
+      cgresult = cg_.solve(device_samples.get());
+      
+      if( output_timing_ )
+        solve_timer.reset();
+    }
+    
+    if (!cgresult.get()) {
+      GDEBUG("Iterative_sense_compute failed\n");
+      return GADGET_FAIL;
+    }
+
+    /*
+      static int counter = 0;
+      char filename[256];
+      sprintf((char*)filename, "recon_%d.real", counter);
+      write_nd_array<float>( abs(cgresult.get())->to_host().get(), filename );
+      counter++; 
+    */
+
+    // If the recon matrix size exceeds the sequence matrix size then crop
+    if( matrix_size_seq_ != matrix_size_ )
+      cgresult = crop<float_complext,2>( (matrix_size_-matrix_size_seq_)>>1, matrix_size_seq_, cgresult.get() );    
+    
+    // Now pass on the reconstructed images
+    //
+    put_frames_on_que(frames,rotations,j,cgresult.get(),channels);
+    
+    frame_counter_ += frames;
+
+    if( output_timing_ )
+      process_timer.reset();
+
+    m1->release();
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(gpuCgSenseGadget)
+}
diff --git a/gadgets/pmri/gpuCgSenseGadget.h b/gadgets/pmri/gpuCgSenseGadget.h
new file mode 100644
index 0000000..1dd7b84
--- /dev/null
+++ b/gadgets/pmri/gpuCgSenseGadget.h
@@ -0,0 +1,60 @@
+#ifndef gpuCgSenseGadget_H
+#define gpuCgSenseGadget_H
+#pragma once
+
+#include "gadgetron_gpupmri_export.h"
+#include "Gadget.h"
+#include "GenericReconJob.h"
+#include "GadgetMRIHeaders.h"
+#include "cuCgSolver.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuCgPreconditioner.h"
+#include "cuNFFT.h"
+#include "cuImageOperator.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+#include "gpuSenseGadget.h"
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_GPUPMRI gpuCgSenseGadget : public gpuSenseGadget
+   {
+
+  public:
+
+    GADGET_DECLARE(gpuCgSenseGadget);
+
+    gpuCgSenseGadget();
+    virtual ~gpuCgSenseGadget();
+
+  protected:
+    GADGET_PROPERTY(kappa, float, "Regularization factor kappa", 0.3);
+    GADGET_PROPERTY(number_of_iterations, int, "Max number of iterations in CG solver", 5);
+
+    virtual int process( GadgetContainerMessage< ISMRMRD::ImageHeader > *m1, GadgetContainerMessage< GenericReconJob > *m2 );
+    virtual int process_config( ACE_Message_Block* mb );
+
+    unsigned int number_of_iterations_;
+    double cg_limit_;
+    double kappa_;
+
+    bool output_timing_;
+    bool matrix_size_reported_;
+    bool is_configured_;
+
+    // Define conjugate gradient solver
+    cuCgSolver<float_complext> cg_;
+
+    // Define non-Cartesian Sense Encoding operator
+    boost::shared_ptr< cuNonCartesianSenseOperator<float,2> > E_;
+
+    // Define preconditioner
+    boost::shared_ptr< cuCgPreconditioner<float_complext> > D_;
+
+    // Define regularization image operator
+    boost::shared_ptr< cuImageOperator<float_complext> > R_;
+
+  };
+}
+#endif //gpuCgSenseGadget_H
diff --git a/gadgets/pmri/gpuCgSpiritGadget.cpp b/gadgets/pmri/gpuCgSpiritGadget.cpp
new file mode 100644
index 0000000..ea05e0c
--- /dev/null
+++ b/gadgets/pmri/gpuCgSpiritGadget.cpp
@@ -0,0 +1,255 @@
+#include "gpuCgSpiritGadget.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "GadgetMRIHeaders.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "ismrmrd/xml.h"
+#include "gpuSenseGadget.h"
+
+namespace Gadgetron{
+
+  gpuCgSpiritGadget::gpuCgSpiritGadget()
+    : gpuSenseGadget()
+    , is_configured_(false)
+    , matrix_size_reported_(0)
+  {
+    
+  }
+
+  gpuCgSpiritGadget::~gpuCgSpiritGadget() {}
+
+  int gpuCgSpiritGadget::process_config( ACE_Message_Block* mb )
+  {
+    gpuSenseGadget::process_config(mb);
+
+
+    number_of_iterations_ = number_of_iterations.value();
+    cg_limit_ = cg_limit.value();
+    kappa_ = kappa.value();
+    
+    // Get the Ismrmrd header
+    //
+    ISMRMRD::IsmrmrdHeader h;
+    ISMRMRD::deserialize(mb->rd_ptr(),h);
+    
+    if (h.encoding.size() != 1) {
+      GDEBUG("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+    
+    // Get the encoding space and trajectory description
+    ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+    ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+    ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+
+    matrix_size_seq_ = uint64d2( r_space.matrixSize.x, r_space.matrixSize.y );
+
+    if (!is_configured_) {
+
+      if (h.acquisitionSystemInformation) {
+	channels_ = h.acquisitionSystemInformation->receiverChannels ? *h.acquisitionSystemInformation->receiverChannels : 1;
+      } else {
+	channels_ = 1;
+      }
+      // Allocate Spirit operators
+      E_ = boost::shared_ptr< cuNFFTOperator<float,2> >( new cuNFFTOperator<float,2>() );
+      S_ = boost::shared_ptr< cuSpirit2DOperator<float> >( new cuSpirit2DOperator<float>() );
+      S_->set_weight( kappa_ );
+
+      // Allocate preconditioner
+      //D_ = boost::shared_ptr< cuCgPreconditioner<float_complext> >( new cuCgPreconditioner<float_complext>() );
+
+      // Allocate regularization image operator
+      //R_ = boost::shared_ptr< cuImageOperator<float_complext> >( new cuImageOperator<float_complext>() );
+      //R_->set_weight( kappa_ );
+
+      // Setup solver
+      cg_.set_encoding_operator( E_ );        // encoding matrix
+      if( kappa_ > 0.0f ) cg_.add_regularization_operator( S_ );  // regularization matrix
+      //cg_.add_regularization_operator( R_ );  // regularization matrix
+      //cg_.set_preconditioner( D_ );           // preconditioning matrix
+      cg_.set_max_iterations( number_of_iterations_ );
+      cg_.set_tc_tolerance( cg_limit_ );
+      cg_.set_output_mode( (this->output_convergence_) ? cuCgSolver<float_complext>::OUTPUT_VERBOSE : cuCgSolver<float_complext>::OUTPUT_SILENT);
+
+      is_configured_ = true;
+    }
+
+    return GADGET_OK;
+  }
+
+  int gpuCgSpiritGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, GadgetContainerMessage<GenericReconJob> *m2)
+  {
+    // Is this data for this gadget's set/slice?
+    //
+    
+    if( m1->getObjectPtr()->set != set_number_ || m1->getObjectPtr()->slice != slice_number_ ) {      
+      // No, pass it downstream...
+      return this->next()->putq(m1);
+    }
+    
+    //GDEBUG("gpuCgSpiritGadget::process\n");
+
+    boost::shared_ptr<GPUTimer> process_timer;
+    if( output_timing_ )
+      process_timer = boost::shared_ptr<GPUTimer>( new GPUTimer("gpuCgSpiritGadget::process()") );
+    
+    if (!is_configured_) {
+      GDEBUG("Data received before configuration was completed\n");
+      return GADGET_FAIL;
+    }
+
+    GenericReconJob* j = m2->getObjectPtr();
+
+    // Some basic validation of the incoming Spirit job
+    if (!j->csm_host_.get() || !j->dat_host_.get() || !j->tra_host_.get() || !j->dcw_host_.get() || !j->reg_host_.get()) {
+      GDEBUG("Received an incomplete Spirit job\n");
+      return GADGET_FAIL;
+    }
+
+    unsigned int samples = j->dat_host_->get_size(0);
+    unsigned int channels = j->dat_host_->get_size(1);
+    unsigned int rotations = samples / j->tra_host_->get_number_of_elements();
+    unsigned int frames = j->tra_host_->get_size(1)*rotations;
+
+    if( samples%j->tra_host_->get_number_of_elements() ) {
+      GDEBUG("Mismatch between number of samples (%d) and number of k-space coordinates (%d).\nThe first should be a multiplum of the latter.\n",
+                    samples, j->tra_host_->get_number_of_elements());
+      return GADGET_FAIL;
+    }
+
+    boost::shared_ptr< cuNDArray<floatd2> > traj(new cuNDArray<floatd2> (j->tra_host_.get()));
+    boost::shared_ptr< cuNDArray<float> > dcw(new cuNDArray<float> (j->dcw_host_.get()));
+    sqrt_inplace(dcw.get()); //Take square root to use for weighting
+    boost::shared_ptr< cuNDArray<float_complext> > csm(new cuNDArray<float_complext> (j->csm_host_.get()));
+    boost::shared_ptr< cuNDArray<float_complext> > device_samples(new cuNDArray<float_complext> (j->dat_host_.get()));
+    
+    cudaDeviceProp deviceProp;
+    if( cudaGetDeviceProperties( &deviceProp, device_number_ ) != cudaSuccess) {
+      GDEBUG( "Error: unable to query device properties.\n" );
+      return GADGET_FAIL;
+    }
+    
+    unsigned int warp_size = deviceProp.warpSize;
+    
+    matrix_size_ = uint64d2( j->reg_host_->get_size(0), j->reg_host_->get_size(1) );    
+
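+    // Round the oversampled matrix size up to the nearest multiple of the GPU warp size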
+    matrix_size_os_ =
+      uint64d2(((static_cast<unsigned int>(std::ceil(matrix_size_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size,
+               ((static_cast<unsigned int>(std::ceil(matrix_size_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size);
+
+    if( !matrix_size_reported_ ) {
+      GDEBUG("Matrix size    : [%d,%d] \n", matrix_size_[0], matrix_size_[1]);
+      GDEBUG("Matrix size OS : [%d,%d] \n", matrix_size_os_[0], matrix_size_os_[1]);
+      matrix_size_reported_ = true;
+    }
+
+    std::vector<size_t> image_dims = to_std_vector(matrix_size_);
+
+    image_dims.push_back(frames);
+    image_dims.push_back(channels);
+    GDEBUG("Number of coils: %d %d \n",channels,image_dims.size());
+    
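+    // Configure the NFFT encoding operator: image space as its domain, the sampled k-space as its
+    // codomain, density compensation weights, and gridding setup/preprocessing for this trajectory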
+    E_->set_domain_dimensions(&image_dims);
+    E_->set_codomain_dimensions(device_samples->get_dimensions().get());
+    E_->set_dcw(dcw);
+    E_->setup( matrix_size_, matrix_size_os_, static_cast<float>(kernel_width_) );
+    E_->preprocess(traj.get());
+    
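+    // Pass the csm array to the Spirit operator as calibration kernels; the operator maps image space to image space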
+    boost::shared_ptr< cuNDArray<float_complext> > csm_device( new cuNDArray<float_complext>( csm.get() ));
+    S_->set_calibration_kernels(csm_device);
+    S_->set_domain_dimensions(&image_dims);
+    S_->set_codomain_dimensions(&image_dims);
+
+    /*
+    boost::shared_ptr< cuNDArray<float_complext> > reg_image(new cuNDArray<float_complext> (j->reg_host_.get()));
+    R_->compute(reg_image.get());
+
+    // Define preconditioning weights
+    boost::shared_ptr< cuNDArray<float> > _precon_weights = sum(abs_square(csm.get()).get(), 2);
+    boost::shared_ptr<cuNDArray<float> > R_diag = R_->get();
+    *R_diag *= float(kappa_);
+    *_precon_weights += *R_diag;
+    R_diag.reset();
+    reciprocal_sqrt_inplace(_precon_weights.get());	
+    boost::shared_ptr< cuNDArray<float_complext> > precon_weights = real_to_complex<float_complext>( _precon_weights.get() );
+    _precon_weights.reset();
+    D_->set_weights( precon_weights );
+    */
+
+    /*{
+      static int counter = 0;
+      char filename[256];
+      sprintf((char*)filename, "_traj_%d.real", counter);
+      write_nd_array<floatd2>( traj->to_host().get(), filename );
+      sprintf((char*)filename, "_dcw_%d.real", counter);
+      write_nd_array<float>( dcw->to_host().get(), filename );
+      sprintf((char*)filename, "_csm_%d.cplx", counter);
+      write_nd_array<float_complext>( csm->to_host().get(), filename );
+      sprintf((char*)filename, "_samples_%d.cplx", counter);
+      write_nd_array<float_complext>( device_samples->to_host().get(), filename );
+      sprintf((char*)filename, "_reg_%d.cplx", counter);
+      write_nd_array<float_complext>( reg_image->to_host().get(), filename );
+      counter++; 
+      }*/
+
+    // Invoke solver
+    // 
+
+    boost::shared_ptr< cuNDArray<float_complext> > cgresult;
+
+    {
+      boost::shared_ptr<GPUTimer> solve_timer;
+      if( output_timing_ )
+        solve_timer = boost::shared_ptr<GPUTimer>( new GPUTimer("gpuCgSpiritGadget::solve()") );
+      
+      cgresult = cg_.solve(device_samples.get());
+      
+      if( output_timing_ )
+        solve_timer.reset();
+    }
+    
+    if (!cgresult.get()) {
+      GDEBUG("Iterative_spirit_compute failed\n");
+      return GADGET_FAIL;
+    }
+
+    /*
+      static int counter = 0;
+      char filename[256];
+      sprintf((char*)filename, "recon_%d.real", counter);
+      write_nd_array<float>( abs(cgresult.get())->to_host().get(), filename );
+      counter++; 
+    */
+
+    // If the recon matrix size exceeds the sequence matrix size then crop
+    if( matrix_size_seq_ != matrix_size_ )
+      cgresult = crop<float_complext,2>( (matrix_size_-matrix_size_seq_)>>1, matrix_size_seq_, cgresult.get() );    
+    
+    // Combine coil images
+    //
+
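+    // Root-sum-of-squares combination over the coil dimension (dimension 3 of [x, y, frames, coils])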
+    cgresult = real_to_complex<float_complext>(sqrt(sum(abs_square(cgresult.get()).get(), 3).get()).get()); // RSS
+    //cgresult = sum(cgresult.get(), 2);
+
+    // Pass on the reconstructed images
+    //
+
+    
+    put_frames_on_que(frames, rotations, j, cgresult.get());
+    frame_counter_ += frames;
+
+    if( output_timing_ )
+      process_timer.reset();
+
+    m1->release();
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(gpuCgSpiritGadget)
+}
diff --git a/gadgets/pmri/gpuCgSpiritGadget.h b/gadgets/pmri/gpuCgSpiritGadget.h
new file mode 100644
index 0000000..6e5cb38
--- /dev/null
+++ b/gadgets/pmri/gpuCgSpiritGadget.h
@@ -0,0 +1,63 @@
+#ifndef gpuCgSpiritGadget_H
+#define gpuCgSpiritGadget_H
+#pragma once
+
+#include "gadgetron_gpupmri_export.h"
+#include "Gadget.h"
+#include "GenericReconJob.h"
+#include "GadgetMRIHeaders.h"
+#include "cuCgSolver.h"
+#include "cuNFFTOperator.h"
+#include "cuSpiritOperator.h"
+#include "cuCgPreconditioner.h"
+#include "cuNFFT.h"
+#include "cuImageOperator.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+#include "gpuSenseGadget.h"
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_GPUPMRI gpuCgSpiritGadget : public gpuSenseGadget
+   {
+
+  public:
+
+    GADGET_DECLARE(gpuCgSpiritGadget);
+
+    gpuCgSpiritGadget();
+    virtual ~gpuCgSpiritGadget();
+  protected:
+    GADGET_PROPERTY(number_of_iterations, int, "Number of iterations", 5);
+    GADGET_PROPERTY(kappa, float, "Kappa regularization factor", 0.3);
+
+    virtual int process( GadgetContainerMessage< ISMRMRD::ImageHeader > *m1, GadgetContainerMessage< GenericReconJob > *m2 );
+    virtual int process_config( ACE_Message_Block* mb );
+
+
+    unsigned int number_of_iterations_;
+    double cg_limit_;
+    bool matrix_size_reported_;
+    bool is_configured_;
+
+    double kappa_;
+
+    // Define conjugate gradient solver
+    cuCgSolver<float_complext> cg_;
+
+    // Define Spirit encoding operator (NFFT)
+    boost::shared_ptr< cuNFFTOperator<float,2> > E_;
+
+    // Define Spirit regularization operator (convolution consistency)
+    boost::shared_ptr< cuSpirit2DOperator<float> > S_;
+
+    // Define preconditioner
+    //boost::shared_ptr< cuCgPreconditioner<float_complext> > D_;
+
+    // Define regularization image operator
+    //boost::shared_ptr< cuImageOperator<float_complext> > R_;
+    
+  };
+}
+#endif //gpuCgSpiritGadget
diff --git a/gadgets/pmri/gpuGenericSensePrepGadget.cpp b/gadgets/pmri/gpuGenericSensePrepGadget.cpp
new file mode 100644
index 0000000..5e40034
--- /dev/null
+++ b/gadgets/pmri/gpuGenericSensePrepGadget.cpp
@@ -0,0 +1,933 @@
+#include "gpuGenericSensePrepGadget.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "GenericReconJob.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+#include "hoNDArray_utils.h"
+#include "vector_td_operators.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+#include "check_CUDA.h"
+#include "hoNDArray_fileio.h"
+#include "ismrmrd/xml.h"
+
+#include <boost/make_shared.hpp>
+#include <algorithm>
+#include <vector>
+#include <cmath>
+#include <stdexcept>
+
+namespace Gadgetron{
+
+  gpuGenericSensePrepGadget::gpuGenericSensePrepGadget()
+    : slices_(-1)
+    , sets_(-1)
+    , device_number_(-1)
+    , samples_per_readout_(-1)
+  {
+  }
+  
+  gpuGenericSensePrepGadget::~gpuGenericSensePrepGadget() {}
+  
+  int gpuGenericSensePrepGadget::process_config(ACE_Message_Block* mb)
+  {
+    // Get configuration values from config file
+    //
+
+    device_number_ = deviceno.value();
+    rotations_per_reconstruction_ = rotations_per_reconstruction.value();
+    buffer_length_in_rotations_ = buffer_length_in_rotations.value();
+    buffer_using_solver_ = buffer_using_solver.value();
+    output_timing_ = output_timing.value();
+
+    // Currently there are some restrictions on the allowed sliding window configurations
+    //
+    
+    sliding_window_readouts_ = sliding_window_readouts.value();
+    sliding_window_rotations_ = sliding_window_rotations.value();
+
+    if( sliding_window_readouts_>0 && sliding_window_rotations_>0 ){
+      GDEBUG( "Error: Sliding window reconstruction is not yet supported for both readouts and frames simultaneously.\n" );
+      return GADGET_FAIL;
+    }
+
+    if( sliding_window_readouts_>0 && rotations_per_reconstruction_>0 ){
+      GDEBUG( "Error: Sliding window reconstruction over readouts is not yet supported for multiframe reconstructions.\n" );
+      return GADGET_FAIL;
+    }
+    
+    if( sliding_window_rotations_ > 0 && sliding_window_rotations_ >= rotations_per_reconstruction_ ){
+      GDEBUG( "Error: Illegal sliding window configuration.\n" );
+      return GADGET_FAIL;
+    }
+
+    // Setup and validate device configuration
+    //
+
+    int number_of_devices;
+    if (cudaGetDeviceCount(&number_of_devices)!= cudaSuccess) {
+      GDEBUG( "Error: unable to query number of CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    if (number_of_devices == 0) {
+      GDEBUG( "Error: No available CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    if (device_number_ >= number_of_devices) {
+      GDEBUG("Adjusting device number from %d to %d\n", device_number_,  (device_number_%number_of_devices));
+      device_number_ = (device_number_%number_of_devices);
+    }
+
+    if (cudaSetDevice(device_number_)!= cudaSuccess) {
+      GDEBUG( "Error: unable to set CUDA device.\n" );
+      return GADGET_FAIL;
+    }
+
+    cudaDeviceProp deviceProp;
+    if( cudaGetDeviceProperties( &deviceProp, device_number_ ) != cudaSuccess) {
+      GDEBUG( "Error: unable to query device properties.\n" );
+      return GADGET_FAIL;
+    }
+    
+    unsigned int warp_size = deviceProp.warpSize;
+
+    // It is possible to specify one set to use for csm propagation, and then propagate this to all sets
+    //
+
+    propagate_csm_from_set_ = propagate_csm_from_set.value();
+
+    if( propagate_csm_from_set_ > 0 ){
+      GDEBUG("Currently, only set 0 can propagate coil sensitivity maps. Set %d was specified.\n", propagate_csm_from_set_ );
+      return GADGET_FAIL;
+    }
+
+    if( propagate_csm_from_set_ >= 0 ){
+      GDEBUG("Propagating csm from set %d to all sets\n", propagate_csm_from_set_ );
+    }
+
+    // Convolution kernel width and oversampling ratio (for the buffer)
+    //
+
+    kernel_width_ = buffer_convolution_kernel_width.value();
+    oversampling_factor_ = buffer_convolution_oversampling_factor.value();
+
+    // Get the Ismrmrd header
+    //
+    ISMRMRD::IsmrmrdHeader h;
+    ISMRMRD::deserialize(mb->rd_ptr(),h);
+    
+    
+    if (h.encoding.size() != 1) {
+      GDEBUG("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+    
+    // Get the encoding space and trajectory description
+    ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+    ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+    ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+
+    // Matrix sizes (as a multiple of the GPU's warp size)
+    //
+    
+    image_dimensions_.push_back(((e_space.matrixSize.x+warp_size-1)/warp_size)*warp_size);
+    image_dimensions_.push_back(((e_space.matrixSize.y+warp_size-1)/warp_size)*warp_size);
+
+    image_dimensions_recon_.push_back(((static_cast<unsigned int>(std::ceil(e_space.matrixSize.x*reconstruction_os_factor_x.value()))+warp_size-1)/warp_size)*warp_size);  
+    image_dimensions_recon_.push_back(((static_cast<unsigned int>(std::ceil(e_space.matrixSize.y*reconstruction_os_factor_y.value()))+warp_size-1)/warp_size)*warp_size);
+    
+    image_dimensions_recon_os_ = uint64d2
+      (((static_cast<unsigned int>(std::ceil(image_dimensions_recon_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size,
+       ((static_cast<unsigned int>(std::ceil(image_dimensions_recon_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size);
+    
+    // In case the warp_size constraint kicked in
+    oversampling_factor_ = float(image_dimensions_recon_os_[0])/float(image_dimensions_recon_[0]); 
+    
+    GDEBUG("matrix_size_x : %d, recon: %d, recon_os: %d\n", 
+                  image_dimensions_[0], image_dimensions_recon_[0], image_dimensions_recon_os_[0]);
+
+    GDEBUG("matrix_size_y : %d, recon: %d, recon_os: %d\n", 
+                  image_dimensions_[1], image_dimensions_recon_[1], image_dimensions_recon_os_[1]);
+    
+    fov_.push_back(r_space.fieldOfView_mm.x);
+    fov_.push_back(r_space.fieldOfView_mm.y);
+    fov_.push_back(r_space.fieldOfView_mm.z);
+
+    slices_ = e_limits.slice ? e_limits.slice->maximum + 1 : 1;
+    sets_ = e_limits.set ? e_limits.set->maximum + 1 : 1;
+    
+    // Allocate readout and trajectory queues
+    // - one queue for the currently incoming frame
+    // - one queue for the upcoming reconstruction
+
+    frame_readout_queue_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+    recon_readout_queue_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+    frame_traj_queue_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+    recon_traj_queue_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+    image_headers_queue_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+    
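+    // Size the queue water marks to hold on the order of ten frames' worth of readout container messages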
+    size_t bsize = sizeof(GadgetContainerMessage< hoNDArray< std::complex<float> > >)*image_dimensions_[0]*10;
+    
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+      frame_readout_queue_[i].high_water_mark(bsize);
+      frame_readout_queue_[i].low_water_mark(bsize);
+      frame_traj_queue_[i].high_water_mark(bsize);
+      frame_traj_queue_[i].low_water_mark(bsize);
+    }
+    
+    bsize *= (rotations_per_reconstruction_+1);
+    
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+      recon_readout_queue_[i].high_water_mark(bsize);
+      recon_readout_queue_[i].low_water_mark(bsize);
+      recon_traj_queue_[i].high_water_mark(bsize);
+      recon_traj_queue_[i].low_water_mark(bsize);
+    }
+    
+    // Define various per slice/set variables
+    //
+
+    previous_readout_no_ = boost::shared_array<long>(new long[slices_*sets_]);
+    acceleration_factor_ = boost::shared_array<long>(new long[slices_*sets_]);
+    image_counter_ = boost::shared_array<long>(new long[slices_*sets_]);
+    readout_counter_frame_= boost::shared_array<long>(new long[slices_*sets_]);
+    readout_counter_global_= boost::shared_array<long>(new long[slices_*sets_]);
+    readouts_per_frame_= boost::shared_array<long>(new long[slices_*sets_]);
+    frames_per_rotation_= boost::shared_array<long>(new long[slices_*sets_]);
+    buffer_frames_per_rotation_= boost::shared_array<long>(new long[slices_*sets_]);
+    buffer_update_needed_ = boost::shared_array<bool>(new bool[slices_*sets_]);
+    reconfigure_ = boost::shared_array<bool>(new bool[slices_*sets_]);
+    num_coils_ = boost::shared_array<unsigned int>(new unsigned int[slices_*sets_]);
+    
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+
+      previous_readout_no_[i] = -1;
+      acceleration_factor_[i] = -1;
+      image_counter_[i] = 0;
+      readout_counter_frame_[i] = 0;
+      readout_counter_global_[i] = 0;
+      readouts_per_frame_[i] = readouts_per_frame.value();
+      frames_per_rotation_[i] = frames_per_rotation.value();
+      buffer_frames_per_rotation_[i] = buffer_frames_per_rotation.value();
+      num_coils_[i] = 0;
+      buffer_update_needed_[i] = true;
+      reconfigure_[i] = true;
+
+      // Assign some default values ("upper bound estimates") for the (possibly) unknown entities
+      //
+      
+      if( readouts_per_frame_[i] == 0 ){
+        readouts_per_frame_[i] = image_dimensions_[0];
+      }
+      
+      if( frames_per_rotation_[i] == 0 ){
+        frames_per_rotation_[i] = image_dimensions_[0]/readouts_per_frame_[i];
+      }
+
+      // Also remember to set the high/low water marks of the ISMRMRD image header queue
+      //
+
+      bsize = sizeof(GadgetContainerMessage<ISMRMRD::ImageHeader>)*100*
+        std::max(1L, frames_per_rotation_[i]*rotations_per_reconstruction_);
+    
+      image_headers_queue_[i].high_water_mark(bsize);
+      image_headers_queue_[i].low_water_mark(bsize);
+    }
+
+    // If need be the following limitation can be lifted, but it would be a little tedious... 
+    //
+
+    if( buffer_using_solver_ && rotations_per_reconstruction_ < 1 ) {
+      GDEBUG("Error: when buffering using a cg solver, 'rotations_per_reconstruction' must be specified (and strictly positive).");
+    }
+
+    if( buffer_using_solver_ && ( buffer_frames_per_rotation_[0] > 0 || buffer_length_in_rotations_ > 0 ) ){
+      GDEBUG("Error: when buffering using a cg solver, we currently do not support specification of 'buffer_frames_per_rotation' or 'buffer_length_in_rotations'. These values are instead automatically set to match the reconstruction settings.\n");
+      return GADGET_FAIL;
+    }
+            
+    position_ = boost::shared_array<float[3]>(new float[slices_*sets_][3]);
+    read_dir_ = boost::shared_array<float[3]>(new float[slices_*sets_][3]);
+    phase_dir_ = boost::shared_array<float[3]>(new float[slices_*sets_][3]);
+    slice_dir_ = boost::shared_array<float[3]>(new float[slices_*sets_][3]);
+
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+      (position_[i])[0] = (position_[i])[1] = (position_[i])[2] = 0.0f;
+      (read_dir_[i])[0] = (read_dir_[i])[1] = (read_dir_[i])[2] = 0.0f;
+      (phase_dir_[i])[0] = (phase_dir_[i])[1] = (phase_dir_[i])[2] = 0.0f;
+      (slice_dir_[i])[0] = (slice_dir_[i])[1] = (slice_dir_[i])[2] = 0.0f;
+    }
+
+    // Allocate accumulation buffer
+    //
+
+    if( buffer_using_solver_ )
+      acc_buffer_cg_ = boost::shared_array< cuSenseBufferCg<float,2> >(new cuSenseBufferCg<float,2>[slices_*sets_]);
+    else
+      acc_buffer_ = boost::shared_array< cuSenseBuffer<float,2> >(new cuSenseBuffer<float,2>[slices_*sets_]);
+    
+    // Allocate remaining shared_arrays
+    //
+    
+    csm_host_ = boost::shared_array< hoNDArray<float_complext> >(new hoNDArray<float_complext>[slices_*sets_]);
+    reg_host_ = boost::shared_array< hoNDArray<float_complext> >(new hoNDArray<float_complext>[slices_*sets_]);
+
+    return GADGET_OK;
+  }
+
+  int gpuGenericSensePrepGadget::
+  process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *m1,           // header
+          GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2,   // data
+          GadgetContainerMessage< hoNDArray<float> > *m3)                   // traj/dcw
+  {
+    // Noise should have been consumed by the noise adjust (if in the gadget chain)
+    //
+    
+    bool is_noise = m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_IS_NOISE_MEASUREMENT);
+    if (is_noise) { 
+      m1->release();
+      return GADGET_OK;
+    }
+
+    // Setup timer if asked for
+    //
+
+    boost::shared_ptr<GPUTimer> process_timer;
+    if( output_timing_ )
+      process_timer = boost::shared_ptr<GPUTimer>( new GPUTimer("gpuGenericSensePrepGadget::process()") );
+
+    // Some convenient utility variables
+    //
+
+    unsigned int set = m1->getObjectPtr()->idx.set;
+    unsigned int slice = m1->getObjectPtr()->idx.slice;
+    unsigned int readout = m1->getObjectPtr()->idx.kspace_encode_step_1;
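+    // Linear index into the per-slice/per-set bookkeeping arrays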
+    unsigned int idx = set*slices_+slice;
+
+    // Get a pointer to the accumulation buffer. 
+    //
+
+    cuSenseBuffer<float,2> *acc_buffer = 
+      (buffer_using_solver_) ? &acc_buffer_cg_[idx] : &acc_buffer_[idx];
+
+    // Has the imaging plane changed?
+    //
+
+    if( !vec_equal(position_[idx], m1->getObjectPtr()->position) ||
+        !vec_equal(read_dir_[idx], m1->getObjectPtr()->read_dir) || 
+        !vec_equal(phase_dir_[idx], m1->getObjectPtr()->phase_dir) ||
+        !vec_equal(slice_dir_[idx], m1->getObjectPtr()->slice_dir) ){
+      
+      // Yes indeed, clear the accumulation buffer and update structs
+      //
+
+      acc_buffer->clear();
+      buffer_update_needed_[idx] = true;
+      
+      memcpy(position_[idx],m1->getObjectPtr()->position,3*sizeof(float));
+      memcpy(read_dir_[idx],m1->getObjectPtr()->read_dir,3*sizeof(float));
+      memcpy(phase_dir_[idx],m1->getObjectPtr()->phase_dir,3*sizeof(float));
+      memcpy(slice_dir_[idx],m1->getObjectPtr()->slice_dir,3*sizeof(float));
+    }
+    
+    // Only when the first readout arrives do we know the #samples/readout
+    //
+
+    if( samples_per_readout_ == -1 )      
+      samples_per_readout_ = m1->getObjectPtr()->number_of_samples;
+    
+    if( samples_per_readout_ != m1->getObjectPtr()->number_of_samples ){
+      GDEBUG("Unexpected change in the readout length\n");
+      return GADGET_FAIL;
+    }
+    
+    bool new_frame_detected = false;
+
+    // Reconfigure at first pass
+    // - or if the number of coils changes
+    // - or if the reconfigure_ flag is set
+
+    if( num_coils_[idx] != m1->getObjectPtr()->active_channels ){
+      GDEBUG("Reconfiguring (the number of coils changed)\n");
+      num_coils_[idx] = m1->getObjectPtr()->active_channels;
+      reconfigure(set, slice);
+    }
+
+    if( reconfigure_[idx] ){
+      GDEBUG("Reconfiguring (due to boolean indicator)\n");
+      reconfigure(set, slice);
+    }
+
+    // Keep track of the incoming readout ids
+    // - to determine the number of readouts per frame
+    // - to determine the number of frames per rotation
+
+    if (previous_readout_no_[idx] >= 0) {
+
+      if ( readout > previous_readout_no_[idx]) { 
+        // This readout belongs to the current frame (it is not the first readout of a new frame).
+        // Make an estimate of the acceleration factor
+        //
+	
+        long tmp_accel = readout - previous_readout_no_[idx];
+
+        if( acceleration_factor_[idx] != tmp_accel )
+          GDEBUG("Detected an acceleration factor of %d\n", tmp_accel);
+	
+        acceleration_factor_[idx] = tmp_accel;
+      }
+      else{ 
+
+        // This is the first readout in a new frame
+        //
+
+        if( readouts_per_frame.value() == 0 &&
+            readout_counter_frame_[idx] > 0 &&
+            readout_counter_frame_[idx] != readouts_per_frame_[idx] ){ 
+
+          // A new acceleration factor is detected
+          //
+
+          GDEBUG("Reconfiguring (acceleration factor changed)\n");
+
+          new_frame_detected = true;
+          readouts_per_frame_[idx] = readout_counter_frame_[idx];
+
+          // Assume that #frames/rotation equals the acceleration factor
+          // If not, or if we cannot deduce the acceleration factor from the difference
+          // of two subsequent readout ids, then 'frames_per_rotation' has to be specified in the config...
+          //
+	    
+          if( frames_per_rotation.value() == 0 ) {
+            frames_per_rotation_[idx] = acceleration_factor_[idx];
+          }
+          reconfigure(set, slice);
+        }
+      }
+    }
+    previous_readout_no_[idx] = readout;
+
+    // Enqueue readout
+    // - unless 'new_frame_detected', then the current readout does not belong to the current frame and we delay enqueuing
+
+    if( !new_frame_detected ) {
+      
+      // Memory handling is easier if we make copies for our internal queues
+      frame_readout_queue_[idx].enqueue_tail(duplicate_array(m2));
+      recon_readout_queue_[idx].enqueue_tail(duplicate_array(m2));
+      frame_traj_queue_[idx].enqueue_tail(duplicate_array(m3));
+      recon_traj_queue_[idx].enqueue_tail(duplicate_array(m3));
+    }
+
+    // If the readout is the last of a "true frame" (ignoring any sliding window readouts)
+    // - then update the accumulation buffer
+
+    bool is_last_readout_in_frame = (readout_counter_frame_[idx] == readouts_per_frame_[idx]-1);
+    is_last_readout_in_frame |= new_frame_detected;
+
+    cuNDArray<floatd2> traj;
+    cuNDArray<float> dcw;
+    
+    if( is_last_readout_in_frame ){
+
+      // Get ready to update the csm/regularization buffer
+      //
+
+      // Extract this frame's samples 
+      //
+
+      boost::shared_ptr< hoNDArray<float_complext> > host_samples = 
+        extract_samples_from_queue( &frame_readout_queue_[idx], false, set, slice );
+            
+      cuNDArray<float_complext> samples( host_samples.get() );
+
+      // Extract this frame's trajectory and dcw.
+      //
+
+      extract_trajectory_and_dcw_from_queue( &frame_traj_queue_[idx], false, set, slice, 
+                                             samples_per_readout_*readouts_per_frame_[idx], 1,
+                                             &traj, &dcw );
+
+      // Scale the dcw weights to the area of the oversampled recon matrix
+      float scale_factor = float(prod(image_dimensions_recon_os_))/asum(&dcw);
+      dcw *= scale_factor;
+      
+      // Add this frame to the buffer
+      //
+
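+      // The return value of add_frame_data() is OR'ed into buffer_update_needed_, which later
+      // triggers recomputation of the csm and regularization images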
+      acc_buffer->set_dcw(boost::shared_ptr< cuNDArray<float> >(new cuNDArray<float>(&dcw)));
+      buffer_update_needed_[idx] |= acc_buffer->add_frame_data( &samples, &traj );
+    }
+
+    // Are we ready to reconstruct (downstream)?
+    //
+
+    long readouts_per_reconstruction = readouts_per_frame_[idx];
+
+    if( rotations_per_reconstruction_ > 0 )
+      readouts_per_reconstruction *= (frames_per_rotation_[idx]*rotations_per_reconstruction_);
+    
+    bool is_last_readout_in_reconstruction = ( recon_readout_queue_[idx].message_count() == readouts_per_reconstruction );
+
+    // Prepare the image header for this frame
+    // - if this is indeed the last readout of a frame
+    // - or if we are about to reconstruct due to 'sliding_window_readouts_' > 0
+    
+    if( is_last_readout_in_frame || 
+        (is_last_readout_in_reconstruction && image_headers_queue_[idx].message_count() == 0) ){
+      
+      GadgetContainerMessage<ISMRMRD::ImageHeader> *header = new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+      ISMRMRD::AcquisitionHeader *base_head = m1->getObjectPtr();
+
+      {
+        // Initialize header to all zeroes (there are a few fields we do not set yet)
+        ISMRMRD::ImageHeader tmp;
+        *(header->getObjectPtr()) = tmp;
+      }
+
+      header->getObjectPtr()->version = base_head->version;
+
+      header->getObjectPtr()->matrix_size[0] = image_dimensions_recon_[0];
+      header->getObjectPtr()->matrix_size[1] = image_dimensions_recon_[1];
+      header->getObjectPtr()->matrix_size[2] = std::max(1L,frames_per_rotation_[idx]*rotations_per_reconstruction_);
+
+      header->getObjectPtr()->field_of_view[0] = fov_[0];
+      header->getObjectPtr()->field_of_view[1] = fov_[1];
+      header->getObjectPtr()->field_of_view[2] = fov_[2];
+
+      header->getObjectPtr()->channels = num_coils_[idx];
+      header->getObjectPtr()->slice = base_head->idx.slice;
+      header->getObjectPtr()->set = base_head->idx.set;
+
+      header->getObjectPtr()->acquisition_time_stamp = base_head->acquisition_time_stamp;
+      memcpy(header->getObjectPtr()->physiology_time_stamp, base_head->physiology_time_stamp, sizeof(uint32_t)*ISMRMRD::ISMRMRD_PHYS_STAMPS);
+
+      memcpy(header->getObjectPtr()->position, base_head->position, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->read_dir, base_head->read_dir, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->phase_dir, base_head->phase_dir, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->slice_dir, base_head->slice_dir, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->patient_table_position, base_head->patient_table_position, sizeof(float)*3);
+
+      header->getObjectPtr()->data_type = ISMRMRD::ISMRMRD_CXFLOAT;
+      header->getObjectPtr()->image_index = image_counter_[idx]++; 
+      header->getObjectPtr()->image_series_index = idx;
+
+      image_headers_queue_[idx].enqueue_tail(header);
+    }
+    
+    // If it is time to reconstruct (downstream) then prepare the Sense job
+    // 
+
+    if( is_last_readout_in_reconstruction ){
+      
+      // Update csm and regularization images if the buffer has changed (completed a cycle) 
+      // - and at the first pass
+
+      if( buffer_update_needed_[idx] || 
+          csm_host_[idx].get_number_of_elements() == 0 || 
+          reg_host_[idx].get_number_of_elements() == 0 ){
+
+        // Get the accumulated coil images
+        //
+
+        boost::shared_ptr< cuNDArray<float_complext> > csm_data = acc_buffer->get_accumulated_coil_images();
+
+        // Estimate CSM
+        //
+
+        if( propagate_csm_from_set_ < 0 || propagate_csm_from_set_ == set ){
+          if (num_coils_[idx] == 1){ // If we only have 1 coil
+            csm_ = boost::make_shared<cuNDArray<float_complext>>(csm_data->get_dimensions());
+            fill(csm_.get(), float_complext(1.0,0));
+          } else
+            csm_ = estimate_b1_map<float,2>( csm_data.get() );
+        }
+        else{
+          GDEBUG("Set %d is reusing the csm from set %d\n", set, propagate_csm_from_set_);
+          if( csm_.get() == 0x0 ){
+            GDEBUG("Error: csm has not been computed, cannot propagate\n");
+            return GADGET_FAIL;
+          }	  
+        }
+
+        acc_buffer->set_csm(csm_);
+        csm_host_[idx] = *(csm_->to_host());
+	
+        // Compute regularization image
+        //
+
+        boost::shared_ptr< cuNDArray<float_complext> > reg_image;
+        std::vector<size_t> dims;
+    	
+        if( buffer_using_solver_ ){
+
+          //GPUTimer timer("\n\n AVOIDABLE PREPROCESSING. HOW EXPENSIVE?\n\n");
+
+          extract_trajectory_and_dcw_from_queue( &recon_traj_queue_[idx], true, set, slice, 
+                                                 samples_per_readout_*readouts_per_frame_[idx],
+                                                 std::max(1L, frames_per_rotation_[idx]*rotations_per_reconstruction_),
+                                                 &traj, &dcw );
+
+          // Scale the dcw weights to the area of the oversampled recon matrix
+          float scale_factor = float(prod(image_dimensions_recon_os_))/asum(&dcw);
+          dcw *= scale_factor;
+
+          dims = *traj.get_dimensions();
+
+          std::vector<size_t> tmp_dims;
+          tmp_dims.push_back(dims[0]*dims[1]);
+          tmp_dims.push_back(1);
+	  
+          traj.reshape(&tmp_dims);
+          dcw.reshape(&tmp_dims);
+	  
+          ((cuSenseBufferCg<float,2>*)acc_buffer)->preprocess(&traj);
+          ((cuSenseBufferCg<float,2>*)acc_buffer)->set_dcw_for_rhs(boost::shared_ptr< cuNDArray<float> >(new cuNDArray<float>(&dcw)));
+        }
+
+        reg_image = acc_buffer->get_combined_coil_image();	
+        reg_host_[idx] = *(reg_image->to_host());
+	
+        if( buffer_using_solver_ ){
+          traj.reshape(&dims);
+          dcw.reshape(&dims);
+        }
+	
+        /*
+          static int counter = 0;
+          char filename[256];
+          sprintf((char*)filename, "reg_%d.cplx", counter);
+          write_nd_array<float_complext>( &reg_host_[idx], filename );
+          counter++; */
+
+        buffer_update_needed_[idx] = false;
+      }
+
+      // Prepare data array for the downstream reconstruction
+      //
+      
+      boost::shared_ptr< hoNDArray<float_complext> > samples_host = 
+        extract_samples_from_queue( &recon_readout_queue_[idx], true, set, slice );
+      
+      // Prepare the trajectory and dcw arrays.
+      // They have already been computed above 
+      // - if 'rotations_per_reconstruction_' is 0
+      // - if 'buffer_using_solver_' is true
+      
+      if( !(/*rotations_per_reconstruction_ == 0 ||*/ buffer_using_solver_) ){
+      	extract_trajectory_and_dcw_from_queue( &recon_traj_queue_[idx], true, set, slice, 
+                                               samples_per_readout_*readouts_per_frame_[idx],
+                                               std::max(1L, frames_per_rotation_[idx]*rotations_per_reconstruction_),
+                                               &traj, &dcw );
+      }
+
+      // Set up the Sense job
+      //
+
+      GadgetContainerMessage< GenericReconJob > *sj = new GadgetContainerMessage<GenericReconJob>();
+      	
+      sj->getObjectPtr()->dat_host_ = samples_host;      
+      sj->getObjectPtr()->tra_host_ = traj.to_host();
+      sj->getObjectPtr()->dcw_host_ = dcw.to_host();
+      sj->getObjectPtr()->csm_host_ = boost::shared_ptr< hoNDArray<float_complext> >( new hoNDArray<float_complext>(csm_host_[idx]));
+      sj->getObjectPtr()->reg_host_ = boost::shared_ptr< hoNDArray<float_complext> >( new hoNDArray<float_complext>(reg_host_[idx]));
+      
+      // Pull the image headers out of the queue
+      //
+
+      long frames_per_reconstruction = 
+        std::max( 1L, frames_per_rotation_[idx]*rotations_per_reconstruction_ );
+      
+      if( image_headers_queue_[idx].message_count() != frames_per_reconstruction ){
+        sj->release();
+        GDEBUG("Unexpected size of image header queue: %d, %d\n", 
+                      image_headers_queue_[idx].message_count(), frames_per_reconstruction);
+        return GADGET_FAIL;
+      }
+      
+      sj->getObjectPtr()->image_headers_ =
+        boost::shared_array<ISMRMRD::ImageHeader>( new ISMRMRD::ImageHeader[frames_per_reconstruction] );
+      
+      for( unsigned int i=0; i<frames_per_reconstruction; i++ ){	
+
+        ACE_Message_Block *mbq;
+
+        if( image_headers_queue_[idx].dequeue_head(mbq) < 0 ) {
+          sj->release();
+          GDEBUG("Image header dequeue failed\n");
+          return GADGET_FAIL;
+        }
+	
+        GadgetContainerMessage<ISMRMRD::ImageHeader> *m = AsContainerMessage<ISMRMRD::ImageHeader>(mbq);
+        sj->getObjectPtr()->image_headers_[i] = *m->getObjectPtr();
+
+        // In sliding window mode the header might need to go back to the end of the queue for reuse
+        // 
+	
+        if( i >= frames_per_reconstruction-sliding_window_rotations_*frames_per_rotation_[idx] ){
+          image_headers_queue_[idx].enqueue_tail(m);
+        }
+        else {
+          m->release();
+        }
+      }
+      
+      // The Sense Job needs an image header as well. 
+      // Let us just copy the initial one...
+
+      GadgetContainerMessage<ISMRMRD::ImageHeader> *m4 = new GadgetContainerMessage<ISMRMRD::ImageHeader>;
+
+      *m4->getObjectPtr() = sj->getObjectPtr()->image_headers_[0];
+      m4->cont(sj);
+
+      // Pass the Sense job downstream
+      //
+      
+      if (this->next()->putq(m4) < 0) {
+        GDEBUG("Failed to put job on queue.\n");
+        m4->release();
+        return GADGET_FAIL;
+      }
+    }
+    
+    if( is_last_readout_in_frame )
+      readout_counter_frame_[idx] = 0;
+    else{
+      readout_counter_frame_[idx]++;
+    }
+
+    if( new_frame_detected ){
+
+      // The incoming readout was actually the first readout of the next frame; enqueue it.
+      //
+
+      frame_readout_queue_[idx].enqueue_tail(duplicate_array(m2));
+      recon_readout_queue_[idx].enqueue_tail(duplicate_array(m2)); 
+      frame_traj_queue_[idx].enqueue_tail(duplicate_array(m3));
+      recon_traj_queue_[idx].enqueue_tail(duplicate_array(m3)); 
+
+      readout_counter_frame_[idx]++;
+    }
+
+    readout_counter_global_[idx]++;
+
+    if( output_timing_ )
+      process_timer.reset();
+    
+    m1->release(); // this is safe, the internal queues hold copies
+    return GADGET_OK;
+  }
+  
+  boost::shared_ptr< hoNDArray<float_complext> > 
+  gpuGenericSensePrepGadget::extract_samples_from_queue ( ACE_Message_Queue<ACE_MT_SYNCH> *queue, 
+                                                          bool sliding_window, unsigned int set, unsigned int slice )
+  {    
+    unsigned int readouts_buffered = queue->message_count();
+    
+    std::vector<size_t> dims;
+    dims.push_back(samples_per_readout_*readouts_buffered);
+    dims.push_back(num_coils_[set*slices_+slice]);
+    
+    boost::shared_ptr< hoNDArray<float_complext> > host_samples(new hoNDArray<float_complext>(&dims));
+    
+    for (unsigned int p=0; p<readouts_buffered; p++) {
+      
+      ACE_Message_Block* mbq;
+      if (queue->dequeue_head(mbq) < 0) {
+        GDEBUG("Message dequeue failed\n");
+        throw std::runtime_error("gpuGenericSensePrepGadget::extract_samples_from_queue: dequeuing failed");
+      }
+      
+      GadgetContainerMessage< hoNDArray< std::complex<float> > > *daq = AsContainerMessage<hoNDArray< std::complex<float> > >(mbq);
+	
+      if (!daq) {
+        GDEBUG("Unable to interpret data on message queue\n");
+        throw std::runtime_error("gpuGenericSensePrepGadget::extract_samples_from_queue: failed to interpret data");	
+      }
+	
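+      // Copy this readout's samples for each coil; the host array is laid out as [samples*readouts, coils]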
+      for (unsigned int c = 0; c < num_coils_[set*slices_+slice]; c++) {
+	
+        float_complext *data_ptr = host_samples->get_data_ptr();
+        data_ptr += c*samples_per_readout_*readouts_buffered+p*samples_per_readout_;
+	    
+        std::complex<float> *r_ptr = daq->getObjectPtr()->get_data_ptr();
+        r_ptr += c*daq->getObjectPtr()->get_size(0);
+	  
+        memcpy(data_ptr, r_ptr, samples_per_readout_*sizeof(float_complext));
+      }
+
+      // In sliding window mode the readout might need to go back to the end of the queue
+      // 
+      
+      long readouts_in_sliding_window = sliding_window_readouts_ + 
+        readouts_per_frame_[set*slices_+slice]*frames_per_rotation_[set*slices_+slice]*sliding_window_rotations_;
+
+      if( sliding_window && p >= (readouts_buffered-readouts_in_sliding_window) )
+        queue->enqueue_tail(mbq);
+      else
+        mbq->release();
+    } 
+    
+    return host_samples;
+  }
+  
+  boost::shared_ptr< hoNDArray<float> > 
+  gpuGenericSensePrepGadget::extract_trajectory_from_queue ( ACE_Message_Queue<ACE_MT_SYNCH> *queue, 
+                                                             bool sliding_window, unsigned int set, unsigned int slice )
+  {    
+    if(!queue) {
+      GDEBUG("Illegal queue pointer, cannot extract trajectory\n");
+      throw std::runtime_error("gpuGenericSensePrepGadget::extract_trajectory_from_queue: illegal queue pointer");	
+    }
+
+    if(queue->message_count()==0) {
+      GDEBUG("Empty queue, cannot extract trajectory\n");
+      throw std::runtime_error("gpuGenericSensePrepGadget::extract_trajectory_from_queue: empty queue");	
+    }
+
+    if(samples_per_readout_ < 1) {
+      GDEBUG("Empty queue (%d), cannot extract trajectory\n", samples_per_readout_);
+      throw std::runtime_error("gpuGenericSensePrepGadget::extract_trajectory_from_queue: empty queue");	
+    }
+    
+    unsigned int readouts_buffered = queue->message_count();
+    
+    std::vector<size_t> dims;
+    dims.push_back(3);
+    dims.push_back(samples_per_readout_);
+    dims.push_back(readouts_buffered);
+    
+    boost::shared_ptr< hoNDArray<float> > host_samples(new hoNDArray<float>(&dims));
+    
+    for (unsigned int p=0; p<readouts_buffered; p++) {      
+      ACE_Message_Block* mbq;
+      if (queue->dequeue_head(mbq) < 0) {
+        GDEBUG("Message dequeue failed\n");
+        throw std::runtime_error("gpuGenericSensePrepGadget::extract_trajectory_from_queue: dequeuing failed");
+      }
+      
+      GadgetContainerMessage< hoNDArray<float> > *daq = AsContainerMessage<hoNDArray<float> >(mbq);
+	
+      if (!daq) {
+        GDEBUG("Unable to interpret data on message queue\n");
+        throw std::runtime_error("gpuGenericSensePrepGadget::extract_trajectory_from_queue: failed to interpret data");	
+      }
+
+      float *data_ptr = host_samples->get_data_ptr();
+      data_ptr += 3*samples_per_readout_*p;
+      
+      float *r_ptr = daq->getObjectPtr()->get_data_ptr();
+      
+      memcpy(data_ptr, r_ptr, 3*samples_per_readout_*sizeof(float));
+      
+      // In sliding window mode the readout might need to go back to the end of the queue
+      // 
+      
+      long readouts_in_sliding_window = sliding_window_readouts_ + 
+        readouts_per_frame_[set*slices_+slice]*frames_per_rotation_[set*slices_+slice]*sliding_window_rotations_;
+
+      if( sliding_window && p >= (readouts_buffered-readouts_in_sliding_window) )
+        queue->enqueue_tail(mbq);
+      else
+        mbq->release();
+    } 
+    
+    return host_samples;
+  }
+  
+  void gpuGenericSensePrepGadget::extract_trajectory_and_dcw_from_queue
+  ( ACE_Message_Queue<ACE_MT_SYNCH> *queue, bool sliding_window, unsigned int set, unsigned int slice, 
+    unsigned int samples_per_frame, unsigned int num_frames,
+    cuNDArray<floatd2> *traj, cuNDArray<float> *dcw )
+  {
+    // Extract trajectory and dcw.
+    // They are stored as a float array of dimensions: 3 x #samples_per_readout x #readouts.
+    // We need
+    // - a floatd2 trajectory array 
+    // - a float dcw array 
+    //
+    
+    boost::shared_ptr< hoNDArray<float> > host_traj_dcw =
+      extract_trajectory_from_queue( queue, sliding_window, set, slice );
+    
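+    // Permute from [3, samples, readouts] to [samples, readouts, 3] so the dcw can be read off as the
+    // last slice and the (x,y) trajectory pairs as the first two slices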
+    std::vector<size_t> order;
+    order.push_back(1); order.push_back(2); order.push_back(0);
+    
+    boost::shared_ptr< hoNDArray<float> > host_traj_dcw_shifted =
+      permute( host_traj_dcw.get(), &order );
+    
+    std::vector<size_t> dims_1d;
+    dims_1d.push_back(host_traj_dcw_shifted->get_size(0)*host_traj_dcw_shifted->get_size(1));
+    
+    {
+      hoNDArray<float> tmp(&dims_1d, host_traj_dcw_shifted->get_data_ptr()+2*dims_1d[0]);
+      *dcw = tmp;
+    }
+    
+    std::vector<size_t> dims_2d = dims_1d;
+    dims_2d.push_back(2);
+    
+    order.clear();
+    order.push_back(1); order.push_back(0);
+
+    hoNDArray<float> tmp(&dims_2d, host_traj_dcw_shifted->get_data_ptr());
+    cuNDArray<float> __traj(&tmp);
+    boost::shared_ptr< cuNDArray<float> > _traj = permute( &__traj, &order );
+    
+    cuNDArray<floatd2> tmp2(&dims_1d, (floatd2*)_traj->get_data_ptr());
+    
+    *traj = tmp2;
+    
+    unsigned int idx = set*slices_+slice;
+    dims_2d.clear();
+
+    dims_2d.push_back(samples_per_frame);
+    dims_2d.push_back(num_frames);
+
+    dcw->reshape(&dims_2d);
+    traj->reshape(&dims_2d);
+  }
+
+  template<class T> GadgetContainerMessage< hoNDArray<T> >*
+  gpuGenericSensePrepGadget::duplicate_array( GadgetContainerMessage< hoNDArray<T> > *array )
+  {
+    GadgetContainerMessage< hoNDArray<T> > *copy = new GadgetContainerMessage< hoNDArray<T> >();   
+    *(copy->getObjectPtr()) = *(array->getObjectPtr());
+    return copy;
+  }
+
+  void gpuGenericSensePrepGadget::reconfigure(unsigned int set, unsigned int slice)
+  {    
+    unsigned int idx = set*slices_+slice;
+    
+    GDEBUG("\nReconfiguring:\n#readouts/frame:%d\n#frames/rotation: %d\n#rotations/reconstruction:%d\n", 
+                  readouts_per_frame_[idx], frames_per_rotation_[idx], rotations_per_reconstruction_);
+    
+    buffer_frames_per_rotation_[idx] = buffer_frames_per_rotation.value();
+    
+    if( buffer_frames_per_rotation_[idx] == 0 ){
+      buffer_frames_per_rotation_[idx] = frames_per_rotation_[idx];
+    }
+    
+    if( buffer_length_in_rotations.value() == 0 ){
+      buffer_length_in_rotations_ = std::max(1L, rotations_per_reconstruction_);
+    }
+
+    cuSenseBuffer<float,2> *acc_buffer = 
+      (buffer_using_solver_) ? &acc_buffer_cg_[idx] : &acc_buffer_[idx];
+    
+    if( buffer_frames_per_rotation_[idx] == 1 ){ // Is this general enough to detect golden ratio type trajectories?
+
+      acc_buffer->setup( from_std_vector<size_t,2>(image_dimensions_recon_), image_dimensions_recon_os_, 
+                         kernel_width_, num_coils_[idx], 1, buffer_length_in_rotations_ );
+    }else{
+      acc_buffer->setup( from_std_vector<size_t,2>(image_dimensions_recon_), image_dimensions_recon_os_, 
+                         kernel_width_, num_coils_[idx], buffer_length_in_rotations_, buffer_frames_per_rotation_[idx] );
+    }
+    reconfigure_[idx] = false;
+  }
+
+  GADGET_FACTORY_DECLARE(gpuGenericSensePrepGadget)
+}
diff --git a/gadgets/pmri/gpuGenericSensePrepGadget.h b/gadgets/pmri/gpuGenericSensePrepGadget.h
new file mode 100644
index 0000000..7dee0b2
--- /dev/null
+++ b/gadgets/pmri/gpuGenericSensePrepGadget.h
@@ -0,0 +1,143 @@
+#pragma once
+
+#include "gadgetron_gpupmri_export.h"
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+#include "vector_td.h"
+#include "cuNFFT.h"
+#include "cuCgPreconditioner.h"
+#include "cuSenseBufferCg.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+#include <boost/shared_ptr.hpp>
+#include <boost/shared_array.hpp>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_GPUPMRI gpuGenericSensePrepGadget :
+    public Gadget3< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> >, hoNDArray<float> >
+  {
+    
+  public:
+    GADGET_DECLARE(gpuGenericSensePrepGadget);
+
+    gpuGenericSensePrepGadget();
+    virtual ~gpuGenericSensePrepGadget();
+
+  protected:
+    GADGET_PROPERTY(deviceno, int, "GPU device number", 0);
+    GADGET_PROPERTY(buffer_length_in_rotations, int, "Number of rotations in a buffer", 1);
+    GADGET_PROPERTY(buffer_using_solver, bool, "Use solver for buffer", false);
+    GADGET_PROPERTY(buffer_convolution_kernel_width, float, "Convolution kernel width for buffer", 5.5);
+    GADGET_PROPERTY(buffer_convolution_oversampling_factor, float, "Oversampling used in buffer convolution", 1.25);
+    GADGET_PROPERTY(reconstruction_os_factor_x, float, "Oversampling for reconstruction in x-direction", 1.0);
+    GADGET_PROPERTY(reconstruction_os_factor_y, float, "Oversampling for reconstruction in y-direction", 1.0);
+    GADGET_PROPERTY(rotations_per_reconstruction, int, "Number of rotations per reconstruction", 0);
+    GADGET_PROPERTY(propagate_csm_from_set, int, "Which set to use for CSM", -1);
+    GADGET_PROPERTY(output_timing, bool, "Output timing information", false);
+    GADGET_PROPERTY(sliding_window_readouts, int, "Number of readouts in sliding window", 0);
+    GADGET_PROPERTY(sliding_window_rotations, int, "Number of rotations in sliding window", 0);
+    GADGET_PROPERTY(readouts_per_frame, int, "Readouts per frame", 0);
+    GADGET_PROPERTY(frames_per_rotation, int, "Frames per rotation", 0);
+    GADGET_PROPERTY(buffer_frames_per_rotation, int, "Frames per rotation in buffer", 1);
+
+
+    virtual int process_config(ACE_Message_Block *mb);
+
+    virtual int process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader > *m1,        // header
+			GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2,  // data
+			GadgetContainerMessage< hoNDArray<float> > *m3 );                // traj/dcw
+
+  private:
+
+    inline bool vec_equal(float *in1, float *in2) {
+      for (unsigned int i = 0; i < 3; i++) {
+	if (in1[i] != in2[i]) return false;
+      }
+      return true;
+    }
+    
+    boost::shared_array<bool> reconfigure_;
+    virtual void reconfigure(unsigned int set, unsigned int slice);
+
+    template<class T> GadgetContainerMessage< hoNDArray<T> >* 
+      duplicate_array( GadgetContainerMessage< hoNDArray<T> > *array );
+    
+    boost::shared_ptr< hoNDArray<float_complext> > 
+      extract_samples_from_queue ( ACE_Message_Queue<ACE_MT_SYNCH> *queue, 
+				   bool sliding_window, unsigned int set, unsigned int slice );
+    
+    boost::shared_ptr< hoNDArray<float> > 
+      extract_trajectory_from_queue ( ACE_Message_Queue<ACE_MT_SYNCH> *queue, 
+				      bool sliding_window, unsigned int set, unsigned int slice );
+      
+    void extract_trajectory_and_dcw_from_queue
+      ( ACE_Message_Queue<ACE_MT_SYNCH> *queue, bool sliding_window, unsigned int set, unsigned int slice, 
+	unsigned int samples_per_frame, unsigned int num_frames,
+	cuNDArray<floatd2> *traj, cuNDArray<float> *dcw );
+    
+    int slices_;
+    int sets_;
+    int device_number_;
+    long samples_per_readout_;
+
+    boost::shared_array<long> image_counter_;
+    boost::shared_array<long> readouts_per_frame_;  // for an undersampled frame
+    boost::shared_array<long> frames_per_rotation_; // representing a fully sampled frame
+
+    // The number of rotations to batch per reconstruction. 
+    // Set to '0' to reconstruct frames individually.
+    long rotations_per_reconstruction_; 
+
+    // The number of buffer cycles
+    long buffer_length_in_rotations_; 
+
+    boost::shared_array<long> buffer_frames_per_rotation_; // the number of buffer subcycles
+
+    // Internal book-keeping
+    boost::shared_array<long> previous_readout_no_;
+    boost::shared_array<long> acceleration_factor_;
+    boost::shared_array<long> readout_counter_frame_;
+    boost::shared_array<long> readout_counter_global_;
+
+    long sliding_window_readouts_;
+    long sliding_window_rotations_;
+
+    float kernel_width_;
+    float oversampling_factor_;
+
+    boost::shared_array<unsigned int> num_coils_;
+
+    boost::shared_array<float[3]> position_;
+    boost::shared_array<float[3]> read_dir_;
+    boost::shared_array<float[3]> phase_dir_;
+    boost::shared_array<float[3]> slice_dir_;
+
+    bool output_timing_;
+    bool buffer_using_solver_;
+
+    int propagate_csm_from_set_;
+    boost::shared_ptr< cuNDArray<float_complext> > csm_;
+
+    boost::shared_array<bool> buffer_update_needed_;
+
+    boost::shared_array< hoNDArray<float_complext> > csm_host_;
+    boost::shared_array< hoNDArray<float_complext> > reg_host_;
+    
+    boost::shared_array< cuSenseBuffer<float,2> > acc_buffer_;
+    boost::shared_array< cuSenseBufferCg<float,2> > acc_buffer_cg_;
+
+    std::vector<size_t> fov_;
+    std::vector<size_t> image_dimensions_;
+    std::vector<size_t> image_dimensions_recon_;
+    uint64d2 image_dimensions_recon_os_;
+
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > frame_readout_queue_;
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > recon_readout_queue_;
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > frame_traj_queue_;
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > recon_traj_queue_;
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > image_headers_queue_;
+  };
+}
diff --git a/gadgets/pmri/gpuLALMSenseGadget.cpp b/gadgets/pmri/gpuLALMSenseGadget.cpp
new file mode 100644
index 0000000..c0408f2
--- /dev/null
+++ b/gadgets/pmri/gpuLALMSenseGadget.cpp
@@ -0,0 +1,259 @@
+#include "gpuLALMSenseGadget.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "GadgetMRIHeaders.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "ismrmrd/xml.h"
+#include <boost/thread/mutex.hpp>
+#include "cuNDArray_fileio.h"
+#include "eigenTester.h"
+namespace Gadgetron{
+
+#define max_number_of_gpus 10
+  static boost::mutex _mutex[max_number_of_gpus];
+
+  gpuLALMSenseGadget::gpuLALMSenseGadget()
+    : gpuSenseGadget()
+    , is_configured_(false)
+    , prepared_(false)
+  {
+  }
+
+  gpuLALMSenseGadget::~gpuLALMSenseGadget() {}
+
+  int gpuLALMSenseGadget::process_config( ACE_Message_Block* mb )
+  {
+    gpuSenseGadget::process_config(mb);
+    number_of_iterations_ = number_of_iterations.value();
+
+    exclusive_access_ = exclusive_access.value();
+    lambda_ = lambda.value();
+    coils_per_subset_ = coils_per_subset.value();
+    huber_value_ = huber_value.value();
+
+    // Get the Ismrmrd header
+    //
+    ISMRMRD::IsmrmrdHeader h;
+    ISMRMRD::deserialize(mb->rd_ptr(),h);
+    
+    
+    if (h.encoding.size() != 1) {
+      GDEBUG("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+    
+    // Get the encoding space and trajectory description
+    ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+    ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+    ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+
+    matrix_size_seq_ = uint64d2( r_space.matrixSize.x, r_space.matrixSize.y );
+
+    if (!is_configured_) {
+
+      if (h.acquisitionSystemInformation) {
+	channels_ = h.acquisitionSystemInformation->receiverChannels ? *h.acquisitionSystemInformation->receiverChannels : 1;
+      } else {
+	channels_ = 1;
+      }
+
+      // Allocate encoding operator for non-Cartesian Sense
+      E_ = boost::make_shared<osSenseOperator<cuNDArray<float_complext>,2,cuNFFTOperator<float,2>>>();
+      E_->set_coils_per_subset(coils_per_subset_);
+
+
+
+      // Setup LALM solver
+      solver_.set_encoding_operator( E_ );
+
+      solver_.set_output_mode( (output_convergence_) ? osLALMSolver<cuNDArray<float_complext>>::OUTPUT_VERBOSE : osLALMSolver<cuNDArray<float_complext>>::OUTPUT_SILENT );
+      solver_.set_max_iterations( number_of_iterations_ );
+
+      solver_.set_alpha(huber_value_);
+      // Add "TV" regularization
+      //
+
+      if( lambda_ > 0.0 ){
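+        // One partial-derivative operator per array dimension (x, y, and the frame dimension)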
+        for (auto i = 0u; i < 3; i++){
+          auto Dx = boost::make_shared<cuPartialDerivativeOperator<float_complext,3>>(i);
+          Dx->set_weight(lambda_);
+          TV_ops.push_back(Dx);
+        }
+        solver_.add_regularization_group(TV_ops);
+      }
+
+     // Add "PICCS" regularization
+      //
+
+
+      is_configured_ = true;
+    }
+
+    GDEBUG("gpuLALMSenseGadget::end of process_config\n");
+
+    return GADGET_OK;
+  }
+
+  int gpuLALMSenseGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, GadgetContainerMessage<GenericReconJob> *m2)
+  {
+    // Is this data for this gadget's set/slice?
+    //
+  	GDEBUG("Starting gpuLALMSenseGadget\n");
+
+    if( m1->getObjectPtr()->set != set_number_ || m1->getObjectPtr()->slice != slice_number_ ) {
+      // No, pass it downstream...
+      return this->next()->putq(m1);
+    }
+
+    //GDEBUG("gpuLALMSenseGadget::process\n");
+    //GPUTimer timer("gpuLALMSenseGadget::process");
+
+    if (!is_configured_) {
+      GDEBUG("\nData received before configuration complete\n");
+      return GADGET_FAIL;
+    }
+
+    GenericReconJob* j = m2->getObjectPtr();
+
+    // Let's first check that this job has the required data...
+    if (!j->csm_host_.get() || !j->dat_host_.get() || !j->tra_host_.get() || !j->dcw_host_.get()) {
+      GDEBUG("Received an incomplete Sense job\n");
+      return GADGET_FAIL;
+    }
+
+    unsigned int samples = j->dat_host_->get_size(0);
+    unsigned int channels = j->dat_host_->get_size(1);
+    unsigned int rotations = samples / j->tra_host_->get_number_of_elements();
+    unsigned int frames = j->tra_host_->get_size(1)*rotations;
+
+    if( samples%j->tra_host_->get_number_of_elements() ) {
+      GDEBUG("Mismatch between number of samples (%d) and number of k-space coordinates (%d).\nThe first should be a multiplum of the latter.\n",
+                    samples, j->tra_host_->get_number_of_elements());
+      return GADGET_FAIL;
+    }
+
+    boost::shared_ptr< cuNDArray<floatd2> > traj(new cuNDArray<floatd2> (j->tra_host_.get()));
+    boost::shared_ptr< cuNDArray<float> > dcw(new cuNDArray<float> (j->dcw_host_.get()));
+    sqrt_inplace(dcw.get());
+    boost::shared_ptr< cuNDArray<float_complext> > csm(new cuNDArray<float_complext> (j->csm_host_.get()));
+    boost::shared_ptr< cuNDArray<float_complext> > device_samples(new cuNDArray<float_complext> (j->dat_host_.get()));
+
+
+      // Take the reconstruction matrix size from the regularization image.
+      // It could be oversampled relative to the sequence-specified size...
+
+      matrix_size_ = uint64d2( j->reg_host_->get_size(0), j->reg_host_->get_size(1) );
+
+      cudaDeviceProp deviceProp;
+      if( cudaGetDeviceProperties( &deviceProp, device_number_ ) != cudaSuccess) {
+        GDEBUG( "\nError: unable to query device properties.\n" );
+        return GADGET_FAIL;
+      }
+
+      unsigned int warp_size = deviceProp.warpSize;
+
+      matrix_size_os_ =
+        uint64d2(((static_cast<unsigned int>(std::ceil(matrix_size_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size,
+                 ((static_cast<unsigned int>(std::ceil(matrix_size_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size);
+
+      GDEBUG("Matrix size    : [%d,%d] \n", matrix_size_[0], matrix_size_[1]);
+      GDEBUG("Matrix size OS : [%d,%d] \n", matrix_size_os_[0], matrix_size_os_[1]);
+
+      std::vector<size_t> image_dims = to_std_vector(matrix_size_);
+      image_dims.push_back(frames);
+
+      E_->set_domain_dimensions(&image_dims);
+      E_->set_codomain_dimensions(device_samples->get_dimensions().get());
+      E_->set_csm(csm);
+      E_->setup( matrix_size_, matrix_size_os_, kernel_width_ );
+      E_->preprocess(traj.get());
+
+      for (auto op : TV_ops){
+      	op->set_domain_dimensions(&image_dims);
+      	op->set_codomain_dimensions(&image_dims);
+      }
+
+
+      reg_image_ = boost::shared_ptr< cuNDArray<float_complext> >(new cuNDArray<float_complext>(&image_dims));
+
+      // Hand the density compensation weights to the encoding operator
+      //
+
+      E_->set_dcw(dcw);
+
+
+
+      GDEBUG("Prepared\n");
+
+    // Apply the (square-root) density compensation weights to the samples
+    //
+    *device_samples *= *dcw;
+
+    // Invoke solver
+    //
+
+    boost::shared_ptr< cuNDArray<float_complext> > result;
+    {
+      GDEBUG("Running NLCG solver\n");
+      GPUTimer timer("Running NLCG solver");
+
+      // Optionally, allow exclusive (per device) access to the solver
+      // This may not matter much in terms of speed, but it can in terms of memory consumption
+      //
+
+      if( exclusive_access_ )
+        _mutex[device_number_].lock();
+
+      result = solver_.solve(device_samples.get());
+
+      if( exclusive_access_ )
+        _mutex[device_number_].unlock();
+    }
+
+    if (!result.get()) {
+      GDEBUG("\nNon-linear conjugate gradient solver failed\n");
+      return GADGET_FAIL;
+    }
+
+    /*
+      static int counter = 0;
+      char filename[256];
+      sprintf((char*)filename, "recon_sb_%d.cplx", counter);
+      write_nd_array<float_complext>( sbresult->to_host().get(), filename );
+      counter++; */
+
+    // If the recon matrix size exceeds the sequence matrix size then crop
+    if( matrix_size_seq_ != matrix_size_ )
+      result = crop<float_complext,2>( (matrix_size_-matrix_size_seq_)>>1, matrix_size_seq_, result.get() );
+
+
+    // Now pass on the reconstructed images
+    //
+    this->put_frames_on_que(frames,rotations,j,result.get(),channels);
+
+    frame_counter_ += frames;
+    m1->release();
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(gpuLALMSenseGadget)
+}
+
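For reference, a minimal standalone sketch (plain C++, not part of the patch) of the warp-aligned, oversampled matrix-size computation used in the gadget's process() method; the helper name round_up_to_warp is illustrative only:

    // Round the oversampled matrix size up to a multiple of the CUDA warp size.
    #include <cmath>
    #include <cstdio>

    unsigned int round_up_to_warp(unsigned int n, float os_factor, unsigned int warp_size)
    {
        unsigned int oversampled = static_cast<unsigned int>(std::ceil(n * os_factor));
        return ((oversampled + warp_size - 1) / warp_size) * warp_size; // next multiple of warp_size
    }

    int main()
    {
        // 192 samples at oversampling 1.5 gives 288, already a multiple of 32 -> 288.
        // 200 samples gives 300, rounded up to the next multiple of 32 -> 320.
        std::printf("%u %u\n", round_up_to_warp(192, 1.5f, 32), round_up_to_warp(200, 1.5f, 32));
        return 0;
    }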
diff --git a/gadgets/pmri/gpuLALMSenseGadget.h b/gadgets/pmri/gpuLALMSenseGadget.h
new file mode 100644
index 0000000..bbc6b7a
--- /dev/null
+++ b/gadgets/pmri/gpuLALMSenseGadget.h
@@ -0,0 +1,86 @@
+#ifndef gpuLALMSenseGadget_H
+#define gpuLALMSenseGadget_H
+#pragma once
+
+#include <ace/Synch.h>
+#include <ace/Mutex.h>
+
+#include "gadgetron_gpupmri_export.h"
+#include "Gadget.h"
+#include "GenericReconJob.h"
+#include "GadgetMRIHeaders.h"
+#include "cuNlcgSolver.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuCgPreconditioner.h"
+#include "cuPartialDerivativeOperator.h"
+#include "cuNFFT.h"
+#include "cuImageOperator.h"
+#include "ismrmrd/ismrmrd.h"
+#include "cuTvOperator.h"
+#include "cuTvPicsOperator.h"
+#include "osSenseOperator.h"
+#include "cuNFFTOperator.h"
+#include "osLALMSolver.h"
+#include "gpuSenseGadget.h"
+
+#include <complex>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_GPUPMRI gpuLALMSenseGadget : public gpuSenseGadget
+  {
+
+  public:
+
+    gpuLALMSenseGadget();
+    virtual ~gpuLALMSenseGadget();
+
+  protected:
+    GADGET_PROPERTY(lambda, float, "Lambda regularization factor", 1e-6);
+    //GADGET_PROPERTY(alpha, float, "Alpha regularization factor", 0.5);
+    //GADGET_PROPERTY(kappa, float, "Kappa regularization factor", 1.0);
+    GADGET_PROPERTY(number_of_iterations, int, "Number of solver iterations", 0);
+    GADGET_PROPERTY(exclusive_access, bool,"Forces 1 gadget per GPU",false);
+    GADGET_PROPERTY(coils_per_subset, int,"Number of coils to use for each subset",1);
+    GADGET_PROPERTY(huber_value,float,"Value of the huber regularization (should be small)",0);
+    
+    virtual int process( GadgetContainerMessage< ISMRMRD::ImageHeader >* m1, GadgetContainerMessage< GenericReconJob > * m2 );
+    virtual int process_config( ACE_Message_Block* mb );
+
+    int coils_per_subset_;
+
+    uint64d2 matrix_size_;
+    uint64d2 matrix_size_os_;
+    uint64d2 matrix_size_seq_;
+
+    unsigned int number_of_iterations_;
+
+
+    bool exclusive_access_;
+    double lambda_;
+    double huber_value_;
+    //double alpha_;
+    //double kappa_;
+
+    bool is_configured_;
+    bool prepared_;
+
+    // Define ordered-subsets LALM solver
+    osLALMSolver<cuNDArray<float_complext>> solver_;
+
+    // Define non-Cartesian Sense Encoding operator
+    boost::shared_ptr< osSenseOperator<cuNDArray<float_complext>,2,cuNFFTOperator<float,2>>> E_;
+
+    // Average image for regularization
+
+    boost::shared_ptr< cuNDArray<float_complext> > reg_image_;
+
+    std::vector<boost::shared_ptr<linearOperator<cuNDArray<float_complext>>>> TV_ops;
+/*
+    boost::shared_ptr<cuTvOperator<float_complext,3> > TV_;
+    boost::shared_ptr<cuTvPicsOperator<float_complext,3> > PICS_;
+*/
+  };
+}
+#endif //gpuLALMSenseGadget_H
+
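The TV regularization group assembled in gpuLALMSenseGadget::process_config() collects one partial-derivative operator per dimension. A minimal CPU sketch (not part of the patch) of the forward finite difference such an operator applies; the function name and the periodic boundary handling are chosen only for illustration and may differ from the GPU toolbox implementation:

    #include <cstdio>
    #include <vector>

    // Forward difference along x for an nx-by-ny image, with periodic wrap.
    std::vector<float> forward_diff_x(const std::vector<float>& img, int nx, int ny)
    {
        std::vector<float> d(img.size());
        for (int y = 0; y < ny; ++y)
            for (int x = 0; x < nx; ++x) {
                int xp = (x + 1) % nx;                      // periodic wrap, for illustration
                d[y * nx + x] = img[y * nx + xp] - img[y * nx + x];
            }
        return d;
    }

    int main()
    {
        std::vector<float> img = {0, 1, 1, 0,
                                  0, 2, 2, 0};              // 4x2 toy image
        auto d = forward_diff_x(img, 4, 2);
        for (float v : d) std::printf("%g ", v);            // differences feeding the TV penalty
        std::printf("\n");
        return 0;
    }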
diff --git a/gadgets/pmri/gpuNlcgSenseGadget.cpp b/gadgets/pmri/gpuNlcgSenseGadget.cpp
new file mode 100644
index 0000000..362654f
--- /dev/null
+++ b/gadgets/pmri/gpuNlcgSenseGadget.cpp
@@ -0,0 +1,374 @@
+#include "gpuNlcgSenseGadget.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "GadgetMRIHeaders.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "ismrmrd/xml.h"
+#include <boost/thread/mutex.hpp>
+
+namespace Gadgetron{
+
+#define max_number_of_gpus 10
+  static boost::mutex _mutex[max_number_of_gpus];
+
+  gpuNlcgSenseGadget::gpuNlcgSenseGadget()
+    : is_configured_(false)
+    , prepared_(false)
+    , channels_(0)
+    , frame_counter_(0)
+  {
+    matrix_size_ = uint64d2(0,0);
+    matrix_size_os_ = uint64d2(0,0);
+    matrix_size_seq_ = uint64d2(0,0);
+  }
+
+  gpuNlcgSenseGadget::~gpuNlcgSenseGadget() {}
+
+  int gpuNlcgSenseGadget::process_config( ACE_Message_Block* mb )
+  {
+    GDEBUG("gpuNlcgSenseGadget::process_config\n");
+
+    device_number_ = deviceno.value();
+
+    int number_of_devices = 0;
+    if (cudaGetDeviceCount(&number_of_devices)!= cudaSuccess) {
+      GDEBUG( "Error: unable to query number of CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    if (number_of_devices == 0) {
+      GDEBUG( "Error: No available CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    if (device_number_ >= number_of_devices) {
+      GDEBUG("Adjusting device number from %d to %d\n", device_number_,  (device_number_%number_of_devices));
+      device_number_ = (device_number_%number_of_devices);
+    }
+
+    if (cudaSetDevice(device_number_)!= cudaSuccess) {
+      GDEBUG( "Error: unable to set CUDA device.\n" );
+      return GADGET_FAIL;
+    }
+
+    pass_on_undesired_data_ = pass_on_undesired_data.value();
+    set_number_ = setno.value();
+    slice_number_ = sliceno.value();
+
+    number_of_cg_iterations_ = number_of_cg_iterations.value();
+    cg_limit_ = cg_limit.value();
+    oversampling_factor_ = oversampling_factor.value();
+    kernel_width_ = kernel_width.value();
+
+    lambda_ = lambda.value();
+    alpha_ = alpha.value();
+    rotations_to_discard_ = rotations_to_discard.value();
+    output_convergence_ = output_convergence.value();
+    exclusive_access_ = exclusive_access.value();
+
+    if( (rotations_to_discard_%2) == 1 ){
+      GDEBUG("#rotations to discard must be even.\n");
+      return GADGET_FAIL;
+    }
+
+    // Get the Ismrmrd header
+    //
+    ISMRMRD::IsmrmrdHeader h;
+    ISMRMRD::deserialize(mb->rd_ptr(),h);
+    
+    
+    if (h.encoding.size() != 1) {
+      GDEBUG("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+    
+    // Get the encoding space and trajectory description
+    ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+    ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+    ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+
+    matrix_size_seq_ = uint64d2( r_space.matrixSize.x, r_space.matrixSize.y );
+
+    if (!is_configured_) {
+
+      if (h.acquisitionSystemInformation) {
+	channels_ = h.acquisitionSystemInformation->receiverChannels ? *h.acquisitionSystemInformation->receiverChannels : 1;
+      } else {
+	channels_ = 1;
+      }
+
+      // Allocate encoding operator for non-Cartesian Sense
+      E_ = boost::shared_ptr< cuNonCartesianSenseOperator<float,2> >( new cuNonCartesianSenseOperator<float,2>() );
+
+
+		// Allocate preconditioner
+      D_ = boost::shared_ptr< cuCgPreconditioner<float_complext> >( new cuCgPreconditioner<float_complext>() );
+
+
+      TV_ = boost::shared_ptr<cuTvOperator<float_complext,3> >(new cuTvOperator<float_complext,3>);
+      PICS_ = boost::shared_ptr<cuTvPicsOperator<float_complext,3> >(new cuTvPicsOperator<float_complext,3>);
+
+
+      // Setup NLCG solver
+      solver_ = cuNlcgSolver<float_complext>();
+      solver_.set_encoding_operator( E_ );
+
+      solver_.set_output_mode( (output_convergence_) ? cuNlcgSolver<float_complext>::OUTPUT_VERBOSE : cuNlcgSolver<float_complext>::OUTPUT_SILENT );
+      solver_.set_max_iterations( number_of_cg_iterations_ );
+      solver_.set_tc_tolerance(cg_limit_);
+      solver_.set_preconditioner( D_ );
+
+      is_configured_ = true;
+    }
+
+    GDEBUG("gpuNlcgSenseGadget::end of process_config\n");
+
+    return GADGET_OK;
+  }
+
+  int gpuNlcgSenseGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, GadgetContainerMessage<GenericReconJob> *m2)
+  {
+    // Is this data for this gadget's set/slice?
+    //
+
+    if( m1->getObjectPtr()->set != set_number_ || m1->getObjectPtr()->slice != slice_number_ ) {
+      // No, pass it downstream...
+      return this->next()->putq(m1);
+    }
+
+    //GDEBUG("gpuNlcgSenseGadget::process\n");
+    //GPUTimer timer("gpuNlcgSenseGadget::process");
+
+    if (!is_configured_) {
+      GDEBUG("\nData received before configuration complete\n");
+      return GADGET_FAIL;
+    }
+
+    GenericReconJob* j = m2->getObjectPtr();
+
+    // Let's first check that this job has the required data...
+    if (!j->csm_host_.get() || !j->dat_host_.get() || !j->tra_host_.get() || !j->dcw_host_.get()) {
+      GDEBUG("Received an incomplete Sense job\n");
+      return GADGET_FAIL;
+    }
+
+    unsigned int samples = j->dat_host_->get_size(0);
+    unsigned int channels = j->dat_host_->get_size(1);
+    unsigned int rotations = samples / j->tra_host_->get_number_of_elements();
+    unsigned int frames = j->tra_host_->get_size(1)*rotations;
+
+    if( samples%j->tra_host_->get_number_of_elements() ) {
+      GDEBUG("Mismatch between number of samples (%d) and number of k-space coordinates (%d).\nThe first should be a multiplum of the latter.\n",
+                    samples, j->tra_host_->get_number_of_elements());
+      return GADGET_FAIL;
+    }
+
+    boost::shared_ptr< cuNDArray<floatd2> > traj(new cuNDArray<floatd2> (j->tra_host_.get()));
+    boost::shared_ptr< cuNDArray<float> > dcw(new cuNDArray<float> (j->dcw_host_.get()));
+    boost::shared_ptr< cuNDArray<float_complext> > csm(new cuNDArray<float_complext> (j->csm_host_.get()));
+    boost::shared_ptr< cuNDArray<float_complext> > device_samples(new cuNDArray<float_complext> (j->dat_host_.get()));
+
+    if( !prepared_){
+
+      // Take the reconstruction matrix size from the regularization image.
+      // It could be oversampled relative to the sequence-specified size...
+
+      matrix_size_ = uint64d2( j->reg_host_->get_size(0), j->reg_host_->get_size(1) );
+
+      cudaDeviceProp deviceProp;
+      if( cudaGetDeviceProperties( &deviceProp, device_number_ ) != cudaSuccess) {
+        GDEBUG( "\nError: unable to query device properties.\n" );
+        return GADGET_FAIL;
+      }
+
+      unsigned int warp_size = deviceProp.warpSize;
+
+      matrix_size_os_ =
+        uint64d2(((static_cast<unsigned int>(std::ceil(matrix_size_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size,
+                 ((static_cast<unsigned int>(std::ceil(matrix_size_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size);
+
+      GDEBUG("Matrix size    : [%d,%d] \n", matrix_size_[0], matrix_size_[1]);
+      GDEBUG("Matrix size OS : [%d,%d] \n", matrix_size_os_[0], matrix_size_os_[1]);
+
+      std::vector<size_t> image_dims = to_std_vector(matrix_size_);
+      image_dims.push_back(frames);
+
+      E_->set_domain_dimensions(&image_dims);
+      E_->set_codomain_dimensions(device_samples->get_dimensions().get());
+
+      reg_image_ = boost::shared_ptr< cuNDArray<float_complext> >(new cuNDArray<float_complext>(&image_dims));
+
+      // These operators need their domain/codomain set before being added to the solver
+      //
+
+      // Add "TV" regularization
+      //
+
+      if( lambda_ > 0.0 ){
+      	TV_->set_weight((1.0-alpha_)*lambda_);
+      	solver_.add_nonlinear_operator(TV_);
+      }
+
+      // Add "PICCS" regularization
+      //
+
+      if( alpha_ > 0.0 ){
+        PICS_->set_prior(reg_image_);
+        PICS_->set_weight(alpha_*lambda_);
+        solver_.add_nonlinear_operator(PICS_);
+      }
+
+      prepared_ = true;
+    }
+
+    E_->set_dcw(dcw);
+    E_->set_csm(csm);
+    E_->setup( matrix_size_, matrix_size_os_, static_cast<float>(kernel_width_) );
+    E_->preprocess(traj.get());
+
+    // Expand the average image to the number of frames
+    //
+
+    {
+      cuNDArray<float_complext> tmp(*j->reg_host_);
+      *reg_image_ = *expand( &tmp, frames );
+    }
+
+    // Define preconditioning weights
+    //
+
+    boost::shared_ptr< cuNDArray<float> > _precon_weights = sum(abs_square(csm.get()).get(), 2);
+    reciprocal_sqrt_inplace(_precon_weights.get());
+    boost::shared_ptr< cuNDArray<float_complext> > precon_weights = real_to_complex<float_complext>( _precon_weights.get() );
+    _precon_weights.reset();
+    D_->set_weights( precon_weights );
+    precon_weights.reset();
+
+    //Apply weights
+    *device_samples *= *dcw;
+
+    // Invoke solver
+    //
+
+    boost::shared_ptr< cuNDArray<float_complext> > result;
+    {
+      GDEBUG("Running NLCG solver\n");
+      GPUTimer timer("Running NLCG solver");
+
+      // Optionally, allow exclusive (per device) access to the solver
+      // This may not matter much in terms of speed, but it can in terms of memory consumption
+      //
+
+      if( exclusive_access_ )
+        _mutex[device_number_].lock();
+
+      result = solver_.solve(device_samples.get());
+
+      if( exclusive_access_ )
+        _mutex[device_number_].unlock();
+    }
+
+    // Provide some info about the scaling between the regularization and reconstruction.
+    // If it is not close to one, PICCS does not work optimally...
+    //
+
+    if( alpha_ > 0.0 ){
+      cuNDArray<float_complext> gpureg(j->reg_host_.get());
+      boost::shared_ptr< cuNDArray<float_complext> > gpurec = sum(result.get(),2);
+      *gpurec /= float(result->get_size(2));
+      float scale = abs(dot(gpurec.get(), gpurec.get())/dot(gpurec.get(),&gpureg));
+      GDEBUG("Scaling factor between regularization and reconstruction is %f.\n", scale);
+    }
+
+    if (!result.get()) {
+      GDEBUG("\nNon-linear conjugate gradient solver failed\n");
+      return GADGET_FAIL;
+    }
+
+    /*
+      static int counter = 0;
+      char filename[256];
+      sprintf((char*)filename, "recon_sb_%d.cplx", counter);
+      write_nd_array<float_complext>( sbresult->to_host().get(), filename );
+      counter++; */
+
+    // If the recon matrix size exceeds the sequence matrix size then crop
+    if( matrix_size_seq_ != matrix_size_ )
+      result = crop<float_complext,2>( (matrix_size_-matrix_size_seq_)>>1, matrix_size_seq_, result.get() );
+
+    // Now pass on the reconstructed images
+    //
+
+    unsigned int frames_per_rotation = frames/rotations;
+
+    if( rotations == 1 ){ // this is the case for golden ratio
+      rotations = frames;
+      frames_per_rotation = 1;
+    }
+
+    for( unsigned int frame=0; frame<frames; frame++ ){
+
+      unsigned int rotation_idx = frame/frames_per_rotation;
+
+      // Check if we should discard this frame
+      if( rotation_idx < (rotations_to_discard_>>1) || rotation_idx >= rotations-(rotations_to_discard_>>1) )
+        continue;
+
+      GadgetContainerMessage< hoNDArray< std::complex<float> > > *cm =
+        new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+
+      GadgetContainerMessage<ISMRMRD::ImageHeader> *m =
+        new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+
+      *m->getObjectPtr() = j->image_headers_[frame];
+      m->getObjectPtr()->matrix_size[0] = matrix_size_seq_[0];
+      m->getObjectPtr()->matrix_size[1] = matrix_size_seq_[1];
+      m->cont(cm);
+
+      std::vector<size_t> img_dims(2);
+      img_dims[0] = matrix_size_seq_[0];
+      img_dims[1] = matrix_size_seq_[1];
+
+      cm->getObjectPtr()->create(&img_dims);
+
+      size_t data_length = prod(matrix_size_seq_);
+
+      cudaMemcpy(cm->getObjectPtr()->get_data_ptr(),
+                 result->get_data_ptr()+frame*data_length,
+                 data_length*sizeof(std::complex<float>),
+                 cudaMemcpyDeviceToHost);
+
+      cudaError_t err = cudaGetLastError();
+      if( err != cudaSuccess ){
+        GDEBUG("\nUnable to copy result from device to host: %s", cudaGetErrorString(err));
+        m->release();
+        return GADGET_FAIL;
+      }
+
+      m->getObjectPtr()->matrix_size[0] = img_dims[0];
+      m->getObjectPtr()->matrix_size[1] = img_dims[1];
+      m->getObjectPtr()->matrix_size[2] = 1;
+      m->getObjectPtr()->channels       = 1;
+      m->getObjectPtr()->image_index    = frame_counter_ + frame;
+
+      if (this->next()->putq(m) < 0) {
+        GDEBUG("\nFailed to result image on to Q\n");
+        m->release();
+        return GADGET_FAIL;
+      }
+    }
+
+    frame_counter_ += frames;
+    m1->release();
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(gpuNlcgSenseGadget)
+}
+
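A minimal CPU sketch (not part of the patch) of the preconditioning weights built in gpuNlcgSenseGadget::process(), i.e. the reciprocal square root of the coil-sensitivity power summed over channels, evaluated per pixel; all names and values below are illustrative:

    #include <cmath>
    #include <complex>
    #include <cstdio>
    #include <vector>

    int main()
    {
        const int pixels = 4, channels = 2;
        // csm[c * pixels + p]: toy coil sensitivity maps
        std::vector<std::complex<float>> csm = {
            {1.0f, 0.0f}, {0.5f, 0.5f}, {0.0f, 1.0f}, {0.8f, 0.0f},   // coil 0
            {0.0f, 1.0f}, {0.5f,-0.5f}, {1.0f, 0.0f}, {0.6f, 0.0f} }; // coil 1

        for (int p = 0; p < pixels; ++p) {
            float sum_sq = 0.0f;
            for (int c = 0; c < channels; ++c)
                sum_sq += std::norm(csm[c * pixels + p]);  // |csm|^2 accumulated over coils
            float weight = 1.0f / std::sqrt(sum_sq);       // reciprocal square root, as in the gadget
            std::printf("pixel %d: precon weight %f\n", p, weight);
        }
        return 0;
    }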
diff --git a/gadgets/pmri/gpuNlcgSenseGadget.h b/gadgets/pmri/gpuNlcgSenseGadget.h
new file mode 100644
index 0000000..b09c3c2
--- /dev/null
+++ b/gadgets/pmri/gpuNlcgSenseGadget.h
@@ -0,0 +1,94 @@
+#ifndef gpuNlcgSenseGadget_H
+#define gpuNlcgSenseGadget_H
+#pragma once
+
+#include <ace/Synch.h>
+#include <ace/Mutex.h>
+
+#include "gadgetron_gpupmri_export.h"
+#include "Gadget.h"
+#include "GenericReconJob.h"
+#include "GadgetMRIHeaders.h"
+#include "cuNlcgSolver.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuCgPreconditioner.h"
+#include "cuPartialDerivativeOperator.h"
+#include "cuNFFT.h"
+#include "cuImageOperator.h"
+#include "ismrmrd/ismrmrd.h"
+#include "cuTvOperator.h"
+#include "cuTvPicsOperator.h"
+
+#include <complex>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_GPUPMRI gpuNlcgSenseGadget : public Gadget2< ISMRMRD::ImageHeader, GenericReconJob >
+  {
+
+  public:
+
+    gpuNlcgSenseGadget();
+    virtual ~gpuNlcgSenseGadget();
+
+  protected:
+    GADGET_PROPERTY(deviceno, int, "GPU device number", 0);
+    GADGET_PROPERTY(setno, int, "Which set to process", 0);
+    GADGET_PROPERTY(sliceno, int, "Which slice to process", 0);
+    GADGET_PROPERTY(cg_limit, float, "Convergence limit for CG", 1e-6);
+    GADGET_PROPERTY(oversampling_factor, float, "Oversampling factor for NFFT", 1.5);
+    GADGET_PROPERTY(kernel_width, float, "Kernel width for NFFT", 5.5);
+    GADGET_PROPERTY(lambda, float, "Lambda regularization factor", 1e-6);
+    GADGET_PROPERTY(alpha, float, "Alpha regularization factor", 0.5);
+    GADGET_PROPERTY(exclusive_access, bool, "Exclusive access to solver", false);
+    GADGET_PROPERTY(number_of_cg_iterations, int, "Number of CG iterations", 0);
+    GADGET_PROPERTY(rotations_to_discard, int, "Rotations to discard", 0);
+    GADGET_PROPERTY(output_convergence, bool, "Output convergence information", false);
+    
+    virtual int process( GadgetContainerMessage< ISMRMRD::ImageHeader >* m1, GadgetContainerMessage< GenericReconJob > * m2 );
+    virtual int process_config( ACE_Message_Block* mb );
+
+    int channels_;
+    int device_number_;
+    int set_number_;
+    int slice_number_;
+
+    uint64d2 matrix_size_;
+    uint64d2 matrix_size_os_;
+    uint64d2 matrix_size_seq_;
+
+    unsigned int number_of_cg_iterations_;
+
+    double cg_limit_;
+    double oversampling_factor_;
+    double kernel_width_;
+
+    double lambda_;
+    double alpha_;
+    unsigned int rotations_to_discard_;
+
+    bool output_convergence_;
+    bool exclusive_access_;
+    bool is_configured_;
+    bool prepared_;
+
+    // Define non-linear conjugate gradient solver
+    cuNlcgSolver<float_complext> solver_;
+
+    // Define non-Cartesian Sense Encoding operator
+    boost::shared_ptr< cuNonCartesianSenseOperator<float,2> > E_;
+
+    // Define preconditioner
+    boost::shared_ptr< cuCgPreconditioner<float_complext> > D_;
+
+    // Average image for regularization
+    boost::shared_ptr< cuNDArray<float_complext> > reg_image_;
+
+    boost::shared_ptr<cuTvOperator<float_complext,3> > TV_;
+    boost::shared_ptr<cuTvPicsOperator<float_complext,3> > PICS_;
+
+    int frame_counter_;
+  };
+}
+#endif //gpuNlcgSenseGadget_H
+
diff --git a/gadgets/pmri/gpuOsSenseGadget.cpp b/gadgets/pmri/gpuOsSenseGadget.cpp
new file mode 100644
index 0000000..ae77d0b
--- /dev/null
+++ b/gadgets/pmri/gpuOsSenseGadget.cpp
@@ -0,0 +1,278 @@
+#include "gpuOsSenseGadget.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "GadgetMRIHeaders.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "ismrmrd/xml.h"
+#include <boost/thread/mutex.hpp>
+#include "cuNDArray_fileio.h"
+
+namespace Gadgetron{
+
+#define max_number_of_gpus 10
+  static boost::mutex _mutex[max_number_of_gpus];
+
+  gpuOsSenseGadget::gpuOsSenseGadget()
+    : gpuSenseGadget(),
+      is_configured_(false)
+    , prepared_(false)
+  {
+  }
+
+  gpuOsSenseGadget::~gpuOsSenseGadget() {}
+
+  int gpuOsSenseGadget::process_config( ACE_Message_Block* mb )
+  {
+    gpuSenseGadget::process_config(mb);
+    number_of_iterations_ = number_of_iterations.value();
+
+    exclusive_access_ = exclusive_access.value();
+    lambda_ = lambda.value();
+    alpha_ = alpha.value();
+    coils_per_subset_ = coils_per_subset.value();
+    kappa_ = kappa.value();
+
+    // Get the Ismrmrd header
+    //
+    ISMRMRD::IsmrmrdHeader h;
+    ISMRMRD::deserialize(mb->rd_ptr(),h);
+    
+    
+    if (h.encoding.size() != 1) {
+      GDEBUG("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+    
+    // Get the encoding space and trajectory description
+    ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+    ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+    ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+
+    matrix_size_seq_ = uint64d2( r_space.matrixSize.x, r_space.matrixSize.y );
+
+    if (!is_configured_) {
+
+      if (h.acquisitionSystemInformation) {
+	channels_ = h.acquisitionSystemInformation->receiverChannels ? *h.acquisitionSystemInformation->receiverChannels : 1;
+      } else {
+	channels_ = 1;
+      }
+
+      // Allocate encoding operator for non-Cartesian Sense
+      E_ = boost::make_shared<osSenseOperator<cuNDArray<float_complext>,2,cuNFFTOperator<float,2>>>();
+      E_->set_coils_per_subset(coils_per_subset_);
+
+
+
+      // Setup the ordered-subsets (MOM) solver
+      solver_.set_encoding_operator( E_ );
+
+      solver_.set_output_mode( (output_convergence_) ? osMOMSolver<cuNDArray<float_complext>>::OUTPUT_VERBOSE : osMOMSolver<cuNDArray<float_complext>>::OUTPUT_SILENT );
+      solver_.set_max_iterations( number_of_iterations_ );
+      solver_.set_kappa(kappa_);
+      TV_ = boost::shared_ptr<cuTvOperator<float_complext,3> >(new cuTvOperator<float_complext,3>);
+      PICS_ = boost::shared_ptr<cuTvPicsOperator<float_complext,3> >(new cuTvPicsOperator<float_complext,3>);
+
+
+      // Add "TV" regularization
+      //
+
+      if( lambda_ > 0.0 ){
+      	TV_->set_weight((1.0-alpha_)*lambda_);
+      	solver_.add_nonlinear_operator(TV_);
+      }
+
+      // Add "PICCS" regularization
+      //
+
+      if( alpha_ > 0.0 && lambda_ > 0.0 ){
+        PICS_->set_weight(alpha_*lambda_);
+        solver_.add_nonlinear_operator(PICS_);
+      }
+
+      is_configured_ = true;
+    }
+
+    GDEBUG("gpuOsSenseGadget::end of process_config\n");
+
+    return GADGET_OK;
+  }
+
+  int gpuOsSenseGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, GadgetContainerMessage<GenericReconJob> *m2)
+  {
+    // Is this data for this gadget's set/slice?
+    //
+  	GDEBUG("Starting gpuOsSenseGadget\n");
+
+    if( m1->getObjectPtr()->set != set_number_ || m1->getObjectPtr()->slice != slice_number_ ) {
+      // No, pass it downstream...
+      return this->next()->putq(m1);
+    }
+
+    //GDEBUG("gpuOsSenseGadget::process\n");
+    //GPUTimer timer("gpuOsSenseGadget::process");
+
+    if (!is_configured_) {
+      GDEBUG("\nData received before configuration complete\n");
+      return GADGET_FAIL;
+    }
+
+    GenericReconJob* j = m2->getObjectPtr();
+
+    // Let's first check that this job has the required data...
+    if (!j->csm_host_.get() || !j->dat_host_.get() || !j->tra_host_.get() || !j->dcw_host_.get()) {
+      GDEBUG("Received an incomplete Sense job\n");
+      return GADGET_FAIL;
+    }
+
+    unsigned int samples = j->dat_host_->get_size(0);
+    unsigned int channels = j->dat_host_->get_size(1);
+    unsigned int rotations = samples / j->tra_host_->get_number_of_elements();
+    unsigned int frames = j->tra_host_->get_size(1)*rotations;
+
+    if( samples%j->tra_host_->get_number_of_elements() ) {
+      GDEBUG("Mismatch between number of samples (%d) and number of k-space coordinates (%d).\nThe first should be a multiplum of the latter.\n",
+                    samples, j->tra_host_->get_number_of_elements());
+      return GADGET_FAIL;
+    }
+
+    boost::shared_ptr< cuNDArray<floatd2> > traj(new cuNDArray<floatd2> (j->tra_host_.get()));
+    boost::shared_ptr< cuNDArray<float> > dcw(new cuNDArray<float> (j->dcw_host_.get()));
+    sqrt_inplace(dcw.get());
+    boost::shared_ptr< cuNDArray<float_complext> > csm(new cuNDArray<float_complext> (j->csm_host_.get()));
+    boost::shared_ptr< cuNDArray<float_complext> > device_samples(new cuNDArray<float_complext> (j->dat_host_.get()));
+
+
+      // Take the reconstruction matrix size from the regularization image.
+      // It could be oversampled relative to the sequence-specified size...
+
+      matrix_size_ = uint64d2( j->reg_host_->get_size(0), j->reg_host_->get_size(1) );
+
+      cudaDeviceProp deviceProp;
+      if( cudaGetDeviceProperties( &deviceProp, device_number_ ) != cudaSuccess) {
+        GDEBUG( "\nError: unable to query device properties.\n" );
+        return GADGET_FAIL;
+      }
+
+      unsigned int warp_size = deviceProp.warpSize;
+
+      matrix_size_os_ =
+        uint64d2(((static_cast<unsigned int>(std::ceil(matrix_size_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size,
+                 ((static_cast<unsigned int>(std::ceil(matrix_size_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size);
+
+      GDEBUG("Matrix size    : [%d,%d] \n", matrix_size_[0], matrix_size_[1]);
+      GDEBUG("Matrix size OS : [%d,%d] \n", matrix_size_os_[0], matrix_size_os_[1]);
+
+      std::vector<size_t> image_dims = to_std_vector(matrix_size_);
+      image_dims.push_back(frames);
+
+      E_->set_domain_dimensions(&image_dims);
+      E_->set_codomain_dimensions(device_samples->get_dimensions().get());
+      E_->set_csm(csm);
+      E_->setup( matrix_size_, matrix_size_os_, kernel_width_ );
+      E_->preprocess(traj.get());
+
+
+      reg_image_ = boost::shared_ptr< cuNDArray<float_complext> >(new cuNDArray<float_complext>(&image_dims));
+
+      // These operators need their domain/codomain set before being added to the solver
+      //
+      GDEBUG("Making precon image\n");
+      {
+      	linearOperator<cuNDArray<float_complext>>* op = E_.get();
+				auto precon_image = boost::make_shared<cuNDArray<float_complext>>(image_dims);
+				fill(precon_image.get(),float_complext(1));
+
+				/*cuNDArray<float_complext> tmp_samples(device_samples->get_dimensions());
+				op->mult_M(precon_image.get(),&tmp_samples);
+				op->mult_MH(&tmp_samples,precon_image.get());
+				abs_inplace(precon_image.get());*/
+				solver_.set_preconditioning_image(precon_image);
+				solver_.set_beta(0.01);
+      }
+      E_->set_dcw(dcw);
+      GDEBUG("Prepared\n");
+
+        // Expand the average image to the number of frames
+    //
+
+    {
+      cuNDArray<float_complext> tmp(*j->reg_host_);
+      *reg_image_ = *expand( &tmp, frames );
+    }
+    PICS_->set_prior(reg_image_);
+
+    // Define preconditioning weights
+    //
+
+        //Apply weights
+    *device_samples *= *dcw;
+
+    // Invoke solver
+    //
+
+    boost::shared_ptr< cuNDArray<float_complext> > result;
+    {
+      GDEBUG("Running NLCG solver\n");
+      GPUTimer timer("Running NLCG solver");
+
+      // Optionally, allow exclusive (per device) access to the solver
+      // This may not matter much in terms of speed, but it can in terms of memory consumption
+      //
+
+      if( exclusive_access_ )
+        _mutex[device_number_].lock();
+
+      result = solver_.solve(device_samples.get());
+
+      if( exclusive_access_ )
+        _mutex[device_number_].unlock();
+    }
+
+    // Provide some info about the scaling between the regularization and reconstruction.
+    // If it is not close to one, PICCS does not work optimally...
+    //
+
+    if( alpha_ > 0.0 ){
+      cuNDArray<float_complext> gpureg(j->reg_host_.get());
+      boost::shared_ptr< cuNDArray<float_complext> > gpurec = sum(result.get(),2);
+      *gpurec /= float(result->get_size(2));
+      float scale = abs(dot(gpurec.get(), gpurec.get())/dot(gpurec.get(),&gpureg));
+      GDEBUG("Scaling factor between regularization and reconstruction is %f.\n", scale);
+    }
+
+    if (!result.get()) {
+      GDEBUG("\nNon-linear conjugate gradient solver failed\n");
+      return GADGET_FAIL;
+    }
+
+    /*
+      static int counter = 0;
+      char filename[256];
+      sprintf((char*)filename, "recon_sb_%d.cplx", counter);
+      write_nd_array<float_complext>( sbresult->to_host().get(), filename );
+      counter++; */
+
+    // If the recon matrix size exceeds the sequence matrix size then crop
+    if( matrix_size_seq_ != matrix_size_ )
+      result = crop<float_complext,2>( (matrix_size_-matrix_size_seq_)>>1, matrix_size_seq_, result.get() );
+
+
+    // Now pass on the reconstructed images
+    //
+    this->put_frames_on_que(frames,rotations,j,result.get(),channels);
+
+    frame_counter_ += frames;
+    m1->release();
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(gpuOsSenseGadget)
+}
+
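gpuOsSenseGadget::process() (and the other non-Cartesian gadgets above) takes the square root of the density compensation weights and applies it both to the samples and, via set_dcw, inside the encoding operator; the net effect is a data-fidelity term weighted by the full dcw. A small standalone check (plain C++, not part of the patch) of that identity:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<float> residual = {1.0f, -2.0f, 0.5f};   // (E x - b), toy values
        std::vector<float> dcw      = {0.25f, 1.0f, 4.0f};   // density compensation weights

        float weighted = 0.0f, via_sqrt = 0.0f;
        for (size_t i = 0; i < residual.size(); ++i) {
            weighted += dcw[i] * residual[i] * residual[i];   // sum of w * r^2
            float r = std::sqrt(dcw[i]) * residual[i];        // residual scaled by sqrt(w)
            via_sqrt += r * r;                                // identical contribution
        }
        std::printf("%f == %f\n", weighted, via_sqrt);
        return 0;
    }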
diff --git a/gadgets/pmri/gpuOsSenseGadget.h b/gadgets/pmri/gpuOsSenseGadget.h
new file mode 100644
index 0000000..9c50bdf
--- /dev/null
+++ b/gadgets/pmri/gpuOsSenseGadget.h
@@ -0,0 +1,82 @@
+#ifndef gpuOsSenseGadget_H
+#define gpuOsSenseGadget_H
+#pragma once
+
+#include <ace/Synch.h>
+#include <ace/Mutex.h>
+
+#include "gadgetron_gpupmri_export.h"
+#include "Gadget.h"
+#include "GenericReconJob.h"
+#include "GadgetMRIHeaders.h"
+#include "cuNlcgSolver.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuCgPreconditioner.h"
+#include "cuPartialDerivativeOperator.h"
+#include "cuNFFT.h"
+#include "cuImageOperator.h"
+#include "ismrmrd/ismrmrd.h"
+#include "cuTvOperator.h"
+#include "cuTvPicsOperator.h"
+#include "osSenseOperator.h"
+#include "cuNFFTOperator.h"
+#include "osMOMSolver.h"
+#include "osSPSSolver.h"
+#include "gpuSenseGadget.h"
+
+#include <complex>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_GPUPMRI gpuOsSenseGadget : public gpuSenseGadget
+  {
+
+  public:
+
+    gpuOsSenseGadget();
+    virtual ~gpuOsSenseGadget();
+
+  protected:
+    GADGET_PROPERTY(lambda, float, "Lambda regularization factor", 1e-6);
+    GADGET_PROPERTY(alpha, float, "Alpha regularization factor", 0.5);
+    GADGET_PROPERTY(kappa, float, "Kappa regularization factor", 1.0);
+    GADGET_PROPERTY(number_of_iterations, int, "Number of solver iterations", 0);
+    GADGET_PROPERTY(exclusive_access, bool,"Forces 1 gadget per GPU",false);
+    GADGET_PROPERTY(coils_per_subset, int,"Number of coils to use for each subset",1);
+    
+    virtual int process( GadgetContainerMessage< ISMRMRD::ImageHeader >* m1, GadgetContainerMessage< GenericReconJob > * m2 );
+    virtual int process_config( ACE_Message_Block* mb );
+
+    int coils_per_subset_;
+
+    uint64d2 matrix_size_;
+    uint64d2 matrix_size_os_;
+    uint64d2 matrix_size_seq_;
+
+    unsigned int number_of_iterations_;
+
+
+    bool exclusive_access_;
+    double lambda_;
+    double alpha_;
+    double kappa_;
+
+    bool is_configured_;
+    bool prepared_;
+
+    // Define ordered-subsets (MOM) solver
+    osMOMSolver<cuNDArray<float_complext>> solver_;
+
+    // Define non-Cartesian Sense Encoding operator
+    boost::shared_ptr< osSenseOperator<cuNDArray<float_complext>,2,cuNFFTOperator<float,2>>> E_;
+
+    // Average image for regularization
+    boost::shared_ptr< cuNDArray<float_complext> > reg_image_;
+
+    boost::shared_ptr<cuTvOperator<float_complext,3> > TV_;
+    boost::shared_ptr<cuTvPicsOperator<float_complext,3> > PICS_;
+
+  };
+}
+#endif //gpuOsSenseGadget_H
+
diff --git a/gadgets/pmri/gpuSbSenseGadget.cpp b/gadgets/pmri/gpuSbSenseGadget.cpp
new file mode 100644
index 0000000..b7ed38e
--- /dev/null
+++ b/gadgets/pmri/gpuSbSenseGadget.cpp
@@ -0,0 +1,364 @@
+#include "gpuSbSenseGadget.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "GadgetMRIHeaders.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "ismrmrd/xml.h"
+
+#include <boost/thread/mutex.hpp>
+#include <boost/make_shared.hpp>
+
+namespace Gadgetron{
+
+#define max_number_of_gpus 10
+  static boost::mutex _mutex[max_number_of_gpus];
+  
+  gpuSbSenseGadget::gpuSbSenseGadget()
+    : is_configured_(false)
+    , prepared_(false)
+    , gpuSenseGadget()
+  {
+  }
+
+  gpuSbSenseGadget::~gpuSbSenseGadget() {}
+
+  int gpuSbSenseGadget::process_config( ACE_Message_Block* mb )
+  {
+    gpuSenseGadget::process_config(mb);
+    
+    number_of_sb_iterations_ = number_of_sb_iterations.value();
+    number_of_cg_iterations_ = number_of_cg_iterations.value();
+    cg_limit_ = cg_limit.value();
+    mu_ = mu.value();
+    lambda_ = lambda.value();
+    alpha_ = alpha.value();
+    gamma_ = gamma.value();
+    exclusive_access_ = exclusive_access.value();
+    is_cyclic_= is_cyclic.value();
+
+    // Get the Ismrmrd header
+    //
+    ISMRMRD::IsmrmrdHeader h;
+    ISMRMRD::deserialize(mb->rd_ptr(),h);
+    
+    
+    if (h.encoding.size() != 1) {
+      GDEBUG("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+    
+    // Get the encoding space and trajectory description
+    ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+    ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+    ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+
+    matrix_size_seq_ = uint64d2( r_space.matrixSize.x, r_space.matrixSize.y );
+
+    if (!is_configured_) {
+
+      if (h.acquisitionSystemInformation) {
+	channels_ = h.acquisitionSystemInformation->receiverChannels ? *h.acquisitionSystemInformation->receiverChannels : 1;
+      } else {
+	channels_ = 1;
+      }
+
+      // Allocate encoding operator for non-Cartesian Sense
+      E_ = boost::shared_ptr< cuNonCartesianSenseOperator<float,2> >( new cuNonCartesianSenseOperator<float,2>() );
+      E_->set_weight(mu_);
+
+      // Allocate preconditioner
+      D_ = boost::shared_ptr< cuCgPreconditioner<float_complext> >( new cuCgPreconditioner<float_complext>() );
+
+      Rx1_ = boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> >
+        ( new cuPartialDerivativeOperator<float_complext,3>(0) );
+      Rx1_->set_weight( (1.0-alpha_)*lambda_ );
+
+      Ry1_ = boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> >
+        ( new cuPartialDerivativeOperator<float_complext,3>(1) );
+      Ry1_->set_weight( (1.0-alpha_)*lambda_ );
+
+      Rz1_ = boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> >
+        ( new cuPartialDerivativeOperator<float_complext,3>(2) );
+      Rz1_->set_weight( (1.0-alpha_)*lambda_ );
+
+
+      Rt1_ = boost::shared_ptr< cuPartialDerivativeOperator2<float_complext,3> >
+              ( new cuPartialDerivativeOperator2<float_complext,3>() );
+      Rt1_->set_weight( (1.0-alpha_)*lambda_ );
+
+      Rx2_ = boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> >
+        ( new cuPartialDerivativeOperator<float_complext,3>(0) );
+      Rx2_->set_weight( alpha_*lambda_ );
+
+      Ry2_ = boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> >
+        ( new cuPartialDerivativeOperator<float_complext,3>(1) );
+      Ry2_->set_weight( alpha_*lambda_ );
+
+      Rz2_ = boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> >
+        ( new cuPartialDerivativeOperator<float_complext,3>(2) );
+      Rz2_->set_weight( alpha_*lambda_ );
+
+      Rt2_ = boost::shared_ptr< cuPartialDerivativeOperator2<float_complext,3> >
+              ( new cuPartialDerivativeOperator2<float_complext,3>() );
+      Rt2_->set_weight( alpha_*lambda_ );
+
+      W_ = boost::make_shared<cuDWTOperator<float_complext,3>>();
+      W_->set_weight(gamma_);
+      W2_ = boost::make_shared<cuDWTOperator<float_complext,3>>();
+      W2_->set_weight(gamma_);
+
+      // Setup split-Bregman solver
+      sb_.set_encoding_operator( E_ );
+            
+      sb_.set_max_outer_iterations(number_of_sb_iterations_);
+      sb_.set_max_inner_iterations(1);
+      sb_.set_output_mode( (output_convergence_) ? cuSbcCgSolver<float_complext>::OUTPUT_VERBOSE : cuSbcCgSolver<float_complext>::OUTPUT_SILENT );
+      
+      sb_.get_inner_solver()->set_max_iterations( number_of_cg_iterations_ );
+      sb_.get_inner_solver()->set_tc_tolerance( cg_limit_ );
+      //sb_.get_inner_solver()->set_output_mode( (output_convergence_) ? cuCgSolver<float_complext>::OUTPUT_VERBOSE : cuCgSolver<float_complext>::OUTPUT_SILENT );
+      sb_.get_inner_solver()->set_preconditioner( D_ );
+
+      is_configured_ = true;
+    }
+
+    GDEBUG("gpuSbSenseGadget::end of process_config\n");
+
+    return GADGET_OK;
+  }
+
+  int gpuSbSenseGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, GadgetContainerMessage<GenericReconJob> *m2)
+  {
+    // Is this data for this gadget's set/slice?
+    //
+    
+    if( m1->getObjectPtr()->set != set_number_ || m1->getObjectPtr()->slice != slice_number_ ) {      
+      // No, pass it downstream...
+      return this->next()->putq(m1);
+    }
+
+    //GDEBUG("gpuSbSenseGadget::process\n");
+    //GPUTimer timer("gpuSbSenseGadget::process");
+
+    if (!is_configured_) {
+      GDEBUG("\nData received before configuration complete\n");
+      return GADGET_FAIL;
+    }
+
+    GenericReconJob* j = m2->getObjectPtr();
+
+    // Let's first check that this job has the required data...
+    if (!j->csm_host_.get() || !j->dat_host_.get() || !j->tra_host_.get() || !j->dcw_host_.get()) {
+      GDEBUG("Received an incomplete Sense job\n");
+      return GADGET_FAIL;
+    }
+
+    unsigned int samples = j->dat_host_->get_size(0);
+    unsigned int channels = j->dat_host_->get_size(1);
+    unsigned int rotations = samples / j->tra_host_->get_number_of_elements();
+    unsigned int frames = j->tra_host_->get_size(1)*rotations;
+
+    if( samples%j->tra_host_->get_number_of_elements() ) {
+      GDEBUG("Mismatch between number of samples (%d) and number of k-space coordinates (%d).\nThe first should be a multiplum of the latter.\n",
+                    samples, j->tra_host_->get_number_of_elements());
+      return GADGET_FAIL;
+    }
+
+    boost::shared_ptr< cuNDArray<floatd2> > traj(new cuNDArray<floatd2> (j->tra_host_.get()));
+    boost::shared_ptr< cuNDArray<float> > dcw(new cuNDArray<float> (j->dcw_host_.get()));
+    sqrt_inplace(dcw.get());
+    boost::shared_ptr< cuNDArray<float_complext> > csm(new cuNDArray<float_complext> (j->csm_host_.get()));
+    boost::shared_ptr< cuNDArray<float_complext> > device_samples(new cuNDArray<float_complext> (j->dat_host_.get()));
+    
+    if( !prepared_){
+
+      // Take the reconstruction matrix size from the regularization image.
+      // It could be oversampled relative to the sequence-specified size...
+      
+      matrix_size_ = uint64d2( j->reg_host_->get_size(0), j->reg_host_->get_size(1) );
+      
+      cudaDeviceProp deviceProp;
+      if( cudaGetDeviceProperties( &deviceProp, device_number_ ) != cudaSuccess) {
+        GDEBUG( "\nError: unable to query device properties.\n" );
+        return GADGET_FAIL;
+      }
+
+      unsigned int warp_size = deviceProp.warpSize;
+
+      matrix_size_os_ =
+        uint64d2(((static_cast<unsigned int>(std::ceil(matrix_size_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size,
+                 ((static_cast<unsigned int>(std::ceil(matrix_size_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size);
+      
+      GDEBUG("Matrix size    : [%d,%d] \n", matrix_size_[0], matrix_size_[1]);
+      GDEBUG("Matrix size OS : [%d,%d] \n", matrix_size_os_[0], matrix_size_os_[1]);
+
+      std::vector<size_t> image_dims = to_std_vector(matrix_size_);
+      image_dims.push_back(frames);
+      
+      E_->set_domain_dimensions(&image_dims);
+      E_->set_codomain_dimensions(device_samples->get_dimensions().get());
+            
+      reg_image_ = boost::shared_ptr< cuNDArray<float_complext> >(new cuNDArray<float_complext>(&image_dims));
+      
+      // These operators need their domain/codomain set before being added to the solver
+      //
+
+      Rx1_->set_domain_dimensions(&image_dims);
+      Rx1_->set_codomain_dimensions(&image_dims);
+      
+      Ry1_->set_domain_dimensions(&image_dims);
+      Ry1_->set_codomain_dimensions(&image_dims);
+      
+      Rz1_->set_domain_dimensions(&image_dims);
+      Rz1_->set_codomain_dimensions(&image_dims);
+      
+      Rt1_->set_domain_dimensions(&image_dims);
+      Rt1_->set_codomain_dimensions(&image_dims);
+
+      Rx2_->set_domain_dimensions(&image_dims);
+      Rx2_->set_codomain_dimensions(&image_dims);
+      
+      Ry2_->set_domain_dimensions(&image_dims);
+      Ry2_->set_codomain_dimensions(&image_dims);
+
+      Rt2_->set_domain_dimensions(&image_dims);
+      Rt2_->set_codomain_dimensions(&image_dims);
+
+      Rz2_->set_domain_dimensions(&image_dims);
+      Rz2_->set_codomain_dimensions(&image_dims);
+
+      W_->set_domain_dimensions(&image_dims);
+      W_->set_codomain_dimensions(&image_dims);
+      W2_->set_domain_dimensions(&image_dims);
+      W2_->set_codomain_dimensions(&image_dims);
+      W2_->set_shift(2);
+      
+      // Add "TV" regularization
+      // 
+      
+      if( alpha_<1.0 ){
+        sb_.add_regularization_group_operator( Rx1_ ); 
+        sb_.add_regularization_group_operator( Ry1_ ); 
+        if(frames>1)
+        	if (is_cyclic_)
+        		sb_.add_regularization_group_operator( Rz1_ );
+        	else
+        		sb_.add_regularization_group_operator( Rt1_ );
+        sb_.add_group();
+      }
+      
+      // Add "PICCS" regularization
+      //
+
+      if( alpha_ > 0.0 ){
+        sb_.add_regularization_group_operator( Rx2_ ); 
+        sb_.add_regularization_group_operator( Ry2_ ); 
+        if(frames>1)
+        	if (is_cyclic_)
+        		sb_.add_regularization_group_operator( Rz2_ );
+        	else
+        		sb_.add_regularization_group_operator( Rt2_ );
+        sb_.add_group(reg_image_);
+      }
+      
+      if (gamma_ > 0.0){
+    	  sb_.add_regularization_operator(W_);
+    	  sb_.add_regularization_operator(W2_);
+      }
+      prepared_ = true;
+    }
+    
+    E_->set_dcw(dcw);
+    E_->set_csm(csm);    
+    E_->setup( matrix_size_, matrix_size_os_, static_cast<float>(kernel_width_) );
+    E_->preprocess(traj.get());
+
+    // Expand the average image to the number of frames
+    //
+
+    {
+      cuNDArray<float_complext> tmp(*j->reg_host_);
+      *reg_image_ = *expand( &tmp, frames );
+    }
+
+    // Define preconditioning weights
+    //
+
+    boost::shared_ptr< cuNDArray<float> > _precon_weights = sum(abs_square(csm.get()).get(), 2);
+    reciprocal_sqrt_inplace(_precon_weights.get());	
+    boost::shared_ptr< cuNDArray<float_complext> > precon_weights = real_to_complex<float_complext>( _precon_weights.get() );
+    _precon_weights.reset();
+    D_->set_weights( precon_weights );
+    precon_weights.reset();
+    
+    //Apply weights
+    *device_samples *= *dcw;
+
+    // Invoke solver
+    //
+
+    boost::shared_ptr< cuNDArray<float_complext> > sbresult;
+    {
+      GDEBUG("Running split Bregman solver\n");
+      GPUTimer timer("Running split Bregman solver");
+
+      // Optionally, allow exclusive (per device) access to the solver
+      // This may not matter much in terms of speed, but it can in terms of memory consumption
+      //
+
+      if( exclusive_access_ )
+        _mutex[device_number_].lock();
+
+      sbresult = sb_.solve(device_samples.get());
+
+      if( exclusive_access_ )
+        _mutex[device_number_].unlock();
+    }
+
+    // Provide some info about the scaling between the regularization and reconstruction.
+    // If it is not close to one, PICCS does not work optimally...
+    // 
+
+    if( alpha_ > 0.0 ){
+      cuNDArray<float_complext> gpureg(j->reg_host_.get());
+      boost::shared_ptr< cuNDArray<float_complext> > gpurec = sum(sbresult.get(),2);
+      *gpurec /= float(sbresult->get_size(2));
+      float scale = abs(dot(gpurec.get(), gpurec.get())/dot(gpurec.get(),&gpureg));
+      GDEBUG("Scaling factor between regularization and reconstruction is %f.\n", scale);
+    }
+    
+    if (!sbresult.get()) {
+      GDEBUG("\nSplit Bregman solver failed\n");
+      return GADGET_FAIL;
+    }
+    
+    /*
+      static int counter = 0;
+      char filename[256];
+      sprintf((char*)filename, "recon_sb_%d.cplx", counter);
+      write_nd_array<float_complext>( sbresult->to_host().get(), filename );
+      counter++; */
+
+    // If the recon matrix size exceeds the sequence matrix size then crop
+    if( matrix_size_seq_ != matrix_size_ )
+      sbresult = crop<float_complext,2>( (matrix_size_-matrix_size_seq_)>>1, matrix_size_seq_, sbresult.get() );
+        
+    // Now pass on the reconstructed images
+    //
+
+	put_frames_on_que(frames,rotations,j,sbresult.get());
+
+    frame_counter_ += frames;
+    m1->release();
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(gpuSbSenseGadget)
+}
+
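A minimal standalone sketch (plain C++, not part of the patch) of the center crop applied above when the reconstruction matrix exceeds the sequence-specified matrix, with offset (matrix_size - matrix_size_seq)/2 per dimension; the helper name center_crop is illustrative only:

    #include <cstdio>
    #include <vector>

    std::vector<float> center_crop(const std::vector<float>& img,
                                   int nx, int ny, int cx, int cy)
    {
        int ox = (nx - cx) / 2, oy = (ny - cy) / 2;       // crop offsets
        std::vector<float> out(static_cast<size_t>(cx) * cy);
        for (int y = 0; y < cy; ++y)
            for (int x = 0; x < cx; ++x)
                out[y * cx + x] = img[(y + oy) * nx + (x + ox)];
        return out;
    }

    int main()
    {
        // 6x6 image cropped to the central 4x4 region (offset 1,1).
        std::vector<float> img(36);
        for (int i = 0; i < 36; ++i) img[i] = static_cast<float>(i);
        auto out = center_crop(img, 6, 6, 4, 4);
        std::printf("first cropped pixel: %g (expect 7)\n", out[0]);
        return 0;
    }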
diff --git a/gadgets/pmri/gpuSbSenseGadget.h b/gadgets/pmri/gpuSbSenseGadget.h
new file mode 100644
index 0000000..f35b7b5
--- /dev/null
+++ b/gadgets/pmri/gpuSbSenseGadget.h
@@ -0,0 +1,89 @@
+#ifndef gpuSbSenseGadget_H
+#define gpuSbSenseGadget_H
+#pragma once
+
+#include <ace/Synch.h>
+#include <ace/Mutex.h>
+
+#include "gadgetron_gpupmri_export.h"
+#include "Gadget.h"
+#include "GenericReconJob.h"
+#include "GadgetMRIHeaders.h"
+#include "cuSbcCgSolver.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuCgPreconditioner.h"
+#include "cuPartialDerivativeOperator.h"
+#include "cuPartialDerivativeOperator2.h"
+#include "cuNFFT.h"
+#include "cuImageOperator.h"
+#include "ismrmrd/ismrmrd.h"
+#include "gpuSenseGadget.h"
+#include <complex>
+#include "cuDWTOperator.h"
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_GPUPMRI gpuSbSenseGadget : public gpuSenseGadget
+  {
+
+  public:
+    GADGET_DECLARE(gpuSbSenseGadget);
+
+    gpuSbSenseGadget();
+    virtual ~gpuSbSenseGadget();
+
+  protected:
+    GADGET_PROPERTY(number_of_sb_iterations, int, "Number of split Bregman iterations", 20);
+    GADGET_PROPERTY(number_of_cg_iterations, int, "Number of conjugate gradient iterations", 10);
+    GADGET_PROPERTY(mu, float, "Mu regularization parameter", 1.0);
+    GADGET_PROPERTY(lambda, float, "Lambda regularization parameter", 2.0);
+    GADGET_PROPERTY(gamma, float, "Gamma regularization parameter", 0.0);
+    GADGET_PROPERTY(alpha, float, "Alpha regularization parameter", 0.5);
+    GADGET_PROPERTY(is_cyclic, bool, "Is cyclic", true);
+    GADGET_PROPERTY(exclusive_access, bool, "Exclusive access to solver", false);
+
+    virtual int process( GadgetContainerMessage< ISMRMRD::ImageHeader >* m1, GadgetContainerMessage< GenericReconJob > * m2 );
+    virtual int process_config( ACE_Message_Block* mb );
+
+
+    unsigned int number_of_cg_iterations_;
+    unsigned int number_of_sb_iterations_;
+    double cg_limit_;
+    double mu_;
+    double lambda_;
+    double alpha_;
+    double gamma_;
+    unsigned int rotations_to_discard_;
+
+    bool exclusive_access_;
+    bool is_configured_;
+    bool prepared_;
+    bool is_cyclic_; //True if 3rd dimension of the data is cyclic (i.e. cardiac)
+
+    // Define constraint Split Bregman solver
+    cuSbcCgSolver<float_complext> sb_;
+
+    // Define non-Cartesian Sense Encoding operator
+    boost::shared_ptr< cuNonCartesianSenseOperator<float,2> > E_;
+
+    // Define preconditioner
+    boost::shared_ptr< cuCgPreconditioner<float_complext> > D_;
+
+    // Average image for regularization
+    boost::shared_ptr< cuNDArray<float_complext> > reg_image_;
+
+    // Define regularization operators
+    boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> > Rx1_;
+    boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> > Rx2_;
+    boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> > Ry1_;
+    boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> > Ry2_;
+    boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> > Rz1_;
+    boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> > Rz2_;
+    boost::shared_ptr< cuPartialDerivativeOperator2<float_complext,3> > Rt1_;
+    boost::shared_ptr< cuPartialDerivativeOperator2<float_complext,3> > Rt2_;
+    boost::shared_ptr< cuDWTOperator<float_complext,3> > W_;
+    boost::shared_ptr< cuDWTOperator<float_complext,3> > W2_;
+	
+  };
+}
+#endif //gpuSbSenseGadget
diff --git a/gadgets/pmri/gpuSenseGadget.cpp b/gadgets/pmri/gpuSenseGadget.cpp
new file mode 100644
index 0000000..56d08c2
--- /dev/null
+++ b/gadgets/pmri/gpuSenseGadget.cpp
@@ -0,0 +1,150 @@
+/*
+ * gpuSenseGadget.cpp
+ *
+ *  Created on: Nov 17, 2014
+ *      Author: dch
+ */
+
+#include "gpuSenseGadget.h"
+#include "cuNDArray.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_math.h"
+#include "cuNDArray_math.h"
+namespace Gadgetron {
+
+gpuSenseGadget::gpuSenseGadget() {
+	matrix_size_ = uint64d2(0u,0u);
+	matrix_size_os_ = uint64d2(0u,0u);
+	matrix_size_seq_ = uint64d2(0u,0u);
+}
+
+gpuSenseGadget::~gpuSenseGadget() {
+	// TODO Auto-generated destructor stub
+}
+
+int gpuSenseGadget::process_config(ACE_Message_Block* mb) {
+  device_number_ = deviceno.value();
+
+  int number_of_devices = 0;
+  if (cudaGetDeviceCount(&number_of_devices)!= cudaSuccess) {
+    GDEBUG( "Error: unable to query number of CUDA devices.\n" );
+    return GADGET_FAIL;
+  }
+
+  if (number_of_devices == 0) {
+    GDEBUG( "Error: No available CUDA devices.\n" );
+    return GADGET_FAIL;
+  }
+
+  if (device_number_ >= number_of_devices) {
+    GDEBUG("Adjusting device number from %d to %d\n", device_number_,  (device_number_%number_of_devices));
+    device_number_ = (device_number_%number_of_devices);
+  }
+  
+  if (cudaSetDevice(device_number_)!= cudaSuccess) {
+    GDEBUG( "Error: unable to set CUDA device.\n" );
+    return GADGET_FAIL;
+  }
+
+  pass_on_undesired_data_ = pass_on_undesired_data.value();
+  set_number_ = setno.value();
+  slice_number_ = sliceno.value();
+  oversampling_factor_ = oversampling_factor.value();
+  kernel_width_ = kernel_width.value();
+  output_convergence_ = output_convergence.value();
+  output_timing_ = output_timing.value();
+  rotations_to_discard_ = rotations_to_discard.value();
+  
+  if( (rotations_to_discard_%2) == 1 ){
+    GDEBUG("#rotations to discard must be even.\n");
+    return GADGET_FAIL;
+  }
+  save_individual_frames_ = save_individual_frames.value();
+  return GADGET_OK;
+}
+
+int gpuSenseGadget::put_frames_on_que(int frames,int rotations, GenericReconJob* j, cuNDArray<float_complext>* cgresult,int channels) {
+
+	unsigned int frames_per_rotation = frames/rotations;
+
+	if( rotations == 1 ){ // this is the case for golden ratio
+		rotations = frames;
+		frames_per_rotation = 1;
+	}
+	if (save_individual_frames_){
+		for( unsigned int frame=0; frame<frames; frame++ ){
+
+			unsigned int rotation_idx = frame/frames_per_rotation;
+
+			// Check if we should discard this frame
+			if( rotation_idx < (rotations_to_discard_>>1) || rotation_idx >= rotations-(rotations_to_discard_>>1) )
+				continue;
+
+			GadgetContainerMessage<ISMRMRD::ImageHeader> *m =
+					new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+
+			GadgetContainerMessage< hoNDArray< std::complex<float> > > *cm =
+					new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+
+			*m->getObjectPtr() = j->image_headers_[frame];
+			m->cont(cm);
+
+			std::vector<size_t> img_dims {cgresult->get_size(0),cgresult->get_size(1)};
+
+			cm->getObjectPtr()->create(&img_dims);
+
+			size_t data_length = cm->getObjectPtr()->get_number_of_bytes();
+
+			cudaMemcpy(cm->getObjectPtr()->get_data_ptr(),
+					cgresult->get_data_ptr()+frame*cm->getObjectPtr()->get_number_of_elements(),
+					data_length,
+					cudaMemcpyDeviceToHost);
+
+			cudaError_t err = cudaGetLastError();
+			if( err != cudaSuccess ){
+				GDEBUG("Unable to copy result from device to host: %s\n", cudaGetErrorString(err));
+				m->release();
+				return GADGET_FAIL;
+			}
+
+			m->getObjectPtr()->matrix_size[0] = img_dims[0];
+			m->getObjectPtr()->matrix_size[1] = img_dims[1];
+			m->getObjectPtr()->matrix_size[2] = 1;
+			m->getObjectPtr()->channels       = 1;
+			m->getObjectPtr()->image_index    = frame_counter_ + frame;
+
+			if (this->next()->putq(m) < 0) {
+				GDEBUG("Failed to put result image on to queue\n");
+				m->release();
+				return GADGET_FAIL;
+			}
+		}
+	} else{
+		std::vector<size_t> img_dims { cgresult->get_size(0),cgresult->get_size(1),(size_t)frames};
+
+		auto cm =
+				new GadgetContainerMessage< hoNDArray< std::complex<float> > >(img_dims);
+		cgresult->to_host(reinterpret_cast<hoNDArray<float_complext>*>(cm->getObjectPtr()));
+		auto m =
+				new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+
+		*m->getObjectPtr() = j->image_headers_[0]; //Just use the first header
+		m->cont(cm);
+
+		m->getObjectPtr()->matrix_size[0] = img_dims[0];
+		m->getObjectPtr()->matrix_size[1] = img_dims[1];
+		m->getObjectPtr()->matrix_size[2] = img_dims[2];
+		m->getObjectPtr()->channels       = channels;
+		m->getObjectPtr()->image_index    = frame_counter_;
+
+		if (this->next()->putq(m) < 0) {
+			GDEBUG("Failed to put result image on to queue\n");
+			m->release();
+			return GADGET_FAIL;
+		}
+
+	}
+
+	return GADGET_OK;
+}
+
+} /* namespace Gadgetron */
diff --git a/gadgets/pmri/gpuSenseGadget.h b/gadgets/pmri/gpuSenseGadget.h
new file mode 100644
index 0000000..a2bceb5
--- /dev/null
+++ b/gadgets/pmri/gpuSenseGadget.h
@@ -0,0 +1,58 @@
+/*
+ * gpuSenseGadget.h
+ *
+ *  Created on: Nov 17, 2014
+ *      Author: dch
+ */
+
+#ifndef GPUSENSEGADGET_H_
+#define GPUSENSEGADGET_H_
+
+#include "Gadget.h"
+#include <ismrmrd/ismrmrd.h>
+#include <ismrmrd/xml.h>
+#include "vector_td.h"
+#include "GenericReconJob.h"
+#include "cuNDArray.h"
+namespace Gadgetron {
+
+class gpuSenseGadget: public Gadget2<ISMRMRD::ImageHeader, GenericReconJob>{
+public:
+  gpuSenseGadget();
+  virtual ~gpuSenseGadget();
+  virtual int process_config(ACE_Message_Block* mb);
+  
+protected:
+  GADGET_PROPERTY(deviceno,int,"GPU device number", 0);
+  GADGET_PROPERTY(setno,int,"Set number to process", 0);
+  GADGET_PROPERTY(sliceno,int,"Slice number to process",0);
+  GADGET_PROPERTY(cg_limit, float, "Residual limit for CG convergence", 1e-6);
+  GADGET_PROPERTY(oversampling_factor, float, "Oversampling factor for NFFT", 1.5);
+  GADGET_PROPERTY(kernel_width, float, "Kernel width for NFFT", 5.5);
+  GADGET_PROPERTY(save_individual_frames, bool, "Save individual frames", true);
+  GADGET_PROPERTY(output_convergence, bool, "Output convergence information", false);
+  GADGET_PROPERTY(rotations_to_discard, int, "Number of rotations to dump", 0);
+  GADGET_PROPERTY(output_timing, bool, "Output timing information", false);
+
+  virtual int put_frames_on_que(int frames,int rotations, GenericReconJob* j, cuNDArray<float_complext>* cgresult, int channels = 1);
+  int channels_;
+  int device_number_;
+  int set_number_;
+  int slice_number_;
+  
+  uint64d2 matrix_size_;
+  uint64d2 matrix_size_os_;
+  uint64d2 matrix_size_seq_;
+  double oversampling_factor_;
+  double kernel_width_;
+  unsigned int rotations_to_discard_;
+  
+  bool output_convergence_;
+  bool output_timing_;
+  bool save_individual_frames_;
+  
+  int frame_counter_;
+};
+
+} /* namespace Gadgetron */
+#endif /* GPUSENSEGADGET_H_ */
diff --git a/gadgets/python/CMakeLists.txt b/gadgets/python/CMakeLists.txt
new file mode 100644
index 0000000..fba6d4e
--- /dev/null
+++ b/gadgets/python/CMakeLists.txt
@@ -0,0 +1,75 @@
+IF (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_PYTHON__)
+ENDIF (WIN32)
+
+# TODO: The Gadgetron Python code uses Numpy C-API code
+# from Numpy versions < 1.7. If Numpy version is >= 1.7
+# you will get compiler warnings
+#add_definitions(-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION)
+
+find_package(Ismrmrd REQUIRED)
+
+message(STATUS ${Boost_INCLUDE_DIR} ${Boost_LIBRARIES})
+
+include_directories(
+  ${CMAKE_SOURCE_DIR}/apps/gadgetron
+  ${CMAKE_BINARY_DIR}/apps/gadgetron
+  ${CMAKE_SOURCE_DIR}/gadgets/mri_core
+  ${CMAKE_SOURCE_DIR}/toolboxes/python
+  ${PYTHON_INCLUDE_PATH}
+  ${NUMPY_INCLUDE_DIRS}
+  ${Boost_INCLUDE_DIR}
+  )
+
+add_library(GadgetronPythonMRI MODULE GadgetronPythonMRI.cpp 
+  GadgetReference.cpp 
+  GadgetInstrumentationStreamController.cpp)
+
+#We should probably not set soversion on the python module. Causes problems with clang
+#set_target_properties(GadgetronPythonMRI PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+add_library(gadgetron_python SHARED
+	PythonGadget.cpp
+	GadgetReference.cpp
+	GadgetInstrumentationStreamController.cpp
+	GadgetronPythonMRI.cpp)
+
+set_target_properties(gadgetron_python PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(gadgetron_python
+    gadgetron_gadgetbase
+    gadgetron_toolbox_python
+    gadgetron_toolbox_log
+    ${ISMRMRD_LIBRARIES}
+    optimized ${ACE_LIBRARIES}
+    debug ${ACE_DEBUG_LIBRARY}
+    ${PYTHON_LIBRARIES}
+    ${Boost_LIBRARIES}
+    ${MKL_LIBRARIES})
+
+target_link_libraries(GadgetronPythonMRI
+    gadgetron_gadgetbase
+    gadgetron_toolbox_log
+    gadgetron_toolbox_python
+    ${ISMRMRD_LIBRARIES}
+    optimized ${ACE_LIBRARIES}
+    debug ${ACE_DEBUG_LIBRARY}
+    ${PYTHON_LIBRARIES}
+    ${Boost_LIBRARIES}
+    ${MKL_LIBRARIES})
+
+IF (WIN32)
+    SET_TARGET_PROPERTIES(GadgetronPythonMRI PROPERTIES SUFFIX .pyd)
+    SET_TARGET_PROPERTIES(gadgetron_python PROPERTIES LINK_FLAGS "/LIBPATH:${PYTHON_INCLUDE_DIR}/../libs" )
+ENDIF(WIN32)
+
+set_target_properties(GadgetronPythonMRI PROPERTIES PREFIX "")
+
+install(TARGETS gadgetron_python DESTINATION lib COMPONENT main)
+install(TARGETS GadgetronPythonMRI DESTINATION ${GADGETRON_INSTALL_PYTHON_MODULE_PATH} COMPONENT main)
+
+add_subdirectory(config)
+add_subdirectory(gadgets)
+add_subdirectory(utils)
+#add_subdirectory(examples)
+
diff --git a/gadgets/python/GadgetInstrumentationStreamController.cpp b/gadgets/python/GadgetInstrumentationStreamController.cpp
new file mode 100644
index 0000000..b4d3e77
--- /dev/null
+++ b/gadgets/python/GadgetInstrumentationStreamController.cpp
@@ -0,0 +1,321 @@
+#include "GadgetInstrumentationStreamController.h"
+#include "EndGadget.h"
+#include "AcquisitionFinishGadget.h"
+#include "ImageFinishGadget.h"
+#include "log.h"
+#include "hoNDArray.h"
+#include "GadgetMessageInterface.h"
+#include "GadgetMRIHeaders.h"
+#include <string.h>
+#include <ismrmrd/ismrmrd.h>
+#include <ismrmrd/meta.h>
+
+namespace Gadgetron
+{
+  GadgetInstrumentationStreamController::GadgetInstrumentationStreamController()
+  {
+    if (this->open() != GADGET_OK) {
+      throw std::runtime_error("Unable to initialize GadgetInstrumentationStreamController");
+    }
+  }
+
+  GadgetInstrumentationStreamController::~GadgetInstrumentationStreamController()
+  {
+    if (this->close() != GADGET_OK) {
+      throw std::runtime_error("Unable to shut down sream in  GadgetInstrumentationStreamController");
+    }
+  }
+  
+  int GadgetInstrumentationStreamController::open()
+  {
+    GadgetModule *head = 0;
+    GadgetModule *tail = 0;
+    
+    if (tail == 0) {
+      Gadget* eg = new EndGadget();
+      if (eg) {
+	eg->set_controller(this);
+      }
+      
+      ACE_NEW_RETURN(tail,
+		     ACE_Module<ACE_MT_SYNCH>( ACE_TEXT("EndGadget"),
+					       eg ),
+		     -1);
+      
+      stream_.open(0,head,tail);
+    }
+
+    //Add gadgets that capture data and return it to the stream controller
+    if (this->prepend_gadget("ImageFinishFloat","gadgetron_mricore","ImageFinishGadget") != GADGET_OK) return GADGET_FAIL;
+    this->find_gadget("ImageFinishFloat")->pass_on_undesired_data(true);
+
+    /*
+    if (this->prepend_gadget("ImageFinishCplx","gadgetron_mricore","ImageFinishGadgetCPLX") != GADGET_OK) return GADGET_FAIL;
+    this->find_gadget("ImageFinishCplx")->pass_on_undesired_data(true);
+
+    if (this->prepend_gadget("ImageFinishUShort","gadgetron_mricore","ImageFinishGadgetUSHORT") != GADGET_OK) return GADGET_FAIL;
+    this->find_gadget("ImageFinishUShort")->pass_on_undesired_data(true);
+    */
+
+    if (this->prepend_gadget("AcquisitionFinish","gadgetron_mricore","AcquisitionFinishGadget") != GADGET_OK) return GADGET_FAIL;
+    this->find_gadget("AcquisitionFinish")->pass_on_undesired_data(true);
+
+
+    return GADGET_OK;
+  }
+
+  int GadgetInstrumentationStreamController::close()
+  {
+    stream_.close(1); //Shutdown gadgets and wait for them
+    return GADGET_OK;
+  }
+
+  int GadgetInstrumentationStreamController::prepend_gadget(const char* gadgetname,
+							   const char* dllname, 
+							   const char* classname)
+  {
+    GadgetModule* m = create_gadget_module(dllname,
+					   classname,
+					   gadgetname);
+
+    if (!m) {
+      GERROR("Failed to create GadgetModule from %s:%s\n",
+	     classname,
+	     dllname);
+      return GADGET_FAIL;
+    }
+
+    Gadget* g = dynamic_cast<Gadget*>(m->writer());//Get the gadget out of the module
+
+    //We will set this very high to prevent race conditions in "mixed environments" such as when using Python or Matlab in Gadgets
+    g->msg_queue()->high_water_mark(ACE_Message_Queue_Base::DEFAULT_HWM*100000);
+    
+    if (stream_.push(m) < 0) {
+      GERROR("Failed to push Gadget %s onto stream\n", gadgetname);
+      delete m;
+      return GADGET_FAIL;
+    }
+    return GADGET_OK;
+  }
+
+  template <class T1, class T2, class T3> int GadgetInstrumentationStreamController::return_data(ACE_Message_Block* mb)
+  {
+    static int counter = 0;
+    GadgetContainerMessage<T1>* m1 = AsContainerMessage<T1>(mb);
+    GadgetContainerMessage<T2>* m2 = AsContainerMessage<T2>(mb->cont());
+
+    if (!m1 || !m2) {
+      GERROR("Unable to convert input container messages\n");
+      return GADGET_FAIL;
+    }
+
+    // m2 is valid here, so it is safe to look for an optional meta-data container
+    GadgetContainerMessage<T3>* m3 = AsContainerMessage<T3>(m2->cont());
+    
+    {
+      GILLock lock;
+      try {
+	if (m3) {
+	  std::stringstream str;
+	  ISMRMRD::serialize(*m3->getObjectPtr(), str);
+	  python_gadget_.attr("put_next")(*m1->getObjectPtr(),m2->getObjectPtr(),str.str());
+	} else {
+	  python_gadget_.attr("put_next")(*m1->getObjectPtr(),m2->getObjectPtr());
+	}
+      } catch(boost::python::error_already_set const &) {
+	GERROR("Passing data on to python wrapper gadget failed\n");
+	PyErr_Print();
+	return GADGET_FAIL;
+      }
+    }
+    return GADGET_OK;
+  }
+
+  int GadgetInstrumentationStreamController::output_ready(ACE_Message_Block* mb)
+  {
+    GadgetContainerMessage<GadgetMessageIdentifier>* m0 = AsContainerMessage<GadgetMessageIdentifier>(mb);
+    if (!m0) {
+      GERROR("Unable to extract GadgetMessageIdentifier\n");
+      mb->release();
+      return GADGET_FAIL;
+    }
+
+    GadgetContainerMessage<ISMRMRD::ImageHeader>* m_tmp = 0;
+
+    switch (m0->getObjectPtr()->id) {
+    case (GADGET_MESSAGE_ACQUISITION):
+      if (0 != this->return_data<ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> >, ISMRMRD::MetaContainer >(m0->cont()) )
+	{
+	  GERROR("Unable to convert and return GADGET_MESSAGE_ACQUISITION\n");
+	  m0->release();
+	  return GADGET_FAIL;
+	}
+      break;
+
+    case (GADGET_MESSAGE_ISMRMRD_IMAGE):
+      m_tmp = AsContainerMessage<ISMRMRD::ImageHeader>(m0->cont());
+      if (!m_tmp) {
+	GERROR("Error converting header of GADGET_MESSAGE_ISMRMRD_IMAGE\n");
+	mb->release();
+	return GADGET_FAIL;
+      }
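+      // Dispatch on the stored image data type so return_data is instantiated
+      // with the matching hoNDArray element type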
+      switch (m_tmp->getObjectPtr()->data_type) {
+	
+      case (ISMRMRD::ISMRMRD_USHORT):
+	if (0 != this->return_data<ISMRMRD::ImageHeader, hoNDArray< uint16_t >, ISMRMRD::MetaContainer >(m0->cont()) )
+	  {
+	    GERROR("Unable to convert and return GADGET_MESSAGE_ISMRMRD_IMAGE\n");
+	    m0->release();
+	    return GADGET_FAIL;
+	  }
+	break;
+      case (ISMRMRD::ISMRMRD_SHORT):
+	if (0 != this->return_data<ISMRMRD::ImageHeader, hoNDArray< int16_t >, ISMRMRD::MetaContainer >(m0->cont()) )
+	  {
+	    GERROR("Unable to convert and return GADGET_MESSAGE_ISMRMRD_IMAGE\n");
+	    m0->release();
+	    return GADGET_FAIL;
+	  }
+	break;
+      case (ISMRMRD::ISMRMRD_UINT):
+	if (0 != this->return_data<ISMRMRD::ImageHeader, hoNDArray< uint32_t >, ISMRMRD::MetaContainer >(m0->cont()) )
+	  {
+	    GERROR("Unable to convert and return GADGET_MESSAGE_ISMRMRD_IMAGE\n");
+	    m0->release();
+	    return GADGET_FAIL;
+	  }
+	break;
+      case (ISMRMRD::ISMRMRD_INT):
+	if (0 != this->return_data<ISMRMRD::ImageHeader, hoNDArray< int32_t >, ISMRMRD::MetaContainer >(m0->cont()) )
+	  {
+	    GERROR("Unable to convert and return GADGET_MESSAGE_ISMRMRD_IMAGE\n");
+	    m0->release();
+	    return GADGET_FAIL;
+	  }
+	break;
+      case (ISMRMRD::ISMRMRD_FLOAT):
+	if (0 != this->return_data<ISMRMRD::ImageHeader, hoNDArray< float >, ISMRMRD::MetaContainer >(m0->cont()) )
+	  {
+	    GERROR("Unable to convert and return GADGET_MESSAGE_ISMRMRD_IMAGE\n");
+	    m0->release();
+	    return GADGET_FAIL;
+	  }
+	break;
+      case (ISMRMRD::ISMRMRD_DOUBLE):
+	if (0 != this->return_data<ISMRMRD::ImageHeader, hoNDArray< double >, ISMRMRD::MetaContainer >(m0->cont()) )
+	  {
+	    GERROR("Unable to convert and return GADGET_MESSAGE_ISMRMRD_IMAGE\n");
+	    m0->release();
+	    return GADGET_FAIL;
+	  }
+	break;
+      case (ISMRMRD::ISMRMRD_CXFLOAT):
+	if (0 != this->return_data<ISMRMRD::ImageHeader, hoNDArray< std::complex<float> >, ISMRMRD::MetaContainer >(m0->cont()) )
+	  {
+	    GERROR("Unable to convert and return GADGET_MESSAGE_ISMRMRD_IMAGE\n");
+	    m0->release();
+	    return GADGET_FAIL;
+	  }
+	break;
+	
+      case (ISMRMRD::ISMRMRD_CXDOUBLE):
+	if (0 != this->return_data<ISMRMRD::ImageHeader, hoNDArray< std::complex<double> >, ISMRMRD::MetaContainer >(m0->cont()) )
+	  {
+	    GERROR("Unable to convert and return GADGET_MESSAGE_ISMRMRD_IMAGE\n");
+	    m0->release();
+	    return GADGET_FAIL;
+	  }
+	break;
+      }
+      break;
+    case (GADGET_MESSAGE_CLOSE):
+      break;
+    default:
+      GERROR("Unsupported message ID (%d) encountered\n", m0->getObjectPtr()->id);
+      mb->release();
+      return GADGET_FAIL;
+    }
+    
+    mb->release();
+    return GADGET_OK;
+  }
+
+  void GadgetInstrumentationStreamController::set_parameter(const char* gadgetname, const char* parameter, const char* value)
+  {
+    Gadget* g = this->find_gadget(gadgetname);
+    if (!g) {
+      throw std::runtime_error("Unable to find Gadget for setting parameter");
+    }
+    g->set_parameter(parameter,value,false);
+  }
+
+
+  int GadgetInstrumentationStreamController::put_config(const char* config)
+  {
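+    // Copy the configuration XML (including the terminating null) into a message
+    // block flagged as GADGET_MESSAGE_CONFIG and hand it to the stream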
+    size_t l = std::strlen(config);
+    ACE_Message_Block* mb = new ACE_Message_Block(l+1);
+    memcpy(mb->wr_ptr(),config,l+1);
+    mb->wr_ptr(l+1);
+    mb->set_flags(Gadget::GADGET_MESSAGE_CONFIG);
+    if (stream_.put(mb) == -1) {
+      GERROR("Failed to put configuration on stream, too long wait, %d\n",  ACE_OS::last_error () ==  EWOULDBLOCK);
+      mb->release();
+      return GADGET_FAIL;
+    }
+    return GADGET_OK;
+  }
+
+  template<class TH, class TD>
+  int GadgetInstrumentationStreamController::put_data(TH header, boost::python::object arr, const char* meta)
+  {
+    GadgetContainerMessage< TH >* m1 = new GadgetContainerMessage< TH >;
+    memcpy(m1->getObjectPtr(), &header, sizeof(TH));
+
+    // this works because the python converter for hoNDArray<std::complex<float>>
+    // is registered in the python_toolbox
+    GadgetContainerMessage< hoNDArray< TD > >* m2;
+    m2 = new GadgetContainerMessage< hoNDArray< TD > >(
+            boost::python::extract<hoNDArray < TD > >(arr)());
+    m1->cont(m2);
+
+    if (meta) {
+      GadgetContainerMessage< ISMRMRD::MetaContainer >* m3 = 
+	new GadgetContainerMessage< ISMRMRD::MetaContainer >;
+      
+      ISMRMRD::deserialize(meta, *m3->getObjectPtr());
+      m2->cont(m3);
+    }
+
+
+    ACE_Time_Value wait = ACE_OS::gettimeofday() + ACE_Time_Value(0,10000); //10ms from now
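+    // Note: 'wait' is computed but not passed to put() below, so the call blocks with no timeout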
+    if (stream_.put(m1) == -1) {
+      GERROR("Failed to put stuff on stream, too long wait, %d\n",  ACE_OS::last_error () ==  EWOULDBLOCK);
+      m1->release();
+      return GADGET_FAIL;
+    }
+    return GADGET_OK;
+  }
+
+  int GadgetInstrumentationStreamController::put_acquisition(ISMRMRD::AcquisitionHeader acq, 
+							     boost::python::object arr, const char* meta)
+  {
+    return put_data<ISMRMRD::AcquisitionHeader, std::complex<float> >(acq, arr, meta);
+  }
+
+  int GadgetInstrumentationStreamController::put_image_cplx(ISMRMRD::ImageHeader img, 
+							    boost::python::object arr, const char* meta)
+  {
+    return put_data<ISMRMRD::ImageHeader, std::complex<float> >(img, arr, meta);
+  }
+
+  int GadgetInstrumentationStreamController::put_image_float(ISMRMRD::ImageHeader img, 
+							    boost::python::object arr, const char* meta)
+  {
+    return put_data<ISMRMRD::ImageHeader, float >(img, arr, meta);
+  }
+
+  int GadgetInstrumentationStreamController::put_image_ushort(ISMRMRD::ImageHeader img, 
+							    boost::python::object arr, const char* meta)
+  {
+    return put_data<ISMRMRD::ImageHeader, unsigned short >(img, arr, meta);
+  }
+
+}
diff --git a/gadgets/python/GadgetInstrumentationStreamController.h b/gadgets/python/GadgetInstrumentationStreamController.h
new file mode 100644
index 0000000..04f4ca0
--- /dev/null
+++ b/gadgets/python/GadgetInstrumentationStreamController.h
@@ -0,0 +1,144 @@
+#ifndef GADGETINSTRUMENTATIONSTREAMCONTROLLER_H
+#define GADGETINSTRUMENTATIONSTREAMCONTROLLER_H
+
+#include "GadgetStreamInterface.h"
+#include "Gadget.h"
+#include <ismrmrd/ismrmrd.h>
+#include "python_toolbox.h"
+#include <boost/python.hpp>
+
+namespace Gadgetron{
+
+
+
+class GadgetInstrumentationStreamController 
+  : public GadgetStreamInterface
+{
+public:
+  GadgetInstrumentationStreamController();
+  int open();
+  int close();
+  int prepend_gadget(const char* gadgetname, 
+		    const char* dllname, 
+		    const char* classname);
+
+  virtual ~GadgetInstrumentationStreamController();
+
+  template<class TH, class TD> int put_data(TH header, boost::python::object arr, const char* meta = 0);
+  int put_config(const char* config);
+  int put_acquisition(ISMRMRD::AcquisitionHeader acq, boost::python::object arr, const char* meta = 0);
+  int put_image_cplx(ISMRMRD::ImageHeader img, boost::python::object arr, const char* meta = 0);
+  int put_image_float(ISMRMRD::ImageHeader img, boost::python::object arr, const char* meta = 0);
+  int put_image_ushort(ISMRMRD::ImageHeader img, boost::python::object arr, const char* meta = 0);
+  int set_python_gadget(boost::python::object g)
+  {
+    python_gadget_ = g;
+    boost::python::incref(python_gadget_.ptr());
+    return GADGET_OK;
+  }
+
+  virtual int output_ready(ACE_Message_Block* mb);
+  void set_parameter(const char* gadgetname, const char* parameter, const char* value);
+
+ protected:
+  boost::python::object python_gadget_;
+  template <class T1, class T2, class T3> int return_data(ACE_Message_Block* mb);
+};
+
+class GadgetInstrumentationStreamControllerWrapper
+{
+ public:
+  GadgetInstrumentationStreamControllerWrapper() 
+    {
+      // ensure boost can convert between hoNDArrays and NumPy arrays automatically
+      register_converter<hoNDArray<std::complex<float> > >();
+      register_converter<hoNDArray< float > >();
+      register_converter<hoNDArray< unsigned short > >();
+      // ensure boost can convert ISMRMRD headers automatically
+      register_converter<ISMRMRD::ImageHeader>();
+      register_converter<ISMRMRD::AcquisitionHeader>();
+
+      cntrl_ = new GadgetInstrumentationStreamController;
+    }
+
+  ~GadgetInstrumentationStreamControllerWrapper()
+    {
+      delete cntrl_;
+    }
+
+  int prepend_gadget(const char* gadgetname, 
+		    const char* dllname, 
+		    const char* classname)
+  {
+    return cntrl_->prepend_gadget(gadgetname,dllname,classname);
+  }
+
+  int put_config(const char* config)
+  {
+    return cntrl_->put_config(config);
+  }
+
+  int put_acquisition(ISMRMRD::AcquisitionHeader acq, boost::python::object arr)
+  {
+    return cntrl_->put_acquisition(acq, arr);
+  }
+
+
+  int put_image_cplx(ISMRMRD::ImageHeader img, boost::python::object arr)
+  {
+    return cntrl_->put_image_cplx(img,arr);
+  }
+
+  int put_image_cplx_attr(ISMRMRD::ImageHeader img, boost::python::object arr, const char* meta = 0)
+  {
+    return cntrl_->put_image_cplx(img,arr, meta);
+  }
+
+  int put_image_float(ISMRMRD::ImageHeader img, boost::python::object arr)
+  {
+    return cntrl_->put_image_float(img,arr);
+  }
+
+  int put_image_float_attr(ISMRMRD::ImageHeader img, boost::python::object arr, const char* meta = 0)
+  {
+    return cntrl_->put_image_float(img,arr, meta);
+  }
+
+  int put_image_ushort(ISMRMRD::ImageHeader img, boost::python::object arr)
+  {
+    return cntrl_->put_image_ushort(img,arr);
+  }
+
+  int put_image_ushort_attr(ISMRMRD::ImageHeader img, boost::python::object arr, const char* meta = 0)
+  {
+    return cntrl_->put_image_ushort(img,arr, meta);
+  }
+
+
+  int close()
+  {
+    // allow other threads to finish returning data to Python
+    Py_BEGIN_ALLOW_THREADS;
+    cntrl_->close();
+    Py_END_ALLOW_THREADS;
+    return 0;
+  }
+
+  int set_python_gadget(boost::python::object g)
+  {
+    return cntrl_->set_python_gadget(g);
+  }
+
+  void set_parameter(const char* gadgetname, const char* parameter, const char* value)
+  {
+    cntrl_->set_parameter(gadgetname, parameter, value);
+  }
+
+ protected:
+  GadgetInstrumentationStreamController* cntrl_;
+  
+
+};
+
+}
+#endif //GADGETINSTRUMENTATIONSTREAMCONTROLLER_H
diff --git a/gadgets/python/GadgetReference.cpp b/gadgets/python/GadgetReference.cpp
new file mode 100644
index 0000000..0dcb93d
--- /dev/null
+++ b/gadgets/python/GadgetReference.cpp
@@ -0,0 +1,109 @@
+#include "Gadget.h"
+#include "GadgetReference.h"
+#include "GadgetContainerMessage.h"
+#include "hoNDArray.h"
+#include <ismrmrd/ismrmrd.h>
+#include <ismrmrd/meta.h>
+
+/* #include <boost/preprocessor/stringize.hpp> */
+#include <boost/python.hpp>
+
+namespace Gadgetron{
+
+  GadgetReference::GadgetReference()
+    : gadget_(nullptr)
+  {
+  }
+
+  GadgetReference::~GadgetReference()
+  {
+  }
+
+  template<class TH, class TD>
+  int GadgetReference::return_data(TH header, boost::python::object arr, const char* meta)
+  {
+    GadgetContainerMessage< TH >* m1 = new GadgetContainerMessage< TH >;
+    memcpy(m1->getObjectPtr(), &header, sizeof(TH));
+
+    // this works because the python converter for hoNDArray<std::complex<float>>
+    // is registered in the python_toolbox
+    GadgetContainerMessage< hoNDArray< TD > >* m2;
+    m2 = new GadgetContainerMessage< hoNDArray< TD > >(
+	    boost::python::extract<hoNDArray < TD > >(arr)());
+    m1->cont(m2);
+
+    if (meta) {
+      GadgetContainerMessage< ISMRMRD::MetaContainer >* m3 = 
+	new GadgetContainerMessage< ISMRMRD::MetaContainer >;
+      
+      ISMRMRD::deserialize(meta, *m3->getObjectPtr());
+      m2->cont(m3);
+    }
+
+    if (gadget_) {
+      //ACE_Time_Value wait = ACE_OS::gettimeofday() + ACE_Time_Value(0,1000); //1ms from now
+      ACE_Time_Value nowait (ACE_OS::gettimeofday ());
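+      // A deadline of "now" makes the enqueue fail immediately if the downstream queue
+      // is full; the message is then released rather than blocking the caller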
+      //GDEBUG("Returning data (%s)\n", gadget_->module()->name());
+      if (gadget_->next()->putq(m1,&nowait) == -1) {
+	m1->release();
+	//if (gadget_->next()->putq(m1) == -1) {
+	/*
+	  GDEBUG("Putting message on Queue failed (%s)\n", gadget_->module()->name());
+	  GDEBUG("Message Q: low mark %d, high mark %d, message bytes %d, message count %d\n",
+	  gadget_->next()->msg_queue()->low_water_mark(), gadget_->next()->msg_queue()->high_water_mark(),
+	  gadget_->next()->msg_queue()->message_bytes(),gadget_->next()->msg_queue()->message_count());
+	*/
+	//GDEBUG("FAIL Returning data (%s)\n", gadget_->module()->name());
+	return GADGET_FAIL;
+      } else {
+	//GDEBUG("SUCCESS Returning data (%s)\n", gadget_->module()->name());
+
+	return GADGET_OK;
+      }
+      //return gadget_->next()->putq(m1);
+    } else {
+      GDEBUG("Data received from python, but no Gadget registered for output\n");
+      m1->release();
+      return GADGET_OK;
+    }
+
+    return GADGET_OK;
+  }
+
+  int GadgetReference::return_acquisition(ISMRMRD::AcquisitionHeader acq, boost::python::object arr)
+  {
+    return return_data<ISMRMRD::AcquisitionHeader, std::complex<float> >(acq, arr, 0);
+  }
+
+  int GadgetReference::return_image_cplx(ISMRMRD::ImageHeader img, boost::python::object arr)
+  {
+    return return_data<ISMRMRD::ImageHeader, std::complex<float> >(img, arr, 0);
+  }
+
+  int GadgetReference::return_image_cplx_attr(ISMRMRD::ImageHeader img, boost::python::object arr, const char* meta)
+  {
+    return return_data<ISMRMRD::ImageHeader, std::complex<float> >(img, arr, meta);
+  }
+
+
+  int GadgetReference::return_image_float(ISMRMRD::ImageHeader img, boost::python::object arr)
+  {
+    return return_data<ISMRMRD::ImageHeader, float>(img, arr, 0);
+  }
+
+  int GadgetReference::return_image_float_attr(ISMRMRD::ImageHeader img, boost::python::object arr, const char* meta)
+  {
+    return return_data<ISMRMRD::ImageHeader, float>(img, arr, meta);
+  }
+
+  int GadgetReference::return_image_ushort(ISMRMRD::ImageHeader img, boost::python::object arr)
+  {
+    return return_data<ISMRMRD::ImageHeader, unsigned short>(img, arr, 0);
+  }
+
+  int GadgetReference::return_image_ushort_attr(ISMRMRD::ImageHeader img, boost::python::object arr, const char* meta)
+  {
+    return return_data<ISMRMRD::ImageHeader, unsigned short>(img, arr, meta);
+  }
+
+}
diff --git a/gadgets/python/GadgetReference.h b/gadgets/python/GadgetReference.h
new file mode 100644
index 0000000..511faa2
--- /dev/null
+++ b/gadgets/python/GadgetReference.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include "Gadget.h"
+#include "gadgetronpython_export.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <boost/python.hpp>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETSPYTHON GadgetReference
+  {
+
+  public:
+    GadgetReference();
+    ~GadgetReference();
+
+    int set_gadget(Gadget* g)
+    {
+      gadget_ = g;
+      return 0;
+    }
+
+    template<class TH, class TD> int return_data(TH header, boost::python::object arr, const char* meta = 0);
+    int return_acquisition(ISMRMRD::AcquisitionHeader acq, boost::python::object arr);
+    int return_image_cplx(ISMRMRD::ImageHeader img, boost::python::object arr);
+    int return_image_cplx_attr(ISMRMRD::ImageHeader img, boost::python::object arr, const char* meta);
+    int return_image_float(ISMRMRD::ImageHeader img, boost::python::object arr);
+    int return_image_float_attr(ISMRMRD::ImageHeader img, boost::python::object arr, const char* meta);
+    int return_image_ushort(ISMRMRD::ImageHeader img, boost::python::object arr);
+    int return_image_ushort_attr(ISMRMRD::ImageHeader img, boost::python::object arr, const char* meta);
+
+  protected:
+    Gadget* gadget_;
+  };
+}
diff --git a/gadgets/python/GadgetronPythonMRI.cpp b/gadgets/python/GadgetronPythonMRI.cpp
new file mode 100644
index 0000000..295e8f5
--- /dev/null
+++ b/gadgets/python/GadgetronPythonMRI.cpp
@@ -0,0 +1,37 @@
+#include "GadgetReference.h"
+#include "GadgetInstrumentationStreamController.h"
+#include <boost/python.hpp>
+#include "../mri_core/GadgetMRIHeaders.h"
+#include <ismrmrd/ismrmrd.h>
+
+using namespace boost::python;
+
+BOOST_PYTHON_MODULE(GadgetronPythonMRI)
+{
+    boost::python::numeric::array::set_module_and_type("numpy", "ndarray");
+
+    class_<Gadgetron::GadgetReference>("GadgetReference")
+      .def("return_acquisition", &Gadgetron::GadgetReference::return_acquisition)
+      .def("return_image_cplx", &Gadgetron::GadgetReference::return_image_cplx)
+      .def("return_image_cplx_attr", &Gadgetron::GadgetReference::return_image_cplx_attr)
+      .def("return_image_float", &Gadgetron::GadgetReference::return_image_float)
+      .def("return_image_float_attr", &Gadgetron::GadgetReference::return_image_float_attr)
+      .def("return_image_ushort", &Gadgetron::GadgetReference::return_image_ushort)
+      .def("return_image_ushort_attr", &Gadgetron::GadgetReference::return_image_ushort_attr)
+      ;
+
+    class_<Gadgetron::GadgetInstrumentationStreamControllerWrapper>("GadgetInstrumentationStreamController")
+      .def("put_config", &Gadgetron::GadgetInstrumentationStreamControllerWrapper::put_config)
+      .def("put_acquisition", &Gadgetron::GadgetInstrumentationStreamControllerWrapper::put_acquisition)
+      .def("put_image_cplx", &Gadgetron::GadgetInstrumentationStreamControllerWrapper::put_image_cplx)
+      .def("put_image_cplx_attr", &Gadgetron::GadgetInstrumentationStreamControllerWrapper::put_image_cplx_attr)
+      .def("put_image_float", &Gadgetron::GadgetInstrumentationStreamControllerWrapper::put_image_float)
+      .def("put_image_float_attr", &Gadgetron::GadgetInstrumentationStreamControllerWrapper::put_image_float_attr)
+      .def("put_image_ushort", &Gadgetron::GadgetInstrumentationStreamControllerWrapper::put_image_ushort)
+      .def("put_image_ushort_attr", &Gadgetron::GadgetInstrumentationStreamControllerWrapper::put_image_ushort_attr)
+      .def("prepend_gadget", &Gadgetron::GadgetInstrumentationStreamControllerWrapper::prepend_gadget)
+      .def("close", &Gadgetron::GadgetInstrumentationStreamControllerWrapper::close)
+      .def("set_python_gadget", &Gadgetron::GadgetInstrumentationStreamControllerWrapper::set_python_gadget)
+      .def("set_parameter", &Gadgetron::GadgetInstrumentationStreamControllerWrapper::set_parameter)
+      ;    
+}
diff --git a/gadgets/python/PythonGadget.cpp b/gadgets/python/PythonGadget.cpp
new file mode 100644
index 0000000..fc96640
--- /dev/null
+++ b/gadgets/python/PythonGadget.cpp
@@ -0,0 +1,71 @@
+#include "PythonGadget.h"
+
+namespace Gadgetron{
+  int PythonGadget::process(ACE_Message_Block* mb)
+  {
+    GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* hma = AsContainerMessage<ISMRMRD::AcquisitionHeader>(mb);
+    if (hma) {
+      GadgetContainerMessage< hoNDArray< std::complex<float> > >* dmb = AsContainerMessage< hoNDArray< std::complex<float> > >(hma->cont());
+      if (!dmb) {
+	GERROR("Error converting data array from message block for Acquisition\n");
+	hma->release();
+	return GADGET_FAIL;
+      }
+      GadgetContainerMessage< ISMRMRD::MetaContainer>* mmb = AsContainerMessage< ISMRMRD::MetaContainer >(dmb->cont());
+      return this->process(hma,dmb,mmb);
+    } else {
+      GadgetContainerMessage<ISMRMRD::ImageHeader>* hmi = AsContainerMessage<ISMRMRD::ImageHeader>(mb);
+      if (!hmi) {
+	if (pass_on_undesired_data.value()) {
+	  return this->next()->putq(mb);
+	} else {
+	  GERROR("This is neither an acquisition nor an image. Something is wrong here\n");
+	  mb->release();
+	  return GADGET_FAIL;
+	}
+      }
+      
+      ISMRMRD::ImageHeader* h = hmi->getObjectPtr();
+      GadgetContainerMessage< ISMRMRD::MetaContainer>* mmb = 0;
+
+      if (hmi->cont()) {
+	mmb = AsContainerMessage< ISMRMRD::MetaContainer >(hmi->cont()->cont());
+      }
+
+      switch (h->data_type) {	
+      case (ISMRMRD::ISMRMRD_USHORT):
+	return this->process(hmi, AsContainerMessage< hoNDArray< uint16_t > >(hmi->cont()), mmb);
+	break;
+      case (ISMRMRD::ISMRMRD_SHORT):
+	return this->process(hmi, AsContainerMessage< hoNDArray< int16_t > >(hmi->cont()), mmb);
+	break;
+      case (ISMRMRD::ISMRMRD_UINT):
+	return this->process(hmi, AsContainerMessage< hoNDArray< uint32_t > >(hmi->cont()), mmb);
+	break;
+      case (ISMRMRD::ISMRMRD_INT):
+	return this->process(hmi, AsContainerMessage< hoNDArray< int32_t > >(hmi->cont()), mmb);
+	break;
+      case (ISMRMRD::ISMRMRD_FLOAT):
+	return this->process(hmi, AsContainerMessage< hoNDArray< float > >(hmi->cont()), mmb);
+	break;
+      case (ISMRMRD::ISMRMRD_DOUBLE):
+	return this->process(hmi, AsContainerMessage< hoNDArray< double > >(hmi->cont()), mmb);
+	break;
+      case (ISMRMRD::ISMRMRD_CXFLOAT):
+	return this->process(hmi, AsContainerMessage< hoNDArray< std::complex<float> > >(hmi->cont()), mmb);
+	break;
+      case (ISMRMRD::ISMRMRD_CXDOUBLE):
+	return this->process(hmi, AsContainerMessage< hoNDArray< std::complex<double> > >(hmi->cont()), mmb);
+	break;
+      default:
+	GERROR("Unknown image data_type %d received\n", h->data_type);
+	hmi->release();
+	return GADGET_FAIL;
+	break;
+      }
+    }
+    return GADGET_OK;
+  }
+  
+  GADGET_FACTORY_DECLARE(PythonGadget)
+}
diff --git a/gadgets/python/PythonGadget.h b/gadgets/python/PythonGadget.h
new file mode 100644
index 0000000..69a3e38
--- /dev/null
+++ b/gadgets/python/PythonGadget.h
@@ -0,0 +1,209 @@
+#pragma once
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "GadgetReference.h"
+#include "gadgetronpython_export.h"
+#include "python_toolbox.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <ismrmrd/meta.h>
+#include <boost/python.hpp>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETSPYTHON PythonGadget : public BasicPropertyGadget
+    {
+    public:
+      GADGET_DECLARE(PythonGadget);
+
+      /*
+	We override this function from the base class to be able to capture a copy
+	of the parameters that should be passed on to the Python class itself.
+       */
+      virtual int set_parameter(const char* name, const char* val, bool trigger = true) {
+	GadgetPropertyBase* p = this->find_property(name);
+	if (p) {
+	  //This is a property, pass it on to the Gadget base class
+	  return Gadget::set_parameter(name,val,trigger);
+	} else {
+	  //This is probably information for the Python class itself
+	  this->parameters_python_[std::string(name)] = std::string(val);
+	}
+	return GADGET_OK;
+      }
+
+    protected:
+      int process_config(ACE_Message_Block* mb)
+      {
+          if (initialize_python() != GADGET_OK) {
+            GDEBUG("Failed to initialize Python in Gadget %s\n", this->module()->name());
+            return GADGET_FAIL;
+          }
+
+          // ensure boost can convert between hoNDArrays and NumPy arrays automatically
+          register_converter<hoNDArray< std::complex<float> > >();
+          register_converter<hoNDArray< float > >();
+          register_converter<hoNDArray< unsigned short > >();
+
+          // ensure boost can convert ISMRMRD headers automatically
+          register_converter<ISMRMRD::ImageHeader>();
+          register_converter<ISMRMRD::AcquisitionHeader>();
+
+	  std::string pypath        = python_path.value();
+	  std::string pymod         = python_module.value();
+	  std::string pyclass       = python_class.value();
+
+          GDEBUG("Python Path            : %s\n", pypath.c_str());
+          GDEBUG("Python Module          : %s\n", pymod.c_str());
+          GDEBUG("Python Class           : %s\n", pyclass.c_str());
+
+        if (add_python_path(pypath) != GADGET_OK) {
+            GDEBUG("Failed to add paths in Gadget %s\n", this->module()->name());
+            return GADGET_FAIL;
+        }
+
+        std::string module_name = pymod;
+        std::string class_name = pyclass;
+
+        if (module_name.size() == 0) {
+            GDEBUG("Null module name received in Gadget %s\n", this->module()->name());
+            return GADGET_FAIL;
+        }
+        if (class_name.size() == 0) {
+            GDEBUG("Null class name received in Gadget %s\n", this->module()->name());
+            return GADGET_FAIL;
+        }
+
+        GILLock lock;
+        try {
+            module_ = boost::python::import(module_name.c_str());
+
+            // Reload the module so changes take place at Gadgetron runtime
+            boost::python::import("__main__").attr("__dict__")[module_name.c_str()] = module_;
+            std::string tmp = std::string("reload(") + std::string(module_name.c_str()) + std::string(")\n");
+
+            //GDEBUG("Reloading with command: %s\n", tmp.c_str());
+            boost::python::exec(tmp.c_str(), boost::python::import("__main__").attr("__dict__"));
+
+            gadget_ref_ = boost::shared_ptr<GadgetReference>(new GadgetReference());
+            gadget_ref_->set_gadget(this);
+
+            // Create instance of class (passing gadget reference)
+            class_ = module_.attr(class_name.c_str())(gadget_ref_.get());
+            // Increment reference count of Python class so that both the C++
+            // destructor and the interpreter can decrement its reference count
+            boost::python::incref(class_.ptr());
+
+        } catch (boost::python::error_already_set const &) {
+            GDEBUG("Error loading python modules in Gadget %s\n", this->module()->name());
+            PyErr_Print();
+            return GADGET_FAIL;
+        }
+
+	//Transfer all properties/parameters to Python gadget
+	std::map<std::string,std::string>::iterator it;
+	it = parameters_python_.begin();
+	while (it != parameters_python_.end()) {
+	  std::string var_name = it->first;
+	  std::string var_val  = it->second;
+	  try {
+	    boost::python::object set_parameter_fn = class_.attr("set_parameter");
+	    boost::python::object ignored = set_parameter_fn(var_name,var_val);
+	  } catch (boost::python::error_already_set const &) {
+            GERROR("Error setting PythonGadget parameters in Gadget %s\n", this->module()->name());
+            PyErr_Print();
+            return GADGET_FAIL;
+	  }
+	  it++;
+	}
+
+        try {
+            // retrieve and call python gadget's process_config method
+            boost::python::object process_config_fn = class_.attr("process_config");
+            boost::python::object ignored = process_config_fn(
+                    boost::python::object(std::string(mb->rd_ptr())));
+        } catch (boost::python::error_already_set const &) {
+            GERROR("Error calling process_config in Gadget %s\n", this->module()->name());
+            PyErr_Print();
+            return GADGET_FAIL;
+        }
+
+        return GADGET_OK;
+      }
+
+      template <typename H, typename D> int process(GadgetContainerMessage<H>* hmb,
+						    GadgetContainerMessage< hoNDArray< D > >* dmb,
+						    GadgetContainerMessage< ISMRMRD::MetaContainer>* mmb)
+      {
+	if (!dmb) {
+	  GERROR("Received null pointer to data block");
+	  return GADGET_FAIL;
+	}
+	
+	// We want to avoid a deadlock for the Python GIL if this python call
+	// results in an output that the GadgetReference will not be able to
+	// get rid of.
+	// This is kind of a nasty busy wait, maybe we should add an event
+	// handler to the NotificationStrategy of the Q or something, but
+	// for now, this will do it.
+	while (this->next()->msg_queue()->is_full()) {
+	  // GDEBUG("Gadget (%s) sleeping while downstream Gadget (%s) does some work\n",
+	  //        this->module()->name(), this->next()->module()->name());
+	  // Sleep for 10ms while the downstream Gadget does some work
+	  ACE_Time_Value tv(0,10000);
+	  ACE_OS::sleep(tv);
+	}
+	
+	H head = *hmb->getObjectPtr();
+	hoNDArray< D > *data = dmb->getObjectPtr();
+	ISMRMRD::MetaContainer* meta = 0;
+	if (mmb) {
+	  meta = mmb->getObjectPtr();
+	}
+	
+	GILLock lock;
+	try {
+	  boost::python::object process_fn = class_.attr("process");
+	  int res;
+	  if (meta) {
+	    std::stringstream str;
+	    ISMRMRD::serialize(*meta, str);
+	    res = boost::python::extract<int>(process_fn(head, data, str.str()));
+	  } else {
+	    res = boost::python::extract<int>(process_fn(head, data));
+	  }
+	  if (res != GADGET_OK) {
+	    GDEBUG("Gadget (%s) Returned from python call with error\n",
+		   this->module()->name());
+	    return GADGET_FAIL;
+	  }
+	  //Else we are done with this now.
+	  hmb->release();
+	} catch(boost::python::error_already_set const &) {
+	  GDEBUG("Passing data on to python module failed\n");
+	  PyErr_Print();
+	  return GADGET_FAIL;
+	}
+	return GADGET_OK;
+      }
+
+      virtual int process(ACE_Message_Block* mb); 
+
+    protected:
+      GADGET_PROPERTY(python_module, std::string, "Python module containing the Python Gadget class to be loaded", "");
+      GADGET_PROPERTY(python_class, std::string, "Python class to load from python module", "");
+      GADGET_PROPERTY(python_path, std::string, "Path(s) to add to the Python search path", "");
+
+    private:
+      boost::python::object module_;
+      boost::python::object class_;
+      boost::shared_ptr<GadgetReference> gadget_ref_;
+
+      /*
+	We are going to keep a copy of the parameters in this gadget that are not properties.
+	They should be passed on to the Python class. 
+      */
+      std::map< std::string, std::string> parameters_python_; 
+    };
+}
diff --git a/gadgets/python/config/CMakeLists.txt b/gadgets/python/config/CMakeLists.txt
new file mode 100644
index 0000000..baff67e
--- /dev/null
+++ b/gadgets/python/config/CMakeLists.txt
@@ -0,0 +1,2 @@
+install(FILES python.xml python_short.xml python_tpat_snr_scale.xml
+  DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH} COMPONENT main)
diff --git a/gadgets/python/config/python.xml b/gadgets/python/config/python.xml
new file mode 100644
index 0000000..204173e
--- /dev/null
+++ b/gadgets/python/config/python.xml
@@ -0,0 +1,66 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>RemoveOversamplingPython</name>
+      <dll>gadgetron_python</dll>
+      <classname>PythonGadget</classname>
+      <property><name>python_path</name>                  <value>/home/myuser/scripts/python</value></property>
+      <property><name>python_module</name>                <value>remove_2x_oversampling</value></property>
+      <property><name>python_class</name>                <value>Remove2xOversampling</value></property>
+    </gadget>
+
+    <gadget>
+      <name>AccReconPython</name>
+      <dll>gadgetron_python</dll>
+      <classname>PythonGadget</classname>
+      <property><name>python_path</name>                  <value>/home/myuser/scripts/python</value></property>
+      <property><name>python_module</name>                <value>accumulate_and_recon</value></property>
+      <property><name>python_class</name>                <value>AccumulateAndRecon</value></property>
+    </gadget>
+
+    <gadget>
+      <name>CoilCombinePython</name>
+      <dll>gadgetron_python</dll>
+      <classname>PythonGadget</classname>
+      <property><name>python_path</name>                  <value>/home/myuser/scripts/python</value></property>
+      <property><name>python_module</name>                <value>rms_coil_combine</value></property>
+      <property><name>python_class</name>                <value>RMSCoilCombine</value></property>
+    </gadget>
+
+    <gadget>
+      <name>ImageViewPython</name>
+      <dll>gadgetron_python</dll>
+      <classname>PythonGadget</classname>
+      <property><name>python_path</name>                  <value>/home/myuser/scripts/python</value></property>
+      <property><name>python_module</name>                <value>image_viewer</value></property>
+      <property><name>python_class</name>                <value>ImageViewer</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+     </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/python/config/python_short.xml b/gadgets/python/config/python_short.xml
new file mode 100644
index 0000000..1b3a191
--- /dev/null
+++ b/gadgets/python/config/python_short.xml
@@ -0,0 +1,69 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>RemoveOversamplingPython</name>
+      <dll>gadgetron_python</dll>
+      <classname>PythonGadget</classname>
+      <property><name>python_path</name>                  <value>/home/myuser/scripts/python</value></property>
+      <property><name>python_module</name>                <value>remove_2x_oversampling</value></property>
+      <property><name>python_class</name>                <value>Remove2xOversampling</value></property>
+    </gadget>
+
+    <gadget>
+      <name>AccReconPython</name>
+      <dll>gadgetron_python</dll>
+      <classname>PythonGadget</classname>
+      <property><name>python_path</name>                  <value>/home/myuser/scripts/python</value></property>
+      <property><name>python_module</name>                <value>accumulate_and_recon</value></property>
+      <property><name>python_class</name>                <value>AccumulateAndRecon</value></property>
+    </gadget>
+
+    <gadget>
+      <name>CoilCombinePython</name>
+      <dll>gadgetron_python</dll>
+      <classname>PythonGadget</classname>
+      <property><name>python_path</name>                  <value>/home/myuser/scripts/python</value></property>
+      <property><name>python_module</name>                <value>rms_coil_combine</value></property>
+      <property><name>python_class</name>                <value>RMSCoilCombine</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+     </gadget>
+
+     <gadget>
+      <name>Autoscale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+     </gadget>
+    
+     <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+  
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/python/config/python_tpat_snr_scale.xml b/gadgets/python/config/python_tpat_snr_scale.xml
new file mode 100644
index 0000000..05c1b87
--- /dev/null
+++ b/gadgets/python/config/python_tpat_snr_scale.xml
@@ -0,0 +1,71 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdj</name>
+      <dll>gadgetron_python</dll>
+      <classname>PythonGadget</classname>
+      <property><name>python_module</name>                <value>tpat_snr_scale</value></property>
+      <property><name>python_class</name>                <value>NoiseAdj</value></property>
+    </gadget>
+
+    <gadget>
+      <name>RemOS</name>
+      <dll>gadgetron_python</dll>
+      <classname>PythonGadget</classname>
+      <property><name>python_module</name>                <value>tpat_snr_scale</value></property>
+      <property><name>python_class</name>                <value>RemOS</value></property>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_python</dll>
+      <classname>PythonGadget</classname>
+      <property><name>python_module</name>                <value>tpat_snr_scale</value></property>
+      <property><name>python_class</name>                <value>PCA</value></property>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduce</name>
+      <dll>gadgetron_python</dll>
+      <classname>PythonGadget</classname>
+      <property><name>python_module</name>                <value>tpat_snr_scale</value></property>
+      <property><name>python_class</name>                <value>CoilReduce</value></property>
+    </gadget>
+
+    <gadget>
+      <name>Recon</name>
+      <dll>gadgetron_python</dll>
+      <classname>PythonGadget</classname>
+      <property><name>python_module</name>                <value>tpat_snr_scale</value></property>
+      <property><name>python_class</name>                 <value>Recon</value></property>
+      <property><name>pmri_method</name>                   <value>grappa</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+     </gadget>
+  
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/python/examples/mixed_gadgets.py b/gadgets/python/examples/mixed_gadgets.py
new file mode 100644
index 0000000..68c49cd
--- /dev/null
+++ b/gadgets/python/examples/mixed_gadgets.py
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Mar 10 11:11:39 2015
+
+ at author: Michael S. Hansen
+"""
+
+
+#%% imports
+import os
+import sys
+
+sys.path.append(os.environ['GADGETRON_HOME'] + '/share/gadgetron/python')
+
+import ismrmrd
+import ismrmrd.xsd
+import numpy as np
+from ismrmrdtools import show
+from remove_2x_oversampling import Remove2xOversampling
+from accumulate_and_recon import AccumulateAndRecon
+from rms_coil_combine import RMSCoilCombine
+from gadgetron import WrapperGadget
+import GadgetronPythonMRI as g
+import threading
+import time
+
+#%% Setup gadgets
+g4 = WrapperGadget("gadgetron_mricore","ExtractGadget", next_gadget=None)
+g3 = RMSCoilCombine(g4)
+g2 = AccumulateAndRecon(g3)
+g1 = Remove2xOversampling(g2)
+g0 = WrapperGadget("gadgetron_mricore","NoiseAdjustGadget",next_gadget=g1)
+
+
+def gadget_wait_function(first_gadget):
+    g = first_gadget;
+    while (g):
+        g.wait()
+        g = g.next_gadget
+
+def gadget_config(first_gadget, conf):
+    g = first_gadget;
+    while (g):
+        g.process_config(conf)
+        g = g.next_gadget
+    
+
+#%% Load file
+filename = 'testdata.h5'
+if not os.path.isfile(filename):
+    print("%s is not a valid file" % filename)
+    raise SystemExit
+dset = ismrmrd.Dataset(filename, 'dataset', create_if_needed=False)
+
+#%% Send in data
+#First ISMRMRD XML header
+gadget_config(g0,dset.read_xml_header())
+
+# Loop through the rest of the acquisitions and stuff
+for acqnum in range(0,dset.number_of_acquisitions()):
+    acq = dset.read_acquisition(acqnum)
+    g0.process(acq.getHead(),acq.data.astype('complex64'))
+
+#%%
+gadget_wait_function(g0)
+
+print g4.get_results()
diff --git a/gadgets/python/examples/mixed_gadgets_gpu.py b/gadgets/python/examples/mixed_gadgets_gpu.py
new file mode 100644
index 0000000..589b915
--- /dev/null
+++ b/gadgets/python/examples/mixed_gadgets_gpu.py
@@ -0,0 +1,181 @@
+#%% imports
+import os
+import sys
+
+sys.path.append(os.environ['GADGETRON_HOME'] + '/share/gadgetron/python')
+
+import ismrmrd
+import ismrmrd.xsd
+import numpy as np
+from ismrmrdtools import show
+from gadgetron import WrapperGadget
+import GadgetronPythonMRI as g
+
+
+  # <gadget>
+  #   <name>NoiseAdjust</name>
+  #   <dll>gadgetron_mricore</dll>
+  #   <classname>NoiseAdjustGadget</classname>
+  # </gadget>
+
+g1 = WrapperGadget("gadgetron_mricore","NoiseAdjustGadget")
+
+  
+  # <gadget>
+  #   <name>PCA</name>
+  #   <dll>gadgetron_mricore</dll>
+  #   <classname>PCACoilGadget</classname>
+  # </gadget>
+  
+g2 = WrapperGadget("gadgetron_mricore","PCACoilGadget", next_gadget=None)
+g1.next_gadget = g2
+
+  # <gadget>
+  #   <name>CoilReduction</name>
+  #   <dll>gadgetron_mricore</dll>
+  #   <classname>CoilReductionGadget</classname>
+  #   <property><name>coils_out</name><value>16</value></property>
+  # </gadget>
+
+g3 = WrapperGadget("gadgetron_mricore","CoilReductionGadget", next_gadget=None)
+g3.set_parameter("CoilReductionGadget","coils_out","16");
+g2.next_gadget = g3
+
+  # <gadget>
+  #   <name>gpuSpiralSensePrepGadget</name>
+  #   <dll>gadgetron_spiral</dll>
+  #   <classname>gpuSpiralSensePrepGadget</classname>
+  #   <property><name>deviceno</name><value>0</value></property>
+  #   <property><name>use_multiframe_grouping</name><value>true</value></property>
+  #   <property><name>propagate_csm_from_set</name><value>0</value></property>
+  #   <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+  #   <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+  #   <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+  #   <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+  # </gadget>
+  
+  # <gadget>
+  #   <name>gpuCgSenseGadget</name>
+  #   <dll>gadgetron_gpuparallelmri</dll>
+  #   <classname>gpuCgSenseGadget</classname>
+  #   <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+  #   <property><name>deviceno</name>                <value>0</value></property>
+  #   <property><name>setno</name>                   <value>0</value></property>
+  #   <property><name>number_of_iterations</name>    <value>10</value></property>
+  #   <property><name>cg_limit</name>                <value>1e-6</value></property>
+  #   <property><name>oversampling_factor</name>     <value>1.25</value></property>
+  #   <property><name>kernel_width</name>            <value>5.5</value></property>
+  #   <property><name>kappa</name>                   <value>0.3</value></property>
+  # </gadget>
+
+  # <gadget>
+  #   <name>gpuCgSenseGadget</name>
+  #   <dll>gadgetron_gpuparallelmri</dll>
+  #   <classname>gpuCgSenseGadget</classname>
+  #   <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+  #   <property><name>deviceno</name>                <value>0</value></property>
+  #   <property><name>setno</name>                   <value>1</value></property>
+  #   <property><name>number_of_iterations</name>    <value>10</value></property>
+  #   <property><name>cg_limit</name>                <value>1e-6</value></property>
+  #   <property><name>oversampling_factor</name>     <value>1.25</value></property>
+  #   <property><name>kernel_width</name>            <value>5.5</value></property>
+  #   <property><name>kappa</name>                   <value>0.3</value></property>
+  # </gadget>
+
+g4 = WrapperGadget("gadgetron_gpuparallelmri","gpuCgSenseGadget",gadgetname="gpuCgSenseGadget1", next_gadget=None)
+g4.prepend_gadget("gadgetron_gpuparallelmri","gpuCgSenseGadget", gadgetname="gpuCgSenseGadget2")
+g4.prepend_gadget("gadgetron_spiral","gpuSpiralSensePrepGadget",gadgetname="gpuSpiralSensePrepGadget")
+
+g4.set_parameter("gpuSpiralSensePrepGadget","deviceno","0")
+g4.set_parameter("gpuSpiralSensePrepGadget","use_multiframe_grouping","true")
+g4.set_parameter("gpuSpiralSensePrepGadget","propagate_csm_from_set","0")
+g4.set_parameter("gpuSpiralSensePrepGadget","buffer_convolution_kernel_width","5.5")
+g4.set_parameter("gpuSpiralSensePrepGadget","buffer_convolution_oversampling_factor","1.25")
+g4.set_parameter("gpuSpiralSensePrepGadget","reconstruction_os_factor_x","1.5")
+g4.set_parameter("gpuSpiralSensePrepGadget","reconstruction_os_factor_y","1.5")
+
+g4.set_parameter("gpuCgSenseGadget1","pass_on_undesired_data","true")
+g4.set_parameter("gpuCgSenseGadget1","deviceno","0")
+g4.set_parameter("gpuCgSenseGadget1","setno","0")
+g4.set_parameter("gpuCgSenseGadget1","number_of_iterations","10")
+g4.set_parameter("gpuCgSenseGadget1","cg_limit","1e-6")
+g4.set_parameter("gpuCgSenseGadget1","oversampling_factor","1.25")
+g4.set_parameter("gpuCgSenseGadget1","kernel_width","5.5")
+g4.set_parameter("gpuCgSenseGadget1","kappa","0.3")
+
+g4.set_parameter("gpuCgSenseGadget2","pass_on_undesired_data","true")
+g4.set_parameter("gpuCgSenseGadget2","deviceno","0") #Think this should be "1"
+g4.set_parameter("gpuCgSenseGadget2","setno","1")
+g4.set_parameter("gpuCgSenseGadget2","number_of_iterations","10")
+g4.set_parameter("gpuCgSenseGadget2","cg_limit","1e-6")
+g4.set_parameter("gpuCgSenseGadget2","oversampling_factor","1.25")
+g4.set_parameter("gpuCgSenseGadget2","kernel_width","5.5")
+g4.set_parameter("gpuCgSenseGadget2","kappa","0.3")
+
+g3.next_gadget = g4
+
+  # <gadget>
+  #   <name>PhaseSubtraction</name>
+  #   <dll>gadgetron_mricore</dll>
+  #   <classname>FlowPhaseSubtractionGadget</classname>
+  # </gadget>
+  
+g5 = WrapperGadget("gadgetron_mricore","FlowPhaseSubtractionGadget", next_gadget=None)
+
+g4.next_gadget = g5
+  # <gadget>
+  #   <name>MaxwellCorrection</name>
+  #   <dll>gadgetron_mricore</dll>
+  #   <classname>MaxwellCorrectionGadget</classname>
+  # </gadget>
+
+g6 = WrapperGadget("gadgetron_mricore","MaxwellCorrectionGadget", next_gadget=None)
+
+g5.next_gadget = g6;
+  
+  # <gadget>
+  #   <name>Extract</name>
+  #   <dll>gadgetron_mricore</dll>
+  #   <classname>ExtractGadget</classname>
+  #   <property><name>extract_mask</name><value>9</value></property>
+  # </gadget>
+
+g7 = WrapperGadget("gadgetron_mricore","ExtractGadget", next_gadget=None)
+g7.set_parameter("ExtractGadget","extract_mask","9")
+
+g6.next_gadget = g7
+
+def gadget_wait_function(first_gadget):
+    g = first_gadget;
+    while (g):
+        g.wait()
+        g = g.next_gadget
+
+def gadget_config(first_gadget, conf):
+    g = first_gadget;
+    while (g):
+        g.process_config(conf)
+        g = g.next_gadget
+    
+
+#%% Load file
+filename = '/home/hansenms/temp/simple_spiral.h5'
+if not os.path.isfile(filename):
+    print("%s is not a valid file" % filename)
+    raise SystemExit
+dset = ismrmrd.Dataset(filename, 'dataset', create_if_needed=False)
+
+#%% Send in data
+#First ISMRMRD XML header
+gadget_config(g1,dset.read_xml_header())
+
+# Loop through the remaining acquisitions and send them through the chain
+for acqnum in range(0,dset.number_of_acquisitions()):
+    print "Sending in acquisition " + str(acqnum) + " of " + str(dset.number_of_acquisitions())
+    acq = dset.read_acquisition(acqnum)
+    g1.process(acq.getHead(),acq.data.astype('complex64'))
+
+# #%%
+gadget_wait_function(g1)
+
+print g7.get_results()
diff --git a/gadgets/python/examples/pure_python_demo.py b/gadgets/python/examples/pure_python_demo.py
new file mode 100644
index 0000000..7c81ef0
--- /dev/null
+++ b/gadgets/python/examples/pure_python_demo.py
@@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Mar  1 19:59:54 2015
+
+This is a simple example of using Python Gadgets in a standalone Python 
+environment. 
+
+In order to run this example, you need the ismrmrd-python API and also the 
+ismrmrd-python-tools toolbox. 
+
+These tools depend on h5py (version 2.3 or higher)
+
+To install the tools:
+
+ISMRMRD Python API:
+
+git clone https://github.com/ismrmrd/ismrmrd-python.git
+cd ismrmrd-python
+sudo python setup.py install
+
+ISMRMRD Python tools:
+
+git clone https://github.com/ismrmrd/ismrmrd-python-tools.git
+cd ismrmrd-python-tools
+sudo python setup.py install
+
+
+@author: Michael S. Hansen
+"""
+
+#%% imports
+import os
+import sys
+import ismrmrd
+import ismrmrd.xsd
+import numpy as np
+from ismrmrdtools import show
+from remove_2x_oversampling import Remove2xOversampling
+from accumulate_and_recon import AccumulateAndRecon
+from rms_coil_combine import RMSCoilCombine
+
+#%% Setup gadgets
+g3 = RMSCoilCombine()
+g2 = AccumulateAndRecon(g3)
+g1 = Remove2xOversampling(g2)
+
+
+#%% Load file
+filename = 'testdata.h5'
+if not os.path.isfile(filename):
+    print("%s is not a valid file" % filename)
+    raise SystemExit
+dset = ismrmrd.Dataset(filename, 'dataset', create_if_needed=False)
+
+#%% Send in data
+#First ISMRMRD XML header
+g1.process_config(dset.read_xml_header())
+g2.process_config(dset.read_xml_header())
+g3.process_config(dset.read_xml_header())
+
+# Loop through the remaining acquisitions and send them through the chain
+for acqnum in range(0,dset.number_of_acquisitions()):
+    acq = dset.read_acquisition(acqnum)
+    g1.process(acq.getHead(),acq.data)
+    
+#%%
+#Get result and display    
+res = g3.get_results()
+show.imshow(np.squeeze(abs(res[0][1])))
\ No newline at end of file
diff --git a/gadgets/python/gadgetronpython_export.h b/gadgets/python/gadgetronpython_export.h
new file mode 100644
index 0000000..b84050a
--- /dev/null
+++ b/gadgets/python/gadgetronpython_export.h
@@ -0,0 +1,14 @@
+#ifndef GADGETRONPYTHON_EXPORT_H_
+#define GADGETRONPYTHON_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_PYTHON__) || defined (gadgetronpython_EXPORTS)
+#define EXPORTGADGETSPYTHON __declspec(dllexport)
+#else
+#define EXPORTGADGETSPYTHON __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETSPYTHON
+#endif
+
+#endif /* GADGETRONPYTHON_EXPORT_H_ */
diff --git a/gadgets/python/gadgets/CMakeLists.txt b/gadgets/python/gadgets/CMakeLists.txt
new file mode 100644
index 0000000..92b5350
--- /dev/null
+++ b/gadgets/python/gadgets/CMakeLists.txt
@@ -0,0 +1,8 @@
+install(FILES
+    gadgetron.py
+    rms_coil_combine.py
+    remove_2x_oversampling.py
+    accumulate_and_recon.py
+    image_viewer.py
+    tpat_snr_scale.py
+    DESTINATION ${GADGETRON_INSTALL_PYTHON_MODULE_PATH} COMPONENT main)
diff --git a/gadgets/python/gadgets/accumulate_and_recon.py b/gadgets/python/gadgets/accumulate_and_recon.py
new file mode 100644
index 0000000..efe3eb6
--- /dev/null
+++ b/gadgets/python/gadgets/accumulate_and_recon.py
@@ -0,0 +1,64 @@
+import numpy as np
+from ismrmrdtools import transform
+from gadgetron import Gadget
+import ismrmrd
+import ismrmrd.xsd
+
+class AccumulateAndRecon(Gadget):
+    def __init__(self, next_gadget=None):
+        Gadget.__init__(self,next_gadget)
+        self.myBuffer = None
+        self.myCounter = 1
+        self.mySeries = 1
+        self.header = None
+        self.enc = None
+
+    def process_config(self, conf):
+        self.header = ismrmrd.xsd.CreateFromDocument(conf)
+        self.enc = self.header.encoding[0]
+
+    def process(self, acq, data,*args):
+        if self.myBuffer is None:
+            channels = acq.active_channels
+            if self.enc.encodingLimits.slice is not None:
+                nslices = self.enc.encodingLimits.slice.maximum + 1
+            else:
+                nslices = 1
+            eNz = self.enc.encodedSpace.matrixSize.z
+            eNy = self.enc.encodedSpace.matrixSize.y
+            eNx = self.enc.encodedSpace.matrixSize.x
+        
+            self.myBuffer = np.zeros((channels,nslices,eNz,eNy,eNx>>1),dtype=np.complex64)
+
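+        # Map kspace_encode_step_1 indices into the buffer so that the encoding
+        # centre lands in the middle row (handles asymmetric sampling).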
+        line_offset = self.enc.encodedSpace.matrixSize.y/2 - self.enc.encodingLimits.kspace_encoding_step_1.center             
+        self.myBuffer[:,acq.idx.slice,acq.idx.kspace_encode_step_2,acq.idx.kspace_encode_step_1+line_offset,:] = data
+
+        if (acq.flags & (1<<7)): #Is this the last scan in slice
+            image = transform.transform_kspace_to_image(self.myBuffer,dim=(2,3,4))
+            image = image * np.product(image.shape)*100 #Scaling for the scanner
+            #Create a new image header and transfer value
+            img_head = ismrmrd.ImageHeader()
+            img_head.channels = acq.active_channels
+            img_head.slice = acq.idx.slice
+            img_head.matrix_size[0] = self.myBuffer.shape[4]
+            img_head.matrix_size[1] = self.myBuffer.shape[3]
+            img_head.matrix_size[2] = self.myBuffer.shape[2]
+            img_head.position = acq.position
+            img_head.read_dir = acq.read_dir
+            img_head.phase_dir = acq.phase_dir
+            img_head.slice_dir = acq.slice_dir
+            img_head.patient_table_position = acq.patient_table_position
+            img_head.acquisition_time_stamp = acq.acquisition_time_stamp
+            img_head.image_index = self.myCounter
+            img_head.image_series_index = self.mySeries
+            img_head.data_type = ismrmrd.DATATYPE_CXFLOAT
+            self.myCounter += 1
+            if self.myCounter > 5:
+                    self.mySeries += 1
+                    self.myCounter = 1
+
+            #Return image to Gadgetron
+            self.put_next(img_head,image.astype('complex64'),*args)
+            
+        #print "Returning to Gadgetron"
+        return 0 #Everything OK
diff --git a/gadgets/python/gadgets/gadgetron.py b/gadgets/python/gadgets/gadgetron.py
new file mode 100644
index 0000000..9b6a57f
--- /dev/null
+++ b/gadgets/python/gadgets/gadgetron.py
@@ -0,0 +1,177 @@
+try:
+    import GadgetronPythonMRI
+    import ismrmrd
+except ImportError:
+    pass
+
+import time
+import numpy as np
+
+class Gadget(object):
+    def __init__(self, next_gadget=None):
+        self.next_gadget = next_gadget
+        self.params = dict()
+        self.results = []
+
+    def set_parameter(self, name, value):
+        self.params[name] = value
+
+    def get_parameter(self, name):
+        return self.params.get(name, None)
+
+    def __call__(self, *args):
+        self.process(args)
+        return self.get_results()
+
+    def set_next_gadget(self, gadget):
+        self.next_gadget = gadget
+
+    def process_config(self, conf):
+        pass
+
+    def process(self, header, *args):
+        # do work here
+        self.put_next(header,*args)
+
+    def wait(self):
+        pass
+    
+    def put_next(self, *args):
+        if self.next_gadget is not None:
+            if isinstance(self.next_gadget, Gadget):
+                if len(args) == 3 and not isinstance(args[2],ismrmrd.Meta): #Assume third argument is serialized meta attributes
+                    meta = ismrmrd.Meta.deserialize(args[2])
+                    new_args = (args[0], args[1], meta)
+                    self.next_gadget.process(*new_args)
+                else:
+                    self.next_gadget.process(*args)
+            elif isinstance(self.next_gadget, GadgetronPythonMRI.GadgetReference):
+                if len(args) > 3:
+                    raise Exception("Only two or 3 return arguments are currently supported when returning to Gadgetron framework")
+                if isinstance(args[0], ismrmrd.AcquisitionHeader):
+                    self.next_gadget.return_acquisition(args[0],args[1].astype('complex64'))
+                elif isinstance(args[0], ismrmrd.ImageHeader):
+                    header = args[0]
+                    if (args[1].dtype == np.uint16):
+                        if len(args) == 3:
+                            self.next_gadget.return_image_ushort_attr(header,args[1], args[2].serialize())
+                        else:
+                            self.next_gadget.return_image_ushort(header,args[1])
+                    elif (args[1].dtype == np.float32):
+                        if len(args) == 3:
+                            self.next_gadget.return_image_float_attr(header, args[1], args[2].serialize())
+                        else:
+                            self.next_gadget.return_image_float(header,args[1])
+                    else:
+                        if len(args) == 3:
+                            self.next_gadget.return_image_cplx_attr(header, args[1].astype('complex64'), args[2].serialize())
+                        else:
+                            self.next_gadget.return_image_cplx(header,args[1].astype('complex64'))
+                else:
+                    raise("Unsupported types when returning to Gadgetron framework")
+            else:
+                raise("next_gadget is set to unsupported type")
+        else:
+            self.results.append(list(args))
+
+    def get_results(self):
+        results = self.results
+        self.results = []
+        return results
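+# Minimal usage sketch (illustrative only; `header` stands for an
+# ismrmrd.AcquisitionHeader and `data` for a numpy array):
+#
+#   g2 = Gadget()                # terminal gadget: put_next() buffers into results
+#   g1 = Gadget(next_gadget=g2)  # forwards everything to g2
+#   g1.process(header, data)
+#   results = g2.get_results()   # -> [[header, data]]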
+
+
+class WrappedGadget(object):
+    def __init__(self, dllname, classname, gadgetname):
+        self.gadgetname = gadgetname
+        self.classname = classname
+        self.dllname = dllname
+        self.parameters = dict()
+    
+class WrapperGadget(Gadget):
+    
+    def __init__(self, dllname, classname, gadgetname=None, next_gadget=None):
+        if gadgetname is None:
+            gadgetname = classname
+        Gadget.__init__(self, next_gadget)
+        self.controller_ = GadgetronPythonMRI.GadgetInstrumentationStreamController()
+        self.controller_.prepend_gadget(gadgetname,dllname,classname)
+        self.controller_.set_python_gadget(self)
+        self.wrapped_gadgets = list()
+        self.wrapped_gadgets.append(WrappedGadget(dllname,classname,gadgetname))
+
+    def prepend_gadget(self,dllname, classname, gadgetname=None):
+        if gadgetname is None:
+            gadgetname = classname
+        self.controller_.prepend_gadget(gadgetname,dllname,classname)
+        self.wrapped_gadgets.insert(0,WrappedGadget(dllname,classname,gadgetname))
+
+    def wait(self):
+       self.controller_.close()
+
+    def process_config(self, conf):
+        self.controller_.put_config(conf)
+        return 0
+
+    def process(self, header, *args):
+        if len(args) > 2:
+            raise("Only two or three arguments are currently supported when sending data to Gadgetron framework")
+        if isinstance(header, ismrmrd.AcquisitionHeader):
+            self.controller_.put_acquisition(header,args[0].astype('complex64'))
+        elif isinstance(header, ismrmrd.ImageHeader):
+            if (args[0].dtype == np.uint16):
+                if len(args) == 2:
+                    self.controller_.put_image_ushort_attr(header,args[0], args[1].serialize())
+                else:
+                    self.controller_.put_image_ushort(header,args[0])
+            elif (args[0].dtype == np.float32):
+                if len(args) == 2:
+                    self.controller_.put_image_float_attr(header, args[0], args[1].serialize())
+                else:
+                    self.controller_.put_image_float(header,args[0])
+            else:   
+                if len(args) == 2:
+                    self.controller_.put_image_cplx_attr(header, args[0].astype('complex64'), args[1].serialize())
+                else:
+                    self.controller_.put_image_cplx(header,args[0].astype('complex64'))
+        else:
+            raise("Unsupported types when sending data to Gadgetron framework")
+        return 0
+
+    def set_parameter(self,gadgetname,parameter,value):
+        for g in self.wrapped_gadgets:
+            if g.gadgetname == gadgetname:
+                g.parameters[parameter] = value
+                break
+
+        self.controller_.set_parameter(gadgetname,parameter,value)
+
+class FunctionGadget(Gadget):
+    """A Gadget with a configurable `process` function.
+
+    Params:
+        fn: `process` function
+    """
+    def __init__(self, fn, next_gadget=None):
+        super(FunctionGadget, self).__init__(next_gadget)
+        self.process = fn
+
+
+def gadget_chain_wait(first_gadget):
+    g = first_gadget;
+    while (g):
+        g.wait()
+        g = g.next_gadget
+
+def gadget_chain_config(first_gadget, conf):
+    g = first_gadget;
+    while (g):
+        g.process_config(conf)
+        g = g.next_gadget
+        
+def get_last_gadget(first_gadget):
+    g = first_gadget;
+    while (True):
+        if g.next_gadget:
+            g = g.next_gadget
+        else:
+            break
+    return g
diff --git a/gadgets/python/gadgets/image_viewer.py b/gadgets/python/gadgets/image_viewer.py
new file mode 100644
index 0000000..62cd1ba
--- /dev/null
+++ b/gadgets/python/gadgets/image_viewer.py
@@ -0,0 +1,70 @@
+from gadgetron import Gadget
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib.figure import Figure   
+
+#from matplotlib.axes import Subplot   
+# uncomment to select /GTK/GTKAgg/GTKCairo
+from matplotlib.backends.backend_gtk import FigureCanvasGTK as FigureCanvas
+#from matplotlib.backends.backend_gtkagg import FigureCanvasGTKAgg as FigureCanvas
+#from matplotlib.backends.backend_gtkcairo import FigureCanvasGTKCairo as FigureCanvas
+
+# or NavigationToolbar for classic
+#from matplotlib.backends.backend_gtk import NavigationToolbar2GTK as NavigationToolbar
+from matplotlib.backends.backend_gtkagg import NavigationToolbar2GTKAgg as NavigationToolbar
+
+import pygtk
+pygtk.require('2.0')
+import gtk
+
+class ImageViewWindow:
+    def __init__(self, img_data):
+        self.window = gtk.Window(gtk.WINDOW_TOPLEVEL)
+        self.window.connect("delete_event", self.delete_event)
+        self.window.connect('key_press_event', self.on_key_press_event)
+        self.window.set_default_size(400,300)
+        self.window.set_title("Gadgetron Image Viewer")
+
+        self.vbox = gtk.VBox()
+        self.window.add(self.vbox)
+
+        self.fig = Figure(figsize=(5,4), dpi=100)
+        
+        plt.gray()
+
+        self.ax = self.fig.add_subplot(111)
+        self.img_ax = self.ax.imshow(np.squeeze(np.abs(img_data)))
+
+        self.canvas = FigureCanvas(self.fig)  # a gtk.DrawingArea
+        self.vbox.pack_start(self.canvas)
+        self.toolbar = NavigationToolbar(self.canvas, self.window)
+        self.vbox.pack_start(self.toolbar, False, False)
+        self.window.show_all()
+
+    def delete_event(self, widget, event, data=None):
+        gtk.main_quit()
+        return False
+   
+    def on_key_press_event(self, widget, event, data=None):
+        keyname = gtk.gdk.keyval_name(event.keyval)
+        if (keyname == "Escape"):
+            self.window.destroy()
+            gtk.main_quit()
+            return False
+
+    def main(self):
+        gtk.main()
+
+
+class ImageViewer(Gadget):
+    def process_config(self, cfg):
+        # Configuration ignored; a viewer window is opened per image in process()
+        pass
+
+    def process(self, h,im):
+        myWindow = ImageViewWindow(im)
+        myWindow.main()
+
+        self.put_next(h,im)
+        return 0
diff --git a/gadgets/python/gadgets/remove_2x_oversampling.py b/gadgets/python/gadgets/remove_2x_oversampling.py
new file mode 100644
index 0000000..c256f42
--- /dev/null
+++ b/gadgets/python/gadgets/remove_2x_oversampling.py
@@ -0,0 +1,22 @@
+import numpy as np
+from ismrmrdtools import transform
+from gadgetron import Gadget
+
+class Remove2xOversampling(Gadget):
+
+    def process_config(self, conf):
+        #print "remove 2x oversampling: Configuration received"
+        #print str(conf)
+        return
+
+    def process(self, acq, data):
+        orig_size = list(data.shape);
+        data2 = data.reshape([(data.size/data.shape[data.ndim-1]), data.shape[data.ndim-1]])
+        new_length = data2.shape[1]>>1
+        data2 = transform.transform_image_to_kspace(transform.transform_kspace_to_image(data2,dim=(1,))[:,(0+(new_length>>1)):(new_length+(new_length>>1))],dim=(1,))
+        orig_size[data.ndim-1] = new_length
+        data2 = data2.reshape(tuple(orig_size))
+        acq.samples = new_length
+
+        self.put_next(acq,data2)
+        return 0
diff --git a/gadgets/python/gadgets/rms_coil_combine.py b/gadgets/python/gadgets/rms_coil_combine.py
new file mode 100644
index 0000000..113f0c3
--- /dev/null
+++ b/gadgets/python/gadgets/rms_coil_combine.py
@@ -0,0 +1,13 @@
+import numpy as np
+from gadgetron import Gadget
+
+class RMSCoilCombine(Gadget):
+
+    def process_config(self, cfg):
+        print "RMS Coil Combine, Config ignored"
+
+    def process(self, h, im):
+        combined_image = np.sqrt(np.sum(np.square(np.abs(im)),axis=0))
+        h.channels = 1
+        self.put_next(h,combined_image.astype('complex64'))
+        return 0
diff --git a/gadgets/python/gadgets/tpat_snr_scale.py b/gadgets/python/gadgets/tpat_snr_scale.py
new file mode 100644
index 0000000..c006b69
--- /dev/null
+++ b/gadgets/python/gadgets/tpat_snr_scale.py
@@ -0,0 +1,281 @@
+import ismrmrd
+import ismrmrd.xsd
+import numpy as np
+from ismrmrdtools import transform, coils, grappa, sense
+from gadgetron import Gadget
+import copy 
+import math
+
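+# RemOS: removes 2x readout oversampling; asymmetric-echo (partial Fourier) readouts
+# are first zero-padded to a symmetric length before cropping in image space.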
+class RemOS(Gadget):
+    def process_config(self, conf):
+        return
+
+    def process(self, acq, data,*args):
+        if not acq.isFlagSet(ismrmrd.ACQ_IS_NOISE_MEASUREMENT):
+            ro_length = acq.number_of_samples
+            padded_ro_length = (acq.number_of_samples-acq.center_sample)*2
+            if padded_ro_length != ro_length: #partial fourier
+                data2 = np.zeros((data.shape[0], padded_ro_length),dtype=np.complex64)
+                offset = (padded_ro_length>>1)  - acq.center_sample
+                data2[:,0+offset:offset+ro_length] = data
+            else:
+                data2 = data
+    
+            data2=transform.transform_kspace_to_image(data2,dim=(1,))
+            data2=data2[:,(padded_ro_length>>2):(padded_ro_length>>2)+(padded_ro_length>>1)]
+            data2=transform.transform_image_to_kspace(data2,dim=(1,)) * np.sqrt(float(padded_ro_length)/ro_length)
+            acq.center_sample = padded_ro_length>>2
+            acq.number_of_samples = data2.shape[1]
+            self.put_next(acq,data2,*args)
+        return 0                                                                                     
+
+class NoiseAdj(Gadget):
+    def __init__(self, next_gadget = None):
+        Gadget.__init__(self, next_gadget)
+        self.noise_data = list()
+        self.noise_dmtx = None
+    def process(self,acq,data,*args):
+        if acq.isFlagSet(ismrmrd.ACQ_IS_NOISE_MEASUREMENT):
+            self.noise_data.append((acq,data))
+        else:
+            if len(self.noise_data):
+                profiles = len(self.noise_data)
+                channels = self.noise_data[0][1].shape[0]
+                samples_per_profile = self.noise_data[0][1].shape[1]
+                noise = np.zeros((channels,profiles*samples_per_profile),dtype=np.complex64)
+                counter = 0
+                for p in self.noise_data:
+                    noise[:,counter*samples_per_profile:(counter*samples_per_profile+samples_per_profile)] = p[1]
+                    counter = counter + 1
+                
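+                # Dwell-time scaling of the prewhitener; 0.79 is an assumed relative
+                # receiver noise bandwidth.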
+                scale = (acq.sample_time_us/self.noise_data[0][0].sample_time_us)*0.79
+                self.noise_dmtx = coils.calculate_prewhitening(noise,scale_factor=scale)
+                
+                #Test the noise adjust
+                d = self.noise_data[0][1]
+                d2 = coils.apply_prewhitening(d, self.noise_dmtx)                
+                self.noise_data = list()
+            
+            if self.noise_dmtx is not None:
+                data2 = coils.apply_prewhitening(data, self.noise_dmtx)
+            else:
+                data2 = data
+                
+            self.put_next(acq,data2)
+        return 0
+
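+# PCA: buffers calibration profiles, computes a PCA basis over the coils via SVD,
+# and rotates all subsequent profiles into that basis (channel reduction itself is
+# left to the CoilReduce gadget below).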
+class PCA(Gadget):
+    def __init__(self, next_gadget=None):
+        Gadget.__init__(self, next_gadget) 
+        self.calib_data = list()
+        self.pca_mtx = None
+        self.max_calib_profiles = 100
+        self.samples_to_use = 16
+        self.buffering = True
+        
+    def process(self,acq,data,*args):    
+        if self.buffering:
+            self.calib_data.append((acq,data))
+            
+            if (len(self.calib_data)>=self.max_calib_profiles or acq.isFlagSet(ismrmrd.ACQ_LAST_IN_SLICE)):
+                #We are done buffering calculate pca transformation
+                if self.samples_to_use < acq.number_of_samples:
+                    samp_to_use = self.samples_to_use
+                    
+                if (len(self.calib_data) < 16):
+                    samp_to_use = acq.number_of_samples
+                    
+                total_samples = samp_to_use*len(self.calib_data)
+                channels = data.shape[0]
+                
+                A = np.zeros((total_samples,channels), dtype=np.complex64)
+                counter = 0
+                for p in self.calib_data:
+                    d = p[1][:, acq.center_sample-(samp_to_use>>1):acq.center_sample+(samp_to_use>>1)]
+                    A[counter*samp_to_use:counter*samp_to_use+samp_to_use,:] = np.transpose(d)
+                    counter = counter+1
+                
+                m = np.mean(A,0)
+                A_m = A - m.reshape((1,m.shape[0]))
+                U, s, V = np.linalg.svd(A_m, full_matrices=False)
+                
+                self.pca_mtx = V
+                
+                #Empty calib_data
+                for p in self.calib_data:
+                    data2 = np.dot(self.pca_mtx,p[1])
+                    self.put_next(p[0],data2)
+     
+                self.buffering = False
+                self.calib_data = list()
+                return 0
+        else:
+            if self.pca_mtx is not None:
+                data2 = np.dot(self.pca_mtx,data)
+                self.put_next(acq,data2,*args)
+            else:
+                self.put_next(acq,data,*args)
+            
+        return 0
+
+class CoilReduce(Gadget):
+    def __init__(self, next_gadget = None):
+        Gadget.__init__(self, next_gadget)
+        self.coils_out = 16
+        
+    def process_config(self, conf):
+        coils_out = self.get_parameter("coils_out")
+        if (coils_out is not None):
+            self.coils_out = int(coils_out)
+
+    def process(self, acq, data, *args):
+        if acq.active_channels > self.coils_out:
+            data2 = data[0:self.coils_out,:]
+            acq.active_channels = self.coils_out
+        else:
+            data2 = data
+            
+        self.put_next(acq,data2,*args)
+        return 0
+
+
+class Recon(Gadget):
+    def __init__(self, next_gadget=None):
+        Gadget.__init__(self, next_gadget) 
+        self.header = None
+        self.enc = None
+        self.acc_factor = None
+        self.buffer = None
+        self.samp_mask = None
+        self.header_proto = None
+        self.calib_buffer = list()
+        self.unmix = None
+        self.gmap = None
+        self.calib_frames = 0
+        self.method = 'grappa'
+    
+    def process_config(self, cfg):
+        self.header = ismrmrd.xsd.CreateFromDocument(cfg)
+        self.enc = self.header.encoding[0]
+
+        #Parallel imaging factor
+        self.acc_factor = self.enc.parallelImaging.accelerationFactor.kspace_encoding_step_1
+        
+        reps = self.enc.encodingLimits.repetition.maximum+1
+        phs = self.enc.encodingLimits.phase.maximum+1
+        if reps > phs:
+            self.calib_frames = reps
+        else:
+            self.calib_frames = phs
+            
+        if self.calib_frames < self.acc_factor:
+            self.calib_frames = self.acc_factor
+        
+        #Frames should be a multiple of the acceleration factor
+        self.frames = math.floor(self.calib_frames/self.acc_factor)*self.acc_factor
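+        # e.g. acc_factor = 2 with calib_frames = 5 gives frames = 4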
+
+        pmri_method =  self.get_parameter('pmri_method')
+        if pmri_method == 'grappa' or pmri_method == 'sense':
+            self.method = pmri_method
+
+    def process(self, acq, data,*args):
+
+        if self.buffer is None:
+            # Matrix size
+            eNx = self.enc.encodedSpace.matrixSize.x
+            eNy = self.enc.encodedSpace.matrixSize.y
+            eNz = self.enc.encodedSpace.matrixSize.z
+            rNx = self.enc.reconSpace.matrixSize.x
+            rNy = self.enc.reconSpace.matrixSize.y
+            rNz = self.enc.reconSpace.matrixSize.z
+
+            # Field of View
+            eFOVx = self.enc.encodedSpace.fieldOfView_mm.x
+            eFOVy = self.enc.encodedSpace.fieldOfView_mm.y
+            eFOVz = self.enc.encodedSpace.fieldOfView_mm.z
+            rFOVx = self.enc.reconSpace.fieldOfView_mm.x
+            rFOVy = self.enc.reconSpace.fieldOfView_mm.y
+            rFOVz = self.enc.reconSpace.fieldOfView_mm.z
+        
+            channels = acq.active_channels
+
+            if data.shape[1] != rNx:
+                raise("Error, Recon gadget expects data to be on correct matrix size in RO direction")
+                
+            if (rNz != 1):
+                rasie("Error Recon Gadget only supports 2D for now")
+                
+            self.buffer = np.zeros((channels, rNy, rNx),dtype=np.complex64)
+            self.samp_mask = np.zeros(self.buffer.shape[1:])
+            self.header_proto = ismrmrd.ImageHeader()
+            self.header_proto.matrix_size[0] = rNx
+            self.header_proto.matrix_size[1] = rNy
+            self.header_proto.matrix_size[2] = rNz
+            self.header_proto.field_of_view[0] = rFOVx
+            self.header_proto.field_of_view[1] = rFOVy
+            self.header_proto.field_of_view[2] = rFOVz
+        
+        #Now put data in buffer
+        line_offset = self.buffer.shape[1]/2 - self.enc.encodingLimits.kspace_encoding_step_1.center                                                                                 
+        self.buffer[:,acq.idx.kspace_encode_step_1+line_offset,:] = data                                                          
+        self.samp_mask[acq.idx.kspace_encode_step_1+line_offset,:] = 1
+        
+        #If last scan in buffer, do FFT and fill image header
+        if acq.isFlagSet(ismrmrd.ACQ_LAST_IN_ENCODE_STEP1) or acq.isFlagSet(ismrmrd.ACQ_LAST_IN_SLICE):
+            img_head = copy.deepcopy(self.header_proto)
+            img_head.position = acq.position                                                                                                                               
+            img_head.read_dir = acq.read_dir                                                                                                                               
+            img_head.phase_dir = acq.phase_dir                                                                                                                             
+            img_head.slice_dir = acq.slice_dir                                                                                                                             
+            img_head.patient_table_position = acq.patient_table_position                                                                                                   
+            img_head.acquisition_time_stamp = acq.acquisition_time_stamp                                                                                                   
+            img_head.slice = acq.idx.slice
+            img_head.channels = 1
+            
+            scale = self.samp_mask.size/(1.0*np.sum(self.samp_mask[:]));
+
+            #We have not yet calculated unmixing coefficients
+            if self.unmix is None:
+                self.calib_buffer.append((img_head,self.buffer.copy()))
+                self.buffer[:] = 0
+                self.samp_mask[:] = 0
+                
+                if len(self.calib_buffer) >= self.calib_frames:
+                    cal_data = np.zeros(self.calib_buffer[0][1].shape, dtype=np.complex64)
+                    for c in self.calib_buffer:
+                        cal_data = cal_data + c[1]
+                        
+                    mask = np.squeeze(np.sum(np.abs(cal_data),0))
+                    mask = np.ones(mask.shape)*(np.abs(mask)>0.0)
+                    target = None #cal_data[0:8,:,:]
+                    
+                    coil_images = transform.transform_kspace_to_image(cal_data,dim=(1,2))
+                    (csm,rho) = coils.calculate_csm_walsh(coil_images)
+                    
+                    if self.method == 'grappa':
+                        self.unmix, self.gmap = grappa.calculate_grappa_unmixing(cal_data, 
+                                                                                 self.acc_factor, 
+                                                                                 data_mask=mask, 
+                                                                                 kernel_size=(4,5), 
+                                                                                 csm=csm)
+                    elif self.method == 'sense':
+                        self.unmix, self.gmap = sense.calculate_sense_unmixing(self.acc_factor, csm)
+                    else:
+                        raise Exception('Unknown parallel imaging method: ' + str(self.method))
+                        
+                    for c in self.calib_buffer:
+                        recon = transform.transform_kspace_to_image(c[1],dim=(1,2))*np.sqrt(scale)
+                        recon = np.squeeze(np.sum(recon * self.unmix,0))
+                        self.put_next(c[0], recon,*args)
+                        
+                return 0
+                
+            if self.unmix is None:
+                raise Exception("We should never reach this point without unmixing coefficients")
+                
+            recon = transform.transform_kspace_to_image(self.buffer,dim=(1,2))*np.sqrt(scale)
+            recon = np.squeeze(np.sum(recon * self.unmix,0))
+            self.buffer[:] = 0
+            self.samp_mask[:] = 0
+            self.put_next(img_head,recon,*args)
+        return 0
diff --git a/gadgets/python/kspaceandimage.py b/gadgets/python/kspaceandimage.py
new file mode 100644
index 0000000..6c42e90
--- /dev/null
+++ b/gadgets/python/kspaceandimage.py
@@ -0,0 +1,20 @@
+import numpy as np
+import numpy.fft as ft
+import numpy.fft.helper as fth
+
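+# ktoi: centred inverse FFT (k-space -> image); itok: centred forward FFT (image -> k-space).
+# Both wrap the transform in ifftshift/fftshift so the centre of k-space maps to the image centre.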
+def ktoi(data,axis=-1):
+    if (axis == -1):
+        ax = np.arange(0,data.ndim)
+    else:
+        ax = axis
+
+    return fth.fftshift(ft.ifftn(fth.ifftshift(data,axes=ax),axes=ax),axes=ax)
+
+def itok(data,axis=-1):
+    if (axis == -1):
+        ax = np.arange(0,data.ndim)
+    else:
+        ax = axis
+
+
+    return fth.fftshift(ft.fftn(fth.ifftshift(data,axes=ax),axes=ax),axes=ax)
diff --git a/gadgets/python/utils/CMakeLists.txt b/gadgets/python/utils/CMakeLists.txt
new file mode 100644
index 0000000..de816ca
--- /dev/null
+++ b/gadgets/python/utils/CMakeLists.txt
@@ -0,0 +1,3 @@
+install(FILES
+  gadgetron_run_python_chain.py  gadgetron_xml_to_python.py gadgetron_python_to_xml.py
+  DESTINATION ${GADGETRON_INSTALL_PYTHON_MODULE_PATH} COMPONENT main)
diff --git a/gadgets/python/utils/gadgetron_python_to_xml.py b/gadgets/python/utils/gadgetron_python_to_xml.py
new file mode 100644
index 0000000..ba7e1dc
--- /dev/null
+++ b/gadgets/python/utils/gadgetron_python_to_xml.py
@@ -0,0 +1,83 @@
+import sys
+import os
+import inspect
+import gadgetron
+
+sys.path.append(os.environ['GADGETRON_HOME'] + '/share/gadgetron/python')
+
+def convert_to_xml(first_gadget):
+    g = first_gadget
+
+    xml_string  = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+    xml_string += "<gadgetronStreamConfiguration xsi:schemaLocation=\"http://gadgetron.sf.net/gadgetron gadgetron.xsd\" xmlns=\"http://gadgetron.sf.net/gadgetron\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">\n"
+
+    #Add standard readers and writers
+    xml_string += "<reader>\n"
+    xml_string += "  <slot>1008</slot>\n"
+    xml_string += "  <dll>gadgetron_mricore</dll>\n"
+    xml_string += "  <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>\n"
+    xml_string += "</reader>\n"
+    xml_string += "\n"
+    xml_string += "<writer>\n"
+    xml_string += "  <slot>1022</slot>\n"
+    xml_string += "  <dll>gadgetron_mricore</dll>\n"
+    xml_string += "  <classname>MRIImageWriter</classname>\n"
+    xml_string += "</writer>\n"
+    xml_string += "\n"
+
+    while (g):
+        if isinstance(g,gadgetron.WrapperGadget):
+            for wg in g.wrapped_gadgets:
+                xml_string += "<gadget>\n"
+                xml_string += "  <name>" + wg.gadgetname +"</name>\n"
+                xml_string += "  <dll>" + wg.dllname + "</dll>\n"
+                xml_string += "  <classname>" + wg.classname + "</classname>\n"
+                for p in wg.parameters:
+                    xml_string += "    <property>\n"
+                    xml_string += "      <name>" + str(p) + "</name>\n"
+                    xml_string += "      <value>" + str(wg.parameters[p]) + "</value>\n"
+                    xml_string += "    </property>\n"
+                xml_string += "</gadget>\n\n"
+        else:
+            xml_string += "<gadget>\n"
+            xml_string += "  <name>" + str(g.__class__.__name__) +"</name>\n"
+            xml_string += "  <dll>gadgetron_python</dll>\n"
+            xml_string += "  <classname>PythonGadget</classname>\n"
+            xml_string += "  <property>\n"
+            xml_string += "    <name>python_module</name>\n"
+            xml_string += "    <value>" + g.__module__ + "</value>\n"
+            xml_string += "  </property>\n"            
+            xml_string += "  <property>\n"
+            xml_string += "    <name>python_class</name>\n"
+            xml_string += "    <value>" + g.__class__.__name__ + "</value>\n"
+            xml_string += "  </property>\n"
+            for p in g.params:
+                xml_string += "  <property>\n"
+                xml_string += "    <name>" + str(p) + "</name>\n"
+                xml_string += "    <value>" + str(g.params[p]) + "</value>\n"
+                xml_string += "  </property>\n"
+            xml_string += "</gadget>\n\n"
+
+        g = g.next_gadget
+
+    xml_string += "</gadgetronStreamConfiguration>\n"
+    return xml_string
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print "Usage: " + sys.argv[0] + "<gadgetron_python_chain.py>"
+        raise Exception("Invalid number of arguments")
+
+    python_function_file = sys.argv[1]
+    if not os.path.isfile(python_function_file):
+        print("%s is not a valid file" % python_function_file)
+        raise SystemExit
+    
+    namespace = {}
+    execfile(python_function_file, namespace)
+    globals().update(namespace)
+
+    g0 = define_gadget_chain() #Call function from imported file
+
+    xml_string = convert_to_xml(g0)
+    print xml_string
diff --git a/gadgets/python/utils/gadgetron_run_python_chain.py b/gadgets/python/utils/gadgetron_run_python_chain.py
new file mode 100644
index 0000000..ccab8ca
--- /dev/null
+++ b/gadgets/python/utils/gadgetron_run_python_chain.py
@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
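+# Run a Python gadget chain (built by define_gadget_chain() in the supplied script)
+# over an ISMRMRD dataset and append the resulting images to an output file.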
+#%% imports
+import os
+import sys
+import ismrmrd
+import ismrmrd.xsd
+import numpy as np
+from ismrmrdtools import show
+
+sys.path.append(os.environ['GADGETRON_HOME'] + '/share/gadgetron/python')
+
+def gadget_wait_function(first_gadget):
+    g = first_gadget;
+    while (g):
+        g.wait()
+        g = g.next_gadget
+
+def gadget_config(first_gadget, conf):
+    g = first_gadget;
+    while (g):
+        g.process_config(conf)
+        g = g.next_gadget
+
+def get_last_gadget(first_gadget):
+    g = first_gadget
+    while (g.next_gadget):
+        g = g.next_gadget
+    return g
+
+if __name__ == "__main__":
+    if len(sys.argv) != 4:
+        print "Usage: " + sys.argv[0] + " <gadgetron_python_chain.py> <ismrmrd_out.h5> <ismrmrd_out.h5>"
+        raise Exception("Invalid number of arguments.")
+
+    python_function_file = sys.argv[1]
+    filename = sys.argv[2]
+    filename_out = sys.argv[3]
+
+    if not os.path.isfile(python_function_file):
+        print("%s is not a valid file" % python_function_file)
+        raise SystemExit
+    
+    namespace = {}
+    execfile(python_function_file, namespace)
+    globals().update(namespace)
+
+    g0 = define_gadget_chain() #Call function from imported file
+
+    #%% Load file
+    if not os.path.isfile(filename):
+        print("%s is not a valid file" % filename)
+        raise SystemExit
+    dset = ismrmrd.Dataset(filename, 'dataset', create_if_needed=False)
+
+    #%% Send in data
+    #First ISMRMRD XML header
+    gadget_config(g0,dset.read_xml_header())
+            
+    # Loop through the remaining acquisitions and send them through the chain
+    for acqnum in range(0,dset.number_of_acquisitions()):
+        acq = dset.read_acquisition(acqnum)
+        g0.process(acq.getHead(),acq.data.astype('complex64'))
+
+    #%%
+    gadget_wait_function(g0)
+
+    res = get_last_gadget(g0).get_results()
+    print "Received " + str(len(res)) + " result items"
+
+    out_dset = ismrmrd.Dataset(filename_out, "out")
+    for o in res:
+        print "Appending image to out file"
+        img = ismrmrd.Image(head=o[0])
+        img.data.ravel()[:] = o[1].ravel()[:] 
+        out_dset.append_image("image_%d" % img.image_series_index, img)
+
+    
diff --git a/gadgets/python/utils/gadgetron_xml_to_python.py b/gadgets/python/utils/gadgetron_xml_to_python.py
new file mode 100644
index 0000000..fb10269
--- /dev/null
+++ b/gadgets/python/utils/gadgetron_xml_to_python.py
@@ -0,0 +1,70 @@
+from sys import argv
+import xml.etree.ElementTree as et
+
+def add_ns(tag):
+    ns = 'http://gadgetron.sf.net/gadgetron'
+    return '{' + ns + '}' + tag
+
+def convert_xml(xmlfilename):
+    last_gadget_was_wrapper = False
+    next_gadget = 'None'
+        
+    tree = et.parse(xmlfilename)
+    root = tree.getroot()
+    header_string = '# Automatically generated Python representation of ' + xmlfilename + '\n\n'
+    header_string += 'from gadgetron import WrapperGadget\n'
+    function_string = 'def define_gadget_chain():\n'
+    counter = 1
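+    # Gadgets are visited in reverse order so that each generated object can be
+    # constructed with its downstream neighbour already available as next_gadget.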
+    for gadget in reversed(root.findall(add_ns('gadget'))):
+        gadgetname = gadget.findall(add_ns('name'))[0].text
+        dllname = gadget.findall(add_ns('dll'))[0].text
+        classname = gadget.findall(add_ns('classname'))[0].text
+        counter += 1
+        if classname.lower().find('pythongadget') >= 0:
+            object_name = 'g' + str(counter)
+            python_module = None
+            python_class = None
+            prop_set_string = ''
+            for p in gadget.findall(add_ns('property')):
+                pname = p.findall(add_ns('name'))[0].text
+                pvalue = p.findall(add_ns('value'))[0].text
+
+                if (pname == 'python_module'):
+                    python_module = pvalue
+                elif (pname == 'python_class'):
+                    python_class = pvalue
+                else:
+                    prop_set_string += '    ' + object_name + '.set_parameter(\"' + pname + '\", \"' + pvalue + '\")\n'
+
+            header_string += 'from ' + python_module + ' import ' + python_class + '\n'
+            function_string += '    ' + object_name + ' = ' + python_class + '(next_gadget=' + next_gadget + ')\n'
+            function_string += prop_set_string
+            last_gadget_was_wrapper = False
+        else:
+            if not last_gadget_was_wrapper:
+                object_name = 'g' + str(counter)
+                function_string += '    ' + object_name + ' = WrapperGadget("' + dllname + '", "' + classname + '", gadgetname="' + gadgetname + '", next_gadget=' + next_gadget + ')\n'
+                last_gadget_was_wrapper = True
+            else:
+                function_string += '    ' + object_name + '.prepend_gadget("' + dllname + '", "' + classname + '", gadgetname="' + gadgetname + '")\n'
+
+            for p in gadget.findall(add_ns('property')):
+                pname = p.findall(add_ns('name'))[0].text
+                pvalue = p.findall(add_ns('value'))[0].text
+                if not pvalue:
+                    pvalue = ''
+                function_string += '    ' + object_name + '.set_parameter("' + gadgetname + '", "' + pname + '", "' + pvalue + '")\n'
+        
+        next_gadget = object_name
+
+    function_string += '    ' + 'return ' + object_name 
+    function_code = header_string
+    function_code += '\n'
+    function_code += function_string
+    return function_code
+ 
+if __name__ == "__main__":
+    if len(argv) != 2:
+        print "Usage: " + argv[0] + " <configuration.xml>"
+        raise "Invalid number of arguments. Please provide the name of a gadgetron configuration file"
+    print convert_xml(argv[1])
diff --git a/gadgets/radial/CMakeLists.txt b/gadgets/radial/CMakeLists.txt
new file mode 100644
index 0000000..6fe1acf
--- /dev/null
+++ b/gadgets/radial/CMakeLists.txt
@@ -0,0 +1,61 @@
+IF (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_RADIAL__)
+ENDIF (WIN32)
+
+find_package(Ismrmrd REQUIRED)
+
+include_directories(
+  ${CMAKE_SOURCE_DIR}/gadgets/mri_core
+  ${CMAKE_SOURCE_DIR}/gadgets/pmri
+  ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/fft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators/gpu
+  ${ARMADILLO_INCLUDE_DIRS}
+  )
+
+include_directories(${CUDA_INCLUDE_DIRS})
+
+if (ARMADILLO_FOUND)
+  list(APPEND PHASE_GADGET RadialPhaseCorrectionGadget.h RadialPhaseCorrectionGadget.cpp)
+  install (FILES  RadialPhaseCorrectionGadget.h DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+else (ARMADILLO_FOUND)
+  MESSAGE("Armadillo not found, not compiling radial phase correction gadget")
+endif (ARMADILLO_FOUND)
+
+add_library(gadgetron_gpuradial SHARED 
+  gadgetron_radial_export.h
+  gpuRadialPrepGadget.h gpuRadialPrepGadget.cpp 
+  gpuRadialSensePrepGadget.h gpuRadialSensePrepGadget.cpp 
+  gpuRadialSpiritPrepGadget.h gpuRadialSpiritPrepGadget.cpp 
+  gpuRetroGatedSensePrepGadget.h gpuRetroGatedSensePrepGadget.cpp
+  ${PHASE_GADGET})
+
+set_target_properties(gadgetron_gpuradial PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(gadgetron_gpuradial
+  gadgetron_gadgetbase
+  gadgetron_toolbox_log
+  gadgetron_toolbox_gpunfft gadgetron_toolbox_gpusolvers gadgetron_toolbox_gpuoperators gadgetron_toolbox_cpucore gadgetron_toolbox_cpucore_math gadgetron_toolbox_gpucore
+  ${ISMRMRD_LIBRARIES} ${FFTW3_LIBRARIES} ${CUDA_LIBRARIES}
+  optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY}
+  )
+
+target_link_libraries(gadgetron_gpuradial gadgetron_toolbox_gpuparallelmri )
+
+install (FILES  gadgetron_radial_export.h
+                gpuRadialPrepGadget.h 
+                gpuRadialSensePrepGadget.h 
+                gpuRadialSpiritPrepGadget.h 
+                gpuRetroGatedSensePrepGadget.h 
+                DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
+install (TARGETS gadgetron_gpuradial DESTINATION lib COMPONENT main)
+
+add_subdirectory(config)
diff --git a/gadgets/radial/RadialPhaseCorrectionGadget.cpp b/gadgets/radial/RadialPhaseCorrectionGadget.cpp
new file mode 100644
index 0000000..0a25e48
--- /dev/null
+++ b/gadgets/radial/RadialPhaseCorrectionGadget.cpp
@@ -0,0 +1,310 @@
+#include "RadialPhaseCorrectionGadget.h"
+#include "hoNDArray_elemwise.h"
+#include "hoArmadillo.h"
+#include "hoNDArray_fileio.h"
+#include "ismrmrd/xml.h"
+
+#define _USE_MATH_DEFINES
+#include <math.h>
+#include <cmath>
+
+#ifdef USE_OMP
+#include <omp.h>
+#endif 
+
+namespace Gadgetron{
+  
+  RadialPhaseCorrectionGadget::RadialPhaseCorrectionGadget()
+    : slices_(-1)
+    , sets_(-1)
+    , channels_(-1)
+    , profiles_counter_(0)
+  {
+  }
+  
+  int RadialPhaseCorrectionGadget::
+  process_config( ACE_Message_Block *mb )
+  {
+    ISMRMRD::IsmrmrdHeader h;
+    ISMRMRD::deserialize(mb->rd_ptr(),h);
+    
+    
+    if (h.encoding.size() != 1) {
+      GDEBUG("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+    
+    // Get the encoding space and trajectory description
+    ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+    ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+    ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+
+    slices_ = e_limits.slice ? e_limits.slice->maximum + 1 : 1;
+    sets_ = e_limits.set ? e_limits.set->maximum + 1 : 1;
+
+    if (h.acquisitionSystemInformation) {
+      channels_ = h.acquisitionSystemInformation->receiverChannels ? *h.acquisitionSystemInformation->receiverChannels : 128;
+    }
+
+    mode_ = mode.value();
+    order_ = order.value();
+    profiles_ = profiles.value();
+
+    if( profiles_ < 1 ) {
+      GDEBUG("The number of profiles to estimate polynomial fit is too low.\n");
+      return GADGET_FAIL;
+    }
+
+    fit_calculated_ = boost::shared_array<bool>(new bool[sets_*slices_]);
+    polyfit_ = boost::shared_array<double>(new double[(order_+1)*channels_*sets_*slices_]);   
+    profiles_queue_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+
+    size_t bsize = sizeof(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>)*profiles_*10;
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+      fit_calculated_[i] = false;
+      profiles_queue_[i].high_water_mark(bsize);
+      profiles_queue_[i].low_water_mark(bsize);
+    }    
+    
+    return GADGET_OK;
+  }
+  
+  int RadialPhaseCorrectionGadget
+  ::process( GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *m1,
+             GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2 )
+  {
+
+    // Pass any noise measurements down the chain
+    //
+    
+    bool is_noise = m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_IS_NOISE_MEASUREMENT);
+    if (is_noise) { 
+      if (this->next()->putq(m1) < 0) {
+        GDEBUG("Failed to pass on noise samples.\n");
+        return GADGET_FAIL;
+      }
+      return GADGET_OK;
+    }
+
+    // For now we require that this gadget is inserted before any coil reduction gadgets
+    //
+
+    if( channels_ != m1->getObjectPtr()->active_channels ){
+      GDEBUG("Unexpected number of coils encountered. Did you insert the phase correction gadget after a coil reduction gadget? In that case invert the order of these gadgets\n");
+      return GADGET_FAIL;
+    }
+
+    unsigned int slice = m1->getObjectPtr()->idx.slice;
+    unsigned int set = m1->getObjectPtr()->idx.set;
+    int idx = set*slices_+slice;
+
+    if( !fit_calculated_[idx] ){
+
+      // Enqueue the first 'profiles_' profiles...
+      //
+      
+      profiles_queue_[idx].enqueue_tail(m1);
+
+      // ...before estimating the polynomial fit of order 'order_'
+      //
+
+      if( profiles_queue_[idx].message_count() == profiles_ ){
+
+        // Perform polynomial fit,
+        // assemble system matix A.
+        //
+        
+        arma::mat A( profiles_, order_+1 );
+        
+        for( int m=0; m<profiles_; m++ ){
+
+          double angle = get_projection_angle(m);          
+
+          for( int n=0; n<order_+1; n++ ){
+            A(m,n) = pow( angle, double(n) );
+          }
+        }
+
+        // Assemble right hand side
+        //
+        
+        arma::mat b( profiles_, channels_ );
+        //double prev_phase[channels_];
+        std::vector<double> prev_phase(channels_);
+        ACE_Message_Queue<ACE_MT_SYNCH>::ITERATOR iter(profiles_queue_[idx]);
+        
+        for( int m=0; m<profiles_; m++ ){                     
+          
+          ACE_Message_Block* mbq = 0x0;
+          iter.next( mbq );
+          iter.advance();
+          
+          if(!mbq) {
+            GDEBUG("Unable to interpret data on message queue (1)\n");
+            return GADGET_FAIL;
+          }
+          
+          GadgetContainerMessage< hoNDArray< std::complex<float> > > *_profile = 
+            AsContainerMessage< hoNDArray< std::complex<float> > >(mbq->cont());
+        
+          if(!_profile) {
+            GDEBUG("Unable to interpret data on message queue (2)\n");
+            return GADGET_FAIL;
+          }
+          
+          hoNDArray< std::complex<float> > *profile = _profile->getObjectPtr();
+
+          // A unique fit for each coil
+          //
+
+          for( unsigned int coil=0; coil<channels_; coil++ ){
+            
+            // 'arg' returns angles in the interval (-pi;pi)
+            // Make sure that no discontinouities arise on the graph as they cannot be fitted
+            //
+            
+            std::complex<float> sample = profile->get_data_ptr()[coil*profile->get_size(0)+(profile->get_size(0)>>1)];
+            double phase = double(std::arg(sample));
+
+            if( m>0 && std::abs(phase-prev_phase[coil])>M_PI ){
+
+              // It appears as if phase wrapping has occurred, make correction...
+              //
+
+              if( phase<prev_phase[coil] )
+                phase += 2.0*M_PI;
+              else
+                phase -= 2.0*M_PI;                
+            }
+
+            b(m,coil) = phase;
+            prev_phase[coil] = phase;
+          }
+        }
+        
+        // Linear least squares fit, i.e. solve the normal equations "A^T A x = A^T b"
+        //
+        
+        std::vector<size_t> dims; dims.push_back(order_+1); dims.push_back(channels_);
+        hoNDArray<double> vec( &dims, &polyfit_[set*(order_+1)*channels_*slices_+slice*(order_+1)*channels_] );
+
+        arma::mat x = as_arma_matrix(&vec);          
+        x = arma::solve(A.t()*A,A.t()*b);
+
+        /*
+        static int counter = 0;
+        char filename[256];
+        sprintf((char*)filename, "_polyfit_%d.real", counter);
+        write_nd_array<double>( &vec, filename );
+        */
+        
+        // Phase correct buffered profiles
+        //
+
+        for( int m=0; m<profiles_; m++ ){          
+
+          ACE_Message_Block *mbq;
+          if( profiles_queue_[idx].dequeue_head(mbq) < 0 ){
+            GDEBUG("Message dequeue failed\n");
+            return GADGET_FAIL;
+          }
+
+          GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *header = 
+            AsContainerMessage<ISMRMRD::AcquisitionHeader>(mbq);
+          
+          if(!header) {
+            GDEBUG("Unable to interpret data on message queue (3)\n");
+            return GADGET_FAIL;
+          }
+
+          phase_correct(header);
+
+          if (this->next()->putq(header) < 0) {
+            GDEBUG("Failed to put data on queue\n");
+            return GADGET_FAIL;
+          }          
+        }
+        fit_calculated_[idx] = true;
+      }
+    }
+    else{
+      
+      // Phase correct profile
+      //
+      
+      phase_correct(m1);
+      
+      if (this->next()->putq(m1) < 0) {
+        GDEBUG("Failed to put data on queue\n");
+        return GADGET_FAIL;
+      }          
+    }
+
+    return GADGET_OK;
+  }  
+  
+
+  double RadialPhaseCorrectionGadget
+  ::get_projection_angle( unsigned int idx )
+  {
+    if(!(mode_ == 2 || mode_ == 3 )){
+      throw std::runtime_error("RadialPhaseCorrectionGadget: currently only trajectory modes 2 and 3 are supported (golden ratio)");
+    }
+
+    double angle_step;
+    if( mode_ == 2 )
+      angle_step = M_PI/((std::sqrt(5.0)+1.0)*0.5); // GR_ORIGINAL
+    else if( mode_ == 3 ){
+      angle_step = M_PI*(3.0-std::sqrt(5.0))*0.5;   // GR_SMALLEST
+    }
+    return fmod(idx*angle_step, 2.0*M_PI);
+  }
+
+  void RadialPhaseCorrectionGadget
+  ::phase_correct( GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *m1 )
+  {
+    unsigned int slice = m1->getObjectPtr()->idx.slice;
+    unsigned int set = m1->getObjectPtr()->idx.set;
+    double angle = get_projection_angle(profiles_counter_);
+
+    for( unsigned int coil=0; coil<channels_; coil++ ){
+
+      double estimated_phase = 0.0;
+
+      for( unsigned int i=0; i<order_+1; i++ ){
+
+        double weight = polyfit_[set*(order_+1)*channels_*slices_ +
+                                 slice*(order_+1)*channels_ +
+                                 coil*(order_+1) + 
+                                 i ];
+
+        double power = std::pow(angle, double(i));
+
+        estimated_phase += (weight*power);
+      }
+      
+      GadgetContainerMessage< hoNDArray< std::complex<float> > > *_profile = 
+        AsContainerMessage<hoNDArray< std::complex<float> > >(m1->cont());
+      
+      if(!_profile) {
+        GDEBUG("Unable to phase correct profile\n");
+        return;
+      }
+
+      hoNDArray< std::complex<float> > *profile = _profile->getObjectPtr();      
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+      for( int i=0; i<profile->get_size(0); i++ ){
+        std::complex<float> sample = profile->get_data_ptr()[coil*profile->get_size(0)+i];
+        float phase = std::arg(sample);
+        float mag = std::abs(sample);
+        profile->get_data_ptr()[coil*profile->get_size(0)+i] = std::polar( mag, phase-float(estimated_phase) );
+      }
+    }
+    profiles_counter_++;
+  }
+
+  GADGET_FACTORY_DECLARE(RadialPhaseCorrectionGadget)
+
+} // namespace Gadgetron
diff --git a/gadgets/radial/RadialPhaseCorrectionGadget.h b/gadgets/radial/RadialPhaseCorrectionGadget.h
new file mode 100644
index 0000000..f1ed2aa
--- /dev/null
+++ b/gadgets/radial/RadialPhaseCorrectionGadget.h
@@ -0,0 +1,47 @@
+#pragma once
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_radial_export.h"
+
+#include <boost/shared_ptr.hpp>
+#include <boost/shared_array.hpp>
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron {
+
+  class EXPORTGADGETS_RADIAL RadialPhaseCorrectionGadget :
+    public Gadget2<ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+  {
+  public:
+    GADGET_DECLARE(RadialPhaseCorrectionGadget);
+    RadialPhaseCorrectionGadget();
+    ~RadialPhaseCorrectionGadget() {};
+    
+  protected:
+    GADGET_PROPERTY_LIMITS(mode,int, "Radial mode", 3, GadgetPropertyLimitsEnumeration, 2,3);
+    GADGET_PROPERTY(order,int,"Order of polynomial fit", 6);
+    GADGET_PROPERTY(profiles, int, "Number of profiles to estimate fit", 500);
+    
+    virtual int process_config( ACE_Message_Block *mb );
+    
+    virtual int process( GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *m1,
+                         GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2);
+    
+    unsigned int mode_;
+    unsigned int order_;
+    unsigned int profiles_;
+    unsigned int profiles_counter_;
+    int slices_;
+    int sets_;
+    int channels_;
+    boost::shared_array<bool> fit_calculated_;
+    boost::shared_array<double> polyfit_;
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > profiles_queue_;
+
+  private:
+    double get_projection_angle( unsigned int profile_idx );
+    void phase_correct( GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* );
+  };
+}
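
The polyfit_ array declared above is a flat buffer of (order_+1) polynomial
coefficients per (set, slice, coil) combination, indexed exactly as in
phase_correct. A small illustrative sketch, not part of the patch, of that layout
and of how the fitted phase is evaluated at a projection angle (the sizes below are
made up; the gadget itself sets slices_, sets_ and channels_ in process_config):

    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Offset of the coefficient block for one (set, slice, coil), matching the
    // set -> slice -> coil -> coefficient ordering used in phase_correct.
    static std::size_t coeff_offset(unsigned int set, unsigned int slice,
                                    unsigned int coil, unsigned int order,
                                    int channels, int slices)
    {
      return std::size_t(set)   * (order + 1) * channels * slices +
             std::size_t(slice) * (order + 1) * channels +
             std::size_t(coil)  * (order + 1);
    }

    // Evaluate the fitted phase polynomial at the given projection angle.
    static double estimated_phase(const std::vector<double>& polyfit,
                                  std::size_t offset, unsigned int order,
                                  double angle)
    {
      double phase = 0.0;
      for (unsigned int i = 0; i <= order; i++)
        phase += polyfit[offset + i] * std::pow(angle, double(i));
      return phase;
    }

    int main()
    {
      const unsigned int order = 6;  // matches the default "order" property
      const int channels = 2, slices = 1, sets = 1;
      std::vector<double> polyfit(std::size_t(sets) * (order + 1) * channels * slices, 0.0);
      polyfit[1] = 0.1;              // pretend coil 0 has a small linear phase term
      double angle = 1.9416;         // about one mode-2 golden-angle step, in radians
      std::printf("estimated phase: %.4f rad\n",
                  estimated_phase(polyfit,
                                  coeff_offset(0, 0, 0, order, channels, slices),
                                  order, angle));
      return 0;
    }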
diff --git a/gadgets/radial/config/CMakeLists.txt b/gadgets/radial/config/CMakeLists.txt
new file mode 100644
index 0000000..f571c76
--- /dev/null
+++ b/gadgets/radial/config/CMakeLists.txt
@@ -0,0 +1,33 @@
+if (ARMADILLO_FOUND)
+  install (FILES 
+    fixed_radial_mode0_realtime.xml 
+    fixed_radial_mode1_realtime.xml 
+    golden_radial_mode2_realtime.xml 
+    fixed_radial_mode0_gpusense_cg.xml 
+    fixed_radial_mode1_gpusense_cg.xml 
+    golden_radial_mode2_gpusense_cg.xml 
+    golden_radial_mode3_gpusense_cg.xml 
+    fixed_radial_mode0_gpusense_sb.xml 
+    fixed_radial_mode1_gpusense_sb.xml 
+    golden_radial_mode2_gpusense_sb.xml
+    golden_radial_mode2_gpusense_nlcg.xml  
+    golden_radial_mode3_gpusense_sb.xml 
+    fixed_radial_mode0_gpu_ktsense.xml 
+    fixed_radial_mode1_gpu_ktsense.xml 
+    golden_radial_mode2_gpu_ktsense.xml 
+    golden_radial_mode3_os_realtime.xml
+    spirit.xml
+    DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH} COMPONENT main)
+else (ARMADILLO_FOUND)
+  MESSAGE("Armadillo not found, only unoptimized radial config files will be available")
+endif (ARMADILLO_FOUND)
+
+install (FILES 
+  fixed_radial_mode0_gpusense_cg_unoptimized.xml 
+  fixed_radial_mode1_gpusense_cg_unoptimized.xml 
+  golden_radial_mode2_gpusense_cg_unoptimized.xml 
+  golden_radial_mode2_gpusense_nlcg_unoptimized.xml
+  fixed_radial_mode0_gpusense_sb_unoptimized.xml 
+  fixed_radial_mode1_gpusense_sb_unoptimized.xml 
+  golden_radial_mode2_gpusense_sb_unoptimized.xml 
+  DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH} COMPONENT main)
diff --git a/gadgets/radial/config/fixed_radial_mode0_gpu_ktsense.xml b/gadgets/radial/config/fixed_radial_mode0_gpu_ktsense.xml
new file mode 100644
index 0000000..6a53161
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode0_gpu_ktsense.xml
@@ -0,0 +1,131 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>0</value></property>
+      <property><name>rotations_per_reconstruction</name><value>8</value></property>
+      <property><name>buffer_length_in_rotations</name><value>8</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+      <property><name>sliding_window_rotations</name><value>4</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgKtSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgKtSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>rotations_to_discard</name>    <value>4</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgKtSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgKtSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>rotations_to_discard</name>    <value>4</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgKtSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgKtSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>rotations_to_discard</name>    <value>4</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/fixed_radial_mode0_gpusense_cg.xml b/gadgets/radial/config/fixed_radial_mode0_gpusense_cg.xml
new file mode 100644
index 0000000..9a85158
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode0_gpusense_cg.xml
@@ -0,0 +1,127 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>0</value></property>
+      <property><name>rotations_per_reconstruction</name><value>4</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/fixed_radial_mode0_gpusense_cg_unoptimized.xml b/gadgets/radial/config/fixed_radial_mode0_gpusense_cg_unoptimized.xml
new file mode 100644
index 0000000..b10d18b
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode0_gpusense_cg_unoptimized.xml
@@ -0,0 +1,114 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget_unoptimized</classname>
+    </gadget>
+
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>0</value></property>
+      <property><name>rotations_per_reconstruction</name><value>4</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/fixed_radial_mode0_gpusense_sb.xml b/gadgets/radial/config/fixed_radial_mode0_gpusense_sb.xml
new file mode 100644
index 0000000..e06686c
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode0_gpusense_sb.xml
@@ -0,0 +1,137 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>0</value></property>
+      <property><name>rotations_per_reconstruction</name><value>4</value></property>
+      <property><name>buffer_using_solver</name><value>true</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuSbSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/fixed_radial_mode0_gpusense_sb_unoptimized.xml b/gadgets/radial/config/fixed_radial_mode0_gpusense_sb_unoptimized.xml
new file mode 100644
index 0000000..0876019
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode0_gpusense_sb_unoptimized.xml
@@ -0,0 +1,124 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget_unoptimized</classname>
+    </gadget>
+
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>0</value></property>
+      <property><name>rotations_per_reconstruction</name><value>4</value></property>
+      <property><name>buffer_using_solver</name><value>true</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuSbSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/fixed_radial_mode0_realtime.xml b/gadgets/radial/config/fixed_radial_mode0_realtime.xml
new file mode 100644
index 0000000..be8e53b
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode0_realtime.xml
@@ -0,0 +1,123 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>8</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>0</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>5</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>5</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>5</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/fixed_radial_mode1_gpu_ktsense.xml b/gadgets/radial/config/fixed_radial_mode1_gpu_ktsense.xml
new file mode 100644
index 0000000..f4e8fda
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode1_gpu_ktsense.xml
@@ -0,0 +1,131 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>1</value></property>
+      <property><name>rotations_per_reconstruction</name><value>4</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+      <property><name>sliding_window_rotations</name><value>2</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgKtSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgKtSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>rotations_to_discard</name>    <value>2</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgKtSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgKtSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>rotations_to_discard</name>    <value>2</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgKtSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgKtSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>rotations_to_discard</name>    <value>2</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/fixed_radial_mode1_gpusense_cg.xml b/gadgets/radial/config/fixed_radial_mode1_gpusense_cg.xml
new file mode 100644
index 0000000..662c59d
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode1_gpusense_cg.xml
@@ -0,0 +1,127 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>1</value></property>
+      <property><name>rotations_per_reconstruction</name><value>4</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/fixed_radial_mode1_gpusense_cg_unoptimized.xml b/gadgets/radial/config/fixed_radial_mode1_gpusense_cg_unoptimized.xml
new file mode 100644
index 0000000..01004f7
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode1_gpusense_cg_unoptimized.xml
@@ -0,0 +1,114 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget_unoptimized</classname>
+    </gadget>
+
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>1</value></property>
+      <property><name>rotations_per_reconstruction</name><value>4</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/fixed_radial_mode1_gpusense_sb.xml b/gadgets/radial/config/fixed_radial_mode1_gpusense_sb.xml
new file mode 100644
index 0000000..f2f0522
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode1_gpusense_sb.xml
@@ -0,0 +1,137 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>1</value></property>
+      <property><name>rotations_per_reconstruction</name><value>4</value></property>
+      <property><name>buffer_using_solver</name><value>true</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuSbSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/fixed_radial_mode1_gpusense_sb_unoptimized.xml b/gadgets/radial/config/fixed_radial_mode1_gpusense_sb_unoptimized.xml
new file mode 100644
index 0000000..3e1dcbb
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode1_gpusense_sb_unoptimized.xml
@@ -0,0 +1,124 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget_unoptimized</classname>
+    </gadget>
+
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>1</value></property>
+      <property><name>rotations_per_reconstruction</name><value>4</value></property>
+      <property><name>buffer_using_solver</name><value>true</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuSbSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/fixed_radial_mode1_realtime.xml b/gadgets/radial/config/fixed_radial_mode1_realtime.xml
new file mode 100644
index 0000000..597b3ff
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode1_realtime.xml
@@ -0,0 +1,123 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>8</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>1</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>5</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>5</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>5</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/golden_radial_mode2_gpu_ktsense.xml b/gadgets/radial/config/golden_radial_mode2_gpu_ktsense.xml
new file mode 100644
index 0000000..04187f8
--- /dev/null
+++ b/gadgets/radial/config/golden_radial_mode2_gpu_ktsense.xml
@@ -0,0 +1,133 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>2</value></property>
+      <property><name>profiles_per_frame</name><value>16</value></property>
+      <property><name>rotations_per_reconstruction</name><value>32</value></property>
+      <property><name>buffer_frames_per_rotation</name><value>32</value></property>
+      <property><name>buffer_length_in_rotations</name><value>1</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+      <property><name>sliding_window_rotations</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgKtSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgKtSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name><value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>25</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>rotations_to_discard</name>    <value>16</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgKtSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgKtSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>25</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>rotations_to_discard</name>    <value>16</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgKtSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgKtSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>25</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>rotations_to_discard</name>    <value>16</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/golden_radial_mode2_gpusense_cg.xml b/gadgets/radial/config/golden_radial_mode2_gpusense_cg.xml
new file mode 100644
index 0000000..d042251
--- /dev/null
+++ b/gadgets/radial/config/golden_radial_mode2_gpusense_cg.xml
@@ -0,0 +1,129 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>2</value></property>
+      <property><name>profiles_per_frame</name><value>16</value></property>
+      <property><name>rotations_per_reconstruction</name><value>16</value></property>
+      <property><name>buffer_frames_per_rotation</name><value>16</value></property>
+      <property><name>buffer_length_in_rotations</name><value>2</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name><value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/golden_radial_mode2_gpusense_cg_unoptimized.xml b/gadgets/radial/config/golden_radial_mode2_gpusense_cg_unoptimized.xml
new file mode 100644
index 0000000..557611f
--- /dev/null
+++ b/gadgets/radial/config/golden_radial_mode2_gpusense_cg_unoptimized.xml
@@ -0,0 +1,116 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget_unoptimized</classname>
+    </gadget>
+
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>2</value></property>
+      <property><name>profiles_per_frame</name><value>16</value></property>
+      <property><name>rotations_per_reconstruction</name><value>16</value></property>
+      <property><name>buffer_frames_per_rotation</name><value>16</value></property>
+      <property><name>buffer_length_in_rotations</name><value>2</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name><value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/golden_radial_mode2_gpusense_nlcg.xml b/gadgets/radial/config/golden_radial_mode2_gpusense_nlcg.xml
new file mode 100644
index 0000000..4de8ece
--- /dev/null
+++ b/gadgets/radial/config/golden_radial_mode2_gpusense_nlcg.xml
@@ -0,0 +1,132 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>2</value></property>
+      <property><name>profiles_per_frame</name><value>16</value></property>
+      <property><name>rotations_per_reconstruction</name><value>32</value></property>
+      <property><name>buffer_using_solver</name><value>true</value></property>
+      <property><name>buffer_frames_per_rotation</name><value>32</value></property>
+      <property><name>buffer_length_in_rotations</name><value>1</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuNlcgSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuNlcgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_cg_iterations</name> <value>50</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>lambda</name>                  <value>1e-6</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+    <gadget>
+      <name>gpuNlcgSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuNlcgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_cg_iterations</name> <value>50</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>lambda</name>                  <value>1e-6</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuNlcgSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuNlcgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_cg_iterations</name> <value>50</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>lambda</name>                  <value>1e-6</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/golden_radial_mode2_gpusense_nlcg_unoptimized.xml b/gadgets/radial/config/golden_radial_mode2_gpusense_nlcg_unoptimized.xml
new file mode 100644
index 0000000..0fb4f95
--- /dev/null
+++ b/gadgets/radial/config/golden_radial_mode2_gpusense_nlcg_unoptimized.xml
@@ -0,0 +1,120 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget_unoptimized</classname>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>2</value></property>
+      <property><name>profiles_per_frame</name><value>16</value></property>
+      <property><name>rotations_per_reconstruction</name><value>32</value></property>
+      <property><name>buffer_using_solver</name><value>true</value></property>
+      <property><name>buffer_frames_per_rotation</name><value>32</value></property>
+      <property><name>buffer_length_in_rotations</name><value>1</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuNlcgSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuNlcgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_cg_iterations</name> <value>30</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>lambda</name>                  <value>0.01</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuNlcgSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuNlcgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_cg_iterations</name> <value>30</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>lambda</name>                  <value>0.01</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuNlcgSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuNlcgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_cg_iterations</name> <value>30</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>lambda</name>                  <value>0.01</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+    </gadget>
+    
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/golden_radial_mode2_gpusense_sb.xml b/gadgets/radial/config/golden_radial_mode2_gpusense_sb.xml
new file mode 100644
index 0000000..a697874
--- /dev/null
+++ b/gadgets/radial/config/golden_radial_mode2_gpusense_sb.xml
@@ -0,0 +1,139 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>2</value></property>
+      <property><name>profiles_per_frame</name><value>16</value></property>
+      <property><name>rotations_per_reconstruction</name><value>32</value></property>
+      <property><name>buffer_using_solver</name><value>true</value></property>
+      <property><name>buffer_frames_per_rotation</name><value>32</value></property>
+      <property><name>buffer_length_in_rotations</name><value>1</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuSbSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/golden_radial_mode2_gpusense_sb_unoptimized.xml b/gadgets/radial/config/golden_radial_mode2_gpusense_sb_unoptimized.xml
new file mode 100644
index 0000000..8fff4fb
--- /dev/null
+++ b/gadgets/radial/config/golden_radial_mode2_gpusense_sb_unoptimized.xml
@@ -0,0 +1,126 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget_unoptimized</classname>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>2</value></property>
+      <property><name>profiles_per_frame</name><value>16</value></property>
+      <property><name>rotations_per_reconstruction</name><value>32</value></property>
+      <property><name>buffer_using_solver</name><value>true</value></property>
+      <property><name>buffer_frames_per_rotation</name><value>32</value></property>
+      <property><name>buffer_length_in_rotations</name><value>1</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuSbSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/golden_radial_mode2_realtime.xml b/gadgets/radial/config/golden_radial_mode2_realtime.xml
new file mode 100644
index 0000000..a0e977a
--- /dev/null
+++ b/gadgets/radial/config/golden_radial_mode2_realtime.xml
@@ -0,0 +1,124 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>8</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>2</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>profiles_per_frame</name><value>32</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
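+    <!-- One gpuCgSenseGadget instance per slice (sliceno 0-2), each running a short
+         (5-iteration) conjugate-gradient SENSE reconstruction suited for real-time use. -->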
+    
+    <gadget>
+      <name>gpuCgSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name><value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>5</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name><value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>5</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name><value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>5</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+    </gadget>
+
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/golden_radial_mode3_gpusense_cg.xml b/gadgets/radial/config/golden_radial_mode3_gpusense_cg.xml
new file mode 100644
index 0000000..a0ce4c9
--- /dev/null
+++ b/gadgets/radial/config/golden_radial_mode3_gpusense_cg.xml
@@ -0,0 +1,129 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>3</value></property>
+      <property><name>profiles_per_frame</name><value>32</value></property>
+      <property><name>rotations_per_reconstruction</name><value>8</value></property>
+      <property><name>buffer_frames_per_rotation</name><value>8</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name><value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>40</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>40</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>40</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/golden_radial_mode3_gpusense_sb.xml b/gadgets/radial/config/golden_radial_mode3_gpusense_sb.xml
new file mode 100644
index 0000000..6f82dd9
--- /dev/null
+++ b/gadgets/radial/config/golden_radial_mode3_gpusense_sb.xml
@@ -0,0 +1,139 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>3</value></property>
+      <property><name>profiles_per_frame</name><value>16</value></property>
+      <property><name>rotations_per_reconstruction</name><value>32</value></property>
+      <property><name>buffer_using_solver</name><value>true</value></property>
+      <property><name>buffer_frames_per_rotation</name><value>32</value></property>
+      <property><name>buffer_length_in_rotations</name><value>1</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuSbSenseGadget_slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/golden_radial_mode3_os_realtime.xml b/gadgets/radial/config/golden_radial_mode3_os_realtime.xml
new file mode 100644
index 0000000..dddfa4b
--- /dev/null
+++ b/gadgets/radial/config/golden_radial_mode3_os_realtime.xml
@@ -0,0 +1,162 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1022</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriter</classname>
+    </writer>
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>8</value></property>
+    </gadget>
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>3</value></property>
+      <property><name>profiles_per_frame</name><value>32</value></property>
+      <property><name>rotations_per_reconstruction</name><value>32</value></property>
+      <property><name>buffer_frames_per_rotation</name><value>32</value></property>
+      <property><name>buffer_length_in_rotations</name><value>1</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <!--gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>3</value></property>
+      <property><name>profiles_per_frame</name><value>16</value></property>
+      <property><name>rotations_per_reconstruction</name><value>16</value></property>
+      <property><name>buffer_frames_per_rotation</name><value>16</value></property>
+      <property><name>buffer_length_in_rotations</name><value>2</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget-->
+    
+    <gadget>
+      <name>slice0</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuLALMSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name><value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>20</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>coils_per_subset</name>        <value>4</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+      <property><name>lambda</name><value>3</value></property>
+      <property><name>huber_value</name><value>1</value></property>
+    </gadget>
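+    <!-- The two gadgets below reuse the parameters of the gadget named "slice0" above:
+         values of the form "parameter@slice0" appear to be resolved against that gadget's properties. -->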
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice1</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuLALMSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>number_of_iterations@slice0</value></property>
+      <property><name>oversampling_factor</name>     <value>oversampling_factor@slice0</value></property>
+      <property><name>kernel_width</name>            <value>kernel_width@slice0</value></property>
+      <property><name>coils_per_subset</name>        <value>coils_per_subset@slice0</value></property>
+      <property><name>output_convergence</name><value>output_convergence@slice0</value></property>
+      <property><name>lambda</name><value>lambda@slice0</value></property>
+      <property><name>huber_value</name><value>huber_value@slice0</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice2</name>
+      <dll>gadgetron_gpuparallelmri</dll>
+      <classname>gpuLALMSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>number_of_iterations@slice0</value></property>
+      <property><name>oversampling_factor</name>     <value>oversampling_factor@slice0</value></property>
+      <property><name>kernel_width</name>            <value>kernel_width@slice0</value></property>
+      <property><name>coils_per_subset</name>        <value>coils_per_subset@slice0</value></property>
+      <property><name>output_convergence</name><value>output_convergence@slice0</value></property>
+      <property><name>lambda</name><value>lambda@slice0</value></property>
+      <property><name>huber_value</name><value>huber_value@slice0</value></property>
+    </gadget>
+
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <!--gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget-->
+    
+    <!--
+	<gadget>
+	<name>ImageFinishCPLX</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetCPLX</classname>
+	</gadget>
+    -->
+    
+    <!--
+	<gadget>
+	<name>ImageFinishFLOAT</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>ImageFinish</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadget</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/spirit.xml b/gadgets/radial/config/spirit.xml
new file mode 100644
index 0000000..f4d2cb6
--- /dev/null
+++ b/gadgets/radial/config/spirit.xml
@@ -0,0 +1,96 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+                              xmlns="http://gadgetron.sf.net/gadgetron"
+                              xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  <writer>
+    <slot>1022</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriter</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+<!--  
+  <gadget>
+    <name>gpuRadialSpiritPrepGadget</name>
+    <dll>gadgetron_gpuradial</dll>
+    <classname>gpuRadialSpiritPrepGadget</classname>
+    <property><name>deviceno</name><value>0</value></property>
+    <property><name>mode</name><value>3</value></property>
+    <property><name>profiles_per_frame</name><value>16</value></property>
+    <property><name>rotations_per_reconstruction</name><value>16</value></property>
+    <property><name>buffer_frames_per_rotation</name><value>16</value></property>
+    <property><name>buffer_length_in_rotations</name><value>2</value></property>
+    <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+    <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+    <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+    <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+  </gadget>
+-->
+
+    <gadget>
+      <name>gpuRadialSpiritPrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSpiritPrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>3</value></property>
+      <property><name>profiles_per_frame</name><value>16</value></property>
+      <property><name>buffer_frames_per_rotation</name><value>4</value></property>
+      <property><name>buffer_length_in_rotations</name><value>8</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+      <property><name>number_of_iterations</name><value>25</value></property>
+    </gadget>
+
+  <gadget>
+    <name>gpuCgSpiritGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuCgSpiritGadget</classname>
+    <property><name>pass_on_undesired_data</name><value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>sliceno</name>                 <value>0</value></property>
+    <property><name>number_of_iterations</name>    <value>40</value></property>
+    <property><name>cg_limit</name>                <value>1e-9</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.0</value></property>
+    <property><name>output_convergence</name>      <value>true</value></property>
+  </gadget>
+
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>ImageFinish</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadget</classname>
+  </gadget>
+  
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/gadgetron_radial_export.h b/gadgets/radial/gadgetron_radial_export.h
new file mode 100644
index 0000000..ee08991
--- /dev/null
+++ b/gadgets/radial/gadgetron_radial_export.h
@@ -0,0 +1,14 @@
+#ifndef GADGETRON_RADIAL_EXPORT_H_
+#define GADGETRON_RADIAL_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_RADIAL__)
+#define EXPORTGADGETS_RADIAL __declspec(dllexport)
+#else
+#define EXPORTGADGETS_RADIAL __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETS_RADIAL
+#endif
+
+#endif /* GADGETRON_RADIAL_EXPORT_H_ */
diff --git a/gadgets/radial/gpuRadialPrepGadget.cpp b/gadgets/radial/gpuRadialPrepGadget.cpp
new file mode 100644
index 0000000..e967fa9
--- /dev/null
+++ b/gadgets/radial/gpuRadialPrepGadget.cpp
@@ -0,0 +1,939 @@
+#include "gpuRadialPrepGadget.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "GenericReconJob.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+#include "vector_td_operators.h"
+#include "GPUTimer.h"
+#include "check_CUDA.h"
+#include "radial_utilities.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDArray_fileio.h"
+#include "ismrmrd/xml.h"
+
+#include <algorithm>
+#include <vector>
+#include <cmath>
+
+namespace Gadgetron{
+
+  gpuRadialPrepGadget::gpuRadialPrepGadget()
+    : slices_(-1)
+    , sets_(-1)
+    , device_number_(-1)
+    , mode_(-1)
+    , samples_per_profile_(-1)
+  {
+  }
+  
+  gpuRadialPrepGadget::~gpuRadialPrepGadget() {}
+  
+  int gpuRadialPrepGadget::process_config(ACE_Message_Block* mb)
+  {
+    //GDEBUG("gpuRadialPrepGadget::process_config\n");
+
+    // Get configuration values from config file
+    //
+
+    mode_ = mode.value();
+    device_number_ = deviceno.value();
+    rotations_per_reconstruction_ = rotations_per_reconstruction.value();
+    buffer_length_in_rotations_ = buffer_length_in_rotations.value();
+    buffer_using_solver_ = buffer_using_solver.value();
+    output_timing_ = output_timing.value();
+
+    // Currently there are some restrictions on the allowed sliding window configurations
+    //
+    
+    sliding_window_profiles_ = sliding_window_profiles.value();
+    sliding_window_rotations_ = sliding_window_rotations.value();
+
+    if( sliding_window_profiles_>0 && sliding_window_rotations_>0 ){
+      GDEBUG( "Error: Sliding window reconstruction is not yet supported for both profiles and frames simultaneously.\n" );
+      return GADGET_FAIL;
+    }
+
+    if( sliding_window_profiles_>0 && rotations_per_reconstruction_>0 ){
+      GDEBUG( "Error: Sliding window reconstruction over profiles is not yet supported for multiframe reconstructions.\n" );
+      return GADGET_FAIL;
+    }
+    
+    if( sliding_window_rotations_ > 0 && sliding_window_rotations_ >= rotations_per_reconstruction_ ){
+      GDEBUG( "Error: Illegal sliding window configuration.\n" );
+      return GADGET_FAIL;
+    }
+
+    // Setup and validate device configuration
+    //
+
+    int number_of_devices;
+    if (cudaGetDeviceCount(&number_of_devices)!= cudaSuccess) {
+      GDEBUG( "Error: unable to query number of CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    if (number_of_devices == 0) {
+      GDEBUG( "Error: No available CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    if (device_number_ >= number_of_devices) {
+      GDEBUG("Adjusting device number from %d to %d\n", device_number_,  (device_number_%number_of_devices));
+      device_number_ = (device_number_%number_of_devices);
+    }
+
+    if (cudaSetDevice(device_number_)!= cudaSuccess) {
+      GDEBUG( "Error: unable to set CUDA device.\n" );
+      return GADGET_FAIL;
+    }
+
+    cudaDeviceProp deviceProp;
+    if( cudaGetDeviceProperties( &deviceProp, device_number_ ) != cudaSuccess) {
+      GDEBUG( "Error: unable to query device properties.\n" );
+      return GADGET_FAIL;
+    }
+    
+    unsigned int warp_size = deviceProp.warpSize;
+
+    // Convolution kernel width and oversampling ratio (for the buffer)
+    //
+
+    kernel_width_ = buffer_convolution_kernel_width.value();
+    oversampling_factor_ = buffer_convolution_oversampling_factor.value();
+
+    // Get the Ismrmrd header
+    //
+    ISMRMRD::IsmrmrdHeader h;
+    ISMRMRD::deserialize(mb->rd_ptr(),h);
+    
+    
+    if (h.encoding.size() != 1) {
+      GDEBUG("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+    
+    // Get the encoding space and trajectory description
+    ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+    ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+    ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+    ISMRMRD::TrajectoryDescription traj_desc;
+    // Matrix sizes (as a multiple of the GPU's warp size)
+    //
+    
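+    // ((n + warp_size - 1) / warp_size) * warp_size rounds n up to the nearest multiple of the warp size.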
+    image_dimensions_.push_back(((e_space.matrixSize.x+warp_size-1)/warp_size)*warp_size);
+    image_dimensions_.push_back(((e_space.matrixSize.y+warp_size-1)/warp_size)*warp_size);
+
+    image_dimensions_recon_.push_back(((static_cast<unsigned int>(std::ceil(e_space.matrixSize.x*reconstruction_os_factor_x.value()))+warp_size-1)/warp_size)*warp_size);  
+    image_dimensions_recon_.push_back(((static_cast<unsigned int>(std::ceil(e_space.matrixSize.y*reconstruction_os_factor_y.value()))+warp_size-1)/warp_size)*warp_size);
+    
+    image_dimensions_recon_os_ = uint64d2
+      (((static_cast<unsigned int>(std::ceil(image_dimensions_recon_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size,
+       ((static_cast<unsigned int>(std::ceil(image_dimensions_recon_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size);
+    
+    // In case the warp_size constraint kicked in
+    oversampling_factor_ = float(image_dimensions_recon_os_[0])/float(image_dimensions_recon_[0]); 
+    
+    GDEBUG("matrix_size_x : %d, recon: %d, recon_os: %d\n", 
+                  image_dimensions_[0], image_dimensions_recon_[0], image_dimensions_recon_os_[0]);
+
+    GDEBUG("matrix_size_y : %d, recon: %d, recon_os: %d\n", 
+                  image_dimensions_[1], image_dimensions_recon_[1], image_dimensions_recon_os_[1]);
+    
+    fov_.push_back(r_space.fieldOfView_mm.x);
+    fov_.push_back(r_space.fieldOfView_mm.y);
+    fov_.push_back(r_space.fieldOfView_mm.z);
+
+    slices_ = e_limits.slice ? e_limits.slice->maximum + 1 : 1;
+    sets_ = e_limits.set ? e_limits.set->maximum + 1 : 1;
+    
+    // Allocate profile queues
+    // - one queue for the currently incoming frame
+    // - one queue for the next reconstruction
+
+    frame_profiles_queue_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+    recon_profiles_queue_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+    image_headers_queue_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+
+    size_t bsize = sizeof(GadgetContainerMessage< hoNDArray< std::complex<float> > >)*image_dimensions_[0]*10;
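+    // Water marks sized as a rough upper bound: room for ~10x image_dimensions_[0] profile container messages per queue.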
+
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+      frame_profiles_queue_[i].high_water_mark(bsize);
+      frame_profiles_queue_[i].low_water_mark(bsize);
+    }
+    
+    bsize *= (rotations_per_reconstruction_+1);
+    
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+      recon_profiles_queue_[i].high_water_mark(bsize);
+      recon_profiles_queue_[i].low_water_mark(bsize);
+    }
+
+    // Define some profile counters for book-keeping
+    //
+
+    previous_profile_ = boost::shared_array<long>(new long[slices_*sets_]);
+    image_counter_ = boost::shared_array<long>(new long[slices_*sets_]);
+    profiles_counter_frame_= boost::shared_array<long>(new long[slices_*sets_]);
+    profiles_counter_global_= boost::shared_array<long>(new long[slices_*sets_]);
+    profiles_per_frame_= boost::shared_array<long>(new long[slices_*sets_]);
+    frames_per_rotation_= boost::shared_array<long>(new long[slices_*sets_]);
+    buffer_frames_per_rotation_= boost::shared_array<long>(new long[slices_*sets_]);
+    buffer_update_needed_ = boost::shared_array<bool>(new bool[slices_*sets_]);
+    reconfigure_ = boost::shared_array<bool>(new bool[slices_*sets_]);
+    num_coils_ = boost::shared_array<unsigned int>(new unsigned int[slices_*sets_]);
+    
+    if( !previous_profile_.get() ||
+        !image_counter_.get() || 
+        !profiles_counter_frame_.get() ||
+        !profiles_counter_global_.get() ||
+        !profiles_per_frame_.get() || 
+        !frames_per_rotation_.get() ||
+        !buffer_frames_per_rotation_.get() ||
+        !buffer_update_needed_.get() ||
+        !num_coils_.get() ||
+        !reconfigure_ ){
+      GDEBUG("Failed to allocate host memory (1)\n");
+      return GADGET_FAIL;
+    }
+
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+
+      previous_profile_[i] = -1;
+      image_counter_[i] = 0;
+      profiles_counter_frame_[i] = 0;
+      profiles_counter_global_[i] = 0;
+      profiles_per_frame_[i] = profiles_per_frame.value();
+      frames_per_rotation_[i] = frames_per_rotation.value();
+      buffer_frames_per_rotation_[i] = buffer_frames_per_rotation.value();
+      num_coils_[i] = 0;
+      buffer_update_needed_[i] = true;
+      reconfigure_[i] = true;
+
+      // Assign some default values ("upper bound estimates") of the (possibly) unknown entities
+      //
+      
+      if( profiles_per_frame_[i] == 0 ){
+        profiles_per_frame_[i] = image_dimensions_[0];
+      }
+      
+      if( frames_per_rotation_[i] == 0 ){
+        if( mode_ == 2 || mode_ == 3 ) // golden ratio
+          frames_per_rotation_[i] = 1;
+        else
+          frames_per_rotation_[i] = image_dimensions_[0]/profiles_per_frame_[i];
+      }
+
+      bsize = sizeof(GadgetContainerMessage<ISMRMRD::ImageHeader>)*100*
+        std::max(1L, frames_per_rotation_[i]*rotations_per_reconstruction_);
+    
+      image_headers_queue_[i].high_water_mark(bsize);
+      image_headers_queue_[i].low_water_mark(bsize);
+    }
+        
+    position_ = boost::shared_array<float[3]>(new float[slices_*sets_][3]);
+    read_dir_ = boost::shared_array<float[3]>(new float[slices_*sets_][3]);
+    phase_dir_ = boost::shared_array<float[3]>(new float[slices_*sets_][3]);
+    slice_dir_ = boost::shared_array<float[3]>(new float[slices_*sets_][3]);
+
+    if( !position_.get() || !read_dir_.get() || !phase_dir_.get() || !slice_dir_.get() ){
+      GDEBUG("Failed to allocate host memory (2)\n");
+      return GADGET_FAIL;
+    }
+
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+      (position_[i])[0] = (position_[i])[1] = (position_[i])[2] = 0.0f;
+      (read_dir_[i])[0] = (read_dir_[i])[1] = (read_dir_[i])[2] = 0.0f;
+      (phase_dir_[i])[0] = (phase_dir_[i])[1] = (phase_dir_[i])[2] = 0.0f;
+      (slice_dir_[i])[0] = (slice_dir_[i])[1] = (slice_dir_[i])[2] = 0.0f;
+    }
+
+    // Allocate accumulation buffer
+    //
+
+    allocate_accumulation_buffer( slices_*sets_ );
+    
+    // Allocate remaining shared_arrays
+    //
+    
+    csm_host_ = boost::shared_array< hoNDArray<float_complext> >(new hoNDArray<float_complext>[slices_*sets_]);
+    reg_host_ = boost::shared_array< hoNDArray<float_complext> >(new hoNDArray<float_complext>[slices_*sets_]);
+
+    host_traj_recon_ = boost::shared_array< hoNDArray<floatd2> >(new hoNDArray<floatd2>[slices_*sets_]);
+    host_weights_recon_ = boost::shared_array< hoNDArray<float> >(new hoNDArray<float>[slices_*sets_]);
+
+    if( !csm_host_.get() || !reg_host_.get() || !host_traj_recon_.get() || !host_weights_recon_ ){
+      GDEBUG("Failed to allocate host memory (3)\n");
+      return GADGET_FAIL;
+    }
+
+    return GADGET_OK;
+  }
+
+  int gpuRadialPrepGadget::
+  process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *m1,
+          GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2)
+  {
+    // Noise should have been consumed by the noise adjust (if in the gadget chain)
+    //
+    
+    bool is_noise = m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_IS_NOISE_MEASUREMENT);
+    if (is_noise) { 
+      m1->release();
+      return GADGET_OK;
+    }
+
+    unsigned int profile = m1->getObjectPtr()->idx.kspace_encode_step_1;
+    unsigned int slice = m1->getObjectPtr()->idx.slice;
+    unsigned int set = m1->getObjectPtr()->idx.set;
+
+    // Only when the first profile arrives, do we know the #samples/profile
+    //
+
+    if( samples_per_profile_ == -1 )      
+      samples_per_profile_ = m1->getObjectPtr()->number_of_samples;
+    
+    if( samples_per_profile_ != m1->getObjectPtr()->number_of_samples ){
+      GDEBUG("Unexpected change in the incoming profiles' lengths\n");
+      return GADGET_FAIL;
+    }
+    
+    //GDEBUG("gpuRadialPrepGadget::process\n");
+
+    boost::shared_ptr<GPUTimer> process_timer;
+    if( output_timing_ )
+      process_timer = boost::shared_ptr<GPUTimer>( new GPUTimer("gpuRadialPrepGadget::process()") );
+
+    // Reconfigure at first pass
+    // - or if the number of coils changes
+    // - or if the reconfigure_ flag is set
+
+    if( num_coils_[set*slices_+slice] != m1->getObjectPtr()->active_channels ){
+      GDEBUG("Reconfiguring due to change in the number of coils\n");
+      num_coils_[set*slices_+slice] = m1->getObjectPtr()->active_channels;
+      reconfigure(set, slice);
+    }
+
+    if( reconfigure_[set*slices_+slice] ){
+      GDEBUG("Reconfiguring due to boolean indicator\n");
+      reconfigure(set, slice);
+    }
+
+    // Get a pointer to the accumulation buffer. 
+    //
+    
+    cuBuffer<float,2> *acc_buffer = get_buffer_ptr(set*slices_+slice);
+
+    // Have the imaging plane changed?
+    //
+
+    if( !vec_equal(position_[set*slices_+slice], m1->getObjectPtr()->position) ||
+        !vec_equal(read_dir_[set*slices_+slice], m1->getObjectPtr()->read_dir) || 
+        !vec_equal(phase_dir_[set*slices_+slice], m1->getObjectPtr()->phase_dir) ||
+        !vec_equal(slice_dir_[set*slices_+slice], m1->getObjectPtr()->slice_dir) ){
+      
+      // Yes indeed, clear the accumulation buffer
+      acc_buffer->clear();
+      buffer_update_needed_[set*slices_+slice] = true;
+      
+      memcpy(position_[set*slices_+slice],m1->getObjectPtr()->position,3*sizeof(float));
+      memcpy(read_dir_[set*slices_+slice],m1->getObjectPtr()->read_dir,3*sizeof(float));
+      memcpy(phase_dir_[set*slices_+slice],m1->getObjectPtr()->phase_dir,3*sizeof(float));
+      memcpy(slice_dir_[set*slices_+slice],m1->getObjectPtr()->slice_dir,3*sizeof(float));
+    }
+        
+    bool new_frame_detected = false;
+
+    // Keep track of the incoming profile ids (mode dependent)
+    // - to determine the number of profiles per frame
+    // - to determine the number of frames per rotation
+    //
+
+    if (previous_profile_[set*slices_+slice] >= 0) {
+
+      if ( profile > previous_profile_[set*slices_+slice]) { // this is not the last profile in the frame
+        if( mode_ == 0 && frames_per_rotation.value() == 0 ){
+          unsigned int acceleration_factor = profile - previous_profile_[set*slices_+slice];
+          if( acceleration_factor != frames_per_rotation_[set*slices_+slice] ){
+            GDEBUG("Reconfiguring due to change in acceleration factor\n");
+            frames_per_rotation_[set*slices_+slice] = acceleration_factor;
+            reconfigure(set, slice);
+          }
+        }
+      }
+      else{ // This is the first profile in a new frame
+        if( profiles_per_frame.value() == 0 && // make sure the user did not specify a desired value for this variable
+            profiles_counter_frame_[set*slices_+slice] > 0 &&
+            profiles_counter_frame_[set*slices_+slice] != profiles_per_frame_[set*slices_+slice] ){ // a new acceleration factor is detected
+          GDEBUG("Reconfiguring due to new slice detection\n");
+          new_frame_detected = true;
+          profiles_per_frame_[set*slices_+slice] = profiles_counter_frame_[set*slices_+slice];
+          if( mode_ == 1 && frames_per_rotation.value() == 0 )
+            frames_per_rotation_[set*slices_+slice] = image_dimensions_[0]/profiles_per_frame_[set*slices_+slice];
+          reconfigure(set, slice);
+        }
+      }
+    }
+    previous_profile_[set*slices_+slice] = profile;
+
+    // Enqueue profile
+    // - if 'new_frame_detected' the current profile does not belong to the current frame and we delay enqueuing
+
+    if( !new_frame_detected ) {
+      
+      // Memory handling is easier if we make copies for our internal queues
+      frame_profiles_queue_[set*slices_+slice].enqueue_tail(duplicate_profile(m2));
+      recon_profiles_queue_[set*slices_+slice].enqueue_tail(duplicate_profile(m2));
+    }
+
+    // If the profile is the last of a "true frame" (ignoring any sliding window profiles)
+    // - then update the accumulation buffer
+
+    bool is_last_profile_in_frame = (profiles_counter_frame_[set*slices_+slice] == profiles_per_frame_[set*slices_+slice]-1);
+    is_last_profile_in_frame |= new_frame_detected;
+
+    if( is_last_profile_in_frame ){
+
+      // Extract this frame's samples to update the csm/regularization buffer
+      //
+
+      boost::shared_ptr< hoNDArray<float_complext> > host_samples = 
+        extract_samples_from_queue( &frame_profiles_queue_[set*slices_+slice], false, set, slice );
+
+      if( host_samples.get() == 0x0 ){
+        GDEBUG("Failed to extract frame data from queue\n");
+        return GADGET_FAIL;
+      }
+      
+      cuNDArray<float_complext> samples( host_samples.get() );
+      
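+      // If a new frame was just detected, the current profile belongs to the next frame,
+      // so the offset below points at the last profile of the frame being buffered.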
+      long profile_offset = profiles_counter_global_[set*slices_+slice] - ((new_frame_detected) ? 1 : 0);
+      boost::shared_ptr< cuNDArray<floatd2> > traj = calculate_trajectory_for_frame(profile_offset, set, slice);
+
+      buffer_update_needed_[set*slices_+slice] |= acc_buffer->add_frame_data( &samples, traj.get() );
+    }
+    
+    // Are we ready to reconstruct (downstream)?
+    //
+    
+    long profiles_per_reconstruction = profiles_per_frame_[set*slices_+slice];
+    
+    if( rotations_per_reconstruction_ > 0 )
+      profiles_per_reconstruction *= (frames_per_rotation_[set*slices_+slice]*rotations_per_reconstruction_);
+    
+    bool is_last_profile_in_reconstruction = ( recon_profiles_queue_[set*slices_+slice].message_count() == profiles_per_reconstruction );
+        
+    // Prepare the image header for this frame
+    // - if this is indeed the last profile of a new frame
+    // - or if we are about to reconstruct due to 'sliding_window_profiles_' > 0
+
+    if( is_last_profile_in_frame || 
+        (is_last_profile_in_reconstruction && image_headers_queue_[set*slices_+slice].message_count() == 0) ){
+      
+      GadgetContainerMessage<ISMRMRD::ImageHeader> *header = new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+      ISMRMRD::AcquisitionHeader *base_head = m1->getObjectPtr();
+
+      {
+        // Initialize header to all zeroes (there are a few fields we do not set yet)
+        ISMRMRD::ImageHeader tmp;
+        *(header->getObjectPtr()) = tmp;
+      }
+
+      header->getObjectPtr()->version = base_head->version;
+
+      header->getObjectPtr()->matrix_size[0] = image_dimensions_recon_[0];
+      header->getObjectPtr()->matrix_size[1] = image_dimensions_recon_[1];
+      header->getObjectPtr()->matrix_size[2] = std::max(1L,frames_per_rotation_[set*slices_+slice]*rotations_per_reconstruction_);
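+      // The third matrix dimension carries the temporal frames of this reconstruction (1 when reconstructing frame by frame).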
+
+      header->getObjectPtr()->field_of_view[0] = fov_[0];
+      header->getObjectPtr()->field_of_view[1] = fov_[1];
+      header->getObjectPtr()->field_of_view[2] = fov_[2];
+
+      header->getObjectPtr()->channels = num_coils_[set*slices_+slice];
+      header->getObjectPtr()->slice = base_head->idx.slice;
+      header->getObjectPtr()->set = base_head->idx.set;
+
+      header->getObjectPtr()->acquisition_time_stamp = base_head->acquisition_time_stamp;
+      memcpy(header->getObjectPtr()->physiology_time_stamp, base_head->physiology_time_stamp, sizeof(uint32_t)*ISMRMRD::ISMRMRD_PHYS_STAMPS);
+
+      memcpy(header->getObjectPtr()->position, base_head->position, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->read_dir, base_head->read_dir, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->phase_dir, base_head->phase_dir, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->slice_dir, base_head->slice_dir, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->patient_table_position, base_head->patient_table_position, sizeof(float)*3);
+
+      header->getObjectPtr()->data_type = ISMRMRD::ISMRMRD_CXFLOAT;
+      header->getObjectPtr()->image_index = image_counter_[set*slices_+slice]++; 
+      header->getObjectPtr()->image_series_index = set*slices_+slice;
+
+      image_headers_queue_[set*slices_+slice].enqueue_tail(header);
+    }
+    
+    // If it is time to reconstruct (downstream) then prepare the Sense job
+    // 
+
+    if( is_last_profile_in_reconstruction ){
+      
+      // Update csm and regularization images if the buffer has changed (completed a cycle) 
+      // - and at the first pass
+      
+      if( buffer_update_needed_[set*slices_+slice] || 
+          csm_host_[set*slices_+slice].get_number_of_elements() == 0 || 
+          reg_host_[set*slices_+slice].get_number_of_elements() == 0 ){
+
+        // Compute and set CSM (in derived Sense/Spirit/... class)
+        //
+
+        csm_host_[set*slices_+slice] = *compute_csm( set*slices_+slice );
+	                
+        // Compute regularization image
+        //
+        
+        reg_host_[set*slices_+slice] = *compute_reg( set, slice, new_frame_detected );
+		
+        /*
+          static int counter = 0;
+          char filename[256];
+          sprintf((char*)filename, "_reg_%d.real", counter);
+          write_nd_array<float>( abs(&reg_host_[set*slices_+slice]).get(), filename );
+          counter++; */
+
+        buffer_update_needed_[set*slices_+slice] = false;
+      }
+
+      // Prepare data array of the profiles for the downstream reconstruction
+      //
+      
+      boost::shared_ptr< hoNDArray<float_complext> > samples_host = 
+        extract_samples_from_queue( &recon_profiles_queue_[set*slices_+slice], true, set, slice );
+      
+      if( samples_host.get() == 0x0 ){
+        GDEBUG("Failed to extract frame data from queue\n");
+        return GADGET_FAIL;
+      }
+           
+      // The trajectory needs to be updated on the fly:
+      // - for golden ratio based acquisitions
+      // - when we are reconstructing frame-by-frame
+      
+      if( mode_ == 2 || mode_ == 3 || rotations_per_reconstruction_ == 0 ){
+        calculate_trajectory_for_reconstruction
+          ( profiles_counter_global_[set*slices_+slice] - ((new_frame_detected) ? 1 : 0), set, slice );
+      }
+      
+      // Set up Sense job
+      //
+
+      GadgetContainerMessage< GenericReconJob >* m4 = new GadgetContainerMessage< GenericReconJob >();
+	
+      m4->getObjectPtr()->dat_host_ = samples_host;
+      m4->getObjectPtr()->tra_host_ = boost::shared_ptr< hoNDArray<floatd2> >(new hoNDArray<floatd2>(host_traj_recon_[set*slices_+slice]));
+      m4->getObjectPtr()->dcw_host_ = boost::shared_ptr< hoNDArray<float> >(new hoNDArray<float>(host_weights_recon_[set*slices_+slice]));
+      m4->getObjectPtr()->csm_host_ = boost::shared_ptr< hoNDArray<float_complext> >( new hoNDArray<float_complext>(csm_host_[set*slices_+slice]));
+      m4->getObjectPtr()->reg_host_ = boost::shared_ptr< hoNDArray<float_complext> >( new hoNDArray<float_complext>(reg_host_[set*slices_+slice]));
+
+      // Pull the image headers out of the queue
+      //
+
+      long frames_per_reconstruction = 
+        std::max( 1L, frames_per_rotation_[set*slices_+slice]*rotations_per_reconstruction_ );
+      
+      if( image_headers_queue_[set*slices_+slice].message_count() != frames_per_reconstruction ){
+        m4->release();
+        GDEBUG("Unexpected size of image header queue: %d, %d\n", 
+                      image_headers_queue_[set*slices_+slice].message_count(), frames_per_reconstruction);
+        return GADGET_FAIL;
+      }
+
+      m4->getObjectPtr()->image_headers_ =
+        boost::shared_array<ISMRMRD::ImageHeader>( new ISMRMRD::ImageHeader[frames_per_reconstruction] );
+      
+      for( unsigned int i=0; i<frames_per_reconstruction; i++ ){	
+
+        ACE_Message_Block *mbq;
+
+        if( image_headers_queue_[set*slices_+slice].dequeue_head(mbq) < 0 ) {
+          m4->release();
+          GDEBUG("Image header dequeue failed\n");
+          return GADGET_FAIL;
+        }
+	
+        GadgetContainerMessage<ISMRMRD::ImageHeader> *m = AsContainerMessage<ISMRMRD::ImageHeader>(mbq);
+        m4->getObjectPtr()->image_headers_[i] = *m->getObjectPtr();
+
+        // In sliding window mode the header might need to go back at the end of the queue for reuse
+        // 
+	
+        if( i >= frames_per_reconstruction-sliding_window_rotations_*frames_per_rotation_[set*slices_+slice] ){
+          image_headers_queue_[set*slices_+slice].enqueue_tail(m);
+        }
+        else {
+          m->release();
+        }
+      }      
+      
+      // The Sense Job needs an image header as well. 
+      // Let us just copy the initial one...
+
+      GadgetContainerMessage<ISMRMRD::ImageHeader> *m3 = new GadgetContainerMessage<ISMRMRD::ImageHeader>;
+      *m3->getObjectPtr() = m4->getObjectPtr()->image_headers_[0];
+      m3->cont(m4);
+      
+      //GDEBUG("Putting job on queue\n");
+      
+      if (this->next()->putq(m3) < 0) {
+        GDEBUG("Failed to put job on queue.\n");
+        m3->release();
+        return GADGET_FAIL;
+      }
+    }
+    
+    if( is_last_profile_in_frame )
+      profiles_counter_frame_[set*slices_+slice] = 0;
+    else{
+      profiles_counter_frame_[set*slices_+slice]++;
+    }
+
+    if( new_frame_detected ){
+
+      // This is the first profile of the next frame, enqueue.
+      // We have encountered deadlocks when the same profile message is enqueued in two different queues; hence the copy.
+      
+      frame_profiles_queue_[set*slices_+slice].enqueue_tail(duplicate_profile(m2));
+      recon_profiles_queue_[set*slices_+slice].enqueue_tail(duplicate_profile(m2)); 
+
+      profiles_counter_frame_[set*slices_+slice]++;
+    }
+
+    profiles_counter_global_[set*slices_+slice]++;
+
+    if( output_timing_ )
+      process_timer.reset();
+    
+    m1->release(); // the internal queues hold copies
+    return GADGET_OK;
+  }
+  
+  int 
+  gpuRadialPrepGadget::calculate_trajectory_for_reconstruction(long profile_offset, unsigned int set, unsigned int slice)
+  {   
+    //GDEBUG("Calculating trajectory for reconstruction\n");
+
+    switch(mode_){
+      
+    case 0:
+    case 1:
+      {
+        if( rotations_per_reconstruction_ == 0 ){
+
+          long local_frame = (profile_offset/profiles_per_frame_[set*slices_+slice])%frames_per_rotation_[set*slices_+slice];
+          float angular_offset = M_PI/float(profiles_per_frame_[set*slices_+slice])*float(local_frame)/float(frames_per_rotation_[set*slices_+slice]);	  
+
+          host_traj_recon_[set*slices_+slice] = *compute_radial_trajectory_fixed_angle_2d<float>
+            ( samples_per_profile_, profiles_per_frame_[set*slices_+slice], 1, angular_offset )->to_host();	
+        }
+        else{
+          host_traj_recon_[set*slices_+slice] = *compute_radial_trajectory_fixed_angle_2d<float>
+            ( samples_per_profile_, profiles_per_frame_[set*slices_+slice], frames_per_rotation_[set*slices_+slice] )->to_host();
+        }
+      }
+      break;
+      
+    case 2:
+    case 3:
+      {
+        if( rotations_per_reconstruction_ == 0 ){	  
+          unsigned int first_profile_in_reconstruction = std::max(0L, profile_offset-profiles_per_frame_[set*slices_+slice]+1);
+          host_traj_recon_[set*slices_+slice] = *compute_radial_trajectory_golden_ratio_2d<float>
+            ( samples_per_profile_, profiles_per_frame_[set*slices_+slice], 1, first_profile_in_reconstruction,
+              (mode_==2) ? GR_ORIGINAL : GR_SMALLEST )->to_host();	
+        }
+        else{
+          unsigned int first_profile_in_reconstruction = 
+            std::max(0L, profile_offset-profiles_per_frame_[set*slices_+slice]*frames_per_rotation_[set*slices_+slice]*rotations_per_reconstruction_+1);
+          host_traj_recon_[set*slices_+slice] = *compute_radial_trajectory_golden_ratio_2d<float>
+            ( samples_per_profile_, profiles_per_frame_[set*slices_+slice], 
+              frames_per_rotation_[set*slices_+slice]*rotations_per_reconstruction_, first_profile_in_reconstruction,
+              (mode_==2) ? GR_ORIGINAL : GR_SMALLEST )->to_host();
+        }	  
+      }
+      break;
+	
+    default:
+      GDEBUG("Illegal trajectory mode\n");
+      return GADGET_FAIL;
+      break;
+    }
+    return GADGET_OK;
+  }  
+
+  int
+  gpuRadialPrepGadget::calculate_density_compensation_for_reconstruction( unsigned int set, unsigned int slice)
+  {
+    //GDEBUG("Calculating dcw for reconstruction\n");
+    
+    switch(mode_){
+      
+    case 0:
+    case 1:
+      host_weights_recon_[set*slices_+slice] = *compute_radial_dcw_fixed_angle_2d<float>
+        ( samples_per_profile_, profiles_per_frame_[set*slices_+slice], oversampling_factor_, 
+          1.0f/(float(samples_per_profile_)/float(image_dimensions_recon_[0])) )->to_host();
+      break;
+      
+    case 2:
+    case 3:
+      host_weights_recon_[set*slices_+slice] = *compute_radial_dcw_golden_ratio_2d<float>
+        ( samples_per_profile_, profiles_per_frame_[set*slices_+slice], oversampling_factor_, 
+          1.0f/(float(samples_per_profile_)/float(image_dimensions_recon_[0])),0,
+          (mode_==2) ? GR_ORIGINAL : GR_SMALLEST )->to_host();
+      break;
+      
+    default:
+      GDEBUG("Illegal dcw mode\n");
+      return GADGET_FAIL;
+      break;
+    }
+    return GADGET_OK;
+  }
+  
+  boost::shared_ptr< cuNDArray<floatd2> > 
+  gpuRadialPrepGadget::calculate_trajectory_for_frame(long profile_offset, unsigned int set, unsigned int slice)
+  {
+    //GDEBUG("Calculating trajectory for buffer frame\n");
+
+    boost::shared_ptr< cuNDArray<floatd2> > result;
+
+    switch(mode_){
+
+    case 0:
+    case 1:
+      {
+        long local_frame = (profile_offset/profiles_per_frame_[set*slices_+slice])%frames_per_rotation_[set*slices_+slice];
+        float angular_offset = M_PI/float(profiles_per_frame_[set*slices_+slice])*float(local_frame)/float(frames_per_rotation_[set*slices_+slice]);	  
+
+        result = compute_radial_trajectory_fixed_angle_2d<float>
+          ( samples_per_profile_, profiles_per_frame_[set*slices_+slice], 1, angular_offset );  
+      }
+      break;
+	
+    case 2:
+    case 3:
+      { 
+        unsigned int first_profile_in_buffer = std::max(0L, profile_offset-profiles_per_frame_[set*slices_+slice]+1);
+        result = compute_radial_trajectory_golden_ratio_2d<float>
+          ( samples_per_profile_, profiles_per_frame_[set*slices_+slice], 1, first_profile_in_buffer,
+            (mode_==2) ? GR_ORIGINAL : GR_SMALLEST );
+      }
+      break;	
+	
+    default:
+      GDEBUG("Illegal trajectory mode\n");
+      break;
+    }
+    
+    return result;
+  }
+
+  boost::shared_ptr< cuNDArray<float> >
+  gpuRadialPrepGadget::calculate_density_compensation_for_frame(unsigned int set, unsigned int slice)
+  {    
+    //GDEBUG("Calculating dcw for buffer frame\n");
+
+    switch(mode_){
+      
+    case 0:
+    case 1:
+      return compute_radial_dcw_fixed_angle_2d<float>
+        ( samples_per_profile_, profiles_per_frame_[set*slices_+slice], oversampling_factor_, 1.0f/(float(samples_per_profile_)/float(image_dimensions_recon_[0])) );
+      break;
+      
+    case 2:
+    case 3:
+      return compute_radial_dcw_golden_ratio_2d<float>
+        ( samples_per_profile_, profiles_per_frame_[set*slices_+slice], oversampling_factor_, 
+          1.0f/(float(samples_per_profile_)/float(image_dimensions_recon_[0])),0,
+          (mode_==2) ? GR_ORIGINAL : GR_SMALLEST );
+      break;
+      
+    default:
+      GDEBUG("Illegal dcw mode\n");
+      return boost::shared_ptr< cuNDArray<float> >();
+      break;
+    }   
+  }
+
+
+  boost::shared_ptr< cuNDArray<floatd2> > 
+  gpuRadialPrepGadget::calculate_trajectory_for_rhs(long profile_offset, unsigned int set, unsigned int slice)
+  {
+    //GDEBUG("Calculating trajectory for rhs\n");
+
+    switch(mode_){
+
+    case 0:
+    case 1:
+      return compute_radial_trajectory_fixed_angle_2d<float>
+        ( samples_per_profile_, profiles_per_frame_[set*slices_+slice]*buffer_frames_per_rotation_[set*slices_+slice], 1 );
+      break;
+	
+    case 2:
+    case 3:
+      { 
+        unsigned int first_profile = 
+          std::max(0L, profile_offset-profiles_per_frame_[set*slices_+slice]*
+                   buffer_frames_per_rotation_[set*slices_+slice]*
+                   buffer_length_in_rotations_+1);
+
+        return compute_radial_trajectory_golden_ratio_2d<float>
+          ( samples_per_profile_, 
+            profiles_per_frame_[set*slices_+slice]*
+            buffer_frames_per_rotation_[set*slices_+slice]*buffer_length_in_rotations_, 
+            1, first_profile,
+            (mode_==2) ? GR_ORIGINAL : GR_SMALLEST );
+      }
+      break;	
+	
+    default:
+      GDEBUG("Illegal trajectory mode\n");
+      return boost::shared_ptr< cuNDArray<floatd2> >();
+      break;
+    }
+  }
+  
+  boost::shared_ptr< cuNDArray<float> >
+  gpuRadialPrepGadget::calculate_density_compensation_for_rhs(unsigned int set, unsigned int slice)
+  {
+    //GDEBUG("Calculating dcw for rhs\n");
+    
+    switch(mode_){
+      
+    case 0:
+    case 1:
+      {
+        unsigned int num_profiles = 
+          profiles_per_frame_[set*slices_+slice]*buffer_frames_per_rotation_[set*slices_+slice];
+
+        return compute_radial_dcw_fixed_angle_2d<float>
+          ( samples_per_profile_, num_profiles, oversampling_factor_, 
+            1.0f/(float(samples_per_profile_)/float(image_dimensions_recon_[0])) );
+      }
+      break;
+      
+    case 2:
+    case 3:
+      {
+        unsigned int num_profiles = 
+          profiles_per_frame_[set*slices_+slice]*buffer_frames_per_rotation_[set*slices_+slice]*buffer_length_in_rotations_;
+
+        return compute_radial_dcw_golden_ratio_2d<float>
+          ( samples_per_profile_, num_profiles, oversampling_factor_, 
+            1.0f/(float(samples_per_profile_)/float(image_dimensions_recon_[0])),0,
+            (mode_==2) ? GR_ORIGINAL : GR_SMALLEST );
+      }
+      break;
+      
+    default:
+      GDEBUG("Illegal dcw mode\n");
+      return boost::shared_ptr< cuNDArray<float> >();
+      break;
+    }
+  }
+
+  boost::shared_ptr< hoNDArray<float_complext> > gpuRadialPrepGadget::
+  extract_samples_from_queue( ACE_Message_Queue<ACE_MT_SYNCH> *queue, bool sliding_window,
+                              unsigned int set, unsigned int slice )
+  {    
+    //GDEBUG("Emptying queue...\n");
+
+    unsigned int profiles_buffered = queue->message_count();
+    
+    std::vector<size_t> dims;
+    dims.push_back(samples_per_profile_*profiles_buffered);
+    dims.push_back(num_coils_[set*slices_+slice]);
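+    // Resulting layout: all buffered profiles are concatenated along the first
+    // dimension (sample index varies fastest, then profile index), with one column per coil.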
+    
+    boost::shared_ptr< hoNDArray<float_complext> > host_samples(new hoNDArray<float_complext>(&dims));
+    
+    for (unsigned int p=0; p<profiles_buffered; p++) {
+
+      ACE_Message_Block* mbq;
+      if (queue->dequeue_head(mbq) < 0) {
+        GDEBUG("Message dequeue failed\n");
+        return boost::shared_ptr< hoNDArray<float_complext> >();
+      }
+      
+      GadgetContainerMessage< hoNDArray< std::complex<float> > > *daq = AsContainerMessage<hoNDArray< std::complex<float> > >(mbq);
+	
+      if (!daq) {
+        GDEBUG("Unable to interpret data on message queue\n");
+        return boost::shared_ptr< hoNDArray<float_complext> >();
+      }
+	
+      for (unsigned int c = 0; c < num_coils_[set*slices_+slice]; c++) {
+	
+        float_complext *data_ptr = host_samples->get_data_ptr();
+        data_ptr += c*samples_per_profile_*profiles_buffered+p*samples_per_profile_;
+	    
+        std::complex<float> *r_ptr = daq->getObjectPtr()->get_data_ptr();
+        r_ptr += c*daq->getObjectPtr()->get_size(0);
+	  
+        memcpy(data_ptr,r_ptr,samples_per_profile_*sizeof(float_complext));
+      }
+
+      // In sliding window mode the profile might need to go back to the end of the queue
+      // 
+      
+      long profiles_in_sliding_window = sliding_window_profiles_ + 
+        profiles_per_frame_[set*slices_+slice]*frames_per_rotation_[set*slices_+slice]*sliding_window_rotations_;
+
+      if( sliding_window && p >= (profiles_buffered-profiles_in_sliding_window) )
+        queue->enqueue_tail(mbq);
+      else
+        mbq->release();
+    } 
+    
+    return host_samples;
+  }
+  
+  GadgetContainerMessage< hoNDArray< std::complex<float> > >*
+  gpuRadialPrepGadget::duplicate_profile( GadgetContainerMessage< hoNDArray< std::complex<float> > > *profile )
+  {
+    GadgetContainerMessage< hoNDArray< std::complex<float> > > *copy = 
+      new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+    
+    *copy->getObjectPtr() = *profile->getObjectPtr();
+    
+    return copy;
+  }
+
+  void gpuRadialPrepGadget::reconfigure(unsigned int set, unsigned int slice, bool use_dcw)
+  {    
+    GDEBUG("\nReconfiguring:\n#profiles/frame:%d\n#frames/rotation: %d\n#rotations/reconstruction:%d\n", 
+                  profiles_per_frame_[set*slices_+slice], frames_per_rotation_[set*slices_+slice], rotations_per_reconstruction_);
+
+    calculate_trajectory_for_reconstruction(0, set, slice);
+    calculate_density_compensation_for_reconstruction(set, slice);
+    
+    buffer_frames_per_rotation_[set*slices_+slice] = buffer_frames_per_rotation.value();
+
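+    // A value of zero means "derive automatically": for the golden ratio modes (2,3) a
+    // buffer rotation is sized to hold roughly one profile per column of the oversampled
+    // reconstruction matrix; otherwise the acquisition's frames per rotation is reused.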
+    if( buffer_frames_per_rotation_[set*slices_+slice] == 0 ){
+      if( mode_ == 2 || mode_ == 3 )
+        buffer_frames_per_rotation_[set*slices_+slice] = 
+          image_dimensions_recon_os_[0]/profiles_per_frame_[set*slices_+slice];
+      else
+        buffer_frames_per_rotation_[set*slices_+slice] = frames_per_rotation_[set*slices_+slice];
+    }
+    
+    cuBuffer<float,2> *acc_buffer = get_buffer_ptr(set*slices_+slice);
+
+    acc_buffer->setup( from_std_vector<size_t,2>(image_dimensions_recon_), image_dimensions_recon_os_, 
+                       kernel_width_, num_coils_[set*slices_+slice], 
+                       buffer_length_in_rotations_, buffer_frames_per_rotation_[set*slices_+slice] );
+    
+    if(use_dcw){
+      boost::shared_ptr< cuNDArray<float> > device_weights_frame = calculate_density_compensation_for_frame(set, slice);
+      acc_buffer->set_dcw(device_weights_frame);
+    }
+
+    reconfigure_[set*slices_+slice] = false;
+  }
+}
diff --git a/gadgets/radial/gpuRadialPrepGadget.h b/gadgets/radial/gpuRadialPrepGadget.h
new file mode 100644
index 0000000..7af355f
--- /dev/null
+++ b/gadgets/radial/gpuRadialPrepGadget.h
@@ -0,0 +1,222 @@
+#pragma once
+
+#include "gadgetron_radial_export.h"
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+#include "vector_td.h"
+#include "cuNFFT.h"
+#include "cuCgPreconditioner.h"
+#include "cuBuffer.h"
+#include "cuSenseBufferCg.h"
+#include "cuSpiritBuffer.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+#include <boost/shared_ptr.hpp>
+#include <boost/shared_array.hpp>
+
+/*
+  ------------------------------------------
+  Trajectory modes for radial reconstruction
+  ------------------------------------------
+  
+  Mode 0 and Mode 1 are variants of 'fixed' radial trajectories with interframe rotation.
+  Mode 2 and Mode 3 denote radial trajectories with golden ratio based angular profile spacings.
+  
+  Let 
+  'i' denote the number of profiles per (undersampled) frame
+  'j' denote the number of frames per trajectory rotation (to obtain a fully sampled acquisition)
+  'h' denote a variable of type ISMRMRD::AcquisitionHeader
+
+  It is possible to explicitly set 'i' and 'j' in the Gadgetron configuration file.
+  For some modes this is (partly) required, 
+  for others they will be automatically determined from the incoming profile headers.
+  
+  Mode 0:
+  -------
+  For each rotation cycle profiles are numbered using the scheme
+
+    0+0*j,0+1*j,0+2*j,...,0+(i-1)*j, (1st frame)
+    1+0*j,1+1*j,1+2*j,...,1+(i-1)*j, (2nd frame)
+    2+0*j,2+1*j,2+2*j,...,2+(i-1)*j, (3rd frame)
+    ...,
+    (j-1)+0*j,(j-1)+1*j,(j-1)+2*j,...,(j-1)+(i-1)*j
+
+  as given in h.idx.kspace_encode_step_1.
+  Both 'i' and 'j' are automatically derived and thus need not be explicitly specified in a configuration file.
+  For mode 0, both 'i' and 'j' can be changed dynamically as desired, e.g. for real-time imaging.
+
+  Mode 1:
+  -------
+  Profiles are numbered 0,1,2,...,i-1, 0,1,2,...,i-1, ... as given in h.idx.kspace_encode_step_1.
+  'j' is estimated as 'matrix_size'/'i' and should be explicitly set in the configuration file if this is not the case, e.g.:
+  <property><name>frames_per_rotation</name><value>8</value></property>
+      
+
+  Mode 2 and Mode 3:
+  ------------------
+  Profiles are numbered 
+  0,1,2,...,i-1, 0,1,2,...,i-1, 0,1,2,...,i-1, ...
+  or
+  0,1,2,...,i-1, i,i+1,i+2,...,2*i-1, 2*i,2*i+1,2*i+2,...,3*i-1, ...
+  as given in h.idx.kspace_encode_step_1.
+  'i' should be explicitly specified in the Gadgetron configuration file, e.g.:
+  <property><name>profiles_per_frame</name><value>32</value></property>
+  If not, it defaults to i=32.
+  'j' is explicitly set to '1' even if specified in the configuration file.
+*/
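+
+/*
+  A minimal illustration of the mode 0 numbering above (a sketch only; the helper
+  below is hypothetical and not part of this gadget). With 'i' profiles per frame
+  and 'j' frames per rotation, the k-th profile (k = 0..i-1) of frame 'f'
+  (f = 0..j-1) within a rotation carries:
+
+    inline unsigned int mode0_encode_step(unsigned int f, unsigned int k, unsigned int j)
+    {
+      return f + k*j; // e.g. i=3, j=4: frame 0 -> 0,4,8; frame 1 -> 1,5,9
+    }
+
+  Mode 1, in contrast, simply numbers the profiles of every frame 0,1,...,i-1.
+*/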
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_RADIAL gpuRadialPrepGadget :
+    public Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+  {
+
+  public:
+
+    gpuRadialPrepGadget();
+    virtual ~gpuRadialPrepGadget();
+
+  protected:
+    GADGET_PROPERTY_LIMITS(mode,int,"Radial mode", 0, GadgetPropertyLimitsEnumeration, 0,1,2,3);
+    GADGET_PROPERTY(deviceno,int,"GPU Device Number", 0);
+    GADGET_PROPERTY(buffer_length_in_rotations, int, "Number of rotations in a buffer", 1);
+    GADGET_PROPERTY(buffer_using_solver, bool, "Use solver for buffer", false);
+    GADGET_PROPERTY(buffer_convolution_kernel_width, float, "Convolution kernel width for buffer", 5.5);
+    GADGET_PROPERTY(buffer_convolution_oversampling_factor, float, "Oversampling used in buffer convolution", 1.25);
+    GADGET_PROPERTY(reconstruction_os_factor_x, float, "Oversampling for reconstruction in x-direction", 1.0);
+    GADGET_PROPERTY(reconstruction_os_factor_y, float, "Oversampling for reconstruction in y-direction", 1.0);
+    GADGET_PROPERTY(rotations_per_reconstruction, int, "Number of rotations per reconstruction", 0);
+    GADGET_PROPERTY(output_timing, bool, "Output timing information", false);
+    GADGET_PROPERTY(sliding_window_profiles, int, "Number of profiles in sliding window", 0);
+    GADGET_PROPERTY(sliding_window_rotations, int, "Number of rotations in sliding window", 0);
+    GADGET_PROPERTY(profiles_per_frame, int, "Profiles per frame", 0);
+    GADGET_PROPERTY(frames_per_rotation, int, "Frames per rotation", 0);
+    GADGET_PROPERTY(buffer_frames_per_rotation, int, "Frames per rotation in buffer", 0);
+
+    virtual int process_config(ACE_Message_Block *mb);
+
+    virtual int process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader > *m1,
+			GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2);
+
+    virtual void reconfigure(unsigned int set, unsigned int slice, bool use_dcw = true);
+
+    virtual boost::shared_ptr< hoNDArray<float_complext> > compute_csm( unsigned int buffer_idx ) = 0;
+
+    virtual boost::shared_ptr< hoNDArray<float_complext> > compute_reg
+      ( unsigned int set, unsigned int slice, bool new_frame ) = 0;
+    
+    virtual void allocate_accumulation_buffer( unsigned int num_buffers ) = 0;
+    
+    boost::shared_array<bool> reconfigure_;
+
+    GadgetContainerMessage< hoNDArray< std::complex<float> > >*
+      duplicate_profile( GadgetContainerMessage< hoNDArray< std::complex<float> > > *profile );
+
+    boost::shared_ptr< hoNDArray<float_complext> > 
+      extract_samples_from_queue( ACE_Message_Queue<ACE_MT_SYNCH> *queue,
+				  bool acknowledge_sliding_window,
+				  unsigned int set, unsigned int slice );
+
+    // Compute trajectory/dcw for a reconstruction (to store internally)
+    //
+
+    int calculate_trajectory_for_reconstruction(long profile_offset, unsigned int set, unsigned int slice);
+    int calculate_density_compensation_for_reconstruction(unsigned int set, unsigned int slice);
+
+    // Compute trajectory/dcw for adding (usually undersampled) frames to the accumulation buffer
+    //
+
+    boost::shared_ptr< cuNDArray<floatd2> > 
+      calculate_trajectory_for_frame(long profile_offset, unsigned int set, unsigned int slice);
+
+    boost::shared_ptr< cuNDArray<float> >
+      calculate_density_compensation_for_frame(unsigned int set, unsigned int slice);
+
+    // Compute trajectory/dcw for the fully sampled accumulation buffer (iterative buffer mode only)
+    //
+
+    boost::shared_ptr< cuNDArray<floatd2> > 
+      calculate_trajectory_for_rhs(long profile_offset, unsigned int set, unsigned int slice);
+
+    boost::shared_ptr< cuNDArray<float> > 
+      calculate_density_compensation_for_rhs(unsigned int set, unsigned int slice);
+
+    int slices_;
+    int sets_;
+    int device_number_;
+    int mode_; // See note above
+    long samples_per_profile_;
+
+    boost::shared_array<long> image_counter_;
+    boost::shared_array<long> profiles_per_frame_;  // for an undersampled frame
+    boost::shared_array<long> frames_per_rotation_; // representing a fully sampled frame
+
+    // The number of rotations to batch per reconstruction. 
+    // Set to '0' to reconstruct frames individually.
+    long rotations_per_reconstruction_; 
+
+    // The number of buffer cycles
+    long buffer_length_in_rotations_; 
+
+    boost::shared_array<long> buffer_frames_per_rotation_; // the number of buffer subcycles
+
+    // Internal book-keeping
+    boost::shared_array<long> previous_profile_;
+    boost::shared_array<long> profiles_counter_frame_;
+    boost::shared_array<long> profiles_counter_global_;
+
+    long sliding_window_profiles_;
+    long sliding_window_rotations_;
+
+    float kernel_width_;
+    float oversampling_factor_;
+
+    boost::shared_array<unsigned int> num_coils_;
+
+    boost::shared_array<float[3]> position_;
+    boost::shared_array<float[3]> read_dir_;
+    boost::shared_array<float[3]> phase_dir_;
+    boost::shared_array<float[3]> slice_dir_;
+
+    bool output_timing_;
+    bool buffer_using_solver_;
+
+    boost::shared_array<bool> buffer_update_needed_;
+
+    boost::shared_array< hoNDArray<floatd2> > host_traj_recon_;
+    boost::shared_array< hoNDArray<float> > host_weights_recon_;
+    
+    boost::shared_array< hoNDArray<float_complext> > csm_host_;
+    boost::shared_array< hoNDArray<float_complext> > reg_host_;
+    
+    // We would like to make a single array of the buffer base class,
+    // but we encounter as yet unexplained heap corruptions if we do.
+    // Hence this workaround:
+    //boost::shared_array< cuBuffer<float,2> > acc_buffer_;
+    boost::shared_array< cuSenseBuffer<float,2> > acc_buffer_sense_;
+    boost::shared_array< cuSenseBufferCg<float,2> > acc_buffer_sense_cg_;
+    boost::shared_array< cuSpiritBuffer<float,2> > acc_buffer_spirit_;
+    virtual cuBuffer<float,2>* get_buffer_ptr(int idx) = 0;
+    // <-- end of workaround
+
+    std::vector<size_t> fov_;
+    std::vector<size_t> image_dimensions_;
+    std::vector<size_t> image_dimensions_recon_;
+    uint64d2 image_dimensions_recon_os_;
+
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > frame_profiles_queue_;
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > recon_profiles_queue_;
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > image_headers_queue_;
+
+  private:
+
+    inline bool vec_equal(float *in1, float *in2) {
+      for (unsigned int i = 0; i < 3; i++) {
+        if (in1[i] != in2[i]) return false;
+      }
+      return true;
+    }   
+  };
+}
diff --git a/gadgets/radial/gpuRadialSensePrepGadget.cpp b/gadgets/radial/gpuRadialSensePrepGadget.cpp
new file mode 100644
index 0000000..3452fe5
--- /dev/null
+++ b/gadgets/radial/gpuRadialSensePrepGadget.cpp
@@ -0,0 +1,89 @@
+#include "gpuRadialSensePrepGadget.h"
+#include "b1_map.h"
+#include "cuSenseBufferCg.h"
+
+namespace Gadgetron{
+
+  boost::shared_ptr< hoNDArray<float_complext> > 
+  gpuRadialSensePrepGadget::compute_csm( unsigned int idx )
+  {    
+    // Estimate and update csm related data structures
+    //
+  
+    cuSenseBuffer<float,2> *acc_buffer = 
+      (this->buffer_using_solver_) ? &this->acc_buffer_sense_cg_[idx] : &this->acc_buffer_sense_[idx];
+  
+    boost::shared_ptr< cuNDArray<float_complext> > csm_data = 
+      acc_buffer->get_accumulated_coil_images();
+    
+    if( !csm_data.get() ){
+      GDEBUG("Error during accumulation buffer computation\n");
+      return boost::shared_ptr< hoNDArray<float_complext> >();
+    }
+    
+    boost::shared_ptr< cuNDArray<float_complext> > csm = 
+      estimate_b1_map<float,2>( csm_data.get() );
+  
+    if( !csm.get() ){
+      GDEBUG("Error during coil estimation\n");
+      return boost::shared_ptr< hoNDArray<float_complext> >();
+    }            
+    
+    acc_buffer->set_csm(csm);
+    return csm->to_host(); 
+  }
+  
+  boost::shared_ptr< hoNDArray<float_complext> > 
+  gpuRadialSensePrepGadget::compute_reg( unsigned int set, unsigned int slice, bool new_frame )
+  {    
+    // Estimate and update regularization image related data structures
+    //
+    
+    cuSenseBuffer<float,2> *acc_buffer = (this->buffer_using_solver_) ? 
+      &this->acc_buffer_sense_cg_[set*this->slices_+slice] : &this->acc_buffer_sense_[set*this->slices_+slice];
+
+    if( buffer_using_solver_ && ( mode_ == 2 || mode_ == 3 ) ){
+      static_cast<cuSenseBufferCg<float,2>*>( acc_buffer )->preprocess
+        ( calculate_trajectory_for_rhs( this->profiles_counter_global_[set*this->slices_+slice] - ((new_frame) ? 1 : 0), set, slice).get());
+    }
+    
+    boost::shared_ptr< cuNDArray<float_complext> > reg_image = 
+      acc_buffer->get_combined_coil_image();
+    
+    if( !reg_image.get() ){
+      GDEBUG("Error computing regularization image\n");
+      return boost::shared_ptr< hoNDArray<float_complext> >();
+    }            
+    
+    return reg_image->to_host();
+  }
+
+  void 
+  gpuRadialSensePrepGadget::allocate_accumulation_buffer( unsigned int size )
+  {    
+    // Allocate accumulation buffer
+    //
+  
+    if( this->buffer_using_solver_ ){
+      this->acc_buffer_sense_cg_ = boost::shared_array< cuSenseBufferCg<float,2> >(new cuSenseBufferCg<float,2>[size]);
+    }
+    else{
+      this->acc_buffer_sense_ = boost::shared_array< cuSenseBuffer<float,2> >(new cuSenseBuffer<float,2>[size]);
+    }
+  }
+
+  void gpuRadialSensePrepGadget::reconfigure(unsigned int set, unsigned int slice, bool use_dcw)
+  {    
+    gpuRadialPrepGadget::reconfigure(set, slice, use_dcw);
+    
+    if( buffer_using_solver_ ){
+
+      if(use_dcw) 
+        this->acc_buffer_sense_cg_[set*this->slices_+slice].set_dcw_for_rhs(calculate_density_compensation_for_rhs(set, slice));
+
+      this->acc_buffer_sense_cg_[set*this->slices_+slice].preprocess(calculate_trajectory_for_rhs(0, set, slice).get());
+    }    
+  }
+
+  GADGET_FACTORY_DECLARE(gpuRadialSensePrepGadget)
+}
diff --git a/gadgets/radial/gpuRadialSensePrepGadget.h b/gadgets/radial/gpuRadialSensePrepGadget.h
new file mode 100644
index 0000000..9ca0519
--- /dev/null
+++ b/gadgets/radial/gpuRadialSensePrepGadget.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include "gpuRadialPrepGadget.h"
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_RADIAL gpuRadialSensePrepGadget : public gpuRadialPrepGadget
+  {
+    
+  public:
+    GADGET_DECLARE(gpuRadialSensePrepGadget);
+    gpuRadialSensePrepGadget() : gpuRadialPrepGadget() {}
+    virtual ~gpuRadialSensePrepGadget() {}
+    
+  protected:
+    
+    virtual void reconfigure(unsigned int set, unsigned int slice, bool use_dcw = true);
+
+    virtual boost::shared_ptr< hoNDArray<float_complext> > compute_csm( unsigned int buffer_idx );
+
+    virtual boost::shared_ptr< hoNDArray<float_complext> > compute_reg( unsigned int set, 
+                                                                        unsigned int slice, 
+                                                                        bool new_frame );
+    
+    virtual void allocate_accumulation_buffer( unsigned int num_buffers );
+    
+    virtual cuBuffer<float,2>* get_buffer_ptr(int idx){
+      return (this->buffer_using_solver_) ? &this->acc_buffer_sense_cg_[idx] : &this->acc_buffer_sense_[idx];
+    }
+  };
+}
diff --git a/gadgets/radial/gpuRadialSpiritPrepGadget.cpp b/gadgets/radial/gpuRadialSpiritPrepGadget.cpp
new file mode 100644
index 0000000..9f6a43d
--- /dev/null
+++ b/gadgets/radial/gpuRadialSpiritPrepGadget.cpp
@@ -0,0 +1,98 @@
+#include "gpuRadialSpiritPrepGadget.h"
+#include "spirit_calibration.h"
+#include "cuSpiritBuffer.h"
+#include "cuNDFFT.h"
+#include "cuSpiritOperator.h"
+#include "hoNDArray_fileio.h"
+
+namespace Gadgetron{
+
+  gpuRadialSpiritPrepGadget::gpuRadialSpiritPrepGadget() : gpuRadialPrepGadget() {}
+
+  int 
+  gpuRadialSpiritPrepGadget::process_config(ACE_Message_Block* mb)
+  {
+    return gpuRadialPrepGadget::process_config(mb);
+  }
+  
+  boost::shared_ptr< hoNDArray<float_complext> > 
+  gpuRadialSpiritPrepGadget::compute_csm( unsigned int idx )
+  {    
+    // Estimate and update csm related data structures
+    //
+  
+    cuSpiritBuffer<float,2> *acc_buffer = &this->acc_buffer_spirit_[idx];
+  
+    boost::shared_ptr< cuNDArray<float_complext> > csm_data = 
+      acc_buffer->get_accumulated_coil_images();
+
+    std::vector<size_t> dims_to_xform;
+    dims_to_xform.push_back(0); dims_to_xform.push_back(1);    
+    cuNDFFT<float>::instance()->fft( csm_data.get(), &dims_to_xform );
+    
+    boost::shared_ptr< cuNDArray<float_complext> > csm =       
+      estimate_spirit_kernels( csm_data.get(), 7 ); // TODO: let the kernel size be user defined
+
+
+
+/*
+    // --> START debug output
+    boost::shared_ptr< cuSpirit2DOperator<float> > C( new cuSpirit2DOperator<float>() );
+		C->set_calibration_kernels(csm);
+    static int counter = 0;
+    char filename[256];
+    cuNDFFT<float>::instance()->ifft( csm_data.get(), &dims_to_xform );
+    //boost::shared_ptr< cuSpirit2DOperator<float> > C( new cuSpirit2DOperator<float>() );
+    //C->set_calibration_kernels(csm);
+    sprintf((char*)filename, "_before_%d.real", counter);
+    write_nd_array<float>( abs(csm_data.get())->to_host().get(), filename );
+    cuNDArray<float_complext> after(csm_data->get_dimensions()); C->mult_M(csm_data.get(),&after);
+    sprintf((char*)filename, "_after_%d.real", counter);
+    write_nd_array<float>( abs(&after)->to_host().get(), filename );
+    sprintf((char*)filename, "_spirit_calibration_%d.real", counter);
+    write_nd_array<float>( abs(csm.get())->to_host().get(), filename );    
+    counter++;
+    // <-- END debug output
+*/
+
+    return csm->to_host(); 
+  }
+  
+  boost::shared_ptr< hoNDArray<float_complext> > 
+  gpuRadialSpiritPrepGadget::compute_reg( unsigned int set, unsigned int slice, bool new_frame )
+  {    
+    // Estimate and update regularization image related data structures
+    //
+    
+    cuSpiritBuffer<float,2> *acc_buffer = &this->acc_buffer_spirit_[set*this->slices_+slice];
+    boost::shared_ptr< cuNDArray<float_complext> > reg_image = acc_buffer->get_combined_coil_image();
+    
+    if( !reg_image.get() ){
+      GDEBUG("Error computing regularization image\n");
+      return boost::shared_ptr< hoNDArray<float_complext> >();
+    }            
+    
+    return reg_image->to_host();
+  }
+
+  void 
+  gpuRadialSpiritPrepGadget::allocate_accumulation_buffer( unsigned int size )
+  {    
+    this->acc_buffer_spirit_ = boost::shared_array< cuSpiritBuffer<float,2> >(new cuSpiritBuffer<float,2>[size]);
+  }
+
+  void gpuRadialSpiritPrepGadget::reconfigure(unsigned int set, unsigned int slice, bool use_dcw)
+  {    
+    gpuRadialPrepGadget::reconfigure(set, slice, use_dcw);
+    //gpuRadialPrepGadget::reconfigure(set, slice, false);
+    
+    cuSpiritBuffer<float,2> *acc_buffer = &this->acc_buffer_spirit_[set*this->slices_+slice];
+
+    if( use_dcw ) 
+      acc_buffer->set_dcw_for_rhs(calculate_density_compensation_for_rhs(set, slice));
+
+    acc_buffer->preprocess(calculate_trajectory_for_rhs(0, set, slice).get());
+  }
+
+  GADGET_FACTORY_DECLARE(gpuRadialSpiritPrepGadget)
+}
diff --git a/gadgets/radial/gpuRadialSpiritPrepGadget.h b/gadgets/radial/gpuRadialSpiritPrepGadget.h
new file mode 100644
index 0000000..8d9824a
--- /dev/null
+++ b/gadgets/radial/gpuRadialSpiritPrepGadget.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include "gpuRadialPrepGadget.h"
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_RADIAL gpuRadialSpiritPrepGadget : public gpuRadialPrepGadget
+  {
+    
+  public:
+    GADGET_DECLARE(gpuRadialSpiritPrepGadget);
+    gpuRadialSpiritPrepGadget();
+    virtual ~gpuRadialSpiritPrepGadget() {}
+    
+  protected:
+    
+    virtual int process_config(ACE_Message_Block *mb);
+
+    virtual void reconfigure(unsigned int set, unsigned int slice, bool use_dcw = true );
+
+    virtual boost::shared_ptr< hoNDArray<float_complext> > compute_csm( unsigned int buffer_idx );
+
+    virtual boost::shared_ptr< hoNDArray<float_complext> > compute_reg( unsigned int set, 
+                                                                        unsigned int slice, 
+                                                                        bool new_frame );
+    
+    virtual void allocate_accumulation_buffer( unsigned int num_buffers );
+
+    virtual cuBuffer<float,2>* get_buffer_ptr(int idx){
+      return &this->acc_buffer_spirit_[idx];
+    }
+  };
+}
diff --git a/gadgets/radial/gpuRetroGatedSensePrepGadget.cpp b/gadgets/radial/gpuRetroGatedSensePrepGadget.cpp
new file mode 100644
index 0000000..af7d8d0
--- /dev/null
+++ b/gadgets/radial/gpuRetroGatedSensePrepGadget.cpp
@@ -0,0 +1,870 @@
+#include "gpuRetroGatedSensePrepGadget.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "GenericReconJob.h"
+#include "cuNDArray_elemwise.h"
+#include "hoNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+#include "vector_td_operators.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+#include "check_CUDA.h"
+#include "radial_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "ismrmrd/xml.h"
+
+#include <algorithm>
+#include <vector>
+#include <cmath>
+
+namespace Gadgetron{
+
+  gpuRetroGatedSensePrepGadget::gpuRetroGatedSensePrepGadget()
+    : slices_(-1)
+    , sets_(-1)
+    , samples_per_profile_(-1)
+    , phys_time_index_(0)
+  {
+
+  }
+  
+  gpuRetroGatedSensePrepGadget::~gpuRetroGatedSensePrepGadget() {}
+  
+  int gpuRetroGatedSensePrepGadget::process_config(ACE_Message_Block* mb)
+  {
+    // Get configuration values from config file
+    //
+
+    mode_ = mode.value();
+    device_number_ = deviceno.value();
+    profiles_per_frame_ = profiles_per_frame.value();
+    frames_per_cardiac_cycle_ = frames_per_cardiac_cycle.value();
+    profiles_per_buffer_frame_ = profiles_per_buffer_frame.value();
+    num_buffer_frames_inner_ = number_of_buffer_frames_inner.value();
+    num_buffer_frames_outer_ = number_of_buffer_frames_outer.value();
+    buffer_using_solver_ = buffer_using_solver.value();
+    output_timing_ = output_timing.value();
+    phys_time_index_ = physiology_time_index.value();
+
+    // Check that a golden ratio based reconstruction mode was specified
+    //
+
+    if( !(mode_ == 2 || mode_ == 3) ){
+      GDEBUG( "Only radial reconstruction modes {2,3} (golden ratio based) are supported.\n" );
+      return GADGET_FAIL;
+    }
+    
+    // Setup and validate device configuration
+    //
+
+    int number_of_devices;
+    if (cudaGetDeviceCount(&number_of_devices)!= cudaSuccess) {
+      GDEBUG( "Error: unable to query number of CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    if (number_of_devices == 0) {
+      GDEBUG( "Error: No available CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    if (device_number_ >= number_of_devices) {
+      GDEBUG("Adjusting device number from %d to %d\n", device_number_,  (device_number_%number_of_devices));
+      device_number_ = (device_number_%number_of_devices);
+    }
+
+    if (cudaSetDevice(device_number_)!= cudaSuccess) {
+      GDEBUG( "Error: unable to set CUDA device.\n" );
+      return GADGET_FAIL;
+    }
+
+    cudaDeviceProp deviceProp;
+    if( cudaGetDeviceProperties( &deviceProp, device_number_ ) != cudaSuccess) {
+      GDEBUG( "Error: unable to query device properties.\n" );
+      return GADGET_FAIL;
+    }
+    
+    // Convolution kernel width and oversampling ratio (for the buffer)
+    //
+
+    kernel_width_ = buffer_convolution_kernel_width.value();
+    oversampling_factor_ = buffer_convolution_oversampling_factor.value();
+
+    // Get the Ismrmrd header
+    //
+
+    ISMRMRD::IsmrmrdHeader h;
+    ISMRMRD::deserialize(mb->rd_ptr(),h);
+    
+    
+    if (h.encoding.size() != 1) {
+      GDEBUG("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+    
+    // Get the encoding space and trajectory description
+    ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+    ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+    ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+
+
+    // Matrix sizes (as a multiple of the GPU's warp size)
+    //
+    
+    unsigned int warp_size = deviceProp.warpSize;
+
+    image_dimensions_.push_back(((e_space.matrixSize.x+warp_size-1)/warp_size)*warp_size);
+    image_dimensions_.push_back(((e_space.matrixSize.y+warp_size-1)/warp_size)*warp_size);
+
+    image_dimensions_recon_.push_back(((static_cast<unsigned int>(std::ceil(e_space.matrixSize.x*reconstruction_os_factor_x.value()))+warp_size-1)/warp_size)*warp_size);  
+    image_dimensions_recon_.push_back(((static_cast<unsigned int>(std::ceil(e_space.matrixSize.y*reconstruction_os_factor_y.value()))+warp_size-1)/warp_size)*warp_size);
+    
+    image_dimensions_recon_os_ = uint64d2
+      (((static_cast<unsigned int>(std::ceil(image_dimensions_recon_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size,
+       ((static_cast<unsigned int>(std::ceil(image_dimensions_recon_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size);
+    
+    // In case the warp_size constraint kicked in
+    oversampling_factor_ = float(image_dimensions_recon_os_[0])/float(image_dimensions_recon_[0]); 
+    
+    GDEBUG("matrix_size_x : %d, recon: %d, recon_os: %d\n", 
+                  image_dimensions_[0], image_dimensions_recon_[0], image_dimensions_recon_os_[0]);
+
+    GDEBUG("matrix_size_y : %d, recon: %d, recon_os: %d\n", 
+                  image_dimensions_[1], image_dimensions_recon_[1], image_dimensions_recon_os_[1]);
+    
+    fov_.push_back(r_space.fieldOfView_mm.x);
+    fov_.push_back(r_space.fieldOfView_mm.y);
+    fov_.push_back(r_space.fieldOfView_mm.z);
+
+    slices_ = e_limits.slice ? e_limits.slice->maximum + 1 : 1;
+    sets_ = e_limits.set ? e_limits.set->maximum + 1 : 1;
+    
+    // Allocate profile queues
+    // - one queue for the currently incoming frame (for the accumulation buffer)
+    // - one queue for the next reconstruction
+    
+    buffer_profiles_queue_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+    recon_profiles_queue_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+
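+    // Size the queue water marks to hold roughly ten buffer frames' worth of container messages.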
+    size_t bsize = sizeof(GadgetContainerMessage< hoNDArray< std::complex<float> > >)*profiles_per_buffer_frame_*10;
+
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+      buffer_profiles_queue_[i].high_water_mark(bsize);
+      buffer_profiles_queue_[i].low_water_mark(bsize);
+    }
+
+    bsize = sizeof(GadgetContainerMessage< hoNDArray< std::complex<float> > >)*profiles_per_frame_*frames_per_cardiac_cycle_*10;
+    
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+      recon_profiles_queue_[i].high_water_mark(bsize);
+      recon_profiles_queue_[i].low_water_mark(bsize);
+    }
+    
+    // Define some profile counters for book-keeping
+    //
+
+    image_counter_ = boost::shared_array<long>(new long[slices_*sets_]);
+    num_coils_ = boost::shared_array<unsigned int>(new unsigned int[slices_*sets_]);
+    first_profile_acq_time_ = boost::shared_array<unsigned int>(new unsigned int[slices_*sets_]);
+    first_profile_phys_time_ = boost::shared_array<unsigned int>(new unsigned int[slices_*sets_]);
+    previous_timestamp_ = boost::shared_array<unsigned int>(new unsigned int[slices_*sets_]);
+    profiles_counter_global_ = boost::shared_array<long>(new long[slices_*sets_]);
+    Rw_reached_ = boost::shared_array<bool>(new bool[slices_*sets_]);
+    Rw_offset_ = boost::shared_array<unsigned int>(new unsigned int[slices_*sets_]);
+    buffer_update_needed_ = boost::shared_array<bool>(new bool[slices_*sets_]);
+    reconfigure_ = boost::shared_array<bool>(new bool[slices_*sets_]);
+    
+    if( !image_counter_.get() || 
+        !num_coils_.get() || 
+        !first_profile_acq_time_.get() ||
+        !first_profile_phys_time_.get() ||
+        !previous_timestamp_.get() ||
+        !profiles_counter_global_.get() ||
+        !Rw_reached_.get() ||
+        !Rw_offset_.get() ||
+        !buffer_update_needed_.get() ||
+        !reconfigure_ ){
+      GDEBUG("Failed to allocate host memory (1)\n");
+      return GADGET_FAIL;
+    }
+
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+      image_counter_[i] = 0;
+      num_coils_[i] = 0;
+      previous_timestamp_[i] = 0;
+      profiles_counter_global_[i] = 0;
+      Rw_reached_[i] = false;
+      Rw_offset_[i] = 0;
+      buffer_update_needed_[i] = true;
+      reconfigure_[i] = true;
+    }
+        
+    position_ = boost::shared_array<float[3]>(new float[slices_*sets_][3]);
+    read_dir_ = boost::shared_array<float[3]>(new float[slices_*sets_][3]);
+    phase_dir_ = boost::shared_array<float[3]>(new float[slices_*sets_][3]);
+    slice_dir_ = boost::shared_array<float[3]>(new float[slices_*sets_][3]);
+
+    if( !position_.get() || !read_dir_.get() || !phase_dir_.get() || !slice_dir_.get() ){
+      GDEBUG("Failed to allocate host memory (2)\n");
+      return GADGET_FAIL;
+    }
+
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+      (position_[i])[0] = (position_[i])[1] = (position_[i])[2] = 0.0f;
+      (read_dir_[i])[0] = (read_dir_[i])[1] = (read_dir_[i])[2] = 0.0f;
+      (phase_dir_[i])[0] = (phase_dir_[i])[1] = (phase_dir_[i])[2] = 0.0f;
+      (slice_dir_[i])[0] = (slice_dir_[i])[1] = (slice_dir_[i])[2] = 0.0f;
+    }
+
+    // Allocate accumulation buffer
+    //
+
+    if( buffer_using_solver_ )
+      acc_buffer_cg_ = boost::shared_array< cuSenseBufferCg<float,2> >(new cuSenseBufferCg<float,2>[slices_*sets_]);
+    else
+      acc_buffer_ = boost::shared_array< cuSenseBuffer<float,2> >(new cuSenseBuffer<float,2>[slices_*sets_]);
+    
+    // Allocate remaining shared_arrays
+    //
+    
+    csm_host_ = boost::shared_array< hoNDArray<float_complext> >(new hoNDArray<float_complext>[slices_*sets_]);
+    reg_host_ = boost::shared_array< hoNDArray<float_complext> >(new hoNDArray<float_complext>[slices_*sets_]);
+
+    host_weights_recon_ = boost::shared_array< hoNDArray<float> >(new hoNDArray<float>[slices_*sets_]);
+
+    if( !csm_host_.get() || !reg_host_.get() || !host_weights_recon_ ){
+      GDEBUG("Failed to allocate host memory (3)\n");
+      return GADGET_FAIL;
+    }
+
+    return GADGET_OK;
+  }
+
+  int gpuRetroGatedSensePrepGadget::
+  process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *m1,
+          GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2)
+  {
+    // Noise should have been consumed by the noise adjust (if in the gadget chain)
+    //
+    
+    bool is_noise = m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_IS_NOISE_MEASUREMENT);
+    if (is_noise) { 
+      m1->release();
+      return GADGET_OK;
+    }
+
+    unsigned int slice = m1->getObjectPtr()->idx.slice;
+    unsigned int set = m1->getObjectPtr()->idx.set;
+
+    unsigned int profile = m1->getObjectPtr()->idx.kspace_encode_step_1;
+
+    unsigned int current_timestamp = m1->getObjectPtr()->physiology_time_stamp[phys_time_index_];
+    unsigned int previous_timestamp = previous_timestamp_[set*slices_+slice];
+    
+    bool new_cardiac_cycle_detected = (current_timestamp < previous_timestamp);
+
+    previous_timestamp_[set*slices_+slice] = current_timestamp;
+
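+    // Profiles arriving before the first detected R-wave are discarded; they are counted
+    // in Rw_offset_ so that the global profile counter (and thus the golden ratio
+    // trajectory angles) stays consistent with the acquisition once gating starts.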
+    if( !Rw_reached_[set*slices_+slice] && !new_cardiac_cycle_detected ){ 
+      Rw_offset_[set*slices_+slice]++;
+      m1->release();
+      return GADGET_OK;
+    }
+
+    if( !Rw_reached_[set*slices_+slice] && new_cardiac_cycle_detected ){ 
+      Rw_reached_[set*slices_+slice] = true;
+      profiles_counter_global_[set*slices_+slice] = Rw_offset_[set*slices_+slice];
+      new_cardiac_cycle_detected = false;
+    }
+
+    boost::shared_ptr<GPUTimer> process_timer;
+    if( output_timing_ )
+      process_timer = boost::shared_ptr<GPUTimer>( new GPUTimer("gpuRetroGatedSensePrepGadget::process()") );
+
+    // Get a pointer to the accumulation buffer. 
+    //
+
+    cuSenseBuffer<float,2> *acc_buffer = (buffer_using_solver_) ? &acc_buffer_cg_[set*slices_+slice] : &acc_buffer_[set*slices_+slice];
+
+    // Has the imaging plane changed?
+    //
+
+    if( !vec_equal(position_[set*slices_+slice], m1->getObjectPtr()->position) ||
+        !vec_equal(read_dir_[set*slices_+slice], m1->getObjectPtr()->read_dir) || 
+        !vec_equal(phase_dir_[set*slices_+slice], m1->getObjectPtr()->phase_dir) ||
+        !vec_equal(slice_dir_[set*slices_+slice], m1->getObjectPtr()->slice_dir) ){
+      
+      // Yes indeed, clear the accumulation buffer
+      acc_buffer->clear();
+      buffer_update_needed_[set*slices_+slice] = true;
+      
+      memcpy(position_[set*slices_+slice],m1->getObjectPtr()->position,3*sizeof(float));
+      memcpy(read_dir_[set*slices_+slice],m1->getObjectPtr()->read_dir,3*sizeof(float));
+      memcpy(phase_dir_[set*slices_+slice],m1->getObjectPtr()->phase_dir,3*sizeof(float));
+      memcpy(slice_dir_[set*slices_+slice],m1->getObjectPtr()->slice_dir,3*sizeof(float));
+    }
+    
+    // Only when the first profile arrives do we know the #samples/profile
+    //
+
+    if( samples_per_profile_ == -1 )      
+      samples_per_profile_ = m1->getObjectPtr()->number_of_samples;
+    
+    if( samples_per_profile_ != m1->getObjectPtr()->number_of_samples ){
+      GDEBUG("Unexpected change in the incoming profiles' lengths\n");
+      return GADGET_FAIL;
+    }
+    
+    // Reconfigure on the first pass
+    // - or if the number of coils changes
+    // - or if the reconfigure_ flag is set
+
+    if( num_coils_[set*slices_+slice] != m1->getObjectPtr()->active_channels ){
+      GDEBUG("Reconfiguring due to change in the number of coils\n");
+      num_coils_[set*slices_+slice] = m1->getObjectPtr()->active_channels;
+      reconfigure(set, slice);
+    }
+
+    if( reconfigure_[set*slices_+slice] ){
+      GDEBUG("Reconfiguring due to boolean indicator\n");
+      reconfigure(set, slice);
+    }
+
+    // Enqueue profile
+    // - if 'new_cardiac_cycle_detected' is set, the current profile does not
+    //   belong to the current cardiac cycle and we delay enqueuing it
+    //
+
+    buffer_profiles_queue_[set*slices_+slice].enqueue_tail(duplicate_profile(m2));
+    
+    if( !new_cardiac_cycle_detected ) {
+      if( recon_profiles_queue_[set*slices_+slice].message_count() == 0 ){
+        first_profile_acq_time_[set*slices_+slice] = m1->getObjectPtr()->acquisition_time_stamp;
+        first_profile_phys_time_[set*slices_+slice] = m1->getObjectPtr()->physiology_time_stamp[phys_time_index_];
+      }
+      recon_profiles_queue_[set*slices_+slice].enqueue_tail(duplicate_profile(m2));
+    }
+    
+    // If the profile is the last of a "buffer frame" 
+    // - then update the accumulation buffer
+    //
+    
+    bool is_last_profile_in_buffer_frame = 
+      ( buffer_profiles_queue_[set*slices_+slice].message_count() == profiles_per_buffer_frame_ );
+    
+    if( is_last_profile_in_buffer_frame ){
+      
+      // Extract this frame's samples to update the csm/regularization buffer
+      //
+      
+      boost::shared_ptr< hoNDArray<float_complext> > host_samples = 
+        extract_samples_from_buffer_queue( set, slice );
+      
+      if( host_samples.get() == 0x0 ){
+        GDEBUG("Failed to extract buffer samples from queue\n");
+        return GADGET_FAIL;
+      }
+      
+      cuNDArray<float_complext> samples( host_samples.get() );
+      
+      long profile_offset = profiles_counter_global_[set*slices_+slice];
+      boost::shared_ptr< cuNDArray<floatd2> > traj = calculate_trajectory_for_buffer(profile_offset, set, slice);
+      
+      buffer_update_needed_[set*slices_+slice] |= acc_buffer->add_frame_data( &samples, traj.get() );
+    }
+    
+    // Perform reconstruction if it is time...
+    //
+      
+    if( new_cardiac_cycle_detected ){
+      
+      // Prepare the image headers for the reconstruction
+      //
+      
+      boost::shared_array<ISMRMRD::ImageHeader> headers( new ISMRMRD::ImageHeader[frames_per_cardiac_cycle_] );
+      
+      for( unsigned int i=0; i<frames_per_cardiac_cycle_; i++ ){
+        
+        ISMRMRD::AcquisitionHeader *base_head = m1->getObjectPtr();
+        ISMRMRD::ImageHeader *header = &headers[i];
+        
+        {
+          // Initialize header to all zeroes (there are a few fields we do not set yet)
+          ISMRMRD::ImageHeader tmp;
+          *header = tmp;
+        }
+        
+        header->version = base_head->version;
+        
+        header->matrix_size[0] = image_dimensions_recon_[0];
+        header->matrix_size[1] = image_dimensions_recon_[1];
+        header->matrix_size[2] = 1;
+        
+        header->field_of_view[0] = fov_[0];
+        header->field_of_view[1] = fov_[1];
+        header->field_of_view[2] = fov_[2];
+        
+        header->channels = num_coils_[set*slices_+slice];
+        header->slice = base_head->idx.slice;
+        header->set = base_head->idx.set;
+        
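+        // Time stamps are linearly interpolated across the cardiac cycle: frame i gets
+        // a stamp i/frames_per_cardiac_cycle_ of the way from the first profile of the
+        // reconstructed cycle towards the current profile (the first of the next cycle).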
+        header->acquisition_time_stamp = 
+          first_profile_acq_time_[set*slices_+slice] + 
+          i*(base_head->acquisition_time_stamp-first_profile_acq_time_[set*slices_+slice])/frames_per_cardiac_cycle_;
+
+        header->physiology_time_stamp[phys_time_index_] = 
+          first_profile_phys_time_[set*slices_+slice] + 
+          i*(base_head->physiology_time_stamp[phys_time_index_]-first_profile_phys_time_[set*slices_+slice])/frames_per_cardiac_cycle_;
+
+        memcpy(header->position, base_head->position, sizeof(float)*3);
+        memcpy(header->read_dir, base_head->read_dir, sizeof(float)*3);
+        memcpy(header->phase_dir, base_head->phase_dir, sizeof(float)*3);
+        memcpy(header->slice_dir, base_head->slice_dir, sizeof(float)*3);
+        memcpy(header->patient_table_position, base_head->patient_table_position, sizeof(float)*3);
+        
+        header->data_type = ISMRMRD::ISMRMRD_CXFLOAT;
+        header->image_index = image_counter_[set*slices_+slice]++; 
+        header->image_series_index = set*slices_+slice;        
+      }
+      
+      // Update csm and regularization images
+      //
+
+      if( buffer_update_needed_[set*slices_+slice] || 
+          csm_host_[set*slices_+slice].get_number_of_elements() == 0 || 
+          reg_host_[set*slices_+slice].get_number_of_elements() == 0 ) {
+
+        // Get the accumulated coil images
+        //
+        
+        boost::shared_ptr< cuNDArray<float_complext> > csm_data = acc_buffer->get_accumulated_coil_images();
+        
+        if( !csm_data.get() ){
+          GDEBUG("Error during accumulation buffer computation\n");
+          return GADGET_FAIL;
+        }            
+	
+        // Estimate CSM
+        //
+
+        boost::shared_ptr< cuNDArray<float_complext> > csm = estimate_b1_map<float,2>( csm_data.get() );
+
+        if( !csm.get() ){
+          GDEBUG("Error during coil estimation\n");
+          return GADGET_FAIL;
+        }            
+      
+        acc_buffer->set_csm(csm);
+        csm_host_[set*slices_+slice] = *(csm->to_host());
+      
+        // Compute regularization image
+        //
+
+        boost::shared_ptr< cuNDArray<float_complext> > reg_image;
+	
+        if( buffer_using_solver_ ){
+          ((cuSenseBufferCg<float,2>*)acc_buffer)->preprocess( calculate_trajectory_for_rhs( profiles_counter_global_[set*slices_+slice], set, slice).get() );
+        }
+      
+        reg_image = acc_buffer->get_combined_coil_image();
+        
+        if( !reg_image.get() ){
+          GDEBUG("Error computing regularization image\n");
+          return GADGET_FAIL;
+        }            
+	
+        reg_host_[set*slices_+slice] = *(reg_image->to_host());
+        
+        /*
+          static int counter = 0;
+          char filename[256];
+          sprintf((char*)filename, "reg_%d.real", counter);
+          write_nd_array<float>( abs(&reg_host_[set*slices_+slice]).get(), filename );
+          counter++;  */
+
+        buffer_update_needed_[set*slices_+slice] = false;        
+      }
+
+      // Prepare data array of the profiles for the downstream reconstruction
+      //
+      
+      boost::shared_ptr< hoNDArray<float_complext> > samples_host( new hoNDArray<float_complext>() );
+      boost::shared_ptr< hoNDArray<floatd2> > traj_host( new hoNDArray<floatd2> );
+
+      if( extract_samples_and_trajectory_from_recon_queue( set, slice, samples_host, traj_host ) != GADGET_OK ){
+        GDEBUG("Failed to extract samples and/or trajectories.\n");
+        return GADGET_FAIL;
+      }        
+      
+      // Set up Sense job
+      //
+
+      GadgetContainerMessage< GenericReconJob >* m4 = new GadgetContainerMessage< GenericReconJob >();
+	
+      m4->getObjectPtr()->dat_host_ = samples_host;
+      m4->getObjectPtr()->tra_host_ = traj_host;
+      m4->getObjectPtr()->dcw_host_ = boost::shared_ptr< hoNDArray<float> >(new hoNDArray<float>(host_weights_recon_[set*slices_+slice]));
+      m4->getObjectPtr()->csm_host_ = boost::shared_ptr< hoNDArray<float_complext> >( new hoNDArray<float_complext>(csm_host_[set*slices_+slice]));
+      m4->getObjectPtr()->reg_host_ = boost::shared_ptr< hoNDArray<float_complext> >( new hoNDArray<float_complext>(reg_host_[set*slices_+slice]));
+      m4->getObjectPtr()->image_headers_ = headers;
+      
+      // The Sense Job needs an image header as well. 
+      // Let us just copy the initial one...
+      //
+
+      GadgetContainerMessage<ISMRMRD::ImageHeader> *m3 = new GadgetContainerMessage<ISMRMRD::ImageHeader>;
+      *m3->getObjectPtr() = m4->getObjectPtr()->image_headers_[0];
+      m3->cont(m4);
+      
+      if (this->next()->putq(m3) < 0) {
+        GDEBUG("Failed to put job on queue.\n");
+        m3->release();
+        return GADGET_FAIL;
+      }
+    }
+    
+    // This was the first profile of a new cardiac cycle; enqueue it now (this was postponed above).
+    //
+
+    if( new_cardiac_cycle_detected ){      
+      if( recon_profiles_queue_[set*slices_+slice].message_count() == 0 ){
+        first_profile_acq_time_[set*slices_+slice] = m1->getObjectPtr()->acquisition_time_stamp;
+        first_profile_phys_time_[set*slices_+slice] = m1->getObjectPtr()->physiology_time_stamp[phys_time_index_];
+      }
+      recon_profiles_queue_[set*slices_+slice].enqueue_tail(duplicate_profile(m2)); 
+    }
+    
+    profiles_counter_global_[set*slices_+slice]++;
+
+    if( output_timing_ )
+      process_timer.reset();
+    
+    m1->release(); // the internal queues hold copies
+    return GADGET_OK;
+  }
+  
+  int
+  gpuRetroGatedSensePrepGadget::calculate_density_compensation_for_reconstruction( unsigned int set, unsigned int slice )
+  {
+    switch(mode_){
+      
+    case 2:
+    case 3:
+      host_weights_recon_[set*slices_+slice] = *compute_radial_dcw_golden_ratio_2d<float>
+        ( samples_per_profile_, profiles_per_frame_, oversampling_factor_, 
+          1.0f/(float(samples_per_profile_)/float(image_dimensions_recon_[0])), 0,
+          (mode_==2) ? GR_ORIGINAL : GR_SMALLEST )->to_host();
+      break;
+      
+    default:
+      GDEBUG("Illegal dcw mode\n");
+      return GADGET_FAIL;
+      break;
+    }
+    return GADGET_OK;
+  }
+  
+  boost::shared_ptr< cuNDArray<floatd2> > 
+  gpuRetroGatedSensePrepGadget::calculate_trajectory_for_buffer( long profile_offset, unsigned int set, unsigned int slice )
+  {
+    boost::shared_ptr< cuNDArray<floatd2> > result;
+
+    switch(mode_){
+
+    case 2:
+    case 3:
+      { 
+
+        long first_profile_in_buffer = profile_offset + 1 - profiles_per_buffer_frame_;
+
+        result = compute_radial_trajectory_golden_ratio_2d<float>
+          ( samples_per_profile_, profiles_per_buffer_frame_, 1, first_profile_in_buffer, (mode_==2) ? GR_ORIGINAL : GR_SMALLEST );
+
+      }
+      break;	
+	
+    default:
+      GDEBUG("Illegal trajectory mode\n");
+      break;
+    }
+    
+    return result;
+  }
+
+  boost::shared_ptr< cuNDArray<float> >
+  gpuRetroGatedSensePrepGadget::calculate_density_compensation_for_buffer( unsigned int set, unsigned int slice )
+  {    
+    switch(mode_){
+      
+    case 2:
+    case 3:
+      return compute_radial_dcw_golden_ratio_2d<float>
+        ( samples_per_profile_, profiles_per_buffer_frame_, oversampling_factor_, 
+          1.0f/(float(samples_per_profile_)/float(image_dimensions_recon_[0])), 0,
+          (mode_==2) ? GR_ORIGINAL : GR_SMALLEST );
+      break;
+      
+    default:
+      GDEBUG("Illegal dcw mode\n");
+      return boost::shared_ptr< cuNDArray<float> >();
+      break;
+    }   
+  }
+
+
+  boost::shared_ptr< cuNDArray<floatd2> > 
+  gpuRetroGatedSensePrepGadget::calculate_trajectory_for_rhs( long profile_offset, unsigned int set, unsigned int slice )
+  {
+    switch(mode_){
+
+    case 2:
+    case 3:
+      { 
+
+        long first_profile =
+          std::max( 0L, profile_offset + 1 - profiles_per_buffer_frame_*num_buffer_frames_inner_ );
+
+        return compute_radial_trajectory_golden_ratio_2d<float>
+          ( samples_per_profile_, profiles_per_buffer_frame_*num_buffer_frames_inner_, 1, first_profile, (mode_==2) ? GR_ORIGINAL : GR_SMALLEST );
+      }
+      break;	
+	
+    default:
+      GDEBUG("Illegal trajectory mode\n");
+      return boost::shared_ptr< cuNDArray<floatd2> >();
+      break;
+    }
+  }
+  
+  boost::shared_ptr< cuNDArray<float> >
+  gpuRetroGatedSensePrepGadget::calculate_density_compensation_for_rhs( unsigned int set, unsigned int slice )
+  {
+    switch(mode_){
+      
+    case 2:
+    case 3:
+      {
+
+        long num_profiles = profiles_per_buffer_frame_*num_buffer_frames_inner_;
+
+        return compute_radial_dcw_golden_ratio_2d<float>
+          ( samples_per_profile_, num_profiles, oversampling_factor_, 
+            1.0f/(float(samples_per_profile_)/float(image_dimensions_recon_[0])), 0,
+            (mode_==2) ? GR_ORIGINAL : GR_SMALLEST );
+
+      }
+      break;
+      
+    default:
+      GDEBUG("Illegal dcw mode\n");
+      return boost::shared_ptr< cuNDArray<float> >();
+      break;
+    }
+  }
+
+  boost::shared_ptr< hoNDArray<float_complext> > 
+  gpuRetroGatedSensePrepGadget::extract_samples_from_buffer_queue( unsigned int set, unsigned int slice )
+  {    
+    ACE_Message_Queue<ACE_MT_SYNCH> *queue = &buffer_profiles_queue_[set*slices_+slice];
+
+    unsigned int profiles_buffered = queue->message_count();
+    
+    std::vector<size_t> dims;
+    dims.push_back(samples_per_profile_*profiles_buffered);
+    dims.push_back(num_coils_[set*slices_+slice]);
+    
+    boost::shared_ptr< hoNDArray<float_complext> > host_samples(new hoNDArray<float_complext>(&dims));
+    
+    for (unsigned int p=0; p<profiles_buffered; p++) {
+
+      ACE_Message_Block* mbq;
+      if (queue->dequeue_head(mbq) < 0) {
+        GDEBUG("Message dequeue failed\n");
+        return boost::shared_ptr< hoNDArray<float_complext> >();
+      }
+      
+      GadgetContainerMessage< hoNDArray< std::complex<float> > > *daq = AsContainerMessage<hoNDArray< std::complex<float> > >(mbq);
+	
+      if (!daq) {
+        GDEBUG("Unable to interpret data on message queue\n");
+        return boost::shared_ptr< hoNDArray<float_complext> >();
+      }
+	
+      for (unsigned int c = 0; c < num_coils_[set*slices_+slice]; c++) {
+	
+        float_complext *data_ptr = host_samples->get_data_ptr();
+        data_ptr += c*samples_per_profile_*profiles_buffered+p*samples_per_profile_;
+	    
+        std::complex<float> *r_ptr = daq->getObjectPtr()->get_data_ptr();
+        r_ptr += c*daq->getObjectPtr()->get_size(0);
+	  
+        memcpy(data_ptr,r_ptr,samples_per_profile_*sizeof(float_complext));
+      }
+      
+      mbq->release();
+    } 
+    
+    return host_samples;
+  }
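+
+  /*
+    Layout note (reading aid, not part of the upstream source): the array
+    assembled above has dimensions
+    [ samples_per_profile_ * profiles_buffered, num_coils ], so sample s of
+    profile p and coil c sits at the linear offset
+
+      c * samples_per_profile_ * profiles_buffered + p * samples_per_profile_ + s
+
+    i.e. all profiles of one coil are stored contiguously, matching the
+    pointer arithmetic in the copy loop above.
+  */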
+  
+  int gpuRetroGatedSensePrepGadget::extract_samples_and_trajectory_from_recon_queue
+  ( unsigned int set, unsigned int slice, boost::shared_ptr< hoNDArray<float_complext> > samples, boost::shared_ptr< hoNDArray<floatd2> > trajectory )
+  {    
+    // Extract samples from queue and put into buffer 
+    //
+
+    ACE_Message_Queue<ACE_MT_SYNCH> *queue = &recon_profiles_queue_[set*slices_+slice];
+    long profiles_buffered = queue->message_count();
+    
+    std::vector<size_t> dims_per_readout;
+    dims_per_readout.push_back(samples_per_profile_);
+    dims_per_readout.push_back(num_coils_[set*slices_+slice]);
+    
+    std::vector<size_t> dims_for_buffer = dims_per_readout;
+    dims_for_buffer.push_back(profiles_buffered);
+    
+    hoNDArray< std::complex<float> > host_buffer(&dims_for_buffer);
+
+    for (long p=0; p<profiles_buffered; p++) {
+      
+      ACE_Message_Block* mbq;
+      if (queue->dequeue_head(mbq) < 0) {
+        GDEBUG("Message dequeue failed\n");
+        return GADGET_FAIL;
+      }
+      
+      GadgetContainerMessage< hoNDArray< std::complex<float> > > *daq = 
+        AsContainerMessage<hoNDArray< std::complex<float> > >(mbq);
+	
+      if (!daq) {
+        GDEBUG("Unable to interpret data on message queue\n");
+        return GADGET_FAIL;
+      }
+
+      {
+        // Copy daq into host_buffer array
+        hoNDArray< std::complex<float> > tmp( &dims_per_readout, host_buffer.get_data_ptr() + p*dims_per_readout[0]*dims_per_readout[1] );
+        if( !tmp.dimensions_equal( daq->getObjectPtr()->get_dimensions().get() )){
+          GDEBUG("Unexpected dimensionality of array on message queue\n");
+          return GADGET_FAIL;
+        }
+        tmp = *daq->getObjectPtr();
+      }
+      mbq->release();
+    } 
+
+    // Create trajectory array according to the samples buffer
+    //
+
+    long first_profile_in_buffer = 
+      profiles_counter_global_[set*slices_+slice] - profiles_buffered;
+    
+    boost::shared_ptr< hoNDArray<floatd2> > host_traj = compute_radial_trajectory_golden_ratio_2d<float>
+      ( samples_per_profile_, profiles_buffered, 1, first_profile_in_buffer, (mode_==2) ? GR_ORIGINAL : GR_SMALLEST )->to_host();
+
+    host_traj->squeeze();
+
+    // Prepare samples and trajectory arrays according to the current
+    // 'profiles_per_frame_' and 'frames_per_cardiac_cycle_' settings
+    //
+    
+    std::vector<size_t> recon_dims;
+    recon_dims.push_back(samples_per_profile_*profiles_per_frame_*frames_per_cardiac_cycle_);
+    recon_dims.push_back(num_coils_[set*slices_+slice]);
+    
+    std::vector<size_t> traj_dims_frame;
+    traj_dims_frame.push_back( samples_per_profile_*profiles_per_frame_ );
+
+    std::vector<size_t> traj_dims = traj_dims_frame;
+    traj_dims.push_back( frames_per_cardiac_cycle_ );
+
+    samples->create( recon_dims );
+    trajectory->create( traj_dims );
+    
+    for( long frame=0; frame<frames_per_cardiac_cycle_; frame++ ){
+      
+      long first_profile = 
+        (long)(float(frame)*float(profiles_buffered-profiles_per_frame_)/float(frames_per_cardiac_cycle_-1));
+      // Just to be sure we do not run out of bounds due to rounding errors in the float<->int math
+      //
+
+      if( first_profile < 0 ){
+        GDEBUG("\nWARNING: first profile is negative. Corrected.");
+        first_profile = 0;
+      }
+
+      if (first_profile + profiles_per_frame_ - 1  > profiles_buffered -1 ){
+        GDEBUG("\nWARNING: first profile is out of bounds for the last profile. Corrected.");
+        first_profile = profiles_buffered - profiles_per_frame_;
+      }
+
+      //printf( "\nFor frame %ld: The first profile has index %ld (of %ld).", frame, first_profile, profiles_buffered );
+        
+      for( long coil=0; coil<num_coils_[set*slices_+slice]; coil++ ){
+        
+        for( long profile = 0; profile<profiles_per_frame_; profile++ ){
+
+          // Copy samples for profile
+          //
+
+          memcpy( samples->get_data_ptr() + 
+                  coil*samples_per_profile_*profiles_per_frame_*frames_per_cardiac_cycle_ +
+                  frame*samples_per_profile_*profiles_per_frame_ + 
+                  profile*samples_per_profile_,
+                  
+                  host_buffer.get_data_ptr() + 
+                  (first_profile + profile) * samples_per_profile_*num_coils_[set*slices_+slice]+
+                  coil*samples_per_profile_,
+                  
+                  sizeof(std::complex<float>)*samples_per_profile_);
+          
+          // Copy trajectory for profile
+          //
+
+          memcpy( trajectory->get_data_ptr() + 
+                  frame*samples_per_profile_*profiles_per_frame_ + 
+                  profile*samples_per_profile_,
+                  
+                  host_traj->get_data_ptr() + 
+                  (first_profile + profile) * samples_per_profile_,
+                  
+                  sizeof(floatd2)*samples_per_profile_);
+        }
+      }
+    }
+    return GADGET_OK;
+  }
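+
+  /*
+    Worked example (hypothetical numbers, not part of the upstream source):
+    with profiles_buffered = 100, profiles_per_frame_ = 16 and
+    frames_per_cardiac_cycle_ = 30, the loop above assigns frame f the window
+    starting at
+
+      first_profile(f) = (long)( f * (100 - 16) / 29.0f )
+
+    so frame 0 starts at profile 0, frame 29 at profile 84 (its last profile
+    being 99), and neighbouring frames overlap, giving a sliding window across
+    the buffered cardiac cycle.
+  */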
+  
+  GadgetContainerMessage< hoNDArray< std::complex<float> > >*
+  gpuRetroGatedSensePrepGadget::duplicate_profile( GadgetContainerMessage< hoNDArray< std::complex<float> > > *profile )
+  {
+    GadgetContainerMessage< hoNDArray< std::complex<float> > > *copy = 
+      new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+    
+    *copy->getObjectPtr() = *profile->getObjectPtr();
+    
+    return copy;
+  }
+
+  void gpuRetroGatedSensePrepGadget::reconfigure(unsigned int set, unsigned int slice)
+  {    
+    calculate_density_compensation_for_reconstruction(set, slice);
+    
+    cuSenseBuffer<float,2> *acc_buffer = (buffer_using_solver_) ? &acc_buffer_cg_[set*slices_+slice] : &acc_buffer_[set*slices_+slice];
+
+    acc_buffer->setup( from_std_vector<size_t,2>(image_dimensions_recon_), image_dimensions_recon_os_, 
+                       kernel_width_, num_coils_[set*slices_+slice],                        
+                       num_buffer_frames_outer_, num_buffer_frames_inner_ );
+    
+    boost::shared_ptr< cuNDArray<float> > device_weights = calculate_density_compensation_for_buffer(set, slice);
+    acc_buffer->set_dcw(device_weights);
+
+    if( buffer_using_solver_ ){
+      ((cuSenseBufferCg<float,2>*) acc_buffer)->set_dcw_for_rhs(calculate_density_compensation_for_rhs(set, slice));
+      ((cuSenseBufferCg<float,2>*) acc_buffer)->preprocess(calculate_trajectory_for_rhs(0, set, slice).get());
+    }
+    
+    reconfigure_[set*slices_+slice] = false;
+  }
+
+  GADGET_FACTORY_DECLARE(gpuRetroGatedSensePrepGadget)
+}
diff --git a/gadgets/radial/gpuRetroGatedSensePrepGadget.h b/gadgets/radial/gpuRetroGatedSensePrepGadget.h
new file mode 100644
index 0000000..6b245ff
--- /dev/null
+++ b/gadgets/radial/gpuRetroGatedSensePrepGadget.h
@@ -0,0 +1,148 @@
+#pragma once
+
+#include "gadgetron_radial_export.h"
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+#include "vector_td.h"
+#include "cuNFFT.h"
+#include "cuCgPreconditioner.h"
+#include "cuSenseBufferCg.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+#include <boost/shared_ptr.hpp>
+#include <boost/shared_array.hpp>
+
+/*
+  Prep gadget for retrospectively gated Sense based on golden ratio sampling.
+  Thus only radial modes 2-3 are supported.  
+*/
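+
+/*
+  Editorial sketch (reading aid only; the implementation relies on
+  compute_radial_trajectory_golden_ratio_2d): with golden-ratio sampling the
+  azimuthal angle of profile n advances by an irrational increment of roughly
+  111.25 degrees, e.g.
+
+    #include <cmath>
+    inline double golden_angle_rad(long n)
+    {
+      const double phi = (1.0 + std::sqrt(5.0)) / 2.0;   // golden ratio
+      return std::fmod(n * M_PI / phi, M_PI);            // fold into [0, pi)
+    }
+
+  Mode 3 (GR_SMALLEST) uses a smaller golden-angle increment from the same
+  family; see the gpu radial toolbox for the exact conventions.
+*/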
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_RADIAL gpuRetroGatedSensePrepGadget :
+    public Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+  {
+
+  public:
+    GADGET_DECLARE(gpuRetroGatedSensePrepGadget);
+    gpuRetroGatedSensePrepGadget();
+    virtual ~gpuRetroGatedSensePrepGadget();
+
+  protected:
+    GADGET_PROPERTY_LIMITS(mode,int,"Radial mode", 2, GadgetPropertyLimitsEnumeration, 2, 3);
+    GADGET_PROPERTY(deviceno,int,"GPU Device Number", 0);
+    GADGET_PROPERTY(profiles_per_frame, int, "Profiles per frame", 16);
+    GADGET_PROPERTY(frames_per_cardiac_cycle, int, "Frames in a cardiac cycle", 30);
+    GADGET_PROPERTY(profiles_per_buffer_frame, int, "Profiles in each buffer frame", 32);
+    GADGET_PROPERTY(number_of_buffer_frames_inner, int, "Number of inner buffer frames", 8);
+    GADGET_PROPERTY(number_of_buffer_frames_outer, int, "Number of outer buffer frames", 1);
+    GADGET_PROPERTY(buffer_using_solver, bool, "Use solver for buffer", false);
+    GADGET_PROPERTY(output_timing, bool, "Output timing information", false);
+    GADGET_PROPERTY(physiology_time_index, int, "Physiology time index", 0);
+    GADGET_PROPERTY(buffer_convolution_kernel_width, float, "Convolution kernel width for buffer", 5.5);
+    GADGET_PROPERTY(buffer_convolution_oversampling_factor, float, "Oversampling used in buffer convolution", 1.25);
+    GADGET_PROPERTY(reconstruction_os_factor_x, float, "Oversampling for reconstruction in x-direction", 1.0);
+    GADGET_PROPERTY(reconstruction_os_factor_y, float, "Oversampling for reconstruction in y-direction", 1.0);
+
+
+    virtual int process_config(ACE_Message_Block *mb);
+
+    virtual int process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader > *m1,
+			GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2);
+
+  private:
+
+    inline bool vec_equal(float *in1, float *in2) {
+      for (unsigned int i = 0; i < 3; i++) {
+        if (in1[i] != in2[i]) return false;
+      }
+      return true;
+    }
+    
+    boost::shared_array<bool> reconfigure_;
+    virtual void reconfigure(unsigned int set, unsigned int slice);
+
+    GadgetContainerMessage< hoNDArray< std::complex<float> > >*
+      duplicate_profile( GadgetContainerMessage< hoNDArray< std::complex<float> > > *profile );
+
+    boost::shared_ptr< hoNDArray<float_complext> > extract_samples_from_buffer_queue( unsigned int set, unsigned int slice );
+
+    int extract_samples_and_trajectory_from_recon_queue
+      ( unsigned int set, unsigned int slice, boost::shared_ptr< hoNDArray<float_complext> > samples, boost::shared_ptr< hoNDArray<floatd2> > trajectory );
+
+    int calculate_density_compensation_for_reconstruction(unsigned int set, unsigned int slice);
+
+    boost::shared_ptr< cuNDArray<floatd2> > 
+      calculate_trajectory_for_buffer(long profile_offset, unsigned int set, unsigned int slice);
+
+    boost::shared_ptr< cuNDArray<float> >
+      calculate_density_compensation_for_buffer(unsigned int set, unsigned int slice);
+
+    boost::shared_ptr< cuNDArray<floatd2> > 
+      calculate_trajectory_for_rhs(long profile_offset, unsigned int set, unsigned int slice);
+
+    boost::shared_ptr< cuNDArray<float> > 
+      calculate_density_compensation_for_rhs(unsigned int set, unsigned int slice);
+
+    int slices_;
+    int sets_;
+    int device_number_;
+    int mode_;
+
+    unsigned short phys_time_index_;
+
+    long samples_per_profile_;
+    long profiles_per_frame_;
+    long frames_per_cardiac_cycle_;
+
+    // Buffer frame configuration
+    long profiles_per_buffer_frame_;
+    long num_buffer_frames_inner_; 
+    long num_buffer_frames_outer_;
+
+    // Internal book-keeping
+    boost::shared_array<unsigned int> first_profile_acq_time_;
+    boost::shared_array<unsigned int> first_profile_phys_time_;
+    boost::shared_array<unsigned int> previous_timestamp_;
+    boost::shared_array<long> profiles_counter_global_;
+
+    // We will discard profiles until the first R-wave is encountered
+    boost::shared_array<bool> Rw_reached_;
+    boost::shared_array<unsigned int> Rw_offset_;
+
+    // For the buffer
+    float kernel_width_;
+    float oversampling_factor_;
+
+    boost::shared_array<long> image_counter_;
+    boost::shared_array<unsigned int> num_coils_;
+
+    boost::shared_array<float[3]> position_;
+    boost::shared_array<float[3]> read_dir_;
+    boost::shared_array<float[3]> phase_dir_;
+    boost::shared_array<float[3]> slice_dir_;
+
+    bool output_timing_;
+    bool buffer_using_solver_;
+
+    boost::shared_array<bool> buffer_update_needed_;
+
+    boost::shared_array< hoNDArray<float> > host_weights_recon_;
+    
+    boost::shared_array< hoNDArray<float_complext> > csm_host_;
+    boost::shared_array< hoNDArray<float_complext> > reg_host_;
+    
+    boost::shared_array< cuSenseBuffer<float,2> > acc_buffer_;
+    boost::shared_array< cuSenseBufferCg<float,2> > acc_buffer_cg_;
+
+    std::vector<size_t> fov_;
+    std::vector<size_t> image_dimensions_;
+    std::vector<size_t> image_dimensions_recon_;
+    uint64d2 image_dimensions_recon_os_;
+
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > buffer_profiles_queue_;
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > recon_profiles_queue_;
+  };
+}
diff --git a/gadgets/spiral/CMakeLists.txt b/gadgets/spiral/CMakeLists.txt
new file mode 100644
index 0000000..bfebb94
--- /dev/null
+++ b/gadgets/spiral/CMakeLists.txt
@@ -0,0 +1,45 @@
+IF (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_SPIRAL__)
+ENDIF (WIN32)
+
+find_package(Ismrmrd REQUIRED)
+
+include_directories(
+  ${CMAKE_SOURCE_DIR}/gadgets/mri_core
+  ${CMAKE_SOURCE_DIR}/gadgets/pmri
+  ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators/gpu
+  ${CUDA_INCLUDE_DIRS}
+  )
+
+add_library(gadgetron_spiral SHARED 
+  gadgetron_spiral_export.h 
+  vds.cpp 
+  gpuSpiralSensePrepGadget.h gpuSpiralSensePrepGadget.cpp 
+  SpiralToGenericGadget.h SpiralToGenericGadget.cpp)
+
+set_target_properties(gadgetron_spiral PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(gadgetron_spiral
+  gadgetron_gadgetbase
+  gadgetron_toolbox_log
+  gadgetron_toolbox_cpucore gadgetron_toolbox_gpucore gadgetron_toolbox_gpunfft gadgetron_toolbox_gpusolvers gadgetron_toolbox_gpuoperators
+  ${ISMRMRD_LIBRARIES} ${FFTW3_LIBRARIES} ${CUDA_LIBRARIES}
+  optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY}
+  )
+
+target_link_libraries(gadgetron_spiral gadgetron_toolbox_gpuparallelmri)
+
+install (TARGETS gadgetron_spiral DESTINATION lib COMPONENT main)
+install (FILES vds.h gadgetron_spiral_export.h 
+                     gpuSpiralSensePrepGadget.h 
+                     SpiralToGenericGadget.h 
+                     DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
+add_subdirectory(config)
diff --git a/gadgets/spiral/SpiralToGenericGadget.cpp b/gadgets/spiral/SpiralToGenericGadget.cpp
new file mode 100644
index 0000000..9676f0b
--- /dev/null
+++ b/gadgets/spiral/SpiralToGenericGadget.cpp
@@ -0,0 +1,218 @@
+#include "SpiralToGenericGadget.h"
+#include "ismrmrd/xml.h"
+#include "vds.h"
+
+#include <algorithm>
+#include <vector>
+
+namespace Gadgetron{
+
+  SpiralToGenericGadget::SpiralToGenericGadget()
+    : samples_to_skip_start_(0)
+    , samples_to_skip_end_(0)
+    , samples_per_interleave_(0)
+    , prepared_(false)
+  {
+  }
+
+  SpiralToGenericGadget::~SpiralToGenericGadget() {}
+
+  int SpiralToGenericGadget::process_config(ACE_Message_Block* mb)
+  {
+    // Start parsing the ISMRMRD XML header
+    //
+  ISMRMRD::IsmrmrdHeader h;
+  ISMRMRD::deserialize(mb->rd_ptr(),h);
+  
+  
+  if (h.encoding.size() != 1) {
+    GDEBUG("This Gadget only supports one encoding space\n");
+    return GADGET_FAIL;
+  }
+
+  // Get the encoding space and trajectory description
+  ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+  ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+  ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+  ISMRMRD::TrajectoryDescription traj_desc;
+
+  if (h.encoding[0].trajectoryDescription) {
+    traj_desc = *h.encoding[0].trajectoryDescription;
+  } else {
+    GDEBUG("Trajectory description missing");
+    return GADGET_FAIL;
+  }
+
+  if (traj_desc.identifier != "HargreavesVDS2000") {
+    GDEBUG("Expected trajectory description identifier 'HargreavesVDS2000', not found.");
+    return GADGET_FAIL;
+  }
+
+
+  long interleaves = -1;
+  long fov_coefficients = -1;
+  long sampling_time_ns = -1;
+  double max_grad = -1.0;
+  double max_slew = -1.0;
+  double fov_coeff = -1.0;
+  double kr_max = -1.0;
+
+  
+  for (std::vector<ISMRMRD::UserParameterLong>::iterator i (traj_desc.userParameterLong.begin()); i != traj_desc.userParameterLong.end(); ++i) {
+    if (i->name == "interleaves") {
+      interleaves = i->value;
+    } else if (i->name == "fov_coefficients") {
+      fov_coefficients = i->value;
+    } else if (i->name == "SamplingTime_ns") {
+      sampling_time_ns = i->value;
+    } else {
+      GDEBUG("WARNING: unused trajectory parameter %s found\n", i->name.c_str());
+    }
+  }
+
+  for (std::vector<ISMRMRD::UserParameterDouble>::iterator i (traj_desc.userParameterDouble.begin()); i != traj_desc.userParameterDouble.end(); ++i) {
+    if (i->name == "MaxGradient_G_per_cm") {
+      max_grad = i->value;
+    } else if (i->name == "MaxSlewRate_G_per_cm_per_s") {
+        max_slew = i->value;
+    } else if (i->name == "FOVCoeff_1_cm") {
+      fov_coeff = i->value;
+    } else if (i->name == "krmax_per_cm") {
+      kr_max= i->value;
+    } else {
+      GDEBUG("WARNING: unused trajectory parameter %s found\n", i->name.c_str());
+    }
+  }
+  
+  if ((interleaves < 0) || (fov_coefficients < 0) || (sampling_time_ns < 0) || (max_grad < 0) || (max_slew < 0) || (fov_coeff < 0) || (kr_max < 0)) {
+    GDEBUG("Appropriate parameters for calculating spiral trajectory not found in XML configuration\n");
+    return GADGET_FAIL;
+  }
+
+  Tsamp_ns_ = sampling_time_ns;
+  Nints_ = interleaves;
+  interleaves_ = static_cast<int>(Nints_);
+  
+  gmax_ = max_grad;
+  smax_ = max_slew;
+  krmax_ = kr_max;
+  fov_ = fov_coeff;
+  
+  samples_to_skip_start_  =  0; //n.get<int>(std::string("samplestoskipstart.value"))[0];
+  samples_to_skip_end_    = -1; //n.get<int>(std::string("samplestoskipend.value"))[0];
+  
+  GDEBUG("smax:                    %f\n", smax_);
+  GDEBUG("gmax:                    %f\n", gmax_);
+  GDEBUG("Tsamp_ns:                %d\n", Tsamp_ns_);
+  GDEBUG("Nints:                   %d\n", Nints_);
+  GDEBUG("fov:                     %f\n", fov_);
+  GDEBUG("krmax:                   %f\n", krmax_);
+  GDEBUG("samples_to_skip_start_ : %d\n", samples_to_skip_start_);
+  GDEBUG("samples_to_skip_end_   : %d\n", samples_to_skip_end_);
+  
+  return GADGET_OK;
+  }
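+
+  /*
+    Summary of the parsing above (reading aid, not part of the upstream
+    source): the trajectoryDescription with identifier "HargreavesVDS2000"
+    must supply
+
+      userParameterLong:   interleaves, fov_coefficients, SamplingTime_ns
+      userParameterDouble: MaxGradient_G_per_cm, MaxSlewRate_G_per_cm_per_s,
+                           FOVCoeff_1_cm, krmax_per_cm
+
+    Leaving any of them at its -1 sentinel makes process_config fail.
+  */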
+  
+  int SpiralToGenericGadget::
+  process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *m1,
+	  GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2)
+  {
+    // Noise should have been consumed by the noise adjust, but just in case...
+    //
+
+    bool is_noise = m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_IS_NOISE_MEASUREMENT);
+    if (is_noise) {
+      m1->release();
+      return GADGET_OK;
+    }
+
+    // Compute hoNDArray of trajectory and weights at first pass
+    //
+
+    if (!prepared_) {
+
+      int     nfov   = 1;         /*  number of fov coefficients.             */
+      int     ngmax  = 1e5;       /*  maximum number of gradient samples      */
+      double  *xgrad;             /*  x-component of gradient.                */
+      double  *ygrad;             /*  y-component of gradient.                */
+      double  *x_trajectory;
+      double  *y_trajectory;
+      double  *weighting;
+      int     ngrad;
+      double sample_time = (1.0*Tsamp_ns_) * 1e-9;
+
+      // Calculate gradients 
+      calc_vds(smax_,gmax_,sample_time,sample_time,Nints_,&fov_,nfov,krmax_,ngmax,&xgrad,&ygrad,&ngrad);
+
+      samples_per_interleave_ = std::min(ngrad,static_cast<int>(m1->getObjectPtr()->number_of_samples));
+      GDEBUG("Using %d samples per interleave\n", samples_per_interleave_);
+
+      // Calculate the trajectory and weights
+      calc_traj(xgrad, ygrad, samples_per_interleave_, Nints_, sample_time, krmax_, &x_trajectory, &y_trajectory, &weighting);
+
+      std::vector<size_t> trajectory_dimensions;
+      trajectory_dimensions.push_back(3);
+      trajectory_dimensions.push_back(samples_per_interleave_*Nints_);
+
+      host_traj_ = boost::shared_ptr< hoNDArray<float> >(new hoNDArray<float>(&trajectory_dimensions));
+
+      {
+	float* co_ptr = reinterpret_cast<float*>(host_traj_->get_data_ptr());
+	
+	for (int i = 0; i < (samples_per_interleave_*Nints_); i++) {
+	  co_ptr[i*3+0] = -x_trajectory[i]/2;
+	  co_ptr[i*3+1] = -y_trajectory[i]/2;
+	  co_ptr[i*3+2] = weighting[i];
+	}
+      }
+
+      delete [] xgrad;
+      delete [] ygrad;
+      delete [] x_trajectory;
+      delete [] y_trajectory;
+      delete [] weighting;
+
+      prepared_ = true;
+    }
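+
+    /*
+      Layout note (reading aid, not part of the upstream source): host_traj_
+      prepared above is a [3, samples_per_interleave_ * Nints_] array of
+      (kx, ky, weight) triplets, with the coordinates stored as -x/2 and -y/2
+      by the loop above. The per-interleave slice handed on below is obtained
+      by offsetting into this array by 3 * samples_per_interleave_ * interleave.
+    */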
+
+    // Adjustments based on the incoming data
+    //
+
+    if (samples_to_skip_end_ == -1) {
+      samples_to_skip_end_ = m1->getObjectPtr()->number_of_samples-samples_per_interleave_;
+      GDEBUG("Adjusting samples_to_skip_end_ = %d\n", samples_to_skip_end_);
+    }
+
+    // Define some utility variables
+    //
+
+    unsigned int samples_to_copy = m1->getObjectPtr()->number_of_samples-samples_to_skip_end_;
+    unsigned int interleave = m1->getObjectPtr()->idx.kspace_encode_step_1;
+
+    // Prepare for a new array continuation for the trajectory/weights of the incoming profile
+    //
+
+    std::vector<size_t> trajectory_dimensions;
+    trajectory_dimensions.push_back(3);
+    trajectory_dimensions.push_back(samples_per_interleave_);
+    
+    hoNDArray<float> traj_source
+      (&trajectory_dimensions, host_traj_->get_data_ptr()+3*samples_per_interleave_*interleave);
+    
+    // Make a new array as continuation of m1, and pass along
+    //
+
+    GadgetContainerMessage< hoNDArray<float> > *cont = new GadgetContainerMessage< hoNDArray<float> >();
+    *(cont->getObjectPtr()) = traj_source;
+    m2->cont(cont);
+    
+    if (this->next()->putq(m1) < 0) {
+      GDEBUG("Failed to put job on queue.\n");
+      return GADGET_FAIL;
+    }
+    
+    return GADGET_OK;
+  }
+  
+  GADGET_FACTORY_DECLARE(SpiralToGenericGadget)
+}
diff --git a/gadgets/spiral/SpiralToGenericGadget.h b/gadgets/spiral/SpiralToGenericGadget.h
new file mode 100644
index 0000000..d28bd30
--- /dev/null
+++ b/gadgets/spiral/SpiralToGenericGadget.h
@@ -0,0 +1,50 @@
+#ifndef SpiralToGenericGadget_H
+#define SpiralToGenericGadget_H
+#pragma once
+
+#include "gadgetron_spiral_export.h"
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+#include <boost/shared_ptr.hpp>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_SPIRAL SpiralToGenericGadget :
+    public Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+  {
+
+  public:
+    GADGET_DECLARE(SpiralToGenericGadget);
+
+    SpiralToGenericGadget();
+    virtual ~SpiralToGenericGadget();
+
+  protected:
+
+    virtual int process_config(ACE_Message_Block* mb);
+    
+    virtual int process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1,
+			GadgetContainerMessage< hoNDArray< std::complex<float> > > * m2);
+    
+  private:
+    int samples_to_skip_start_;
+    int samples_to_skip_end_;
+    int samples_per_interleave_;
+    int interleaves_;
+    long    Tsamp_ns_;
+    long    Nints_;
+    long    acceleration_factor_;
+    double  gmax_;
+    double  smax_;
+    double  krmax_;
+    double  fov_;
+    bool prepared_;
+    
+    boost::shared_ptr< hoNDArray<float> > host_traj_;
+  };
+}
+#endif //SpiralToGenericGadget_H
diff --git a/gadgets/spiral/config/CMakeLists.txt b/gadgets/spiral/config/CMakeLists.txt
new file mode 100644
index 0000000..ed1c006
--- /dev/null
+++ b/gadgets/spiral/config/CMakeLists.txt
@@ -0,0 +1,16 @@
+if (ARMADILLO_FOUND)
+  install (FILES 
+    spiral_flow_gpusense_cg.xml 
+    spiral_flow_gpusense_sb.xml 
+    spiral_flow_generic_gpusense_cg.xml 
+    spiral_flow_generic_gpusense_sb.xml 
+    spiral_interactive.xml 
+    DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH} COMPONENT main)
+else (ARMADILLO_FOUND)
+  MESSAGE("Armadillo not found, only unoptimized spiral config files will be available")
+endif (ARMADILLO_FOUND)
+
+install (FILES 
+  spiral_flow_gpusense_cg_unoptimized.xml 
+  spiral_flow_gpusense_sb_unoptimized.xml 
+  DESTINATION ${GADGETRON_INSTALL_CONFIG_PATH} COMPONENT main)
diff --git a/gadgets/spiral/config/spiral_flow_generic_gpusense_cg.xml b/gadgets/spiral/config/spiral_flow_generic_gpusense_cg.xml
new file mode 100644
index 0000000..647b582
--- /dev/null
+++ b/gadgets/spiral/config/spiral_flow_generic_gpusense_cg.xml
@@ -0,0 +1,131 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1022</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriter</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>SpiralToGenericGadget</name>
+    <dll>gadgetron_spiral</dll>
+    <classname>SpiralToGenericGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>gpuGenericSensePrepGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuGenericSensePrepGadget</classname>
+    <property><name>deviceno</name><value>0</value></property>
+    <property><name>rotations_per_reconstruction</name><value>16</value></property>
+    <property><name>propagate_csm_from_set</name><value>0</value></property>
+    <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+    <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+    <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+    <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>0</value></property>
+    <property><name>number_of_iterations</name>    <value>20</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.1</value></property>
+    <property><name>output_convergence</name><value>true</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>1</value></property>
+    <property><name>number_of_iterations</name>    <value>20</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.1</value></property>
+    <property><name>output_convergence</name><value>true</value></property>
+  </gadget>
+
+  <gadget>
+    <name>PhaseSubtraction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FlowPhaseSubtractionGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>MaxwellCorrection</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>MaxwellCorrectionGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+    <property><name>extract_mask</name><value>9</value></property>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+ 
+  <gadget>
+    <name>ImageFinish</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadget</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/spiral/config/spiral_flow_generic_gpusense_sb.xml b/gadgets/spiral/config/spiral_flow_generic_gpusense_sb.xml
new file mode 100644
index 0000000..bfb07b5
--- /dev/null
+++ b/gadgets/spiral/config/spiral_flow_generic_gpusense_sb.xml
@@ -0,0 +1,141 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1022</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriter</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>SpiralToGenericGadget</name>
+    <dll>gadgetron_spiral</dll>
+    <classname>SpiralToGenericGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>gpuGenericSensePrepGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuGenericSensePrepGadget</classname>
+    <property><name>deviceno</name><value>0</value></property>
+    <property><name>rotations_per_reconstruction</name><value>16</value></property>
+    <property><name>propagate_csm_from_set</name><value>0</value></property>
+    <property><name>buffer_using_solver</name><value>true</value></property>
+    <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+    <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+    <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+    <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuSbSenseGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuSbSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>0</value></property>
+    <property><name>number_of_cg_iterations</name> <value>10</value></property>
+    <property><name>number_of_sb_iterations</name> <value>20</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>mu</name>                      <value>1.0</value></property>
+    <property><name>lambda</name>                  <value>0.05</value></property>
+    <property><name>alpha</name>                   <value>0.5</value></property>
+    <property><name>exclusive_access</name><value>true</value></property>
+    <property><name>output_convergence</name><value>true</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuSbSenseGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuSbSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>1</value></property>
+    <property><name>number_of_cg_iterations</name> <value>10</value></property>
+    <property><name>number_of_sb_iterations</name> <value>20</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>mu</name>                      <value>1.0</value></property>
+    <property><name>lambda</name>                  <value>0.05</value></property>
+    <property><name>alpha</name>                   <value>0.5</value></property>
+    <property><name>exclusive_access</name><value>true</value></property>
+    <property><name>output_convergence</name><value>true</value></property>
+  </gadget>
+
+  <gadget>
+    <name>PhaseSubtraction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FlowPhaseSubtractionGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>MaxwellCorrection</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>MaxwellCorrectionGadget</classname>
+  </gadget>
+  
+
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+    <property><name>extract_mask</name><value>9</value></property>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>ImageFinish</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadget</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/spiral/config/spiral_flow_gpusense_cg.xml b/gadgets/spiral/config/spiral_flow_gpusense_cg.xml
new file mode 100644
index 0000000..ac7a6c1
--- /dev/null
+++ b/gadgets/spiral/config/spiral_flow_gpusense_cg.xml
@@ -0,0 +1,123 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1022</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriter</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuSpiralSensePrepGadget</name>
+    <dll>gadgetron_spiral</dll>
+    <classname>gpuSpiralSensePrepGadget</classname>
+    <property><name>deviceno</name><value>0</value></property>
+    <property><name>use_multiframe_grouping</name><value>true</value></property>
+    <property><name>propagate_csm_from_set</name><value>0</value></property>
+    <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+    <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+    <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+    <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>0</value></property>
+    <property><name>number_of_iterations</name>    <value>10</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.3</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>1</value></property>
+    <property><name>number_of_iterations</name>    <value>10</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.3</value></property>
+  </gadget>
+
+  <gadget>
+    <name>PhaseSubtraction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FlowPhaseSubtractionGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>MaxwellCorrection</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>MaxwellCorrectionGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+    <property><name>extract_mask</name><value>9</value></property>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>ImageFinish</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadget</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/spiral/config/spiral_flow_gpusense_cg_ecg.xml b/gadgets/spiral/config/spiral_flow_gpusense_cg_ecg.xml
new file mode 100644
index 0000000..8340e06
--- /dev/null
+++ b/gadgets/spiral/config/spiral_flow_gpusense_cg_ecg.xml
@@ -0,0 +1,131 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1022</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriter</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuSpiralSensePrepGadget</name>
+    <dll>gadgetron_spiral</dll>
+    <classname>gpuSpiralSensePrepGadget</classname>
+    <property><name>deviceno</name><value>0</value></property>
+    <property><name>use_multiframe_grouping</name><value>true</value></property>
+    <property><name>propagate_csm_from_set</name><value>0</value></property>
+    <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+    <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+    <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+    <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>0</value></property>
+    <property><name>number_of_iterations</name>    <value>10</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.3</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>1</value></property>
+    <property><name>number_of_iterations</name>    <value>10</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.3</value></property>
+  </gadget>
+
+  <gadget>
+    <name>PhaseSubtraction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FlowPhaseSubtractionGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>MaxwellCorrection</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>MaxwellCorrectionGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>PhysioInterpolation</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PhysioInterpolationGadget</classname>
+    <property><name>phases</name><value>30</value></property>
+  </gadget>
+
+  
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+    <property><name>extract_mask</name><value>9</value></property>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>ImageFinish</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadget</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/spiral/config/spiral_flow_gpusense_cg_unoptimized.xml b/gadgets/spiral/config/spiral_flow_gpusense_cg_unoptimized.xml
new file mode 100644
index 0000000..c750044
--- /dev/null
+++ b/gadgets/spiral/config/spiral_flow_gpusense_cg_unoptimized.xml
@@ -0,0 +1,105 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1022</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriter</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget_unoptimized</classname>
+  </gadget>
+  
+  <gadget>
+    <name>gpuSpiralSensePrepGadget</name>
+    <dll>gadgetron_spiral</dll>
+    <classname>gpuSpiralSensePrepGadget</classname>
+    <property><name>deviceno</name><value>0</value></property>
+    <property><name>use_multiframe_grouping</name><value>true</value></property>
+  </gadget>
+  
+    <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>0</value></property>
+    <property><name>number_of_iterations</name>    <value>10</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.3</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>1</value></property>
+    <property><name>number_of_iterations</name>    <value>10</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.3</value></property>
+  </gadget>
+
+  <gadget>
+    <name>PhaseSubtraction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FlowPhaseSubtractionGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>MaxwellCorrection</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>MaxwellCorrectionGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+    <property><name>extract_mask</name><value>9</value></property>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>ImageFinish</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadget</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/spiral/config/spiral_flow_gpusense_sb.xml b/gadgets/spiral/config/spiral_flow_gpusense_sb.xml
new file mode 100644
index 0000000..cc97794
--- /dev/null
+++ b/gadgets/spiral/config/spiral_flow_gpusense_sb.xml
@@ -0,0 +1,132 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1022</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriter</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuSpiralSensePrepGadget</name>
+    <dll>gadgetron_spiral</dll>
+    <classname>gpuSpiralSensePrepGadget</classname>
+    <property><name>deviceno</name><value>0</value></property>
+    <property><name>use_multiframe_grouping</name><value>true</value></property>
+    <property><name>propagate_csm_from_set</name><value>0</value></property>
+    <property><name>buffer_using_solver</name><value>true</value></property>
+    <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+    <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+    <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+    <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>gpuSbSenseGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuSbSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>0</value></property>
+    <property><name>number_of_cg_iterations</name> <value>10</value></property>
+    <property><name>number_of_sb_iterations</name> <value>10</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>mu</name>                      <value>1.0</value></property>
+    <property><name>lambda</name>                  <value>2.0</value></property>
+    <property><name>alpha</name>                   <value>0.5</value></property>
+    <property><name>output_convergence</name><value>true</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuSbSenseGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuSbSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>1</value></property>
+    <property><name>number_of_cg_iterations</name> <value>10</value></property>
+    <property><name>number_of_sb_iterations</name> <value>10</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>mu</name>                      <value>1.0</value></property>
+    <property><name>lambda</name>                  <value>2.0</value></property>
+    <property><name>alpha</name>                   <value>0.5</value></property>
+  </gadget>
+
+  <gadget>
+    <name>PhaseSubtraction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FlowPhaseSubtractionGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>MaxwellCorrection</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>MaxwellCorrectionGadget</classname>
+  </gadget>
+  
+
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+    <property><name>extract_mask</name><value>9</value></property>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>ImageFinish</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadget</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
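
The stream configuration above is consumed by the gadgetron server, which parses it with the bundled pugixml library; the <gadget> order defines the processing pipeline and the <reader>/<writer> entries bind message slot numbers to serialization classes. A minimal, hypothetical sketch of inspecting such a chain offline — the file name and the standalone main() are illustrative assumptions, not code shipped in this release:

    // Hypothetical inspection sketch; the file name is an assumption.
    #include <cstdio>
    #include "pugixml.hpp"

    int main()
    {
        pugi::xml_document doc;
        if (!doc.load_file("spiral_flow_gpusense_sb.xml"))
            return 1;

        pugi::xml_node cfg = doc.child("gadgetronStreamConfiguration");
        for (pugi::xml_node g = cfg.child("gadget"); g; g = g.next_sibling("gadget")) {
            // Print each stage of the pipeline and its configured properties.
            std::printf("%s -> %s::%s\n",
                        g.child_value("name"),
                        g.child_value("dll"),
                        g.child_value("classname"));
            for (pugi::xml_node p = g.child("property"); p; p = p.next_sibling("property"))
                std::printf("    %s = %s\n",
                            p.child_value("name"), p.child_value("value"));
        }
        return 0;
    }
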
diff --git a/gadgets/spiral/config/spiral_flow_gpusense_sb_unoptimized.xml b/gadgets/spiral/config/spiral_flow_gpusense_sb_unoptimized.xml
new file mode 100644
index 0000000..dd30e97
--- /dev/null
+++ b/gadgets/spiral/config/spiral_flow_gpusense_sb_unoptimized.xml
@@ -0,0 +1,111 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1022</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriter</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget_unoptimized</classname>
+  </gadget>
+  
+  <gadget>
+    <name>gpuSpiralSensePrepGadget</name>
+    <dll>gadgetron_spiral</dll>
+    <classname>gpuSpiralSensePrepGadget</classname>
+    <property><name>deviceno</name><value>0</value></property>
+    <property><name>use_multiframe_grouping</name><value>true</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>gpuSbSenseGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuSbSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>0</value></property>
+    <property><name>number_of_cg_iterations</name> <value>10</value></property>
+    <property><name>number_of_sb_iterations</name> <value>10</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>mu</name>                      <value>1.0</value></property>
+    <property><name>lambda</name>                  <value>2.0</value></property>
+    <property><name>alpha</name>                   <value>0.5</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuSbSenseGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuSbSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>1</value></property>
+    <property><name>number_of_cg_iterations</name> <value>10</value></property>
+    <property><name>number_of_sb_iterations</name> <value>10</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>mu</name>                      <value>1.0</value></property>
+    <property><name>lambda</name>                  <value>2.0</value></property>
+    <property><name>alpha</name>                   <value>0.5</value></property>
+  </gadget>
+
+  <gadget>
+    <name>PhaseSubtraction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FlowPhaseSubtractionGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>MaxwellCorrection</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>MaxwellCorrectionGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+    <property><name>extract_mask</name><value>9</value></property>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>ImageFinish</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadget</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/spiral/config/spiral_interactive.xml b/gadgets/spiral/config/spiral_interactive.xml
new file mode 100644
index 0000000..a69f83e
--- /dev/null
+++ b/gadgets/spiral/config/spiral_interactive.xml
@@ -0,0 +1,106 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1022</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriter</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>8</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuSpiralSensePrepGadget</name>
+    <dll>gadgetron_spiral</dll>
+    <classname>gpuSpiralSensePrepGadget</classname>
+    <property><name>deviceno</name><value>0</value></property>
+    <property><name>buffer_convolution_kernel_width</name><value>4.5</value></property>
+    <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>0</value></property>
+    <property><name>number_of_iterations</name>    <value>5</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>4.5</value></property>
+    <property><name>kappa</name>                   <value>0.3</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpuparallelmri</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>1</value></property>
+    <property><name>number_of_iterations</name>    <value>5</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>4.5</value></property>
+    <property><name>kappa</name>                   <value>0.3</value></property>
+  </gadget>
+
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>ImageFinish</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadget</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/spiral/gadgetron_spiral_export.h b/gadgets/spiral/gadgetron_spiral_export.h
new file mode 100644
index 0000000..5052304
--- /dev/null
+++ b/gadgets/spiral/gadgetron_spiral_export.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#if defined (WIN32)
+#ifdef __BUILD_GADGETRON_SPIRAL__
+#define EXPORTGADGETS_SPIRAL __declspec(dllexport)
+#else
+#define EXPORTGADGETS_SPIRAL __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETS_SPIRAL
+#endif
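
The macro above follows the usual Windows DLL export/import pattern: symbols built inside the gadgetron_spiral library are marked dllexport, consumers see dllimport, and on other platforms the macro expands to nothing. A brief hypothetical usage sketch (the class name is invented; only the decoration pattern matters):

    #include "gadgetron_spiral_export.h"

    namespace Gadgetron {
      // Hypothetical class exported from the spiral gadget library.
      class EXPORTGADGETS_SPIRAL ExampleSpiralHelper
      {
      public:
        void run();
      };
    }
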
diff --git a/gadgets/spiral/gpuSpiralSensePrepGadget.cpp b/gadgets/spiral/gpuSpiralSensePrepGadget.cpp
new file mode 100644
index 0000000..2bf6fc1
--- /dev/null
+++ b/gadgets/spiral/gpuSpiralSensePrepGadget.cpp
@@ -0,0 +1,687 @@
+#include "gpuSpiralSensePrepGadget.h"
+#include "GenericReconJob.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "vector_td.h"
+#include "vector_td_operators.h"
+#include "check_CUDA.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+#include "vds.h"
+#include "ismrmrd/xml.h"
+
+#include <algorithm>
+#include <vector>
+
+namespace Gadgetron{
+
+  gpuSpiralSensePrepGadget::gpuSpiralSensePrepGadget()
+    : samples_to_skip_start_(0)
+    , samples_to_skip_end_(0)
+    , samples_per_interleave_(0)
+    , prepared_(false)
+    , use_multiframe_grouping_(false)
+    , acceleration_factor_(0)
+  {
+  }
+
+  gpuSpiralSensePrepGadget::~gpuSpiralSensePrepGadget() {}
+
+  int gpuSpiralSensePrepGadget::process_config(ACE_Message_Block* mb)
+  {
+
+    int number_of_devices = 0;
+    if (cudaGetDeviceCount(&number_of_devices)!= cudaSuccess) {
+      GDEBUG( "Error: unable to query number of CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    if (number_of_devices == 0) {
+      GDEBUG( "Error: No available CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    device_number_ = deviceno.value();
+
+    if (device_number_ >= number_of_devices) {
+      GDEBUG("Adjusting device number from %d to %d\n", device_number_,  (device_number_%number_of_devices));
+      device_number_ = (device_number_%number_of_devices);
+    }
+
+    if (cudaSetDevice(device_number_)!= cudaSuccess) {
+      GDEBUG( "Error: unable to set CUDA device.\n" );
+      return GADGET_FAIL;
+    }
+
+    cudaDeviceProp deviceProp;
+    if( cudaGetDeviceProperties( &deviceProp, device_number_ ) != cudaSuccess) {
+      GDEBUG( "Error: unable to query device properties.\n" );
+      return GADGET_FAIL;
+    }
+    
+    unsigned int warp_size = deviceProp.warpSize;
+
+    propagate_csm_from_set_ = propagate_csm_from_set.value();
+
+    if( propagate_csm_from_set_ > 0 ){
+      GDEBUG("Currently, only set 0 can propagate coil sensitivity maps. Set %d was specified.\n", propagate_csm_from_set_ );
+      return GADGET_FAIL;
+    }
+
+    if( propagate_csm_from_set_ >= 0 ){
+      GDEBUG("Propagating csm from set %d to all sets\n", propagate_csm_from_set_ );
+    }
+
+    buffer_using_solver_ = buffer_using_solver.value();
+    use_multiframe_grouping_ = use_multiframe_grouping.value();
+
+    if( buffer_using_solver_ && !use_multiframe_grouping_ ){
+      GDEBUG("Enabling 'buffer_using_solver' requires also enabling 'use_multiframe_grouping'.\n" );
+      return GADGET_FAIL;
+    }
+
+    // Start parsing the ISMRMRD XML header
+    //
+
+    ISMRMRD::IsmrmrdHeader h;
+    ISMRMRD::deserialize(mb->rd_ptr(),h);
+    
+    
+    if (h.encoding.size() != 1) {
+      GDEBUG("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+    
+    // Get the encoding space and trajectory description
+    ISMRMRD::EncodingSpace e_space = h.encoding[0].encodedSpace;
+    ISMRMRD::EncodingSpace r_space = h.encoding[0].reconSpace;
+    ISMRMRD::EncodingLimits e_limits = h.encoding[0].encodingLimits;
+    ISMRMRD::TrajectoryDescription traj_desc;
+
+    // Determine reconstruction matrix sizes
+    //
+
+    kernel_width_ = buffer_convolution_kernel_width.value();
+    oversampling_factor_ = buffer_convolution_oversampling_factor.value();
+    
+    image_dimensions_recon_.push_back(((static_cast<unsigned int>(std::ceil(e_space.matrixSize.x*reconstruction_os_factor_x.value()))+warp_size-1)/warp_size)*warp_size);  
+    image_dimensions_recon_.push_back(((static_cast<unsigned int>(std::ceil(e_space.matrixSize.y*reconstruction_os_factor_y.value()))+warp_size-1)/warp_size)*warp_size);
+      
+    image_dimensions_recon_os_ = uint64d2
+      (((static_cast<unsigned int>(std::ceil(image_dimensions_recon_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size,
+       ((static_cast<unsigned int>(std::ceil(image_dimensions_recon_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size);
+    
+    // In case the warp_size constraint kicked in
+    oversampling_factor_ = float(image_dimensions_recon_os_[0])/float(image_dimensions_recon_[0]);
+
+
+    if (h.encoding[0].trajectoryDescription) {
+      traj_desc = *h.encoding[0].trajectoryDescription;
+    } else {
+      GDEBUG("Trajectory description missing");
+      return GADGET_FAIL;
+    }
+    
+    if (traj_desc.identifier != "HargreavesVDS2000") {
+      GDEBUG("Expected trajectory description identifier 'HargreavesVDS2000', not found.");
+      return GADGET_FAIL;
+    }
+    
+    
+    long interleaves = -1;
+    long fov_coefficients = -1;
+    long sampling_time_ns = -1;
+    double max_grad = -1.0;
+    double max_slew = -1.0;
+    double fov_coeff = -1.0;
+    double kr_max = -1.0;
+    
+    
+    for (std::vector<ISMRMRD::UserParameterLong>::iterator i (traj_desc.userParameterLong.begin()); i != traj_desc.userParameterLong.end(); ++i) {
+      if (i->name == "interleaves") {
+        interleaves = i->value;
+      } else if (i->name == "fov_coefficients") {
+        fov_coefficients = i->value;
+      } else if (i->name == "SamplingTime_ns") {
+        sampling_time_ns = i->value;
+      } else {
+        GDEBUG("WARNING: unused trajectory parameter %s found\n", i->name.c_str());
+      }
+    }
+
+    for (std::vector<ISMRMRD::UserParameterDouble>::iterator i (traj_desc.userParameterDouble.begin()); i != traj_desc.userParameterDouble.end(); ++i) {
+      if (i->name == "MaxGradient_G_per_cm") {
+	max_grad = i->value;
+      } else if (i->name == "MaxSlewRate_G_per_cm_per_s") {
+	max_slew = i->value;
+      } else if (i->name == "FOVCoeff_1_cm") {
+	fov_coeff = i->value;
+      } else if (i->name == "krmax_per_cm") {
+	kr_max= i->value;
+      } else {
+	GDEBUG("WARNING: unused trajectory parameter %s found\n", i->name.c_str());
+      }
+    }
+    
+    if ((interleaves < 0) || (fov_coefficients < 0) || (sampling_time_ns < 0) || (max_grad < 0) || (max_slew < 0) || (fov_coeff < 0) || (kr_max < 0)) {
+      GDEBUG("Appropriate parameters for calculating spiral trajectory not found in XML configuration\n");
+      return GADGET_FAIL;
+    }
+    
+    
+    Tsamp_ns_ = sampling_time_ns;
+    Nints_ = interleaves;
+    interleaves_ = static_cast<int>(Nints_);
+
+    gmax_ = max_grad;
+    smax_ = max_slew;
+    krmax_ = kr_max;
+    fov_ = fov_coeff;
+
+    samples_to_skip_start_  = 0; //n.get<int>(std::string("samplestoskipstart.value"))[0];
+    samples_to_skip_end_    = -1; //n.get<int>(std::string("samplestoskipend.value"))[0];
+
+    fov_vec_.push_back(r_space.fieldOfView_mm.x);
+    fov_vec_.push_back(r_space.fieldOfView_mm.y);
+    fov_vec_.push_back(r_space.fieldOfView_mm.z);
+
+    slices_ = e_limits.slice ? e_limits.slice->maximum + 1 : 1;
+    sets_ = e_limits.set ? e_limits.set->maximum + 1 : 1;
+
+    buffer_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+
+    image_headers_queue_ = 
+      boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+
+    size_t bsize = sizeof(GadgetContainerMessage<ISMRMRD::ImageHeader>)*100*Nints_;
+
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+      image_headers_queue_[i].high_water_mark(bsize);
+      image_headers_queue_[i].low_water_mark(bsize);
+    }
+
+    GDEBUG("smax:                    %f\n", smax_);
+    GDEBUG("gmax:                    %f\n", gmax_);
+    GDEBUG("Tsamp_ns:                %d\n", Tsamp_ns_);
+    GDEBUG("Nints:                   %d\n", Nints_);
+    GDEBUG("fov:                     %f\n", fov_);
+    GDEBUG("krmax:                   %f\n", krmax_);
+    GDEBUG("samples_to_skip_start_ : %d\n", samples_to_skip_start_);
+    GDEBUG("samples_to_skip_end_   : %d\n", samples_to_skip_end_);
+    GDEBUG("recon matrix_size_x    : %d\n", image_dimensions_recon_[0]);
+    GDEBUG("recon matrix_size_y    : %d\n", image_dimensions_recon_[1]);
+
+    return GADGET_OK;
+  }
+
+  int gpuSpiralSensePrepGadget::
+  process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *m1,
+	  GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2)
+  {
+    // Noise should have been consumed by the noise adjust, but just in case...
+    //
+
+    bool is_noise = m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_IS_NOISE_MEASUREMENT);
+    if (is_noise) {
+      m1->release();
+      return GADGET_OK;
+    }
+
+    if (!prepared_) {
+
+      int     nfov   = 1;         /*  number of fov coefficients.             */
+      int     ngmax  = 1e5;       /*  maximum number of gradient samples      */
+      double  *xgrad;             /*  x-component of gradient.                */
+      double  *ygrad;             /*  y-component of gradient.                */
+      double  *x_trajectory;
+      double  *y_trajectory;
+      double  *weighting;
+      int     ngrad;
+      //int     count;
+      double sample_time = (1.0*Tsamp_ns_) * 1e-9;
+
+      /*	call c-function here to calculate gradients */
+      calc_vds(smax_,gmax_,sample_time,sample_time,Nints_,&fov_,nfov,krmax_,ngmax,&xgrad,&ygrad,&ngrad);
+      samples_per_interleave_ = std::min(ngrad,static_cast<int>(m1->getObjectPtr()->number_of_samples));
+
+      GDEBUG("Using %d samples per interleave\n", samples_per_interleave_);
+
+      /* Calculate the trajectory and weights */
+      calc_traj(xgrad, ygrad, samples_per_interleave_, Nints_, sample_time, krmax_, &x_trajectory, &y_trajectory, &weighting);
+
+      host_traj_ = boost::shared_ptr< hoNDArray<floatd2> >(new hoNDArray<floatd2>);
+      host_weights_ = boost::shared_ptr< hoNDArray<float> >(new hoNDArray<float>);
+
+      std::vector<size_t> trajectory_dimensions;
+      trajectory_dimensions.push_back(samples_per_interleave_*Nints_);
+
+      host_traj_->create(&trajectory_dimensions);
+      host_weights_->create(&trajectory_dimensions);
+
+      {
+	float* co_ptr = reinterpret_cast<float*>(host_traj_->get_data_ptr());
+	float* we_ptr =  reinterpret_cast<float*>(host_weights_->get_data_ptr());
+	
+	for (int i = 0; i < (samples_per_interleave_*Nints_); i++) {
+	  co_ptr[i*2]   = -x_trajectory[i]/2;
+	  co_ptr[i*2+1] = -y_trajectory[i]/2;
+	  we_ptr[i] = weighting[i];
+	}
+      }
+
+      delete [] xgrad;
+      delete [] ygrad;
+      delete [] x_trajectory;
+      delete [] y_trajectory;
+      delete [] weighting;
+
+      // Setup the NFFT plan
+      //
+
+      cuNDArray<floatd2> traj(*host_traj_);
+      dcw_buffer_ = boost::shared_ptr< cuNDArray<float> >( new cuNDArray<float>(*host_weights_) );
+	
+      nfft_plan_.setup( from_std_vector<size_t,2>(image_dimensions_recon_), image_dimensions_recon_os_, kernel_width_ );
+      nfft_plan_.preprocess(&traj, cuNFFT_plan<float,2>::NFFT_PREP_NC2C);
+
+      // Setup the non-Cartesian Sense encoding operator 
+      //
+      
+      E_ = boost::shared_ptr< cuNonCartesianSenseOperator<float,2> >(new cuNonCartesianSenseOperator<float,2>);
+      E_->setup( from_std_vector<size_t,2>(image_dimensions_recon_), image_dimensions_recon_os_, kernel_width_ );
+      
+      // Setup cg solver if the csm/regularization image is to be based hereon
+      //
+
+      if( buffer_using_solver_ ){
+
+	E_->set_dcw(sqrt(dcw_buffer_.get()));
+
+	D_ = boost::shared_ptr< cuCgPreconditioner<float_complext> >( new cuCgPreconditioner<float_complext>() );
+	cg_.set_encoding_operator( E_ );
+	cg_.set_preconditioner( D_ );
+	cg_.set_max_iterations( 2 );
+	cg_.set_tc_tolerance( 1e-6 );
+	cg_.set_output_mode( cuCgSolver<float_complext>::OUTPUT_SILENT);
+      }
+
+      prepared_ = true;
+    }
+
+    // Allocate host data buffer if it is NULL
+    //
+
+    if (!host_data_buffer_.get()) {
+
+      std::vector<size_t> data_dimensions;
+      data_dimensions.push_back(samples_per_interleave_*interleaves_);
+      data_dimensions.push_back(m1->getObjectPtr()->active_channels);
+
+      host_data_buffer_ = boost::shared_array< hoNDArray<float_complext> >
+	(new hoNDArray<float_complext>[slices_*sets_]);
+      
+      if (!host_data_buffer_.get()) {
+	GDEBUG("Unable to allocate array for host data buffer\n");
+	return GADGET_FAIL;
+      }
+
+      for (unsigned int i = 0; i < slices_*sets_; i++) {
+	host_data_buffer_[i].create(&data_dimensions);
+	host_data_buffer_[i].fill(0.0f);
+      }
+    }
+
+    // Allocate various counters if they are NULL
+    //
+
+    if( !image_counter_.get() ){
+      image_counter_ = boost::shared_array<long>(new long[slices_*sets_]);
+      for( unsigned int i=0; i<slices_*sets_; i++ )
+	image_counter_[i] = 0;
+    }
+
+    if( !interleaves_counter_singleframe_.get() ){
+      interleaves_counter_singleframe_ = boost::shared_array<long>(new long[slices_*sets_]);
+      for( unsigned int i=0; i<slices_*sets_; i++ )
+	interleaves_counter_singleframe_[i] = 0;
+    }
+
+    if( !interleaves_counter_multiframe_.get() ){
+      interleaves_counter_multiframe_ = boost::shared_array<long>(new long[slices_*sets_]);
+      for( unsigned int i=0; i<slices_*sets_; i++ )
+	interleaves_counter_multiframe_[i] = 0;
+    }
+
+    // Define some utility variables
+    //
+
+    unsigned int samples_to_copy = m1->getObjectPtr()->number_of_samples-samples_to_skip_end_;
+    unsigned int interleave = m1->getObjectPtr()->idx.kspace_encode_step_1;
+    unsigned int slice = m1->getObjectPtr()->idx.slice;
+    unsigned int set = m1->getObjectPtr()->idx.set;
+    unsigned int samples_per_channel =  host_data_buffer_[set*slices_+slice].get_size(0);
+
+    // Some book-keeping to keep track of the frame count
+    //
+
+    interleaves_counter_singleframe_[set*slices_+slice]++;
+    interleaves_counter_multiframe_[set*slices_+slice]++;
+
+    // Duplicate the profile to avoid double deletion in case problems are encountered below.
+    // Enqueue the profile until all profiles for the reconstruction have been received.
+    //
+    
+    buffer_[set*slices_+slice].enqueue_tail(duplicate_profile(m1));
+    
+    // Copy profile into the accumulation buffer for csm/regularization estimation
+    //
+
+    ISMRMRD::AcquisitionHeader base_head = *m1->getObjectPtr();
+
+    if (samples_to_skip_end_ == -1) {
+      samples_to_skip_end_ = m1->getObjectPtr()->number_of_samples-samples_per_interleave_;
+      GDEBUG("Adjusting samples_to_skip_end_ = %d\n", samples_to_skip_end_);
+    }
+
+    std::complex<float>* data_ptr = reinterpret_cast< std::complex<float>* >
+      (host_data_buffer_[set*slices_+slice].get_data_ptr());
+
+    std::complex<float>* profile_ptr = m2->getObjectPtr()->get_data_ptr();
+
+    for (unsigned int c = 0; c < m1->getObjectPtr()->active_channels; c++) {
+      memcpy(data_ptr+c*samples_per_channel+interleave*samples_to_copy,
+	     profile_ptr+c*m1->getObjectPtr()->number_of_samples, samples_to_copy*sizeof(std::complex<float>));
+    }
+
+    // Have we received sufficient data for a new frame?
+    //
+
+    bool is_last_scan_in_slice = m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_LAST_IN_SLICE);
+
+    if (is_last_scan_in_slice) {
+
+      // This was the final profile of a frame
+      //
+
+      if( Nints_%interleaves_counter_singleframe_[set*slices_+slice] ){
+	GDEBUG("Unexpected number of interleaves encountered in frame\n");
+	return GADGET_FAIL;
+      }
+
+      // Has the acceleration factor changed?
+      //
+
+      if( acceleration_factor_ != Nints_/interleaves_counter_singleframe_[set*slices_+slice] ){
+
+	GDEBUG("Change of acceleration factor detected\n");
+	acceleration_factor_ =  Nints_/interleaves_counter_singleframe_[set*slices_+slice];
+
+	// The encoding operator needs to have its domain/codomain dimensions set accordingly
+	//
+	
+	if( buffer_using_solver_ ){
+
+	  std::vector<size_t> domain_dims = image_dimensions_recon_;
+	  
+	  std::vector<size_t> codomain_dims = *host_traj_->get_dimensions();
+	  codomain_dims.push_back(m1->getObjectPtr()->active_channels);
+	  
+	  E_->set_domain_dimensions(&domain_dims);
+	  E_->set_codomain_dimensions(&codomain_dims);
+
+	  cuNDArray<floatd2> traj(*host_traj_);
+	  E_->preprocess(&traj);
+	}
+      }
+
+      // Prepare an image header for this frame
+      //
+
+      GadgetContainerMessage<ISMRMRD::ImageHeader> *header = new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+      ISMRMRD::AcquisitionHeader *base_head = m1->getObjectPtr();
+
+      {
+	// Initialize header to all zeroes (there are a few fields we do not set yet)
+	ISMRMRD::ImageHeader tmp;
+	*(header->getObjectPtr()) = tmp;
+      }
+
+      header->getObjectPtr()->version = base_head->version;
+
+      header->getObjectPtr()->matrix_size[0] = image_dimensions_recon_[0];
+      header->getObjectPtr()->matrix_size[1] = image_dimensions_recon_[1];
+      header->getObjectPtr()->matrix_size[2] = acceleration_factor_;
+
+      header->getObjectPtr()->field_of_view[0] = fov_vec_[0];
+      header->getObjectPtr()->field_of_view[1] = fov_vec_[1];
+      header->getObjectPtr()->field_of_view[2] = fov_vec_[2];
+
+      header->getObjectPtr()->channels = base_head->active_channels;
+      header->getObjectPtr()->slice = base_head->idx.slice;
+      header->getObjectPtr()->set = base_head->idx.set;
+
+      header->getObjectPtr()->acquisition_time_stamp = base_head->acquisition_time_stamp;
+      memcpy(header->getObjectPtr()->physiology_time_stamp, base_head->physiology_time_stamp, sizeof(uint32_t)*ISMRMRD::ISMRMRD_PHYS_STAMPS);
+
+      memcpy(header->getObjectPtr()->position, base_head->position, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->read_dir, base_head->read_dir, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->phase_dir, base_head->phase_dir, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->slice_dir, base_head->slice_dir, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->patient_table_position, base_head->patient_table_position, sizeof(float)*3);
+
+      header->getObjectPtr()->data_type = ISMRMRD::ISMRMRD_CXFLOAT;
+      header->getObjectPtr()->image_index = image_counter_[set*slices_+slice]++; 
+      header->getObjectPtr()->image_series_index = set*slices_+slice;
+
+      // Enqueue the header until we are ready to assemble a Sense job
+      //
+
+      image_headers_queue_[set*slices_+slice].enqueue_tail(header);
+
+      // Check if it is time to reconstruct.
+      // I.e. prepare and pass a Sense job downstream...
+      //
+
+      if( !use_multiframe_grouping_ || 
+	  (use_multiframe_grouping_ && interleaves_counter_multiframe_[set*slices_+slice] == Nints_) ){
+
+	unsigned int num_coils = m1->getObjectPtr()->active_channels;
+	
+	// Compute coil images from the fully sampled data buffer
+	//
+
+	std::vector<size_t> image_dims;
+	image_dims.push_back(image_dimensions_recon_[0]);
+	image_dims.push_back(image_dimensions_recon_[1]);
+	image_dims.push_back(num_coils);
+	
+	cuNDArray<float_complext> image(&image_dims);
+	cuNDArray<float_complext> data(&host_data_buffer_[set*slices_+slice]);
+	
+	nfft_plan_.compute( &data, &image, dcw_buffer_.get(), cuNFFT_plan<float,2>::NFFT_BACKWARDS_NC2C );
+
+	// Check if we need to compute a new csm
+	//
+	
+	if( propagate_csm_from_set_ < 0 || propagate_csm_from_set_ == set ){	  	  
+	  csm_ = estimate_b1_map<float,2>( &image ); // Estimates csm
+	}
+	else{
+	  //GDEBUG("Set %d is reusing the csm from set %d\n", set, propagate_csm_from_set_);
+	  if( csm_.get() == 0x0 ){
+	    GDEBUG("Error, csm has not been computed\n");
+	    return GADGET_FAIL;
+	  }	  
+	}
+	E_->set_csm(csm_);
+
+	// Compute regularization using basic coil combination
+	//
+	
+	image_dims.pop_back();
+	cuNDArray<float_complext> reg_image(&image_dims);
+	E_->mult_csm_conj_sum( &image, &reg_image );
+	
+	if( buffer_using_solver_ ){
+	  
+	  // Compute regularization using cg solver
+	  //
+	  
+	  // Define preconditioning weights
+	  boost::shared_ptr< cuNDArray<float> > _precon_weights = sum(abs_square(csm_.get()).get(), 2);
+	  reciprocal_sqrt_inplace(_precon_weights.get());	
+	  boost::shared_ptr< cuNDArray<float_complext> > precon_weights = real_to_complex<float_complext>( _precon_weights.get() );
+	  _precon_weights.reset();
+	  D_->set_weights( precon_weights );
+	  
+	  // Solve from the plain coil combination
+	  reg_image = *cg_.solve_from_rhs(&reg_image);
+	}
+
+	// Get ready to fill in the Sense job
+	//
+
+	boost::shared_ptr< hoNDArray<float_complext> > csm_host = csm_->to_host();
+	boost::shared_ptr< hoNDArray<float_complext> > reg_host = reg_image.to_host();
+
+	unsigned int profiles_buffered = buffer_[set*slices_+slice].message_count();
+
+	std::vector<size_t> ddimensions;
+	ddimensions.push_back(samples_per_interleave_*interleaves_counter_singleframe_[set*slices_+slice]*
+			      ((use_multiframe_grouping_) ? acceleration_factor_ : 1));
+	ddimensions.push_back(num_coils);
+	
+	boost::shared_ptr< hoNDArray<float_complext> > data_host(new hoNDArray<float_complext>(&ddimensions));
+
+	ddimensions.clear();
+	ddimensions.push_back(samples_per_interleave_*interleaves_counter_singleframe_[set*slices_+slice]);
+	ddimensions.push_back((use_multiframe_grouping_) ? acceleration_factor_ : 1);
+
+	boost::shared_ptr< hoNDArray<floatd2> > traj_host(new hoNDArray<floatd2>(&ddimensions));
+	boost::shared_ptr< hoNDArray<float> > dcw_host(new hoNDArray<float>(&ddimensions));
+	
+	for (unsigned int p = 0; p < profiles_buffered; p++) {
+	  ACE_Message_Block* mbq;
+	  if (buffer_[set*slices_+slice].dequeue_head(mbq) < 0) {
+	    GDEBUG("Message dequeue failed\n");
+	    return GADGET_FAIL;
+	  }
+
+	  GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* acq = 
+	    AsContainerMessage<ISMRMRD::AcquisitionHeader>(mbq);
+
+	  GadgetContainerMessage< hoNDArray< std::complex<float> > >* daq = 
+	    AsContainerMessage<hoNDArray< std::complex<float> > >(mbq->cont());
+
+	  if (!acq || !daq) {
+	    GDEBUG("Unable to interpret data on message Q\n");
+	    return GADGET_FAIL;
+	  }
+
+	  for (unsigned int c = 0; c < num_coils; c++) {
+	    float_complext* data_ptr = data_host->get_data_ptr();
+	    data_ptr += c*samples_per_interleave_*profiles_buffered+p*samples_per_interleave_;
+
+	    std::complex<float>* r_ptr = daq->getObjectPtr()->get_data_ptr();
+	    r_ptr += c*daq->getObjectPtr()->get_size(0);
+
+	    memcpy(data_ptr,r_ptr,samples_per_interleave_*sizeof(float_complext));
+	  }
+
+	  floatd2* traj_ptr = traj_host->get_data_ptr();
+	  traj_ptr += p*samples_per_interleave_;
+
+	  floatd2* t_ptr = host_traj_->get_data_ptr();
+	  t_ptr += acq->getObjectPtr()->idx.kspace_encode_step_1*samples_per_interleave_;
+
+	  memcpy(traj_ptr,t_ptr,samples_per_interleave_*sizeof(floatd2));
+
+	  float* dcw_ptr = dcw_host->get_data_ptr();
+	  dcw_ptr += p*samples_per_interleave_;
+
+	  float* d_ptr = host_weights_->get_data_ptr();
+	  d_ptr += acq->getObjectPtr()->idx.kspace_encode_step_1*samples_per_interleave_;
+
+	  memcpy(dcw_ptr,d_ptr,samples_per_interleave_*sizeof(float));
+
+	  mbq->release();
+	}
+
+	GadgetContainerMessage< GenericReconJob >* m4 = new GadgetContainerMessage< GenericReconJob >();
+
+	m4->getObjectPtr()->dat_host_ = data_host;
+	m4->getObjectPtr()->csm_host_ = csm_host;
+	m4->getObjectPtr()->reg_host_ = reg_host;
+	m4->getObjectPtr()->tra_host_ = traj_host;
+	m4->getObjectPtr()->dcw_host_ = dcw_host;
+
+	// Pull the image headers out of the queue
+	//
+	
+	long frames_per_reconstruction = (use_multiframe_grouping_) ? acceleration_factor_ : 1;
+      
+	if( image_headers_queue_[set*slices_+slice].message_count() != frames_per_reconstruction ){
+	  m4->release();
+	  GDEBUG("Unexpected size of image header queue: %d, %d\n", 
+			image_headers_queue_[set*slices_+slice].message_count(), frames_per_reconstruction);
+	  return GADGET_FAIL;
+	}
+	
+	m4->getObjectPtr()->image_headers_ =
+	  boost::shared_array<ISMRMRD::ImageHeader>( new ISMRMRD::ImageHeader[frames_per_reconstruction] );
+	
+	for( unsigned int i=0; i<frames_per_reconstruction; i++ ){	
+	  
+	  ACE_Message_Block *mbq;
+	  
+	  if( image_headers_queue_[set*slices_+slice].dequeue_head(mbq) < 0 ) {
+	    m4->release();
+	    GDEBUG("Image header dequeue failed\n");
+	    return GADGET_FAIL;
+	  }
+	  
+	  GadgetContainerMessage<ISMRMRD::ImageHeader> *m = AsContainerMessage<ISMRMRD::ImageHeader>(mbq);
+	  m4->getObjectPtr()->image_headers_[i] = *m->getObjectPtr();
+	  m->release();
+	}
+
+	// The Sense Job needs an image header as well. 
+	// Let us just copy the initial one...
+	
+	GadgetContainerMessage<ISMRMRD::ImageHeader> *m3 = new GadgetContainerMessage<ISMRMRD::ImageHeader>;
+	*m3->getObjectPtr() = m4->getObjectPtr()->image_headers_[0];
+	m3->cont(m4);
+	
+	if (this->next()->putq(m3) < 0) {
+	  GDEBUG("Failed to put job on queue.\n");
+	  m3->release();
+	  return GADGET_FAIL;
+	}
+	interleaves_counter_multiframe_[set*slices_+slice] = 0;
+      }
+      interleaves_counter_singleframe_[set*slices_+slice] = 0;
+    }
+    m1->release();
+    return GADGET_OK;
+  }
+
+  GadgetContainerMessage<ISMRMRD::AcquisitionHeader>*
+  gpuSpiralSensePrepGadget::duplicate_profile( GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *profile )
+  {
+    GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *copy = 
+      new GadgetContainerMessage<ISMRMRD::AcquisitionHeader>();
+    
+    GadgetContainerMessage< hoNDArray< std::complex<float> > > *cont_copy = 
+      new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+    
+    *copy->getObjectPtr() = *profile->getObjectPtr();
+    *(cont_copy->getObjectPtr()) = *(AsContainerMessage<hoNDArray< std::complex<float> > >(profile->cont())->getObjectPtr());
+    
+    copy->cont(cont_copy);
+    return copy;
+  }
+
+  GADGET_FACTORY_DECLARE(gpuSpiralSensePrepGadget)
+}
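
One detail worth noting in process_config() above: both the reconstruction matrix and the oversampled gridding matrix are rounded up to a multiple of the CUDA warp size, and the effective oversampling factor is then re-derived from the rounded sizes. A self-contained sketch of that arithmetic — the matrix size, factors and warp size below are example values, not taken from any protocol:

    // Standalone sketch of the warp-size rounding used in process_config().
    #include <cmath>
    #include <cstdio>

    static unsigned int round_up_to_warp(unsigned int n, unsigned int warp_size)
    {
        return ((n + warp_size - 1) / warp_size) * warp_size;
    }

    int main()
    {
        unsigned int warp_size = 32;    // deviceProp.warpSize (example)
        unsigned int matrix_x  = 192;   // encoded matrix size (example)
        float recon_os         = 1.5f;  // reconstruction_os_factor_x (example)
        float grid_os          = 1.25f; // buffer_convolution_oversampling_factor (example)

        unsigned int recon_x = round_up_to_warp(
            static_cast<unsigned int>(std::ceil(matrix_x * recon_os)), warp_size);
        unsigned int grid_x = round_up_to_warp(
            static_cast<unsigned int>(std::ceil(recon_x * grid_os)), warp_size);

        // The gadget re-derives the oversampling factor after rounding,
        // in case the warp-size constraint kicked in.
        float effective_os = float(grid_x) / float(recon_x);

        std::printf("recon %u, gridding %u, effective os %.3f\n",
                    recon_x, grid_x, effective_os);
        return 0;
    }
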
diff --git a/gadgets/spiral/gpuSpiralSensePrepGadget.h b/gadgets/spiral/gpuSpiralSensePrepGadget.h
new file mode 100644
index 0000000..be80628
--- /dev/null
+++ b/gadgets/spiral/gpuSpiralSensePrepGadget.h
@@ -0,0 +1,100 @@
+#ifndef gpuSpiralSensePrepGadget_H
+#define gpuSpiralSensePrepGadget_H
+#pragma once
+
+#include "gadgetron_spiral_export.h"
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "cuCgSolver.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuCgPreconditioner.h"
+#include "cuNFFT.h"
+#include "hoNDArray.h"
+#include "vector_td.h"
+#include "cuNFFT.h"
+
+#include <ismrmrd/ismrmrd.h>
+#include <complex>
+#include <boost/shared_ptr.hpp>
+#include <boost/shared_array.hpp>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_SPIRAL gpuSpiralSensePrepGadget :
+    public Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+  {
+
+  public:
+    GADGET_DECLARE(gpuSpiralSensePrepGadget);
+
+    gpuSpiralSensePrepGadget();
+    virtual ~gpuSpiralSensePrepGadget();
+
+  protected:
+    GADGET_PROPERTY(deviceno, int, "GPU device number", 0);
+    GADGET_PROPERTY(propagate_csm_from_set, int, "Which set to use for CSM", -1);
+    GADGET_PROPERTY(buffer_using_solver, bool, "Use solver for buffer", false);
+    GADGET_PROPERTY(use_multiframe_grouping, bool, "Use multiframe grouping", false);
+    GADGET_PROPERTY(buffer_convolution_kernel_width, float, "Convolution kernel width for buffer", 5.5);
+    GADGET_PROPERTY(buffer_convolution_oversampling_factor, float, "Oversampling used in buffer convolution", 1.25);
+    GADGET_PROPERTY(reconstruction_os_factor_x, float, "Oversampling for reconstruction in x-direction", 1.0);
+    GADGET_PROPERTY(reconstruction_os_factor_y, float, "Oversampling for reconstruction in y-direction", 1.0);
+
+    virtual int process_config(ACE_Message_Block* mb);
+    
+    virtual int process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1,
+			GadgetContainerMessage< hoNDArray< std::complex<float> > > * m2);
+    
+    virtual GadgetContainerMessage<ISMRMRD::AcquisitionHeader>*
+      duplicate_profile( GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *profile );
+    
+  private:
+    int samples_to_skip_start_;
+    int samples_to_skip_end_;
+    int samples_per_interleave_;
+    int interleaves_;
+    int slices_;
+    int sets_;
+    boost::shared_array<long> image_counter_;
+    int device_number_;
+
+    long    Tsamp_ns_;
+    long    Nints_;
+    boost::shared_array<long> interleaves_counter_singleframe_;
+    boost::shared_array<long> interleaves_counter_multiframe_;
+    long    acceleration_factor_;
+    double  gmax_;
+    double  smax_;
+    double  krmax_;
+    double  fov_;
+
+    bool prepared_;
+    bool use_multiframe_grouping_;
+    bool buffer_using_solver_;
+
+    int propagate_csm_from_set_;
+
+    float kernel_width_;
+    float oversampling_factor_;
+
+    boost::shared_ptr< hoNDArray<floatd2> > host_traj_;
+    boost::shared_ptr< hoNDArray<float> > host_weights_;
+    
+    boost::shared_array< hoNDArray<float_complext> > host_data_buffer_;
+    boost::shared_ptr< cuNDArray<float> > dcw_buffer_;
+
+    std::vector<size_t> fov_vec_;
+    std::vector<size_t> image_dimensions_recon_;
+    uint64d2 image_dimensions_recon_os_;
+
+    cuNFFT_plan<float,2> nfft_plan_;
+    cuCgSolver<float_complext> cg_;
+    boost::shared_ptr< cuNDArray<float_complext> > csm_;
+    boost::shared_ptr< cuNonCartesianSenseOperator<float,2> > E_;
+    boost::shared_ptr< cuCgPreconditioner<float_complext> > D_;
+
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > buffer_;
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > image_headers_queue_;
+  };
+}
+#endif //gpuSpiralSensePrepGadget_H
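
Each GADGET_PROPERTY declared above is what the <property><name>…</name><value>…</value></property> elements in the spiral configuration files bind to at run time; the gadget reads them back with .value() in process_config(). A minimal hypothetical gadget showing the same declare/read pattern (the class and property names are invented for illustration):

    // Hypothetical illustration of the GADGET_PROPERTY declare/read pattern.
    #include "Gadget.h"
    #include "hoNDArray.h"
    #include <ismrmrd/ismrmrd.h>
    #include <complex>

    namespace Gadgetron {

      class ExamplePropertyGadget :
        public Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
      {
      protected:
        GADGET_PROPERTY(example_scale, float, "Illustrative scaling factor", 1.0);

        virtual int process_config(ACE_Message_Block* mb)
        {
          // <property><name>example_scale</name><value>2.0</value></property>
          // in the XML chain is read back here:
          float scale = example_scale.value();
          GDEBUG("example_scale = %f\n", scale);
          return GADGET_OK;
        }

        virtual int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
                            GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
        {
          // Pass the acquisition straight through, as a relay would.
          if (this->next()->putq(m1) < 0) {
            m1->release();
            return GADGET_FAIL;
          }
          return GADGET_OK;
        }
      };
    }
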
diff --git a/gadgets/spiral/vds.cpp b/gadgets/spiral/vds.cpp
new file mode 100644
index 0000000..0591509
--- /dev/null
+++ b/gadgets/spiral/vds.cpp
@@ -0,0 +1,495 @@
+#include "vds.h"
+
+#include <math.h>
+#include <stdio.h>
+
+#define GAMMA 	4258.0		/* Hz/G */
+#define PI	3.141592	/* pi */
+
+#define DEBUG_VDS	0
+/* #define TESTCODE 	For testing as regular C code... */
+
+/*
+  %
+  %	VARIABLE DENSITY SPIRAL GENERATION:
+  %	----------------------------------
+  %
+  %	This is a general description of how the following C code
+  %	works.  This text is taken from a matlab script, vds.m, from
+  %	which the C code was derived.  However, note that the C code
+  %	runs considerably faster.
+  %
+  %
+  %	Function generates variable density spiral which traces
+  %	out the trajectory
+  %				 
+  %			k(t) = r(t) exp(i*q(t)), 		[1]
+  %
+  %	Where q IS THE SAME AS theta, and r IS THE SAME AS kr.
+  %
+  %		r and q are chosen to satisfy:
+  %
+  %		1) Maximum gradient amplitudes and slew rates.
+  %		2) Maximum gradient due to FOV, where FOV can
+  %		   vary with k-space radius r, as
+  %
+  %			FOV(r) = F0 + F1*r + F2*r*r 		[2]
+  %
+  %
+  %	INPUTS:
+  %	-------
+  %	smax = maximum slew rate G/cm/s
+  %	gmax = maximum gradient G/cm (limited by Gmax or FOV)
+  %	T = sampling period (s) for gradient AND acquisition.
+  %	N = number of interleaves.
+  %	F0,F1,F2 = FOV coefficients with respect to r - see above.
+  %	rmax= value of k-space radius at which to stop (cm^-1).
+  %		rmax = 1/(2*resolution);
+  %
+  %
+  %	OUTPUTS:
+  %	--------
+  %	k = k-space trajectory (kx+iky) in cm-1.
+  %	g = gradient waveform (Gx+iGy) in G/cm.
+  %	s = derivative of g (Sx+iSy) in G/cm/s.
+  %	time = time points corresponding to above (s).
+  %	r = k-space radius vs time (used to design spiral)
+  %	theta = atan2(ky,kx) = k-space angle vs time.
+  %
+  %
+  %	METHODS:
+  %	--------
+  %	Let r1 and r2 be the first derivatives of r in [1].	
+  %	Let q1 and q2 be the first derivatives of theta in [1].	
+  %	Also, r0 = r, and q0 = theta - sometimes both are used.
+  %	F = F(r) defined by F0,F1,F2.
+  %
+  %	Differentiating [1], we can get G = a(r0,r1,q0,q1,F)	
+  %	and differentiating again, we get S = b(r0,r1,r2,q0,q1,q2,F)
+  %
+  %	(functions a() and b() are reasonably easy to obtain.)
+  %
+  %	FOV limits put a constraint between r and q:
+  %
+  %		dr/dq = N/(2*pi*F)				[3]	
+  %
+  %	We can use [3] and the chain rule to give 
+  %
+  %		q1 = 2*pi*F/N * r1				[4]
+  %
+  %	and
+  %
+  %		q2 = 2*pi/N*dF/dr*r1^2 + 2*pi*F/N*r2		[5]
+  %
+  %
+  %
+  %	Now using [4] and [5], we can substitute for q1 and q2
+  %	in functions a() and b(), giving
+  %
+  %		G = c(r0,r1,F)
+  %	and 	S = d(r0,r1,r2,F,dF/dr)
+  %
+  %
+  %	Using the fact that the spiral should be either limited
+  %	by amplitude (Gradient or FOV limit) or slew rate, we can
+  %	solve 
+  %		|c(r0,r1,F)| = |Gmax|  				[6]
+  %
+  %	analytically for r1, or
+  %	
+  %	  	|d(r0,r1,r2,F,dF/dr)| = |Smax|	 		[7]
+  %
+  %	analytically for r2.
+  %
+  %	[7] is a quadratic equation in r2.  The smaller of the 
+  %	roots is taken, and the real part of the root is used to
+  %	avoid possible numeric errors - the roots should be real
+  %	always.
+  %
+  %	The choice of whether or not to use [6] or [7], and the
+  %	solving for r2 or r1 is done by calcthetadotdot().
+  %
+  %	Once the second derivative of theta(q) or r is obtained,
+  %	it can be integrated to give q1 and r1, and then integrated
+  %	again to give q and r.  The gradient waveforms follow from
+  %	q and r. 	
+  %
+  %	Brian Hargreaves -- Sept 2000.
+  %
+  %
+*/
+
+namespace Gadgetron{
+
+
+  /* ----------------------------------------------------------------------- */
+  void calcthetadotdot(double slewmax, double gradmax, double kr, 
+		       double krdot, double Tgsample, double Tdsample, int Ninterleaves,
+		       double* fov, int numfov, double* thetadotdot, double* krdotdot)
+  /*
+   * Function calculates the 2nd derivative of kr and theta at each
+   * sample point within calc_vds(); i.e., this is the iterative loop
+   * for calc_vds().  See the text at the top of this file for more details.
+   * */
+
+  //double slewmax;		/*	Maximum slew rate, G/cm/s		*/
+  //double gradmax;		/* 	maximum gradient amplitude, G/cm	*/
+  //double kr;		/* 	Current kr. */
+  //double krdot;		/*	Current krdot. */
+  //double Tgsample;	/*	Gradient Sample period (s) 	*/
+  //double Tdsample;	/*	Data Sample period (s) 		*/
+  //int Ninterleaves;	/*	Number of interleaves			*/
+  //double *fov;		/*	FOV coefficients		*/
+  //int numfov;		/*	Number of FOV coefficients		*/
+  //double *thetadotdot;	/*	[output] 2nd derivative of theta.	*/
+  //double *krdotdot;	/*	[output] 2nd derivative of kr		*/
+
+  /* ----------------------------------------------------------------------- */
+  {
+    double fovval=0;	/* FOV for this value of kr	*/
+    double dfovdrval=0;	/* dFOV/dkr for this value of kr	*/
+    double gmaxfov;		/* FOV-limited Gmax.	*/
+    double maxkrdot;
+    int count;
+
+    double tpf;	/* Used to simplify expressions. */
+    double tpfsq;	/* 	" 		"        */
+
+    double qdfA, qdfB, qdfC;	/* Quadratic formula coefficients */
+    double rootparta,rootpartb;
+
+
+
+    if (DEBUG_VDS>1)
+      {
+	printf("calcthetadotdot:  slewmax=%8.2f, gmax=%6.2f, \n",
+	       slewmax,gradmax);
+	printf("        kr=%8.4f, Tg=%9.6f, N=%d, nfov=%d \n", 
+	       kr,Tgsample,Ninterleaves,numfov);
+      }
+
+    /* Calculate the actual FOV and dFOV/dkr for this R,
+     * based on the fact that the FOV is expressed 
+     * as a polynomial in kr.*/
+
+    for (count=0; count < numfov; count++)
+      {
+	fovval = fovval + fov[count]*pow(kr,count);
+	if (count > 0)
+	  dfovdrval = dfovdrval + count*fov[count]*pow(kr,count-1);
+      }
+
+    /* Calculate FOV limit on gmax.  This is the rate of motion along
+     * a trajectory, and really should not be a limitation.  Thus,
+     * it is reasonable to comment out the following lines. */
+
+    gmaxfov = 1/GAMMA / fovval / Tdsample;	
+    if (gradmax > gmaxfov)
+      gradmax = gmaxfov;	
+
+
+    /* Maximum dkr/dt, based on gradient amplitude.  */
+
+    maxkrdot = sqrt(pow(GAMMA*gradmax,2) / (1+pow(2*PI*fovval*kr/Ninterleaves,2)));
+    if (DEBUG_VDS>1)
+      printf("calcthetadotdot:  maxkrdot = %g \n",maxkrdot);
+
+    /* These two are just to simplify expressions below */
+    tpf = 2*PI*fovval/Ninterleaves;
+    tpfsq = pow(tpf,2);
+    if (DEBUG_VDS>1)
+      printf("calcthetadotdot:  tpf = %8.4f,  tpfsq = %8.4f  \n",tpf,tpfsq);
+
+
+
+
+    if (krdot > maxkrdot)	/* Then choose krdotdot so that krdot is in range */
+      {	
+	*krdotdot = (maxkrdot - krdot)/Tgsample;
+      }
+
+    else			/* Choose krdotdot based on max slew rate limit. */
+      {
+
+	/* Set up for quadratic formula solution. */
+
+	qdfA = 1+tpfsq*kr*kr;
+	qdfB = 2*tpfsq*kr*krdot*krdot + 
+	  2*tpfsq/fovval*dfovdrval*kr*kr*krdot*krdot;
+	qdfC = pow(tpfsq*kr*krdot*krdot,2) + 4*tpfsq*pow(krdot,4) +
+	  pow(tpf*dfovdrval/fovval*kr*krdot*krdot,2) +
+	  4*tpfsq*dfovdrval/fovval*kr*pow(krdot,4) -
+	  pow(GAMMA*slewmax,2);
+
+	if (DEBUG_VDS>1)
+	  printf("calcthetadotdot:  qdfA, qdfB, qdfC = %g, %g, %g \n",
+		 qdfA, qdfB, qdfC);
+
+	rootparta = -qdfB/(2*qdfA);
+	rootpartb = qdfB*qdfB/(4*qdfA*qdfA) - qdfC/qdfA;
+	if (DEBUG_VDS>1)
+	  printf("calcthetadotdot:  rootparta, rootpartb = %g, %g \n",
+		 rootparta, rootpartb);
+
+	if (rootpartb < 0)	/* Safety check - if complex, take real part.*/
+
+	  *krdotdot = rootparta;
+
+	else
+	  *krdotdot = rootparta + sqrt(rootpartb);
+
+
+	/* Could check resulting slew rate here, as in q2r21.m. */
+      }
+
+    /* Calculate thetadotdot */
+
+	
+    *thetadotdot = tpf*dfovdrval/fovval*krdot*krdot + tpf*(*krdotdot);
+
+    if (DEBUG_VDS>1)
+      printf("calcthetadot:  r=%8.4f,  r'=%8.4f,  r''=%g  q''=%g \n",
+	     kr,krdot,*krdotdot,*thetadotdot);
+
+  }
+
+
+  /* ----------------------------------------------------------------------- */
+  void EXPORTGADGETS_SPIRAL 
+  calc_vds(double slewmax,double gradmax,double Tgsample,double Tdsample,int Ninterleaves,
+	   double* fov, int numfov,double krmax,
+	   int ngmax, double** xgrad,double** ygrad,int* numgrad)
+
+  /*	Function designs a variable-density spiral gradient waveform
+   *	that is defined by a number of interleaves, resolution (or max number
+   *	of samples), and field-of-view.  
+   *	The field-of-view is a polynomial function of the
+   *	k-space radius, so fov is an array of coefficients so that
+   *
+   *	FOV = fov[0]+fov[1]*kr+fov[2]*kr^2+ ... +fov[numfov-1]*kr^(numfov-1)
+   *
+   * 	Gradient design is subject to a constant-slew-rate-limit model,
+   * 	with maximum slew rate slewmax, and maximum gradient amplitude
+   * 	of gradmax.  
+   *
+   * 	Tgsample is the gradient sampling rate, and Tdsample is the data
+   * 	sampling rate.  It is highly recommended to OVERSAMPLE the gradient
+   * 	in the design to make the integration more stable.
+   *
+   * */
+
+  //double slewmax;		/*	Maximum slew rate, G/cm/s		*/
+  //double gradmax;		/* 	maximum gradient amplitude, G/cm	*/
+  //double Tgsample;	/*	Gradient Sample period (s)		*/
+  //double Tdsample;	/*	Data Sample period (s)			*/
+  //int Ninterleaves;	/*	Number of interleaves			*/
+  //double *fov;		/*	FOV coefficients		*/
+  //int numfov;		/*	Number of FOV coefficients		*/
+  //double krmax;		/*	Maximum k-space extent (/cm)		*/
+  //int ngmax;		/*	Maximum number of gradient samples	*/
+  //double **xgrad;		/* 	[output] X-component of gradient (G/cm) */
+  //double **ygrad;		/*	[output] Y-component of gradient (G/cm)	*/
+  //int *numgrad;		/* 	[output] Number of gradient samples */
+
+  /* ----------------------------------------------------------------------- */
+  {
+    int gradcount=0;
+
+    double kr=0;			/* Current value of kr	*/
+    double krdot = 0;		/* Current value of 1st derivative of kr */
+    double krdotdot = 0;		/* Current value of 2nd derivative of kr */
+
+    double theta=0;			/* Current value of theta */
+    double thetadot=0;		/* Current value of 1st derivative of theta */
+    double thetadotdot=0;		/* Current value of 2nd derivative of theta */
+
+    double lastkx=0;		/* x-component of last k-location. */
+    double lastky=0;		/* y-component of last k-location */
+    double kx, ky;			/* x and y components of current k-location */
+
+    double *gxptr, *gyptr;		/* Pointers to gradient variables. */
+
+
+
+
+    if (DEBUG_VDS>0)
+      printf("calc_vds:  First run. \n");
+
+    /* First just find the gradient length. */
+
+    while ((kr < krmax) && (gradcount < ngmax))
+      {
+	calcthetadotdot(slewmax,gradmax,kr,krdot,Tgsample,Tdsample,
+			Ninterleaves, fov,numfov, &thetadotdot, &krdotdot);
+
+	/* Integrate to obtain new values of kr, krdot, theta and thetadot:*/
+
+	thetadot = thetadot + thetadotdot * Tgsample;
+	theta = theta + thetadot * Tgsample;
+
+	krdot = krdot + krdotdot * Tgsample;
+	kr = kr + krdot * Tgsample;
+
+	gradcount++;
+
+      }
+
+
+
+    /* Allocate memory for gradients. */
+
+    *numgrad = gradcount;
+    if (DEBUG_VDS>0)
+      printf("Allocating for %d gradient points. \n",*numgrad);
+
+    //*xgrad = (double *)malloc(*numgrad*sizeof(double));
+    //*ygrad = (double *)malloc(*numgrad*sizeof(double));
+
+    /* new[] takes an element count, not a byte count as malloc did */
+    *xgrad = new double[*numgrad];
+    *ygrad = new double[*numgrad];
+
+    /* Reset parameters */
+
+    kr=0;
+    krdot=0;
+    theta=0;
+    thetadot=0;
+    gradcount=0;
+    gxptr = *xgrad;
+    gyptr = *ygrad;
+
+
+    /* Now re-run the calculation, this time storing the gradient waveform. */
+
+    if (DEBUG_VDS>0)
+      printf("calc_vds:  Second run. \n");
+
+    while ((kr < krmax) && (gradcount < ngmax))
+      {
+	calcthetadotdot(slewmax,gradmax,kr,krdot,Tgsample,Tdsample,
+			Ninterleaves, fov,numfov, &thetadotdot, &krdotdot);
+
+	/* Integrate to obtain new values of kr, krdot, theta and thetadot:*/
+
+	thetadot = thetadot + thetadotdot * Tgsample;
+	theta = theta + thetadot * Tgsample;
+
+	krdot = krdot + krdotdot * Tgsample;
+	kr = kr + krdot * Tgsample;
+
+	/* Define current gradient values from kr and theta. */
+
+	kx = kr * cos(theta);
+	ky = kr * sin(theta);
+	*gxptr++ = (1/GAMMA/Tgsample) * (kx-lastkx);
+	*gyptr++ = (1/GAMMA/Tgsample) * (ky-lastky);
+	lastkx = kx;
+	lastky = ky;
+
+	if (DEBUG_VDS>0)
+	  printf("Current kr is %6.3f \n",kr);
+
+	gradcount++;
+      }
+
+  }
+ 
+
+
+  /* ----------------------------------------------------------------------- */
+  void EXPORTGADGETS_SPIRAL 
+  calc_traj(double* xgrad, double* ygrad, int ngrad, int Nints, double Tgsamp, double krmax,
+	    double** x_trajectory, double** y_trajectory,
+	    double** weights) //, double** y_weights)
+  /*
+   *inputs: 
+   *      xgrad   X gradient waveform
+   *      ygrad   Y gradient waveform
+   *      ngrad   number of gradient samples
+   *      Nints   number of interleaves
+   *      Tgsamp  sampling time for gradients
+   *
+   *outputs:
+   *      x_trajectory    X position in k-space
+   *      y_trajectory    Y position in k-space
+   *      weights         density compensation weights
+   *
+   **/
+  {
+    int     gradcount   =0;
+    double  x_tr        =0.0;		/* Current value of x_traj	*/
+    double  y_tr        =0.0;     /* Current value of y_traj */
+    double  rotation    =0.0;       /* rotation of trajectory */
+    
+    double  abs_w       =0.0;
+    double  ang_g       =0.0;
+    double  ang_t       =0.0;
+    double  tp_w        =0.0;     /* current weight value */
+    
+    double  *txptr, *typtr;		/* Pointers to trajectory variables. */
+    double  *wptr;      		/* Pointers to weight variables. */
+    
+    /* new[] takes an element count, not a byte count */
+    *x_trajectory   = new double[ngrad*Nints];
+    *y_trajectory   = new double[ngrad*Nints];
+    *weights        = new double[ngrad*Nints];
+    
+    txptr       = *x_trajectory;
+    typtr       = *y_trajectory;
+    wptr        = *weights;
+    
+    int inter = 0;
+    for(inter = 0; inter < Nints; inter++)
+      {
+        rotation = (inter * 2 * PI)/Nints;
+        x_tr = 0;
+        y_tr = 0;
+        float x_temp, y_temp;
+        for(gradcount = 0; gradcount < ngrad; gradcount++)
+	  {
+            if (gradcount > 0)
+	      {
+                x_tr += (GAMMA)*xgrad[gradcount-1]*Tgsamp;
+                y_tr += (GAMMA)*ygrad[gradcount-1]*Tgsamp;
+	      }
+
+            x_temp = (x_tr * cos(rotation)) + (y_tr * sin(rotation));
+            y_temp = -(x_tr * sin(rotation)) + (y_tr * cos(rotation));
+            *(txptr++) = x_temp/krmax;
+            *(typtr++) = y_temp/krmax;      
+  
+            //abs(g(:)
+            abs_w   = sqrt((pow(xgrad[gradcount],2)) + (pow(ygrad[gradcount],2)));
+
+            if(xgrad[gradcount] == 0.0)
+	      {
+                ang_g = PI/2;
+	      }
+            else
+	      {
+		ang_g   =  atan2(ygrad[gradcount], xgrad[gradcount]);   //angle of gradient
+	      }
+            
+            if(x_tr == 0.0)
+	      {
+                ang_t = PI/2;
+	      }  
+            else
+	      {
+                ang_t   =  atan2(y_tr, x_tr);                          // angle of trajectory
+	      }
+        
+	    tp_w    = sin(ang_g-ang_t);       
+	    tp_w    = sqrt(pow(tp_w, 2));    //abs(tp_w);
+	    //              mexPrintf("tp_w = %f\n", tp_w);
+	    tp_w    = abs_w * tp_w;
+       
+	    //       mexPrintf("abs_w = %f, ang_g =%f, ang_t = %f, tp_w = %f\n",abs_w, ang_g, ang_t, tp_w);
+        
+            *wptr++ = tp_w;
+	    //g = gradients, k = trajectory
+	    //        weights = abs(g(:)) .* abs(sin(angle(g(:))-angle(k(:))));
+	  }    
+      }    
+  }
+}
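
The two routines above are used together: calc_vds() designs the slew- and amplitude-limited gradient waveform for a single interleave, and calc_traj() integrates it into k-space coordinates and density-compensation weights for all interleaves. A hypothetical standalone driver following the signatures in vds.h — every physical parameter below is a placeholder, not a value from any real protocol:

    // Illustrative driver only; all parameter values are placeholders.
    #include "vds.h"
    #include <cstdio>

    int main()
    {
        using namespace Gadgetron;

        double smax  = 14414.0;            // slew rate limit, G/cm/s (placeholder)
        double gmax  = 2.4;                // gradient limit, G/cm (placeholder)
        double T     = 2.5e-6;             // gradient/data sample time, s (placeholder)
        int    Nint  = 16;                 // number of interleaves (placeholder)
        double fov   = 24.0;               // single FOV coefficient, cm (placeholder)
        double krmax = 1.0 / (2.0 * 0.15); // 1/(2*resolution), cm^-1 (placeholder)

        // Design the gradient waveform for one interleave...
        double *gx = 0, *gy = 0;
        int ngrad = 0;
        calc_vds(smax, gmax, T, T, Nint, &fov, 1, krmax, 100000, &gx, &gy, &ngrad);

        // ...then integrate it into trajectory points and weights for all interleaves.
        double *kx = 0, *ky = 0, *w = 0;
        calc_traj(gx, gy, ngrad, Nint, T, krmax, &kx, &ky, &w);

        std::printf("%d gradient samples per interleave\n", ngrad);

        delete[] gx; delete[] gy;
        delete[] kx; delete[] ky; delete[] w;
        return 0;
    }
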
diff --git a/gadgets/spiral/vds.h b/gadgets/spiral/vds.h
new file mode 100644
index 0000000..1399464
--- /dev/null
+++ b/gadgets/spiral/vds.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include "gadgetron_spiral_export.h"
+
+namespace Gadgetron{
+
+  void EXPORTGADGETS_SPIRAL 
+  calc_vds(double slewmax,double gradmax,double Tgsample,double Tdsample,int Ninterleaves,
+	   double* fov, int numfov,double krmax,
+	   int ngmax, double** xgrad,double** ygrad,int* numgrad);
+  
+  void EXPORTGADGETS_SPIRAL 
+  calc_traj(double* xgrad, double* ygrad, int ngrad, int Nints, double Tgsamp, double krmax,
+	    double** x_trajectory, double** y_trajectory, double** weights);  
+}
diff --git a/gadgets/util/CMakeLists.txt b/gadgets/util/CMakeLists.txt
new file mode 100644
index 0000000..9f7e4bd
--- /dev/null
+++ b/gadgets/util/CMakeLists.txt
@@ -0,0 +1,28 @@
+IF (WIN32)
+    ADD_DEFINITIONS(-D__BUILD_GADGETRON_UTIL_GADGETS__)
+ENDIF (WIN32)
+
+include_directories(
+    ${CMAKE_SOURCE_DIR}/toolboxes/core
+)
+
+add_library(gadgetron_util SHARED 
+    gadgetron_util_gadgets_export.h 
+    ParameterRelayGadget.h
+    ParameterRelayGadget.cpp
+)
+
+set_target_properties(gadgetron_util PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})                                                                                                                                                                                                      
+
+target_link_libraries(gadgetron_util
+    gadgetron_gadgetbase
+    gadgetron_toolbox_log
+    ${ACE_LIBRARIES}
+)
+
+install(FILES 
+    gadgetron_util_gadgets_export.h
+    ParameterRelayGadget.h
+    DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
+install(TARGETS gadgetron_util DESTINATION lib COMPONENT main)
diff --git a/gadgets/util/ParameterRelayGadget.cpp b/gadgets/util/ParameterRelayGadget.cpp
new file mode 100644
index 0000000..df6e79f
--- /dev/null
+++ b/gadgets/util/ParameterRelayGadget.cpp
@@ -0,0 +1,18 @@
+#include "ParameterRelayGadget.h"
+
+namespace Gadgetron{
+int ParameterRelayGadget
+::process(ACE_Message_Block* m)
+{
+  if (this->next()->putq(m) == -1) {
+    m->release();
+    GERROR("ParameterRelayGadget::process, passing data on to next gadget");
+    return -1;
+  }
+
+  return GADGET_OK;
+}
+GADGET_FACTORY_DECLARE(ParameterRelayGadget)
+}
+
+
diff --git a/gadgets/util/ParameterRelayGadget.h b/gadgets/util/ParameterRelayGadget.h
new file mode 100644
index 0000000..29eb4b5
--- /dev/null
+++ b/gadgets/util/ParameterRelayGadget.h
@@ -0,0 +1,18 @@
+#ifndef PARAMETERRELAYGADGET_H
+#define PARAMETERRELAYGADGET_H
+
+#include "Gadget.h"
+#include "gadgetron_util_gadgets_export.h"
+
+namespace Gadgetron{
+
+  class EXPORTUTILGADGETS ParameterRelayGadget : public Gadget
+    {
+    public:
+      GADGET_DECLARE(ParameterRelayGadget);
+      
+    protected:
+      virtual int process(ACE_Message_Block* m);
+    };
+}
+#endif //PARAMETERRELAYGADGET_H
diff --git a/gadgets/util/gadgetron_util_gadgets_export.h b/gadgets/util/gadgetron_util_gadgets_export.h
new file mode 100644
index 0000000..7f75c2e
--- /dev/null
+++ b/gadgets/util/gadgetron_util_gadgets_export.h
@@ -0,0 +1,14 @@
+#ifndef GADGETRON_UTIL_GADGETS_EXPORT_H_
+#define GADGETRON_UTIL_GADGETS_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_UTIL_GADGETS__)
+#define EXPORTUTILGADGETS __declspec(dllexport)
+#else
+#define EXPORTUTILGADGETS __declspec(dllimport)
+#endif
+#else
+#define EXPORTUTILGADGETS
+#endif
+
+#endif // GADGETRON_UTIL_GADGETS_EXPORT_H_
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
new file mode 100644
index 0000000..cd4c86c
--- /dev/null
+++ b/test/CMakeLists.txt
@@ -0,0 +1,79 @@
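+# Unit tests are built only when Google Test and Armadillo are available.
+# CUDA-specific tests are added to the test executable only when CUDA is found.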
+if (GTEST_FOUND AND ARMADILLO_FOUND)
+
+ENABLE_TESTING()
+
+if(WIN32)
+link_directories(${Boost_LIBRARY_DIRS})
+endif(WIN32)
+
+include_directories(
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/fft/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/fft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${Boost_INCLUDE_DIR}
+  ${ARMADILLO_INCLUDE_DIRS}
+  ${GTEST_INCLUDE_DIRS}
+  ${ACE_INCLUDE_DIR}
+  ${ISMRMRD_INCLUDE_DIR}
+  ${FFTW3_INCLUDE_DIR}
+  )
+
+if (CUDA_FOUND)
+    include_directories(${CUDA_INCLUDE_DIRS})
+endif (CUDA_FOUND)
+
+link_libraries(
+    gadgetron_toolbox_cpucore 
+    gadgetron_toolbox_cpucore_math
+    gadgetron_toolbox_cpufft
+    ${BOOST_LIBRARIES}
+    ${GTEST_LIBRARIES} 
+    ${ARMADILLO_LIBRARIES}
+    )
+
+if ( CUDA_FOUND )
+
+  cuda_add_executable(test_all 
+      tests.cpp 
+      hoNDArray_elemwise_test.cpp 
+      hoNDArray_blas_test.cpp 
+      hoNDArray_utils_test.cpp
+      hoNDFFT_test.cpp
+      vector_td_test.cpp
+      cuNDArray_elemwise_test.cpp 
+      cuNDArray_operators_test.cpp 
+      cuNDArray_blas_test.cpp 
+      cuNDArray_utils_test.cpp
+      cuVector_td_test_kernels.h 
+      cuVector_td_test_kernels.cu 
+      cuNDFFT_test.cpp
+      )
+else ( CUDA_FOUND )
+    add_executable(test_all 
+      tests.cpp 
+      hoNDArray_elemwise_test.cpp 
+      hoNDArray_blas_test.cpp 
+      hoNDArray_utils_test.cpp
+      hoNDFFT_test.cpp
+      )
+endif ( CUDA_FOUND )
+
+if ( CUDA_FOUND )
+  target_link_libraries(test_all 
+    gadgetron_toolbox_gpucore
+    gadgetron_toolbox_gpufft
+    )
+endif( CUDA_FOUND )
+
+add_test(test_all test_all)
+
+endif (GTEST_FOUND AND ARMADILLO_FOUND)
+
+add_subdirectory(integration)
diff --git a/test/cuNDArray_Vector_td_test.cpp b/test/cuNDArray_Vector_td_test.cpp
new file mode 100644
index 0000000..f5b2279
--- /dev/null
+++ b/test/cuNDArray_Vector_td_test.cpp
@@ -0,0 +1,50 @@
+/*
+ * cuNDArray_Vector_td_test.cpp
+ *
+ *  Created on: Feb 28, 2013
+ *      Author: Dae
+ */
+#include "gtest/gtest.h"
+
+
+#include <vector>
+#include "complext.h"
+#include "cuNDArray.h"
+#include "vector_td_utilities.h"
+
+using namespace Gadgetron;
+using testing::Types;
+template <typename T> class cuNDArray_vector_td_Test : public ::testing::Test {
+	protected:
+	 virtual void SetUp() {
+		 unsigned int vdims[] = {37}; //Using prime numbers for setup because they are messy
+		 dims= std::vector<unsigned int>(vdims,vdims+sizeof(vdims)/sizeof(unsigned int));
+		 cuData = cuNDArray<vector_td<T,3> >(&dims);
+		 cuData.clear();
+	}
+	 cuNDArray<vector_td<T,3> > cuData;
+	 std::vector<unsigned int> dims;
+
+
+};
+
+//typedef Types<float,double,float_complext,double_complext> Implementations;
+typedef Types<float,double> Implementations;
+
+TYPED_TEST_CASE(cuNDArray_vector_td_Test, Implementations);
+
+TYPED_TEST(cuNDArray_vector_td_Test,absTest){
+	this->cuData.fill(vector_td<TypeParam,3>(-2));
+	this->cuData.abs();
+	vector_td<TypeParam,3> expected(2);
+	vector_td<TypeParam,3> result = this->cuData.get_device_ptr()[2];
+	EXPECT_EQ(expected,result);
+}
+
+TYPED_TEST(cuNDArray_vector_td_Test,sqrtTest){
+	this->cuData.fill(vector_td<TypeParam,3>(12.1));
+	this->cuData.sqrt();
+	vector_td<TypeParam,3> expected(TypeParam(3.478505426));
+	vector_td<TypeParam,3> result = this->cuData.get_device_ptr()[2];
+	EXPECT_FLOAT_EQ(result[1],expected[1]);
+}
diff --git a/test/cuNDArray_blas_test.cpp b/test/cuNDArray_blas_test.cpp
new file mode 100644
index 0000000..6aec76e
--- /dev/null
+++ b/test/cuNDArray_blas_test.cpp
@@ -0,0 +1,156 @@
+#include "cuNDArray_blas.h"
+#include "cuNDArray_elemwise.h"
+
+#include <gtest/gtest.h>
+#include <vector>
+
+using namespace Gadgetron;
+using testing::Types;
+
+template <typename T> class cuNDArray_blas_Real : public ::testing::Test 
+{
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    Array = cuNDArray<T>(&dims);
+    Array2 = cuNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  cuNDArray<T> Array;
+  cuNDArray<T> Array2;
+};
+
+typedef Types<float, double> realImplementations;
+
+TYPED_TEST_CASE(cuNDArray_blas_Real, realImplementations);
+
+TYPED_TEST(cuNDArray_blas_Real,dotTest){
+  fill(&this->Array,TypeParam(1));
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements(),real(dot(&this->Array,&this->Array)));
+  fill(&this->Array2,TypeParam(2));
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements()*2,real(dot(&this->Array,&this->Array2)));
+}
+
+TYPED_TEST(cuNDArray_blas_Real,axpyTest){
+  fill(&this->Array,TypeParam(71));
+  fill(&this->Array2,TypeParam(97));
+  axpy(TypeParam(11),&this->Array,&this->Array2);
+  TypeParam val = this->Array2[10];
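+  // axpy computes y = a*x + y elementwise: 11*71 + 97 = 878.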
+  EXPECT_FLOAT_EQ(878,real(val));
+}
+
+TYPED_TEST(cuNDArray_blas_Real,nrm2Test){
+  fill(&this->Array,TypeParam(1));
+  EXPECT_FLOAT_EQ(std::sqrt((double)this->Array.get_number_of_elements()),nrm2(&this->Array));
+  fill(&this->Array,TypeParam(3));
+  EXPECT_FLOAT_EQ(std::sqrt(3.0*3.0*this->Array.get_number_of_elements()),nrm2(&this->Array));
+}
+
+TYPED_TEST(cuNDArray_blas_Real,asumTest){
+  fill(&this->Array,TypeParam(1));
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements(),real(asum(&this->Array)));
+  fill(&this->Array,TypeParam(-3));
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements()*3,real(asum(&this->Array)));
+}
+
+TYPED_TEST(cuNDArray_blas_Real,aminTest){
+  fill(&this->Array,TypeParam(100));
+  TypeParam tmp(-50);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[23], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  EXPECT_EQ(23,amin(&this->Array));
+  tmp = TypeParam(2);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[48], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  EXPECT_EQ(48,amin(&this->Array));
+}
+
+TYPED_TEST(cuNDArray_blas_Real,amaxTest){
+  fill(&this->Array,TypeParam(1));
+  TypeParam tmp(2);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[23], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  EXPECT_EQ(23,amax(&this->Array));
+  tmp = TypeParam(-50);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[48], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  EXPECT_EQ(48,amax(&this->Array));
+}
+
+
+template <typename T> class cuNDArray_blas_Cplx : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49}; //Using prime numbers for setup because they are messy
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    Array = cuNDArray<T>(&dims);
+    Array2 = cuNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  cuNDArray<T> Array;
+  cuNDArray<T> Array2;
+};
+
+typedef Types</*std::complex<float>, std::complex<double>,*/ float_complext, double_complext> cplxImplementations;
+
+TYPED_TEST_CASE(cuNDArray_blas_Cplx, cplxImplementations);
+
+TYPED_TEST(cuNDArray_blas_Cplx,dotTest){
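+  // dot() conjugates its first argument (dot(x,y) = sum(conj(x)*y)), so dot(x,x) is real-valued.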
+  fill(&this->Array,TypeParam(1,1));
+  TypeParam res = dot(&this->Array,&this->Array);
+  EXPECT_FLOAT_EQ(real(TypeParam(1,-1)*TypeParam(1,1))*this->Array.get_number_of_elements(),real(res));
+  EXPECT_FLOAT_EQ(0,imag(res));
+  fill(&this->Array2,TypeParam(2,2));
+  res = dot(&this->Array2,&this->Array2);
+  EXPECT_FLOAT_EQ(real(TypeParam(2,-2)*TypeParam(2,2))*this->Array.get_number_of_elements(),real(res));
+  EXPECT_FLOAT_EQ(0,imag(res));
+  res = dot(&this->Array,&this->Array2);
+  EXPECT_FLOAT_EQ(real(TypeParam(1,-1)*TypeParam(2,2))*this->Array.get_number_of_elements(),real(res));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1,-1)*TypeParam(2,2))*this->Array.get_number_of_elements(),imag(res));
+}
+
+TYPED_TEST(cuNDArray_blas_Cplx,axpyTest){
+  fill(&this->Array,TypeParam(71.1,23.3));
+  fill(&this->Array2,TypeParam(97.9,654.2));
+  axpy(TypeParam(11.4),&this->Array,&this->Array2);
+  TypeParam got = this->Array2[546];
+  TypeParam wanted = TypeParam(71.1,23.3)*TypeParam(11.4)+TypeParam(97.9,654.2);
+  EXPECT_FLOAT_EQ(real(wanted),real(got));
+  EXPECT_FLOAT_EQ(imag(wanted),imag(got));
+}
+
+TYPED_TEST(cuNDArray_blas_Cplx,nrm2Test){
+  fill(&this->Array,TypeParam(1,1));
+  EXPECT_FLOAT_EQ(std::sqrt(real(TypeParam(1,-1)*TypeParam(1,1))*this->Array.get_number_of_elements()),nrm2(&this->Array));
+  fill(&this->Array,TypeParam(3.24,7.4));
+  // There will be rounding errors from the sum, so loosen comparison
+  EXPECT_NEAR(std::sqrt(real(TypeParam(3.24,-7.4)*TypeParam(3.24,7.4))*this->Array.get_number_of_elements()),nrm2(&this->Array),0.001);
+}
+
+TYPED_TEST(cuNDArray_blas_Cplx,asumTest){
+  fill(&this->Array,TypeParam(-3,1));
+  EXPECT_NEAR(4*this->Array.get_number_of_elements(),asum(&this->Array),0.1);
+}
+
+TYPED_TEST(cuNDArray_blas_Cplx,aminTest){
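+  // Complex amin/amax follow the BLAS convention of comparing |real| + |imag|, which is why
+  // (2,100) below does not displace (-50,-51) as the minimum.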
+  fill(&this->Array,TypeParam(100,101));
+  TypeParam tmp(-50,-51);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[23], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  EXPECT_EQ(23,amin(&this->Array));
+  tmp = TypeParam(2,100);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[48], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  EXPECT_EQ(23,amin(&this->Array));
+  tmp = TypeParam(-2,-76);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[1000], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  EXPECT_EQ(1000,amin(&this->Array));
+}
+
+TYPED_TEST(cuNDArray_blas_Cplx,amaxTest){
+  fill(&this->Array,TypeParam(1,1));
+  TypeParam tmp(4,4);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[768], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  EXPECT_EQ(768,amax(&this->Array));
+  tmp = TypeParam(6,1);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[48], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  EXPECT_EQ(768,amax(&this->Array));
+  tmp = TypeParam(-3,-6);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[999], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  EXPECT_EQ(999,amax(&this->Array));
+}
diff --git a/test/cuNDArray_elemwise_test.cpp b/test/cuNDArray_elemwise_test.cpp
new file mode 100644
index 0000000..d5c6b80
--- /dev/null
+++ b/test/cuNDArray_elemwise_test.cpp
@@ -0,0 +1,379 @@
+#include "cuNDArray_elemwise.h"
+#include "complext.h"
+
+#include <gtest/gtest.h>
+#include <complex>
+#include <vector>
+
+using namespace Gadgetron;
+using testing::Types;
+
+template <typename T> class cuNDArray_elemwise_TestReal : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    Array = cuNDArray<T>(&dims);
+    Array2 = cuNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  cuNDArray<T> Array;
+  cuNDArray<T> Array2;
+};
+
+template <typename T> class cuNDArray_elemwise_TestCplx : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    Array = cuNDArray<T>(&dims);
+    Array2 = cuNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  cuNDArray<T> Array;
+  cuNDArray<T> Array2;
+};
+
+template <typename T> class cuNDArray_elemwise_TestCplx2 : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    Array = cuNDArray<T>(&dims);
+    Array2 = cuNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  cuNDArray<T> Array;
+  cuNDArray<T> Array2;
+};
+
+template <typename T> class cuNDArray_elemwise_TestCplx3 : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    Array = cuNDArray<T>(&dims);
+    Array2 = cuNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  cuNDArray<T> Array;
+  cuNDArray<T> Array2;
+};
+
+template <typename T> class cuNDArray_elemwise_TestCplx4 : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    Array = cuNDArray<T>(&dims);
+    Array2 = cuNDArray<typename realType<T>::Type>(&dims);
+  }
+  std::vector<size_t> dims;
+  cuNDArray<T> Array;
+  cuNDArray<typename realType<T>::Type> Array2;
+};
+
+typedef Types<float, double> realImplementations;
+typedef Types</*std::complex<float>, std::complex<double>,*/ float_complext, double_complext> cplxImplementations;
+typedef Types<float_complext, double_complext> cplxtImplementations;
+
+TYPED_TEST_CASE(cuNDArray_elemwise_TestReal, realImplementations);
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,fillTest){
+  fill(&this->Array,TypeParam(1.1));
+  EXPECT_FLOAT_EQ(1.1,TypeParam(this->Array[5]));
+  fill(&this->Array,TypeParam(27.45));
+  EXPECT_FLOAT_EQ(27.45,TypeParam(this->Array[3242]));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,clearTest){
+  fill(&this->Array,TypeParam(1));
+  EXPECT_FLOAT_EQ(1,TypeParam(this->Array[5324]));
+  clear(&this->Array);
+  EXPECT_FLOAT_EQ(0,TypeParam(this->Array[5324]));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,absTest){
+  fill(&this->Array,TypeParam(-5.5));
+  EXPECT_FLOAT_EQ(TypeParam(-5.5),TypeParam(this->Array[13]));
+  EXPECT_FLOAT_EQ(TypeParam(5.5),TypeParam(abs(&this->Array)->at(13)));
+  fill(&this->Array,TypeParam(-1.3));
+  EXPECT_FLOAT_EQ(TypeParam(-1.3),TypeParam(this->Array[2454]));
+  abs_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(TypeParam(1.3),TypeParam(this->Array[2454]));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,absSquareTest){
+  fill(&this->Array,TypeParam(-5.5));
+  EXPECT_FLOAT_EQ(TypeParam(-5.5),TypeParam(this->Array[13]));
+  EXPECT_FLOAT_EQ(TypeParam(-5.5*-5.5),TypeParam(abs_square(&this->Array)->at(13)));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,sqrtTest){
+  fill(&this->Array,TypeParam(17.9));
+  EXPECT_FLOAT_EQ(std::sqrt(TypeParam(17.9)),TypeParam(sqrt(&this->Array)->at(23433)));
+  fill(&this->Array,TypeParam(3.14));
+  sqrt_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(std::sqrt(TypeParam(3.14)),TypeParam(this->Array[32343]));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,squareTest){
+  fill(&this->Array,TypeParam(1.7));
+  EXPECT_FLOAT_EQ(TypeParam(1.7)*TypeParam(1.7),TypeParam(square(&this->Array)->at(22542)));
+  fill(&this->Array,TypeParam(31.4));
+  square_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(TypeParam(31.4)*TypeParam(31.4),TypeParam(this->Array[652252]));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,reciprocalTest){
+  fill(&this->Array,TypeParam(11.7));
+  EXPECT_FLOAT_EQ(TypeParam(1)/TypeParam(11.7),TypeParam(reciprocal(&this->Array)->at(45452)));
+  fill(&this->Array,TypeParam(314.114));
+  reciprocal_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(TypeParam(1)/TypeParam(314.114),TypeParam(this->Array[43432]));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,reciprocal_sqrtTest){
+  fill(&this->Array,TypeParam(1.9));
+  EXPECT_FLOAT_EQ(TypeParam(1)/std::sqrt(TypeParam(1.9)),TypeParam(reciprocal_sqrt(&this->Array)->at(12345)));
+  fill(&this->Array,TypeParam(1.14));
+  reciprocal_sqrt_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(TypeParam(1)/std::sqrt(TypeParam(1.14)),TypeParam(this->Array[0]));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,sgnTest){
+  fill(&this->Array,TypeParam(-5.7));
+  TypeParam tmp(101.1);
+  TypeParam tmp2(0.0);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[91], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[19100], &tmp2, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  EXPECT_FLOAT_EQ(TypeParam(-1),TypeParam(sgn(&this->Array)->at(28)));
+  EXPECT_FLOAT_EQ(TypeParam(1),TypeParam(sgn(&this->Array)->at(91)));
+  EXPECT_FLOAT_EQ(TypeParam(0),TypeParam(sgn(&this->Array)->at(19100)));
+  fill(&this->Array,TypeParam(-5.7));
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[9100], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[19100], &tmp2, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  sgn_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(TypeParam(-1),TypeParam(this->Array[2800]));
+  EXPECT_FLOAT_EQ(TypeParam(1),TypeParam(this->Array[9100]));
+  EXPECT_FLOAT_EQ(TypeParam(0),TypeParam(this->Array[19100]));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,clampTest){
+  fill(&this->Array,TypeParam(-5.7));
+  TypeParam tmp(101.3);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[354222], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  clamp(&this->Array,TypeParam(4.9),TypeParam(100.0));
+  EXPECT_FLOAT_EQ(TypeParam(4.9),this->Array[3435]);
+  EXPECT_FLOAT_EQ(TypeParam(100.0),this->Array[354222]);
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,clamp_minTest){
+  fill(&this->Array,TypeParam(-5.7));
+  TypeParam tmp(-101.3);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[91], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  clamp_min(&this->Array,TypeParam(-10.6));
+  EXPECT_FLOAT_EQ(TypeParam(-5.7),this->Array[28]);
+  EXPECT_FLOAT_EQ(TypeParam(-10.6),this->Array[91]);
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,clamp_maxTest){
+  fill(&this->Array,TypeParam(5.7));
+  TypeParam tmp(101.3);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[91], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  clamp_max(&this->Array,TypeParam(10.6));
+  EXPECT_FLOAT_EQ(TypeParam(5.7),this->Array[28]);
+  EXPECT_FLOAT_EQ(TypeParam(10.6),this->Array[91]);
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,normalizeTest){
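+  // normalize rescales the array so that its largest absolute value equals the given limit (110 here).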
+  fill(&this->Array,TypeParam(50));
+  TypeParam tmp(-200);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[23], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  normalize(&this->Array,110);
+  EXPECT_FLOAT_EQ(TypeParam(50)*TypeParam(110)/abs(TypeParam(-200)),this->Array[12345]);
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,shrink1Test){
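+  // shrink1 is the soft-thresholding operator: x/|x| * max(|x| - gamma, 0).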
+  fill(&this->Array,TypeParam(1.2));
+  shrink1(&this->Array,0.75);
+  EXPECT_FLOAT_EQ(TypeParam(1.2)/abs(TypeParam(1.2))*std::max(abs(TypeParam(1.2))-0.75,0.0),this->Array[125]);
+  fill(&this->Array,TypeParam(1));
+  shrink1(&this->Array,2.0);
+  EXPECT_FLOAT_EQ(0.0,this->Array[125]);
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,shrinkdTest){
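+  // shrinkd shrinks x against a separate magnitude array s: x/s * max(s - gamma, 0).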
+  fill(&this->Array,TypeParam(1.2));
+  fill(&this->Array2,TypeParam(4.0));
+  shrinkd(&this->Array,&this->Array2,1.0);
+  EXPECT_FLOAT_EQ(TypeParam(1.2)/TypeParam(4.0)*std::max(4.0-1.0,0.0),this->Array[125]);
+  shrinkd(&this->Array,&this->Array2,8.0);
+  EXPECT_FLOAT_EQ(0.0,this->Array[125]);
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,realTest){
+  fill(&this->Array,TypeParam(1.2));
+  EXPECT_FLOAT_EQ(TypeParam(1.2),real(&this->Array)->at(125));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,imagTest){
+  fill(&this->Array,TypeParam(1.2));
+  EXPECT_FLOAT_EQ(TypeParam(0.0),imag(&this->Array)->at(125));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,conjTest){
+  fill(&this->Array,TypeParam(1.2));
+  EXPECT_FLOAT_EQ(TypeParam(1.2),real(&this->Array)->at(125));
+}
+
+TYPED_TEST_CASE(cuNDArray_elemwise_TestCplx, cplxImplementations);
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,fillTest){
+  fill(&this->Array,TypeParam(1.1,2.2));
+  EXPECT_FLOAT_EQ(1.1,real(TypeParam(this->Array[52323])));
+  EXPECT_FLOAT_EQ(2.2,imag(TypeParam(this->Array[52323])));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,clearTest){
+  fill(&this->Array,TypeParam(1,1));
+  clear(&this->Array);
+  EXPECT_FLOAT_EQ(0,real(TypeParam(this->Array[325])));
+  EXPECT_FLOAT_EQ(0,imag(TypeParam(this->Array[325])));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,absTest){
+  fill(&this->Array,TypeParam(-5.5,7.7));
+  EXPECT_FLOAT_EQ(std::sqrt(5.5*5.5+7.7*7.7),abs(&this->Array)->at(32113));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,absSquareTest){
+  fill(&this->Array,TypeParam(-5.5,7.7));
+  EXPECT_FLOAT_EQ(5.5*5.5+7.7*7.7,abs_square(&this->Array)->at(32113));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,sqrtTest){
+  fill(&this->Array,TypeParam(17.9,3.5));
+  EXPECT_NEAR(real(sqrt(TypeParam(17.9,3.5))),real(sqrt(&this->Array)->at(2131)),0.00001);
+  EXPECT_NEAR(imag(sqrt(TypeParam(17.9,3.5))),imag(sqrt(&this->Array)->at(2131)),0.00001);
+  fill(&this->Array,TypeParam(3.14,4.13));
+  sqrt_inplace(&this->Array);
+  EXPECT_NEAR(real(sqrt(TypeParam(3.14,4.13))),real(this->Array[120000]),0.00001);
+  EXPECT_NEAR(imag(sqrt(TypeParam(3.14,4.13))),imag(this->Array[120000]),0.00001);
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,squareTest){
+  fill(&this->Array,TypeParam(1.7,7.1));
+  EXPECT_FLOAT_EQ(real(TypeParam(1.7,7.1)*TypeParam(1.7,7.1)),real(square(&this->Array)->at(22123)));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1.7,7.1)*TypeParam(1.7,7.1)),imag(square(&this->Array)->at(22123)));
+  fill(&this->Array,TypeParam(31.4,4.31));
+  square_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(real(TypeParam(31.4,4.31)*TypeParam(31.4,4.31)),real(this->Array[51234]));
+  EXPECT_FLOAT_EQ(imag(TypeParam(31.4,4.31)*TypeParam(31.4,4.31)),imag(this->Array[51234]));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,reciprocalTest){
+  fill(&this->Array,TypeParam(1.9,2.7));
+  EXPECT_FLOAT_EQ(real(TypeParam(1,0)/TypeParam(1.9,2.7)),real(reciprocal(&this->Array)->at(11232)));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1,0)/TypeParam(1.9,2.7)),imag(reciprocal(&this->Array)->at(11232)));
+  fill(&this->Array,TypeParam(1.14,4.32));
+  reciprocal_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(real(TypeParam(1,0)/TypeParam(1.14,4.32)),real(this->Array[10]));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1,0)/TypeParam(1.14,4.32)),imag(this->Array[10]));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,reciprocal_sqrtTest){
+  fill(&this->Array,TypeParam(1.9,2.7));
+  EXPECT_FLOAT_EQ(real(TypeParam(1,0)/sqrt(TypeParam(1.9,2.7))),real(reciprocal_sqrt(&this->Array)->at(12543)));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1,0)/sqrt(TypeParam(1.9,2.7))),imag(reciprocal_sqrt(&this->Array)->at(12543)));
+  fill(&this->Array,TypeParam(1.14,4.32));
+  reciprocal_sqrt_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(real(TypeParam(1,0)/sqrt(TypeParam(1.14,4.32))),real(this->Array[10000]));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1,0)/sqrt(TypeParam(1.14,4.32))),imag(this->Array[10000]));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,realImagTest){
+  fill(&this->Array,TypeParam(3.4,4.2));
+  EXPECT_FLOAT_EQ(3.4,real(&this->Array)->at(33425));
+  EXPECT_FLOAT_EQ(4.2,imag(&this->Array)->at(45));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,conjTest){
+  fill(&this->Array,TypeParam(3.4,4.2));
+  EXPECT_FLOAT_EQ(3.4,real(conj(&this->Array)->at(33425)));
+  EXPECT_FLOAT_EQ(-4.2,imag(conj(&this->Array)->at(45)));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,normalizeTest){
+  fill(&this->Array,TypeParam(50,50));
+  TypeParam tmp(-200,-200);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[23], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  normalize(&this->Array,110);
+  EXPECT_FLOAT_EQ(real(TypeParam(50,50)*real(TypeParam(110,110))/abs(TypeParam(-200,-200))),real(&this->Array)->at(12345));
+  EXPECT_FLOAT_EQ(imag(TypeParam(50,50)*real(TypeParam(110,110))/abs(TypeParam(-200,-200))),imag(&this->Array)->at(12345));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,clampTest){
+  fill(&this->Array,TypeParam(-5.7, -4.6));
+  TypeParam tmp(101.3,203.4);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[354222], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));  
+  clamp(&this->Array,real(TypeParam(4.9,0)),real(TypeParam(100.0,0)));
+  EXPECT_FLOAT_EQ(real(TypeParam(4.9,0)),real(&this->Array)->at(3435));
+  EXPECT_FLOAT_EQ(real(TypeParam(100.0,0)),real(&this->Array)->at(354222));
+  EXPECT_FLOAT_EQ(imag(TypeParam(4.9,0)),imag(&this->Array)->at(3435));
+  EXPECT_FLOAT_EQ(imag(TypeParam(100.0,0)),imag(&this->Array)->at(354222));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,clamp_minTest){
+  fill(&this->Array,TypeParam(-5.7, -4.6));
+  TypeParam tmp(-101.3,-203.4);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[91], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));  
+  clamp_min(&this->Array, real(TypeParam(-10.6,0)));
+  EXPECT_FLOAT_EQ(real(TypeParam(-5.7,0)),real(&this->Array)->at(28));
+  EXPECT_FLOAT_EQ(real(TypeParam(-10.6,0)),real(&this->Array)->at(91));
+  EXPECT_FLOAT_EQ(imag(TypeParam(-5.7,0)),imag(&this->Array)->at(28));
+  EXPECT_FLOAT_EQ(imag(TypeParam(-10.6,0)),imag(&this->Array)->at(91));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,clamp_maxTest){
+  fill(&this->Array,TypeParam(5.7, 4.6));
+  TypeParam tmp(101.3,203.4);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[91], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));  
+  clamp_max(&this->Array,real(TypeParam(10.6,0)));
+  EXPECT_FLOAT_EQ(real(TypeParam(5.7,0)),real(&this->Array)->at(28));
+  EXPECT_FLOAT_EQ(real(TypeParam(10.6,0)),real(&this->Array)->at(91));
+  EXPECT_FLOAT_EQ(imag(TypeParam(5.7,0)),imag(&this->Array)->at(28));
+  EXPECT_FLOAT_EQ(imag(TypeParam(10.6,0)),imag(&this->Array)->at(91));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,shrink1Test){
+  fill(&this->Array,TypeParam(1.2,1.4));
+  shrink1(&this->Array,0.75);
+  EXPECT_FLOAT_EQ(real(TypeParam(1.2,1.4)/abs(TypeParam(1.2,1.4)))*std::max(abs(TypeParam(1.2,1.4))-0.75,0.0),real(&this->Array)->at(125));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1.2,1.4)/abs(TypeParam(1.2,1.4)))*std::max(abs(TypeParam(1.2,1.4))-0.75,0.0),imag(&this->Array)->at(125));
+  fill(&this->Array,TypeParam(1,1));
+  shrink1(&this->Array,2.0);
+  EXPECT_FLOAT_EQ(0.0,real(&this->Array)->at(125));
+  EXPECT_FLOAT_EQ(0.0,imag(&this->Array)->at(23125));
+}
+
+TYPED_TEST_CASE(cuNDArray_elemwise_TestCplx4, cplxImplementations);
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx4,shrinkdTest){
+  fill(&this->Array,TypeParam(1.2,1.4));
+  fill(&this->Array2,real(TypeParam(4.0,4.0)));
+  shrinkd(&this->Array,&this->Array2,1.0);
+  EXPECT_FLOAT_EQ(real(TypeParam(1.2,1.4)/real(TypeParam(4.0,4.0)))*std::max(4.0-1.0,0.0),real(&this->Array)->at(125));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1.2,1.4)/imag(TypeParam(4.0,4.0)))*std::max(4.0-1.0,0.0),imag(&this->Array)->at(125));
+  shrinkd(&this->Array,&this->Array2,8.0);
+  EXPECT_FLOAT_EQ(0.0,real(&this->Array)->at(125));
+  EXPECT_FLOAT_EQ(0.0,imag(&this->Array)->at(23125));
+}
+
+TYPED_TEST_CASE(cuNDArray_elemwise_TestCplx3, cplxtImplementations);
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx3,realToCplxTest){
+  fill(&this->Array,TypeParam(3.4,4.2));
+  EXPECT_FLOAT_EQ(3.4,real(real_to_complex<TypeParam>(real(&this->Array).get())->at(33425)));
+  EXPECT_FLOAT_EQ(0.0,imag(real_to_complex<TypeParam>(real(&this->Array).get())->at(33425)));
+}
diff --git a/test/cuNDArray_operators_test.cpp b/test/cuNDArray_operators_test.cpp
new file mode 100644
index 0000000..de22a09
--- /dev/null
+++ b/test/cuNDArray_operators_test.cpp
@@ -0,0 +1,243 @@
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "complext.h"
+
+#include <gtest/gtest.h>
+#include <complex>
+#include <vector>
+
+using namespace Gadgetron;
+using testing::Types;
+
+template <typename T> class cuNDArray_operators_TestReal : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    Array = cuNDArray<T>(&dims);
+    Array2 = cuNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  cuNDArray<T> Array;
+  cuNDArray<T> Array2;
+};
+
+template <typename T> class cuNDArray_operators_TestCplx : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    Array = cuNDArray<T>(&dims);
+    Array2 = cuNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  cuNDArray<T> Array;
+  cuNDArray<T> Array2;
+};
+
+typedef Types<float, double> realImplementations;
+typedef Types</*std::complex<float>, std::complex<double>,*/ float_complext, double_complext> cplxImplementations;
+
+TYPED_TEST_CASE(cuNDArray_operators_TestReal, realImplementations);
+
+TYPED_TEST(cuNDArray_operators_TestReal,equalsAddTest1){
+  TypeParam v1 = TypeParam(46865.35435);
+  TypeParam v2 = TypeParam(13784.34);
+  unsigned int idx = 73243;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array += this->Array2;
+  EXPECT_FLOAT_EQ(v1+v2,this->Array[idx]);
+}
+
+TYPED_TEST(cuNDArray_operators_TestReal,equalsAddTest2){
+  TypeParam v1 = TypeParam(98.4);
+  TypeParam v2 = TypeParam(2.2);
+  unsigned int idx = 1295;
+  fill(&this->Array,v1);
+  this->Array += v2;
+  EXPECT_FLOAT_EQ(v1+v2,this->Array[idx]);
+}
+
+TYPED_TEST(cuNDArray_operators_TestReal,equalsSubtractTest1){
+  TypeParam v1 = TypeParam(98475334.34);
+  TypeParam v2 = TypeParam(2452.234);
+  unsigned int idx = 124999;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array -= this->Array2;
+  EXPECT_FLOAT_EQ(v1-v2,this->Array[idx]);
+}
+
+TYPED_TEST(cuNDArray_operators_TestReal,equalsSubtractTest2){
+  TypeParam v1 = TypeParam(4.4);
+  TypeParam v2 = TypeParam(9212.21);
+  unsigned int idx = 122131;
+  fill(&this->Array,v1);
+  this->Array -= v2;
+  EXPECT_FLOAT_EQ(v1-v2,this->Array[idx]);
+}
+
+TYPED_TEST(cuNDArray_operators_TestReal,equalsMultiplyTest1){
+  TypeParam v1 = TypeParam(342.145);
+  TypeParam v2 = TypeParam(43545.43);
+  unsigned int idx = 12344;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array *= this->Array2;
+  EXPECT_FLOAT_EQ(v1*v2,this->Array[idx]);
+}
+
+TYPED_TEST(cuNDArray_operators_TestReal,equalsMultiplyTest2){
+  TypeParam v1 = TypeParam(43534.443);
+  TypeParam v2 = TypeParam(92.842);
+  unsigned int idx = 96735;
+  fill(&this->Array,v1);
+  this->Array *= v2;
+  EXPECT_FLOAT_EQ(v1*v2,this->Array[idx]);
+}
+
+TYPED_TEST(cuNDArray_operators_TestReal,equalsDivideTest1){
+  TypeParam v1 = TypeParam(644.24);
+  TypeParam v2 = TypeParam(38564.64);
+  unsigned int idx = 98322;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array /= this->Array2;
+  EXPECT_FLOAT_EQ(v1/v2,this->Array[idx]);
+}
+
+TYPED_TEST(cuNDArray_operators_TestReal,equalsDivideTest2){
+  TypeParam v1 = TypeParam(56342.24);
+  TypeParam v2 = TypeParam(23434.34);
+  unsigned int idx = 91;
+  fill(&this->Array,v1);
+  this->Array /= v2;
+  EXPECT_FLOAT_EQ(v1/v2,this->Array[idx]);
+}
+
+TYPED_TEST_CASE(cuNDArray_operators_TestCplx, cplxImplementations);
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsAddTest1){
+  TypeParam v1 = TypeParam(46865.35435, 534544.534523);
+  TypeParam v2 = TypeParam(13784.34, 54543543.1243);
+  unsigned int idx = 73243;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array += this->Array2;
+  EXPECT_FLOAT_EQ(real(v1+v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1+v2),imag(this->Array[idx]));
+}
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsAddTest2){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,3.23);
+  unsigned int idx = 1295;
+  fill(&this->Array,v1);
+  this->Array += v2;
+  EXPECT_FLOAT_EQ(real(v1+v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1+v2),imag(this->Array[idx]));
+}
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsAddTest3){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,0.0);
+  unsigned int idx = 1295;
+  fill(&this->Array,v1);
+  this->Array += real(v2);
+  EXPECT_FLOAT_EQ(real(v1+v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1+v2),imag(this->Array[idx]));
+}
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsSubtractTest1){
+  TypeParam v1 = TypeParam(46865.35435, 534544.534523);
+  TypeParam v2 = TypeParam(13784.34, 54543543.1243);
+  unsigned int idx = 73243;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array -= this->Array2;
+  EXPECT_FLOAT_EQ(real(v1-v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1-v2),imag(this->Array[idx]));
+}
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsSubtractTest2){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,3.23);
+  unsigned int idx = 1295;
+  fill(&this->Array,v1);
+  this->Array -= v2;
+  EXPECT_FLOAT_EQ(real(v1-v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1-v2),imag(this->Array[idx]));
+}
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsSubtractTest3){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,0.0);
+  unsigned int idx = 1295;
+  fill(&this->Array,v1);
+  this->Array -= real(v2);
+  EXPECT_FLOAT_EQ(real(v1-v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1-v2),imag(this->Array[idx]));
+}
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsMultiplyTest1){
+  TypeParam v1 = TypeParam(46865.35435, 534544.534523);
+  TypeParam v2 = TypeParam(13784.34, 54543543.1243);
+  unsigned int idx = 73243;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array *= this->Array2;
+  EXPECT_FLOAT_EQ(real(v1*v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1*v2),imag(this->Array[idx]));
+}
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsMultiplyTest2){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,3.23);
+  unsigned int idx = 1295;
+  fill(&this->Array,v1);
+  this->Array *= v2;
+  EXPECT_FLOAT_EQ(real(v1*v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1*v2),imag(this->Array[idx]));
+}
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsMultiplyTest3){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,0.0);
+  unsigned int idx = 1295;
+  fill(&this->Array,v1);
+  this->Array *= real(v2);
+  EXPECT_FLOAT_EQ(real(v1*v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1*v2),imag(this->Array[idx]));
+}
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsDivideTest1){
+  TypeParam v1 = TypeParam(46865.35435, 534544.534523);
+  TypeParam v2 = TypeParam(13784.34, 54543543.1243);
+  unsigned int idx = 73243;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array /= this->Array2;
+  EXPECT_FLOAT_EQ(real(v1/v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1/v2),imag(this->Array[idx]));
+}
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsDivideTest2){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,3.23);
+  unsigned int idx = 1295;
+  fill(&this->Array,v1);
+  this->Array /= v2;
+  EXPECT_FLOAT_EQ(real(v1/v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1/v2),imag(this->Array[idx]));
+}
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsDivideTest3){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,0.0);
+  unsigned int idx = 1295;
+  fill(&this->Array,v1);
+  this->Array /= real(v2);
+  EXPECT_FLOAT_EQ(real(v1/v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1/v2),imag(this->Array[idx]));
+}
diff --git a/test/cuNDArray_test.cpp b/test/cuNDArray_test.cpp
new file mode 100644
index 0000000..9e7b69f
--- /dev/null
+++ b/test/cuNDArray_test.cpp
@@ -0,0 +1,83 @@
+/*
+ * cuNDArray_test.cpp
+ *
+ *  Created on: Mar 1, 2013
+ *      Author: Dae
+ */
+
+
+#include "gtest/gtest.h"
+#include "cuNDArray.h"
+#include <vector>
+
+using namespace Gadgetron;
+using testing::Types;
+
+
+template <typename T> class cuNDArray_Test : public ::testing::Test {
+	protected:
+	 virtual void SetUp() {
+		 unsigned int vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+		 dims= std::vector<unsigned int>(vdims,vdims+sizeof(vdims)/sizeof(unsigned int));
+		 Array =cuNDArray<T>(&dims);
+		 Array2 =cuNDArray<T>(&dims);
+
+
+	}
+	 std::vector<unsigned int> dims;
+	 cuNDArray<T> Array;
+	 cuNDArray<T> Array2;
+
+};
+
+typedef Types<float,double,float_complext,double_complext> Implementations;
+
+TYPED_TEST_CASE(cuNDArray_Test, Implementations);
+
+TYPED_TEST(cuNDArray_Test,fillTest){
+	this->Array.fill(TypeParam(1));
+	TypeParam res = this->Array.get_device_ptr()[5];
+	EXPECT_FLOAT_EQ(1,real(res));
+	this->Array.fill(TypeParam(27));
+	res = this->Array.get_device_ptr()[42];
+	EXPECT_FLOAT_EQ(27,real(res));
+}
+
+
+TYPED_TEST(cuNDArray_Test,clearTest){
+	this->Array.fill(TypeParam(1));
+	TypeParam res = this->Array.get_device_ptr()[5];
+	EXPECT_FLOAT_EQ(1,real(res));
+	this->Array.clear();
+	res = this->Array.get_device_ptr()[5];
+	EXPECT_FLOAT_EQ(0,real(res));
+}
+
+TYPED_TEST(cuNDArray_Test,equalsMultiplyTest){
+	this->Array.fill(TypeParam(2));
+	this->Array2.fill(TypeParam(4));
+	this->Array *= this->Array2;
+	TypeParam res = this->Array.get_device_ptr()[105];
+	EXPECT_FLOAT_EQ(8,real(res));
+
+}
+
+TYPED_TEST(cuNDArray_Test,absTest){
+	this->Array.fill(TypeParam(2.2));
+	this->Array.abs();
+	TypeParam res = this->Array.get_device_ptr()[121];
+	EXPECT_FLOAT_EQ(real(res),2.2);
+	this->Array.fill(TypeParam(-2.2));
+	this->Array.abs();
+	res = this->Array.get_device_ptr()[121];
+	EXPECT_FLOAT_EQ(real(res),2.2);
+}
+
+
+TYPED_TEST(cuNDArray_Test,sqrtTest){
+	this->Array.fill(TypeParam(12.1));
+	this->Array.sqrt();
+	TypeParam res = this->Array.get_device_ptr()[121];
+	EXPECT_FLOAT_EQ(real(res),3.478505426);
+
+}
diff --git a/test/cuNDArray_utils_test.cpp b/test/cuNDArray_utils_test.cpp
new file mode 100644
index 0000000..e6c5e0e
--- /dev/null
+++ b/test/cuNDArray_utils_test.cpp
@@ -0,0 +1,241 @@
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "cuNDArray_blas.h"
+#include "cuNDArray_elemwise.h"
+#include "complext.h"
+
+#include <gtest/gtest.h>
+#include <complex>
+#include <vector>
+#include "vector_td_utilities.h"
+
+using namespace Gadgetron;
+using testing::Types;
+
+template <typename T> class cuNDArray_utils_TestReal : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    Array = cuNDArray<T>(&dims);
+    Array2 = cuNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  cuNDArray<T> Array;
+  cuNDArray<T> Array2;
+};
+
+template <typename T> class cuNDArray_utils_TestCplx : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    Array = cuNDArray<T>(&dims);
+    Array2 = cuNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  cuNDArray<T> Array;
+  cuNDArray<T> Array2;
+};
+
+typedef Types<float, double> realImplementations;
+typedef Types</*std::complex<float>, std::complex<double>,*/ float_complext, double_complext> cplxImplementations;
+
+TYPED_TEST_CASE(cuNDArray_utils_TestReal, realImplementations);
+
+TYPED_TEST(cuNDArray_utils_TestReal,permuteTest){
+
+  fill(&this->Array,TypeParam(1));
+
+  std::vector<size_t> order;
+  order.push_back(0); order.push_back(1); order.push_back(2); order.push_back(3);
+  
+  TypeParam tmp(2);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[37], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+
+  EXPECT_FLOAT_EQ(1, permute(&this->Array,&order)->at(0));
+  EXPECT_FLOAT_EQ(2, permute(&this->Array,&order)->at(37));
+
+  order.clear();
+  order.push_back(1); order.push_back(0); order.push_back(2); order.push_back(3);
+
+  EXPECT_FLOAT_EQ(2, permute(&this->Array,&order)->at(1));
+
+  order.clear();
+  order.push_back(3); order.push_back(1); order.push_back(2); order.push_back(0);
+
+  EXPECT_FLOAT_EQ(2, permute(&this->Array,&order)->at(19));
+
+  order.clear();
+  order.push_back(2); order.push_back(0); order.push_back(1); order.push_back(3);
+
+  EXPECT_FLOAT_EQ(2, permute(&this->Array,&order)->at(851));
+}
+
+TYPED_TEST(cuNDArray_utils_TestReal,shiftDimTest){
+
+  fill(&this->Array,TypeParam(1));
+
+  TypeParam tmp(2);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[37], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+
+  EXPECT_FLOAT_EQ(1, shift_dim(&this->Array,0)->at(0));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,0)->at(37));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,1)->at(1));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,-1)->at(37*19));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,2)->at(23*37*19));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,3)->at(37*19));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,4)->at(37));
+}
+
+TYPED_TEST(cuNDArray_utils_TestReal,sumTest){
+  TypeParam v1 = TypeParam(12.34);
+  unsigned int idx = 0;
+
+  fill(&this->Array,v1);
+  EXPECT_FLOAT_EQ(49*v1,sum(&this->Array,1)->at(idx));
+
+  fill(&this->Array,v1);
+  EXPECT_FLOAT_EQ(23*v1,sum(&this->Array,2)->at(idx));
+
+  fill(&this->Array,v1);
+  EXPECT_FLOAT_EQ(19*v1,sum(&this->Array,3)->at(idx));
+}
+
+
+TYPED_TEST(cuNDArray_utils_TestReal,meanTest){
+  TypeParam v1 = TypeParam(12.34);
+  unsigned int idx = 0;
+
+  fill(&this->Array,v1);
+  EXPECT_FLOAT_EQ(v1,mean(&this->Array));
+
+}
+TYPED_TEST_CASE(cuNDArray_utils_TestCplx, cplxImplementations);
+
+
+
+TYPED_TEST(cuNDArray_utils_TestCplx,meanTest){
+  TypeParam v1 = TypeParam(12.34);
+  unsigned int idx = 0;
+
+  fill(&this->Array,v1);
+  EXPECT_FLOAT_EQ(real(v1),real(mean(&this->Array)));
+
+}
+
+TYPED_TEST(cuNDArray_utils_TestCplx,permuteTest){
+  
+  fill(&this->Array,TypeParam(1,1));
+
+  std::vector<size_t> order;
+  order.push_back(0); order.push_back(1); order.push_back(2); order.push_back(3);
+  
+  TypeParam tmp(2,3);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[37], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+
+  EXPECT_FLOAT_EQ(1, real(permute(&this->Array,&order)->at(0)));
+  EXPECT_FLOAT_EQ(1, imag(permute(&this->Array,&order)->at(0)));
+
+  EXPECT_FLOAT_EQ(2, real(permute(&this->Array,&order)->at(37)));
+  EXPECT_FLOAT_EQ(3, imag(permute(&this->Array,&order)->at(37)));
+
+  order.clear();
+  order.push_back(1); order.push_back(0); order.push_back(2); order.push_back(3);
+
+  EXPECT_FLOAT_EQ(2, real(permute(&this->Array,&order)->at(1)));
+  EXPECT_FLOAT_EQ(3, imag(permute(&this->Array,&order)->at(1)));
+
+  order.clear();
+  order.push_back(3); order.push_back(1); order.push_back(2); order.push_back(0);
+
+  EXPECT_FLOAT_EQ(2, real(permute(&this->Array,&order)->at(19)));
+  EXPECT_FLOAT_EQ(3, imag(permute(&this->Array,&order)->at(19)));
+
+  order.clear();
+  order.push_back(2); order.push_back(0); order.push_back(1); order.push_back(3);
+
+  EXPECT_FLOAT_EQ(2, real(permute(&this->Array,&order)->at(851)));
+  EXPECT_FLOAT_EQ(3, imag(permute(&this->Array,&order)->at(851)));
+}
+
+TYPED_TEST(cuNDArray_utils_TestCplx,shiftDimTest){
+
+  fill(&this->Array,TypeParam(1,1));
+
+  TypeParam tmp(2,3);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[37], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+
+  EXPECT_FLOAT_EQ(1, real(shift_dim(&this->Array,0)->at(0)));
+  EXPECT_FLOAT_EQ(1, imag(shift_dim(&this->Array,0)->at(0)));
+
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,0)->at(37)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,0)->at(37)));
+
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,1)->at(1)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,1)->at(1)));
+
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,-1)->at(37*19)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,-1)->at(37*19)));
+
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,2)->at(23*37*19)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,2)->at(23*37*19)));
+
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,3)->at(37*19)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,3)->at(37*19)));
+
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,4)->at(37)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,4)->at(37)));
+}
+
+TYPED_TEST(cuNDArray_utils_TestCplx,sumTest){
+  TypeParam v1 = TypeParam(12.34, 56.78);
+  unsigned int idx = 0;
+
+  fill(&this->Array,v1);
+  EXPECT_FLOAT_EQ(real(TypeParam(49)*v1),real(sum(&this->Array,1)->at(idx)));
+  EXPECT_FLOAT_EQ(imag(TypeParam(49)*v1),imag(sum(&this->Array,1)->at(idx)));
+
+  fill(&this->Array,v1);
+  EXPECT_FLOAT_EQ(real(TypeParam(23)*v1),real(sum(&this->Array,2)->at(idx)));
+  EXPECT_FLOAT_EQ(imag(TypeParam(23)*v1),imag(sum(&this->Array,2)->at(idx)));
+
+  fill(&this->Array,v1);
+  EXPECT_FLOAT_EQ(real(TypeParam(19)*v1),real(sum(&this->Array,3)->at(idx)));
+  EXPECT_FLOAT_EQ(imag(TypeParam(19)*v1),imag(sum(&this->Array,3)->at(idx)));
+}
+
+TYPED_TEST(cuNDArray_utils_TestCplx,padTest){
+  TypeParam v1 = TypeParam(12.34, 56.78);
+  unsigned int idx = 0;
+
+  fill(&this->Array,v1);
+
+  vector_td<size_t,4> size = from_std_vector<size_t,4>(this->dims);
+  size *= 2;
+
+  boost::shared_ptr<cuNDArray<TypeParam> > out = pad<TypeParam,4>(size,&this->Array);
+
+  double scale = std::pow(2.0,4);
+  EXPECT_EQ(out->get_number_of_elements(),this->Array.get_number_of_elements()*scale);
+  EXPECT_FLOAT_EQ(real(mean(out.get()))*scale,real(mean(&this->Array)));
+  EXPECT_FLOAT_EQ(imag(mean(out.get()))*scale,imag(mean(&this->Array)));
+}
+
+
+TEST(padTest,largeSize){
+// So, this test is mainly here because pad apparently fails for large sized arrays.
+	size_t vdims[] = {192,192,50};
+	std::vector<size_t> dims(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+	size_t vdims2[] = {256,256,256};
+	std::vector<size_t> dims2(vdims2,vdims2+sizeof(vdims2)/sizeof(size_t));
+
+	cuNDArray<float_complext> in(&dims);
+	fill(&in,float_complext(1));
+	cuNDArray<float_complext> out(&dims2);
+
+	pad<float_complext,3>(&in,&out);
+
+	EXPECT_FLOAT_EQ(nrm2(&in),nrm2(&out));
+
+}
diff --git a/test/cuNDFFT_test.cpp b/test/cuNDFFT_test.cpp
new file mode 100644
index 0000000..e682fda
--- /dev/null
+++ b/test/cuNDFFT_test.cpp
@@ -0,0 +1,47 @@
+#include "cuNDFFT.h"
+#include "cuNDArray_math.h"
+#include "complext.h"
+#include <gtest/gtest.h>
+#include <boost/random.hpp>
+
+using namespace Gadgetron;
+using testing::Types;
+
+template<typename REAL> class cuNDFFT_test : public ::testing::Test {
+protected:
+	virtual void SetUp(){
+		boost::random::mt19937 rng;
+		boost::random::uniform_real_distribution<REAL> uni(0,1);
+		std::vector<size_t > dimensions(3,128);
+
+		hoNDArray<complext<REAL> > tmp(dimensions);
+		complext<REAL>* data = tmp.get_data_ptr();
+
+		for (size_t i = 0; i < tmp.get_number_of_elements(); i++)
+			data[i] = complext<REAL>(uni(rng),uni(rng));
+
+		Array = cuNDArray<complext<REAL> >(tmp);
+		Array2 = Array;
+	}
+
+	cuNDArray<complext<REAL> > Array;
+
+	cuNDArray<complext<REAL> > Array2;
+
+};
+typedef Types<float, double> realImplementations;
+TYPED_TEST_CASE(cuNDFFT_test, realImplementations);
+
+TYPED_TEST(cuNDFFT_test,fftNrm2Test){
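+	// The FFT is norm-preserving (Parseval), so the l2-norms before and after should agree
+	// up to rounding error.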
+	cuNDFFT<TypeParam>::instance()->fft(&this->Array);
+
+	EXPECT_NEAR(nrm2(&this->Array2),nrm2(&this->Array),nrm2(&this->Array)*1e-3);
+
+}
+
+TYPED_TEST(cuNDFFT_test,ifftNrm2Test){
+	cuNDFFT<TypeParam>::instance()->ifft(&this->Array);
+
+	EXPECT_NEAR(nrm2(&this->Array2),nrm2(&this->Array),nrm2(&this->Array)*1e-3);
+
+}
diff --git a/test/cuVector_td_test_kernels.cu b/test/cuVector_td_test_kernels.cu
new file mode 100644
index 0000000..c10485d
--- /dev/null
+++ b/test/cuVector_td_test_kernels.cu
@@ -0,0 +1,237 @@
+#include "cuVector_td_test_kernels.h"
+#include "check_CUDA.h"
+#include "vector_td_utilities.h"
+#include "cuNDArray.h"
+#include "cudaDeviceManager.h"
+#include "thrust/device_vector.h"
+
+
+using namespace Gadgetron;
+template<class T, unsigned int D> __global__ void abs_kernel(vector_td<T,D>* data, unsigned int size){
+	 const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+	 if (idx < size) data[idx] = abs(data[idx]);
+}
+
+
+template<class T, unsigned int D> void Gadgetron::test_abs(cuNDArray< vector_td<T,D> >* data){
+
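+	// Launch configuration: one thread per element, with the grid size rounded up to cover all elements.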
+	dim3 dimBlock(std::min(cudaDeviceManager::Instance()->max_blockdim(),(int)data->get_number_of_elements()));
+	dim3 dimGrid((data->get_number_of_elements()+dimBlock.x-1)/dimBlock.x);
+	abs_kernel<<<dimGrid,dimBlock>>>(data->get_data_ptr(),data->get_number_of_elements());
+	cudaThreadSynchronize();
+	CHECK_FOR_CUDA_ERROR();
+}
+
+
+template<typename T, unsigned int D>
+struct test_norm_functor : public thrust::unary_function<T,vector_td<T,D> >
+{
+ __host__ __device__ T operator()(const vector_td<T,D> &x) const {return norm(x);}
+};
+template<class T, unsigned int D> thrust::device_vector<T> Gadgetron::test_norm(cuNDArray< vector_td<T,D> >* data){
+
+	thrust::device_vector<T> out(data->get_number_of_elements());
+	thrust::transform(data->begin(),data->end(),out.begin(),test_norm_functor<T,D>());
+	cudaThreadSynchronize();
+	CHECK_FOR_CUDA_ERROR();
+	return out;
+}
+
+
+
+template<typename T, unsigned int D>
+struct test_min_functor : public thrust::unary_function<T,vector_td<T,D> >
+{
+ __host__ __device__ T operator()(const vector_td<T,D> &x) const {return min(x);}
+};
+template<class T, unsigned int D> thrust::device_vector<T> Gadgetron::test_min(cuNDArray< vector_td<T,D> >* data){
+
+	thrust::device_vector<T> out(data->get_number_of_elements());
+	thrust::transform(data->begin(),data->end(),out.begin(),test_min_functor<T,D>());
+	cudaThreadSynchronize();
+	CHECK_FOR_CUDA_ERROR();
+	return out;
+}
+
+
+template<typename T, unsigned int D>
+struct test_max_functor : public thrust::unary_function<T,vector_td<T,D> >
+{
+ __host__ __device__ T operator()(const vector_td<T,D> &x) const {return max(x);}
+};
+template<class T, unsigned int D> thrust::device_vector<T> Gadgetron::test_max(cuNDArray< vector_td<T,D> >* data){
+
+	thrust::device_vector<T> out(data->get_number_of_elements());
+	thrust::transform(data->begin(),data->end(),out.begin(),test_max_functor<T,D>());
+	cudaThreadSynchronize();
+	CHECK_FOR_CUDA_ERROR();
+	return out;
+}
+
+template<typename T, unsigned int D>
+struct test_amin_functor : public thrust::binary_function<vector_td<T,D>, vector_td<T,D>, vector_td<T,D> >
+{
+	__host__ __device__ vector_td<T,D> operator()(const vector_td<T,D> &x, const vector_td<T,D> &y) const {return amin(x,y);}
+
+};
+
+template<class T, unsigned int D> boost::shared_ptr<cuNDArray<vector_td<T,D> > > Gadgetron::test_amin(cuNDArray< vector_td<T,D> >* data1, cuNDArray< vector_td<T,D> >* data2){
+	boost::shared_ptr<cuNDArray<vector_td<T,D> > > out( new cuNDArray<vector_td<T,D> >(data1->get_dimensions()));
+	thrust::transform(data1->begin(),data1->end(),data2->begin(),out->begin(),test_amin_functor<T,D>());
+	return out;
+}
+
+
+template<typename T, unsigned int D>
+struct test_amax_functor : public thrust::binary_function<vector_td<T,D>, vector_td<T,D>, vector_td<T,D> >
+{
+	__host__ __device__ vector_td<T,D> operator()(const vector_td<T,D> &x, const vector_td<T,D> &y) const {return amax(x,y);}
+
+};
+
+template<class T, unsigned int D> boost::shared_ptr<cuNDArray<vector_td<T,D> > > Gadgetron::test_amax(cuNDArray< vector_td<T,D> >* data1, cuNDArray< vector_td<T,D> >* data2){
+	boost::shared_ptr<cuNDArray<vector_td<T,D> > > out( new cuNDArray<vector_td<T,D> >(data1->get_dimensions()));
+	thrust::transform(data1->begin(),data1->end(),data2->begin(),out->begin(),test_amax_functor<T,D>());
+	return out;
+}
+
+template<typename T, unsigned int D>
+class test_amin2_functor : public thrust::unary_function<vector_td<T,D>, vector_td<T,D> >
+{
+public:
+	test_amin2_functor(T _val): val(_val){};
+	__host__ __device__ vector_td<T,D> operator()(const vector_td<T,D> &x) const {return amin(x,val);}
+	T val;
+};
+
+template<class T, unsigned int D> boost::shared_ptr<cuNDArray<vector_td<T,D> > > Gadgetron::test_amin2(cuNDArray< vector_td<T,D> >* data1, T val){
+	boost::shared_ptr<cuNDArray<vector_td<T,D> > > out( new cuNDArray<vector_td<T,D> >(data1->get_dimensions()));
+	thrust::transform(data1->begin(),data1->end(),out->begin(),test_amin2_functor<T,D>(val));
+	return out;
+}
+
+
+template<typename T, unsigned int D>
+class test_amax2_functor : public thrust::unary_function<vector_td<T,D>, vector_td<T,D> >
+{
+public:
+	test_amax2_functor(T _val): val(_val){};
+	__host__ __device__ vector_td<T,D> operator()(const vector_td<T,D> &x) const {return amax(x,val);}
+	T val;
+};
+
+template<class T, unsigned int D> boost::shared_ptr<cuNDArray<vector_td<T,D> > > Gadgetron::test_amax2(cuNDArray< vector_td<T,D> >* data1, T val){
+	boost::shared_ptr<cuNDArray<vector_td<T,D> > > out( new cuNDArray<vector_td<T,D> >(data1->get_dimensions()));
+	thrust::transform(data1->begin(),data1->end(),out->begin(),test_amax2_functor<T,D>(val));
+	return out;
+}
+
+
+
+template<class T, unsigned int D> void Gadgetron::vector_fill(cuNDArray< vector_td<T,D> >* data,  vector_td<T,D> val){
+	thrust::fill(data->begin(),data->end(),val);
+}
+
+
+template void Gadgetron::test_abs<float,1>(cuNDArray< vector_td<float,1> > *);
+template void Gadgetron::test_abs<float,2>(cuNDArray< vector_td<float,2> > *);
+template  void Gadgetron::test_abs<float,3>(cuNDArray< vector_td<float,3> > *);
+template  void Gadgetron::test_abs<float,4>(cuNDArray< vector_td<float,4> > *);
+
+template  void Gadgetron::test_abs<double,1>(cuNDArray< vector_td<double,1> > *);
+template void Gadgetron::test_abs<double,2>(cuNDArray< vector_td<double,2> > *);
+template void Gadgetron::test_abs<double,3>(cuNDArray< vector_td<double,3> > *);
+template void Gadgetron::test_abs<double,4>(cuNDArray< vector_td<double,4> > *);
+
+
+template thrust::device_vector<float> Gadgetron::test_norm<float,1>(cuNDArray< vector_td<float,1> > *);
+template thrust::device_vector<float> Gadgetron::test_norm<float,2>(cuNDArray< vector_td<float,2> > *);
+template  thrust::device_vector<float> Gadgetron::test_norm<float,3>(cuNDArray< vector_td<float,3> > *);
+template  thrust::device_vector<float> Gadgetron::test_norm<float,4>(cuNDArray< vector_td<float,4> > *);
+
+template  thrust::device_vector<double> Gadgetron::test_norm<double,1>(cuNDArray< vector_td<double,1> > *);
+template thrust::device_vector<double> Gadgetron::test_norm<double,2>(cuNDArray< vector_td<double,2> > *);
+template thrust::device_vector<double> Gadgetron::test_norm<double,3>(cuNDArray< vector_td<double,3> > *);
+template thrust::device_vector<double> Gadgetron::test_norm<double,4>(cuNDArray< vector_td<double,4> > *);
+
+
+template thrust::device_vector<float> Gadgetron::test_min<float,1>(cuNDArray< vector_td<float,1> > *);
+template thrust::device_vector<float> Gadgetron::test_min<float,2>(cuNDArray< vector_td<float,2> > *);
+template  thrust::device_vector<float> Gadgetron::test_min<float,3>(cuNDArray< vector_td<float,3> > *);
+template  thrust::device_vector<float> Gadgetron::test_min<float,4>(cuNDArray< vector_td<float,4> > *);
+
+template  thrust::device_vector<double> Gadgetron::test_min<double,1>(cuNDArray< vector_td<double,1> > *);
+template thrust::device_vector<double> Gadgetron::test_min<double,2>(cuNDArray< vector_td<double,2> > *);
+template thrust::device_vector<double> Gadgetron::test_min<double,3>(cuNDArray< vector_td<double,3> > *);
+template thrust::device_vector<double> Gadgetron::test_min<double,4>(cuNDArray< vector_td<double,4> > *);
+
+
+template thrust::device_vector<float> Gadgetron::test_max<float,1>(cuNDArray< vector_td<float,1> > *);
+template thrust::device_vector<float> Gadgetron::test_max<float,2>(cuNDArray< vector_td<float,2> > *);
+template  thrust::device_vector<float> Gadgetron::test_max<float,3>(cuNDArray< vector_td<float,3> > *);
+template  thrust::device_vector<float> Gadgetron::test_max<float,4>(cuNDArray< vector_td<float,4> > *);
+
+template  thrust::device_vector<double> Gadgetron::test_max<double,1>(cuNDArray< vector_td<double,1> > *);
+template thrust::device_vector<double> Gadgetron::test_max<double,2>(cuNDArray< vector_td<double,2> > *);
+template thrust::device_vector<double> Gadgetron::test_max<double,3>(cuNDArray< vector_td<double,3> > *);
+template thrust::device_vector<double> Gadgetron::test_max<double,4>(cuNDArray< vector_td<double,4> > *);
+
+
+
+template boost::shared_ptr<cuNDArray<vector_td<float,1> > > Gadgetron::test_amin<float,1>(cuNDArray< vector_td<float,1> > *,cuNDArray< vector_td<float,1> > *);
+template boost::shared_ptr<cuNDArray<vector_td<float,2> > > Gadgetron::test_amin<float,2>(cuNDArray< vector_td<float,2> > *, cuNDArray< vector_td<float,2> > *);
+template  boost::shared_ptr<cuNDArray<vector_td<float,3> > > Gadgetron::test_amin<float,3>(cuNDArray< vector_td<float,3> > *, cuNDArray< vector_td<float,3> > *);
+template  boost::shared_ptr<cuNDArray<vector_td<float,4> > > Gadgetron::test_amin<float,4>(cuNDArray< vector_td<float,4> > *, cuNDArray< vector_td<float,4> > *);
+
+template  boost::shared_ptr<cuNDArray<vector_td<double,1> > > Gadgetron::test_amin<double,1>(cuNDArray< vector_td<double,1> > *, cuNDArray< vector_td<double,1> > *);
+template boost::shared_ptr<cuNDArray<vector_td<double,2> > > Gadgetron::test_amin<double,2>(cuNDArray< vector_td<double,2> > *, cuNDArray< vector_td<double,2> > *);
+template boost::shared_ptr<cuNDArray<vector_td<double,3> > > Gadgetron::test_amin<double,3>(cuNDArray< vector_td<double,3> > *, cuNDArray< vector_td<double,3> > *);
+template boost::shared_ptr<cuNDArray<vector_td<double,4> > > Gadgetron::test_amin<double,4>(cuNDArray< vector_td<double,4> > *, cuNDArray< vector_td<double,4> > *);
+
+
+
+template boost::shared_ptr<cuNDArray<vector_td<float,1> > > Gadgetron::test_amin2<float,1>(cuNDArray< vector_td<float,1> > *, float );
+template boost::shared_ptr<cuNDArray<vector_td<float,2> > > Gadgetron::test_amin2<float,2>(cuNDArray< vector_td<float,2> > *, float);
+template  boost::shared_ptr<cuNDArray<vector_td<float,3> > > Gadgetron::test_amin2<float,3>(cuNDArray< vector_td<float,3> > *, float);
+template  boost::shared_ptr<cuNDArray<vector_td<float,4> > > Gadgetron::test_amin2<float,4>(cuNDArray< vector_td<float,4> > *, float);
+
+template  boost::shared_ptr<cuNDArray<vector_td<double,1> > > Gadgetron::test_amin2<double,1>(cuNDArray< vector_td<double,1> > *, double);
+template boost::shared_ptr<cuNDArray<vector_td<double,2> > > Gadgetron::test_amin2<double,2>(cuNDArray< vector_td<double,2> > *, double);
+template boost::shared_ptr<cuNDArray<vector_td<double,3> > > Gadgetron::test_amin2<double,3>(cuNDArray< vector_td<double,3> > *, double);
+template boost::shared_ptr<cuNDArray<vector_td<double,4> > > Gadgetron::test_amin2<double,4>(cuNDArray< vector_td<double,4> > *, double);
+
+
+
+template boost::shared_ptr<cuNDArray<vector_td<float,1> > > Gadgetron::test_amax<float,1>(cuNDArray< vector_td<float,1> > *,cuNDArray< vector_td<float,1> > *);
+template boost::shared_ptr<cuNDArray<vector_td<float,2> > > Gadgetron::test_amax<float,2>(cuNDArray< vector_td<float,2> > *, cuNDArray< vector_td<float,2> > *);
+template  boost::shared_ptr<cuNDArray<vector_td<float,3> > > Gadgetron::test_amax<float,3>(cuNDArray< vector_td<float,3> > *, cuNDArray< vector_td<float,3> > *);
+template  boost::shared_ptr<cuNDArray<vector_td<float,4> > > Gadgetron::test_amax<float,4>(cuNDArray< vector_td<float,4> > *, cuNDArray< vector_td<float,4> > *);
+
+template  boost::shared_ptr<cuNDArray<vector_td<double,1> > > Gadgetron::test_amax<double,1>(cuNDArray< vector_td<double,1> > *, cuNDArray< vector_td<double,1> > *);
+template boost::shared_ptr<cuNDArray<vector_td<double,2> > > Gadgetron::test_amax<double,2>(cuNDArray< vector_td<double,2> > *, cuNDArray< vector_td<double,2> > *);
+template boost::shared_ptr<cuNDArray<vector_td<double,3> > > Gadgetron::test_amax<double,3>(cuNDArray< vector_td<double,3> > *, cuNDArray< vector_td<double,3> > *);
+template boost::shared_ptr<cuNDArray<vector_td<double,4> > > Gadgetron::test_amax<double,4>(cuNDArray< vector_td<double,4> > *, cuNDArray< vector_td<double,4> > *);
+
+
+template boost::shared_ptr<cuNDArray<vector_td<float,1> > > Gadgetron::test_amax2<float,1>(cuNDArray< vector_td<float,1> > *, float );
+template boost::shared_ptr<cuNDArray<vector_td<float,2> > > Gadgetron::test_amax2<float,2>(cuNDArray< vector_td<float,2> > *, float);
+template  boost::shared_ptr<cuNDArray<vector_td<float,3> > > Gadgetron::test_amax2<float,3>(cuNDArray< vector_td<float,3> > *, float);
+template  boost::shared_ptr<cuNDArray<vector_td<float,4> > > Gadgetron::test_amax2<float,4>(cuNDArray< vector_td<float,4> > *, float);
+
+template  boost::shared_ptr<cuNDArray<vector_td<double,1> > > Gadgetron::test_amax2<double,1>(cuNDArray< vector_td<double,1> > *, double);
+template boost::shared_ptr<cuNDArray<vector_td<double,2> > > Gadgetron::test_amax2<double,2>(cuNDArray< vector_td<double,2> > *, double);
+template boost::shared_ptr<cuNDArray<vector_td<double,3> > > Gadgetron::test_amax2<double,3>(cuNDArray< vector_td<double,3> > *, double);
+template boost::shared_ptr<cuNDArray<vector_td<double,4> > > Gadgetron::test_amax2<double,4>(cuNDArray< vector_td<double,4> > *, double);
+
+
+
+template void Gadgetron::vector_fill<float,1>(cuNDArray< vector_td<float,1> > *, vector_td<float,1>);
+template void Gadgetron::vector_fill<float,2>(cuNDArray< vector_td<float,2> > *, vector_td<float,2>);
+template void Gadgetron::vector_fill<float,3>(cuNDArray< vector_td<float,3> > *, vector_td<float,3>);
+template void Gadgetron::vector_fill<float,4>(cuNDArray< vector_td<float,4> > *, vector_td<float,4>);
+
+
+template void Gadgetron::vector_fill<double,1>(cuNDArray< vector_td<double,1> > *, vector_td<double,1>);
+template void Gadgetron::vector_fill<double,2>(cuNDArray< vector_td<double,2> > *, vector_td<double,2>);
+template void Gadgetron::vector_fill<double,3>(cuNDArray< vector_td<double,3> > *, vector_td<double,3>);
+template void Gadgetron::vector_fill<double,4>(cuNDArray< vector_td<double,4> > *, vector_td<double,4>);
diff --git a/test/cuVector_td_test_kernels.h b/test/cuVector_td_test_kernels.h
new file mode 100644
index 0000000..07280fb
--- /dev/null
+++ b/test/cuVector_td_test_kernels.h
@@ -0,0 +1,18 @@
+#pragma once
+#include "vector_td.h"
+#include "cuNDArray.h"
+#include "thrust/device_vector.h"
+namespace Gadgetron{
+
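+// Thin wrappers used by the vector_td unit tests: fill a device array, apply elementwise
+// abs/norm/min/max, and compute what appear to be componentwise amin/amax against either
+// a second array or a scalar. The definitions live in the corresponding CUDA source file.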
+template<class T, unsigned int D> void vector_fill(cuNDArray< vector_td<T,D> >* data,  vector_td<T,D> val);
+template<class T, unsigned int D> void test_abs(cuNDArray< vector_td<T,D> >* data);
+template<class T, unsigned int D> thrust::device_vector<T> test_norm(cuNDArray< vector_td<T,D> >* data);
+template<class T, unsigned int D> thrust::device_vector<T> test_min(cuNDArray< vector_td<T,D> >* data);
+
+template<class T, unsigned int D> thrust::device_vector<T> test_max(cuNDArray< vector_td<T,D> >* data);
+
+template<class T, unsigned int D> boost::shared_ptr<cuNDArray<vector_td<T,D> > > test_amax(cuNDArray< vector_td<T,D> >* data1, cuNDArray< vector_td<T,D> >* data2);
+template<class T, unsigned int D> boost::shared_ptr<cuNDArray<vector_td<T,D> > > test_amin(cuNDArray< vector_td<T,D> >* data1, cuNDArray< vector_td<T,D> >* data2);
+template<class T, unsigned int D> boost::shared_ptr<cuNDArray<vector_td<T,D> > > test_amin2(cuNDArray< vector_td<T,D> >* data, T val);
+template<class T, unsigned int D> boost::shared_ptr<cuNDArray<vector_td<T,D> > > test_amax2(cuNDArray< vector_td<T,D> >* data, T val);
+}
diff --git a/test/hoCuGTBLAS_test.cpp b/test/hoCuGTBLAS_test.cpp
new file mode 100644
index 0000000..78cdce7
--- /dev/null
+++ b/test/hoCuGTBLAS_test.cpp
@@ -0,0 +1,80 @@
+/*
+ * hoCuGTBLAS_test.cpp
+ *
+ *  Created on: Feb 28, 2013
+ *      Author: Dae
+ */
+#include "gtest/gtest.h"
+#include "hoCuNDArray_blas.h"
+#include <vector>
+using namespace Gadgetron;
+using testing::Types;
+template <typename T> class hoCuGTBLAS_Test : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    unsigned int vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+    dims = std::vector<unsigned int>(vdims,vdims+sizeof(vdims)/sizeof(unsigned int));
+    Array = hoCuNDArray<T>(&dims);
+    Array2 = hoCuNDArray<T>(&dims);
+  }
+  std::vector<unsigned int> dims;
+  hoCuNDArray<T> Array;
+  hoCuNDArray<T> Array2;
+};
+
+typedef Types<float,double,float_complext,double_complext> Implementations;
+
+TYPED_TEST_CASE(hoCuGTBLAS_Test, Implementations);
+
+
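+// Level-1 BLAS semantics on hoCuNDArray for all four element types: dot, axpy, nrm2,
+// asum, and the index-returning amin/amax. Inputs are constant fills plus a few
+// hand-edited elements, so every expected value can be computed in closed form.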
+TYPED_TEST(hoCuGTBLAS_Test,dotTest){
+	this->Array.fill(TypeParam(1));
+	EXPECT_FLOAT_EQ(this->Array.get_number_of_elements(),real(dot(&this->Array,&this->Array)));
+
+	this->Array2.fill(TypeParam(2));
+	EXPECT_FLOAT_EQ(this->Array.get_number_of_elements()*2,real(dot(&this->Array,&this->Array2)));
+}
+
+TYPED_TEST(hoCuGTBLAS_Test,axpyTest){
+	this->Array.fill(TypeParam(71));
+	this->Array2.fill(TypeParam(97));
+	axpy(TypeParam(11),&this->Array,&this->Array2);
+
+	TypeParam val = this->Array2.get_data_ptr()[10];
+	EXPECT_FLOAT_EQ(878,real(val));
+
+}
+
+TYPED_TEST(hoCuGTBLAS_Test,nrm2Test){
+	this->Array.fill(TypeParam(1));
+	EXPECT_FLOAT_EQ(std::sqrt((double)this->Array.get_number_of_elements()),nrm2(&this->Array));
+	this->Array.fill(TypeParam(3));
+	EXPECT_FLOAT_EQ(std::sqrt(3.0*3.0*this->Array.get_number_of_elements()),nrm2(&this->Array));
+}
+
+TYPED_TEST(hoCuGTBLAS_Test,asumTest){
+	this->Array.fill(TypeParam(1));
+	EXPECT_FLOAT_EQ(this->Array.get_number_of_elements(),real(asum(&this->Array)));
+	this->Array.fill(TypeParam(-3));
+	EXPECT_FLOAT_EQ(this->Array.get_number_of_elements()*3,real(asum(&this->Array)));
+}
+
+TYPED_TEST(hoCuGTBLAS_Test,aminTest){
+	this->Array.fill(TypeParam(100));
+	this->Array.get_data_ptr()[23]=TypeParam(-50);
+	EXPECT_EQ(23,amin(&this->Array));
+	this->Array.get_data_ptr()[48]=TypeParam(2);
+	EXPECT_EQ(48,amin(&this->Array));
+
+}
+TYPED_TEST(hoCuGTBLAS_Test,amaxTest){
+	this->Array.fill(TypeParam(1));
+	this->Array.get_data_ptr()[23]=TypeParam(2);
+	EXPECT_EQ(23,amax(&this->Array));
+	this->Array.get_data_ptr()[48]=TypeParam(-50);
+	EXPECT_EQ(48,amax(&this->Array));
+
+}
diff --git a/test/hoCuNDArray_elemwise_test.cpp b/test/hoCuNDArray_elemwise_test.cpp
new file mode 100644
index 0000000..d09ca8c
--- /dev/null
+++ b/test/hoCuNDArray_elemwise_test.cpp
@@ -0,0 +1,144 @@
+#include "hoCuNDArray_blas.h"
+#include "hoCuNDArray_elemwise.h"
+#include <gtest/gtest.h>
+#include <vector>
+
+using namespace Gadgetron;
+using testing::Types;
+
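+// Despite the _elemwise_ file name, the fixtures below (hoCuNDArray_blas_Real / _Cplx)
+// exercise the same BLAS wrappers as hoCuGTBLAS_test.cpp, only through the free fill()
+// from hoCuNDArray_elemwise.h instead of the array's member fill().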
+template <typename T> class hoCuNDArray_blas_Real : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    unsigned int vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+    dims = std::vector<unsigned int>(vdims,vdims+sizeof(vdims)/sizeof(unsigned int));
+    Array = hoCuNDArray<T>(&dims);
+    Array2 = hoCuNDArray<T>(&dims);
+  }
+  std::vector<unsigned int> dims;
+  hoCuNDArray<T> Array;
+  hoCuNDArray<T> Array2;
+};
+
+typedef Types<float, double> realImplementations;
+
+TYPED_TEST_CASE(hoCuNDArray_blas_Real, realImplementations);
+
+TYPED_TEST(hoCuNDArray_blas_Real,dotTest){
+  fill(&this->Array,TypeParam(1));
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements(),real(dot(&this->Array,&this->Array)));
+  fill(&this->Array2,TypeParam(2));
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements()*2,real(dot(&this->Array,&this->Array2)));
+}
+
+TYPED_TEST(hoCuNDArray_blas_Real,axpyTest){
+  fill(&this->Array,TypeParam(71));
+  fill(&this->Array2,TypeParam(97));
+  axpy(TypeParam(11),&this->Array,&this->Array2);
+  TypeParam val = this->Array2.get_data_ptr()[10];
+  EXPECT_FLOAT_EQ(878,real(val));
+}
+
+TYPED_TEST(hoCuNDArray_blas_Real,nrm2Test){
+  fill(&this->Array,TypeParam(1));
+  EXPECT_FLOAT_EQ(std::sqrt((double)this->Array.get_number_of_elements()),nrm2(&this->Array));
+  fill(&this->Array,TypeParam(3));
+  EXPECT_FLOAT_EQ(std::sqrt(3.0*3.0*this->Array.get_number_of_elements()),nrm2(&this->Array));
+}
+
+TYPED_TEST(hoCuNDArray_blas_Real,asumTest){
+  fill(&this->Array,TypeParam(1));
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements(),real(asum(&this->Array)));
+  fill(&this->Array,TypeParam(-3));
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements()*3,real(asum(&this->Array)));
+}
+
+TYPED_TEST(hoCuNDArray_blas_Real,aminTest){
+  fill(&this->Array,TypeParam(100));
+  this->Array.get_data_ptr()[23]=TypeParam(-50);
+  EXPECT_EQ(23,amin(&this->Array));
+  this->Array.get_data_ptr()[48]=TypeParam(2);
+  EXPECT_EQ(48,amin(&this->Array));
+}
+
+TYPED_TEST(hoCuNDArray_blas_Real,amaxTest){
+  fill(&this->Array,TypeParam(1));
+  this->Array.get_data_ptr()[23]=TypeParam(2);
+  EXPECT_EQ(23,amax(&this->Array));
+  this->Array.get_data_ptr()[48]=TypeParam(-50);
+  EXPECT_EQ(48,amax(&this->Array));
+}
+
+
+template <typename T> class hoCuNDArray_blas_Cplx : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    unsigned int vdims[] = {37, 49}; //Using prime numbers for setup because they are messy
+    dims = std::vector<unsigned int>(vdims,vdims+sizeof(vdims)/sizeof(unsigned int));
+    Array = hoCuNDArray<T>(&dims);
+    Array2 = hoCuNDArray<T>(&dims);
+  }
+  std::vector<unsigned int> dims;
+  hoCuNDArray<T> Array;
+  hoCuNDArray<T> Array2;
+};
+
+typedef Types<float_complext, double_complext> cplxImplementations;
+
+TYPED_TEST_CASE(hoCuNDArray_blas_Cplx, cplxImplementations);
+
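+// The expected values below encode a conjugate-linear dot product: for constant arrays,
+// dot(x,y) is checked against N * conj(x0) * y0, e.g. conj(1+i) * (2+2i) = 4 with zero
+// imaginary part.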
+TYPED_TEST(hoCuNDArray_blas_Cplx,dotTest){
+  fill(&this->Array,TypeParam(1,1));
+  TypeParam res = dot(&this->Array,&this->Array);
+  EXPECT_FLOAT_EQ(real(TypeParam(1,-1)*TypeParam(1,1))*this->Array.get_number_of_elements(),real(res));
+  EXPECT_FLOAT_EQ(0,imag(res));
+  fill(&this->Array2,TypeParam(2,2));
+  res = dot(&this->Array2,&this->Array2);
+  EXPECT_FLOAT_EQ(real(TypeParam(2,-2)*TypeParam(2,2))*this->Array.get_number_of_elements(),real(res));
+  EXPECT_FLOAT_EQ(0,imag(res));
+  res = dot(&this->Array,&this->Array2);
+  EXPECT_FLOAT_EQ(real(TypeParam(1,-1)*TypeParam(2,2))*this->Array.get_number_of_elements(),real(res));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1,-1)*TypeParam(2,2))*this->Array.get_number_of_elements(),imag(res));
+}
+
+TYPED_TEST(hoCuNDArray_blas_Cplx,axpyTest){
+  fill(&this->Array,TypeParam(71.1,23.3));
+  fill(&this->Array2,TypeParam(97.9,654.2));
+  axpy(TypeParam(11.4),&this->Array,&this->Array2);
+  TypeParam got = this->Array2.get_data_ptr()[546];
+  TypeParam wanted = TypeParam(71.1,23.3)*TypeParam(11.4)+TypeParam(97.9,654.2);
+  EXPECT_FLOAT_EQ(real(wanted),real(got));
+  EXPECT_FLOAT_EQ(imag(wanted),imag(got));
+}
+
+TYPED_TEST(hoCuNDArray_blas_Cplx,nrm2Test){
+  fill(&this->Array,TypeParam(1,1));
+  EXPECT_FLOAT_EQ(std::sqrt(real(TypeParam(1,-1)*TypeParam(1,1))*this->Array.get_number_of_elements()),nrm2(&this->Array));
+  fill(&this->Array,TypeParam(3.24,7.4));
+  // There will be rounding errors from the sum, so loosen comparison
+  EXPECT_NEAR(std::sqrt(real(TypeParam(3.24,-7.4)*TypeParam(3.24,7.4))*this->Array.get_number_of_elements()),nrm2(&this->Array),0.001);
+}
+
+TYPED_TEST(hoCuNDArray_blas_Cplx,asumTest){
+  fill(&this->Array,TypeParam(-3,1));
+  EXPECT_NEAR(4*this->Array.get_number_of_elements(),asum(&this->Array),0.0001);
+}
+
+TYPED_TEST(hoCuNDArray_blas_Cplx,aminTest){
+  fill(&this->Array,TypeParam(100,101));
+  this->Array.get_data_ptr()[23]=TypeParam(-50,-51);
+  EXPECT_EQ(23,amin(&this->Array));
+  this->Array.get_data_ptr()[48]=TypeParam(2,100);
+  EXPECT_EQ(23,amin(&this->Array));
+  this->Array.get_data_ptr()[1000]=TypeParam(-2,-76);
+  EXPECT_EQ(1000,amin(&this->Array));
+}
+
+TYPED_TEST(hoCuNDArray_blas_Cplx,amaxTest){
+  fill(&this->Array,TypeParam(1,1));
+  this->Array.get_data_ptr()[768]=TypeParam(4,4);
+  EXPECT_EQ(768,amax(&this->Array));
+  this->Array.get_data_ptr()[48]=TypeParam(6,1);
+  EXPECT_EQ(768,amax(&this->Array));
+  this->Array.get_data_ptr()[999]=TypeParam(-3,-6);
+  EXPECT_EQ(999,amax(&this->Array));
+}
diff --git a/test/hoNDArray_blas_test.cpp b/test/hoNDArray_blas_test.cpp
new file mode 100644
index 0000000..2b2f28e
--- /dev/null
+++ b/test/hoNDArray_blas_test.cpp
@@ -0,0 +1,144 @@
+#include "hoNDArray_math.h"
+#include "hoNDArray_elemwise.h"
+#include <gtest/gtest.h>
+#include <vector>
+
+using namespace Gadgetron;
+using testing::Types;
+
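+// Host-array counterpart of the hoCu BLAS tests: the same dot/axpy/nrm2/asum/amin/amax
+// checks on hoNDArray, now with size_t dimension vectors and with std::complex<float>
+// and std::complex<double> added to the complex type list.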
+template <typename T> class hoNDArray_blas_Real : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    Array = hoNDArray<T>(&dims);
+    Array2 = hoNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  hoNDArray<T> Array;
+  hoNDArray<T> Array2;
+};
+
+typedef Types<float, double> realImplementations;
+
+TYPED_TEST_CASE(hoNDArray_blas_Real, realImplementations);
+
+TYPED_TEST(hoNDArray_blas_Real,dotTest){
+  fill(&this->Array,TypeParam(1));
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements(),real(dot(&this->Array,&this->Array)));
+  fill(&this->Array2,TypeParam(2));
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements()*2,real(dot(&this->Array,&this->Array2)));
+}
+
+TYPED_TEST(hoNDArray_blas_Real,axpyTest){
+  fill(&this->Array,TypeParam(71));
+  fill(&this->Array2,TypeParam(97));
+  axpy(TypeParam(11),&this->Array,&this->Array2);
+  TypeParam val = this->Array2.get_data_ptr()[10];
+  EXPECT_FLOAT_EQ(878,real(val));
+}
+
+TYPED_TEST(hoNDArray_blas_Real,nrm2Test){
+  fill(&this->Array,TypeParam(1));
+  EXPECT_FLOAT_EQ(std::sqrt((double)this->Array.get_number_of_elements()),nrm2(&this->Array));
+  fill(&this->Array,TypeParam(3));
+  EXPECT_FLOAT_EQ(std::sqrt(3.0*3.0*this->Array.get_number_of_elements()),nrm2(&this->Array));
+}
+
+TYPED_TEST(hoNDArray_blas_Real,asumTest){
+  fill(&this->Array,TypeParam(1));
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements(),real(asum(&this->Array)));
+  fill(&this->Array,TypeParam(-3));
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements()*3,real(asum(&this->Array)));
+}
+
+TYPED_TEST(hoNDArray_blas_Real,aminTest){
+  fill(&this->Array,TypeParam(100));
+  this->Array.get_data_ptr()[23]=TypeParam(-50);
+  EXPECT_EQ(23,amin(&this->Array));
+  this->Array.get_data_ptr()[48]=TypeParam(2);
+  EXPECT_EQ(48,amin(&this->Array));
+}
+
+TYPED_TEST(hoNDArray_blas_Real,amaxTest){
+  fill(&this->Array,TypeParam(1));
+  this->Array.get_data_ptr()[23]=TypeParam(2);
+  EXPECT_EQ(23,amax(&this->Array));
+  this->Array.get_data_ptr()[48]=TypeParam(-50);
+  EXPECT_EQ(48,amax(&this->Array));
+}
+
+
+template <typename T> class hoNDArray_blas_Cplx : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49}; //Using prime numbers for setup because they are messy
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    Array = hoNDArray<T>(&dims);
+    Array2 = hoNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  hoNDArray<T> Array;
+  hoNDArray<T> Array2;
+};
+
+typedef Types<std::complex<float>, std::complex<double>, float_complext, double_complext> cplxImplementations;
+
+TYPED_TEST_CASE(hoNDArray_blas_Cplx, cplxImplementations);
+
+TYPED_TEST(hoNDArray_blas_Cplx,dotTest){
+  fill(&this->Array,TypeParam(1,1));
+  TypeParam res = dot(&this->Array,&this->Array);
+  EXPECT_FLOAT_EQ(real(TypeParam(1,-1)*TypeParam(1,1))*this->Array.get_number_of_elements(),real(res));
+  EXPECT_FLOAT_EQ(0,imag(res));
+  fill(&this->Array2,TypeParam(2,2));
+  res = dot(&this->Array2,&this->Array2);
+  EXPECT_FLOAT_EQ(real(TypeParam(2,-2)*TypeParam(2,2))*this->Array.get_number_of_elements(),real(res));
+  EXPECT_FLOAT_EQ(0,imag(res));
+  res = dot(&this->Array,&this->Array2);
+  EXPECT_FLOAT_EQ(real(TypeParam(1,-1)*TypeParam(2,2))*this->Array.get_number_of_elements(),real(res));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1,-1)*TypeParam(2,2))*this->Array.get_number_of_elements(),imag(res));
+}
+
+TYPED_TEST(hoNDArray_blas_Cplx,axpyTest){
+  fill(&this->Array,TypeParam(71.1,23.3));
+  fill(&this->Array2,TypeParam(97.9,654.2));
+  axpy(TypeParam(11.4),&this->Array,&this->Array2);
+  TypeParam got = this->Array2.get_data_ptr()[546];
+  TypeParam wanted = TypeParam(71.1,23.3)*TypeParam(11.4)+TypeParam(97.9,654.2);
+  EXPECT_FLOAT_EQ(real(wanted),real(got));
+  EXPECT_FLOAT_EQ(imag(wanted),imag(got));
+}
+
+TYPED_TEST(hoNDArray_blas_Cplx,nrm2Test){
+  fill(&this->Array,TypeParam(1,1));
+  EXPECT_FLOAT_EQ(std::sqrt(real(TypeParam(1,-1)*TypeParam(1,1))*this->Array.get_number_of_elements()),nrm2(&this->Array));
+  fill(&this->Array,TypeParam(3.24,7.4));
+  // There will be rounding errors from the sum, so loosen comparison
+  EXPECT_NEAR(std::sqrt(real(TypeParam(3.24,-7.4)*TypeParam(3.24,7.4))*this->Array.get_number_of_elements()),nrm2(&this->Array),0.001);
+}
+
+TYPED_TEST(hoNDArray_blas_Cplx,asumTest){
+  fill(&this->Array,TypeParam(-3,1));
+  EXPECT_NEAR(4*this->Array.get_number_of_elements(),asum(&this->Array),0.0001);
+}
+
+TYPED_TEST(hoNDArray_blas_Cplx,aminTest){
+  fill(&this->Array,TypeParam(100,101));
+  this->Array.get_data_ptr()[23]=TypeParam(-50,-51);
+  EXPECT_EQ(23,amin(&this->Array));
+  this->Array.get_data_ptr()[48]=TypeParam(2,100);
+  EXPECT_EQ(23,amin(&this->Array));
+  this->Array.get_data_ptr()[1000]=TypeParam(-2,-76);
+  EXPECT_EQ(1000,amin(&this->Array));
+}
+
+TYPED_TEST(hoNDArray_blas_Cplx,amaxTest){
+  fill(&this->Array,TypeParam(1,1));
+  this->Array.get_data_ptr()[768]=TypeParam(4,4);
+  EXPECT_EQ(768,amax(&this->Array));
+  this->Array.get_data_ptr()[48]=TypeParam(6,1);
+  EXPECT_EQ(768,amax(&this->Array));
+  this->Array.get_data_ptr()[999]=TypeParam(-3,-6);
+  EXPECT_EQ(999,amax(&this->Array));
+}
diff --git a/test/hoNDArray_elemwise_test.cpp b/test/hoNDArray_elemwise_test.cpp
new file mode 100644
index 0000000..bda1f47
--- /dev/null
+++ b/test/hoNDArray_elemwise_test.cpp
@@ -0,0 +1,618 @@
+#include "hoNDArray_elemwise.h"
+#include "complext.h"
+
+#include <gtest/gtest.h>
+#include <complex>
+#include <vector>
+
+using namespace Gadgetron;
+using testing::Types;
+
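+// Elementwise-operation tests for hoNDArray. All five fixtures share the {37,49,23,19}
+// shape: TestReal (float/double), TestCplx and TestCplx4 (all four complex types, the
+// latter pairing the complex array with a real-typed Array2 for shrinkd), and TestCplx2
+// (std::complex) / TestCplx3 (complext) for the real_to_complex round trip.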
+template <typename T> class hoNDArray_elemwise_TestReal : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    Array = hoNDArray<T>(&dims);
+    Array2 = hoNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  hoNDArray<T> Array;
+  hoNDArray<T> Array2;
+};
+
+template <typename T> class hoNDArray_elemwise_TestCplx : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    Array = hoNDArray<T>(&dims);
+    Array2 = hoNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  hoNDArray<T> Array;
+  hoNDArray<T> Array2;
+};
+
+template <typename T> class hoNDArray_elemwise_TestCplx2 : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    Array = hoNDArray<T>(&dims);
+    Array2 = hoNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  hoNDArray<T> Array;
+  hoNDArray<T> Array2;
+};
+
+template <typename T> class hoNDArray_elemwise_TestCplx3 : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    Array = hoNDArray<T>(&dims);
+    Array2 = hoNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  hoNDArray<T> Array;
+  hoNDArray<T> Array2;
+};
+
+template <typename T> class hoNDArray_elemwise_TestCplx4 : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    Array = hoNDArray<T>(&dims);
+    Array2 = hoNDArray<typename realType<T>::Type>(&dims);
+  }
+  std::vector<size_t> dims;
+  hoNDArray<T> Array;
+  hoNDArray<typename realType<T>::Type> Array2;
+};
+
+typedef Types<float, double> realImplementations;
+typedef Types<std::complex<float>, std::complex<double>, float_complext, double_complext> cplxImplementations;
+typedef Types<std::complex<float>, std::complex<double> > stdCplxImplementations;
+typedef Types<float_complext, double_complext> cplxtImplementations;
+
+TYPED_TEST_CASE(hoNDArray_elemwise_TestReal, realImplementations);
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,fillTest){
+  fill(&this->Array,TypeParam(1.1));
+  EXPECT_FLOAT_EQ(1.1,TypeParam(this->Array.get_data_ptr()[5]));
+  fill(&this->Array,TypeParam(27.45));
+  EXPECT_FLOAT_EQ(27.45,TypeParam(this->Array.get_data_ptr()[3242]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,clearTest){
+  fill(&this->Array,TypeParam(1));
+  EXPECT_FLOAT_EQ(1,TypeParam(this->Array.get_data_ptr()[5324]));
+  clear(&this->Array);
+  EXPECT_FLOAT_EQ(0,TypeParam(this->Array.get_data_ptr()[5324]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,absTest){
+  fill(&this->Array,TypeParam(-5.5));
+  EXPECT_FLOAT_EQ(TypeParam(-5.5),TypeParam(this->Array.get_data_ptr()[13]));
+  EXPECT_FLOAT_EQ(TypeParam(5.5),TypeParam(abs(&this->Array)->get_data_ptr()[13]));
+  fill(&this->Array,TypeParam(-1.3));
+  EXPECT_FLOAT_EQ(TypeParam(-1.3),TypeParam(this->Array.get_data_ptr()[2454]));
+  abs_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(TypeParam(1.3),TypeParam(this->Array.get_data_ptr()[2454]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,absSquareTest){
+  fill(&this->Array,TypeParam(-5.5));
+  EXPECT_FLOAT_EQ(TypeParam(-5.5),TypeParam(this->Array.get_data_ptr()[13]));
+  EXPECT_FLOAT_EQ(TypeParam(-5.5*-5.5),TypeParam(abs_square(&this->Array)->get_data_ptr()[13]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,sqrtTest){
+  fill(&this->Array,TypeParam(17.9));
+  EXPECT_FLOAT_EQ(std::sqrt(TypeParam(17.9)),TypeParam(sqrt(&this->Array)->get_data_ptr()[23433]));
+  fill(&this->Array,TypeParam(3.14));
+  sqrt_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(std::sqrt(TypeParam(3.14)),TypeParam(this->Array.get_data_ptr()[32343]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,squareTest){
+  fill(&this->Array,TypeParam(1.7));
+  EXPECT_FLOAT_EQ(TypeParam(1.7)*TypeParam(1.7),TypeParam(square(&this->Array)->get_data_ptr()[22542]));
+  fill(&this->Array,TypeParam(31.4));
+  square_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(TypeParam(31.4)*TypeParam(31.4),TypeParam(this->Array.get_data_ptr()[652252]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,reciprocalTest){
+  fill(&this->Array,TypeParam(11.7));
+  EXPECT_FLOAT_EQ(TypeParam(1)/TypeParam(11.7),TypeParam(reciprocal(&this->Array)->get_data_ptr()[45452]));
+  fill(&this->Array,TypeParam(314.114));
+  reciprocal_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(TypeParam(1)/TypeParam(314.114),TypeParam(this->Array.get_data_ptr()[43432]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,reciprocal_sqrtTest){
+  fill(&this->Array,TypeParam(1.9));
+  EXPECT_FLOAT_EQ(TypeParam(1)/std::sqrt(TypeParam(1.9)),TypeParam(reciprocal_sqrt(&this->Array)->get_data_ptr()[12345]));
+  fill(&this->Array,TypeParam(1.14));
+  reciprocal_sqrt_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(TypeParam(1)/std::sqrt(TypeParam(1.14)),TypeParam(this->Array.get_data_ptr()[0]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,sgnTest){
+  fill(&this->Array,TypeParam(-5.7));
+  this->Array.get_data_ptr()[91] = TypeParam(101.1);
+  this->Array.get_data_ptr()[19100] = TypeParam(0);
+  EXPECT_FLOAT_EQ(TypeParam(-1),TypeParam(sgn(&this->Array)->get_data_ptr()[28]));
+  EXPECT_FLOAT_EQ(TypeParam(1),TypeParam(sgn(&this->Array)->get_data_ptr()[91]));
+  EXPECT_FLOAT_EQ(TypeParam(0),TypeParam(sgn(&this->Array)->get_data_ptr()[19100]));
+  fill(&this->Array,TypeParam(-5.7));
+  this->Array.get_data_ptr()[9100] = TypeParam(101.1);
+  this->Array.get_data_ptr()[19100] = TypeParam(0);
+  sgn_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(TypeParam(-1),TypeParam(this->Array.get_data_ptr()[2800]));
+  EXPECT_FLOAT_EQ(TypeParam(1),TypeParam(this->Array.get_data_ptr()[9100]));
+  EXPECT_FLOAT_EQ(TypeParam(0),TypeParam(this->Array.get_data_ptr()[19100]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,clampTest){
+  fill(&this->Array,TypeParam(-5.7));
+  this->Array.get_data_ptr()[354222] = TypeParam(101.3);
+  clamp(&this->Array,TypeParam(4.9),TypeParam(100.0));
+  EXPECT_FLOAT_EQ(TypeParam(4.9),this->Array.get_data_ptr()[3435]);
+  EXPECT_FLOAT_EQ(TypeParam(100.0),this->Array.get_data_ptr()[354222]);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,clamp_minTest){
+  fill(&this->Array,TypeParam(-5.7));
+  this->Array.get_data_ptr()[91] = TypeParam(-101.3);
+  clamp_min(&this->Array,TypeParam(-10.6));
+  EXPECT_FLOAT_EQ(TypeParam(-5.7),this->Array.get_data_ptr()[28]);
+  EXPECT_FLOAT_EQ(TypeParam(-10.6),this->Array.get_data_ptr()[91]);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,clamp_maxTest){
+  fill(&this->Array,TypeParam(5.7));
+  this->Array.get_data_ptr()[91] = TypeParam(101.3);
+  clamp_max(&this->Array,TypeParam(10.6));
+  EXPECT_FLOAT_EQ(TypeParam(5.7),this->Array.get_data_ptr()[28]);
+  EXPECT_FLOAT_EQ(TypeParam(10.6),this->Array.get_data_ptr()[91]);
+}
+
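+// Judging from the expected value (50 * 110 / |-200|), normalize(x, v) rescales the
+// array so that its largest absolute value becomes v.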
+TYPED_TEST(hoNDArray_elemwise_TestReal,normalizeTest){
+  fill(&this->Array,TypeParam(50));
+  this->Array.get_data_ptr()[23]=TypeParam(-200);
+  normalize(&this->Array,110);
+  EXPECT_FLOAT_EQ(TypeParam(50)*TypeParam(110)/abs(TypeParam(-200)),this->Array.get_data_ptr()[12345]);
+}
+
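+// shrink1 and shrinkd are checked against the usual soft-thresholding formulas,
+// shrink1(x,g) = x/|x| * max(|x|-g, 0) and shrinkd(x,s,g) = x/s * max(s-g, 0),
+// which collapse to zero once the threshold exceeds the magnitude.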
+TYPED_TEST(hoNDArray_elemwise_TestReal,shrink1Test){
+  fill(&this->Array,TypeParam(1.2));
+  shrink1(&this->Array,0.75);
+  EXPECT_FLOAT_EQ(TypeParam(1.2)/abs(TypeParam(1.2))*std::max(abs(TypeParam(1.2))-0.75,0.0),this->Array.get_data_ptr()[125]);
+  fill(&this->Array,TypeParam(1));
+  shrink1(&this->Array,2.0);
+  EXPECT_FLOAT_EQ(0.0,this->Array.get_data_ptr()[125]);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,shrinkdTest){
+  fill(&this->Array,TypeParam(1.2));
+  fill(&this->Array2,TypeParam(4.0));
+  shrinkd(&this->Array,&this->Array2,1.0);
+  EXPECT_FLOAT_EQ(TypeParam(1.2)/TypeParam(4.0)*std::max(4.0-1.0,0.0),this->Array.get_data_ptr()[125]);
+  shrinkd(&this->Array,&this->Array2,8.0);
+  EXPECT_FLOAT_EQ(0.0,this->Array.get_data_ptr()[125]);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,realTest){
+  fill(&this->Array,TypeParam(1.2));
+  EXPECT_FLOAT_EQ(TypeParam(1.2),real(&this->Array)->at(125));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,imagTest){
+  fill(&this->Array,TypeParam(1.2));
+  EXPECT_FLOAT_EQ(TypeParam(0.0),imag(&this->Array)->at(125));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,conjTest){
+  fill(&this->Array,TypeParam(1.2));
+  EXPECT_FLOAT_EQ(TypeParam(1.2),real(&this->Array)->at(125));
+  EXPECT_FLOAT_EQ(TypeParam(0.0),imag(&this->Array)->at(125));
+}
+
+TYPED_TEST_CASE(hoNDArray_elemwise_TestCplx, cplxImplementations);
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,fillTest){
+  fill(&this->Array,TypeParam(1.1,2.2));
+  EXPECT_FLOAT_EQ(1.1,real(TypeParam(this->Array.get_data_ptr()[52323])));
+  EXPECT_FLOAT_EQ(2.2,imag(TypeParam(this->Array.get_data_ptr()[52323])));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,clearTest){
+  fill(&this->Array,TypeParam(1,1));
+  clear(&this->Array);
+  EXPECT_FLOAT_EQ(0,real(TypeParam(this->Array.get_data_ptr()[325])));
+  EXPECT_FLOAT_EQ(0,imag(TypeParam(this->Array.get_data_ptr()[325])));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,absTest){
+  fill(&this->Array,TypeParam(-5.5,7.7));
+  EXPECT_FLOAT_EQ(std::sqrt(5.5*5.5+7.7*7.7),abs(&this->Array)->get_data_ptr()[32113]);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,absSquareTest){
+  fill(&this->Array,TypeParam(-5.5,7.7));
+  EXPECT_FLOAT_EQ(5.5*5.5+7.7*7.7,abs_square(&this->Array)->get_data_ptr()[32113]);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,sqrtTest){
+  fill(&this->Array,TypeParam(17.9,3.5));
+  EXPECT_NEAR(real(sqrt(TypeParam(17.9,3.5))),real(sqrt(&this->Array)->get_data_ptr()[2131]),0.00001);
+  EXPECT_NEAR(imag(sqrt(TypeParam(17.9,3.5))),imag(sqrt(&this->Array)->get_data_ptr()[2131]),0.00001);
+  fill(&this->Array,TypeParam(3.14,4.13));
+  sqrt_inplace(&this->Array);
+  EXPECT_NEAR(real(sqrt(TypeParam(3.14,4.13))),real(this->Array.get_data_ptr()[120000]),0.00001);
+  EXPECT_NEAR(imag(sqrt(TypeParam(3.14,4.13))),imag(this->Array.get_data_ptr()[120000]),0.00001);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,squareTest){
+  fill(&this->Array,TypeParam(1.7,7.1));
+  EXPECT_FLOAT_EQ(real(TypeParam(1.7,7.1)*TypeParam(1.7,7.1)),real(square(&this->Array)->get_data_ptr()[22123]));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1.7,7.1)*TypeParam(1.7,7.1)),imag(square(&this->Array)->get_data_ptr()[22123]));
+  fill(&this->Array,TypeParam(31.4,4.31));
+  square_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(real(TypeParam(31.4,4.31)*TypeParam(31.4,4.31)),real(this->Array.get_data_ptr()[51234]));
+  EXPECT_FLOAT_EQ(imag(TypeParam(31.4,4.31)*TypeParam(31.4,4.31)),imag(this->Array.get_data_ptr()[51234]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,reciprocalTest){
+  fill(&this->Array,TypeParam(1.9,2.7));
+  EXPECT_FLOAT_EQ(real(TypeParam(1,0)/TypeParam(1.9,2.7)),real(reciprocal(&this->Array)->get_data_ptr()[11232]));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1,0)/TypeParam(1.9,2.7)),imag(reciprocal(&this->Array)->get_data_ptr()[11232]));
+  fill(&this->Array,TypeParam(1.14,4.32));
+  reciprocal_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(real(TypeParam(1,0)/TypeParam(1.14,4.32)),real(this->Array.get_data_ptr()[10]));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1,0)/TypeParam(1.14,4.32)),imag(this->Array.get_data_ptr()[10]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,reciprocal_sqrtTest){
+  fill(&this->Array,TypeParam(1.9,2.7));
+  EXPECT_FLOAT_EQ(real(TypeParam(1,0)/sqrt(TypeParam(1.9,2.7))),real(reciprocal_sqrt(&this->Array)->get_data_ptr()[12543]));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1,0)/sqrt(TypeParam(1.9,2.7))),imag(reciprocal_sqrt(&this->Array)->get_data_ptr()[12543]));
+  fill(&this->Array,TypeParam(1.14,4.32));
+  reciprocal_sqrt_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(real(TypeParam(1,0)/sqrt(TypeParam(1.14,4.32))),real(this->Array.get_data_ptr()[10000]));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1,0)/sqrt(TypeParam(1.14,4.32))),imag(this->Array.get_data_ptr()[10000]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,realImagTest){
+  fill(&this->Array,TypeParam(3.4,4.2));
+  EXPECT_FLOAT_EQ(3.4,real(&this->Array)->get_data_ptr()[33425]);
+  EXPECT_NEAR(4.2,imag(&this->Array)->get_data_ptr()[45], 0.000001);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,conjTest){
+  fill(&this->Array,TypeParam(3.4,4.2));
+  EXPECT_FLOAT_EQ(3.4,real(conj(&this->Array)->at(33425)));
+  EXPECT_NEAR(-4.2,imag(conj(&this->Array)->at(45)), 0.000001);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,normalizeTest){
+  fill(&this->Array,TypeParam(50,50));
+  this->Array.get_data_ptr()[23]=TypeParam(-200,-200);
+  normalize(&this->Array,110);
+  EXPECT_FLOAT_EQ(real(TypeParam(50,50)*real(TypeParam(110,110))/abs(TypeParam(-200,-200))),real(&this->Array)->get_data_ptr()[12345]);
+  EXPECT_FLOAT_EQ(imag(TypeParam(50,50)*real(TypeParam(110,110))/abs(TypeParam(-200,-200))),imag(&this->Array)->get_data_ptr()[12345]);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,clampTest){
+  fill(&this->Array,TypeParam(-5.7, -4.6));
+  this->Array.get_data_ptr()[354222] = TypeParam(101.3,203.4);
+  clamp(&this->Array,real(TypeParam(4.9,0)),real(TypeParam(100.0,0)));
+  EXPECT_FLOAT_EQ(real(TypeParam(4.9,0)),real(&this->Array)->get_data_ptr()[3435]);
+  EXPECT_FLOAT_EQ(real(TypeParam(100.0,0)),real(&this->Array)->get_data_ptr()[354222]);
+  EXPECT_FLOAT_EQ(imag(TypeParam(4.9,0)),imag(&this->Array)->get_data_ptr()[3435]);
+  EXPECT_FLOAT_EQ(imag(TypeParam(100.0,0)),imag(&this->Array)->get_data_ptr()[354222]);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,clamp_minTest){
+  fill(&this->Array,TypeParam(-5.7, -4.6));
+  this->Array.get_data_ptr()[91] = TypeParam(-101.3, -203.4);
+  clamp_min(&this->Array, real(TypeParam(-10.6,0)));
+  EXPECT_FLOAT_EQ(real(TypeParam(-5.7,0)),real(&this->Array)->get_data_ptr()[28]);
+  EXPECT_FLOAT_EQ(real(TypeParam(-10.6,0)),real(&this->Array)->get_data_ptr()[91]);
+  EXPECT_FLOAT_EQ(imag(TypeParam(-5.7,0)),imag(&this->Array)->get_data_ptr()[28]);
+  EXPECT_FLOAT_EQ(imag(TypeParam(-10.6,0)),imag(&this->Array)->get_data_ptr()[91]);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,clamp_maxTest){
+  fill(&this->Array,TypeParam(5.7, 4.6));
+  this->Array.get_data_ptr()[91] = TypeParam(101.3, 203.4);
+  clamp_max(&this->Array,real(TypeParam(10.6,0)));
+  EXPECT_FLOAT_EQ(real(TypeParam(5.7,0)),real(&this->Array)->get_data_ptr()[28]);
+  EXPECT_FLOAT_EQ(real(TypeParam(10.6,0)),real(&this->Array)->get_data_ptr()[91]);
+  EXPECT_FLOAT_EQ(imag(TypeParam(5.7,0)),imag(&this->Array)->get_data_ptr()[28]);
+  EXPECT_FLOAT_EQ(imag(TypeParam(10.6,0)),imag(&this->Array)->get_data_ptr()[91]);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,shrink1Test){
+  fill(&this->Array,TypeParam(1.2,1.4));
+  shrink1(&this->Array,0.75);
+  EXPECT_FLOAT_EQ(real(TypeParam(1.2,1.4)/abs(TypeParam(1.2,1.4)))*std::max(abs(TypeParam(1.2,1.4))-0.75,0.0),real(&this->Array)->get_data_ptr()[125]);
+  EXPECT_FLOAT_EQ(imag(TypeParam(1.2,1.4)/abs(TypeParam(1.2,1.4)))*std::max(abs(TypeParam(1.2,1.4))-0.75,0.0),imag(&this->Array)->get_data_ptr()[125]);
+  fill(&this->Array,TypeParam(1,1));
+  shrink1(&this->Array,2.0);
+  EXPECT_FLOAT_EQ(0.0,real(&this->Array)->get_data_ptr()[125]);
+  EXPECT_FLOAT_EQ(0.0,imag(&this->Array)->get_data_ptr()[23125]);
+}
+
+TYPED_TEST_CASE(hoNDArray_elemwise_TestCplx4, cplxImplementations);
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx4,shrinkdTest){
+  fill(&this->Array,TypeParam(1.2,1.4));
+  fill(&this->Array2,real(TypeParam(4.0,4.0)));
+  shrinkd(&this->Array,&this->Array2,1.0);
+  EXPECT_FLOAT_EQ(real(TypeParam(1.2,1.4)/real(TypeParam(4.0,4.0)))*std::max(4.0-1.0,0.0),real(&this->Array)->get_data_ptr()[125]);
+  EXPECT_FLOAT_EQ(imag(TypeParam(1.2,1.4)/imag(TypeParam(4.0,4.0)))*std::max(4.0-1.0,0.0),imag(&this->Array)->get_data_ptr()[125]);
+  shrinkd(&this->Array,&this->Array2,8.0);
+  EXPECT_FLOAT_EQ(0.0,real(&this->Array)->get_data_ptr()[125]);
+  EXPECT_FLOAT_EQ(0.0,imag(&this->Array)->get_data_ptr()[23125]);
+}
+
+TYPED_TEST_CASE(hoNDArray_elemwise_TestCplx2, stdCplxImplementations);
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx2,realToCplxTest){
+  fill(&this->Array,TypeParam(3.4,4.2));
+  EXPECT_FLOAT_EQ(3.4,real(real_to_complex<TypeParam>(real(&this->Array).get())->get_data_ptr()[33425]));
+  EXPECT_FLOAT_EQ(0.0,imag(real_to_complex<TypeParam>(real(&this->Array).get())->get_data_ptr()[33425]));
+}
+
+TYPED_TEST_CASE(hoNDArray_elemwise_TestCplx3, cplxtImplementations);
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx3,realToCplxTest){
+  fill(&this->Array,TypeParam(3.4,4.2));
+  EXPECT_FLOAT_EQ(3.4,real(real_to_complex<TypeParam>(real(&this->Array).get())->get_data_ptr()[33425]));
+  EXPECT_FLOAT_EQ(0.0,imag(real_to_complex<TypeParam>(real(&this->Array).get())->get_data_ptr()[33425]));
+}
+
+template <typename T> class hoNDArray_operators_TestReal : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+    size_t vdims2[] = {37, 49}; //Smaller dimensionality to test batch mode
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    dims2 = std::vector<size_t>(vdims2,vdims2+sizeof(vdims2)/sizeof(size_t));
+    Array = hoNDArray<T>(&dims);
+    Array2 = hoNDArray<T>(&dims2);
+  }
+  std::vector<size_t> dims;
+  std::vector<size_t> dims2;
+  hoNDArray<T> Array;
+  hoNDArray<T> Array2;
+};
+
+template <typename T> class hoNDArray_operators_TestCplx : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+    size_t vdims2[] = {37, 49}; //Smaller dimensionality to test batch mode
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    dims2 = std::vector<size_t>(vdims2,vdims2+sizeof(vdims2)/sizeof(size_t));
+    Array = hoNDArray<T>(&dims);
+    Array2 = hoNDArray<T>(&dims2);
+  }
+  std::vector<size_t> dims;
+  std::vector<size_t> dims2;
+  hoNDArray<T> Array;
+  hoNDArray<T> Array2;
+};
+
+typedef Types<float, double> realImplementations;
+typedef Types<std::complex<float>, std::complex<double>, float_complext, double_complext> cplxImplementations;
+
+TYPED_TEST_CASE(hoNDArray_operators_TestReal, realImplementations);
+
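+// operator+=, -=, *= and /= are exercised against Array2, whose smaller {37,49} shape
+// triggers the "batch mode" broadcast over the trailing dimensions, and against scalars
+// (the complex fixture additionally uses the real part of a complex scalar).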
+TYPED_TEST(hoNDArray_operators_TestReal,equalsAddTest1){
+  TypeParam v1 = TypeParam(46865.35435);
+  TypeParam v2 = TypeParam(13784.34);
+  unsigned int idx = 73243;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array += this->Array2;
+  EXPECT_FLOAT_EQ(v1+v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsAddTest2){
+  TypeParam v1 = TypeParam(98.4);
+  TypeParam v2 = TypeParam(2.2);
+  unsigned int idx = 12295;
+  fill(&this->Array,v1);
+  this->Array += v2;
+  EXPECT_FLOAT_EQ(v1+v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsSubtractTest1){
+  TypeParam v1 = TypeParam(98475334.34);
+  TypeParam v2 = TypeParam(2452.234);
+  unsigned int idx = 124999;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array -= this->Array2;
+  EXPECT_FLOAT_EQ(v1-v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsSubtractTest2){
+  TypeParam v1 = TypeParam(4.4);
+  TypeParam v2 = TypeParam(9212.21);
+  unsigned int idx = 122131;
+  fill(&this->Array,v1);
+  this->Array -= v2;
+  EXPECT_FLOAT_EQ(v1-v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsMultiplyTest1){
+  TypeParam v1 = TypeParam(342.145);
+  TypeParam v2 = TypeParam(43545.43);
+  unsigned int idx = 12344;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array *= this->Array2;
+  EXPECT_FLOAT_EQ(v1*v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsMultiplyTest2){
+  TypeParam v1 = TypeParam(43534.443);
+  TypeParam v2 = TypeParam(92.842);
+  unsigned int idx = 96735;
+  fill(&this->Array,v1);
+  this->Array *= v2;
+  EXPECT_FLOAT_EQ(v1*v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsDivideTest1){
+  TypeParam v1 = TypeParam(644.24);
+  TypeParam v2 = TypeParam(38564.64);
+  unsigned int idx = 98322;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array /= this->Array2;
+  EXPECT_FLOAT_EQ(v1/v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsDivideTest2){
+  TypeParam v1 = TypeParam(56342.24);
+  TypeParam v2 = TypeParam(23434.34);
+  unsigned int idx = 12591;
+  fill(&this->Array,v1);
+  this->Array /= v2;
+  EXPECT_FLOAT_EQ(v1/v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST_CASE(hoNDArray_operators_TestCplx, cplxImplementations);
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsAddTest1){
+  TypeParam v1 = TypeParam(46865.35435, 534544.534523);
+  TypeParam v2 = TypeParam(13784.34, 54543543.1243);
+  unsigned int idx = 73243;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array += this->Array2;
+  EXPECT_FLOAT_EQ(real(v1+v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1+v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsAddTest2){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,3.23);
+  unsigned int idx = 12925;
+  fill(&this->Array,v1);
+  this->Array += v2;
+  EXPECT_FLOAT_EQ(real(v1+v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1+v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsAddTest3){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,0.0);
+  unsigned int idx = 12295;
+  fill(&this->Array,v1);
+  this->Array += real(v2);
+  EXPECT_FLOAT_EQ(real(v1+v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1+v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsSubtractTest1){
+  TypeParam v1 = TypeParam(46865.35435, 534544.534523);
+  TypeParam v2 = TypeParam(13784.34, 54543543.1243);
+  unsigned int idx = 73243;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array -= this->Array2;
+  EXPECT_FLOAT_EQ(real(v1-v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1-v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsSubtractTest2){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,3.23);
+  unsigned int idx = 12925;
+  fill(&this->Array,v1);
+  this->Array -= v2;
+  EXPECT_FLOAT_EQ(real(v1-v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1-v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsSubtractTest3){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,0.0);
+  unsigned int idx = 12925;
+  fill(&this->Array,v1);
+  this->Array -= real(v2);
+  EXPECT_FLOAT_EQ(real(v1-v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1-v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsMultiplyTest1){
+  TypeParam v1 = TypeParam(46865.35435, 534544.534523);
+  TypeParam v2 = TypeParam(13784.34, 54543543.1243);
+  unsigned int idx = 73243;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array *= this->Array2;
+  EXPECT_FLOAT_EQ(real(v1*v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1*v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsMultiplyTest2){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,3.23);
+  unsigned int idx = 12925;
+  fill(&this->Array,v1);
+  this->Array *= v2;
+  EXPECT_FLOAT_EQ(real(v1*v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1*v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsMultiplyTest3){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,0.0);
+  unsigned int idx = 12295;
+  fill(&this->Array,v1);
+  this->Array *= real(v2);
+  EXPECT_FLOAT_EQ(real(v1*v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1*v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsDivideTest1){
+  TypeParam v1 = TypeParam(46865.35435, 534544.534523);
+  TypeParam v2 = TypeParam(13784.34, 54543543.1243);
+  unsigned int idx = 73243;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array /= this->Array2;
+  EXPECT_FLOAT_EQ(real(v1/v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1/v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsDivideTest2){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,3.23);
+  unsigned int idx = 12295;
+  fill(&this->Array,v1);
+  this->Array /= v2;
+  EXPECT_FLOAT_EQ(real(v1/v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1/v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsDivideTest3){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,0.0);
+  unsigned int idx = 12295;
+  fill(&this->Array,v1);
+  this->Array /= real(v2);
+  EXPECT_FLOAT_EQ(real(v1/v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1/v2),imag(this->Array.get_data_ptr()[idx]));
+}
diff --git a/test/hoNDArray_operators_test.cpp b/test/hoNDArray_operators_test.cpp
new file mode 100644
index 0000000..90ce1e9
--- /dev/null
+++ b/test/hoNDArray_operators_test.cpp
@@ -0,0 +1,249 @@
+#include "hoNDArray_elemwise.h"
+#include "complext.h"
+
+#include <gtest/gtest.h>
+#include <complex>
+#include <vector>
+
+using namespace Gadgetron;
+using testing::Types;
+
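+// Standalone copy of the operator tests: the fixtures and typed tests below repeat the
+// ones appended to hoNDArray_elemwise_test.cpp above, covering the same compound
+// assignments in both batch-mode and scalar form.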
+template <typename T> class hoNDArray_operators_TestReal : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+    size_t vdims2[] = {37, 49}; //Smaller dimensionality to test batch mode
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    dims2 = std::vector<size_t>(vdims2,vdims2+sizeof(vdims2)/sizeof(size_t));
+    Array = hoNDArray<T>(&dims);
+    Array2 = hoNDArray<T>(&dims2);
+  }
+  std::vector<size_t> dims;
+  std::vector<size_t> dims2;
+  hoNDArray<T> Array;
+  hoNDArray<T> Array2;
+};
+
+template <typename T> class hoNDArray_operators_TestCplx : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49, 23, 19}; //Using prime numbers for setup because they are messy
+    size_t vdims2[] = {37, 49}; //Smaller dimensionality to test batch mode
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    dims2 = std::vector<size_t>(vdims2,vdims2+sizeof(vdims2)/sizeof(size_t));
+    Array = hoNDArray<T>(&dims);
+    Array2 = hoNDArray<T>(&dims2);
+  }
+  std::vector<size_t> dims;
+  std::vector<size_t> dims2;
+  hoNDArray<T> Array;
+  hoNDArray<T> Array2;
+};
+
+typedef Types<float, double> realImplementations;
+typedef Types<std::complex<float>, std::complex<double>, float_complext, double_complext> cplxImplementations;
+
+TYPED_TEST_CASE(hoNDArray_operators_TestReal, realImplementations);
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsAddTest1){
+  TypeParam v1 = TypeParam(46865.35435);
+  TypeParam v2 = TypeParam(13784.34);
+  unsigned int idx = 73243;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array += this->Array2;
+  EXPECT_FLOAT_EQ(v1+v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsAddTest2){
+  TypeParam v1 = TypeParam(98.4);
+  TypeParam v2 = TypeParam(2.2);
+  unsigned int idx = 12295;
+  fill(&this->Array,v1);
+  this->Array += v2;
+  EXPECT_FLOAT_EQ(v1+v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsSubtractTest1){
+  TypeParam v1 = TypeParam(98475334.34);
+  TypeParam v2 = TypeParam(2452.234);
+  unsigned int idx = 124999;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array -= this->Array2;
+  EXPECT_FLOAT_EQ(v1-v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsSubtractTest2){
+  TypeParam v1 = TypeParam(4.4);
+  TypeParam v2 = TypeParam(9212.21);
+  unsigned int idx = 122131;
+  fill(&this->Array,v1);
+  this->Array -= v2;
+  EXPECT_FLOAT_EQ(v1-v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsMultiplyTest1){
+  TypeParam v1 = TypeParam(342.145);
+  TypeParam v2 = TypeParam(43545.43);
+  unsigned int idx = 12344;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array *= this->Array2;
+  EXPECT_FLOAT_EQ(v1*v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsMultiplyTest2){
+  TypeParam v1 = TypeParam(43534.443);
+  TypeParam v2 = TypeParam(92.842);
+  unsigned int idx = 96735;
+  fill(&this->Array,v1);
+  this->Array *= v2;
+  EXPECT_FLOAT_EQ(v1*v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsDivideTest1){
+  TypeParam v1 = TypeParam(644.24);
+  TypeParam v2 = TypeParam(38564.64);
+  unsigned int idx = 98322;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array /= this->Array2;
+  EXPECT_FLOAT_EQ(v1/v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsDivideTest2){
+  TypeParam v1 = TypeParam(56342.24);
+  TypeParam v2 = TypeParam(23434.34);
+  unsigned int idx = 12591;
+  fill(&this->Array,v1);
+  this->Array /= v2;
+  EXPECT_FLOAT_EQ(v1/v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST_CASE(hoNDArray_operators_TestCplx, cplxImplementations);
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsAddTest1){
+  TypeParam v1 = TypeParam(46865.35435, 534544.534523);
+  TypeParam v2 = TypeParam(13784.34, 54543543.1243);
+  unsigned int idx = 73243;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array += this->Array2;
+  EXPECT_FLOAT_EQ(real(v1+v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1+v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsAddTest2){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,3.23);
+  unsigned int idx = 12925;
+  fill(&this->Array,v1);
+  this->Array += v2;
+  EXPECT_FLOAT_EQ(real(v1+v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1+v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsAddTest3){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,0.0);
+  unsigned int idx = 12295;
+  fill(&this->Array,v1);
+  this->Array += real(v2);
+  EXPECT_FLOAT_EQ(real(v1+v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1+v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsSubtractTest1){
+  TypeParam v1 = TypeParam(46865.35435, 534544.534523);
+  TypeParam v2 = TypeParam(13784.34, 54543543.1243);
+  unsigned int idx = 73243;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array -= this->Array2;
+  EXPECT_FLOAT_EQ(real(v1-v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1-v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsSubtractTest2){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,3.23);
+  unsigned int idx = 12925;
+  fill(&this->Array,v1);
+  this->Array -= v2;
+  EXPECT_FLOAT_EQ(real(v1-v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1-v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsSubtractTest3){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,0.0);
+  unsigned int idx = 12925;
+  fill(&this->Array,v1);
+  this->Array -= real(v2);
+  EXPECT_FLOAT_EQ(real(v1-v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1-v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsMultiplyTest1){
+  TypeParam v1 = TypeParam(46865.35435, 534544.534523);
+  TypeParam v2 = TypeParam(13784.34, 54543543.1243);
+  unsigned int idx = 73243;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array *= this->Array2;
+  EXPECT_FLOAT_EQ(real(v1*v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1*v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsMultiplyTest2){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,3.23);
+  unsigned int idx = 12925;
+  fill(&this->Array,v1);
+  this->Array *= v2;
+  EXPECT_FLOAT_EQ(real(v1*v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1*v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsMultiplyTest3){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,0.0);
+  unsigned int idx = 12295;
+  fill(&this->Array,v1);
+  this->Array *= real(v2);
+  EXPECT_FLOAT_EQ(real(v1*v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1*v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsDivideTest1){
+  TypeParam v1 = TypeParam(46865.35435, 534544.534523);
+  TypeParam v2 = TypeParam(13784.34, 54543543.1243);
+  unsigned int idx = 73243;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array /= this->Array2;
+  EXPECT_FLOAT_EQ(real(v1/v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1/v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsDivideTest2){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,3.23);
+  unsigned int idx = 12295;
+  fill(&this->Array,v1);
+  this->Array /= v2;
+  EXPECT_FLOAT_EQ(real(v1/v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1/v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsDivideTest3){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,0.0);
+  unsigned int idx = 12295;
+  fill(&this->Array,v1);
+  this->Array /= real(v2);
+  EXPECT_FLOAT_EQ(real(v1/v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1/v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
diff --git a/test/hoNDArray_utils_test.cpp b/test/hoNDArray_utils_test.cpp
new file mode 100644
index 0000000..dfe0f8c
--- /dev/null
+++ b/test/hoNDArray_utils_test.cpp
@@ -0,0 +1,186 @@
+#include "hoNDArray_utils.h"
+#include "hoNDArray_elemwise.h"
+#include "complext.h"
+#include "GadgetronTimer.h"
+
+#include <gtest/gtest.h>
+#include <complex>
+#include <vector>
+
+using namespace Gadgetron;
+using testing::Types;
+
+template <typename T> class hoNDArray_utils_TestReal : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49, 23, 19}; //Odd, non-power-of-two dimensions tend to expose indexing errors
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    Array = hoNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  hoNDArray<T> Array;
+};
+
+template <typename T> class hoNDArray_utils_TestCplx : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    size_t vdims[] = {37, 49, 23, 19}; //Odd, non-power-of-two dimensions tend to expose indexing errors
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    Array = hoNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  hoNDArray<T> Array;
+};
+
+typedef Types<float, double> realImplementations;
+typedef Types</*std::complex<float>, std::complex<double>,*/ float_complext, double_complext> cplxImplementations;
+
+TYPED_TEST_CASE(hoNDArray_utils_TestReal, realImplementations);
+
+TYPED_TEST(hoNDArray_utils_TestReal,permuteTest){
+
+  fill(&this->Array,TypeParam(1));
+
+  std::vector<size_t> order;
+  order.push_back(0); order.push_back(1); order.push_back(2); order.push_back(3);
+
+  this->Array.get_data_ptr()[37] = TypeParam(2);
+
+  EXPECT_FLOAT_EQ(1, permute(&this->Array,&order)->at(0));
+  EXPECT_FLOAT_EQ(2, permute(&this->Array,&order)->at(37));
+
+  order.clear();
+  order.push_back(1); order.push_back(0); order.push_back(2); order.push_back(3);
+
+  EXPECT_FLOAT_EQ(2, permute(&this->Array,&order)->at(1));
+
+  order.clear();
+  order.push_back(3); order.push_back(1); order.push_back(2); order.push_back(0);
+
+  EXPECT_FLOAT_EQ(2, permute(&this->Array,&order)->at(19));
+
+  order.clear();
+  order.push_back(2); order.push_back(0); order.push_back(1); order.push_back(3);
+
+  EXPECT_FLOAT_EQ(2, permute(&this->Array,&order)->at(851));
+}
+
+TYPED_TEST(hoNDArray_utils_TestReal,shiftDimTest){
+
+  fill(&this->Array,TypeParam(1));
+  this->Array.get_data_ptr()[37] = 2;
+
+  EXPECT_FLOAT_EQ(1, shift_dim(&this->Array,0)->at(0));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,0)->at(37));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,1)->at(1));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,-1)->at(37*19));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,2)->at(23*37*19));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,3)->at(37*19));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,4)->at(37));
+}
+
+TYPED_TEST(hoNDArray_utils_TestReal,sumTest){
+  TypeParam v1 = TypeParam(12.34);
+  unsigned int idx = 0;
+
+  fill(&this->Array,v1);
+  EXPECT_FLOAT_EQ(49*v1,sum(&this->Array,1)->get_data_ptr()[idx]);
+
+  fill(&this->Array,v1);
+  EXPECT_FLOAT_EQ(23*v1,sum(&this->Array,2)->get_data_ptr()[idx]);
+
+  fill(&this->Array,v1);
+  EXPECT_FLOAT_EQ(19*v1,sum(&this->Array,3)->get_data_ptr()[idx]);
+}
+
+TYPED_TEST_CASE(hoNDArray_utils_TestCplx, cplxImplementations);
+
+TYPED_TEST(hoNDArray_utils_TestCplx,permuteTest){
+
+  fill(&this->Array,TypeParam(1,1));
+
+  std::vector<size_t> order;
+  order.push_back(0); order.push_back(1); order.push_back(2); order.push_back(3);
+  
+  this->Array.get_data_ptr()[37] = TypeParam(2,3);
+
+  EXPECT_FLOAT_EQ(1, real(permute(&this->Array,&order)->at(0)));
+  EXPECT_FLOAT_EQ(1, imag(permute(&this->Array,&order)->at(0)));
+
+  EXPECT_FLOAT_EQ(2, real(permute(&this->Array,&order)->at(37)));
+  EXPECT_FLOAT_EQ(3, imag(permute(&this->Array,&order)->at(37)));
+
+  order.clear();
+  order.push_back(1); order.push_back(0); order.push_back(2); order.push_back(3);
+
+  EXPECT_FLOAT_EQ(2, real(permute(&this->Array,&order)->at(1)));
+  EXPECT_FLOAT_EQ(3, imag(permute(&this->Array,&order)->at(1)));
+
+  order.clear();
+  order.push_back(3); order.push_back(1); order.push_back(2); order.push_back(0);
+
+  EXPECT_FLOAT_EQ(2, real(permute(&this->Array,&order)->at(19)));
+  EXPECT_FLOAT_EQ(3, imag(permute(&this->Array,&order)->at(19)));
+
+  order.clear();
+  order.push_back(2); order.push_back(0); order.push_back(1); order.push_back(3);
+
+  EXPECT_FLOAT_EQ(2, real(permute(&this->Array,&order)->at(851)));
+  EXPECT_FLOAT_EQ(3, imag(permute(&this->Array,&order)->at(851)));
+
+  order.clear();
+  order.push_back(0); order.push_back(1); order.push_back(3); order.push_back(2);
+
+  EXPECT_FLOAT_EQ(2, real(permute(&this->Array, &order)->at(37)));
+  EXPECT_FLOAT_EQ(3, imag(permute(&this->Array, &order)->at(37)));
+
+  order.clear();
+  order.push_back(0); order.push_back(2); order.push_back(3); order.push_back(1);
+
+  EXPECT_FLOAT_EQ(2, real(permute(&this->Array, &order)->at(37*23*19)));
+  EXPECT_FLOAT_EQ(3, imag(permute(&this->Array, &order)->at(37*23*19)));
+}
+
+TYPED_TEST(hoNDArray_utils_TestCplx,shiftDimTest){
+
+  fill(&this->Array,TypeParam(1,1));
+  this->Array.get_data_ptr()[37]=TypeParam(2,3);
+
+  EXPECT_FLOAT_EQ(1, real(shift_dim(&this->Array,0)->at(0)));
+  EXPECT_FLOAT_EQ(1, imag(shift_dim(&this->Array,0)->at(0)));
+
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,0)->at(37)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,0)->at(37)));
+
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,1)->at(1)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,1)->at(1)));
+
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,-1)->at(37*19)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,-1)->at(37*19)));
+
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,2)->at(23*37*19)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,2)->at(23*37*19)));
+
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,3)->at(37*19)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,3)->at(37*19)));
+
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,4)->at(37)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,4)->at(37)));
+}
+
+TYPED_TEST(hoNDArray_utils_TestCplx,sumTest){
+  TypeParam v1 = TypeParam(12.34, 56.78);
+  unsigned int idx = 0;
+
+  fill(&this->Array,v1);
+  EXPECT_FLOAT_EQ(real(TypeParam(49)*v1),real(sum(&this->Array,1)->get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(TypeParam(49)*v1),imag(sum(&this->Array,1)->get_data_ptr()[idx]));
+
+  fill(&this->Array,v1);
+  EXPECT_FLOAT_EQ(real(TypeParam(23)*v1),real(sum(&this->Array,2)->get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(TypeParam(23)*v1),imag(sum(&this->Array,2)->get_data_ptr()[idx]));
+
+  fill(&this->Array,v1);
+  EXPECT_FLOAT_EQ(real(TypeParam(19)*v1),real(sum(&this->Array,3)->get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(TypeParam(19)*v1),imag(sum(&this->Array,3)->get_data_ptr()[idx]));
+}
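
The permute expectations above are plain column-major index arithmetic: element 37 of a 37x49x23x19 array sits at coordinates (0,1,0,0), and permuting the dimension order moves it to a new flat index. A minimal sketch of that arithmetic, using numpy purely for illustration (numpy is not used by the test suite) and assuming the permutation convention the expectations imply:

    import numpy as np

    dims = (37, 49, 23, 19)                            # dimensions used in SetUp()
    coords = np.unravel_index(37, dims, order='F')     # column-major coordinates of element 37 -> (0, 1, 0, 0)
    order = (2, 0, 1, 3)                               # one of the permutations exercised above
    new_dims = tuple(dims[o] for o in order)
    new_coords = tuple(coords[o] for o in order)
    print(np.ravel_multi_index(new_coords, new_dims, order='F'))   # 851, matching at(851)

The same calculation reproduces the other expected indices (1, 19, 37 and 37*23*19) for the remaining orderings.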
diff --git a/test/hoNDFFT_test.cpp b/test/hoNDFFT_test.cpp
new file mode 100644
index 0000000..5ec9fba
--- /dev/null
+++ b/test/hoNDFFT_test.cpp
@@ -0,0 +1,46 @@
+#include "hoNDFFT.h"
+#include "hoNDArray_math.h"
+#include "complext.h"
+#include <gtest/gtest.h>
+#include <boost/random.hpp>
+
+using namespace Gadgetron;
+using testing::Types;
+
+template<typename REAL> class hoNDFFT_test : public ::testing::Test {
+protected:
+	virtual void SetUp(){
+		boost::random::mt19937 rng;
+		boost::random::uniform_real_distribution<REAL> uni(0,1);
+		std::vector<size_t > dimensions(3,128);
+
+		Array = hoNDArray<complext<REAL> >(dimensions);
+		complext<REAL>* data = Array.get_data_ptr();
+
+		for (size_t i = 0; i < Array.get_number_of_elements(); i++)
+			data[i] = complext<REAL>(uni(rng),uni(rng));
+
+		Array2 = Array;
+	}
+
+	hoNDArray<complext<REAL> > Array;
+
+	hoNDArray<complext<REAL> > Array2;
+
+};
+typedef Types<float, double> realImplementations;
+TYPED_TEST_CASE(hoNDFFT_test, realImplementations);
+
+TYPED_TEST(hoNDFFT_test,fftNrm2Test){
+	hoNDFFT<TypeParam>::instance()->fft(&this->Array);
+
+	EXPECT_NEAR(nrm2(&this->Array2),nrm2(&this->Array),nrm2(&this->Array)*1e-3);
+
+}
+
+TYPED_TEST(hoNDFFT_test,ifftNrm2Test){
+	hoNDFFT<TypeParam>::instance()->ifft(&this->Array);
+
+	EXPECT_NEAR(nrm2(&this->Array2),nrm2(&this->Array),nrm2(&this->Array)*1e-3);
+
+}
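
Both FFT tests rest on Parseval's identity: a unitary DFT leaves the l2 norm of the data unchanged, so nrm2 of the transformed array is compared against nrm2 of the untouched copy with a relative tolerance of 1e-3. A minimal numpy sketch of the same invariant, assuming an orthonormal (1/sqrt(N)) scaling as the expectations above imply for hoNDFFT; this is illustration only, not part of the test suite:

    import numpy as np

    x = (np.random.rand(128, 128, 128) + 1j * np.random.rand(128, 128, 128)).astype(np.complex64)
    X = np.fft.fftn(x) / np.sqrt(x.size)           # orthonormal scaling keeps the l2 norm unchanged
    print(np.linalg.norm(x), np.linalg.norm(X))    # equal up to float32 round-off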
diff --git a/test/integration/.gitignore b/test/integration/.gitignore
new file mode 100644
index 0000000..df420e0
--- /dev/null
+++ b/test/integration/.gitignore
@@ -0,0 +1,8 @@
+*#
+.#*
+*~
+data/
+test/
+*.log
+index.html
+test_cases.txt
diff --git a/test/integration/CMakeLists.txt b/test/integration/CMakeLists.txt
new file mode 100644
index 0000000..88deff4
--- /dev/null
+++ b/test/integration/CMakeLists.txt
@@ -0,0 +1,20 @@
+
+# detect the environment
+# CUDA_FOUND, MKL_FOUND, WIN32, UNIX, APPLE, PYTHONLIBS_FOUND
+
+# read in what will be tested for any environment
+FILE(GLOB TEST_CASE_ALWAYS RELATIVE ${CMAKE_SOURCE_DIR}/test/integration/  ./cases/*.cfg  )
+
+set(TEST_CASE ${TEST_CASE_ALWAYS})
+
+# Remove this after WIN python issue is resolved
+if (WIN32)
+  string(REPLACE "simple_gre_python" "simple_gre" TEST_CASE2 ${TEST_CASE})
+  string(REPLACE "cfg" "cfg;" TEST_CASE ${TEST_CASE2})
+endif (WIN32)
+
+# write the available test cases to a file
+file(WRITE ./test_cases.txt)
+foreach(test ${TEST_CASE})
+    file(APPEND ./test_cases.txt "${test}\n")
+endforeach()
diff --git a/test/integration/cases/cpu_grappa_simple.cfg b/test/integration/cases/cpu_grappa_simple.cfg
new file mode 100644
index 0000000..4563c63
--- /dev/null
+++ b/test/integration/cases/cpu_grappa_simple.cfg
@@ -0,0 +1,31 @@
+[FILES]
+siemens_dat=data/rtgrappa/acc_data_with_device_2.dat
+siemens_parameter_xml=IsmrmrdParameterMap.xml
+siemens_parameter_xsl=IsmrmrdParameterMap.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=0
+siemens_dependency_measurement3=0
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=0
+out_folder=test
+ismrmrd=grappa_rate2.h5
+result_h5=grappa_rate2_cpu_out.h5
+reference_h5= data/rtgrappa/grappa_rate2_cpu_out.h5
+
+[TEST]
+gadgetron_configuration=grappa_float_cpu.xml
+reference_dataset=grappa_float_cpu.xml/image_0/data
+result_dataset=grappa_float_cpu.xml/image_0/data
+compare_dimensions=1
+compare_values=0
+compare_scales=1
+# Note that this reconstruction does not yield the same result every time; the timing of GPU weight updates changes the noise behavior
+comparison_threshold_values=1e-1
+comparison_threshold_scales=1e-2
+
+[REQUIREMENTS]
+system_memory=2048
+python_support=0
+gpu_support=0
+gpu_memory=1024
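
Every case file in this directory follows the same three-section layout: [FILES] lists the raw Siemens measurement, the converted ISMRMRD file, and the result and reference HDF5 files; [TEST] names the Gadgetron configuration plus the datasets and thresholds used for comparison; [REQUIREMENTS] gates the case on system memory, Python support, and GPU availability. The harness reads these sections with Python's ConfigParser (see run_gadgetron_test.py further down). A minimal sketch of pulling a few values out of this particular file, assuming it is run from test/integration:

    import ConfigParser   # Python 2, matching the scripts in this tree

    config = ConfigParser.RawConfigParser()
    config.read('cases/cpu_grappa_simple.cfg')
    print(config.get('FILES', 'siemens_dat'))              # data/rtgrappa/acc_data_with_device_2.dat
    print(config.get('TEST', 'gadgetron_configuration'))   # grappa_float_cpu.xml
    print(config.getint('REQUIREMENTS', 'system_memory'))  # 2048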
diff --git a/test/integration/cases/epi_2d.cfg b/test/integration/cases/epi_2d.cfg
new file mode 100644
index 0000000..274bcd3
--- /dev/null
+++ b/test/integration/cases/epi_2d.cfg
@@ -0,0 +1,30 @@
+[FILES]
+siemens_dat=data/epi/meas_MID517_nih_ep2d_bold_fa60_FID82077.dat
+siemens_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_parameter_xsl=IsmrmrdParameterMap_Siemens_EPI.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=-1
+siemens_dependency_measurement3=-1
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=1
+out_folder=test
+ismrmrd=epi_2d.h5
+result_h5=epi_2d_out.h5
+reference_h5= data/epi/epi_2d_out_20150406_sji.h5 
+
+[TEST]
+gadgetron_configuration=epi.xml
+reference_dataset=epi.xml/image_0/data
+result_dataset=epi.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=2e-5
+comparison_threshold_scales=1e-5
+
+[REQUIREMENTS]
+system_memory=1024
+python_support=0
+gpu_support=0
+gpu_memory=0
diff --git a/test/integration/cases/gpu_fixed_radial_mode1_cg.cfg b/test/integration/cases/gpu_fixed_radial_mode1_cg.cfg
new file mode 100644
index 0000000..df6ce9f
--- /dev/null
+++ b/test/integration/cases/gpu_fixed_radial_mode1_cg.cfg
@@ -0,0 +1,30 @@
+[FILES]
+siemens_dat=data/radial_phantom/meas_MID00133_FID20080_CV_Radial_Fixed_Angle_128_x8_32phs.dat
+siemens_parameter_xml=IsmrmrdParameterMap.xml
+siemens_parameter_xsl=IsmrmrdParameterMap.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=0
+siemens_dependency_measurement3=0
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=0
+out_folder=test
+ismrmrd=fixed_radial.h5
+result_h5=fixed_radial_mode1_cg_out.h5
+reference_h5= data/radial_phantom/fixed_radial_mode1.h5
+
+[TEST]
+gadgetron_configuration=fixed_radial_mode1_gpusense_cg.xml
+reference_dataset=cg/image_0.img
+result_dataset=fixed_radial_mode1_gpusense_cg.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=1e-2
+comparison_threshold_scales=1e-2
+
+[REQUIREMENTS]
+system_memory=2048
+python_support=0
+gpu_support=1
+gpu_memory=1024
diff --git a/test/integration/cases/gpu_fixed_radial_mode1_ktsense.cfg b/test/integration/cases/gpu_fixed_radial_mode1_ktsense.cfg
new file mode 100644
index 0000000..75d1926
--- /dev/null
+++ b/test/integration/cases/gpu_fixed_radial_mode1_ktsense.cfg
@@ -0,0 +1,30 @@
+[FILES]
+siemens_dat=data/radial_phantom/meas_MID00133_FID20080_CV_Radial_Fixed_Angle_128_x8_32phs.dat
+siemens_parameter_xml=IsmrmrdParameterMap.xml
+siemens_parameter_xsl=IsmrmrdParameterMap.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=0
+siemens_dependency_measurement3=0
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=0
+out_folder=test
+ismrmrd=fixed_radial.h5
+result_h5=fixed_radial_mode1_ktsense_out.h5
+reference_h5= data/radial_phantom/fixed_radial_mode1.h5
+
+[TEST]
+gadgetron_configuration=fixed_radial_mode1_gpu_ktsense.xml
+reference_dataset=kt/image_0.img
+result_dataset=fixed_radial_mode1_gpu_ktsense.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=1e-2
+comparison_threshold_scales=1e-2
+
+[REQUIREMENTS]
+system_memory=2048
+python_support=0
+gpu_support=1
+gpu_memory=1024
diff --git a/test/integration/cases/gpu_fixed_radial_mode1_realtime.cfg b/test/integration/cases/gpu_fixed_radial_mode1_realtime.cfg
new file mode 100644
index 0000000..823b315
--- /dev/null
+++ b/test/integration/cases/gpu_fixed_radial_mode1_realtime.cfg
@@ -0,0 +1,30 @@
+[FILES]
+siemens_dat=data/radial_phantom/meas_MID00133_FID20080_CV_Radial_Fixed_Angle_128_x8_32phs.dat
+siemens_parameter_xml=IsmrmrdParameterMap.xml
+siemens_parameter_xsl=IsmrmrdParameterMap.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=0
+siemens_dependency_measurement3=0
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=0
+out_folder=test
+ismrmrd=fixed_radial.h5
+result_h5=fixed_radial_mode1_realtime_out.h5
+reference_h5= data/radial_phantom/fixed_radial_mode1.h5
+
+[TEST]
+gadgetron_configuration=fixed_radial_mode1_realtime.xml
+reference_dataset=realtime/image_0.img
+result_dataset=fixed_radial_mode1_realtime.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=1e-2
+comparison_threshold_scales=1e-2
+
+[REQUIREMENTS]
+system_memory=2048
+python_support=0
+gpu_support=1
+gpu_memory=1024
diff --git a/test/integration/cases/gpu_golden_radial_mode2_cg.cfg b/test/integration/cases/gpu_golden_radial_mode2_cg.cfg
new file mode 100644
index 0000000..2622cd4
--- /dev/null
+++ b/test/integration/cases/gpu_golden_radial_mode2_cg.cfg
@@ -0,0 +1,30 @@
+[FILES]
+siemens_dat=data/radial_phantom/meas_MID00135_FID20082_CV_Radial_Golden_Angle_128_512_views.dat
+siemens_parameter_xml=IsmrmrdParameterMap.xml
+siemens_parameter_xsl=IsmrmrdParameterMap.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=0
+siemens_dependency_measurement3=0
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=0
+out_folder=test
+ismrmrd=golden_radial.h5
+result_h5=golden_radial_mode2_cg_out.h5
+reference_h5= data/radial_phantom/golden_radial_mode2.h5
+
+[TEST]
+gadgetron_configuration=golden_radial_mode2_gpusense_cg.xml
+reference_dataset=cg/image_0.img
+result_dataset=golden_radial_mode2_gpusense_cg.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=1e-2
+comparison_threshold_scales=1e-2
+
+[REQUIREMENTS]
+system_memory=2048
+python_support=0
+gpu_support=1
+gpu_memory=1024
diff --git a/test/integration/cases/gpu_golden_radial_mode2_ktsense.cfg b/test/integration/cases/gpu_golden_radial_mode2_ktsense.cfg
new file mode 100644
index 0000000..6b6ff71
--- /dev/null
+++ b/test/integration/cases/gpu_golden_radial_mode2_ktsense.cfg
@@ -0,0 +1,30 @@
+[FILES]
+siemens_dat=data/radial_phantom/meas_MID00135_FID20082_CV_Radial_Golden_Angle_128_512_views.dat
+siemens_parameter_xml=IsmrmrdParameterMap.xml
+siemens_parameter_xsl=IsmrmrdParameterMap.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=0
+siemens_dependency_measurement3=0
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=0
+out_folder=test
+ismrmrd=golden_radial.h5
+result_h5=golden_radial_mode2_ktsense_out.h5
+reference_h5= data/radial_phantom/golden_radial_mode2.h5
+
+[TEST]
+gadgetron_configuration=golden_radial_mode2_gpu_ktsense.xml
+reference_dataset=kt/image_0.img
+result_dataset=golden_radial_mode2_gpu_ktsense.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=1e-2
+comparison_threshold_scales=1e-2
+
+[REQUIREMENTS]
+system_memory=2048
+python_support=0
+gpu_support=1
+gpu_memory=1024
diff --git a/test/integration/cases/gpu_golden_radial_mode2_realtime.cfg b/test/integration/cases/gpu_golden_radial_mode2_realtime.cfg
new file mode 100644
index 0000000..f76c5c9
--- /dev/null
+++ b/test/integration/cases/gpu_golden_radial_mode2_realtime.cfg
@@ -0,0 +1,30 @@
+[FILES]
+siemens_dat=data/radial_phantom/meas_MID00135_FID20082_CV_Radial_Golden_Angle_128_512_views.dat
+siemens_parameter_xml=IsmrmrdParameterMap.xml
+siemens_parameter_xsl=IsmrmrdParameterMap.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=0
+siemens_dependency_measurement3=0
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=0
+out_folder=test
+ismrmrd=golden_radial.h5
+result_h5=golden_radial_mode2_realtime_out.h5
+reference_h5= data/radial_phantom/golden_radial_mode2.h5
+
+[TEST]
+gadgetron_configuration=golden_radial_mode2_realtime.xml
+reference_dataset=realtime/image_0.img
+result_dataset=golden_radial_mode2_realtime.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=1e-2
+comparison_threshold_scales=1e-2
+
+[REQUIREMENTS]
+system_memory=2048
+python_support=0
+gpu_support=1
+gpu_memory=1024
diff --git a/test/integration/cases/gpu_golden_radial_mode2_sb.cfg b/test/integration/cases/gpu_golden_radial_mode2_sb.cfg
new file mode 100644
index 0000000..8ac28e9
--- /dev/null
+++ b/test/integration/cases/gpu_golden_radial_mode2_sb.cfg
@@ -0,0 +1,30 @@
+[FILES]
+siemens_dat=data/radial_phantom/meas_MID00135_FID20082_CV_Radial_Golden_Angle_128_512_views.dat
+siemens_parameter_xml=IsmrmrdParameterMap.xml
+siemens_parameter_xsl=IsmrmrdParameterMap.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=0
+siemens_dependency_measurement3=0
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=0
+out_folder=test
+ismrmrd=golden_radial.h5
+result_h5=golden_radial_mode2_sb_out.h5
+reference_h5= data/radial_phantom/golden_radial_mode2.h5
+
+[TEST]
+gadgetron_configuration=golden_radial_mode2_gpusense_sb.xml
+reference_dataset=sb/image_0.img
+result_dataset=golden_radial_mode2_gpusense_sb.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=1e-2
+comparison_threshold_scales=1e-2
+
+[REQUIREMENTS]
+system_memory=2048
+python_support=0
+gpu_support=1
+gpu_memory=2048
diff --git a/test/integration/cases/gpu_grappa_simple.cfg b/test/integration/cases/gpu_grappa_simple.cfg
new file mode 100644
index 0000000..2e6961c
--- /dev/null
+++ b/test/integration/cases/gpu_grappa_simple.cfg
@@ -0,0 +1,31 @@
+[FILES]
+siemens_dat=data/rtgrappa/acc_data_with_device_2.dat
+siemens_parameter_xml=IsmrmrdParameterMap.xml
+siemens_parameter_xsl=IsmrmrdParameterMap.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=0
+siemens_dependency_measurement3=0
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=0
+out_folder=test
+ismrmrd=grappa_rate2.h5
+result_h5=grappa_rate2_out.h5
+reference_h5= data/rtgrappa/grappa_rate2_cpu_out.h5
+
+[TEST]
+gadgetron_configuration=grappa_float.xml
+reference_dataset=grappa_float_cpu.xml/image_0/data
+result_dataset=grappa_float.xml/image_0/data
+compare_dimensions=1
+compare_values=0
+compare_scales=1
+# Note that this reconstruction does not yield the same result every time; the timing of GPU weight updates changes the noise behavior
+comparison_threshold_values=1e-1
+comparison_threshold_scales=1e-2
+
+[REQUIREMENTS]
+system_memory=2048
+python_support=0
+gpu_support=1
+gpu_memory=1024
diff --git a/test/integration/cases/gpu_spiral.cfg b/test/integration/cases/gpu_spiral.cfg
new file mode 100644
index 0000000..33ff43d
--- /dev/null
+++ b/test/integration/cases/gpu_spiral.cfg
@@ -0,0 +1,30 @@
+[FILES]
+siemens_dat=data/spiral/meas_MID1132_MiniIRT_spiral_16int_tr500_acc1_10reps_FID13142.dat
+siemens_parameter_xml=IsmrmrdParameterMap.xml
+siemens_parameter_xsl=IsmrmrdParameterMap.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=0
+siemens_dependency_measurement3=0
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=0
+out_folder=test
+ismrmrd=simple_spiral.h5
+result_h5=simple_spiral_out.h5
+reference_h5= data/spiral/simple_spiral_out.h5
+
+[TEST]
+gadgetron_configuration=spiral_flow_gpusense_cg.xml
+reference_dataset=spiral_cg/image_0.img
+result_dataset=spiral_flow_gpusense_cg.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=1e-2
+comparison_threshold_scales=1e-2
+
+[REQUIREMENTS]
+system_memory=2048
+python_support=0
+gpu_support=1
+gpu_memory=1024
diff --git a/test/integration/cases/gpu_spiral_sb.cfg b/test/integration/cases/gpu_spiral_sb.cfg
new file mode 100644
index 0000000..e1c8ff4
--- /dev/null
+++ b/test/integration/cases/gpu_spiral_sb.cfg
@@ -0,0 +1,30 @@
+[FILES]
+siemens_dat=data/spiral/meas_MID1132_MiniIRT_spiral_16int_tr500_acc1_10reps_FID13142.dat
+siemens_parameter_xml=IsmrmrdParameterMap.xml
+siemens_parameter_xsl=IsmrmrdParameterMap.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=0
+siemens_dependency_measurement3=0
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=0
+out_folder=test
+ismrmrd=simple_spiral.h5
+result_h5=simple_spiral_out.h5
+reference_h5= data/spiral/simple_spiral_out.h5
+
+[TEST]
+gadgetron_configuration=spiral_flow_gpusense_sb.xml
+reference_dataset=spiral_sb/image_0.img
+result_dataset=spiral_flow_gpusense_sb.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=1e-2
+comparison_threshold_scales=1e-2
+
+[REQUIREMENTS]
+system_memory=2048
+python_support=0
+gpu_support=1
+gpu_memory=1024
diff --git a/test/integration/cases/gtplus_3D_head.cfg b/test/integration/cases/gtplus_3D_head.cfg
new file mode 100644
index 0000000..201fe6b
--- /dev/null
+++ b/test/integration/cases/gtplus_3D_head.cfg
@@ -0,0 +1,31 @@
+[FILES]
+siemens_dat=data/gtplus/3D_head/meas_MID00156_FID05944_GRE_128iso_p2x2.dat
+siemens_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=-1
+siemens_dependency_measurement3=-1
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=1
+out_folder=test
+ismrmrd=gtplus_3D_head.h5
+result_h5=gtplus_3D_head_out.h5
+reference_h5= data/gtplus/3D_head/gtplus_3D_ref_20140924.h5
+
+[TEST]
+gadgetron_configuration=GT_3DT_Cartesian.xml
+reference_dataset=gtplus_3D/image_0/data
+result_dataset=GT_3DT_Cartesian.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=0.1
+comparison_threshold_scales=0.1
+
+[REQUIREMENTS]
+system_memory=16384
+python_support=0
+gpu_support=0
+gpu_memory=0
+
diff --git a/test/integration/cases/gtplus_FatWater.cfg b/test/integration/cases/gtplus_FatWater.cfg
new file mode 100644
index 0000000..b286bf1
--- /dev/null
+++ b/test/integration/cases/gtplus_FatWater.cfg
@@ -0,0 +1,32 @@
+[FILES]
+siemens_dat=data/gtplus/FatWater/meas_MID00342_3e2i_R4.dat
+siemens_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=-1
+siemens_dependency_measurement3=-1
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=1
+out_folder=test
+ismrmrd=gtplus_FatWater.h5
+result_h5=gtplus_FatWater_out.h5
+reference_h5= data/gtplus/FatWater/gtplus_FatWater_ref.h5
+
+[TEST]
+gadgetron_configuration=GT_2DT_FatWater.xml
+reference_dataset=gtplus_FatWater/image_0.img
+result_dataset=GT_2DT_FatWater.xml/image_0/data
+
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=0.75
+comparison_threshold_scales=0.1
+
+[REQUIREMENTS]
+system_memory=4096
+python_support=0
+gpu_support=0
+gpu_memory=1024
+
diff --git a/test/integration/cases/gtplus_FetalHASTE.cfg b/test/integration/cases/gtplus_FetalHASTE.cfg
new file mode 100644
index 0000000..577f433
--- /dev/null
+++ b/test/integration/cases/gtplus_FetalHASTE.cfg
@@ -0,0 +1,31 @@
+[FILES]
+siemens_dat=data/gtplus/FetalHASTE/raw30488.dat
+siemens_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=-1
+siemens_dependency_measurement3=-1
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=1
+out_folder=test
+ismrmrd=gtplus_FetalHASTE.h5
+result_h5=gtplus_FetalHASTE_out.h5
+reference_h5= data/gtplus/FetalHASTE/gtplus_FetalHASTE_ref_20140826.h5
+
+[TEST]
+gadgetron_configuration=GT_2DT_HASTE.xml
+reference_dataset=gtplus_FetalHASTE/image_0.img
+result_dataset=GT_2DT_HASTE.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=0.75
+comparison_threshold_scales=0.75
+
+[REQUIREMENTS]
+system_memory=16384
+python_support=0
+gpu_support=0
+gpu_memory=0
+
diff --git a/test/integration/cases/gtplus_LGE.cfg b/test/integration/cases/gtplus_LGE.cfg
new file mode 100644
index 0000000..6725920
--- /dev/null
+++ b/test/integration/cases/gtplus_LGE.cfg
@@ -0,0 +1,31 @@
+[FILES]
+siemens_dat=data/gtplus/LGE/meas_MID00083_9_slice.dat
+siemens_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=-1
+siemens_dependency_measurement3=-1
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=1
+out_folder=test
+ismrmrd=gtplus_LGE.h5
+result_h5=gtplus_LGE_out.h5
+reference_h5= data/gtplus/LGE/gtplus_LGE_ref_20140826.h5
+
+[TEST]
+gadgetron_configuration=GT_2DT_LGE.xml
+reference_dataset=gtplus_LGE/image_0.img
+result_dataset=GT_2DT_LGE.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=0.11
+comparison_threshold_scales=0.05
+
+[REQUIREMENTS]
+system_memory=4096
+python_support=0
+gpu_support=0
+gpu_memory=1024
+
diff --git a/test/integration/cases/gtplus_Perfusion.cfg b/test/integration/cases/gtplus_Perfusion.cfg
new file mode 100644
index 0000000..7c84fc0
--- /dev/null
+++ b/test/integration/cases/gtplus_Perfusion.cfg
@@ -0,0 +1,31 @@
+[FILES]
+siemens_dat=data/gtplus/Perfusion/meas_MID00045_R3_AIF_ON.dat
+siemens_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=-1
+siemens_dependency_measurement3=-1
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=1
+out_folder=test
+ismrmrd=gtplus_Perfusion.h5
+result_h5=gtplus_Perfusion_out.h5
+reference_h5= data/gtplus/Perfusion/gtplus_Perfusion_ref_20140826.h5
+
+[TEST]
+gadgetron_configuration=GT_2DT_Perfusion.xml
+reference_dataset=gtplus_Perfusion/image_0.img
+result_dataset=GT_2DT_Perfusion.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=0.1
+comparison_threshold_scales=0.075
+
+[REQUIREMENTS]
+system_memory=4096
+python_support=0
+gpu_support=0
+gpu_memory=1024
+
diff --git a/test/integration/cases/gtplus_T2W.cfg b/test/integration/cases/gtplus_T2W.cfg
new file mode 100644
index 0000000..e6a178e
--- /dev/null
+++ b/test/integration/cases/gtplus_T2W.cfg
@@ -0,0 +1,31 @@
+[FILES]
+siemens_dat=data/gtplus/T2W/meas_MID00057_T2w.dat
+siemens_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=-1
+siemens_dependency_measurement3=-1
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=1
+out_folder=test
+ismrmrd=gtplus_T2W.h5
+result_h5=gtplus_T2W_out.h5
+reference_h5= data/gtplus/T2W/gtplus_T2W_ref.h5
+
+[TEST]
+gadgetron_configuration=GT_2DT_T2W.xml
+reference_dataset=gtplus_T2W/image_0.img
+result_dataset=GT_2DT_T2W.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=0.075
+comparison_threshold_scales=0.075
+
+[REQUIREMENTS]
+system_memory=4096
+python_support=0
+gpu_support=0
+gpu_memory=1024
+
diff --git a/test/integration/cases/gtplus_localizer.cfg b/test/integration/cases/gtplus_localizer.cfg
new file mode 100644
index 0000000..ce34699
--- /dev/null
+++ b/test/integration/cases/gtplus_localizer.cfg
@@ -0,0 +1,31 @@
+[FILES]
+siemens_dat=data/gtplus/localizer/meas_MID00026.dat
+siemens_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=-1
+siemens_dependency_measurement3=-1
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=1
+out_folder=test
+ismrmrd=gtplus_localizer.h5
+result_h5=gtplus_localizer_out.h5
+reference_h5= data/gtplus/localizer/gtplus_localizer_ref.h5
+
+[TEST]
+gadgetron_configuration=GT_2DT_Cartesian.xml
+reference_dataset=gtplus_localizer/image_0.img
+result_dataset=GT_2DT_Cartesian.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=0.75
+comparison_threshold_scales=0.75
+
+[REQUIREMENTS]
+system_memory=4096
+python_support=0
+gpu_support=0
+gpu_memory=1024
+
diff --git a/test/integration/cases/gtplus_molli.cfg b/test/integration/cases/gtplus_molli.cfg
new file mode 100644
index 0000000..e1f3aa2
--- /dev/null
+++ b/test/integration/cases/gtplus_molli.cfg
@@ -0,0 +1,31 @@
+[FILES]
+siemens_dat=data/gtplus/MOLLI/20100330_10h33m11s_5562.dat
+siemens_parameter_xml=IsmrmrdParameterMap_Siemens_VB17.xml
+siemens_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_dependency_measurement1=-1
+siemens_dependency_measurement2=-1
+siemens_dependency_measurement3=-1
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=0
+out_folder=test
+ismrmrd=gtplus_molli.h5
+result_h5=gtplus_molli_out.h5
+reference_h5= data/gtplus/MOLLI/gtplus_molli_ref_20141105.h5
+
+[TEST]
+gadgetron_configuration=GT_2DT_MOLLI.xml
+reference_dataset=GT_2DT_MOLLI.xml/image_0/data
+result_dataset=GT_2DT_MOLLI.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=0.06
+comparison_threshold_scales=0.05
+
+[REQUIREMENTS]
+system_memory=4096
+python_support=0
+gpu_support=0
+gpu_memory=1024
+
diff --git a/test/integration/cases/gtplus_real_time_cine.cfg b/test/integration/cases/gtplus_real_time_cine.cfg
new file mode 100644
index 0000000..bccd2e6
--- /dev/null
+++ b/test/integration/cases/gtplus_real_time_cine.cfg
@@ -0,0 +1,31 @@
+[FILES]
+siemens_dat=data/gtplus/RealTimeCine/meas_MID21_CINE_R4.dat
+siemens_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=-1
+siemens_dependency_measurement3=-1
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=1
+out_folder=test
+ismrmrd=gtplus_real_time_cine.h5
+result_h5=gtplus_real_time_cine_out.h5
+reference_h5= data/gtplus/RealTimeCine/gtplus_real_time_cine_ref_20150328.h5
+
+[TEST]
+gadgetron_configuration=GT_2DT_RealTimeCine.xml
+reference_dataset=GT_2DT_RealTimeCine.xml/image_0/data
+result_dataset=GT_2DT_RealTimeCine.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=0.1
+comparison_threshold_scales=0.1
+
+[REQUIREMENTS]
+system_memory=4096
+python_support=0
+gpu_support=0
+gpu_memory=1024
+
diff --git a/test/integration/cases/gtplus_real_time_cine_9slices.cfg b/test/integration/cases/gtplus_real_time_cine_9slices.cfg
new file mode 100644
index 0000000..b78d794
--- /dev/null
+++ b/test/integration/cases/gtplus_real_time_cine_9slices.cfg
@@ -0,0 +1,31 @@
+[FILES]
+siemens_dat=data/gtplus/RealTimeCine_9slices/meas_MID00832.dat
+siemens_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=-1
+siemens_dependency_measurement3=-1
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=1
+out_folder=test
+ismrmrd=gtplus_real_time_cine_9slices.h5
+result_h5=gtplus_real_time_cine_9slices_out.h5
+reference_h5= data/gtplus/RealTimeCine_9slices/gtplus_real_time_cine_ref_20150328.h5
+
+[TEST]
+gadgetron_configuration=GT_2DT_RealTimeCine.xml
+reference_dataset=GT_2DT_RealTimeCine.xml/image_0/data
+result_dataset=GT_2DT_RealTimeCine.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=0.75
+comparison_threshold_scales=0.75
+
+[REQUIREMENTS]
+system_memory=4096
+python_support=0
+gpu_support=0
+gpu_memory=1024
+
diff --git a/test/integration/cases/gtplus_sasha.cfg b/test/integration/cases/gtplus_sasha.cfg
new file mode 100644
index 0000000..6028de1
--- /dev/null
+++ b/test/integration/cases/gtplus_sasha.cfg
@@ -0,0 +1,31 @@
+[FILES]
+siemens_dat=data/gtplus/sasha/20140325_15h59m29s_7720.dat
+siemens_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=-1
+siemens_dependency_measurement3=-1
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=1
+out_folder=test
+ismrmrd=gtplus_sasha.h5
+result_h5=gtplus_sasha_out.h5
+reference_h5= data/gtplus/sasha/gtplus_sasha_ref.h5
+
+[TEST]
+gadgetron_configuration=GT_2DT_Cartesian_GFactor.xml
+reference_dataset=gtplus_sasha/image_0.img
+result_dataset=GT_2DT_Cartesian_GFactor.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=0.06
+comparison_threshold_scales=0.05
+
+[REQUIREMENTS]
+system_memory=4096
+python_support=0
+gpu_support=0
+gpu_memory=1024
+
diff --git a/test/integration/cases/gtplus_snr_unit_recon_builtin_noise.cfg b/test/integration/cases/gtplus_snr_unit_recon_builtin_noise.cfg
new file mode 100644
index 0000000..ad4d182
--- /dev/null
+++ b/test/integration/cases/gtplus_snr_unit_recon_builtin_noise.cfg
@@ -0,0 +1,31 @@
+[FILES]
+siemens_dat=data/gtplus/snr_unit_recon_builtin_noise/meas_MID00127_FID02864_GRE_reps=150__WIP724_sPAT=4.dat
+siemens_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_dependency_measurement1=-1
+siemens_dependency_measurement2=-1
+siemens_dependency_measurement3=-1
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=0
+out_folder=test
+ismrmrd=gtplus_snr_unit_recon_builtin_noise.h5
+result_h5=gtplus_snr_unit_recon_builtin_noise_out.h5
+reference_h5= data/gtplus/snr_unit_recon_builtin_noise/gtplus_snr_unit_recon_ref.h5
+
+[TEST]
+gadgetron_configuration=GT_2DT_Cartesian_GFactor.xml
+reference_dataset=gtplus_snr_unit_recon/image_0.img
+result_dataset=GT_2DT_Cartesian_GFactor.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=0.075
+comparison_threshold_scales=0.075
+
+[REQUIREMENTS]
+system_memory=4096
+python_support=0
+gpu_support=0
+gpu_memory=1024
+
diff --git a/test/integration/cases/gtplus_snr_unit_recon_ipat4.cfg b/test/integration/cases/gtplus_snr_unit_recon_ipat4.cfg
new file mode 100644
index 0000000..6e49eb5
--- /dev/null
+++ b/test/integration/cases/gtplus_snr_unit_recon_ipat4.cfg
@@ -0,0 +1,31 @@
+[FILES]
+siemens_dat=data/gtplus/snr_unit_recon_ipat4/meas_MID00175_FID02912_GRE_reps=150_iPAT=4.dat
+siemens_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=-1
+siemens_dependency_measurement3=-1
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=1
+out_folder=test
+ismrmrd=gtplus_snr_unit_recon_ipat4.h5
+result_h5=gtplus_snr_unit_recon_ipat4_out.h5
+reference_h5= data/gtplus/snr_unit_recon_ipat4/gtplus_snr_unit_recon_ref.h5
+
+[TEST]
+gadgetron_configuration=GT_2DT_Cartesian_GFactor.xml
+reference_dataset=gtplus_snr_unit_recon/image_0.img
+result_dataset=GT_2DT_Cartesian_GFactor.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=0.075
+comparison_threshold_scales=0.075
+
+[REQUIREMENTS]
+system_memory=16384
+python_support=0
+gpu_support=0
+gpu_memory=1024
+
diff --git a/test/integration/cases/gtplus_snr_unit_recon_prospective_cine.cfg b/test/integration/cases/gtplus_snr_unit_recon_prospective_cine.cfg
new file mode 100644
index 0000000..6fc86a3
--- /dev/null
+++ b/test/integration/cases/gtplus_snr_unit_recon_prospective_cine.cfg
@@ -0,0 +1,31 @@
+[FILES]
+siemens_dat=data/gtplus/ProspectiveCine/meas_MID00209_FID07469_CV_gtPlus_2D_epat4.dat
+siemens_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=-1
+siemens_dependency_measurement3=-1
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=1
+out_folder=test
+ismrmrd=gtplus_snr_unit_recon_ProspectiveCine.h5
+result_h5=gtplus_snr_unit_recon_ProspectiveCine_out.h5
+reference_h5= data/gtplus/ProspectiveCine/gtplus_snr_unit_recon_ref_20140826.h5
+
+[TEST]
+gadgetron_configuration=GT_2DT_Cartesian_GFactor.xml
+reference_dataset=gtplus_snr_unit_recon/image_0.img
+result_dataset=GT_2DT_Cartesian_GFactor.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=0.05
+comparison_threshold_scales=0.05
+
+[REQUIREMENTS]
+system_memory=4096
+python_support=0
+gpu_support=0
+gpu_memory=1024
+
diff --git a/test/integration/cases/gtplus_snr_unit_recon_spat2_asym_pf.cfg b/test/integration/cases/gtplus_snr_unit_recon_spat2_asym_pf.cfg
new file mode 100644
index 0000000..b6e55b7
--- /dev/null
+++ b/test/integration/cases/gtplus_snr_unit_recon_spat2_asym_pf.cfg
@@ -0,0 +1,31 @@
+[FILES]
+siemens_dat=data/gtplus/snr_unit_recon_spat2_asym_pf/meas_MID00156_FID03210_GRE_reps=50_sPAT2_xRes256_dummy1_FOV240_strong_asym_7_8pf.dat
+siemens_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=-1
+siemens_dependency_measurement3=-1
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=1
+out_folder=test
+ismrmrd=gtplus_snr_unit_recon_spat2_asym_pf.h5
+result_h5=gtplus_snr_unit_recon_spat2_asym_pf_out.h5
+reference_h5= data/gtplus/snr_unit_recon_spat2_asym_pf/gtplus_snr_unit_recon_ref.h5
+
+[TEST]
+gadgetron_configuration=GT_2DT_Cartesian_GFactor.xml
+reference_dataset=gtplus_snr_unit_recon/image_0.img
+result_dataset=GT_2DT_Cartesian_GFactor.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=0.075
+comparison_threshold_scales=0.075
+
+[REQUIREMENTS]
+system_memory=4096
+python_support=0
+gpu_support=0
+gpu_memory=1024
+
diff --git a/test/integration/cases/gtplus_snr_unit_recon_spat3.cfg b/test/integration/cases/gtplus_snr_unit_recon_spat3.cfg
new file mode 100644
index 0000000..b72c9e0
--- /dev/null
+++ b/test/integration/cases/gtplus_snr_unit_recon_spat3.cfg
@@ -0,0 +1,31 @@
+[FILES]
+siemens_dat=data/gtplus/snr_unit_recon_spat3/meas_MID00152_FID03206_GRE_reps=20_sPAT3_xRes256_dummy1_FOV240.dat
+siemens_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=-1
+siemens_dependency_measurement3=-1
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=1
+out_folder=test
+ismrmrd=gtplus_snr_unit_recon_spat3.h5
+result_h5=gtplus_snr_unit_recon_spat3_out.h5
+reference_h5= data/gtplus/snr_unit_recon_spat3/gtplus_snr_unit_recon_ref.h5
+
+[TEST]
+gadgetron_configuration=GT_2DT_Cartesian_GFactor.xml
+reference_dataset=gtplus_snr_unit_recon/image_0.img
+result_dataset=GT_2DT_Cartesian_GFactor.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=0.075
+comparison_threshold_scales=0.075
+
+[REQUIREMENTS]
+system_memory=4096
+python_support=0
+gpu_support=0
+gpu_memory=1024
+
diff --git a/test/integration/cases/gtplus_snr_unit_recon_tpat3.cfg b/test/integration/cases/gtplus_snr_unit_recon_tpat3.cfg
new file mode 100644
index 0000000..fbd428a
--- /dev/null
+++ b/test/integration/cases/gtplus_snr_unit_recon_tpat3.cfg
@@ -0,0 +1,31 @@
+[FILES]
+siemens_dat=data/gtplus/snr_unit_recon_tpat3/meas_MID00171_FID02908_GRE_reps=150_TPAT=3.dat
+siemens_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=-1
+siemens_dependency_measurement3=-1
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=1
+out_folder=test
+ismrmrd=gtplus_snr_unit_recon_tpat3.h5
+result_h5=gtplus_snr_unit_recon_tpat3_out.h5
+reference_h5= data/gtplus/snr_unit_recon_tpat3/gtplus_snr_unit_recon_ref.h5
+
+[TEST]
+gadgetron_configuration=GT_2DT_Cartesian_GFactor.xml
+reference_dataset=gtplus_snr_unit_recon/image_0.img
+result_dataset=GT_2DT_Cartesian_GFactor.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=0.075
+comparison_threshold_scales=0.075
+
+[REQUIREMENTS]
+system_memory=8192
+python_support=0
+gpu_support=0
+gpu_memory=1024
+
diff --git a/test/integration/cases/simple_gre.cfg b/test/integration/cases/simple_gre.cfg
new file mode 100644
index 0000000..a2d8e1b
--- /dev/null
+++ b/test/integration/cases/simple_gre.cfg
@@ -0,0 +1,30 @@
+[FILES]
+siemens_dat=data/simple_gre/meas_MiniGadgetron_GRE.dat
+siemens_parameter_xml=IsmrmrdParameterMap.xml
+siemens_parameter_xsl=IsmrmrdParameterMap.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=0
+siemens_dependency_measurement3=0
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=0
+out_folder=test
+ismrmrd=simple_gre.h5
+result_h5=simple_gre_out.h5
+reference_h5= data/simple_gre/simple_gre_out_20150110_msh.h5 
+
+[TEST]
+gadgetron_configuration=default.xml
+reference_dataset=default.xml/image_0/data
+result_dataset=default.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=1e-5
+comparison_threshold_scales=1e-5
+
+[REQUIREMENTS]
+system_memory=1024
+python_support=0
+gpu_support=0
+gpu_memory=0
diff --git a/test/integration/cases/simple_gre_3d.cfg b/test/integration/cases/simple_gre_3d.cfg
new file mode 100644
index 0000000..5c9a744
--- /dev/null
+++ b/test/integration/cases/simple_gre_3d.cfg
@@ -0,0 +1,32 @@
+[FILES]
+siemens_dat=data/gre_3d/meas_MID248_gre_FID30644.dat
+siemens_parameter_xml=IsmrmrdParameterMap.xml
+siemens_parameter_xsl=IsmrmrdParameterMap.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=0
+siemens_dependency_measurement3=0
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=0
+out_folder=test
+ismrmrd=simple_gre_3d.h5
+result_h5=simple_gre_out_3d.h5
+reference_h5= data/gre_3d/simple_gre_out_3d_20150110_msh.h5
+
+[TEST]
+gadgetron_configuration=default_optimized.xml
+reference_dataset=default_optimized.xml/image_0/data
+result_dataset=default_optimized.xml/image_0/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=1e-5
+comparison_threshold_scales=1e-5
+
+[REQUIREMENTS]
+system_memory=2048
+python_support=0
+gpu_support=0
+gpu_memory=0
+
+
diff --git a/test/integration/cases/simple_gre_python.cfg b/test/integration/cases/simple_gre_python.cfg
new file mode 100644
index 0000000..1a6daf2
--- /dev/null
+++ b/test/integration/cases/simple_gre_python.cfg
@@ -0,0 +1,30 @@
+[FILES]
+siemens_dat=data/simple_gre/meas_MiniGadgetron_GRE.dat
+siemens_parameter_xml=IsmrmrdParameterMap.xml
+siemens_parameter_xsl=IsmrmrdParameterMap.xsl
+siemens_dependency_measurement1=0
+siemens_dependency_measurement2=0
+siemens_dependency_measurement3=0
+siemens_dependency_parameter_xml=IsmrmrdParameterMap_Siemens.xml
+siemens_dependency_parameter_xsl=IsmrmrdParameterMap_Siemens.xsl
+siemens_data_measurement=0
+out_folder=test
+ismrmrd=simple_gre.h5
+result_h5=simple_gre_out.h5
+reference_h5= data/simple_gre/simple_gre_out.h5
+
+[TEST]
+gadgetron_configuration=python_short.xml
+reference_dataset=python_short.xml/image_1.img
+result_dataset=python_short.xml/image_1/data
+compare_dimensions=1
+compare_values=1
+compare_scales=1
+comparison_threshold_values=0.002
+comparison_threshold_scales=0.001
+
+[REQUIREMENTS]
+system_memory=2048
+python_support=1
+gpu_support=0
+gpu_memory=0
diff --git a/test/integration/data.txt b/test/integration/data.txt
new file mode 100644
index 0000000..55681f4
--- /dev/null
+++ b/test/integration/data.txt
@@ -0,0 +1,68 @@
+gre_3d/simple_gre_out_3d_20150110_msh.h5: 849e09e3c0d2fd357ad7f7d53af7f63e
+gre_3d/meas_MID248_gre_FID30644.dat: 39ac16864627691cf7d84aa2ce13c1ae
+radial_phantom/fixed_radial_mode1.h5: 3a8e10388a3a11683c7e611537c1bd44
+radial_phantom/golden_radial_mode2.h5: 42c60da3121fa50b8c04e1711b8f4659
+radial_phantom/meas_MID00133_FID20080_CV_Radial_Fixed_Angle_128_x8_32phs.dat: 58f8de6b6e755c4d3dcd0c7dace3b8f6
+radial_phantom/meas_MID00135_FID20082_CV_Radial_Golden_Angle_128_512_views.dat: 0326afbb168982f4144704781a08b3ec
+rtgrappa/acc_data_with_device_2.dat: ac0b59c6c8989c94738e41e2c4b5ec13
+rtgrappa/grappa_rate2_out_20150110_msh.h5: 53268231efa796ce2315edc272e81e3a
+rtgrappa/grappa_rate2_cpu_out.h5: 5fb75eefe3828a74cffe7da6f521d841
+simple_gre/meas_MiniGadgetron_GRE.dat: 7c5c255522e42367546b4045560afcf8
+simple_gre/simple_gre_out.h5: 624ac3178e15e27e52489f330b3fffa5
+simple_gre/simple_gre_out_20150110_msh.h5: 13870af335ec1708a15fc139d0ffd4bb
+spiral/simple_spiral_out.h5: 44be83612c69f008ee71a47fecd3c7ed
+spiral/meas_MID1132_MiniIRT_spiral_16int_tr500_acc1_10reps_FID13142.dat: 763baf3d7d0acff185ec9a3c85d5a3f3
+gtplus/3D/gtplus_3D_l1spirit_ref.h5: b1df7a765522483c348c7a89550d3cac
+gtplus/3D/gtplus_3D_ref.h5: 4ba53157bf30e73041bda8f9b18bcb1b
+gtplus/3D/meas_MID00370_embedded_2by2.dat: bf92f5ac70492994a0816cafc3308854
+gtplus/3D_head/gtplus_3D_l1spirit_ref.h5: 996d4cf35cec9dcf0e0598cefed80d90
+gtplus/3D_head/gtplus_3D_ref.h5: 30e3e26657220f6d75640a6901d0bce9
+gtplus/3D_head/gtplus_3D_ref_20140924.h5: c2546f8fd01ae5bc7bf5713061507f78
+gtplus/3D_head/meas_MID00156_FID05944_GRE_128iso_p2x2.dat: 8077b0d4b957a0cffb7cdf0bdb0014fc
+gtplus/FatWater/gtplus_FatWater_ref.h5: 928d02fcffda37f7227a685b718af82e
+gtplus/FatWater/meas_MID00342_3e2i_R4.dat: 8d9e3aedae8b31f63551cd4e85cd6a53
+gtplus/FetalHASTE/gtplus_FetalHASTE_ref.h5: e0dcb71b3cedfad6178101c0f51ef7c0
+gtplus/FetalHASTE/gtplus_FetalHASTE_ref_20140826.h5: 6bac57c8df79cff1dbd4ccf0559ad444
+gtplus/FetalHASTE/raw30488.dat: 897973938d67a1094f7249520ff15e57
+gtplus/LGE/gtplus_LGE_ref.h5: d78d8ebe7393628648b7f8370f6debb6
+gtplus/LGE/gtplus_LGE_ref_20140826.h5: 80f0d661b08b21b602153b6f471d998f
+gtplus/LGE/meas_MID00083_9_slice.dat: 0912ac1bf7d2adb6a74515c563b91d2e
+gtplus/localizer/gtplus_localizer_ref.h5: 1b2dcf4a942f9616c957fbe444b625b5
+gtplus/localizer/meas_MID00026.dat: ef5954ad53996d5d1455a95985a743a6
+gtplus/MOLLI/20100330_10h33m11s_5562.dat: db50a52977aafd3c8d100fa9aa7d8dcf
+gtplus/MOLLI/gtplus_molli_ref.h5: e1c1e53a5cdbc78575a33d011cb9c65b
+gtplus/MOLLI/gtplus_molli_ref_20141105.h5: 719957e63625feccc50e5ac3615de38c
+gtplus/Perfusion/gtplus_Perfusion_ref.h5: 1845fd154dc204df36e11dfb4d1bba88
+gtplus/Perfusion/gtplus_Perfusion_ref_20140826.h5: da1a5e0ee6db499344f3fde4a44bf85b
+gtplus/Perfusion/meas_MID00045_R3_AIF_ON.dat: 7f82e4e95f83cf3e876abba8030f3b34
+gtplus/ProspectiveCine/gtplus_snr_unit_recon_ref.h5: cd8df1e755f731dfda828c46a503b90f
+gtplus/ProspectiveCine/gtplus_snr_unit_recon_ref_20140826.h5: 6b1c6042d2c8e98eb9983a14dbe91724
+gtplus/ProspectiveCine/meas_MID00209_FID07469_CV_gtPlus_2D_epat4.dat: c36f3c1a0fbb27fec13ea983aeceae13
+gtplus/RealTimeCine/gtplus_real_time_cine_l1spirit_ref.h5: 04a3063da8248b26cd01ad5475dffab9
+gtplus/RealTimeCine/gtplus_real_time_cine_ref.h5: 1e02fbbc9a3af892cec51dff1e12e55e
+gtplus/RealTimeCine/gtplus_real_time_cine_ref_20150328.h5: 821dfe36e7ac6b0e404731b2b973fdd1 
+gtplus/RealTimeCine/gtplus_real_time_cine_spirit_ref.h5: d9c774e7140145395d0a469c543ac3e9
+gtplus/RealTimeCine/meas_MID21_CINE_R4.dat: 3c985b16468580a056350a0fc5473934
+gtplus/RealTimeCine_9slices/gtplus_real_time_cine_l1spirit_ref.h5: d01c3a629565d18f34cf79ca0ed26133
+gtplus/RealTimeCine_9slices/gtplus_real_time_cine_ref.h5: 603a563ddd5d584226bfb3ede13a53cb
+gtplus/RealTimeCine_9slices/gtplus_real_time_cine_ref_20150328.h5: d9f71016fb7a18721a8c0d56e1828175 
+gtplus/RealTimeCine_9slices/gtplus_real_time_cine_spirit_ref.h5: 1c00dec61da7dfdb3ff1dcba4c50d152
+gtplus/RealTimeCine_9slices/meas_MID00832.dat: a8ae3a7f00ffd33ca43b33ebbdf931b0
+gtplus/sasha/20140325_15h59m29s_7720.dat: 85df92b153e6199f4892a787d1f37e96
+gtplus/sasha/gtplus_sasha_ref.h5: 1ff0885e33829c51a89c90342fcf6221
+gtplus/snr_unit_recon_builtin_noise/gtplus_snr_unit_recon_ref.h5: 25748ce15e5c28573e21bed10c726c53
+gtplus/snr_unit_recon_builtin_noise/meas_MID00127_FID02864_GRE_reps=150__WIP724_sPAT=4.dat: 76e580e1cff6091048499cfcf944e755
+gtplus/snr_unit_recon_ipat4/gtplus_snr_unit_recon_ref.h5: 7c19ad053bcc05dfb4ff777d07bf4d35
+gtplus/snr_unit_recon_ipat4/meas_MID00175_FID02912_GRE_reps=150_iPAT=4.dat: c73c9b25dab6e401f022aeef2549c911
+gtplus/snr_unit_recon_no_pat/gtplus_snr_unit_recon_ref.h5: c83d76aa5c2247506a10043bb92649e0
+gtplus/snr_unit_recon_no_pat/meas_MID00032_FID22409_oil_gre_128_150reps_pause_alpha_10.dat: ed7fb5ba56ae466f59bbe94e94ec9ca0
+gtplus/snr_unit_recon_spat2_asym_pf/gtplus_snr_unit_recon_ref.h5: ce584033fea1d6b85b40aac89f3a466a
+gtplus/snr_unit_recon_spat2_asym_pf/meas_MID00156_FID03210_GRE_reps=50_sPAT2_xRes256_dummy1_FOV240_strong_asym_7_8pf.dat: 93013c76eb771d62dc2eb3b9a5ab24a9
+gtplus/snr_unit_recon_spat3/gtplus_snr_unit_recon_ref.h5: 32111c294adc19aee28a5d0e34b47bf2
+gtplus/snr_unit_recon_spat3/meas_MID00152_FID03206_GRE_reps=20_sPAT3_xRes256_dummy1_FOV240.dat: 853ab53d494ee5f085888c7cfe57172f
+gtplus/snr_unit_recon_tpat3/gtplus_snr_unit_recon_ref.h5: 71a665565321165be3c0ce6509bf170a
+gtplus/snr_unit_recon_tpat3/meas_MID00171_FID02908_GRE_reps=150_TPAT=3.dat: 21e219f663a27c22fec4b56216b2f3ed
+gtplus/T2W/gtplus_T2W_ref.h5: 646496067fca502f86f2770f39a71be1
+gtplus/T2W/meas_MID00057_T2w.dat: 46aa75c471a41c793006328a224a4001
+epi/epi_2d_out_20150406_sji.h5: bf73508d9730c478b04e22c2323d4805
+epi/meas_MID517_nih_ep2d_bold_fa60_FID82077.dat: 8790d64a101acdc7b6990dd414b14be6
diff --git a/test/integration/get_data.py b/test/integration/get_data.py
new file mode 100644
index 0000000..a1e02f6
--- /dev/null
+++ b/test/integration/get_data.py
@@ -0,0 +1,68 @@
+import os
+import sys
+import urllib2
+import hashlib
+
+DATAFILE = "data.txt"
+DATADIR = "data"
+HOST = 'http://gadgetrontestdata.s3-website-us-east-1.amazonaws.com'
+
+def md5sum(filename, blocksize=64*1024):
+    hsh = hashlib.md5()
+    with open(filename, "r+b") as f:
+        buf = f.read(blocksize)
+        while len(buf) > 0:
+            hsh.update(buf)
+            buf = f.read(blocksize)
+    return hsh.hexdigest()
+
+def load_checksums(datafile):
+    checksums = {}
+    with open(datafile) as f:
+        for line in f:
+            filepath, checksum = line.split(':')
+            checksums[filepath.strip()] = checksum.strip()
+    return checksums
+
+def download(url, dest):
+    furl = urllib2.urlopen(url)
+    with open(dest, 'wb') as fdest:
+        fdest.write(furl.read())
+
+def main():
+    # determine test dir from full path to this script
+    testdir = os.path.dirname(os.path.realpath(sys.argv[0]))
+    datadir = os.path.join(testdir, DATADIR)
+    datafile = os.path.join(testdir, DATAFILE)
+    if not os.path.isdir(datadir):
+        os.mkdir(datadir)
+
+    print("Reading list of data from %s" % datafile)
+    try:
+        checksums = load_checksums(datafile)
+    except IOError:
+        print("Failed to read %s" % datafile)
+        return
+
+    print("Storing test data in %s" % datadir)
+
+    for dataname,checksum in checksums.items():
+        datapath = os.path.join(datadir, dataname)
+        parent = os.path.dirname(datapath)
+        if not os.path.isdir(parent):
+            os.makedirs(parent)
+        url = '%s/%s' % (HOST, dataname)
+
+        print("Verifying: %s..." % dataname)
+        # if file is missing or its checksum doesn't match, download it
+        if not os.path.isfile(datapath) or md5sum(datapath) != checksum:
+            print("Downloading: %s..." % dataname)
+            try:
+                download(url, datapath)
+            except urllib2.HTTPError, e:
+                print("HTTP Error: %d %s" % (e.code, url))
+            except urllib2.URLError, e:
+                print("URL Error: %s - %s" % (e.reason, url))
+
+if __name__ == '__main__':
+    main()
diff --git a/test/integration/run_all_tests.py b/test/integration/run_all_tests.py
new file mode 100644
index 0000000..cbeace2
--- /dev/null
+++ b/test/integration/run_all_tests.py
@@ -0,0 +1,71 @@
+import ConfigParser
+import os
+import sys
+import glob
+import subprocess
+
+def main():
+    if len(sys.argv) < 4:
+        sys.stderr.write("Missing arguments\n")
+        prog = os.path.basename(sys.argv[0])
+        help = "Usage: %s <ismrmrd_home> <gadgetron home> <test case list file> <optional: chroot path>\n" % prog
+        sys.stderr.write(help)
+        sys.exit(1)
+    ismrmrd_home = sys.argv[1]
+    gadgetron_home = sys.argv[2]
+    test_case_list = sys.argv[3]
+    pwd = os.getcwd()
+
+    if len(sys.argv) >= 5:
+        chroot_path = sys.argv[4]
+
+    test_cases = open( test_case_list, 'r' )
+    content = test_cases.read().splitlines()
+
+    test_result = True
+
+    gadgetron_outfile = open('gadgetron.log', 'w')
+    client_outfile    = open('client.log', 'w')
+
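+    # each line of the test case list names one test .cfg file (INI-style, read with ConfigParser below)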
+    for t in content:
+        print("Grabbing test case: " + t)
+
+        # We need to figure out where this test dumps log files
+        config = ConfigParser.RawConfigParser()
+        config.read(t)
+        out_folder = config.get('FILES', 'out_folder')
+        gadgetron_log_filename = os.path.join(pwd, out_folder, "gadgetron.log")
+        client_log_filename = os.path.join(pwd, out_folder, "client.log")
+
+        # Now run the test
+        if len(sys.argv) >= 5:
+            r = subprocess.call(["python", "run_gadgetron_test.py", ismrmrd_home, gadgetron_home, t, chroot_path])
+        else:
+            r = subprocess.call(["python", "run_gadgetron_test.py", ismrmrd_home, gadgetron_home, t])
+
+        # Grab the log files and append to master logs
+        gadgetron_outfile.write("==============================================\n")
+        gadgetron_outfile.write("   GADGETRON TEST CASE: " + t + "\n")
+        gadgetron_outfile.write("==============================================\n")
+        with open(gadgetron_log_filename) as infile:
+            gadgetron_outfile.write(infile.read())
+
+        client_outfile.write("==============================================\n")
+        client_outfile.write("   GADGETRON TEST CASE: " + t + "\n")
+        client_outfile.write("==============================================\n")
+        with open(client_log_filename) as infile:
+            client_outfile.write(infile.read())
+
+        if r != 0:
+            test_result = False
+            break
+
+    if test_result:
+        print("ALL TESTS: SUCCESS")
+        return 0
+    else:
+        print("ALL_TESTS:  FAILED")
+        return -100
+
+if __name__=="__main__":
+    sys.exit(main())
diff --git a/test/integration/run_gadgetron_test.py b/test/integration/run_gadgetron_test.py
new file mode 100644
index 0000000..abe8143
--- /dev/null
+++ b/test/integration/run_gadgetron_test.py
@@ -0,0 +1,425 @@
+import subprocess
+import time
+import sys
+import h5py
+import numpy
+import ConfigParser
+import os
+import shutil
+import platform
+import time
+import re
+
+def run_test(environment, testcase_cfg_file, chroot_path, port):
+    print("Running test case: " + testcase_cfg_file)
+
+    pwd = os.getcwd()
+    config = ConfigParser.RawConfigParser()
+    config.read(testcase_cfg_file)
+
+    out_folder = config.get('FILES', 'out_folder')
+    siemens_dat = os.path.join(pwd, config.get('FILES', 'siemens_dat'))
+    ismrmrd = os.path.join(pwd, out_folder, config.get('FILES', 'ismrmrd'))
+    result_h5 = os.path.join(pwd, out_folder, config.get('FILES', 'result_h5'))
+    reference_h5 = os.path.join(pwd, config.get('FILES', 'reference_h5'))
+    siemens_parameter_xml = config.get('FILES', 'siemens_parameter_xml')
+    siemens_parameter_xsl = config.get('FILES', 'siemens_parameter_xsl')
+    siemens_dependency_measurement1 = config.getint('FILES', 'siemens_dependency_measurement1')
+    siemens_dependency_measurement2 = config.getint('FILES', 'siemens_dependency_measurement2')
+    siemens_dependency_measurement3 = config.getint('FILES', 'siemens_dependency_measurement3')
+    siemens_dependency_parameter_xml = config.get('FILES', 'siemens_dependency_parameter_xml')
+    siemens_dependency_parameter_xsl = config.get('FILES', 'siemens_dependency_parameter_xsl')
+    siemens_data_measurement = config.getint('FILES', 'siemens_data_measurement')
+    gadgetron_log_filename = os.path.join(pwd, out_folder, "gadgetron.log")
+    client_log_filename = os.path.join(pwd, out_folder, "client.log")
+
+    gadgetron_configuration = config.get('TEST', 'gadgetron_configuration')
+    reference_dataset = config.get('TEST', 'reference_dataset')
+    result_dataset = config.get('TEST', 'result_dataset')
+    compare_dimensions = config.getboolean('TEST', 'compare_dimensions')
+    compare_values = config.getboolean('TEST', 'compare_values')
+    compare_scales = config.getboolean('TEST', 'compare_scales')
+    comparison_threshold_values = config.getfloat('TEST', 'comparison_threshold_values')
+    comparison_threshold_scales = config.getfloat('TEST', 'comparison_threshold_scales')
+
+    dependency_1 = os.path.join(pwd, out_folder, "dependency_1.h5")
+    dependency_2 = os.path.join(pwd, out_folder, "dependency_2.h5")
+    dependency_3 = os.path.join(pwd, out_folder, "dependency_3.h5")
+
+    if config.has_option('REQUIREMENTS','python_support'):
+        need_python_support = config.getboolean('REQUIREMENTS','python_support')
+    else:
+        need_python_support = False
+
+    if config.has_option('REQUIREMENTS','gpu_support'):
+        need_gpu_support = config.getboolean('REQUIREMENTS','gpu_support')
+    else:
+        need_gpu_support = False
+
+    if config.has_option('REQUIREMENTS','gpu_memory'):
+        need_gpu_memory = config.getfloat('REQUIREMENTS','gpu_memory')
+    else:
+        need_gpu_memory = 256
+
+    if config.has_option('REQUIREMENTS','system_memory'):
+        need_system_memory = config.getfloat('REQUIREMENTS','system_memory')
+    else:
+        need_system_memory = 1024
+
+
+    if not os.path.isfile(siemens_dat):
+        print("Can't find Siemens file %s" % siemens_dat)
+        return False
+
+    if not os.path.isfile(reference_h5):
+        print("Can't find reference HDF5 file %s" % reference_h5)
+        return False
+
+    if os.path.exists(out_folder):
+        shutil.rmtree(out_folder)
+        time.sleep(2)
+
+    os.makedirs(out_folder)
+
+    #Let's figure out if we should run this test or not
+    info = subprocess.check_output(["gadgetron_info"], env=environment);
+
+    
+    has_python_support = False
+    has_cuda_support = False
+    system_memory = 1024 #MB
+    number_of_gpus = 0
+    gpu_memory = 256 #MB
+    
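+    # gadgetron_info prints capability lines like "-- Python Support : YES"; scrape them with the regexes below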
+    p = re.compile('^[ \w]+-- Python Support     : ([A-Z]+)', re.MULTILINE)
+    m = p.search(info);
+    if m:
+        if m.group(1) == 'YES':
+            has_python_support = True
+
+    p = re.compile('^[ \w]+-- CUDA Support[ ]+: ([A-Z]+)', re.MULTILINE)
+    m = p.search(info);
+    if m:
+        if m.group(1) == 'YES':
+            has_cuda_support = True
+    
+    
+    p = re.compile('^[ \w]+\* Number of CUDA capable devices: ([0-9]+)', re.MULTILINE)
+    m = p.search(info);
+    if m:
+        number_of_gpus = int(m.group(1))
+    
+    p = re.compile('^[ \w]+-- System Memory size : ([0-9\.]+) MB', re.MULTILINE)
+    m = p.search(info);
+    if m:
+        system_memory = float(m.group(1))
+
+
+    p = re.compile('^[ \w]+\+ Total amount of global GPU memory: ([0-9\.]+) MB', re.MULTILINE)
+    m = p.search(info);
+    if m:
+        gpu_memory = float(m.group(1))
+    else:
+        gpu_memory = 0
+        has_cuda_support = False
+        number_of_gpus = 0
+
+    skipping_test = False
+        
+    if (need_system_memory > system_memory):
+        print "Test skipped because needed system memory (" + str(need_system_memory) + " MB) is larger than available system memory (" + str(system_memory) + " MB)"
+        skipping_test = True
+    
+    if (need_gpu_support and ((not has_cuda_support) or (number_of_gpus == 0) or (need_gpu_memory > gpu_memory))):
+        print "Test skipped because system does not meet gpu requirements"
+        skipping_test = True #It is not a failed test, just skipping
+        
+    if (need_python_support and (not has_python_support)):
+        print "Test skipped because Python is not available"
+        skipping_test = True
+
+    if skipping_test:
+        print "System Requirements: Actual/Required"
+        print "System Memory: " + str(system_memory) + "/" + str(need_system_memory)
+        print "Python Support: " + str(has_python_support) + "/" + str(need_python_support)
+        print "CUDA Support: " + str(has_cuda_support and (number_of_gpus > 0)) + "/" + str(need_gpu_support)
+        print "GPU Memory: " + str(gpu_memory) + "/" + str(need_gpu_memory)
+
+        with open(gadgetron_log_filename, "w") as f:
+            f.write("Test skipped because requirements not met\n")
+
+        with open(client_log_filename, "w") as f:
+            f.write("Test skipped because requirements not met\n")
+
+        return True
+    
+    #inputfilename, gadgetronconfig, referencefile, h5dataset, gadgetron_log_filename, client_log_filename):
+
+    success = True
+    gadgetron_start = "sudo " + chroot_path + "../start.sh"
+
+    with open(gadgetron_log_filename, "w") as gf:
+        if chroot_path == "Empty":
+            p = subprocess.Popen(["gadgetron", "-p", port], env=environment, stdout=gf, stderr=gf)
+        else:
+            p = subprocess.Popen(gadgetron_start, shell=True, stdout=gf, stderr=gf)
+
+        time.sleep(2)
+
+        with open(client_log_filename, "w") as cf:
+            # if there are dependencies
+            if siemens_data_measurement > 0:
+
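+                # measurement indices in the .cfg are zero-based; they are passed to siemens_to_ismrmrd's -z option incremented by 1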
+                # ------------------------------------------------------------
+                # first dependency
+                if siemens_dependency_measurement1 >= 0:
+                    print("Converting Siemens .dat file to ISMRMRD for the first dependency measurement.")
+                    r = subprocess.call(["siemens_to_ismrmrd", "-X","-f", siemens_dat, "-m",
+                                        siemens_dependency_parameter_xml, "-x", siemens_dependency_parameter_xsl, "-o",
+                                        dependency_1, "-z", str(siemens_dependency_measurement1+1)],
+                                        env=environment, stdout=cf, stderr=cf)
+                    if r != 0:
+                        print("Failed to run siemens_to_ismrmrd for the first dependency measurement!")
+                        success = False
+
+                    print("Running Gadgetron recon on the first dependency measurement")
+                    r = 0
+                    r = subprocess.call(["gadgetron_ismrmrd_client", "-p", port, "-f", dependency_1, "-c",
+                                            "default_measurement_dependencies.xml"],
+                                            env=environment, stdout=cf, stderr=cf)
+                    if r != 0:
+                        print("Failed to run gadgetron_ismrmrd_client on the first dependency measurement!")
+                        success = False
+
+                # ------------------------------------------------------------
+                # second dependency
+                if siemens_dependency_measurement2 >= 0:
+                    print("Converting Siemens .dat file to ISMRMRD for the second dependency measurement.")
+                    r = subprocess.call(["siemens_to_ismrmrd", "-X", "-f", siemens_dat, "-m",
+                                        siemens_dependency_parameter_xml, "-x", siemens_dependency_parameter_xsl, "-o",
+                                        dependency_2, "-z", str(siemens_dependency_measurement2+1)],
+                                        env=environment, stdout=cf, stderr=cf)
+                    if r != 0:
+                        print("Failed to run siemens_to_ismrmrd for the second dependency measurement!")
+                        success = False
+
+                    print("Running Gadgetron recon on the second dependency measurement")
+                    r = 0
+                    r = subprocess.call(["gadgetron_ismrmrd_client", "-p", port, "-f" , dependency_2, "-c",
+                                            "default_measurement_dependencies.xml"],
+                                            env=environment, stdout=cf, stderr=cf)
+                    
+                    if r != 0:
+                        print("Failed to run gadgetron_ismrmrd_client on the second dependency measurement!")
+                        success = False
+
+                # ------------------------------------------------------------
+                # third dependency
+                if siemens_dependency_measurement3 >= 0:
+                    print("Converting Siemens .dat file to ISMRMRD for the third dependency measurement.")
+                    r = subprocess.call(["siemens_to_ismrmrd", "-X", "-f", siemens_dat, "-m",
+                                        siemens_dependency_parameter_xml, "-x", siemens_dependency_parameter_xsl, "-o",
+                                        dependency_3, "-z", str(siemens_dependency_measurement3+1)],
+                                        env=environment, stdout=cf, stderr=cf)
+                    if r != 0:
+                        print("Failed to run siemens_to_ismrmrd for the third dependency measurement!")
+                        success = False
+
+                    print("Running Gadgetron recon on the third dependency measurement")
+                    r = 0
+                    r = subprocess.call(["gadgetron_ismrmrd_client", "-p", port, "-f", dependency_3, "-c",
+                                            "default_measurement_dependencies.xml"],
+                                            env=environment, stdout=cf, stderr=cf)
+                    
+                    if r != 0:
+                        print("Failed to run gadgetron_ismrmrd_client on the third dependency measurement!")
+                        success = False
+
+            # ---------------------------------------------------------------------------------------------
+            # now run the data measurement
+            print("Converting Siemens .dat file to ISMRMRD for data measurement.")
+            cmd = ["siemens_to_ismrmrd", "-X", "-f", siemens_dat, "-m",
+                    siemens_parameter_xml, "-x", siemens_parameter_xsl,
+                    "-o", ismrmrd, "-z", str(siemens_data_measurement+1)]
+
+            r = subprocess.call(cmd, env=environment, stdout=cf, stderr=cf)
+            if r != 0:
+                print("Failed to run siemens_to_ismrmrd!")
+                success = False
+
+            print("Running Gadgetron recon on data measurement")
+            r = 0
+            start_time = time.time()
+            r = subprocess.call(["gadgetron_ismrmrd_client", "-p", port, "-f" , ismrmrd, "-c",
+                                    gadgetron_configuration, "-G", gadgetron_configuration, "-o", result_h5],
+                                    env=environment, stdout=cf, stderr=cf)
+            print "Elapsed time: " + str(time.time()-start_time)
+            if r != 0:
+                print("Failed to run gadgetron_ismrmrd_client!")
+                success = False
+
+        p.terminate()
+
+        # make sure the gadgetron is stopped
+        if chroot_path != "Empty":
+            gadgetron_stop="sudo kill `pgrep -U root start.sh`"
+            subprocess.call(gadgetron_stop, shell=True)
+            time.sleep(1)
+
+    if not success:
+        return False
+
+    print("Comparing results")
+
+    f1 = h5py.File(result_h5)
+    f2 = h5py.File(reference_h5)
+    d1 = f1[result_dataset]
+    d2 = f2[reference_dataset]
+
+    # The shape stored by the 1.0 API is always N x Nchan x Nz x Ny x Nx
+    # Prior to 1.0, if a dimension was a singleton, it could be missing
+    # h5py returns a fixed tuple for an array shape
+    # this bit turns it into a list and removes the singletons
+    # TODO: fix the shapes in the reference data
+    # shapes_match = (d1.shape == d2.shape)
+    a1 = numpy.asarray(d1.shape)
+    a1 = a1.tolist()
+    while a1.count(1) > 0:
+        a1.remove(1)
+    a2 = numpy.asarray(d2.shape)
+    a2 = a2.tolist()
+    while a2.count(1) > 0:
+        a2.remove(1)
+    #print(" Shape 1: " + str(d1.shape) + "  numpy: " + str(a1))
+    #print(" Shape 2: " + str(d2.shape) + "  numpy: " + str(a2))
+    #print(" Compare dimensions: " + str(compare_dimensions))
+    shapes_match = (a1 == a2)
+
+    # If the types in the hdf5 are unsigned short numpy produces norms, dot products etc. in unsigned short. And that _will_ overflow...
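+    # relative error: ||result - reference||_2 / ||reference||_2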
+    norm_diff = (numpy.linalg.norm(d1[...].flatten().astype('float32') -
+                        d2[...].flatten().astype('float32')) /
+            numpy.linalg.norm(d2[...].flatten().astype('float32')))
+
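+    # intensity scale ratio: <result,result> / <result,reference>; close to 1 when the image scaling matches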
+    scale = (float(numpy.dot(d1[...].flatten().astype('float32'),
+                    d1[...].flatten().astype('float32'))) /
+            float(numpy.dot(d1[...].flatten().astype('float32'),
+                    d2[...].flatten().astype('float32'))))
+
+    result = True
+
+    if compare_dimensions:
+        print("   --Comparing dimensions: " + str(shapes_match))
+        result = result and shapes_match
+
+    if compare_values:
+        print("   --Comparing values, norm diff : %s (threshold: %s)" %
+                (str(norm_diff), str(comparison_threshold_values)))
+        result = result and (norm_diff < comparison_threshold_values)
+
+    if compare_scales:
+        print("   --Comparing image scales, ratio : %s (%s) (threshold: %s)" %
+                (str(scale), str(abs(1-scale)), str(comparison_threshold_scales)))
+        result = result and (abs(1-scale) < comparison_threshold_scales)
+
+    return result
+
+def main():
+    if len(sys.argv) < 4:
+        sys.stderr.write("Missing arguments\n")
+        prog = os.path.basename(sys.argv[0])
+        help = "Usage: %s <ismrmrd home> <gadgetron home> <test case config> <optional: chroot path>\n" % prog
+        sys.stderr.write(help)
+        sys.exit(1)
+
+    if len(sys.argv) >= 5:
+        if platform.system() != "Linux":
+            prog = os.path.basename(sys.argv[0])
+            help = "%s with chroot can only run in linux \n" % prog
+            sys.stderr.write(help)
+            sys.exit(1)
+
+    if len(sys.argv) >= 5:
+        if os.getuid() != 0:
+            prog = os.path.basename(sys.argv[0])
+            help = "%s with chroot requires root previlige to run \n" % prog
+            sys.stderr.write(help)
+            sys.exit(1)
+
+    chroot_path = "Empty"
+    port = "9003"
+    if len(sys.argv) >= 5:
+        chroot_path = sys.argv[4]
+        port = "9002"
+
+    myenv = dict()
+
+    if len(sys.argv) >= 5:
+        myenv["ISMRMRD_HOME"] = os.path.join(chroot_path, os.path.realpath(sys.argv[1]))
+        myenv["GADGETRON_HOME"] = os.path.join(chroot_path, os.path.realpath(sys.argv[2]))
+    else:
+        myenv["ISMRMRD_HOME"] = os.path.realpath(sys.argv[1])
+        myenv["GADGETRON_HOME"] = os.path.realpath(sys.argv[2])
+
+    myenv["PYTHONPATH"] = os.environ.get("PYTHONPATH", "")
+    test_case = sys.argv[3]
+
+    libpath = "LD_LIBRARY_PATH"
+    if platform.system() == "Darwin":
+        libpath = "DYLD_FALLBACK_LIBRARY_PATH"
+
+    if platform.system() == "Windows":
+        myenv["SystemRoot"] = os.environ.get('SystemRoot', "")
+        myenv["PATH"] = os.environ.get('Path', "")
+        myenv["PATH"] += myenv["ISMRMRD_HOME"] + "/lib;"
+        #myenv["PATH"] = myenv["ISMRMRD_HOME"] + "/lib;" + myenv["PATH"]
+        myenv["PATH"] += myenv["ISMRMRD_HOME"] + "/bin;"
+        #myenv["PATH"] = myenv["ISMRMRD_HOME"] + "/bin;" + myenv["PATH"]
+        myenv["PATH"] += myenv["GADGETRON_HOME"] + "/lib;"
+        #myenv["PATH"] = myenv["GADGETRON_HOME"] + "/lib;" + myenv["PATH"]
+        myenv["PATH"] += myenv["GADGETRON_HOME"] + "/bin;"
+        #myenv["PATH"] = myenv["GADGETRON_HOME"] + "/bin;" + myenv["PATH"]
+        myenv[libpath] = ""
+    else:
+        myenv[libpath] = myenv["ISMRMRD_HOME"] + "/lib:"
+        myenv[libpath] += myenv["GADGETRON_HOME"] + "/lib:"
+        myenv[libpath] += myenv["GADGETRON_HOME"] + "/../arma/lib:"
+        if len(sys.argv) >= 5:
+            myenv[libpath] += chroot_path + "/usr/local/cuda/lib64:"
+            myenv[libpath] += chroot_path + "/opt/intel/mkl/lib/intel64:"
+            myenv[libpath] += chroot_path + "/opt/intel/lib/intel64:"
+        else:
+            myenv[libpath] += "/usr/local/cuda/lib64:"
+            myenv[libpath] += "/opt/intel/mkl/lib/intel64:"
+            myenv[libpath] += "/opt/intel/lib/intel64:"
+        if os.environ.get(libpath, None) is not None:
+            myenv[libpath] += os.environ[libpath]
+        myenv["PATH"] = myenv["ISMRMRD_HOME"] + "/bin" + ":" + myenv["GADGETRON_HOME"] + "/bin:/bin:/usr/local/bin:/usr/local/sbin:/usr/bin:/bin:/usr/sbin:/sbin:/usr/local/bin"
+
+    myenv["ACE_DEBUG"] = "1"
+    #myenv["GADGETRON_LOG_MASK"] = "ALL"
+
+    if platform.system() == "Windows":
+        os.putenv('PATH', myenv['PATH'])
+    
+    print("Running Gadgetron test with: ")
+    print("  -- ISMRMRD_HOME  : " +  myenv["ISMRMRD_HOME"])
+    print("  -- GADGETRON_HOME  : " +  myenv["GADGETRON_HOME"])
+    print("  -- PATH            : " +  myenv["PATH"])
+    print("  -- " + libpath + " : " +  myenv[libpath])
+    if len(sys.argv) >= 5:
+        print("  -- chroot          : " +  chroot_path)
+    print("  -- TEST CASE       : " + test_case)
+
+    test_result = run_test(myenv, test_case, chroot_path, port)
+
+    if test_result:
+        print("TEST: " + test_case + " SUCCESS")
+        return 0
+    else:
+        print("TEST: " + test_case + " FAILED")
+        return -100
+
+if __name__=="__main__":
+    sys.exit(main())
diff --git a/test/tests.cpp b/test/tests.cpp
new file mode 100644
index 0000000..c938a37
--- /dev/null
+++ b/test/tests.cpp
@@ -0,0 +1,14 @@
+/*
+ * tests.cpp
+ *
+ *  Created on: Feb 28, 2013
+ *      Author: Dae
+ */
+
+
+#include <gtest/gtest.h>
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/test/unit/run_unit_tests.py b/test/unit/run_unit_tests.py
new file mode 100644
index 0000000..9d36301
--- /dev/null
+++ b/test/unit/run_unit_tests.py
@@ -0,0 +1,62 @@
+import subprocess
+import sys
+import os
+import platform
+
+def main():
+    if len(sys.argv) < 4:
+        sys.stderr.write("Missing arguments\n")
+        prog = os.path.basename(sys.argv[0])
+        help = "Usage: %s <ismrmrd home> <gadgetron home> <location of test_all.exe>\n" % prog
+        sys.stderr.write(help)
+        sys.exit(1)
+        
+    myenv = dict()
+    myenv["ISMRMRD_HOME"] = os.path.realpath(sys.argv[1])
+    myenv["GADGETRON_HOME"] = os.path.realpath(sys.argv[2])
+    myenv["UNITTEST_HOME"] = os.path.realpath(sys.argv[3])
+    myenv["PYTHONPATH"] = os.environ.get("PYTHONPATH", "")
+
+    libpath = "LD_LIBRARY_PATH"
+    if platform.system() == "Darwin":
+        libpath = "DYLD_FALLBACK_LIBRARY_PATH"
+
+    if platform.system() == "Windows":
+        myenv["SystemRoot"] = os.environ.get('SystemRoot', "")
+        myenv["PATH"] = os.environ.get('Path', "")
+        myenv["PATH"] += myenv["ISMRMRD_HOME"] + "/lib;"
+        myenv["PATH"] += myenv["GADGETRON_HOME"] + "/lib;"
+        myenv["PATH"] += myenv["UNITTEST_HOME"]
+        myenv[libpath] = ""
+    else:
+        myenv[libpath] = myenv["ISMRMRD_HOME"] + "/lib:"
+        myenv[libpath] += myenv["GADGETRON_HOME"] + "/lib:"
+        myenv[libpath] += myenv["GADGETRON_HOME"] + "/../arma/lib:"
+        myenv[libpath] += "/usr/local/cuda/lib64:"
+        myenv[libpath] += "/opt/intel/mkl/lib/intel64:"
+        myenv[libpath] += "/opt/intel/lib/intel64:"
+        if os.environ.get(libpath, None) is not None:
+            myenv[libpath] += os.environ[libpath]
+        myenv["PATH"] = myenv["ISMRMRD_HOME"] + "/bin" + ":" + myenv["GADGETRON_HOME"] + "/bin" + ":" + myenv["UNITTEST_HOME"]
+
+    myenv["ACE_DEBUG"] = "1"
+
+    if platform.system() == "Windows":
+        os.putenv('PATH', myenv['PATH'])
+    
+    print("Running unit tests with: ")
+    print("  -- ISMRMRD_HOME  : " +  myenv["ISMRMRD_HOME"])
+    print("  -- GADGETRON_HOME  : " +  myenv["GADGETRON_HOME"])
+    print("  -- PATH            : " +  myenv["PATH"])
+    print("  -- " + libpath + " : " +  myenv[libpath])
+    
+    r = subprocess.call("test_all.exe", env=myenv)
+    
+    if r != 0:
+        print("Failed to run unit tests!")
+        return -100
+
+    return 0
+
+if __name__=="__main__":
+    sys.exit(main())
diff --git a/test/vector_td_test.cpp b/test/vector_td_test.cpp
new file mode 100644
index 0000000..8fc3ed7
--- /dev/null
+++ b/test/vector_td_test.cpp
@@ -0,0 +1,141 @@
+/*
+ * vector_td_test.cpp
+ *
+ *  Created on: Feb 28, 2013
+ *      Author: Dae
+ */
+#include "gtest/gtest.h"
+
+
+#include <vector>
+#include "complext.h"
+#include "cuNDArray_elemwise.h"
+#include "vector_td_utilities.h"
+#include "vector_td_io.h"
+#include "cuVector_td_test_kernels.h"
+#include <sstream>
+using namespace Gadgetron;
+using testing::Types;
+template <typename T> class vector_td_Test : public ::testing::Test {
+	protected:
+	 virtual void SetUp() {
+		 size_t vdims[] = {37}; //Using prime numbers for setup because they are messy
+		 dims= std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+		 cuData = cuNDArray<vector_td<T,3> >(&dims);
+		 cuData2 = cuNDArray<vector_td<T,3> >(&dims);
+	}
+	 cuNDArray<vector_td<T,3> > cuData;
+	 cuNDArray<vector_td<T,3> > cuData2;
+	 std::vector<size_t> dims;
+
+
+};
+
+//typedef Types<float,double,float_complext,double_complext> Implementations;
+typedef Types<float,double> Implementations;
+
+TYPED_TEST_CASE(vector_td_Test, Implementations);
+
+
+TYPED_TEST(vector_td_Test,absTest){
+
+	vector_fill(&this->cuData,vector_td<TypeParam,3>(-2));
+
+	test_abs(&this->cuData);
+	vector_td<TypeParam,3> expected(2);
+	vector_td<TypeParam,3> result = this->cuData.get_device_ptr()[2];
+	EXPECT_EQ(expected,result);
+}
+
+TYPED_TEST(vector_td_Test,normTest){
+	vector_fill(&this->cuData,vector_td<TypeParam,3>(12.1));
+
+	thrust::device_vector<TypeParam> out = test_norm(&this->cuData);
+
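+	// expected norm: sqrt(3 * 12.1^2) = 12.1*sqrt(3) ~ 20.957814772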
+	EXPECT_FLOAT_EQ(real(20.957814772),out[3]);
+}
+
+
+
+TYPED_TEST(vector_td_Test,minTest){
+	vector_fill(&this->cuData,vector_td<TypeParam,3>(2.2,1.1,5.3));
+
+	thrust::device_vector<TypeParam> out = test_min(&this->cuData);
+
+	EXPECT_FLOAT_EQ(TypeParam(1.1),out[5]);
+}
+
+TYPED_TEST(vector_td_Test,maxTest){
+	vector_fill(&this->cuData,vector_td<TypeParam,3>(2.2,1.1,5.3));
+
+	thrust::device_vector<TypeParam> out = test_max(&this->cuData);
+
+	EXPECT_FLOAT_EQ(TypeParam(5.3),out[5]);
+}
+
+
+TYPED_TEST(vector_td_Test,aminTest){
+	vector_fill(&this->cuData,vector_td<TypeParam,3>(2.2,1.1,5.3));
+	vector_fill(&this->cuData2,vector_td<TypeParam,3>(20.2,0.11,5.3));
+
+	boost::shared_ptr<cuNDArray<vector_td<TypeParam,3> > > out = test_amin(&this->cuData,&this->cuData2);
+	vector_td<TypeParam,3> expected(2.2,0.11,5.3);
+	boost::shared_ptr<hoNDArray<vector_td<TypeParam,3> > > host = out->to_host();
+	EXPECT_EQ(expected,host->begin()[35]);
+}
+
+TYPED_TEST(vector_td_Test,amin2Test){
+	vector_fill(&this->cuData,vector_td<TypeParam,3>(2.2,1.1,5.3));
+
+	boost::shared_ptr<cuNDArray<vector_td<TypeParam,3> > > out = test_amin2(&this->cuData,TypeParam(4));
+	vector_td<TypeParam,3> expected(2.2,1.1,4);
+	boost::shared_ptr<hoNDArray<vector_td<TypeParam,3> > > host = out->to_host();
+	EXPECT_EQ(expected,host->begin()[35]);
+}
+
+TYPED_TEST(vector_td_Test,amaxTest){
+	vector_fill(&this->cuData,vector_td<TypeParam,3>(2.2,1.1,5.3));
+	vector_fill(&this->cuData2,vector_td<TypeParam,3>(20.2,0.11,5.3));
+
+	boost::shared_ptr<cuNDArray<vector_td<TypeParam,3> > > out = test_amax(&this->cuData,&this->cuData2);
+	vector_td<TypeParam,3> expected(20.2,1.1,5.3);
+	boost::shared_ptr<hoNDArray<vector_td<TypeParam,3> > > host = out->to_host();
+	EXPECT_EQ(expected,host->begin()[23]);
+}
+
+TYPED_TEST(vector_td_Test,amax2Test){
+	vector_fill(&this->cuData,vector_td<TypeParam,3>(2.2,1.1,5.3));
+
+	boost::shared_ptr<cuNDArray<vector_td<TypeParam,3> > > out = test_amax2(&this->cuData,TypeParam(4));
+	vector_td<TypeParam,3> expected(4,4,5.3);
+	boost::shared_ptr<hoNDArray<vector_td<TypeParam,3> > > host = out->to_host();
+	EXPECT_EQ(expected,host->begin()[26]);
+}
+
+TEST(vector_td,parseTest){
+	std::string base = "[23,22,25]";
+	std::stringstream ss(base);
+
+	vector_td<float,3> vec;
+	vector_td<float,3> res(23,22,25);
+	ss >> vec;
+
+	EXPECT_FALSE(ss.fail());
+	EXPECT_EQ(res,vec);
+
+}
+
+
+TEST(vector_td,parseEqualTest){
+	vector_td<float,3> res(23,22,25);
+	std::stringstream ss;
+	ss << res;
+
+	vector_td<float,3> vec;
+
+	ss >> vec;
+
+	EXPECT_FALSE(ss.fail());
+	EXPECT_EQ(res,vec);
+
+}
diff --git a/toolboxes/CMakeLists.txt b/toolboxes/CMakeLists.txt
new file mode 100644
index 0000000..8ba05b8
--- /dev/null
+++ b/toolboxes/CMakeLists.txt
@@ -0,0 +1,44 @@
+if (WIN32)
+    ADD_DEFINITIONS(-DTIXML_USE_STL)
+endif (WIN32)
+
+if (MKL_FOUND)
+    INCLUDE_DIRECTORIES( ${MKL_INCLUDE_DIR} )
+    LINK_DIRECTORIES( ${MKL_LIB_DIR} ${MKL_COMPILER_LIB_DIR} )
+    link_libraries(${MKL_LIBRARIES})
+endif (MKL_FOUND)
+
+add_subdirectory(log)
+
+add_subdirectory(operators)
+add_subdirectory(solvers)
+
+if (FFTW3_FOUND)
+  add_subdirectory(fft)
+  add_subdirectory(core)
+  add_subdirectory(mri_core)
+  add_subdirectory(mri)
+  add_subdirectory(ct)
+endif()
+
+add_subdirectory(nfft)
+add_subdirectory(dwt)
+add_subdirectory(registration)
+
+IF (ACE_FOUND)
+  add_subdirectory(gadgettools)
+  add_subdirectory(cloudbus)
+ENDIF()
+
+IF (FFTW3_FOUND AND ISMRMRD_FOUND)
+  add_subdirectory(gtplus)
+ENDIF()
+
+find_package(BLAS)
+find_package(LAPACK)
+
+# Should we compile the python toolbox
+find_package(Boost COMPONENTS python system thread REQUIRED)
+if (Boost_PYTHON_FOUND AND PYTHONLIBS_FOUND AND NUMPY_FOUND)
+    add_subdirectory(python)
+endif (Boost_PYTHON_FOUND AND PYTHONLIBS_FOUND AND NUMPY_FOUND)
diff --git a/toolboxes/cloudbus/CMakeLists.txt b/toolboxes/cloudbus/CMakeLists.txt
new file mode 100644
index 0000000..53fdf47
--- /dev/null
+++ b/toolboxes/cloudbus/CMakeLists.txt
@@ -0,0 +1,33 @@
+find_package(ACE REQUIRED)
+find_package(Boost REQUIRED)
+
+include_directories(${ACE_INCLUDE_DIR}
+                    ${Boost_INCLUDE_DIR} 
+                    )
+
+add_library(gadgetron_toolbox_cloudbus SHARED
+  CloudBus.cpp
+  CloudBus.h
+  cloudbus_export.h
+)
+
+target_link_libraries(gadgetron_toolbox_cloudbus
+		     gadgetron_toolbox_log
+                     optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY})
+
+set_target_properties(gadgetron_toolbox_cloudbus PROPERTIES COMPILE_DEFINITIONS "__BUILD_GADGETRON_CLOUDBUS__")
+set_target_properties(gadgetron_toolbox_cloudbus  PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+add_executable(gadgetron_cloudbus cloudbus_main.cpp)
+target_link_libraries(gadgetron_cloudbus 
+                     gadgetron_toolbox_cloudbus 
+		     gadgetron_toolbox_log
+                     optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} )
+
+install(TARGETS gadgetron_toolbox_cloudbus DESTINATION lib COMPONENT main)
+install(TARGETS gadgetron_cloudbus DESTINATION bin COMPONENT main)
+
+install(FILES 
+  CloudBus.h
+  cloudbus_export.h
+  DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
diff --git a/toolboxes/cloudbus/CloudBus.cpp b/toolboxes/cloudbus/CloudBus.cpp
new file mode 100644
index 0000000..41f73cb
--- /dev/null
+++ b/toolboxes/cloudbus/CloudBus.cpp
@@ -0,0 +1,248 @@
+#include "CloudBus.h"
+#include "log.h"
+
+namespace Gadgetron
+{
+  CloudBus* CloudBus::instance_ = 0;
+  const char* CloudBus::mcast_inet_addr_ = GADGETRON_DEFAULT_MULTICAST_ADDR;
+  int CloudBus::mcast_port_ = GADGETRON_DEFAULT_MULTICAST_PORT;
+  bool CloudBus::query_mode_ = false; //Listen-only mode is disabled by default
+  int CloudBus::gadgetron_port_ = 9002; //Default port
+
+  CloudBusTask::CloudBusTask(int port, const char* addr)
+    : inherited()
+    , mcast_addr_(port, addr)
+    , mcast_dgram_(ACE_SOCK_Dgram_Mcast::OPT_BINDADDR_NO)
+  {
+  }
+
+  CloudBusTask::CloudBusTask()
+    : inherited()
+    , mcast_addr_(GADGETRON_DEFAULT_MULTICAST_PORT, GADGETRON_DEFAULT_MULTICAST_ADDR)
+    , mcast_dgram_(ACE_SOCK_Dgram_Mcast::OPT_BINDADDR_NO)
+  {
+  }
+    
+  int CloudBusTask::open(void*)
+  {
+    return this->activate( THR_NEW_LWP | THR_JOINABLE,1); //single thread
+  }
+
+  CloudBusReceiverTask::CloudBusReceiverTask(int port, const char* addr)
+    : CloudBusTask(port, addr)
+  {
+    
+  }
+
+  int CloudBusReceiverTask::open(void*)
+  {
+
+#if defined(__linux) || defined(__linux__) || defined(linux)
+    //On linux we will loop through all names interfaces and join as many as we can
+    struct if_nameindex *intf;
+    intf = if_nameindex ();
+    if (intf == 0) {
+      GERROR("Unable to get names of network interfaces\n");
+      return -1;
+    }
+    
+    int ifs_joined = 0;
+    int index = 0;
+    while (intf[index].if_index != 0 || intf[index].if_name != 0) {
+      if (mcast_dgram_.join(mcast_addr_,1,intf[index].if_name) != -1) {
+	++ifs_joined;
+      }
+      ++index;
+    }      
+    if_freenameindex (intf);
+
+    if (!ifs_joined) {
+      GERROR_STREAM("Error doing dgram join");
+      return -1;
+    }
+#else
+    if (mcast_dgram_.join(mcast_addr_) == -1) {
+      GERROR_STREAM("Error doing dgram join");
+      return -1;
+    }
+#endif
+
+    return CloudBusTask::open();      
+  }
+
+  int CloudBusReceiverTask::close(u_long flags)
+  {
+    mcast_dgram_.leave(mcast_addr_);
+    return CloudBusTask::close(flags);
+  }
+
+  int CloudBusReceiverTask::svc(void)
+  {
+    char buffer[GADGETRON_NODE_INFO_MESSAGE_LENGTH]; //Size of message
+    GadgetronNodeInfo info;
+    ACE_INET_Addr peer_address;
+    while (mcast_dgram_.recv(buffer, GADGETRON_NODE_INFO_MESSAGE_LENGTH, peer_address) != -1)
+      {
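+	// datagram layout (mirrors CloudBusSenderTask::svc): 16-byte uuid, then uint32 port, then uint32 compute_capability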
+	info.uuid = boost::uuids::to_string(*((boost::uuids::uuid*)buffer));
+	info.address = std::string(peer_address.get_host_addr());
+	memcpy(&info.port              , buffer + 16,                    sizeof(uint32_t));
+	memcpy(&info.compute_capability, buffer + 16 + sizeof(uint32_t), sizeof(uint32_t));
+	CloudBus::instance()->update_node(info.uuid.c_str(), info);
+      }
+
+    return 0;
+  }
+
+  CloudBusSenderTask::CloudBusSenderTask(int port, const char* addr)
+    : CloudBusTask(port, addr)
+  {
+    
+  }
+
+  int CloudBusSenderTask::open(void*)
+  {
+    if (mcast_dgram_.open(mcast_addr_) == -1) {
+      GDEBUG_STREAM("Error doing dgram open");
+      return -1;
+    }
+    return CloudBusTask::open();      
+  }
+
+  int CloudBusSenderTask::svc(void)
+  {
+    char buffer[GADGETRON_NODE_INFO_MESSAGE_LENGTH]; //Size of message
+    if (CloudBus::instance()->uuid_.size() != 16) {
+      GDEBUG_STREAM("Severe problem, UUID is != 16");
+      GDEBUG_STREAM("uuid: " << CloudBus::instance()->uuid_ << "(" << CloudBus::instance()->uuid_.size() << ")");
+    }
+    
+    memcpy(buffer                        ,  CloudBus::instance()->uuid_.begin(), 16);
+    memcpy(buffer + 16,                    &CloudBus::instance()->node_info_.port, sizeof(uint32_t));
+    memcpy(buffer + 16 + sizeof(uint32_t), &CloudBus::instance()->node_info_.compute_capability, sizeof(uint32_t));
+    
+    while (true) {
+      if (!CloudBus::instance()->query_mode_) {
+	if (mcast_dgram_.send(buffer, GADGETRON_NODE_INFO_MESSAGE_LENGTH) == -1) {
+	  GDEBUG_STREAM("Failed to send dgram data");
+	}
+      }
+      CloudBus::instance()->remove_stale_nodes();
+      ACE_OS::sleep(5);//Sleep for 5 seconds
+    }
+
+    return 0;
+  }
+
+  CloudBus* CloudBus::instance()
+  {
+    if (!instance_)
+      {
+	instance_ = new CloudBus(mcast_port_, mcast_inet_addr_);
+	instance_->receiver_.open();
+	instance_->sender_.open();
+      }
+    return instance_;
+  }
+
+  
+  void CloudBus::set_mcast_address(const char* addr)
+  {
+    mcast_inet_addr_ = addr;
+  }
+
+  void CloudBus::set_mcast_port(int port)
+  {
+    mcast_port_ = port;
+  }
+
+  void CloudBus::set_query_only(bool m)
+  {
+    query_mode_ = m;
+  }
+ 
+  void CloudBus::set_gadgetron_port(uint32_t port)
+  {
+    gadgetron_port_ = port;
+  }
+
+  void CloudBus::wait()
+  {
+    sender_.wait();
+    receiver_.wait();
+    receiver_.close();
+  }
+
+  void CloudBus::get_node_info(std::vector<GadgetronNodeInfo>& nodes)
+  {
+    mtx_.acquire();
+    nodes.clear();
+    for (map_type_::iterator it = nodes_.begin(); it != nodes_.end(); ++it) {
+      GadgetronNodeInfo n = it->second.first;
+      nodes.push_back(n);
+    }
+    mtx_.release();
+  }
+  
+  size_t CloudBus::get_number_of_nodes()
+  {
+    size_t n = 0;
+    mtx_.acquire();
+    n = nodes_.size();
+    mtx_.release();
+    return n;
+  }
+
+  CloudBus::CloudBus(int port, const char* addr)
+    : receiver_(port, addr)
+    , sender_(port, addr)
+    , mtx_("CLOUDBUSMTX")
+    , uuid_(boost::uuids::random_generator()())
+  {
+    node_info_.port = gadgetron_port_;
+    set_compute_capability(1);
+    node_info_.uuid = boost::uuids::to_string(uuid_);
+    ACE_SOCK_Acceptor listener (ACE_Addr::sap_any);
+    ACE_INET_Addr local_addr;
+    listener.get_local_addr (local_addr);
+    node_info_.address = std::string(local_addr.get_host_name());
+  }
+
+  void CloudBus::update_node(const char* a, GadgetronNodeInfo& info)
+  {
+    mtx_.acquire();
+    std::string key(a);
+    map_type_::iterator it = nodes_.find(key);
+    if (it == nodes_.end()) {
+      if (info.uuid != node_info_.uuid) { //Reject stuff coming from myself
+	GDEBUG_STREAM("---->>>> New Cloud Node <<<<< ----- " << info.uuid << " (" << info.address << ":" << info.port << ", " << info.compute_capability << ")");
+      } 
+    } 
+
+    if (info.uuid != node_info_.uuid) {
+      nodes_[key] = std::pair<GadgetronNodeInfo,time_t>(info,time(NULL));
+    }
+    mtx_.release();
+  }
+
+  void CloudBus::remove_stale_nodes()
+  {
+    mtx_.acquire();
+    map_type_ new_nodes_;
+    time_t now = time(NULL);
+    for (map_type_::iterator it = nodes_.begin(); it != nodes_.end(); ++it) {
+      if (fabs(difftime(it->second.second,now)) > 30) {
+        GadgetronNodeInfo n = it->second.first;
+        GDEBUG_STREAM("---->>>> DELETING STALE CLOUD NODE <<<<< ----- " << n.uuid << " (" << n.address << ":" << n.port  << ", " << n.compute_capability << ")");
+      }
+      else
+      {
+        new_nodes_[it->first] = it->second;
+      }
+    }
+
+    nodes_.clear();
+    nodes_ = new_nodes_;
+
+    mtx_.release();
+  }
+}
diff --git a/toolboxes/cloudbus/CloudBus.h b/toolboxes/cloudbus/CloudBus.h
new file mode 100644
index 0000000..bde427b
--- /dev/null
+++ b/toolboxes/cloudbus/CloudBus.h
@@ -0,0 +1,117 @@
+#ifndef GADGETRON_CLOUDBUS_H
+#define GADGETRON_CLOUDBUS_H
+
+#include "cloudbus_export.h"
+#include <ace/Task.h>
+#include <ace/INET_Addr.h>
+#include <ace/SOCK_Dgram_Mcast.h>
+#include <ace/OS_NS_unistd.h>
+#include <ace/SOCK_Acceptor.h>
+
+#include <boost/uuid/uuid.hpp>
+#include <boost/uuid/uuid_generators.hpp>
+#include <boost/uuid/uuid_io.hpp>
+
+#include <iostream>
+#include <map>
+#include <utility>
+#include <time.h>
+#include <stdint.h>
+#include <string>
+#include <vector>
+
+#define GADGETRON_DEFAULT_MULTICAST_ADDR "224.9.9.2"
+#define GADGETRON_DEFAULT_MULTICAST_PORT 4148
+#define GADGETRON_NODE_INFO_MESSAGE_LENGTH (16+sizeof(uint32_t)*2) //16 bytes for uuid + 2 uint32 fields
+
+namespace Gadgetron
+{
+
+  struct GadgetronNodeInfo
+  {
+    std::string uuid;
+    std::string address;
+    uint32_t port;
+    uint32_t compute_capability;
+  };
+  
+  class CloudBusTask : public ACE_Task<ACE_MT_SYNCH>
+  {
+  public:
+    typedef ACE_Task<ACE_MT_SYNCH> inherited;    
+    CloudBusTask(int port, const char* addr);
+    CloudBusTask();
+    virtual int open(void* = 0);
+
+  protected:
+    ACE_SOCK_Dgram_Mcast mcast_dgram_;
+    ACE_INET_Addr mcast_addr_;
+  };
+
+  class CloudBusReceiverTask : public CloudBusTask
+  {
+  public:
+    CloudBusReceiverTask(int port, const char* addr);
+    virtual int open(void* = 0);
+    virtual int close(u_long flags = 0);
+    virtual int svc(void);
+  };
+
+
+  class CloudBusSenderTask : public CloudBusTask
+  {
+  public:
+    CloudBusSenderTask(int port, const char* addr);
+    virtual int open(void* = 0);
+    virtual int svc(void);
+  };
+
+  class EXPORTCLOUDBUS CloudBus
+  {
+    friend class CloudBusReceiverTask;
+    friend class CloudBusSenderTask;
+
+    typedef std::map<std::string, std::pair<GadgetronNodeInfo, time_t> > map_type_;
+
+  public:
+    static CloudBus* instance();
+    static void set_mcast_address(const char* addr);
+    static void set_mcast_port(int port);
+    static void set_query_only(bool m = true);
+    static void set_gadgetron_port(uint32_t port);
+
+    void set_compute_capability(uint32_t c)
+    {
+      node_info_.compute_capability = c;
+    }
+
+    void wait();
+
+    void get_node_info(std::vector<GadgetronNodeInfo>& nodes);
+    size_t get_number_of_nodes();
+
+  protected:
+    ///Protected constructor. 
+    CloudBus(int port, const char* addr);
+
+    void update_node(const char* a, GadgetronNodeInfo& info);
+    void remove_stale_nodes();
+    
+    static CloudBus* instance_;
+    static const char* mcast_inet_addr_;
+    static int mcast_port_;
+    static bool query_mode_; //Listen only
+    static int gadgetron_port_;
+
+    GadgetronNodeInfo node_info_;
+    map_type_ nodes_;
+    
+    CloudBusReceiverTask receiver_;
+    CloudBusSenderTask   sender_;
+    ACE_Thread_Mutex mtx_;
+
+    boost::uuids::uuid uuid_;
+  };
+
+
+}
+
+#endif
diff --git a/toolboxes/cloudbus/cloudbus_export.h b/toolboxes/cloudbus/cloudbus_export.h
new file mode 100644
index 0000000..b85fed2
--- /dev/null
+++ b/toolboxes/cloudbus/cloudbus_export.h
@@ -0,0 +1,14 @@
+#ifndef CLOUDBUS_EXPORT_H_
+#define CLOUDBUS_EXPORT_H_
+
+#if defined (WIN32)
+    #if defined (__BUILD_GADGETRON_CLOUDBUS__) || defined (gadgetron_toolbox_cloudbus_EXPORTS)
+        #define EXPORTCLOUDBUS __declspec(dllexport)
+    #else
+        #define EXPORTCLOUDBUS __declspec(dllimport)
+    #endif
+#else
+    #define EXPORTCLOUDBUS
+#endif
+
+#endif
diff --git a/toolboxes/cloudbus/cloudbus_main.cpp b/toolboxes/cloudbus/cloudbus_main.cpp
new file mode 100644
index 0000000..65a8cff
--- /dev/null
+++ b/toolboxes/cloudbus/cloudbus_main.cpp
@@ -0,0 +1,36 @@
+#include <iostream>
+#include "log.h"
+
+#include "CloudBus.h"
+
+int main(int argc, char** argv)
+{
+  GDEBUG_STREAM("CloudBus Main Program" << std::endl);
+
+  int port = GADGETRON_DEFAULT_MULTICAST_PORT;
+  const char* addr = GADGETRON_DEFAULT_MULTICAST_ADDR;
+  bool query_only_mode = true;
+
+  if (argc > 1) {
+    addr = argv[1];
+    GDEBUG_STREAM("Setting multicast address to: " << addr << std::endl);
+  }
+
+  if (argc > 2) {
+    port = std::atoi(argv[2]);
+    GDEBUG_STREAM("Setting multicast port to: " << port << std::endl);
+  }
+
+  if (argc > 3)
+  {
+    query_only_mode = false;
+  }
+
+  //Port and address must be set before grabbing the instance for the first time. 
+  Gadgetron::CloudBus::set_mcast_address(addr);
+  Gadgetron::CloudBus::set_mcast_port(port);
+  Gadgetron::CloudBus::set_query_only(query_only_mode);
+  Gadgetron::CloudBus* cb = Gadgetron::CloudBus::instance();
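+  // Sketch: the same singleton can be queried for the peers it has discovered, e.g.
+  //   std::vector<Gadgetron::GadgetronNodeInfo> nodes;
+  //   cb->get_node_info(nodes);
+  //   GDEBUG_STREAM("Known nodes: " << nodes.size() << std::endl);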
+  cb->wait();
+  return 0;
+}
diff --git a/toolboxes/core/CMakeLists.txt b/toolboxes/core/CMakeLists.txt
new file mode 100644
index 0000000..5e4cb34
--- /dev/null
+++ b/toolboxes/core/CMakeLists.txt
@@ -0,0 +1,25 @@
+include_directories(
+  ${Boost_INCLUDE_DIR}
+  )
+
+configure_file(core_defines.h.in ${CMAKE_CURRENT_SOURCE_DIR}/core_defines.h)
+
+install(FILES 
+  core_defines.h
+  NDArray.h
+  complext.h
+  vector_td.h
+  vector_td_operators.h
+  vector_td_utilities.h
+  vector_td_io.h
+  real_utilities.h
+  GadgetronException.h
+  GadgetronTimer.h
+  Gadgetron_enable_types.h
+  DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
+add_subdirectory(cpu)
+
+if (CUDA_FOUND)
+  add_subdirectory(gpu)
+endif (CUDA_FOUND)
diff --git a/toolboxes/core/GadgetronException.h b/toolboxes/core/GadgetronException.h
new file mode 100644
index 0000000..ab38693
--- /dev/null
+++ b/toolboxes/core/GadgetronException.h
@@ -0,0 +1,33 @@
+/** \file GadgetronException.h
+    \brief An interface to the exception handling used in the Gadgetron to indicate runtime errors.
+*/
+
+#pragma once
+
+#include <iostream>
+#include <exception>
+#include <stdexcept>
+#include <string>
+
+namespace Gadgetron{
+
+  class runtime_error: virtual public std::exception 
+  {
+  public:
+    runtime_error() : std::exception(), msg(){}
+    runtime_error(std::string _msg) : std::exception(), msg(_msg){
+    }
+    virtual ~runtime_error() throw() {}
+    virtual const char * what() const throw(){
+      // an empty message falls back to the base class description
+      if (!msg.empty()) return msg.c_str();
+      else return std::exception::what();
+    }
+  protected:
+    std::string msg; // stored by value so what() never returns a dangling pointer
+  };
+  
+  class bad_alloc : public runtime_error 
+  {
+  public:
+    bad_alloc(std::string msg) : runtime_error(msg){}
+    bad_alloc() : runtime_error(){}
+  };
+}
diff --git a/toolboxes/core/GadgetronTimer.h b/toolboxes/core/GadgetronTimer.h
new file mode 100644
index 0000000..d185133
--- /dev/null
+++ b/toolboxes/core/GadgetronTimer.h
@@ -0,0 +1,110 @@
+/** \file GadgetronTimer.h
+    \brief Generic timer class to measure runtime performance.
+*/
+
+#ifndef __GADGETRONTIMER_H
+#define __GADGETRONTIMER_H
+
+#pragma once
+
+#ifdef WIN32 
+#include <windows.h>
+#else 
+#include <sys/time.h>
+#endif
+
+#include <string>
+#include "log.h"
+
+namespace Gadgetron{
+
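+  // Scope-based usage (sketch): constructing a named timer starts it, and the
+  // destructor calls stop(), which logs "<name>:<elapsed> ms":
+  //
+  //   {
+  //     GadgetronTimer timer("my_operation");
+  //     // ... work to be timed ...
+  //   }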
+  class GadgetronTimer
+  {
+  public:
+
+    GadgetronTimer() : name_("GPUTimer"), timing_in_destruction_(true)
+    {
+        pre();
+        start();
+    }
+
+    GadgetronTimer(bool timing) : name_("GPUTimer"), timing_in_destruction_(timing)
+    {
+        if ( timing_in_destruction_ )
+        {
+            pre();
+            start();
+        }
+    }
+
+    GadgetronTimer(const char* name, bool timing=true) : name_(name), timing_in_destruction_(timing) 
+    {
+        if ( timing_in_destruction_ )
+        {
+            pre();
+            start();
+        }
+    }
+
+    virtual ~GadgetronTimer() 
+    {
+        if ( timing_in_destruction_ )
+        {
+            post();
+            stop();
+        }
+    }
+
+    virtual void pre() {}
+    virtual void post() {}
+
+    virtual void start()
+    {
+#ifdef WIN32
+        QueryPerformanceFrequency(&frequency_);
+        QueryPerformanceCounter(&start_);
+#else
+        gettimeofday(&start_, NULL);
+#endif
+    }
+
+    void start(const char* name)
+    {
+        name_ = name;
+        start();
+    }
+
+    virtual double stop()
+    {
+        double time_in_us = 0.0;
+#ifdef WIN32
+        QueryPerformanceCounter(&end_);
+        time_in_us = (end_.QuadPart * (1.0e6/ frequency_.QuadPart)) - start_.QuadPart * (1.0e6 / frequency_.QuadPart);
+#else
+        gettimeofday(&end_, NULL);
+        time_in_us = ((end_.tv_sec * 1e6) + end_.tv_usec) - ((start_.tv_sec * 1e6) + start_.tv_usec);
+#endif
+	GDEBUG("%s:%f ms\n", name_.c_str(), time_in_us/1000.0);
+        return time_in_us;
+    }
+
+    void set_timing_in_destruction(bool timing) { timing_in_destruction_ = timing; }
+
+  protected:
+
+#ifdef WIN32
+    LARGE_INTEGER frequency_;
+    LARGE_INTEGER start_;
+    LARGE_INTEGER end_;
+#else
+    timeval start_;
+    timeval end_;
+#endif
+
+    std::string name_;
+
+    bool timing_in_destruction_;
+  };
+}
+
+#endif //__GADGETRONTIMER_H
diff --git a/toolboxes/core/Gadgetron_enable_types.h b/toolboxes/core/Gadgetron_enable_types.h
new file mode 100644
index 0000000..bc70a5e
--- /dev/null
+++ b/toolboxes/core/Gadgetron_enable_types.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include <boost/type_traits.hpp>
+#include "complext.h"
+
+namespace Gadgetron {
+	template<class T> struct enable_operators : public boost::false_type{};
+	template<> struct enable_operators<float> : public boost::true_type{};
+	template<> struct enable_operators<Gadgetron::complext<float> > : public boost::true_type{};
+	template<> struct enable_operators<double> : public boost::true_type{};
+	template<> struct enable_operators<Gadgetron::complext<double> > : public boost::true_type{};
+}
diff --git a/toolboxes/core/NDArray.h b/toolboxes/core/NDArray.h
new file mode 100644
index 0000000..edc4ce3
--- /dev/null
+++ b/toolboxes/core/NDArray.h
@@ -0,0 +1,802 @@
+/** \file NDArray.h
+\brief Abstract base class for all Gadgetron host and device arrays
+*/
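+//
+// Usage sketch (through a concrete subclass such as hoNDArray<T> or cuNDArray<T>):
+//
+//   std::vector<size_t> dims(2, 64);   // a 64 x 64 array
+//   array.create(dims);                // allocate storage
+//   array(3, 4) = 1.0f;                // element access via operator()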
+
+#ifndef NDARRAY_H
+#define NDARRAY_H
+#pragma once
+
+#include "GadgetronException.h"
+#include "log.h"
+
+#include <new>
+#include <vector>
+#include <iostream>
+#include <stdexcept>
+
+#include <boost/shared_ptr.hpp>
+#include <boost/cast.hpp>
+
+namespace Gadgetron{
+
+    template <typename T> class NDArray
+    {
+    public:
+
+        typedef T element_type;
+        typedef T value_type;
+
+        NDArray () : data_(0), elements_(0), delete_data_on_destruct_(true)
+        {
+            dimensions_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+            offsetFactors_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+        }
+
+        virtual ~NDArray() {}
+
+        virtual void create(std::vector<size_t> &dimensions);
+        virtual void create(std::vector<size_t> *dimensions);
+        virtual void create(boost::shared_ptr< std::vector<size_t> > dimensions);
+
+        virtual void create(std::vector<size_t> &dimensions, T* data, bool delete_data_on_destruct = false);
+        virtual void create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+        virtual void create(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct = false);
+
+        void squeeze();
+
+        void reshape(const std::vector<size_t> *dims);
+        void reshape(const std::vector<size_t> & dims){ this->reshape(&dims);}
+        void reshape(boost::shared_ptr< std::vector<size_t> > dims);
+
+        bool dimensions_equal(std::vector<size_t> *d) const;
+
+        template<class S> bool dimensions_equal(const NDArray<S> *a) const
+        {
+            //boost::shared_ptr<std::vector<size_t > > adims = a->get_dimensions();
+            //return ((this->dimensions_->size() == adims->size()) &&
+            //    std::equal(this->dimensions_->begin(), this->dimensions_->end(), adims->begin()));
+
+            std::vector<size_t>* dim;
+            a->get_dimensions(dim);
+
+            if ( this->dimensions_->size() != dim->size() ) return false;
+
+            size_t NDim = this->dimensions_->size();
+            for ( size_t d=0; d<NDim; d++ )
+            {
+                if ( (*this->dimensions_)[d] != (*dim)[d] ) return false;
+            }
+
+            return true;
+        }
+
+        size_t get_number_of_dimensions() const;
+
+        size_t get_size(size_t dimension) const;
+
+        boost::shared_ptr< std::vector<size_t> > get_dimensions() const;
+        void get_dimensions(std::vector<size_t>*& dim) const;
+        void get_dimensions(std::vector<size_t>& dim) const;
+
+        T* get_data_ptr() const;
+
+        size_t get_number_of_elements() const;
+
+        size_t get_number_of_bytes() const;
+
+        bool delete_data_on_destruct() const;
+        void delete_data_on_destruct(bool d);
+
+        size_t calculate_offset(const std::vector<size_t>& ind) const;
+        static size_t calculate_offset(const std::vector<size_t>& ind, const std::vector<size_t>& offsetFactors);
+
+        size_t calculate_offset(size_t x, size_t y) const;
+        size_t calculate_offset(size_t x, size_t y, size_t z) const;
+        size_t calculate_offset(size_t x, size_t y, size_t z, size_t s) const;
+        size_t calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p) const;
+        size_t calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r) const;
+        size_t calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a) const;
+        size_t calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q) const;
+        size_t calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u) const;
+
+        size_t get_offset_factor(size_t dim) const;
+        void get_offset_factor(std::vector<size_t>& offset) const;
+        boost::shared_ptr< std::vector<size_t> > get_offset_factor() const;
+
+        size_t get_offset_factor_lastdim() const;
+
+        void calculate_offset_factors(const std::vector<size_t>& dimensions);
+        static void calculate_offset_factors(const std::vector<size_t>& dimensions, std::vector<size_t>& offsetFactors);
+
+        std::vector<size_t> calculate_index( size_t offset ) const;
+        void calculate_index( size_t offset, std::vector<size_t>& index ) const;
+        static void calculate_index( size_t offset, const std::vector<size_t>& offsetFactors, std::vector<size_t>& index );
+
+        void clear();
+
+        T& operator()( const std::vector<size_t>& ind );
+        const T& operator()( const std::vector<size_t>& ind ) const;
+
+        T& operator()( size_t x );
+        const T& operator()( size_t x ) const;
+
+        T& operator()( size_t x, size_t y );
+        const T& operator()( size_t x, size_t y ) const;
+
+        T& operator()( size_t x, size_t y, size_t z );
+        const T& operator()( size_t x, size_t y, size_t z ) const;
+
+        T& operator()( size_t x, size_t y, size_t z, size_t s );
+        const T& operator()( size_t x, size_t y, size_t z, size_t s ) const;
+
+        T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p );
+        const T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p ) const;
+
+        T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r );
+        const T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r ) const;
+
+        T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a );
+        const T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a ) const;
+
+        T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q );
+        const T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q ) const;
+
+        T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u );
+        const T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u ) const;
+
+        /// whether a point is within the array range
+        bool point_in_range(const std::vector<size_t>& ind) const;
+        bool point_in_range(size_t x) const;
+        bool point_in_range(size_t x, size_t y) const;
+        bool point_in_range(size_t x, size_t y, size_t z) const;
+        bool point_in_range(size_t x, size_t y, size_t z, size_t s) const;
+        bool point_in_range(size_t x, size_t y, size_t z, size_t s, size_t p) const;
+        bool point_in_range(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r) const;
+        bool point_in_range(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a) const;
+        bool point_in_range(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q) const;
+        bool point_in_range(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u) const;
+
+    protected:
+
+        virtual void allocate_memory() = 0;
+        virtual void deallocate_memory() = 0;
+
+    protected:
+
+        boost::shared_ptr< std::vector<size_t> > dimensions_;
+        boost::shared_ptr< std::vector<size_t> > offsetFactors_;
+        T* data_;
+        size_t elements_;
+        bool delete_data_on_destruct_;
+    };
+
+    template <typename T> 
+    inline void NDArray<T>::create(std::vector<size_t> *dimensions) 
+    {
+        if(!dimensions) throw std::runtime_error("NDArray<T>::create(): 0x0 pointer provided");
+        std::vector<size_t> *tmp = new std::vector<size_t>;
+        *tmp = *dimensions;
+        dimensions_ = boost::shared_ptr< std::vector<size_t> >(tmp);
+        allocate_memory();
+        calculate_offset_factors(*dimensions_);
+    }
+
+    template <typename T> 
+    inline void NDArray<T>::create(std::vector<size_t>& dimensions) 
+    {
+        std::vector<size_t> *tmp = new std::vector<size_t>;
+        *tmp = dimensions;
+        dimensions_ = boost::shared_ptr< std::vector<size_t> >(tmp);
+        allocate_memory();
+        calculate_offset_factors(*dimensions_);
+    }
+
+    template <typename T> 
+    inline void NDArray<T>::create(boost::shared_ptr< std::vector<size_t> > dimensions)
+    {
+        this->create(dimensions.get());
+    }
+
+    template <typename T> 
+    void NDArray<T>::create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct) 
+    {
+        if (!dimensions) throw std::runtime_error("NDArray<T>::create(): 0x0 pointer provided");
+        if (!data) throw std::runtime_error("NDArray<T>::create(): 0x0 pointer provided");    
+        std::vector<size_t> *tmp = new std::vector<size_t>;
+        *tmp = *dimensions;
+        dimensions_ = boost::shared_ptr< std::vector<size_t> >(tmp);
+        this->data_ = data;
+        this->delete_data_on_destruct_ = delete_data_on_destruct;
+        this->elements_ = 1;
+        for (size_t i = 0; i < this->dimensions_->size(); i++){
+            this->elements_ *= (*this->dimensions_)[i];
+        }
+        calculate_offset_factors(*dimensions_);
+    }
+
+    template <typename T> 
+    void NDArray<T>::create(std::vector<size_t> &dimensions, T* data, bool delete_data_on_destruct) 
+    {
+        if (!data) throw std::runtime_error("NDArray<T>::create(): 0x0 pointer provided");    
+        std::vector<size_t> *tmp = new std::vector<size_t>;
+        *tmp = dimensions;
+        dimensions_ = boost::shared_ptr< std::vector<size_t> >(tmp);
+        this->data_ = data;
+        this->delete_data_on_destruct_ = delete_data_on_destruct;
+        this->elements_ = 1;
+        for (size_t i = 0; i < this->dimensions_->size(); i++){
+            this->elements_ *= (*this->dimensions_)[i];
+        }
+        calculate_offset_factors(*dimensions_);
+    }
+
+    template <typename T> 
+    inline void NDArray<T>::create(boost::shared_ptr<std::vector<size_t>  > dimensions, 
+        T* data, bool delete_data_on_destruct)
+    {
+        this->create(dimensions.get(), data, delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    inline void NDArray<T>::squeeze()
+    {
+        boost::shared_ptr< std::vector<size_t> > new_dimensions( new std::vector<size_t> ); 
+        for (size_t i = 0; i < dimensions_->size(); i++){
+            if ((*dimensions_)[i] != 1){
+                new_dimensions->push_back((*dimensions_)[i]);
+            }
+        }    
+        dimensions_ = new_dimensions;
+        this->calculate_offset_factors(*dimensions_);
+    }
+
+    template <typename T> 
+    inline void NDArray<T>::reshape(const std::vector<size_t> *dims)
+    {
+        size_t new_elements = 1;
+        for (size_t i = 0; i < dims->size(); i++){
+            new_elements *= (*dims)[i];
+        }    
+
+        if (new_elements != elements_)
+            throw std::runtime_error("NDArray<T>::reshape : Number of elements cannot change during reshape");    
+
+        // Copy the input dimensions array
+        std::vector<size_t> *tmp = new std::vector<size_t>;
+        *tmp = *dims;
+        dimensions_ = boost::shared_ptr< std::vector<size_t> >(tmp);
+        this->calculate_offset_factors(*dimensions_);
+    }
+
+    template <typename T> 
+    inline void NDArray<T>::reshape( boost::shared_ptr< std::vector<size_t> > dims )
+    {
+        this->reshape(dims.get());
+    }
+
+    template <typename T> 
+    inline bool NDArray<T>::dimensions_equal(std::vector<size_t> *d) const
+    {
+        //return ((this->dimensions_->size() == d->size()) &&
+        //    std::equal(this->dimensions_->begin(), this->dimensions_->end(), d->begin()));
+
+        if ( this->dimensions_->size() != d->size() ) return false;
+
+        size_t NDim = this->dimensions_->size();
+        for ( size_t ii=0; ii<NDim; ii++ )
+        {
+            if ( (*this->dimensions_)[ii] != (*d)[ii] ) return false;
+        }
+
+        return true;
+    }
+
+    template <typename T> 
+    inline size_t NDArray<T>::get_number_of_dimensions() const
+    {
+        return (size_t)dimensions_->size();
+    }
+
+    template <typename T> 
+    inline size_t NDArray<T>::get_size(size_t dimension) const
+    {
+        if (dimension >= dimensions_->size()){
+            return 1;
+        }
+        else{
+            return (*dimensions_)[dimension];
+        }
+    }
+
+    template <typename T> 
+    inline boost::shared_ptr< std::vector<size_t> > NDArray<T>::get_dimensions() const
+    {
+        // Make copy to ensure that the receiver cannot alter the array dimensions
+        std::vector<size_t> *tmp = new std::vector<size_t>;
+        *tmp=*dimensions_;
+        return boost::shared_ptr< std::vector<size_t> >(tmp); 
+    }
+
+    template <typename T> 
+    inline void NDArray<T>::get_dimensions(std::vector<size_t>*& dim) const
+    {
+        dim = dimensions_.get();
+    }
+
+    template <typename T> 
+    inline void NDArray<T>::get_dimensions(std::vector<size_t>& dim) const
+    {
+        dim = *dimensions_;
+    }
+
+    template <typename T> 
+    inline T* NDArray<T>::get_data_ptr() const
+    { 
+        return data_;
+    }
+
+    template <typename T> 
+    inline size_t NDArray<T>::get_number_of_elements() const
+    {
+        return elements_;
+    }
+
+    template <typename T> 
+    inline size_t NDArray<T>::get_number_of_bytes() const
+    {
+        return elements_*sizeof(T);
+    }
+
+    template <typename T> 
+    inline bool NDArray<T>::delete_data_on_destruct() const
+    {
+        return delete_data_on_destruct_;
+    }
+
+    template <typename T> 
+    inline void NDArray<T>::delete_data_on_destruct(bool d)
+    {
+        delete_data_on_destruct_ = d;
+    }
+
+    template <typename T> 
+    size_t NDArray<T>::calculate_offset(const std::vector<size_t>& ind, const std::vector<size_t>& offsetFactors)
+    {
+        size_t offset = ind[0];
+
+        for( size_t i = 1; i < ind.size(); i++ )
+        {
+            offset += ind[i] * offsetFactors[i];
+        }
+
+        return offset;
+    }
+
+    template <typename T> 
+    inline size_t NDArray<T>::calculate_offset(const std::vector<size_t>& ind) const
+    {
+        size_t offset = ind[0];
+        for( size_t i = 1; i < dimensions_->size(); i++ )
+            offset += ind[i] * (*offsetFactors_)[i];
+        return offset;
+    }
+
+    template <typename T> 
+    inline size_t NDArray<T>::calculate_offset(size_t x, size_t y) const
+    {
+        GADGET_DEBUG_CHECK_THROW(dimensions_->size()==2);
+        return x + y * (*offsetFactors_)[1];
+    }
+
+    template <typename T> 
+    inline size_t NDArray<T>::calculate_offset(size_t x, size_t y, size_t z) const
+    {
+        GADGET_DEBUG_CHECK_THROW(dimensions_->size()==3);
+        return x + y * (*offsetFactors_)[1] + z * (*offsetFactors_)[2];
+    }
+
+    template <typename T> 
+    inline size_t NDArray<T>::calculate_offset(size_t x, size_t y, size_t z, size_t s) const
+    {
+        GADGET_DEBUG_CHECK_THROW(dimensions_->size()==4);
+        return x + y * (*offsetFactors_)[1] + z * (*offsetFactors_)[2] + s * (*offsetFactors_)[3];
+    }
+
+    template <typename T> 
+    inline size_t NDArray<T>::calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p) const
+    {
+        GADGET_DEBUG_CHECK_THROW(dimensions_->size()==5);
+        return x + y * (*offsetFactors_)[1] + z * (*offsetFactors_)[2] + s * (*offsetFactors_)[3] + p * (*offsetFactors_)[4];
+    }
+
+    template <typename T> 
+    inline size_t NDArray<T>::calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r) const
+    {
+        GADGET_DEBUG_CHECK_THROW(dimensions_->size()==6);
+        return x + y * (*offsetFactors_)[1] + z * (*offsetFactors_)[2] + s * (*offsetFactors_)[3] + p * (*offsetFactors_)[4] + r * (*offsetFactors_)[5];
+    }
+
+    template <typename T> 
+    inline size_t NDArray<T>::calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a) const
+    {
+        GADGET_DEBUG_CHECK_THROW(dimensions_->size()==7);
+        return x + y * (*offsetFactors_)[1] + z * (*offsetFactors_)[2] + s * (*offsetFactors_)[3] + p * (*offsetFactors_)[4] + r * (*offsetFactors_)[5] + a * (*offsetFactors_)[6];
+    }
+
+    template <typename T> 
+    inline size_t NDArray<T>::calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q) const
+    {
+        GADGET_DEBUG_CHECK_THROW(dimensions_->size()==8);
+        return x + y * (*offsetFactors_)[1] + z * (*offsetFactors_)[2] + s * (*offsetFactors_)[3] + p * (*offsetFactors_)[4] + r * (*offsetFactors_)[5] + a * (*offsetFactors_)[6] + q * (*offsetFactors_)[7];
+    }
+
+    template <typename T> 
+    inline size_t NDArray<T>::calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u) const
+    {
+        GADGET_DEBUG_CHECK_THROW(dimensions_->size()==9);
+        return x + y * (*offsetFactors_)[1] + z * (*offsetFactors_)[2] + s * (*offsetFactors_)[3] + p * (*offsetFactors_)[4] + r * (*offsetFactors_)[5] + a * (*offsetFactors_)[6] + q * (*offsetFactors_)[7]+ u * (*offsetFactors_)[8];
+    }
+
+    template <typename T> 
+    inline size_t NDArray<T>::get_offset_factor(size_t dim) const
+    {
+        if ( dim >= (*dimensions_).size() )
+            throw std::runtime_error("NDArray<T>::get_offset_factor : index out of range");
+        return (*offsetFactors_)[dim];
+    }
+
+    template <typename T> 
+    inline void NDArray<T>::get_offset_factor(std::vector<size_t>& offset) const
+    {
+        offset=*offsetFactors_;
+    }
+
+    template <typename T> 
+    inline size_t NDArray<T>::get_offset_factor_lastdim() const
+    {
+        if( dimensions_->size() == 0 )
+            throw std::runtime_error("NDArray<T>::get_offset_factor_lastdim : array is empty");
+
+        return get_offset_factor(dimensions_->size()-1);
+    }
+
+    template <typename T> 
+    inline boost::shared_ptr< std::vector<size_t> > NDArray<T>::get_offset_factor() const
+    {
+        std::vector<size_t> *tmp = new std::vector<size_t>;
+        *tmp=*offsetFactors_;
+        return boost::shared_ptr< std::vector<size_t> >(tmp); 
+    }
+
+    template <typename T> 
+    void NDArray<T>::calculate_offset_factors(const std::vector<size_t>& dimensions, std::vector<size_t>& offsetFactors)
+    {
+        offsetFactors.resize(dimensions.size());
+        for( size_t i = 0; i < dimensions.size(); i++ )
+        {
+            size_t k = 1;
+            for( size_t j = 0; j < i; j++ )
+            {
+                k *= dimensions[j];
+            }
+
+            offsetFactors[i] = k;
+        }
+    }
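+
+    // For example, for dimensions {4, 3, 2} the offset factors are {1, 4, 12}
+    // (first-dimension-fastest, i.e. column-major storage), so index (x, y, z)
+    // maps to linear offset x + 4*y + 12*z; calculate_index() inverts this
+    // mapping via successive division and modulo by the same factors.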
+
+    template <typename T> 
+    inline void NDArray<T>::calculate_offset_factors(const std::vector<size_t>& dimensions)
+    {
+        if ( offsetFactors_.get() == NULL ){
+            std::vector<size_t> *tmp = new std::vector<size_t>;
+            offsetFactors_ = boost::shared_ptr< std::vector<size_t> >(tmp);
+        }
+        offsetFactors_->resize(dimensions.size());
+        for( size_t i = 0; i < dimensions.size(); i++ ){
+            size_t k = 1;
+            for( size_t j = 0; j < i; j++ )
+                k *= dimensions[j];
+            (*offsetFactors_)[i] = k;
+        }
+    }
+
+    template <typename T> 
+    inline std::vector<size_t> NDArray<T>::calculate_index( size_t offset ) const
+    {
+        if( dimensions_->size() == 0 )
+            throw std::runtime_error("NDArray<T>::calculate_index : array is empty");
+
+        std::vector<size_t> index(dimensions_->size());
+        for( long long i = dimensions_->size()-1; i>=0; i-- ){
+            index[i] = offset / (*offsetFactors_)[i];
+            offset %= (*offsetFactors_)[i];
+        }
+        return index;
+    }
+
+    template <typename T> 
+    inline void NDArray<T>::calculate_index( size_t offset, std::vector<size_t>& index ) const
+    {
+        if( dimensions_->size() == 0 )
+            throw std::runtime_error("NDArray<T>::calculate_index : array is empty");
+
+        index.resize(dimensions_->size(), 0);
+        for( long long i = dimensions_->size()-1; i>=0; i-- ){
+            index[i] = offset / (*offsetFactors_)[i];
+            offset %= (*offsetFactors_)[i];
+        }
+    }
+
+    template <typename T> 
+    void NDArray<T>::calculate_index( size_t offset, const std::vector<size_t>& offsetFactors, std::vector<size_t>& index )
+    {
+        index.resize(offsetFactors.size(), 0);
+
+        for( long long i = offsetFactors.size()-1; i>=0; i-- )
+        {
+            index[i] = offset / offsetFactors[i];
+            offset %= offsetFactors[i];
+        }
+    }
+
+    template <typename T> 
+    void NDArray<T>::clear()
+    {
+        if ( this->delete_data_on_destruct_ ){
+            this->deallocate_memory();
+        }
+        this->data_ = 0;
+        this->elements_ = 0;
+        this->delete_data_on_destruct_ = true;
+
+        if ( this->dimensions_ ){
+            // dimension and offset-factor vectors already exist; just empty them
+            this->dimensions_->clear();
+            this->offsetFactors_->clear();
+        }
+        else{
+            this->dimensions_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+            this->offsetFactors_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+        }
+    } 
+
+    template <typename T> 
+    inline T& NDArray<T>::operator()( const std::vector<size_t>& ind )
+    {
+        size_t idx = this->calculate_offset(ind);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->data_[idx];
+    }
+
+    template <typename T> 
+    inline const T& NDArray<T>::operator()( const std::vector<size_t>& ind ) const
+    {
+        size_t idx = this->calculate_offset(ind);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->data_[idx];
+    }
+
+    template <typename T> 
+    inline T& NDArray<T>::operator()( size_t x )
+    {
+        GADGET_DEBUG_CHECK_THROW(x < this->get_number_of_elements());
+        return this->data_[x];
+    }
+
+    template <typename T> 
+    inline const T& NDArray<T>::operator()( size_t x ) const
+    {
+        GADGET_DEBUG_CHECK_THROW(x < this->get_number_of_elements());
+        return this->data_[x];
+    }
+
+    template <typename T> 
+    inline T& NDArray<T>::operator()( size_t x, size_t y )
+    {
+        size_t idx = this->calculate_offset(x, y);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->data_[idx];
+    }
+
+    template <typename T> 
+    inline const T& NDArray<T>::operator()( size_t x, size_t y ) const
+    {
+        size_t idx = this->calculate_offset(x, y);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->data_[idx];
+    }
+
+    template <typename T> 
+    inline T& NDArray<T>::operator()( size_t x, size_t y, size_t z )
+    {
+        size_t idx = this->calculate_offset(x, y, z);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->data_[idx];
+    }
+
+    template <typename T> 
+    inline const T& NDArray<T>::operator()( size_t x, size_t y, size_t z ) const
+    {
+        size_t idx = this->calculate_offset(x, y, z);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->data_[idx];
+    }
+
+    template <typename T> 
+    inline T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s )
+    {
+        size_t idx = this->calculate_offset(x, y, z, s);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->data_[idx];
+    }
+
+    template <typename T> 
+    inline const T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s ) const
+    {
+        size_t idx = this->calculate_offset(x, y, z, s);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->data_[idx];
+    }
+
+    template <typename T> 
+    inline T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p )
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->data_[idx];
+    }
+
+    template <typename T> 
+    inline const T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p ) const
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->data_[idx];
+    }
+
+    template <typename T> 
+    inline T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r )
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->data_[idx];
+    }
+
+    template <typename T> 
+    inline const T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r ) const
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->data_[idx];
+    }
+
+    template <typename T> 
+    inline T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a )
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r, a);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->data_[idx];
+    }
+
+    template <typename T> 
+    inline const T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a ) const
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r, a);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->data_[idx];
+    }
+
+    template <typename T> 
+    inline T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q )
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r, a, q);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->data_[idx];
+    }
+
+    template <typename T> 
+    inline const T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q ) const
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r, a, q);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->data_[idx];
+    }
+
+    template <typename T> 
+    inline T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u )
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r, a, q, u);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->data_[idx];
+    }
+
+    template <typename T> 
+    inline const T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u ) const
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r, a, q, u);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->data_[idx];
+    }
+
+    template <typename T> 
+    inline bool NDArray<T>::point_in_range(const std::vector<size_t>& ind) const
+    {
+        unsigned int D = (*dimensions_).size();
+        if ( ind.size() != D ) return false;
+
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            if ( ind[ii]>=(*dimensions_)[ii] )
+            {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    template <typename T> 
+    inline bool NDArray<T>::point_in_range(size_t x) const
+    {
+        GADGET_DEBUG_CHECK_THROW((*dimensions_).size()==1);
+        return (x<(*dimensions_)[0]);
+    }
+
+    template <typename T> 
+    inline bool NDArray<T>::point_in_range(size_t x, size_t y) const
+    {
+        GADGET_DEBUG_CHECK_THROW((*dimensions_).size()==2);
+        return ((x<(*dimensions_)[0]) && (y<(*dimensions_)[1]));
+    }
+
+    template <typename T> 
+    inline bool NDArray<T>::point_in_range(size_t x, size_t y, size_t z) const
+    {
+        GADGET_DEBUG_CHECK_THROW((*dimensions_).size()==3);
+        return ( (x<(*dimensions_)[0]) && (y<(*dimensions_)[1]) && (z<(*dimensions_)[2]));
+    }
+
+    template <typename T> 
+    inline bool NDArray<T>::point_in_range(size_t x, size_t y, size_t z, size_t s) const
+    {
+        GADGET_DEBUG_CHECK_THROW((*dimensions_).size()==4);
+        return ( (x<(*dimensions_)[0]) && (y<(*dimensions_)[1]) && (z<(*dimensions_)[2]) && (s<(*dimensions_)[3]));
+    }
+
+    template <typename T> 
+    inline bool NDArray<T>::point_in_range(size_t x, size_t y, size_t z, size_t s, size_t p) const
+    {
+        GADGET_DEBUG_CHECK_THROW((*dimensions_).size()==5);
+        return ( (x<(*dimensions_)[0]) && (y<(*dimensions_)[1]) && (z<(*dimensions_)[2]) && (s<(*dimensions_)[3]) && (p<(*dimensions_)[4]));
+    }
+
+    template <typename T> 
+    inline bool NDArray<T>::point_in_range(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r) const
+    {
+        GADGET_DEBUG_CHECK_THROW((*dimensions_).size()==6);
+        return ( (x<(*dimensions_)[0]) && (y<(*dimensions_)[1]) && (z<(*dimensions_)[2]) && (s<(*dimensions_)[3]) && (p<(*dimensions_)[4]) && (r<(*dimensions_)[5]));
+    }
+
+    template <typename T> 
+    inline bool NDArray<T>::point_in_range(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a) const
+    {
+        GADGET_DEBUG_CHECK_THROW((*dimensions_).size()==7);
+        return ( (x<(*dimensions_)[0]) && (y<(*dimensions_)[1]) && (z<(*dimensions_)[2]) && (s<(*dimensions_)[3]) && (p<(*dimensions_)[4]) && (r<(*dimensions_)[5]) && (a<(*dimensions_)[6]));
+    }
+
+    template <typename T> 
+    inline bool NDArray<T>::point_in_range(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q) const
+    {
+        GADGET_DEBUG_CHECK_THROW((*dimensions_).size()==8);
+        return ( (x<(*dimensions_)[0]) && (y<(*dimensions_)[1]) && (z<(*dimensions_)[2]) && (s<(*dimensions_)[3]) && (p<(*dimensions_)[4]) && (r<(*dimensions_)[5]) && (a<(*dimensions_)[6]) && (q<(*dimensions_)[7]));
+    }
+
+    template <typename T> 
+    inline bool NDArray<T>::point_in_range(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u) const
+    {
+        GADGET_DEBUG_CHECK_THROW((*dimensions_).size()==9);
+        return ( (x<(*dimensions_)[0]) && (y<(*dimensions_)[1]) && (z<(*dimensions_)[2]) && (s<(*dimensions_)[3]) && (p<(*dimensions_)[4]) && (r<(*dimensions_)[5]) && (a<(*dimensions_)[6]) && (q<(*dimensions_)[7]) && (u<(*dimensions_)[8]));
+    }
+}
+
+#endif //NDARRAY_H
diff --git a/toolboxes/core/complext.h b/toolboxes/core/complext.h
new file mode 100644
index 0000000..502ffe9
--- /dev/null
+++ b/toolboxes/core/complext.h
@@ -0,0 +1,320 @@
+/** \file complext.h
+    \brief An implementation of complex numbers that works for both the cpu and gpu.
+
+    complext.h provides an implementation of complex numbers that, unlike std::complex,
+    works on both the cpu and gpu. 
+    It follows the interface defined for std::complex.
+*/
+
+#pragma once
+
+#include "core_defines.h"
+
+#include <complex>
+#include <cmath>
+#include <iostream>
+
+namespace Gadgetron{
+
+  using std::abs; // workaround for nvcc
+  using std::sin;
+  using std::cos;
+  using std::exp;
+  /** 
+   * \class complext
+   * \brief An implementation of complex numbers that works for both the cpu and gpu.
+   */
+  template< class T > class complext
+  {
+  public:
+
+    T vec[2];
+
+    __inline__ __host__ __device__  T real() const 
+    {
+      return vec[0];
+    }
+
+    __inline__ __host__ __device__  T imag() const 
+    {
+      return vec[1];
+    }
+
+    __inline__ __host__ __device__  complext() {}
+
+    __inline__ __host__ __device__  complext(T real, T imag){
+      vec[0]=real;
+      vec[1]=imag;
+    }
+
+    __inline__ __host__ __device__  complext(const complext<T>& tmp){
+      vec[0] = tmp.vec[0];
+      vec[1] = tmp.vec[1];
+    }
+
+    __inline__ __host__ __device__  complext(const std::complex<T>& tmp){
+      vec[0] = tmp.real();
+      vec[1] = tmp.imag();
+		}
+    __inline__ __host__ __device__  complext(const T r){
+      vec[0] = r;
+      vec[1] = 0;
+    }
+
+    __inline__ __host__ __device__ void conj(){
+      vec[1] = -vec[1];
+    }
+
+    __inline__ __host__ __device__  complext<T> operator+(const complext<T>& other){
+      return complext<T>(vec[0]+other.vec[0],vec[1]+other.vec[1]);
+    }
+
+    __inline__ __host__ __device__  complext<T> operator-(const complext<T>& other){
+      return complext<T>(vec[0]-other.vec[0],vec[1]-other.vec[1]);
+    }
+
+    __inline__ __host__ __device__  complext<T> operator-(){
+      return complext<T>(-vec[0],-vec[1]);
+    }
+
+    __inline__ __host__ __device__  void operator-=(const complext<T>& other){
+      vec[0] -= other.vec[0];
+      vec[1] -= other.vec[1];
+    }
+
+    __inline__ __host__ __device__  void operator+=(const complext<T>& other){
+      vec[0] += other.vec[0];
+      vec[1] += other.vec[1];
+    }
+
+    __inline__ __host__ __device__  complext<T> operator*(const T& other){
+      return complext<T>(vec[0]*other,vec[1]*other);
+    }
+
+    __inline__ __host__ __device__  complext<T> operator*(const complext<T>& other){
+      return complext<T>(vec[0]*other.vec[0]-vec[1]*other.vec[1],vec[0]*other.vec[1]+vec[1]*other.vec[0]);
+    }
+
+    __inline__ __host__ __device__  complext<T> operator/(const T& other){
+      return complext<T>(vec[0]/other,vec[1]/other);
+    }
+
+    __inline__ __host__ __device__  complext<T> operator/(const complext<T>& other){
+      T cd = other.vec[0]*other.vec[0]+other.vec[1]*other.vec[1];
+      return complext<T>((vec[0]*other.vec[0]+vec[1]*other.vec[1])/cd ,(vec[1]*other.vec[0]-vec[0]*other.vec[1])/cd);
+    }
+
+    __inline__ __host__ __device__  void operator*=(const T& other){
+      vec[0] *= other;
+      vec[1] *= other;
+    }
+
+    __inline__ __host__ __device__  void operator*=(const complext<T>& other){
+      complext<T> tmp = *this;
+      vec[0] = tmp.vec[0]*other.vec[0]-tmp.vec[1]*other.vec[1];
+      vec[1] = tmp.vec[0]*other.vec[1]+tmp.vec[1]*other.vec[0];
+    }
+
+    __inline__ __host__ __device__  void operator/=(const T& other){
+      vec[0] /= other;
+      vec[1] /= other;
+    }
+
+    __inline__ __host__ __device__  void operator/=(const complext<T>& other){
+      complext<T> tmp = (*this)/other;
+      vec[0]=tmp.vec[0];
+      vec[1]=tmp.vec[1];
+    }
+
+    __inline__ __host__ __device__  bool operator==(const complext<T>& comp2){
+
+      return vec[0]==comp2.vec[0] && vec[1]==comp2.vec[1];
+    }
+    __inline__ __host__ __device__  bool operator!=(const complext<T>& comp2){
+
+      return not(*this==comp2);
+    }
+  };
+
+  template <typename T> 
+  inline std::ostream & operator<< (std::ostream & os, const complext<T>& a )
+  {
+    os << a.real() <<' ' << a.imag() << "i";
+    return os;
+  }
+
+  template <> 
+  inline std::ostream & operator<< (std::ostream & os, const complext<float>& a )
+  {
+    os << a.real() <<' ' << a.imag() << "i";
+    return os;
+  }
+
+  template <> 
+  inline std::ostream & operator<< (std::ostream & os, const complext<double>& a )
+  {
+    os << a.real() <<' ' << a.imag() << "i";
+    return os;
+  }
+
+  typedef complext<float> float_complext;
+  typedef complext<double> double_complext;
+
+  template <class T> struct realType {};
+  template<> struct realType<short> {typedef double Type; };
+  template<> struct realType<unsigned short> {typedef double Type; };
+  template<> struct realType<int> {typedef double Type; };
+  template<> struct realType<unsigned int> {typedef double Type; };
+  template<> struct realType<float_complext> {typedef float Type; };
+  template<> struct realType<double_complext> {typedef double Type; };
+  template<> struct realType<float> {typedef float Type; };
+  template<> struct realType<double> {typedef double Type; };
+  template<> struct realType<std::complex<float> > {typedef float Type; };
+  template<> struct realType<std::complex<double> > {typedef double Type; };
+
+  template<class T> struct stdType {typedef T Type;};
+  template<> struct stdType<double_complext> {typedef std::complex<double> Type;};
+  template<> struct stdType<float_complext> {typedef std::complex<float> Type;};
+  template<> struct stdType<std::complex<double> > {typedef std::complex<double> Type;};
+  template<> struct stdType<std::complex<float> > {typedef std::complex<float> Type;};
+  template<> struct stdType<double> {typedef double Type;};
+  template<> struct stdType<float> {typedef float Type;};
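+
+  // For instance, realType<float_complext>::Type is float while
+  // stdType<float_complext>::Type is std::complex<float>, so templated code can
+  // recover either the underlying real precision or the std::complex counterpart of T.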
+
+  __inline__ __host__ __device__ double sgn(double x){
+    return (double(0) < x) - (x < double(0));
+  }
+  __inline__ __host__ __device__ float sgn(float x){
+    return (float)((float(0) < x) - (x < float(0)));
+  }
+
+  template<class T> __inline__ __host__ __device__ complext<T> sgn(complext<T> x){
+    if (norm(x) <= T(0)) return complext<T>(0);
+    return (x/abs(x));
+  }
+  template<class T>  __inline__ __host__ __device__ complext<T> polar(const T& rho, const T& theta = 0){
+    return complext<T>(rho*std::cos(theta),rho*std::sin(theta));
+  }
+
+  template<class T>  __inline__ __host__ __device__ complext<T> sqrt(complext<T> x){
+    T r = abs(x);
+    return complext<T>(::sqrt((r+x.real())/2),sgn(x.imag())*::sqrt((r-x.real())/2));
+  }
+
+  template<class T> __inline__ __host__ __device__ T abs(complext<T> comp){
+    return ::sqrt(comp.vec[0]*comp.vec[0]+comp.vec[1]*comp.vec[1]);
+  }
+
+  template<class T> __inline__ __host__ __device__ complext<T> sin(complext<T> comp){
+    return complext<T>(sin(comp.vec[0])*std::cosh(comp.vec[1]),std::cos(comp.vec[0])*std::sinh(comp.vec[1]));
+  }
+
+  template<class T> __inline__ __host__ __device__ complext<T> cos(complext<T> comp){
+    return complext<T>(cos(comp.vec[0])*cosh(comp.vec[1]),-sin(comp.vec[0])*sinh(comp.vec[1]));
+  }
+
+  template<class T> __inline__ __host__ __device__ complext<T> exp(complext<T> com){
+	  return exp(com.vec[0])*complext<T>(cos(com.vec[1]),sin(com.vec[1]));
+  }
+
+  template<class T> __inline__ __host__ __device__ T imag(complext<T> comp){
+    return comp.vec[1];
+  }
+
+  __inline__ __host__ __device__ double real(double r){
+    return r;
+  }
+
+  __inline__ __host__ __device__ double imag(double r){
+    return 0.0;
+  }
+
+  __inline__ __host__ __device__ float real(float r){
+    return r;
+  }
+
+  __inline__ __host__ __device__ float imag(float r){
+    return 0.0f;
+  }
+
+  template<class T> __inline__ __host__ __device__ T real(complext<T> comp){
+    return comp.vec[0];
+  }
+
+  template<class T> __inline__ __host__ __device__ T arg(complext<T> comp){
+    return std::atan2(comp.vec[1],comp.vec[0]);
+  }
+
+  template<class T> __inline__ __host__ __device__  complext<T> operator*(const T& r,const complext<T>& z){
+    return complext<T>(z.vec[0]*r,z.vec[1]*r);
+  }
+
+  template<class T> __inline__ __host__ __device__  complext<T> operator*(const complext<T>& z,const T& r){
+    return complext<T>(z.vec[0]*r,z.vec[1]*r);
+  }
+
+  template<class T> __inline__ __host__ __device__  complext<T> operator+(const complext<T>& z1,const complext<T>& z2){
+    return complext<T>(z1.vec[0]+z2.vec[0],z1.vec[1]+z2.vec[1]);
+  }
+
+  template<class T> __inline__ __host__ __device__  complext<T> operator+(const complext<T>& z1,const T& r){
+    return complext<T>(z1.vec[0]+r, z1.vec[1]);
+  }
+
+  template<class T> __inline__ __host__ __device__  complext<T> operator+(const T& r,const complext<T>& z1){
+    return complext<T>(z1.vec[0]+r, z1.vec[1]);
+  }
+
+  template<class T> __inline__ __host__ __device__  complext<T> operator-(const complext<T>& z1,const complext<T>& z2){
+    return complext<T>(z1.vec[0]-z2.vec[0],z1.vec[1]-z2.vec[1]);
+  }
+
+  template<class T> __inline__ __host__ __device__  complext<T> operator-(const T& r,const complext<T>& z2){
+    return complext<T>(r-z2.vec[0],-z2.vec[1]);
+  }
+
+  template<class T> __inline__ __host__ __device__  complext<T> operator-(const complext<T>& z2,const T& r){
+    return complext<T>(z2.vec[0]-r,z2.vec[1]);
+  }
+
+  template<class T> __inline__ __host__ __device__  complext<T> operator*(const complext<T>& z1,const complext<T>& z2){
+    return complext<T>(z1.vec[0]*z2.vec[0]-z1.vec[1]*z2.vec[1],z1.vec[0]*z2.vec[1]+z1.vec[1]*z2.vec[0]);
+  }
+
+  template<class T> __inline__ __host__ __device__  complext<T> operator/(const complext<T>& z1,const complext<T>& z2){
+    T cd = z2.vec[0]*z2.vec[0]+z2.vec[1]*z2.vec[1];
+    return complext<T>((z1.vec[0]*z2.vec[0]+z1.vec[1]*z2.vec[1])/cd ,(z1.vec[1]*z2.vec[0]-z1.vec[0]*z2.vec[1])/cd);
+  }
+
+  template<class REAL, class T> __inline__ __host__ __device__  complext<T> operator/(const REAL& real, const complext<T>& comp){
+    T cd = comp.vec[0]*comp.vec[0]+comp.vec[1]*comp.vec[1];
+    return complext<T>(comp.vec[0]*real/cd,-real*comp.vec[1]/cd);
+  }
+
+  template<class REAL, class T> __inline__ __host__ __device__  complext<T> operator/(const complext<T>& comp,const REAL& real){
+    return complext<T>(comp.vec[0]/real,comp.vec[1]/real);
+  }
+
+  __inline__ __host__ __device__ float norm(const float& r){
+    return r*r;
+  }
+
+  __inline__ __host__ __device__ double norm(const double& r){
+    return r*r;
+  }
+
+  template<class T> __inline__ __host__ __device__ T norm(const complext<T>& z){
+    return z.vec[0]*z.vec[0]+z.vec[1]*z.vec[1];
+  }
+
+  __inline__ __host__ __device__ double conj(const double& r){ 
+    return r; }
+
+  __inline__ __host__ __device__ float conj(const float& r) { 
+    return r; }
+  
+  template<class T> __inline__ __host__ __device__ complext<T> conj( const complext<T>& z ){
+    complext<T> res=z;
+    res.conj();
+    return res;
+  }
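+
+  /** A minimal usage sketch of the complext interface above (host-side here, but the
+      same code compiles as device code when Cuda is available):
+      \code
+      float_complext a(1.0f, 2.0f);        // 1 + 2i
+      float_complext b(3.0f, -1.0f);       // 3 - i
+      float_complext c = a*b + conj(a);    // arithmetic via the operators defined above
+      float m  = abs(c);                   // magnitude; norm(c) gives the squared magnitude
+      float re = real(c);
+      float im = imag(c);
+      \endcode
+  */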
+}
diff --git a/toolboxes/core/core_defines.h.in b/toolboxes/core/core_defines.h.in
new file mode 100644
index 0000000..fc924c0
--- /dev/null
+++ b/toolboxes/core/core_defines.h.in
@@ -0,0 +1,28 @@
+/** \file core_defines.h
+    \brief Autogenerated header providing definitions of __host__, __device__, and __inline__ for systems on which Cuda is not installed.
+*/
+
+#pragma once
+
+// Notice:
+// -------
+//
+// The header core_defines.h is autogenerated
+// by cmake from core_defines.h.in
+//
+
+// Definition of Cuda availability passed to C++
+//
+
+#define GADGETRON_CUDA_IS_AVAILABLE @GADGETRON_CUDA_FOUND_BOOL@
+
+// Use the Cuda host definitions if available.
+// Otherwise we leave them empty (as no device code is compiled anyway).
+
+#if GADGETRON_CUDA_IS_AVAILABLE
+#include "host_defines.h"
+#else
+#define __host__
+#define __device__
+#define __inline__ inline
+#endif
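+
+// With these fallbacks a declaration such as
+//
+//   __inline__ __host__ __device__ float square(float x) { return x*x; }
+//
+// (square being an arbitrary illustrative function) compiles as a plain inline
+// function on Cuda-free systems and as a host/device function under nvcc.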
diff --git a/toolboxes/core/cpu/CMakeLists.txt b/toolboxes/core/cpu/CMakeLists.txt
new file mode 100644
index 0000000..77b9d8e
--- /dev/null
+++ b/toolboxes/core/cpu/CMakeLists.txt
@@ -0,0 +1,92 @@
+if (WIN32)
+    ADD_DEFINITIONS(-D__BUILD_GADGETRON_CPUCORE__)
+endif (WIN32)
+
+include_directories(
+    ${FFTW3_INCLUDE_DIR}
+    ${ISMRMRD_INCLUDE_DIR}
+    ${CMAKE_SOURCE_DIR}/toolboxes/core
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/algorithm
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+    ${CMAKE_SOURCE_DIR}/apps/gadgetron
+)
+
+if(WIN32)
+    link_directories(${Boost_LIBRARY_DIRS})
+endif(WIN32)
+
+#if (MKL_FOUND)
+#    INCLUDE_DIRECTORIES( ${MKL_INCLUDE_DIR} )
+#    LINK_DIRECTORIES( ${MKL_LIB_DIR} ${MKL_COMPILER_LIB_DIR} )
+#endif (MKL_FOUND)
+
+set(header_files ../NDArray.h
+                ../complext.h
+                ../GadgetronException.h
+                ../GadgetronTimer.h
+                cpucore_export.h 
+                hoNDArray.h
+                hoNDArray.hxx
+                hoNDObjectArray.h
+                hoNDArray_utils.h
+                hoNDArray_fileio.h
+                ho2DArray.h
+                ho2DArray.hxx
+                ho3DArray.h
+                ho3DArray.hxx
+                ho4DArray.h
+                ho4DArray.hxx
+                ho5DArray.h
+                ho5DArray.hxx
+                ho6DArray.h
+                ho6DArray.hxx
+                ho7DArray.h
+                ho7DArray.hxx 
+                hoMatrix.h
+                hoMatrix.hxx
+                hoNDPoint.h
+                hoNDBoundaryHandler.h
+                hoNDBoundaryHandler.hxx
+                hoNDInterpolator.h
+                hoNDInterpolatorNearestNeighbor.hxx
+                hoNDInterpolatorLinear.hxx
+                hoNDInterpolatorBSpline.hxx )
+
+set(image_files image/hoNDImage.h 
+            image/hoNDImage.hxx 
+            image/hoNDImageContainer2D.h )
+
+set(algorithm_files algorithm/hoNDBSpline.h algorithm/hoNDBSpline.hxx)
+
+source_group(algorithm FILES ${algorithm_files})
+source_group(image FILES ${image_files})
+
+add_library(gadgetron_toolbox_cpucore SHARED
+                    hoMatrix.cpp 
+                    ${header_files} 
+                    ${image_files}  
+                    ${algorithm_files} )
+
+set_target_properties(gadgetron_toolbox_cpucore  PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(
+  gadgetron_toolbox_cpucore
+  gadgetron_toolbox_log
+  ${Boost_LIBRARIES})
+
+install(TARGETS gadgetron_toolbox_cpucore DESTINATION lib COMPONENT main)
+
+install(FILES
+        ${header_files}
+        image/hoNDImage.h 
+        image/hoNDImage.hxx 
+        image/hoNDImageContainer2D.h 
+        algorithm/hoNDBSpline.h
+        algorithm/hoNDBSpline.hxx 
+        DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
+add_subdirectory(math)
+add_subdirectory(hostutils)
diff --git a/toolboxes/core/cpu/algorithm/hoNDBSpline.h b/toolboxes/core/cpu/algorithm/hoNDBSpline.h
new file mode 100644
index 0000000..da0dbe7
--- /dev/null
+++ b/toolboxes/core/cpu/algorithm/hoNDBSpline.h
@@ -0,0 +1,191 @@
+/** \file       hoNDBSpline.h
+    \brief      N-dimensional BSpline interpolation implementation
+
+                The source code is partially from http://bigwww.epfl.ch/thevenaz/interpolation/
+                by Philippe Th�venaz
+
+                References:
+
+                [1] P. Thévenaz, T. Blu, M. Unser, "Interpolation Revisited," IEEE Trans on Medical Imaging, Vol 19, 7, 739-758, July 2000.
+                [2] M. Unser, A. Aldroubi and M. Eden, "B-Spline Signal Processing: Part I--Theory," IEEE Trans on Signal Processing, Vol 41, 2, 821-832, Feb 1993.
+                [3] M. Unser, A. Aldroubi and M. Eden, "B-Spline Signal Processing: Part II--Efficient Design and Applications," IEEE Trans on Signal Processing, Vol 41, 2, 834-848, Feb 1993.
+
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "hoNDArray.h"
+#include "hoNDImage.h"
+
+namespace Gadgetron
+{
+    template <typename T, unsigned int D>
+    class hoNDBSpline
+    {
+    public:
+
+        typedef hoNDBSpline<T, D> Self;
+
+        typedef T element_type;
+        typedef T value_type;
+        typedef float coord_type;
+
+        /// type for bspline computation, can be 'float' or 'double'
+        typedef typename realType<T>::Type bspline_float_type;
+
+        typedef hoNDArray<T> ArrayType;
+        typedef hoNDImage<T, D> ImageType;
+
+        hoNDBSpline() {}
+        ~hoNDBSpline() {}
+
+        /// compute BSpline coefficient
+        bool computeBSplineCoefficients(const hoNDArray<T>& data, unsigned int SplineDegree, hoNDArray<T>& coeff);
+        bool computeBSplineCoefficients(const hoNDImage<T, D>& data, unsigned int SplineDegree, hoNDArray<T>& coeff);
+
+        bool computeBSplineCoefficients(const T* data, const std::vector<size_t>& dimension, unsigned int SplineDegree, T* coeff);
+        bool computeBSplineCoefficients(const T* data, size_t len, unsigned int SplineDegree, T* coeff);
+        bool computeBSplineCoefficients(const T* data, size_t sx, size_t sy, unsigned int SplineDegree, T* coeff);
+        bool computeBSplineCoefficients(const T* data, size_t sx, size_t sy, size_t sz, unsigned int SplineDegree, T* coeff);
+        bool computeBSplineCoefficients(const T* data, size_t sx, size_t sy, size_t sz, size_t st, unsigned int SplineDegree, T* coeff);
+        bool computeBSplineCoefficients(const T* data, size_t sx, size_t sy, size_t sz, size_t st, size_t sp, unsigned int SplineDegree, T* coeff);
+        bool computeBSplineCoefficients(const T* data, size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, unsigned int SplineDegree, T* coeff);
+        bool computeBSplineCoefficients(const T* data, size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, unsigned int SplineDegree, T* coeff);
+        bool computeBSplineCoefficients(const T* data, size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, unsigned int SplineDegree, T* coeff);
+        bool computeBSplineCoefficients(const T* data, size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, size_t su, unsigned int SplineDegree, T* coeff);
+
+        /// evaluate BSpline
+        /// derivative: can be 0/1/2, for 0-order, first-order and second-order derivative
+
+        T evaluateBSpline(const T* coeff, const std::vector<size_t>& dimension, unsigned int SplineDegree, 
+                        const std::vector<unsigned int>& derivative, 
+                        const coord_type* pos);
+
+        T evaluateBSpline(const T* coeff, const std::vector<size_t>& dimension, unsigned int SplineDegree, 
+                        const std::vector<unsigned int>& derivative, 
+                        const std::vector<coord_type>& pos);
+
+        T evaluateBSpline(const T* coeff, size_t len, unsigned int SplineDegree, 
+                        unsigned int dx, 
+                        coord_type x);
+
+        T evaluateBSpline(const T* coeff, size_t sx, size_t sy, unsigned int SplineDegree, 
+                        unsigned int dx, unsigned int dy, 
+                        coord_type x, coord_type y);
+
+        T evaluateBSpline(const T* coeff, size_t sx, size_t sy, size_t sz, unsigned int SplineDegree, 
+                        unsigned int dx, unsigned int dy, unsigned int dz, 
+                        coord_type x, coord_type y, coord_type z);
+
+        T evaluateBSpline(const T* coeff, size_t sx, size_t sy, size_t sz, size_t st, unsigned int SplineDegree, 
+                        unsigned int dx, unsigned int dy, unsigned int dz, unsigned int dt, 
+                        coord_type x, coord_type y, coord_type z, coord_type t);
+
+        T evaluateBSpline(const T* coeff, size_t sx, size_t sy, size_t sz, size_t st, size_t sp, unsigned int SplineDegree, 
+                        unsigned int dx, unsigned int dy, unsigned int dz, unsigned int dt, unsigned int dp, 
+                        coord_type x, coord_type y, coord_type z, coord_type t, coord_type p);
+
+        T evaluateBSpline(const T* coeff, size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, unsigned int SplineDegree, 
+                        unsigned int dx, unsigned int dy, unsigned int dz, unsigned int dt, unsigned int dp, unsigned int dq, 
+                        coord_type x, coord_type y, coord_type z, coord_type t, coord_type p, coord_type q);
+
+        T evaluateBSpline(const T* coeff, size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, unsigned int SplineDegree, 
+                        unsigned int dx, unsigned int dy, unsigned int dz, unsigned int dt, unsigned int dp, unsigned int dq, unsigned int dr, 
+                        coord_type x, coord_type y, coord_type z, coord_type t, coord_type p, coord_type q, coord_type r);
+
+        T evaluateBSpline(const T* coeff, size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, unsigned int SplineDegree, 
+                        unsigned int dx, unsigned int dy, unsigned int dz, unsigned int dt, unsigned int dp, unsigned int dq, unsigned int dr, unsigned int ds, 
+                        coord_type x, coord_type y, coord_type z, coord_type t, coord_type p, coord_type q, coord_type r, coord_type s);
+
+        T evaluateBSpline(const T* coeff, size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, size_t su, unsigned int SplineDegree, 
+                        unsigned int dx, unsigned int dy, unsigned int dz, unsigned int dt, unsigned int dp, unsigned int dq, unsigned int dr, unsigned int ds, unsigned int du, 
+                        coord_type x, coord_type y, coord_type z, coord_type t, coord_type p, coord_type q, coord_type r, coord_type s, coord_type u);
+
+
+        /// evaluate BSpline with pre-computed weights
+        T evaluateBSpline(const T* coeff, size_t sx, size_t sy, unsigned int SplineDegree, 
+                        bspline_float_type* xWeight, bspline_float_type* yWeight, 
+                        coord_type x, coord_type y);
+
+        T evaluateBSpline(const T* coeff, size_t sx, size_t sy, size_t sz, unsigned int SplineDegree, 
+                        bspline_float_type* xWeight, bspline_float_type* yWeight, bspline_float_type* zWeight, 
+                        coord_type x, coord_type y, coord_type z);
+
+        T evaluateBSpline(const T* coeff, size_t sx, size_t sy, size_t sz, size_t st, unsigned int SplineDegree, 
+                        bspline_float_type* xWeight, bspline_float_type* yWeight, bspline_float_type* zWeight, bspline_float_type* tWeight, 
+                        coord_type x, coord_type y, coord_type z, coord_type t);
+
+        T evaluateBSpline(const T* coeff, const std::vector<size_t>& dimension, unsigned int SplineDegree, 
+                        bspline_float_type** weight, const coord_type* pos);
+
+        T evaluateBSpline(const T* coeff, const std::vector<size_t>& dimension, unsigned int SplineDegree, 
+                        bspline_float_type** weight, const std::vector<coord_type>& pos);
+
+        /// compute the BSpline based derivative for an ND array
+        /// derivative indicates the order of derivatives for every dimension
+        bool computeBSplineDerivative(const hoNDArray<T>& data, const hoNDArray<T>& coeff, unsigned int SplineDegree, const std::vector<unsigned int>& derivative, hoNDArray<T>& deriv);
+        bool computeBSplineDerivative(const hoNDImage<T,D>& data, const hoNDArray<T>& coeff, unsigned int SplineDegree, const std::vector<unsigned int>& derivative, hoNDImage<T,D>& deriv);
+
+        /// print out the image information
+        void print(std::ostream& os) const;
+
+    protected:
+
+        /// these BSpline coefficient routines are modified from http://bigwww.epfl.ch/thevenaz/interpolation/
+        static void ConvertToInterpolationCoefficients(
+                                                        T               c[],            /* input samples --> output coefficients */
+                                                        size_t          DataLength,     /* number of samples or coefficients */
+                                                        bspline_float_type          z[],            /* poles */
+                                                        long            NbPoles,        /* number of poles */
+                                                        bspline_float_type          Tolerance       /* admissible relative error */ 
+                                                      );
+
+        static T InitialCausalCoefficient(
+                                            T           c[],                /* coefficients */
+                                            size_t      DataLength,         /* number of coefficients */
+                                            bspline_float_type      z,                  /* actual pole */
+                                            bspline_float_type      Tolerance           /* admissible relative error */
+                                         );
+
+        static T InitialAntiCausalCoefficient(
+                                                T           c[],                /* coefficients */
+                                                size_t      DataLength,         /* number of samples or coefficients */
+                                                bspline_float_type      z                   /* actual pole */
+                                             );
+
+        static void Pole(bspline_float_type* pole, unsigned int SplineDegree, unsigned int& NbPoles);
+
+        /// BSpline function
+        /// this function implements the symmetrical BSpline function of SplineDegree n
+        /// Equation 2.6 of reference [2]
+        static bspline_float_type BSpline(bspline_float_type x, unsigned int SplineDegree);
+
+        /// compute the discrete BSpline value
+        /// this function is modified from the source code at http://bigwww.epfl.ch/thevenaz/interpolation/
+        static void BSplineDiscrete(bspline_float_type x, unsigned int SplineDegree, bspline_float_type* weight, long long* xIndex);
+
+        /// compute the discrete BSpline value with the first order derivative
+        static void BSplineDiscreteFirstOrderDerivative(bspline_float_type x, unsigned int SplineDegree, bspline_float_type* weight, long long* xIndex);
+        /// compute the discrete BSpline value with the second order derivative
+        static void BSplineDiscreteSecondOrderDerivative(bspline_float_type x, unsigned int SplineDegree, bspline_float_type* weight, long long* xIndex);
+
+        /// compute BSpline interpolation locations
+        /// xIndex has at least SplineDegree elements
+        static void BSplineInterpolationLocation(bspline_float_type x, unsigned int SplineDegree, long long* xIndex);
+
+        /// apply mirror boundary condition for interpolation locations
+        static void BSplineInterpolationMirrorBoundaryCondition(unsigned int SplineDegree, long long* xIndex, size_t Width);
+
+        /// compute the derivative of BSpline
+        /// first order derivative dBSpline(x, SplineDegree)/dx = BSpline(x+0.5, SplineDegree-1) - BSpline(x-0.5, SplineDegree-1)
+        static bspline_float_type BSplineFirstOrderDerivative(bspline_float_type x, unsigned int SplineDegree);
+        /// second order derivative d2BSpline(x, SplineDegree)/dx2 = BSpline(x+1, SplineDegree-2) + BSpline(x-1, SplineDegree-2) - 2*BSpline(x, SplineDegree-2)
+        static bspline_float_type BSplineSecondOrderDerivative(bspline_float_type x, unsigned int SplineDegree);
+
+        /// compute BSpline interpolation locations and weights
+        static void computeBSplineInterpolationLocationsAndWeights(size_t len, unsigned int SplineDegree, unsigned int dx, coord_type x, bspline_float_type* weight, long long* xIndex);
+    };
+}
+
+#include "hoNDBSpline.hxx"
diff --git a/toolboxes/core/cpu/algorithm/hoNDBSpline.hxx b/toolboxes/core/cpu/algorithm/hoNDBSpline.hxx
new file mode 100644
index 0000000..a763771
--- /dev/null
+++ b/toolboxes/core/cpu/algorithm/hoNDBSpline.hxx
@@ -0,0 +1,2133 @@
+/** \file       hoNDBSpline.hxx
+    \brief      Implementation of N-dimensional BSpline class for gadgetron
+    \author     Hui Xue
+*/
+
+#include "hoNDBSpline.h"
+
+namespace Gadgetron
+{
+    template <typename T, unsigned int D> 
+    bool hoNDBSpline<T, D>::computeBSplineCoefficients(const hoNDArray<T>& data, unsigned int SplineDegree, hoNDArray<T>& coeff)
+    {
+        size_t NDim = data.get_number_of_dimensions();
+
+        if ( !coeff.dimensions_equal(&data) )
+        {
+            coeff = data;
+        }
+
+        bool res;
+        switch (NDim)
+        {
+            case 1:
+            res = this->computeBSplineCoefficients(data.begin(), data.get_size(0), SplineDegree, coeff.begin());
+                break;
+
+            case 2:
+            res = this->computeBSplineCoefficients(data.begin(), data.get_size(0), data.get_size(1), SplineDegree, coeff.begin());
+                break;
+
+            case 3:
+            res = this->computeBSplineCoefficients(data.begin(), data.get_size(0), data.get_size(1), data.get_size(2), SplineDegree, coeff.begin());
+                break;
+
+            case 4:
+            res = this->computeBSplineCoefficients(data.begin(), data.get_size(0), data.get_size(1), data.get_size(2), data.get_size(3), SplineDegree, coeff.begin());
+                break;
+
+            case 5:
+            res = this->computeBSplineCoefficients(data.begin(), data.get_size(0), data.get_size(1), data.get_size(2), data.get_size(3), data.get_size(4), SplineDegree, coeff.begin());
+                break;
+
+            case 6:
+            res = this->computeBSplineCoefficients(data.begin(), data.get_size(0), data.get_size(1), data.get_size(2), data.get_size(3), data.get_size(4), data.get_size(5), SplineDegree, coeff.begin());
+                break;
+
+            case 7:
+            res = this->computeBSplineCoefficients(data.begin(), data.get_size(0), data.get_size(1), data.get_size(2), data.get_size(3), data.get_size(4), data.get_size(5), data.get_size(6), SplineDegree, coeff.begin());
+                break;
+
+            case 8:
+            res = this->computeBSplineCoefficients(data.begin(), data.get_size(0), data.get_size(1), data.get_size(2), data.get_size(3), data.get_size(4), data.get_size(5), data.get_size(6), data.get_size(7), SplineDegree, coeff.begin());
+                break;
+
+            case 9:
+            res = this->computeBSplineCoefficients(data.begin(), data.get_size(0), data.get_size(1), data.get_size(2), data.get_size(3), data.get_size(4), data.get_size(5), data.get_size(6), data.get_size(7), data.get_size(8), SplineDegree, coeff.begin());
+                break;
+
+            default:
+                boost::shared_ptr< std::vector<size_t> > dim = data.get_dimensions();
+                res = this->computeBSplineCoefficients(data.begin(), *dim, SplineDegree, coeff.begin());
+        }
+
+        return res;
+    }
+
+    template <typename T, unsigned int D> 
+    inline bool hoNDBSpline<T, D>::computeBSplineCoefficients(const hoNDImage<T, D>& data, unsigned int SplineDegree, hoNDArray<T>& coeff)
+    {
+        std::vector<size_t> dim;
+        data.get_dimensions(dim);
+        hoNDArray<T> dataTmp(dim, const_cast<T*>(data.begin()), false);
+        return this->computeBSplineCoefficients(dataTmp, SplineDegree, coeff);
+    }
+
+    template <typename T, unsigned int D> 
+    bool hoNDBSpline<T, D>::computeBSplineCoefficients(const T* data, const std::vector<size_t>& dimension, unsigned int SplineDegree, T* coeff)
+    {
+        try
+        {
+            unsigned int NbPoles;
+            bspline_float_type pole[4];
+            this->Pole(pole, SplineDegree, NbPoles);
+
+            GADGET_CHECK_RETURN_FALSE(D==dimension.size());
+
+            hoNDArray<T> coeffBuf( const_cast<std::vector<size_t>&>(dimension), coeff, false);
+            memcpy(coeff, data, sizeof(T)*coeffBuf.get_number_of_elements());
+
+            unsigned int d;
+            for ( d=0; d<D; d++ )
+            {
+                long long ii;
+
+                size_t len = dimension[d];
+                size_t num = coeffBuf.get_number_of_elements()/len;
+
+                size_t i;
+                std::vector<size_t> dimUsed(D-1);
+                for( i = 0; i<D; i++ )
+                {
+                    if ( i < d)
+                    {
+                        dimUsed[i] = dimension[i];
+                    }
+                    else if ( i > d )
+                    {
+                        dimUsed[i-1] = dimension[i];
+                    }
+                }
+
+                std::vector<size_t> offsetFactor(D-1, 1);
+                hoNDArray<T>::calculate_offset_factors(dimUsed, offsetFactor);
+
+                //for( i = 0; i<D-1; i++ )
+                //{
+                //    size_t k = 1;
+                //    for( j = 0; j < i; j++ )
+                //    {
+                //        k *= dimUsed[j];
+                //    }
+                //    offsetFactor[i] = k;
+                //}
+
+                #pragma omp parallel default(none) private(ii) shared(coeff, coeffBuf, pole, NbPoles, dimension, num, len, offsetFactor, d)
+                {
+                    T* buf = new T[ len ];
+
+                    std::vector<size_t> ind(D, 0);
+                    std::vector<size_t> indUsed(D-1, 0);
+
+                    #pragma omp for 
+                    for ( ii=0; ii<num; ii++ )
+                    {
+                        if ( d == 0 )
+                        {
+                            memcpy(buf, coeff+ii*len, sizeof(T)*len);
+
+                            this->ConvertToInterpolationCoefficients(buf, len, pole, NbPoles, DBL_EPSILON);
+
+                            memcpy(coeff+ii*len, buf, sizeof(T)*len);
+                        }
+                        else
+                        {
+                            hoNDArray<T>::calculate_index(ii, offsetFactor, indUsed);
+
+                            long long i;
+
+                            //size_t offset = ii;
+                            //for( i=D-2; i>=0; i-- )
+                            //{
+                            //    indUsed[i] = offset / offsetFactor[i];
+                            //    offset %= offsetFactor[i];
+                            //}
+
+                            for ( i=0; i<D; i++ )
+                            {
+                                if ( i < d )
+                                {
+                                    ind[i] = indUsed[i];
+                                }
+                                else if ( i > d )
+                                {
+                                    ind[i] = indUsed[i-1];
+                                }
+                            }
+
+                            for ( i=0; i<len; i++ )
+                            {
+                                ind[d] = i;
+                                buf[i] = coeffBuf(ind);
+                            }
+
+                            this->ConvertToInterpolationCoefficients(buf, len, pole, NbPoles, DBL_EPSILON);
+
+                            for ( i=0; i<len; i++ )
+                            {
+                                ind[d] = i;
+                                coeffBuf(ind) = buf[i];
+                            }
+                        }
+                    }
+
+                    delete [] buf;
+                }
+            }
+
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in hoNDBSpline<T, D>::computeBSplineCoefficients(const T* data, const std::vector<size_t>& dimension, unsigned int SplineDegree, T* coeff) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T, unsigned int D> 
+    inline bool hoNDBSpline<T, D>::computeBSplineCoefficients(const T* data, size_t len, unsigned int SplineDegree, T* coeff)
+    {
+        try
+        {
+            unsigned int NbPoles;
+            bspline_float_type pole[4];
+            this->Pole(pole, SplineDegree, NbPoles);
+
+            memcpy(coeff, data, sizeof(T)*len);
+            this->ConvertToInterpolationCoefficients(coeff, len, pole, NbPoles, DBL_EPSILON);
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in hoNDBSpline<T, D>::computeBSplineCoefficients(const T* data, size_t len, unsigned int SplineDegree, T* coeff) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
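+    // 2-D specialization: prefilter all rows along x, then all columns along y, writing the result
+    // in place into coeff; the 3-D, 4-D and 5-D overloads below follow the same per-axis pattern.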
+    template <typename T, unsigned int D> 
+    bool hoNDBSpline<T, D>::computeBSplineCoefficients(const T* data, size_t sx, size_t sy, unsigned int SplineDegree, T* coeff)
+    {
+        try
+        {
+            unsigned int NbPoles;
+            bspline_float_type pole[4];
+            this->Pole(pole, SplineDegree, NbPoles);
+
+            // x
+            long long y;
+            #pragma omp parallel default(none) private(y) shared(data, coeff, pole, NbPoles, sx, sy)
+            {
+                T* buf = new T[sx];
+
+                #pragma omp for 
+                for ( y=0; y<sy; y++ )
+                {
+                    memcpy(buf, data+y*sx, sizeof(T)*sx);
+
+                    this->ConvertToInterpolationCoefficients(buf, sx, pole, NbPoles, DBL_EPSILON);
+
+                    memcpy(coeff+y*sx, buf, sizeof(T)*sx);
+                }
+
+                delete [] buf;
+            }
+
+            // y
+            long long x;
+            #pragma omp parallel default(none) private(x) shared(data, coeff, pole, NbPoles, sx, sy)
+            {
+                T* buf = new T[sy];
+
+                #pragma omp for 
+                for ( x=0; x<sx; x++ )
+                {
+                    size_t y;
+
+                    for ( y=0; y<sy; y++ )
+                    {
+                        buf[y] = coeff[x + y*sx];
+                    }
+
+                    this->ConvertToInterpolationCoefficients(buf, sy, pole, NbPoles, DBL_EPSILON);
+
+                    for ( y=0; y<sy; y++ )
+                    {
+                        coeff[x + y*sx] = buf[y];
+                    }
+                }
+
+                delete [] buf;
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in hoNDBSpline<T, D>::computeBSplineCoefficients(const T* data, size_t sx, size_t sy, unsigned int SplineDegree, T* coeff) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T, unsigned int D> 
+    bool hoNDBSpline<T, D>::computeBSplineCoefficients(const T* data, size_t sx, size_t sy, size_t sz, unsigned int SplineDegree, T* coeff)
+    {
+        try
+        {
+            unsigned int NbPoles;
+            bspline_float_type pole[4];
+            this->Pole(pole, SplineDegree, NbPoles);
+
+            // x
+            long long z;
+            #pragma omp parallel default(none) private(z) shared(data, coeff, pole, NbPoles, sx, sy, sz)
+            {
+                T* buf = new T[sx];
+
+                #pragma omp for 
+                for ( z=0; z<sz; z++ )
+                {
+                    for ( size_t y=0; y<sy; y++ )
+                    {
+                        memcpy(buf, data+z*sx*sy+y*sx, sizeof(T)*sx);
+
+                        this->ConvertToInterpolationCoefficients(buf, sx, pole, NbPoles, DBL_EPSILON);
+
+                        memcpy(coeff+z*sx*sy+y*sx, buf, sizeof(T)*sx);
+                    }
+                }
+
+                delete [] buf;
+            }
+
+            // y
+            #pragma omp parallel default(none) private(z) shared(data, coeff, pole, NbPoles, sx, sy, sz)
+            {
+                T* buf = new T[sy];
+
+                #pragma omp for 
+                for ( z=0; z<sz; z++ )
+                {
+                    for ( size_t x=0; x<sx; x++ )
+                    {
+                        size_t y;
+
+                        size_t offset = x + z*sx*sy;
+
+                        for ( y=0; y<sy; y++ )
+                        {
+                            buf[y] = coeff[offset + y*sx];
+                        }
+
+                        this->ConvertToInterpolationCoefficients(buf, sy, pole, NbPoles, DBL_EPSILON);
+
+                        for ( y=0; y<sy; y++ )
+                        {
+                            coeff[offset + y*sx] = buf[y];
+                        }
+                    }
+                }
+
+                delete [] buf;
+            }
+
+            // z
+            long long x;
+            #pragma omp parallel default(none) private(x) shared(data, coeff, pole, NbPoles, sx, sy, sz)
+            {
+                T* buf = new T[sz];
+
+                #pragma omp for 
+                for ( x=0; x<sx; x++ )
+                {
+                    for ( size_t y=0; y<sy; y++ )
+                    {
+                        size_t z;
+                        size_t offset = x + y*sx;
+
+                        for ( z=0; z<sz; z++ )
+                        {
+                            buf[z] = coeff[offset + z*sx*sy];
+                        }
+
+                        this->ConvertToInterpolationCoefficients(buf, sz, pole, NbPoles, DBL_EPSILON);
+
+                        for ( z=0; z<sz; z++ )
+                        {
+                            coeff[offset + z*sx*sy] = buf[z];
+                        }
+                    }
+                }
+
+                delete [] buf;
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in hoNDBSpline<T, D>::computeBSplineCoefficients(const T* data, size_t sx, size_t sy, size_t sz, unsigned int SplineDegree, T* coeff) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T, unsigned int D> 
+    bool hoNDBSpline<T, D>::computeBSplineCoefficients(const T* data, size_t sx, size_t sy, size_t sz, size_t st, unsigned int SplineDegree, T* coeff)
+    {
+        try
+        {
+            unsigned int NbPoles;
+            bspline_float_type pole[4];
+            this->Pole(pole, SplineDegree, NbPoles);
+
+            long long x, y, z, t;
+
+            // x
+            #pragma omp parallel default(none) private(y, z, t) shared(data, coeff, pole, NbPoles, sx, sy, sz, st)
+            {
+                T* buf = new T[sx];
+
+                #pragma omp for 
+                for ( t=0; t<st; t++ )
+                {
+                    for ( z=0; z<sz; z++ )
+                    {
+                        for ( y=0; y<sy; y++ )
+                        {
+                            memcpy(buf, data+t*sx*sy*sz+z*sx*sy+y*sx, sizeof(T)*sx);
+
+                            this->ConvertToInterpolationCoefficients(buf, sx, pole, NbPoles, DBL_EPSILON);
+
+                            memcpy(coeff+t*sx*sy*sz+z*sx*sy+y*sx, buf, sizeof(T)*sx);
+                        }
+                    }
+                }
+
+                delete [] buf;
+            }
+
+            // y
+            #pragma omp parallel default(none) private(x, y, z, t) shared(data, coeff, pole, NbPoles, sx, sy, sz, st)
+            {
+                T* buf = new T[sy];
+
+                #pragma omp for 
+                for ( t=0; t<st; t++ )
+                {
+                    for ( z=0; z<sz; z++ )
+                    {
+                        for ( x=0; x<sx; x++ )
+                        {
+                            size_t offset = x + z*sx*sy + t*sx*sy*sz;
+
+                            for ( y=0; y<sy; y++ )
+                            {
+                                buf[y] = coeff[offset + y*sx];
+                            }
+
+                            this->ConvertToInterpolationCoefficients(buf, sy, pole, NbPoles, DBL_EPSILON);
+
+                            for ( y=0; y<sy; y++ )
+                            {
+                                coeff[offset + y*sx] = buf[y];
+                            }
+                        }
+                    }
+                }
+
+                delete [] buf;
+            }
+
+            // z
+            #pragma omp parallel default(none) private(x, y, z, t) shared(data, coeff, pole, NbPoles, sx, sy, sz, st)
+            {
+                T* buf = new T[sz];
+
+                #pragma omp for 
+                for ( t=0; t<st; t++ )
+                {
+                    for ( x=0; x<sx; x++ )
+                    {
+                        for ( y=0; y<sy; y++ )
+                        {
+                            size_t offset = x + y*sx + t*sx*sy*sz;
+
+                            for ( z=0; z<sz; z++ )
+                            {
+                                buf[z] = coeff[offset + z*sx*sy];
+                            }
+
+                            this->ConvertToInterpolationCoefficients(buf, sz, pole, NbPoles, DBL_EPSILON);
+
+                            for ( z=0; z<sz; z++ )
+                            {
+                                coeff[offset + z*sx*sy] = buf[z];
+                            }
+                        }
+                    }
+                }
+
+                delete [] buf;
+            }
+
+            // t
+            #pragma omp parallel default(none) private(x, y, z, t) shared(data, coeff, pole, NbPoles, sx, sy, sz, st)
+            {
+                T* buf = new T[st];
+
+                #pragma omp for 
+                for ( x=0; x<sx; x++ )
+                {
+                    for ( y=0; y<sy; y++ )
+                    {
+                        for ( z=0; z<sz; z++ )
+                        {
+                            size_t offset = x + y*sx + z*sx*sy;
+
+                            for ( t=0; t<st; t++ )
+                            {
+                                buf[t] = coeff[offset + t*sx*sy*sz];
+                            }
+
+                            this->ConvertToInterpolationCoefficients(buf, st, pole, NbPoles, DBL_EPSILON);
+
+                            for ( t=0; t<st; t++ )
+                            {
+                                coeff[offset + t*sx*sy*sz] = buf[t];
+                            }
+                        }
+                    }
+                }
+
+                delete [] buf;
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in hoNDBSpline<T, D>::computeBSplineCoefficients(const T* data, size_t sx, size_t sy, size_t sz, size_t st, unsigned int SplineDegree, T* coeff) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T, unsigned int D> 
+    bool hoNDBSpline<T, D>::computeBSplineCoefficients(const T* data, size_t sx, size_t sy, size_t sz, size_t st, size_t sp, unsigned int SplineDegree, T* coeff)
+    {
+        try
+        {
+            unsigned int NbPoles;
+            bspline_float_type pole[4];
+            this->Pole(pole, SplineDegree, NbPoles);
+
+            long long x, y, z, t, p;
+
+            // x
+            #pragma omp parallel default(none) private(y, z, t, p) shared(data, coeff, pole, NbPoles, sx, sy, sz, st, sp)
+            {
+                T* buf = new T[sx];
+
+                #pragma omp for 
+                for ( p=0; p<sp; p++ )
+                {
+                    for ( t=0; t<st; t++ )
+                    {
+                        for ( z=0; z<sz; z++ )
+                        {
+                            for ( y=0; y<sy; y++ )
+                            {
+                                memcpy(buf, data+p*sx*sy*sz*st+t*sx*sy*sz+z*sx*sy+y*sx, sizeof(T)*sx);
+
+                                this->ConvertToInterpolationCoefficients(buf, sx, pole, NbPoles, DBL_EPSILON);
+
+                                memcpy(coeff+p*sx*sy*sz*st+t*sx*sy*sz+z*sx*sy+y*sx, buf, sizeof(T)*sx);
+                            }
+                        }
+                    }
+                }
+
+                delete [] buf;
+            }
+
+            // y
+            #pragma omp parallel default(none) private(x, y, z, t, p) shared(data, coeff, pole, NbPoles, sx, sy, sz, st, sp)
+            {
+                T* buf = new T[sy];
+
+                #pragma omp for 
+                for ( p=0; p<sp; p++ )
+                {
+                    for ( t=0; t<st; t++ )
+                    {
+                        for ( z=0; z<sz; z++ )
+                        {
+                            for ( x=0; x<sx; x++ )
+                            {
+                                size_t offset = x + z*sx*sy + t*sx*sy*sz + p*sx*sy*sz*st;
+
+                                for ( y=0; y<sy; y++ )
+                                {
+                                    buf[y] = coeff[offset + y*sx];
+                                }
+
+                                this->ConvertToInterpolationCoefficients(buf, sy, pole, NbPoles, DBL_EPSILON);
+
+                                for ( y=0; y<sy; y++ )
+                                {
+                                    coeff[offset + y*sx] = buf[y];
+                                }
+                            }
+                        }
+                    }
+                }
+
+                delete [] buf;
+            }
+
+            // z
+            #pragma omp parallel default(none) private(x, y, z, t, p) shared(data, coeff, pole, NbPoles, sx, sy, sz, st, sp)
+            {
+                T* buf = new T[sz];
+
+                #pragma omp for 
+                for ( p=0; p<sp; p++ )
+                {
+                    for ( t=0; t<st; t++ )
+                    {
+                        for ( x=0; x<sx; x++ )
+                        {
+                            for ( y=0; y<sy; y++ )
+                            {
+                                size_t offset = x + y*sx + t*sx*sy*sz + p*sx*sy*sz*st;
+
+                                for ( z=0; z<sz; z++ )
+                                {
+                                    buf[z] = coeff[offset + z*sx*sy];
+                                }
+
+                                this->ConvertToInterpolationCoefficients(buf, sz, pole, NbPoles, DBL_EPSILON);
+
+                                for ( z=0; z<sz; z++ )
+                                {
+                                    coeff[offset + z*sx*sy] = buf[z];
+                                }
+                            }
+                        }
+                    }
+                }
+
+                delete [] buf;
+            }
+
+            // t
+            #pragma omp parallel default(none) private(x, y, z, t, p) shared(data, coeff, pole, NbPoles, sx, sy, sz, st, sp)
+            {
+                T* buf = new T[st];
+
+                #pragma omp for 
+                for ( p=0; p<sp; p++ )
+                {
+                    for ( x=0; x<sx; x++ )
+                    {
+                        for ( y=0; y<sy; y++ )
+                        {
+                            for ( z=0; z<sz; z++ )
+                            {
+                                size_t offset = x + y*sx + z*sx*sy + p*sx*sy*sz*st;
+
+                                for ( t=0; t<st; t++ )
+                                {
+                                    buf[t] = coeff[offset + t*sx*sy*sz];
+                                }
+
+                                this->ConvertToInterpolationCoefficients(buf, st, pole, NbPoles, DBL_EPSILON);
+
+                                for ( t=0; t<st; t++ )
+                                {
+                                    coeff[offset + t*sx*sy*sz] = buf[t];
+                                }
+                            }
+                        }
+                    }
+                }
+
+                delete [] buf;
+            }
+
+            // p
+            #pragma omp parallel default(none) private(x, y, z, t, p) shared(data, coeff, pole, NbPoles, sx, sy, sz, st, sp)
+            {
+                T* buf = new T[sp];
+
+                #pragma omp for 
+                for ( x=0; x<sx; x++ )
+                {
+                    for ( y=0; y<sy; y++ )
+                    {
+                        for ( z=0; z<sz; z++ )
+                        {
+                            for ( t=0; t<st; t++ )
+                            {
+                                size_t offset = x + y*sx + z*sx*sy + t*sx*sy*sz;
+
+                                for ( p=0; p<sp; p++ )
+                                {
+                                    buf[p] = coeff[offset + p*sx*sy*sz*st];
+                                }
+
+                                this->ConvertToInterpolationCoefficients(buf, sp, pole, NbPoles, DBL_EPSILON);
+
+                                for ( p=0; p<sp; p++ )
+                                {
+                                    coeff[offset + p*sx*sy*sz*st] = buf[p];
+                                }
+                            }
+                        }
+                    }
+                }
+
+                delete [] buf;
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in hoNDBSpline<T, D>::computeBSplineCoefficients(const T* data, size_t sx, size_t sy, size_t sz, size_t st, size_t sp, unsigned int SplineDegree, T* coeff) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
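+    // For six and more dimensions the sizes are packed into a dimension vector and the work is
+    // delegated to the generic N-D implementation above.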
+    template <typename T, unsigned int D> 
+    inline bool hoNDBSpline<T, D>::computeBSplineCoefficients(const T* data, size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, unsigned int SplineDegree, T* coeff)
+    {
+        std::vector<size_t> dim(6);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+
+        return this->computeBSplineCoefficients(data, dim, SplineDegree, coeff);
+    }
+
+    template <typename T, unsigned int D> 
+    inline bool hoNDBSpline<T, D>::computeBSplineCoefficients(const T* data, size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, unsigned int SplineDegree, T* coeff)
+    {
+        std::vector<size_t> dim(7);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        dim[6] = sr;
+
+        return this->computeBSplineCoefficients(data, dim, SplineDegree, coeff);
+    }
+
+    template <typename T, unsigned int D> 
+    inline bool hoNDBSpline<T, D>::computeBSplineCoefficients(const T* data, size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, unsigned int SplineDegree, T* coeff)
+    {
+        std::vector<size_t> dim(8);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        dim[6] = sr;
+        dim[7] = ss;
+
+        return this->computeBSplineCoefficients(data, dim, SplineDegree, coeff);
+    }
+
+    template <typename T, unsigned int D> 
+    inline bool hoNDBSpline<T, D>::computeBSplineCoefficients(const T* data, size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, size_t su, unsigned int SplineDegree, T* coeff)
+    {
+        std::vector<size_t> dim(9);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        dim[6] = sr;
+        dim[7] = ss;
+        dim[8] = su;
+
+        return this->computeBSplineCoefficients(data, dim, SplineDegree, coeff);
+    }
+
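+    // Evaluation is a tensor product: for each dimension the interpolation indices (with mirror
+    // boundary handling) and weights are computed once, and the value is accumulated over the
+    // SplineDegree^D coefficient neighbourhood, each coefficient multiplied by the product of its
+    // per-dimension weights. A minimal usage sketch (hypothetical sizes and degree; the exact
+    // hoNDArray constructor may differ between versions):
+    //
+    //     hoNDBSpline<float, 2> bspline;
+    //     hoNDArray<float> data(128, 128), coeff(128, 128);
+    //     bspline.computeBSplineCoefficients(data, 3, coeff);            // cubic prefilter
+    //     float v = bspline.evaluateBSpline(coeff.begin(), 128, 128, 3,
+    //                                       0, 0,                        // derivative orders
+    //                                       10.5f, 20.25f);              // (x, y) position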
+    template <typename T, unsigned int D> 
+    inline T hoNDBSpline<T, D>::evaluateBSpline(const T* coeff, const std::vector<size_t>& dimension, unsigned int SplineDegree, 
+                                                const std::vector<unsigned int>& derivative, const coord_type* pos)
+    {
+        if ( D!=dimension.size() )
+        {
+            GERROR_STREAM("D!=dimension.size()");
+            return T(0);
+        }
+
+        bspline_float_type weight[D][10];
+        long long index[D][10];
+
+        unsigned int ii, jj;
+        for ( ii=0; ii<D; ii++ )
+        {
+            computeBSplineInterpolationLocationsAndWeights(dimension[ii], SplineDegree, derivative[ii], pos[ii], weight[ii], index[ii]);
+        }
+
+        std::vector<size_t> splineDimension(D, SplineDegree);
+        std::vector<size_t> splineInd(D, 0);
+        std::vector<size_t> coeffInd(D, 0);
+
+        std::vector<size_t> offsetFactors(D, 0);
+        hoNDArray<T>::calculate_offset_factors(splineDimension, offsetFactors);
+
+        std::vector<size_t> coeffOffsetFactors(D, 0);
+        hoNDArray<T>::calculate_offset_factors(dimension, coeffOffsetFactors);
+
+        unsigned int num = (unsigned int)std::pow( (double)SplineDegree, (double)D);
+
+        T res = 0;
+
+        for ( ii=0; ii<num; ii++ )
+        {
+            hoNDArray<T>::calculate_index(ii, offsetFactors, splineInd);
+
+            for ( jj=0; jj<D; jj++ )
+            {
+                coeffInd[jj] = index[jj][ splineInd[jj] ];
+            }
+
+            size_t offset = hoNDArray<T>::calculate_offset(coeffInd, coeffOffsetFactors);
+
+            T v = coeff[offset];
+
+            for ( jj=0; jj<D; jj++ )
+            {
+                v *= weight[jj][ splineInd[jj] ];
+            }
+
+            res += v;
+        }
+
+        return res;
+    }
+
+    template <typename T, unsigned int D> 
+    inline T hoNDBSpline<T, D>::evaluateBSpline(const T* coeff, const std::vector<size_t>& dimension, unsigned int SplineDegree, 
+                                                const std::vector<unsigned int>& derivative, const std::vector<coord_type>& pos)
+    {
+        return this->evaluateBSpline(coeff, dimension, SplineDegree, derivative, &pos[0]);
+    }
+
+    template <typename T, unsigned int D> 
+    inline T hoNDBSpline<T, D>::evaluateBSpline(const T* coeff, size_t len, unsigned int SplineDegree, 
+                                                unsigned int dx, 
+                                                coord_type x)
+    {
+        bspline_float_type xWeight[10];
+        long long xIndex[10];
+
+        computeBSplineInterpolationLocationsAndWeights(len, SplineDegree, dx, x, xWeight, xIndex);
+
+        T res=0;
+        unsigned int ix;
+        for ( ix=0; ix<SplineDegree; ix++ )
+        {
+            res += coeff[ xIndex[ix] ] * xWeight[ix];
+        }
+
+        return res;
+    }
+
+    template <typename T, unsigned int D> 
+    inline T hoNDBSpline<T, D>::evaluateBSpline(const T* coeff, size_t sx, size_t sy, unsigned int SplineDegree, 
+                                                unsigned int dx, unsigned int dy, 
+                                                coord_type x, coord_type y)
+    {
+        bspline_float_type xWeight[10];
+        long long xIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sx, SplineDegree, dx, x, xWeight, xIndex);
+
+        bspline_float_type yWeight[10];
+        long long yIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sy, SplineDegree, dy, y, yWeight, yIndex);
+
+        T res=0;
+
+        unsigned int ix, iy;
+        for ( iy=0; iy<SplineDegree; iy++ )
+        {
+            for ( ix=0; ix<SplineDegree; ix++ )
+            {
+                res += coeff[ xIndex[ix] + sx*yIndex[iy] ] * xWeight[ix] * yWeight[iy];
+            }
+        }
+
+        return res;
+    }
+
+
+    template <typename T, unsigned int D> 
+    inline T hoNDBSpline<T, D>::evaluateBSpline(const T* coeff, size_t sx, size_t sy, size_t sz, unsigned int SplineDegree, 
+                                                unsigned int dx, unsigned int dy, unsigned int dz, 
+                                                coord_type x, coord_type y, coord_type z)
+    {
+        bspline_float_type xWeight[10];
+        long long xIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sx, SplineDegree, dx, x, xWeight, xIndex);
+
+        bspline_float_type yWeight[10];
+        long long yIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sy, SplineDegree, dy, y, yWeight, yIndex);
+
+        bspline_float_type zWeight[10];
+        long long zIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sz, SplineDegree, dz, z, zWeight, zIndex);
+
+        T res=0;
+
+        unsigned int ix, iy, iz;
+        for ( iz=0; iz<SplineDegree; iz++ )
+        {
+            for ( iy=0; iy<SplineDegree; iy++ )
+            {
+                long long offset = yIndex[iy]*sx + zIndex[iz]*sx*sy;
+
+                for ( ix=0; ix<SplineDegree; ix++ )
+                {
+                    res += coeff[ xIndex[ix] + offset ] 
+                        * xWeight[ix] * yWeight[iy] * zWeight[iz];
+                }
+            }
+        }
+
+        return res;
+    }
+
+    template <typename T, unsigned int D> 
+    inline T hoNDBSpline<T, D>::evaluateBSpline(const T* coeff, size_t sx, size_t sy, size_t sz, size_t st, unsigned int SplineDegree, 
+                                                unsigned int dx, unsigned int dy, unsigned int dz, unsigned int dt, 
+                                                coord_type x, coord_type y, coord_type z, coord_type t)
+    {
+        bspline_float_type xWeight[10];
+        long long xIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sx, SplineDegree, dx, x, xWeight, xIndex);
+
+        bspline_float_type yWeight[10];
+        long long yIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sy, SplineDegree, dy, y, yWeight, yIndex);
+
+        bspline_float_type zWeight[10];
+        long long zIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sz, SplineDegree, dz, z, zWeight, zIndex);
+
+        bspline_float_type tWeight[10];
+        long long tIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(st, SplineDegree, dt, t, tWeight, tIndex);
+
+        T res=0;
+
+        unsigned int ix, iy, iz, it;
+        for ( it=0; it<SplineDegree; it++ )
+        {
+            for ( iz=0; iz<SplineDegree; iz++ )
+            {
+                for ( iy=0; iy<SplineDegree; iy++ )
+                {
+                    long long offset = yIndex[iy]*sx + zIndex[iz]*sx*sy + tIndex[it]*sx*sy*sz;
+
+                    for ( ix=0; ix<SplineDegree; ix++ )
+                    {
+                        res += coeff[ xIndex[ix] + offset ] 
+                            * xWeight[ix] * yWeight[iy] * zWeight[iz] * tWeight[it];
+                    }
+                }
+            }
+        }
+
+        return res;
+    }
+
+    template <typename T, unsigned int D> 
+    inline T hoNDBSpline<T, D>::evaluateBSpline(const T* coeff, size_t sx, size_t sy, size_t sz, size_t st, size_t sp, unsigned int SplineDegree, 
+                                                unsigned int dx, unsigned int dy, unsigned int dz, unsigned int dt, unsigned int dp, 
+                                                coord_type x, coord_type y, coord_type z, coord_type t, coord_type p)
+    {
+        bspline_float_type xWeight[10];
+        long long xIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sx, SplineDegree, dx, x, xWeight, xIndex);
+
+        bspline_float_type yWeight[10];
+        long long yIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sy, SplineDegree, dy, y, yWeight, yIndex);
+
+        bspline_float_type zWeight[10];
+        long long zIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sz, SplineDegree, dz, z, zWeight, zIndex);
+
+        bspline_float_type tWeight[10];
+        long long tIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(st, SplineDegree, dt, t, tWeight, tIndex);
+
+        bspline_float_type pWeight[10];
+        long long pIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sp, SplineDegree, dp, p, pWeight, pIndex);
+
+        T res=0;
+
+        unsigned int ix, iy, iz, it, ip;
+
+        for ( ip=0; ip<SplineDegree; ip++ )
+        {
+            for ( it=0; it<SplineDegree; it++ )
+            {
+                for ( iz=0; iz<SplineDegree; iz++ )
+                {
+                    for ( iy=0; iy<SplineDegree; iy++ )
+                    {
+                        long long offset = yIndex[iy]*sx + zIndex[iz]*sx*sy + tIndex[it]*sx*sy*sz + pIndex[ip]*sx*sy*sz*st;
+
+                        for ( ix=0; ix<SplineDegree; ix++ )
+                        {
+                            res += coeff[ xIndex[ix] + offset ] 
+                                * xWeight[ix] * yWeight[iy] * zWeight[iz] * tWeight[it] * pWeight[ip];
+                        }
+                    }
+                }
+            }
+        }
+
+        return res;
+    }
+
+    template <typename T, unsigned int D> 
+    inline T hoNDBSpline<T, D>::evaluateBSpline(const T* coeff, size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, unsigned int SplineDegree, 
+                                                unsigned int dx, unsigned int dy, unsigned int dz, unsigned int dt, unsigned int dp, unsigned int dq, 
+                                                coord_type x, coord_type y, coord_type z, coord_type t, coord_type p, coord_type q)
+    {
+        bspline_float_type xWeight[10];
+        long long xIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sx, SplineDegree, dx, x, xWeight, xIndex);
+
+        bspline_float_type yWeight[10];
+        long long yIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sy, SplineDegree, dy, y, yWeight, yIndex);
+
+        bspline_float_type zWeight[10];
+        long long zIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sz, SplineDegree, dz, z, zWeight, zIndex);
+
+        bspline_float_type tWeight[10];
+        long long tIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(st, SplineDegree, dt, t, tWeight, tIndex);
+
+        bspline_float_type pWeight[10];
+        long long pIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sp, SplineDegree, dp, p, pWeight, pIndex);
+
+        bspline_float_type qWeight[10];
+        long long qIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sq, SplineDegree, dq, q, qWeight, qIndex);
+
+        T res=0;
+
+        unsigned int ix, iy, iz, it, ip, iq;
+
+        for ( iq=0; iq<SplineDegree; iq++ )
+        {
+            for ( ip=0; ip<SplineDegree; ip++ )
+            {
+                for ( it=0; it<SplineDegree; it++ )
+                {
+                    for ( iz=0; iz<SplineDegree; iz++ )
+                    {
+                        for ( iy=0; iy<SplineDegree; iy++ )
+                        {
+                            long long offset = yIndex[iy]*sx 
+                                             + zIndex[iz]*sx*sy 
+                                             + tIndex[it]*sx*sy*sz 
+                                             + pIndex[ip]*sx*sy*sz*st
+                                             + qIndex[iq]*sx*sy*sz*st*sp;
+
+                            for ( ix=0; ix<SplineDegree; ix++ )
+                            {
+                                res += coeff[ xIndex[ix] + offset ] 
+                                    * xWeight[ix] * yWeight[iy] * zWeight[iz] * tWeight[it] * pWeight[ip] * qWeight[iq];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        return res;
+    }
+
+    template <typename T, unsigned int D> 
+    inline T hoNDBSpline<T, D>::evaluateBSpline(const T* coeff, size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, unsigned int SplineDegree, 
+                                                unsigned int dx, unsigned int dy, unsigned int dz, unsigned int dt, unsigned int dp, unsigned int dq, unsigned int dr, 
+                                                coord_type x, coord_type y, coord_type z, coord_type t, coord_type p, coord_type q, coord_type r)
+    {
+        bspline_float_type xWeight[10];
+        long long xIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sx, SplineDegree, dx, x, xWeight, xIndex);
+
+        bspline_float_type yWeight[10];
+        long long yIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sy, SplineDegree, dy, y, yWeight, yIndex);
+
+        bspline_float_type zWeight[10];
+        long long zIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sz, SplineDegree, dz, z, zWeight, zIndex);
+
+        bspline_float_type tWeight[10];
+        long long tIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(st, SplineDegree, dt, t, tWeight, tIndex);
+
+        bspline_float_type pWeight[10];
+        long long pIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sp, SplineDegree, dp, p, pWeight, pIndex);
+
+        bspline_float_type qWeight[10];
+        long long qIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sq, SplineDegree, dq, q, qWeight, qIndex);
+
+        bspline_float_type rWeight[10];
+        long long rIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sr, SplineDegree, dr, r, rWeight, rIndex);
+
+        T res=0;
+
+        unsigned int ix, iy, iz, it, ip, iq, ir;
+
+        for ( ir=0; ir<SplineDegree; ir++ )
+        {
+            for ( iq=0; iq<SplineDegree; iq++ )
+            {
+                for ( ip=0; ip<SplineDegree; ip++ )
+                {
+                    for ( it=0; it<SplineDegree; it++ )
+                    {
+                        for ( iz=0; iz<SplineDegree; iz++ )
+                        {
+                            for ( iy=0; iy<SplineDegree; iy++ )
+                            {
+                                long long offset = yIndex[iy]*sx 
+                                                 + zIndex[iz]*sx*sy 
+                                                 + tIndex[it]*sx*sy*sz 
+                                                 + pIndex[ip]*sx*sy*sz*st
+                                                 + qIndex[iq]*sx*sy*sz*st*sp
+                                                 + rIndex[ir]*sx*sy*sz*st*sp*sq;
+
+                                for ( ix=0; ix<SplineDegree; ix++ )
+                                {
+                                    res += coeff[ xIndex[ix] + offset ] 
+                                        * xWeight[ix] * yWeight[iy] * zWeight[iz] * tWeight[it] * pWeight[ip] * qWeight[iq] * rWeight[ir];
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        return res;
+    }
+
+    template <typename T, unsigned int D> 
+    inline T hoNDBSpline<T, D>::evaluateBSpline(const T* coeff, size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, unsigned int SplineDegree, 
+                                                unsigned int dx, unsigned int dy, unsigned int dz, unsigned int dt, unsigned int dp, unsigned int dq, unsigned int dr, unsigned int ds, 
+                                                coord_type x, coord_type y, coord_type z, coord_type t, coord_type p, coord_type q, coord_type r, coord_type s)
+    {
+        bspline_float_type xWeight[10];
+        long long xIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sx, SplineDegree, dx, x, xWeight, xIndex);
+
+        bspline_float_type yWeight[10];
+        long long yIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sy, SplineDegree, dy, y, yWeight, yIndex);
+
+        bspline_float_type zWeight[10];
+        long long zIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sz, SplineDegree, dz, z, zWeight, zIndex);
+
+        bspline_float_type tWeight[10];
+        long long tIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(st, SplineDegree, dt, t, tWeight, tIndex);
+
+        bspline_float_type pWeight[10];
+        long long pIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sp, SplineDegree, dp, p, pWeight, pIndex);
+
+        bspline_float_type qWeight[10];
+        long long qIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sq, SplineDegree, dq, q, qWeight, qIndex);
+
+        bspline_float_type rWeight[10];
+        long long rIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sr, SplineDegree, dr, r, rWeight, rIndex);
+
+        bspline_float_type sWeight[10];
+        long long sIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(ss, SplineDegree, ds, s, sWeight, sIndex);
+
+        T res=0;
+
+        unsigned int ix, iy, iz, it, ip, iq, ir, is;
+
+        for ( is=0; is<SplineDegree; is++ )
+        {
+            for ( ir=0; ir<SplineDegree; ir++ )
+            {
+                for ( iq=0; iq<SplineDegree; iq++ )
+                {
+                    for ( ip=0; ip<SplineDegree; ip++ )
+                    {
+                        for ( it=0; it<SplineDegree; it++ )
+                        {
+                            for ( iz=0; iz<SplineDegree; iz++ )
+                            {
+                                for ( iy=0; iy<SplineDegree; iy++ )
+                                {
+                                    long long offset = yIndex[iy]*sx 
+                                                     + zIndex[iz]*sx*sy 
+                                                     + tIndex[it]*sx*sy*sz 
+                                                     + pIndex[ip]*sx*sy*sz*st
+                                                     + qIndex[iq]*sx*sy*sz*st*sp
+                                                     + rIndex[ir]*sx*sy*sz*st*sp*sq
+                                                     + sIndex[is]*sx*sy*sz*st*sp*sq*sr;
+
+                                    for ( ix=0; ix<SplineDegree; ix++ )
+                                    {
+                                        res += coeff[ xIndex[ix] + offset ] 
+                                            * xWeight[ix] * yWeight[iy] * zWeight[iz] * tWeight[it] * pWeight[ip] * qWeight[iq] * rWeight[ir] * sWeight[is];
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        return res;
+    }
+
+    template <typename T, unsigned int D> 
+    inline T hoNDBSpline<T, D>::evaluateBSpline(const T* coeff, size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, size_t su, unsigned int SplineDegree, 
+                                                unsigned int dx, unsigned int dy, unsigned int dz, unsigned int dt, unsigned int dp, unsigned int dq, unsigned int dr, unsigned int ds, unsigned int du, 
+                                                coord_type x, coord_type y, coord_type z, coord_type t, coord_type p, coord_type q, coord_type r, coord_type s, coord_type u)
+    {
+        bspline_float_type xWeight[10];
+        long long xIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sx, SplineDegree, dx, x, xWeight, xIndex);
+
+        bspline_float_type yWeight[10];
+        long long yIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sy, SplineDegree, dy, y, yWeight, yIndex);
+
+        bspline_float_type zWeight[10];
+        long long zIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sz, SplineDegree, dz, z, zWeight, zIndex);
+
+        bspline_float_type tWeight[10];
+        long long tIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(st, SplineDegree, dt, t, tWeight, tIndex);
+
+        bspline_float_type pWeight[10];
+        long long pIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sp, SplineDegree, dp, p, pWeight, pIndex);
+
+        bspline_float_type qWeight[10];
+        long long qIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sq, SplineDegree, dq, q, qWeight, qIndex);
+
+        bspline_float_type rWeight[10];
+        long long rIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(sr, SplineDegree, dr, r, rWeight, rIndex);
+
+        bspline_float_type sWeight[10];
+        long long sIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(ss, SplineDegree, ds, s, sWeight, sIndex);
+
+        bspline_float_type uWeight[10];
+        long long uIndex[10];
+        computeBSplineInterpolationLocationsAndWeights(su, SplineDegree, du, u, uWeight, uIndex);
+
+        T res=0;
+
+        unsigned int ix, iy, iz, it, ip, iq, ir, is, iu;
+
+        for ( iu=0; iu<SplineDegree; iu++ )
+        {
+            for ( is=0; is<SplineDegree; is++ )
+            {
+                for ( ir=0; ir<SplineDegree; ir++ )
+                {
+                    for ( iq=0; iq<SplineDegree; iq++ )
+                    {
+                        for ( ip=0; ip<SplineDegree; ip++ )
+                        {
+                            for ( it=0; it<SplineDegree; it++ )
+                            {
+                                for ( iz=0; iz<SplineDegree; iz++ )
+                                {
+                                    for ( iy=0; iy<SplineDegree; iy++ )
+                                    {
+                                        long long offset = yIndex[iy]*sx 
+                                                         + zIndex[iz]*sx*sy 
+                                                         + tIndex[it]*sx*sy*sz 
+                                                         + pIndex[ip]*sx*sy*sz*st
+                                                         + qIndex[iq]*sx*sy*sz*st*sp
+                                                         + rIndex[ir]*sx*sy*sz*st*sp*sq
+                                                         + sIndex[is]*sx*sy*sz*st*sp*sq*sr
+                                                         + uIndex[iu]*sx*sy*sz*st*sp*sq*sr*ss;
+
+                                        for ( ix=0; ix<SplineDegree; ix++ )
+                                        {
+                                            res += coeff[ xIndex[ix] + offset ] 
+                                                * xWeight[ix] * yWeight[iy] * zWeight[iz] * tWeight[it] * pWeight[ip] * qWeight[iq] * rWeight[ir] * sWeight[is] * uWeight[iu];
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        return res;
+    }
+
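+    // The overloads below take precomputed per-dimension weight tables (used by
+    // computeBSplineDerivative, where the weights are identical for all integer grid positions)
+    // and only recompute the interpolation indices with mirror boundary conditions.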
+    template <typename T, unsigned int D> 
+    T hoNDBSpline<T, D>::evaluateBSpline(const T* coeff, size_t sx, size_t sy, unsigned int SplineDegree, 
+                                        bspline_float_type* xWeight, bspline_float_type* yWeight, 
+                                        coord_type x, coord_type y)
+    {
+        long long xIndex[10];
+        BSplineInterpolationLocation(x, SplineDegree, xIndex);
+        BSplineInterpolationMirrorBoundaryCondition(SplineDegree, xIndex, sx);
+
+        long long yIndex[10];
+        BSplineInterpolationLocation(y, SplineDegree, yIndex);
+        BSplineInterpolationMirrorBoundaryCondition(SplineDegree, yIndex, sy);
+
+        T res=0;
+
+        unsigned int ix, iy;
+        for ( iy=0; iy<SplineDegree; iy++ )
+        {
+            for ( ix=0; ix<SplineDegree; ix++ )
+            {
+                res += coeff[ xIndex[ix] + sx*yIndex[iy] ] * xWeight[ix] * yWeight[iy];
+            }
+        }
+
+        return res;
+    }
+
+    template <typename T, unsigned int D> 
+    T hoNDBSpline<T, D>::evaluateBSpline(const T* coeff, size_t sx, size_t sy, size_t sz, unsigned int SplineDegree, 
+                                        bspline_float_type* xWeight, bspline_float_type* yWeight, bspline_float_type* zWeight, 
+                                        coord_type x, coord_type y, coord_type z)
+    {
+        long long xIndex[10];
+        BSplineInterpolationLocation(x, SplineDegree, xIndex);
+        BSplineInterpolationMirrorBoundaryCondition(SplineDegree, xIndex, sx);
+
+        long long yIndex[10];
+        BSplineInterpolationLocation(y, SplineDegree, yIndex);
+        BSplineInterpolationMirrorBoundaryCondition(SplineDegree, yIndex, sy);
+
+        long long zIndex[10];
+        BSplineInterpolationLocation(z, SplineDegree, zIndex);
+        BSplineInterpolationMirrorBoundaryCondition(SplineDegree, zIndex, sz);
+
+        T res=0;
+
+        unsigned int ix, iy, iz;
+        for ( iz=0; iz<SplineDegree; iz++ )
+        {
+            for ( iy=0; iy<SplineDegree; iy++ )
+            {
+                long long offset = yIndex[iy]*sx + zIndex[iz]*sx*sy;
+
+                for ( ix=0; ix<SplineDegree; ix++ )
+                {
+                    res += coeff[ xIndex[ix] + offset ] 
+                        * xWeight[ix] * yWeight[iy] * zWeight[iz];
+                }
+            }
+        }
+
+        return res;
+    }
+
+    template <typename T, unsigned int D> 
+    T hoNDBSpline<T, D>::evaluateBSpline(const T* coeff, size_t sx, size_t sy, size_t sz, size_t st, unsigned int SplineDegree, 
+                                        bspline_float_type* xWeight, bspline_float_type* yWeight, bspline_float_type* zWeight, bspline_float_type* tWeight, 
+                                        coord_type x, coord_type y, coord_type z, coord_type t)
+    {
+        long long xIndex[10];
+        BSplineInterpolationLocation(x, SplineDegree, xIndex);
+        BSplineInterpolationMirrorBoundaryCondition(SplineDegree, xIndex, sx);
+
+        long long yIndex[10];
+        BSplineInterpolationLocation(y, SplineDegree, yIndex);
+        BSplineInterpolationMirrorBoundaryCondition(SplineDegree, yIndex, sy);
+
+        long long zIndex[10];
+        BSplineInterpolationLocation(z, SplineDegree, zIndex);
+        BSplineInterpolationMirrorBoundaryCondition(SplineDegree, zIndex, sz);
+
+        long long tIndex[10];
+        BSplineInterpolationLocation(t, SplineDegree, tIndex);
+        BSplineInterpolationMirrorBoundaryCondition(SplineDegree, tIndex, st);
+
+        T res=0;
+
+        unsigned int ix, iy, iz, it;
+        for ( it=0; it<SplineDegree; it++ )
+        {
+            for ( iz=0; iz<SplineDegree; iz++ )
+            {
+                for ( iy=0; iy<SplineDegree; iy++ )
+                {
+                    long long offset = yIndex[iy]*sx + zIndex[iz]*sx*sy + tIndex[it]*sx*sy*sz;
+
+                    for ( ix=0; ix<SplineDegree; ix++ )
+                    {
+                        res += coeff[ xIndex[ix] + offset ] 
+                            * xWeight[ix] * yWeight[iy] * zWeight[iz] * tWeight[it];
+                    }
+                }
+            }
+        }
+
+        return res;
+    }
+
+    template <typename T, unsigned int D> 
+    T hoNDBSpline<T, D>::evaluateBSpline(const T* coeff, const std::vector<size_t>& dimension, unsigned int SplineDegree, 
+                                        bspline_float_type** weight, const coord_type* pos)
+    {
+        long long index[D][10];
+
+        unsigned int ii, jj;
+        for ( ii=0; ii<D; ii++ )
+        {
+            BSplineInterpolationLocation(pos[ii], SplineDegree, index[ii]);
+            BSplineInterpolationMirrorBoundaryCondition(SplineDegree, index[ii], dimension[ii]);
+        }
+
+        std::vector<size_t> splineDimension(D, SplineDegree);
+        std::vector<size_t> splineInd(D, 0);
+        std::vector<size_t> coeffInd(D, 0);
+
+        std::vector<size_t> offsetFactors(D, 0);
+        hoNDArray<T>::calculate_offset_factors(splineDimension, offsetFactors);
+
+        std::vector<size_t> coeffOffsetFactors(D, 0);
+        hoNDArray<T>::calculate_offset_factors(dimension, coeffOffsetFactors);
+
+        unsigned int num = (unsigned int)std::pow( (double)SplineDegree, (double)D);
+
+        T res = 0;
+
+        for ( ii=0; ii<num; ii++ )
+        {
+            hoNDArray<T>::calculate_index(ii, offsetFactors, splineInd);
+
+            for ( jj=0; jj<D; jj++ )
+            {
+                coeffInd[jj] = index[jj][ splineInd[jj] ];
+            }
+
+            size_t offset = hoNDArray<T>::calculate_offset(coeffInd, coeffOffsetFactors);
+
+            T v = coeff[offset];
+
+            for ( jj=0; jj<D; jj++ )
+            {
+                v *= weight[jj][ splineInd[jj] ];
+            }
+
+            res += v;
+        }
+
+        return res;
+    }
+
+    template <typename T, unsigned int D> 
+    inline T hoNDBSpline<T, D>::evaluateBSpline(const T* coeff, const std::vector<size_t>& dimension, unsigned int SplineDegree, 
+                                        bspline_float_type** weight, const std::vector<coord_type>& pos)
+    {
+        return this->evaluateBSpline(coeff, dimension, SplineDegree, weight, &pos[0]);
+    }
+
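+    // Derivative images are computed from the prefiltered coefficients: the per-dimension weights
+    // for the requested derivative orders are evaluated once, and every sample of deriv is then a
+    // weighted sum over the local coefficient neighbourhood at its integer grid position.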
+    template <typename T, unsigned int D> 
+    bool hoNDBSpline<T, D>::computeBSplineDerivative(const hoNDArray<T>& data, const hoNDArray<T>& coeff, unsigned int SplineDegree, const std::vector<unsigned int>& derivative, hoNDArray<T>& deriv)
+    {
+        try
+        {
+        	std::vector<size_t> dimension;
+        	data.get_dimensions(dimension);
+
+            if ( D!=data.get_number_of_dimensions() )
+            {
+                GERROR_STREAM("computeBSplineDerivative(hoNDArray) : D!=data.get_number_of_dimensions() ... ");
+                return false;
+            }
+
+            if ( !deriv.dimension_equal(&data) )
+            {
+                deriv.create(data.get_dimensions());
+            }
+
+            // only need to compute the weights once, since this is the integer point computation
+            bspline_float_type weight[D][10];
+            long long index[D][10];
+
+            unsigned int ii;
+            for ( ii=0; ii<D; ii++ )
+            {
+                computeBSplineInterpolationLocationsAndWeights(dimension[ii], SplineDegree, derivative[ii], dimension[ii]/2, weight[ii], index[ii]);
+            }
+
+            if ( D == 2 )
+            {
+                size_t sx = data.get_size(0);
+                size_t sy = data.get_size(1);
+
+                long long y;
+
+                #pragma omp parallel for default(none) private(y) shared(sx, sy, deriv, coeff, SplineDegree, weight)
+                for ( y=0; y<sy; y++ )
+                {
+                    for ( size_t x=0; x<sx; x++ )
+                    {
+                        deriv(x, y) = evaluateBSpline(coeff.begin(), sx, sy, SplineDegree, weight[0], weight[1], x, y);
+                    }
+                }
+            }
+            else if ( D == 3 )
+            {
+                size_t sx = data.get_size(0);
+                size_t sy = data.get_size(1);
+                size_t sz = data.get_size(2);
+
+                long long z;
+
+                #pragma omp parallel for default(none) private(z) shared(sx, sy, sz, deriv, coeff, SplineDegree, weight)
+                for ( z=0; z<sz; z++ )
+                {
+                    for ( size_t y=0; y<sy; y++ )
+                    {
+                        for ( size_t x=0; x<sx; x++ )
+                        {
+                            deriv(x, y, z) = evaluateBSpline(coeff.begin(), sx, sy, sz, SplineDegree, weight[0], weight[1], weight[2], x, y, z);
+                        }
+                    }
+                }
+            }
+            else if ( D == 4 )
+            {
+                size_t sx = data.get_size(0);
+                size_t sy = data.get_size(1);
+                size_t sz = data.get_size(2);
+                size_t st = data.get_size(3);
+
+                long long t;
+
+                #pragma omp parallel for default(none) private(t) shared(sx, sy, sz, st, deriv, coeff, SplineDegree, weight)
+                for ( t=0; t<st; t++ )
+                {
+                    for ( size_t z=0; z<sz; z++ )
+                    {
+                        for ( size_t y=0; y<sy; y++ )
+                        {
+                            for ( size_t x=0; x<sx; x++ )
+                            {
+                                deriv(x, y, z, t) = evaluateBSpline(coeff.begin(), sx, sy, sz, st, SplineDegree, weight[0], weight[1], weight[2], weight[3], x, y, z, t);
+                            }
+                        }
+                    }
+                }
+            }
+            else
+            {
+                size_t num = data.get_number_of_elements();
+
+                long long ii;
+                #pragma omp parallel default(none) private(ii) shared(num, data, deriv, coeff, dimension, SplineDegree, weight)
+                {
+                    std::vector<size_t> ind(D);
+                    std::vector<coord_type> pos(D);
+
+                    // weight[D][10] does not decay to bspline_float_type**, so pass row pointers to the generic overload
+                    bspline_float_type* weightPtr[D];
+                    for ( unsigned int jj=0; jj<D; jj++ ) weightPtr[jj] = weight[jj];
+
+                    #pragma omp for 
+                    for ( ii=0; ii<num; ii++ )
+                    {
+                        data.calculate_index(ii, ind);
+
+                        for ( unsigned int jj=0; jj<D; jj++ )
+                        {
+                            pos[jj] = ind[jj];
+                        }
+
+                        deriv(ii) = evaluateBSpline(coeff.begin(), dimension, SplineDegree, weightPtr, pos);
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDBSpline<T, D>::computeBSplineDerivative(const hoNDArray<T>& data, const hoNDArray<T>& coeff, unsigned int SplineDegree, const std::vector<unsigned int>& derivative, hoNDArray<T>& deriv) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T, unsigned int D> 
+    bool hoNDBSpline<T, D>::computeBSplineDerivative(const hoNDImage<T,D>& data, const hoNDArray<T>& coeff, unsigned int SplineDegree, const std::vector<unsigned int>& derivative, hoNDImage<T,D>& deriv)
+    {
+        hoNDArray<T> dataTmp(data.get_dimensions(), const_cast<T*>(data.begin()), false);
+
+        if ( !deriv.dimension_equal(&data) )
+        {
+            deriv = data;
+        }
+
+        hoNDArray<T> derivTmp(deriv.get_dimensions(), deriv.begin(), false);
+
+        return computeBSplineDerivative(dataTmp, coeff, SplineDegree, derivative, derivTmp);
+    }
+
+    template <typename T, unsigned int D> 
+    void hoNDBSpline<T, D>::print(std::ostream& os) const
+    {
+        using namespace std;
+
+        os << "--------------Gagdgetron ND BSpline -------------" << endl;
+        os << "Dimension is : " << D << endl;
+        std::string elemTypeName = std::string(typeid(T).name());
+        os << "Data type is : " << elemTypeName << std::endl;
+    }
+
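+    // Prefilter the sample values in c[] into B-spline interpolation coefficients:
+    // every pole z[k] is applied as a causal IIR pass (c[n] += z*c[n-1]) followed by an
+    // anticausal pass (c[n] = z*(c[n+1] - c[n])); the recursions are started by
+    // InitialCausalCoefficient / InitialAntiCausalCoefficient under mirror boundaries.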
+    template <typename T, unsigned int D> 
+    void hoNDBSpline<T, D>::ConvertToInterpolationCoefficients(T c[], size_t DataLength, bspline_float_type z[], long NbPoles, bspline_float_type Tolerance)
+    { /* begin ConvertToInterpolationCoefficients */
+
+        double Lambda = 1.0;
+        long n, k;
+
+        /* special case required by mirror boundaries */
+        if (DataLength == 1L)
+        {
+            return;
+        }
+
+        /* compute the overall gain */
+        for (k = 0L; k < NbPoles; k++)
+        {
+            Lambda = Lambda * (1.0 - z[k]) * (1.0 - 1.0 / z[k]);
+        }
+
+        /* apply the gain */
+        for (n = 0L; n < DataLength; n++)
+        {
+            c[n] *= Lambda;
+        }
+
+        /* loop over all poles */
+        for (k = 0L; k < NbPoles; k++)
+        {
+            /* causal initialization */
+            c[0] = InitialCausalCoefficient(c, DataLength, z[k], Tolerance);
+
+            /* causal recursion */
+            for (n = 1L; n < DataLength; n++)
+            {
+                c[n] += z[k] * c[n - 1L];
+            }
+
+            /* anticausal initialization */
+            c[DataLength - 1L] = InitialAntiCausalCoefficient(c, DataLength, z[k]);
+
+            /* anticausal recursion */
+            for (n = DataLength - 2L; 0 <= n; n--)
+            {
+                c[n] = z[k] * (c[n + 1L] - c[n]);
+            }
+        }
+    } /* end ConvertToInterpolationCoefficients */
+
+    template <typename T, unsigned int D> 
+    T hoNDBSpline<T, D>::InitialCausalCoefficient(T c[], size_t DataLength, bspline_float_type z, bspline_float_type Tolerance)
+    { /* begin InitialCausalCoefficient */
+
+        T Sum;
+        bspline_float_type zn, z2n, iz;
+        size_t n, Horizon;
+
+        /* this initialization corresponds to mirror boundaries */
+        Horizon = DataLength;
+        if (Tolerance > 0.0)
+        {
+            Horizon = (size_t)std::ceil(log(Tolerance) / log(fabs(z)));
+        }
+
+        if (Horizon < DataLength)
+        {
+            /* accelerated loop */
+            zn = z;
+            Sum = c[0];
+            for (n = 1; n < Horizon; n++) {
+                Sum += zn * c[n];
+                zn *= z;
+            }
+            return(Sum);
+        }
+        else
+        {
+            /* full loop */
+            zn = z;
+            iz = (bspline_float_type)(1.0) / z;
+            z2n = pow(z, (bspline_float_type)(DataLength - 1L));
+            Sum = c[0] + z2n * c[DataLength - 1L];
+            z2n *= z2n * iz;
+            for (n = 1L; n <= DataLength - 2L; n++)
+            {
+                Sum += (zn + z2n) * c[n];
+                zn *= z;
+                z2n *= iz;
+            }
+            return( Sum / (bspline_float_type)(1.0 - zn * zn) );
+        }
+    } /* end InitialCausalCoefficient */
+
+    template <typename T, unsigned int D> 
+    T hoNDBSpline<T, D>::InitialAntiCausalCoefficient(T c[], size_t DataLength, bspline_float_type z)
+    { /* begin InitialAntiCausalCoefficient */
+
+        /* this initialization corresponds to mirror boundaries */
+        return((z / (z * z - (bspline_float_type)1.0)) * (z * c[DataLength - 2L] + c[DataLength - 1L]));
+    } /* end InitialAntiCausalCoefficient */
+
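+    // Poles of the direct B-spline filter for the given degree: the roots with |z| < 1 of
+    // the z-transform of the sampled B-spline. They are the feedback coefficients used by
+    // the recursions in ConvertToInterpolationCoefficients.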
+    template <typename T, unsigned int D> 
+    inline void hoNDBSpline<T, D>::Pole(bspline_float_type* Pole, unsigned int SplineDegree, unsigned int& NbPoles)
+    {
+        switch (SplineDegree) 
+        {
+            case 2:
+                NbPoles = 1;
+                Pole[0] = (bspline_float_type)( std::sqrt(8.0) - 3.0 );
+                break;
+
+            case 3:
+                NbPoles = 1;
+                Pole[0] = (bspline_float_type)( std::sqrt(3.0) - 2.0 );
+                break;
+
+            case 4:
+                NbPoles = 2;
+                Pole[0] = (bspline_float_type)( std::sqrt(664.0 - std::sqrt(438976.0)) + std::sqrt(304.0) - 19.0 );
+                Pole[1] = (bspline_float_type)( std::sqrt(664.0 + std::sqrt(438976.0)) - std::sqrt(304.0) - 19.0 );
+                break;
+
+            case 5:
+                NbPoles = 2;
+                Pole[0] = (bspline_float_type)( std::sqrt(135.0 / 2.0 - std::sqrt(17745.0 / 4.0)) + std::sqrt(105.0 / 4.0)
+                    - 13.0 / 2.0 );
+                Pole[1] = (bspline_float_type)( std::sqrt(135.0 / 2.0 + std::sqrt(17745.0 / 4.0)) - std::sqrt(105.0 / 4.0)
+                    - 13.0 / 2.0 );
+                break;
+
+            case 6:
+                NbPoles = 3;
+                Pole[0] = (bspline_float_type)( -0.48829458930304475513011803888378906211227916123938 );
+                Pole[1] = (bspline_float_type)( -0.081679271076237512597937765737059080653379610398148 );
+                Pole[2] = (bspline_float_type)( -0.0014141518083258177510872439765585925278641690553467 );
+                break;
+
+            case 7:
+                NbPoles = 3;
+                Pole[0] = (bspline_float_type)( -0.53528043079643816554240378168164607183392315234269 );
+                Pole[1] = (bspline_float_type)( -0.12255461519232669051527226435935734360548654942730 );
+                Pole[2] = (bspline_float_type)( -0.0091486948096082769285930216516478534156925639545994 );
+                break;
+
+            case 8:
+                NbPoles = 4;
+                Pole[0] = (bspline_float_type)( -0.57468690924876543053013930412874542429066157804125 );
+                Pole[1] = (bspline_float_type)( -0.16303526929728093524055189686073705223476814550830 );
+                Pole[2] = (bspline_float_type)( -0.023632294694844850023403919296361320612665920854629 );
+                Pole[3] = (bspline_float_type)( -0.00015382131064169091173935253018402160762964054070043 );
+                break;
+
+            case 9:
+                NbPoles = 4;
+                Pole[0] = (bspline_float_type)( -0.60799738916862577900772082395428976943963471853991 );
+                Pole[1] = (bspline_float_type)( -0.20175052019315323879606468505597043468089886575747 );
+                Pole[2] = (bspline_float_type)( -0.043222608540481752133321142979429688265852380231497 );
+                Pole[3] = (bspline_float_type)( -0.0021213069031808184203048965578486234220548560988624 );
+                break;
+
+            default:
+                GERROR_STREAM("Only 2 - 9 order BSpline is supported ... ");
+                return;
+        }
+    }
+
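+    // Evaluate the centered B-spline of degree n = SplineDegree at x via the explicit
+    // truncated-power formula  B_n(x) = sum_{j=0..n+1} (-1)^j * [ (n+1) / (j! * (n+1-j)!) ] * (x - j + (n+1)/2)_+^n,
+    // where (.)_+ keeps only the non-negative part of its argument.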
+    template <typename T, unsigned int D> 
+    inline typename hoNDBSpline<T, D>::bspline_float_type hoNDBSpline<T, D>::BSpline(bspline_float_type x, unsigned int SplineDegree)
+    {
+        if ( x < -( (bspline_float_type)SplineDegree+1)/2.0 )
+        {
+            return 0.0;
+        }
+
+        // follow the notation of the original paper
+
+        unsigned int j, t;
+
+        bspline_float_type value = 0.0;
+        for ( j=0; j<=SplineDegree+1; j++ )
+        {
+            if ( ( x-j+0.5*(SplineDegree+1) ) >= 0 )
+            {
+                bspline_float_type v1 = 1.0;
+                for ( t=1; t<=j; t++ )
+                {
+                    v1 *= t;
+                }
+
+                bspline_float_type v2 = 1.0;
+                for ( t=1; t<=SplineDegree+1-j; t++ )
+                {
+                    v2 *= t;
+                }
+
+                value += (bspline_float_type)( ( std::pow(double(-1), double(j) ) * (SplineDegree+1) / (v2 * v1) ) * std::pow(x-j+0.5*(SplineDegree+1), double(SplineDegree) ) );
+            }
+        }
+
+        return value;
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDBSpline<T, D>::BSplineInterpolationLocation(bspline_float_type x, unsigned int SplineDegree, long long* xIndex)
+    {
+        long long i, k;
+
+        /* compute the interpolation indexes */
+        if (SplineDegree & 1L)
+        {
+            i = (long long)std::floor(x) - (long long)SplineDegree / 2L;
+            for (k = 0L; k <= SplineDegree; k++)
+            {
+                xIndex[k] = i++;
+            }
+        }
+        else
+        {
+            i = (long long)std::floor(x + 0.5) - (long long)SplineDegree / 2L;
+            for (k = 0L; k <= SplineDegree; k++)
+            {
+                xIndex[k] = i++;
+            }
+        }
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDBSpline<T, D>::BSplineInterpolationMirrorBoundaryCondition(unsigned int SplineDegree, long long* xIndex, size_t Width)
+    {
+        long long Width2 = 2 * Width - 2;
+
+        unsigned int k;
+
+        /* apply the mirror boundary conditions */
+        for (k = 0; k <= SplineDegree; k++)
+        {
+            xIndex[k] = (Width == 1L) ? (0L) : ((xIndex[k] < 0L) ?
+                (-xIndex[k] - Width2 * ((-xIndex[k]) / Width2))
+                : (xIndex[k] - Width2 * (xIndex[k] / Width2)));
+
+            if (Width <= xIndex[k])
+            {
+                xIndex[k] = Width2 - xIndex[k];
+            }
+        }
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDBSpline<T, D>::BSplineDiscrete(bspline_float_type x, unsigned int SplineDegree, bspline_float_type* xWeight, long long* xIndex)
+    {
+        bspline_float_type w, w2, w4, t, t0, t1;
+
+        // the interpolation indexes have already been computed by the caller via
+        // BSplineInterpolationLocation(x, SplineDegree, xIndex)
+
+        /* compute the interpolation weights */
+        switch (SplineDegree)
+        {
+            case 2L:
+                /* x */
+                w = x - (bspline_float_type)xIndex[1];
+                xWeight[1] = 3.0 / 4.0 - w * w;
+                xWeight[2] = (1.0 / 2.0) * (w - xWeight[1] + 1.0);
+                xWeight[0] = 1.0 - xWeight[1] - xWeight[2];
+                break;
+            case 3L:
+                /* x */
+                w = x - (bspline_float_type)xIndex[1];
+                xWeight[3] = (1.0 / 6.0) * w * w * w;
+                xWeight[0] = (1.0 / 6.0) + (1.0 / 2.0) * w * (w - 1.0) - xWeight[3];
+                xWeight[2] = w + xWeight[0] - 2.0 * xWeight[3];
+                xWeight[1] = 1.0 - xWeight[0] - xWeight[2] - xWeight[3];
+                break;
+            case 4L:
+                /* x */
+                w = x - (bspline_float_type)xIndex[2];
+                w2 = w * w;
+                t = (1.0 / 6.0) * w2;
+                xWeight[0] = 1.0 / 2.0 - w;
+                xWeight[0] *= xWeight[0];
+                xWeight[0] *= (1.0 / 24.0) * xWeight[0];
+                t0 = w * (t - 11.0 / 24.0);
+                t1 = 19.0 / 96.0 + w2 * (1.0 / 4.0 - t);
+                xWeight[1] = t1 + t0;
+                xWeight[3] = t1 - t0;
+                xWeight[4] = xWeight[0] + t0 + (1.0 / 2.0) * w;
+                xWeight[2] = 1.0 - xWeight[0] - xWeight[1] - xWeight[3] - xWeight[4];
+                break;
+            case 5L:
+                /* x */
+                w = x - (bspline_float_type)xIndex[2];
+                w2 = w * w;
+                xWeight[5] = (1.0 / 120.0) * w * w2 * w2;
+                w2 -= w;
+                w4 = w2 * w2;
+                w -= 1.0 / 2.0;
+                t = w2 * (w2 - 3.0);
+                xWeight[0] = (1.0 / 24.0) * (1.0 / 5.0 + w2 + w4) - xWeight[5];
+                t0 = (1.0 / 24.0) * (w2 * (w2 - 5.0) + 46.0 / 5.0);
+                t1 = (-1.0 / 12.0) * w * (t + 4.0);
+                xWeight[2] = t0 + t1;
+                xWeight[3] = t0 - t1;
+                t0 = (1.0 / 16.0) * (9.0 / 5.0 - t);
+                t1 = (1.0 / 24.0) * w * (w4 - w2 - 5.0);
+                xWeight[1] = t0 + t1;
+                xWeight[4] = t0 - t1;
+                break;
+            case 6L:
+                /* x */
+                w = x - (bspline_float_type)xIndex[3];
+                xWeight[0] = 1.0 / 2.0 - w;
+                xWeight[0] *= xWeight[0] * xWeight[0];
+                xWeight[0] *= xWeight[0] / 720.0;
+                xWeight[1] = (361.0 / 192.0 - w * (59.0 / 8.0 + w
+                    * (-185.0 / 16.0 + w * (25.0 / 3.0 + w * (-5.0 / 2.0 + w)
+                    * (1.0 / 2.0 + w))))) / 120.0;
+                xWeight[2] = (10543.0 / 960.0 + w * (-289.0 / 16.0 + w
+                    * (79.0 / 16.0 + w * (43.0 / 6.0 + w * (-17.0 / 4.0 + w
+                    * (-1.0 + w)))))) / 48.0;
+                w2 = w * w;
+                xWeight[3] = (5887.0 / 320.0 - w2 * (231.0 / 16.0 - w2
+                    * (21.0 / 4.0 - w2))) / 36.0;
+                xWeight[4] = (10543.0 / 960.0 + w * (289.0 / 16.0 + w
+                    * (79.0 / 16.0 + w * (-43.0 / 6.0 + w * (-17.0 / 4.0 + w
+                    * (1.0 + w)))))) / 48.0;
+                xWeight[6] = 1.0 / 2.0 + w;
+                xWeight[6] *= xWeight[6] * xWeight[6];
+                xWeight[6] *= xWeight[6] / 720.0;
+                xWeight[5] = 1.0 - xWeight[0] - xWeight[1] - xWeight[2] - xWeight[3]
+                    - xWeight[4] - xWeight[6];
+                break;
+            case 7L:
+                /* x */
+                w = x - (bspline_float_type)xIndex[3];
+                xWeight[0] = 1.0 - w;
+                xWeight[0] *= xWeight[0];
+                xWeight[0] *= xWeight[0] * xWeight[0];
+                xWeight[0] *= (1.0 - w) / 5040.0;
+                w2 = w * w;
+                xWeight[1] = (120.0 / 7.0 + w * (-56.0 + w * (72.0 + w
+                    * (-40.0 + w2 * (12.0 + w * (-6.0 + w)))))) / 720.0;
+                xWeight[2] = (397.0 / 7.0 - w * (245.0 / 3.0 + w * (-15.0 + w
+                    * (-95.0 / 3.0 + w * (15.0 + w * (5.0 + w
+                    * (-5.0 + w))))))) / 240.0;
+                xWeight[3] = (2416.0 / 35.0 + w2 * (-48.0 + w2 * (16.0 + w2
+                    * (-4.0 + w)))) / 144.0;
+                xWeight[4] = (1191.0 / 35.0 - w * (-49.0 + w * (-9.0 + w
+                    * (19.0 + w * (-3.0 + w) * (-3.0 + w2))))) / 144.0;
+                xWeight[5] = (40.0 / 7.0 + w * (56.0 / 3.0 + w * (24.0 + w
+                    * (40.0 / 3.0 + w2 * (-4.0 + w * (-2.0 + w)))))) / 240.0;
+                xWeight[7] = w2;
+                xWeight[7] *= xWeight[7] * xWeight[7];
+                xWeight[7] *= w / 5040.0;
+                xWeight[6] = 1.0 - xWeight[0] - xWeight[1] - xWeight[2] - xWeight[3]
+                    - xWeight[4] - xWeight[5] - xWeight[7];
+                break;
+            case 8L:
+                /* x */
+                w = x - (bspline_float_type)xIndex[4];
+                xWeight[0] = 1.0 / 2.0 - w;
+                xWeight[0] *= xWeight[0];
+                xWeight[0] *= xWeight[0];
+                xWeight[0] *= xWeight[0] / 40320.0;
+                w2 = w * w;
+                xWeight[1] = (39.0 / 16.0 - w * (6.0 + w * (-9.0 / 2.0 + w2)))
+                    * (21.0 / 16.0 + w * (-15.0 / 4.0 + w * (9.0 / 2.0 + w
+                    * (-3.0 + w)))) / 5040.0;
+                xWeight[2] = (82903.0 / 1792.0 + w * (-4177.0 / 32.0 + w
+                    * (2275.0 / 16.0 + w * (-487.0 / 8.0 + w * (-85.0 / 8.0 + w
+                    * (41.0 / 2.0 + w * (-5.0 + w * (-2.0 + w)))))))) / 1440.0;
+                xWeight[3] = (310661.0 / 1792.0 - w * (14219.0 / 64.0 + w
+                    * (-199.0 / 8.0 + w * (-1327.0 / 16.0 + w * (245.0 / 8.0 + w
+                    * (53.0 / 4.0 + w * (-8.0 + w * (-1.0 + w)))))))) / 720.0;
+                xWeight[4] = (2337507.0 / 8960.0 + w2 * (-2601.0 / 16.0 + w2
+                    * (387.0 / 8.0 + w2 * (-9.0 + w2)))) / 576.0;
+                xWeight[5] = (310661.0 / 1792.0 - w * (-14219.0 / 64.0 + w
+                    * (-199.0 / 8.0 + w * (1327.0 / 16.0 + w * (245.0 / 8.0 + w
+                    * (-53.0 / 4.0 + w * (-8.0 + w * (1.0 + w)))))))) / 720.0;
+                xWeight[7] = (39.0 / 16.0 - w * (-6.0 + w * (-9.0 / 2.0 + w2)))
+                    * (21.0 / 16.0 + w * (15.0 / 4.0 + w * (9.0 / 2.0 + w
+                    * (3.0 + w)))) / 5040.0;
+                xWeight[8] = 1.0 / 2.0 + w;
+                xWeight[8] *= xWeight[8];
+                xWeight[8] *= xWeight[8];
+                xWeight[8] *= xWeight[8] / 40320.0;
+                xWeight[6] = 1.0 - xWeight[0] - xWeight[1] - xWeight[2] - xWeight[3]
+                    - xWeight[4] - xWeight[5] - xWeight[7] - xWeight[8];
+                break;
+            case 9L:
+                /* x */
+                w = x - (bspline_float_type)xIndex[4];
+                xWeight[0] = 1.0 - w;
+                xWeight[0] *= xWeight[0];
+                xWeight[0] *= xWeight[0];
+                xWeight[0] *= xWeight[0] * (1.0 - w) / 362880.0;
+                xWeight[1] = (502.0 / 9.0 + w * (-246.0 + w * (472.0 + w
+                    * (-504.0 + w * (308.0 + w * (-84.0 + w * (-56.0 / 3.0 + w
+                    * (24.0 + w * (-8.0 + w))))))))) / 40320.0;
+                xWeight[2] = (3652.0 / 9.0 - w * (2023.0 / 2.0 + w * (-952.0 + w
+                    * (938.0 / 3.0 + w * (112.0 + w * (-119.0 + w * (56.0 / 3.0 + w
+                    * (14.0 + w * (-7.0 + w))))))))) / 10080.0;
+                xWeight[3] = (44117.0 / 42.0 + w * (-2427.0 / 2.0 + w * (66.0 + w
+                    * (434.0 + w * (-129.0 + w * (-69.0 + w * (34.0 + w * (6.0 + w
+                    * (-6.0 + w))))))))) / 4320.0;
+                w2 = w * w;
+                xWeight[4] = (78095.0 / 63.0 - w2 * (700.0 + w2 * (-190.0 + w2
+                    * (100.0 / 3.0 + w2 * (-5.0 + w))))) / 2880.0;
+                xWeight[5] = (44117.0 / 63.0 + w * (809.0 + w * (44.0 + w
+                    * (-868.0 / 3.0 + w * (-86.0 + w * (46.0 + w * (68.0 / 3.0 + w
+                    * (-4.0 + w * (-4.0 + w))))))))) / 2880.0;
+                xWeight[6] = (3652.0 / 21.0 - w * (-867.0 / 2.0 + w * (-408.0 + w
+                    * (-134.0 + w * (48.0 + w * (51.0 + w * (-4.0 + w) * (-1.0 + w)
+                    * (2.0 + w))))))) / 4320.0;
+                xWeight[7] = (251.0 / 18.0 + w * (123.0 / 2.0 + w * (118.0 + w
+                    * (126.0 + w * (77.0 + w * (21.0 + w * (-14.0 / 3.0 + w
+                    * (-6.0 + w * (-2.0 + w))))))))) / 10080.0;
+                xWeight[9] = w2 * w2;
+                xWeight[9] *= xWeight[9] * w / 362880.0;
+                xWeight[8] = 1.0 - xWeight[0] - xWeight[1] - xWeight[2] - xWeight[3]
+                    - xWeight[4] - xWeight[5] - xWeight[6] - xWeight[7] - xWeight[9];
+                break;
+            default:
+                GERROR_STREAM("Invalid spline degree " << SplineDegree);
+        }
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDBSpline<T, D>::BSplineDiscreteFirstOrderDerivative(bspline_float_type x, unsigned int SplineDegree, bspline_float_type* weight, long long* xIndex)
+    {
+        unsigned int k;
+        for ( k=0; k<SplineDegree; k++ )
+        {
+            weight[k] = BSplineFirstOrderDerivative(x-xIndex[k], SplineDegree);
+        }
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDBSpline<T, D>::BSplineDiscreteSecondOrderDerivative(bspline_float_type x, unsigned int SplineDegree, bspline_float_type* weight, long long* xIndex)
+    {
+        unsigned int k;
+        for ( k=0; k<SplineDegree; k++ )
+        {
+            weight[k] = BSplineSecondOrderDerivative(x-xIndex[k], SplineDegree);
+        }
+    }
+
+    template <typename T, unsigned int D> 
+    inline typename hoNDBSpline<T, D>::bspline_float_type hoNDBSpline<T, D>::BSplineFirstOrderDerivative(bspline_float_type x, unsigned int SplineDegree)
+    {
+        return ( BSpline(x+0.5, SplineDegree-1) - BSpline(x-0.5, SplineDegree-1) );
+    }
+
+    template <typename T, unsigned int D> 
+    inline typename hoNDBSpline<T, D>::bspline_float_type hoNDBSpline<T, D>::BSplineSecondOrderDerivative(bspline_float_type x, unsigned int SplineDegree)
+    {
+        return ( BSpline(x+1, SplineDegree-2) + BSpline(x-1, SplineDegree-2) - 2*BSpline(x, SplineDegree-2) );
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDBSpline<T, D>::computeBSplineInterpolationLocationsAndWeights(size_t len, unsigned int SplineDegree, unsigned int dx, coord_type x, bspline_float_type* weight, long long* xIndex)
+    {
+        BSplineInterpolationLocation(x, SplineDegree, xIndex);
+
+        if ( dx == 0 )
+        {
+            BSplineDiscrete(x, SplineDegree, weight, xIndex);
+        }
+        else if ( dx == 1 )
+        {
+            BSplineDiscreteFirstOrderDerivative(x, SplineDegree, weight, xIndex);
+        }
+        else if ( dx == 2 )
+        {
+            BSplineDiscreteSecondOrderDerivative(x, SplineDegree, weight, xIndex);
+        }
+        else
+        {
+            GERROR_STREAM("Derivative order must be 0/1/2 ... ");
+            return;
+        }
+
+        BSplineInterpolationMirrorBoundaryCondition(SplineDegree, xIndex, len);
+    }
+}
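
A minimal standalone sketch (illustration only, not part of the imported sources; the
helper name "mirror" is local to this example) of the index folding performed by
BSplineInterpolationMirrorBoundaryCondition above: out-of-range interpolation indexes are
folded into [0, Width) on a period of 2*Width - 2.

    #include <cstdio>

    // Mirror ("reflect without repeating the edge") folding of an index into [0, W).
    long long mirror(long long i, long long W)
    {
        if (W == 1) return 0;
        long long p = 2 * W - 2;                 // period of the mirrored signal
        i = (i < 0) ? (-i - p * ((-i) / p))      // fold negative indexes into [0, p)
                    : (i - p * (i / p));         // fold large indexes into [0, p)
        return (i >= W) ? (p - i) : i;           // reflect the upper half back into [0, W)
    }

    int main()
    {
        // For W = 5 the valid samples are 0..4; e.g. -2 -> 2, -1 -> 1, 5 -> 3, 6 -> 2, 8 -> 0.
        for (long long i = -3; i <= 9; i++)
            std::printf("%lld -> %lld\n", i, mirror(i, 5));
        return 0;
    }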
diff --git a/toolboxes/core/cpu/cpucore_export.h b/toolboxes/core/cpu/cpucore_export.h
new file mode 100644
index 0000000..3d5cd15
--- /dev/null
+++ b/toolboxes/core/cpu/cpucore_export.h
@@ -0,0 +1,18 @@
+/** \file cpucore_export.h
+    \brief Required definitions for Windows, importing/exporting dll symbols 
+*/
+
+#ifndef CPUCORE_EXPORT_H_
+#define CPUCORE_EXPORT_H_
+
+#if defined (WIN32)
+    #if defined (__BUILD_GADGETRON_CPUCORE__) || defined (cpucore_EXPORTS)
+        #define EXPORTCPUCORE __declspec(dllexport)
+    #else
+        #define EXPORTCPUCORE __declspec(dllimport)
+    #endif
+#else
+#define EXPORTCPUCORE
+#endif
+
+#endif /* CPUCORE_EXPORT_H_ */
diff --git a/toolboxes/core/cpu/dummy.cpp b/toolboxes/core/cpu/dummy.cpp
new file mode 100644
index 0000000..b89eb11
--- /dev/null
+++ b/toolboxes/core/cpu/dummy.cpp
@@ -0,0 +1,18 @@
+//
+// THIS IS A TEMPORARY FILE
+//
+// This file is to be removed from the repository once Hui merges his branch into mem_ops/development
+// Currently this is the only .cpp in this folder, and one needs to be present to satisfy cmake and generate the .lib
+//
+
+#ifdef WIN32
+#include <stdio.h>
+
+namespace Gadgetron {
+
+	void __declspec(dllexport) __this_is_a_temp_dummy_to_force_lib_file__(void){
+		printf("\n\nINSIDE DUMMY\n\n");
+	}
+}
+
+#endif
\ No newline at end of file
diff --git a/toolboxes/core/cpu/gadgetronmath.h b/toolboxes/core/cpu/gadgetronmath.h
new file mode 100644
index 0000000..91bed7f
--- /dev/null
+++ b/toolboxes/core/cpu/gadgetronmath.h
@@ -0,0 +1,26 @@
+#pragma once
+
+/** \file gadgetronmath.h
+\brief Math utility functions
+
+*/
+
+#define _USE_MATH_DEFINES
+#include <math.h>
+
+namespace Gadgetron {
+
+template <typename T> T sinc(T x) {
+  
+  T val;
+  if (std::abs(x)<.01) {
+    // to 6th order
+    val = 1.0 - 1/6.*std::pow(M_PI*x,2) + 1/120.*std::pow(M_PI*x,4) - 1/5040.*std::pow(M_PI*x,6);
+  } else {
+    val = std::sin(M_PI*x) / (M_PI*x);
+  }
+
+  return val;
+}
+
+}
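
The small-argument branch above is the Taylor series of sinc: sin(pi*x)/(pi*x) =
1 - (pi*x)^2/3! + (pi*x)^4/5! - (pi*x)^6/7! + ..., which is where the 1/6, 1/120 and
1/5040 coefficients come from. A minimal standalone check (illustration only, not part of
the toolbox):

    #define _USE_MATH_DEFINES
    #include <cmath>
    #include <cstdio>

    int main()
    {
        // Compare the 6th-order expansion used for |x| < 0.01 with the direct quotient.
        const double xs[] = {1e-3, 5e-3, 9e-3};
        for (double x : xs)
        {
            double px = M_PI * x;
            double series = 1.0 - px*px/6.0 + std::pow(px, 4)/120.0 - std::pow(px, 6)/5040.0;
            double direct = std::sin(px) / px;
            std::printf("x = %g  series = %.16g  direct = %.16g\n", x, series, direct);
        }
        return 0;
    }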
diff --git a/toolboxes/core/cpu/ho2DArray.h b/toolboxes/core/cpu/ho2DArray.h
new file mode 100644
index 0000000..11c81ae
--- /dev/null
+++ b/toolboxes/core/cpu/ho2DArray.h
@@ -0,0 +1,83 @@
+#pragma once
+
+#include "hoNDArray.h"
+
+namespace Gadgetron{
+
+template <class T> class ho2DArray : public hoNDArray<T>
+{
+public:
+
+    typedef hoNDArray<T> BaseClass;
+
+    using BaseClass::create;
+
+    ho2DArray();
+    ho2DArray(size_t sx, size_t sy);
+    explicit ho2DArray(std::vector<size_t> *dimensions);
+    ho2DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+    ho2DArray(size_t sx, size_t sy, T* data, bool delete_data_on_destruct = false);
+    ho2DArray(boost::shared_ptr< std::vector<size_t> > dimensions);
+    ho2DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual ~ho2DArray();
+
+    ho2DArray(const ho2DArray<T>& a);
+    ho2DArray<T>& operator=(const ho2DArray<T>& rhs);
+
+    virtual void create(std::vector<size_t>& dimensions);
+    virtual void create(std::vector<size_t> *dimensions);
+    virtual void create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual bool createArray(size_t sx, size_t sy);
+    virtual bool createArray(size_t sx, size_t sy, T* data, bool delete_data_on_destruct = false);
+
+    T& operator()(size_t x , size_t y);
+    const T& operator()(size_t x , size_t y) const;
+
+    T& operator()( const std::vector<size_t>& ind ) { return (*this)(ind[0], ind[1]); }
+    const T& operator()( const std::vector<size_t>& ind ) const  { return (*this)(ind[0], ind[1]); }
+
+    T& operator()( size_t x ) { return (*this)(x, 0); }
+    const T& operator()( size_t x ) const { return (*this)(x, 0); }
+
+    T& operator()( size_t x, size_t y, size_t z ) { return (*this)(x, y); }
+    const T& operator()( size_t x, size_t y, size_t z ) const { return (*this)(x, y); }
+
+    T& operator()( size_t x, size_t y, size_t z, size_t s ) { return (*this)(x, y); }
+    const T& operator()( size_t x, size_t y, size_t z, size_t s ) const { return (*this)(x, y); }
+
+    T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p ) { return (*this)(x, y); }
+    const T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p ) const { return (*this)(x, y); }
+
+    T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r ) { return (*this)(x, y); }
+    const T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r ) const { return (*this)(x, y); }
+
+    T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a ) { return (*this)(x, y); }
+    const T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a ) const { return (*this)(x, y); }
+
+    T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q ) { return (*this)(x, y); }
+    const T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q ) const { return (*this)(x, y); }
+
+    T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u ) { return (*this)(x, y); }
+    const T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u ) const { return (*this)(x, y); }
+
+    virtual void print(std::ostream& os) const;
+
+protected:
+
+    using BaseClass::dimensions_;
+    using BaseClass::offsetFactors_;
+    using BaseClass::data_;
+    using BaseClass::elements_;
+    using BaseClass::delete_data_on_destruct_;
+
+    bool init_accesser();
+    bool release_accesser();
+
+    T** accesser_;
+};
+
+}
+
+#include <ho2DArray.hxx>
diff --git a/toolboxes/core/cpu/ho2DArray.hxx b/toolboxes/core/cpu/ho2DArray.hxx
new file mode 100644
index 0000000..9af9d7d
--- /dev/null
+++ b/toolboxes/core/cpu/ho2DArray.hxx
@@ -0,0 +1,261 @@
+
+namespace Gadgetron{
+
+template <typename T> 
+ho2DArray<T>::ho2DArray()
+: BaseClass(), accesser_(NULL)
+{
+}
+
+template <typename T> 
+ho2DArray<T>::ho2DArray(size_t sx, size_t sy)
+: accesser_(NULL)
+{
+    std::vector<size_t> dim(2);
+    dim[0] = sx;
+    dim[1] = sy;
+
+    this->create(&dim);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho2DArray<T>::ho2DArray(std::vector<size_t> *dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==2);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho2DArray<T>::ho2DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==2);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho2DArray<T>::ho2DArray(size_t sx, size_t sy, T* data, bool delete_data_on_destruct)
+: BaseClass(sx, sy, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho2DArray<T>::ho2DArray(boost::shared_ptr< std::vector<size_t> > dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==2);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho2DArray<T>::ho2DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==2);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho2DArray<T>::~ho2DArray()
+{
+    GADGET_CHECK_THROW(release_accesser());
+}
+
+template <typename T> 
+ho2DArray<T>::ho2DArray(const ho2DArray<T>& a)
+: BaseClass(a), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho2DArray<T>& ho2DArray<T>::operator=(const ho2DArray<T>& rhs)
+{
+    if ( &rhs == this ) return *this;
+
+    if ( rhs.get_number_of_elements() == 0 )
+    {
+        this->clear();
+        GADGET_CHECK_THROW(init_accesser());
+        return *this;
+    }
+
+    if (this->dimensions_equal(&rhs)) 
+    {
+        memcpy(this->data_, rhs.data_, this->elements_*sizeof(T));
+    }
+    else
+    {
+        this->deallocate_memory();
+        this->data_ = 0;
+        this->dimensions_ = rhs.dimensions_;
+        this->allocate_memory();
+        memcpy( this->data_, rhs.data_, this->elements_*sizeof(T) );
+
+        GADGET_CHECK_THROW(init_accesser());
+    }
+
+    return *this;
+}
+
+template <typename T> 
+void ho2DArray<T>::create(std::vector<size_t>& dimensions)
+{
+    BaseClass::create(dimensions);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+void ho2DArray<T>::create(std::vector<size_t> *dimensions)
+{
+    BaseClass::create(dimensions);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+void ho2DArray<T>::create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+{
+    BaseClass::create(dimensions, data, delete_data_on_destruct);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+bool ho2DArray<T>::createArray(size_t sx, size_t sy)
+{
+    try
+    {
+        std::vector<size_t> dim(2);
+        dim[0] = sx;
+        dim[1] = sy;
+
+        if ( !this->dimensions_equal(&dim) )
+        {
+            this->create(&dim);
+            GADGET_CHECK_RETURN_FALSE(init_accesser());
+        }
+        else
+        {
+            memset(this->data_, 0, sizeof(T)*this->elements_);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("ho2DArray<T>::createArray(size_t sx, size_t sy) ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool ho2DArray<T>::createArray(size_t sx, size_t sy, T* data, bool delete_data_on_destruct)
+{
+    try
+    {
+        std::vector<size_t> dim(2);
+        dim[0] = sx;
+        dim[1] = sy;
+
+        this->create(&dim, data, delete_data_on_destruct);
+        GADGET_CHECK_RETURN_FALSE(init_accesser());
+    }
+    catch(...)
+    {
+        GERROR_STREAM("ho2DArray<T>::createArray(size_t sx, size_t sy, T* data, bool delete_data_on_destruct) ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+inline T& ho2DArray<T>::operator()(size_t x , size_t y)
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1]);
+    return accesser_[y][x];
+}
+
+template <typename T> 
+inline const T& ho2DArray<T>::operator()(size_t x , size_t y) const
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1]);
+    return accesser_[y][x];
+}
+
+template <typename T> 
+bool ho2DArray<T>::init_accesser()
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(release_accesser());
+
+        if ( elements_ > 0 )
+        {
+            size_t sx = (*dimensions_)[0];
+            size_t sy = (*dimensions_)[1];
+
+            accesser_ = new T*[sy];
+            if( accesser_ == NULL) return false;
+
+            accesser_[0] = data_;
+            for (size_t y=1; y<sy; y++)
+            {
+                accesser_[y] = accesser_[y-1] + sx;
+            }
+        }
+        else
+        {
+            accesser_ = NULL;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in ho2DArray<T>::init_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool ho2DArray<T>::release_accesser()
+{
+    try
+    {
+        if (accesser_ != NULL)
+        {
+            delete [] accesser_;
+        }
+        accesser_ = NULL;
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in ho2DArray<T>::release_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+void ho2DArray<T>::print(std::ostream& os) const
+{
+    BaseClass::print(os);
+    size_t x, y;
+    os << "-------------------------------------------" << std::endl;
+    for (y=0; y<(*dimensions_)[1]; y++) 
+    {
+        os << "y " << y << "\t";
+        for (x=0; x<(*dimensions_)[0]; x++)
+        {
+            os << (*this)(x,y) << "\t";
+        }
+        os << std::endl;
+    }
+    os << "-------------------------------------------" << std::endl;
+}
+
+}
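
init_accesser() above builds a row-pointer table over the flat hoNDArray buffer so that
accesser_[y][x] aliases data_[y*sx + x]; the 3D/4D/5D variants below extend the same idea
with one extra pointer level per dimension. A minimal standalone sketch of the layout
(illustration only; the names here are local to the example):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main()
    {
        const size_t sx = 4, sy = 3;
        std::vector<float> data(sx * sy);
        for (size_t i = 0; i < data.size(); i++) data[i] = (float)i;

        // Same recurrence as ho2DArray<T>::init_accesser(): row y starts sx elements after row y-1.
        std::vector<float*> rows(sy);
        rows[0] = data.data();
        for (size_t y = 1; y < sy; y++) rows[y] = rows[y - 1] + sx;

        // rows[y][x] and data[y*sx + x] refer to the same element.
        std::printf("rows[2][1] = %g, data[2*sx + 1] = %g\n", rows[2][1], data[2 * sx + 1]);
        return 0;
    }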
diff --git a/toolboxes/core/cpu/ho3DArray.h b/toolboxes/core/cpu/ho3DArray.h
new file mode 100644
index 0000000..afeb109
--- /dev/null
+++ b/toolboxes/core/cpu/ho3DArray.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include "hoNDArray.h"
+
+namespace Gadgetron{
+
+template <class T> class ho3DArray : public hoNDArray<T>
+{
+public:
+
+    typedef hoNDArray<T> BaseClass;
+
+    ho3DArray();
+    ho3DArray(size_t sx, size_t sy, size_t sz);
+    explicit ho3DArray(std::vector<size_t> *dimensions);
+    ho3DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+    ho3DArray(size_t sx, size_t sy, size_t sz, T* data, bool delete_data_on_destruct = false);
+    ho3DArray(boost::shared_ptr< std::vector<size_t> > dimensions);
+    ho3DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual ~ho3DArray();
+
+    ho3DArray(const ho3DArray<T>& a);
+    ho3DArray<T>& operator=(const ho3DArray<T>& rhs);
+
+    virtual void create(std::vector<size_t>& dimensions);
+    virtual void create(std::vector<size_t> *dimensions);
+    virtual void create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual bool createArray(size_t sx, size_t sy, size_t sz);
+    virtual bool createArray(size_t sx, size_t sy, size_t sz, T* data, bool delete_data_on_destruct = false);
+
+    T& operator()(size_t x, size_t y, size_t z);
+    const T& operator()(size_t x, size_t y, size_t z) const;
+
+    virtual void print(std::ostream& os) const;
+
+protected:
+
+    using BaseClass::dimensions_;
+    using BaseClass::offsetFactors_;
+    using BaseClass::data_;
+    using BaseClass::elements_;
+    using BaseClass::delete_data_on_destruct_;
+
+    bool init_accesser();
+    bool release_accesser();
+
+    T*** accesser_;
+};
+
+}
+
+#include <ho3DArray.hxx>
diff --git a/toolboxes/core/cpu/ho3DArray.hxx b/toolboxes/core/cpu/ho3DArray.hxx
new file mode 100644
index 0000000..5b0377c
--- /dev/null
+++ b/toolboxes/core/cpu/ho3DArray.hxx
@@ -0,0 +1,287 @@
+
+namespace Gadgetron{
+
+template <typename T> 
+ho3DArray<T>::ho3DArray()
+: BaseClass(), accesser_(NULL)
+{
+}
+
+template <typename T> 
+ho3DArray<T>::ho3DArray(size_t sx, size_t sy, size_t sz)
+: accesser_(NULL)
+{
+    std::vector<size_t> dim(3);
+    dim[0] = sx;
+    dim[1] = sy;
+    dim[2] = sz;
+
+    this->create(&dim);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho3DArray<T>::ho3DArray(std::vector<size_t> *dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==3);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho3DArray<T>::ho3DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==3);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho3DArray<T>::ho3DArray(size_t sx, size_t sy, size_t sz, T* data, bool delete_data_on_destruct)
+: BaseClass(sx, sy, sz, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho3DArray<T>::ho3DArray(boost::shared_ptr< std::vector<size_t> > dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==3);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho3DArray<T>::ho3DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==3);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho3DArray<T>::~ho3DArray()
+{
+    GADGET_CHECK_THROW(release_accesser());
+}
+
+template <typename T> 
+ho3DArray<T>::ho3DArray(const ho3DArray<T>& a)
+: BaseClass(a), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho3DArray<T>& ho3DArray<T>::operator=(const ho3DArray& rhs)
+{
+    if ( &rhs == this ) return *this;
+
+    if ( rhs.get_number_of_elements() == 0 )
+    {
+        this->clear();
+        GADGET_CHECK_THROW(init_accesser());
+        return *this;
+    }
+
+    if (this->dimensions_equal(&rhs)) 
+    {
+        memcpy(this->data_, rhs.data_, this->elements_*sizeof(T));
+    }
+    else
+    {
+        this->deallocate_memory();
+        this->data_ = 0;
+        this->dimensions_ = rhs.dimensions_;
+        this->allocate_memory();
+        memcpy( this->data_, rhs.data_, this->elements_*sizeof(T) );
+
+        GADGET_CHECK_THROW(init_accesser());
+    }
+
+    return *this;
+}
+
+template <typename T> 
+void ho3DArray<T>::create(std::vector<size_t>& dimensions)
+{
+    BaseClass::create(dimensions);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+void ho3DArray<T>::create(std::vector<size_t> *dimensions)
+{
+    BaseClass::create(dimensions);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+void ho3DArray<T>::create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+{
+    BaseClass::create(dimensions, data, delete_data_on_destruct);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+bool ho3DArray<T>::createArray(size_t sx, size_t sy, size_t sz)
+{
+    try
+    {
+        std::vector<size_t> dim(3);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+
+        if ( !this->dimensions_equal(&dim) )
+        {
+            this->create(&dim);
+            GADGET_CHECK_RETURN_FALSE(init_accesser());
+        }
+        else
+        {
+            memset(this->data_, 0, sizeof(T)*this->elements_);
+        }
+    }
+    catch(...)
+    {
+        GADGET_THROW("ho3DArray<T>::createArray(size_t sx, size_t sy, size_t sz) ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool ho3DArray<T>::createArray(size_t sx, size_t sy, size_t sz, T* data, bool delete_data_on_destruct)
+{
+    try
+    {
+        std::vector<size_t> dim(3);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+
+        this->create(&dim, data, delete_data_on_destruct);
+        GADGET_CHECK_RETURN_FALSE(init_accesser());
+    }
+    catch(...)
+    {
+        GADGET_THROW("ho3DArray<T>::createArray(size_t sx, size_t sy, size_t sz, T* data, bool delete_data_on_destruct) ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+inline T& ho3DArray<T>::operator()(size_t x , size_t y, size_t z)
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1] && z<(*dimensions_)[2]);
+    return accesser_[z][y][x];
+}
+
+template <typename T> 
+inline const T& ho3DArray<T>::operator()(size_t x , size_t y, size_t z) const
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1] && z<(*dimensions_)[2]);
+    return accesser_[z][y][x];
+}
+
+template <typename T> 
+bool ho3DArray<T>::init_accesser()
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(release_accesser());
+
+        if ( elements_ > 0 )
+        {
+            size_t sx = (*dimensions_)[0];
+            size_t sy = (*dimensions_)[1];
+            size_t sz = (*dimensions_)[2];
+
+            size_t y, z;
+
+            accesser_ = new T**[sz];
+            if( accesser_ == NULL) return false;
+
+            accesser_[0] = new T*[sy*sz];
+            if( accesser_[0] == NULL)
+            {
+                delete [] accesser_;
+                return false;
+            }
+            for (z = 1; z < sz; z++)
+            {
+                accesser_[z] = accesser_[z-1] + sy;
+            }
+
+            accesser_[0][0] = data_;
+
+            for (z=0; z<sz; z++)
+            {
+                for (y=0; y<sy; y++)
+                {
+                    accesser_[z][y] = accesser_[0][0] + (z*sy+y)*sx;
+                }
+            }
+        }
+        else
+        {
+            accesser_ = NULL;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in ho3DArray<T>::init_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool ho3DArray<T>::release_accesser()
+{
+    try
+    {
+        if (accesser_ != NULL)
+        {
+            delete [] accesser_[0];
+            delete [] accesser_;
+        }
+        accesser_ = NULL;
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in ho3DArray<T>::release_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+void ho3DArray<T>::print(std::ostream& os) const
+{
+    BaseClass::print(os);
+    size_t x, y, z;
+    os << "-------------------------------------------" << std::endl;
+    for (z=0; z<(*dimensions_)[2]; z++) 
+    {
+        os << "Array3D (:, :, " << z << ") = " << std::endl;
+        for (y=0; y<(*dimensions_)[1]; y++) 
+        {
+            os << "y " << y << "\t";
+            for (x=0; x<(*dimensions_)[0]; x++)
+            {
+                os << (*this)(x,y,z) << "\t";
+            }
+            os << std::endl;
+        }
+    }
+    os << "-------------------------------------------" << std::endl;
+}
+
+}
diff --git a/toolboxes/core/cpu/ho4DArray.h b/toolboxes/core/cpu/ho4DArray.h
new file mode 100644
index 0000000..524bd56
--- /dev/null
+++ b/toolboxes/core/cpu/ho4DArray.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include "hoNDArray.h"
+
+namespace Gadgetron{
+
+template <class T> class ho4DArray : public hoNDArray<T>
+{
+public:
+
+    typedef hoNDArray<T> BaseClass;
+
+    ho4DArray();
+    ho4DArray(size_t sx, size_t sy, size_t sz, size_t ss);
+    explicit ho4DArray(std::vector<size_t> *dimensions);
+    ho4DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+    ho4DArray(size_t sx, size_t sy, size_t sz, size_t ss, T* data, bool delete_data_on_destruct = false);
+    ho4DArray(boost::shared_ptr< std::vector<size_t> > dimensions);
+    ho4DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual ~ho4DArray();
+
+    ho4DArray(const ho4DArray<T>& a);
+    ho4DArray<T>& operator=(const ho4DArray<T>& rhs);
+
+    virtual void create(std::vector<size_t>& dimensions);
+    virtual void create(std::vector<size_t> *dimensions);
+    virtual void create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual bool createArray(size_t sx, size_t sy, size_t sz, size_t ss);
+    virtual bool createArray(size_t sx, size_t sy, size_t sz, size_t ss, T* data, bool delete_data_on_destruct = false);
+
+    T& operator()(size_t x, size_t y, size_t z, size_t s);
+    const T& operator()(size_t x , size_t y, size_t z, size_t s) const;
+
+    virtual void print(std::ostream& os) const;
+
+protected:
+
+    using BaseClass::dimensions_;
+    using BaseClass::offsetFactors_;
+    using BaseClass::data_;
+    using BaseClass::elements_;
+    using BaseClass::delete_data_on_destruct_;
+
+    bool init_accesser();
+    bool release_accesser();
+
+    T**** accesser_;
+};
+
+}
+
+#include <ho4DArray.hxx>
diff --git a/toolboxes/core/cpu/ho4DArray.hxx b/toolboxes/core/cpu/ho4DArray.hxx
new file mode 100644
index 0000000..79167d4
--- /dev/null
+++ b/toolboxes/core/cpu/ho4DArray.hxx
@@ -0,0 +1,313 @@
+
+namespace Gadgetron{
+
+template <typename T> 
+ho4DArray<T>::ho4DArray()
+: BaseClass(), accesser_(NULL)
+{
+}
+
+template <typename T> 
+ho4DArray<T>::ho4DArray(size_t sx, size_t sy, size_t sz, size_t ss)
+: accesser_(NULL)
+{
+    std::vector<size_t> dim(4);
+    dim[0] = sx;
+    dim[1] = sy;
+    dim[2] = sz;
+    dim[3] = ss;
+
+    this->create(&dim);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho4DArray<T>::ho4DArray(std::vector<size_t> *dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==4);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho4DArray<T>::ho4DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==4);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho4DArray<T>::ho4DArray(size_t sx, size_t sy, size_t sz, size_t ss, T* data, bool delete_data_on_destruct)
+: BaseClass(sx, sy, sz, ss, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho4DArray<T>::ho4DArray(boost::shared_ptr< std::vector<size_t> > dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==4);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho4DArray<T>::ho4DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==4);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho4DArray<T>::~ho4DArray()
+{
+    GADGET_CHECK_THROW(release_accesser());
+}
+
+template <typename T> 
+ho4DArray<T>::ho4DArray(const ho4DArray<T>& a)
+: BaseClass(a), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho4DArray<T>& ho4DArray<T>::operator=(const ho4DArray<T>& rhs)
+{
+    if ( &rhs == this ) return *this;
+
+    if ( rhs.get_number_of_elements() == 0 )
+    {
+        this->clear();
+        GADGET_CHECK_THROW(init_accesser());
+        return *this;
+    }
+
+    if (this->dimensions_equal(&rhs)) 
+    {
+        memcpy(this->data_, rhs.data_, this->elements_*sizeof(T));
+    }
+    else
+    {
+        this->deallocate_memory();
+        this->data_ = 0;
+        this->dimensions_ = rhs.dimensions_;
+        this->allocate_memory();
+        memcpy( this->data_, rhs.data_, this->elements_*sizeof(T) );
+
+        GADGET_CHECK_THROW(init_accesser());
+    }
+
+    return *this;
+}
+
+template <typename T> 
+void ho4DArray<T>::create(std::vector<size_t>& dimensions)
+{
+    BaseClass::create(dimensions);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+void ho4DArray<T>::create(std::vector<size_t> *dimensions)
+{
+    BaseClass::create(dimensions);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+void ho4DArray<T>::create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+{
+    BaseClass::create(dimensions, data, delete_data_on_destruct);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+bool ho4DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss)
+{
+    try
+    {
+        std::vector<size_t> dim(4);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = ss;
+
+        if ( !this->dimensions_equal(&dim) )
+        {
+            this->create(&dim);
+            GADGET_CHECK_RETURN_FALSE(init_accesser());
+        }
+        else
+        {
+            memset(this->data_, 0, sizeof(T)*this->elements_);
+        }
+    }
+    catch(...)
+    {
+        GADGET_THROW("ho4DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss) ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool ho4DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, T* data, bool delete_data_on_destruct)
+{
+    try
+    {
+        std::vector<size_t> dim(4);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = ss;
+
+        this->create(&dim, data, delete_data_on_destruct);
+        GADGET_CHECK_RETURN_FALSE(init_accesser());
+    }
+    catch(...)
+    {
+        GADGET_THROW("ho4DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, T* data, bool delete_data_on_destruct) ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+inline T& ho4DArray<T>::operator()(size_t x, size_t y, size_t z, size_t s)
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1] && z<(*dimensions_)[2] && s<(*dimensions_)[3]);
+    return accesser_[s][z][y][x];
+}
+
+template <typename T> 
+inline const T& ho4DArray<T>::operator()(size_t x, size_t y, size_t z, size_t s) const
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1] && z<(*dimensions_)[2] && s<(*dimensions_)[3]);
+    return accesser_[s][z][y][x];
+}
+
+template <typename T> 
+bool ho4DArray<T>::init_accesser()
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(release_accesser());
+
+        if ( elements_ > 0 )
+        {
+            size_t sx = (*dimensions_)[0];
+            size_t sy = (*dimensions_)[1];
+            size_t sz = (*dimensions_)[2];
+            size_t ss = (*dimensions_)[3];
+
+            size_t y, z, s;
+
+            accesser_ = new T***[ss];
+            if( accesser_ == NULL) return false;
+
+            accesser_[0] = new T**[sz*ss];
+            if( accesser_[0] == NULL)
+            {
+                delete [] accesser_;
+                return false;
+            }
+            for (s=1; s<ss; s++)
+            {
+                accesser_[s] = accesser_[s-1] + sz;
+            }
+
+            accesser_[0][0] = new T*[sy*sz*ss];
+            if (accesser_[0][0] == NULL)
+            {
+                delete [] accesser_[0];
+                delete [] accesser_;
+                return false;
+            }
+
+            for (s=0; s<ss; s++)
+            {
+                for (z=0; z<sz; z++)
+                {
+                    accesser_[s][z] = accesser_[0][0] + s*sz*sy + z*sy;
+                }
+            }
+
+            accesser_[0][0][0] = data_;
+            for (s=0; s<ss; s++)
+            {
+                for (z=0; z<sz; z++)
+                {
+                    for (y=0; y<sy; y++)
+                    {
+                        accesser_[s][z][y] = accesser_[0][0][0] + s*sz*sy*sx + z*sy*sx + y*sx;
+                    }
+                }
+            }
+        }
+        else
+        {
+            accesser_ = NULL;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in ho4DArray<T>::init_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool ho4DArray<T>::release_accesser()
+{
+    try
+    {
+        if (accesser_ != NULL)
+        {
+            delete [] accesser_[0][0];
+            delete [] accesser_[0];
+            delete [] accesser_;
+        }
+        accesser_ = NULL;
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in ho4DArray<T>::release_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+void ho4DArray<T>::print(std::ostream& os) const
+{
+    BaseClass::print(os);
+    size_t x, y, z, s;
+    os << "-------------------------------------------" << std::endl;
+    for (s=0; s<(*dimensions_)[3]; s++) 
+    {
+        for (z=0; z<(*dimensions_)[2]; z++) 
+        {
+            os << "ho4DArray (:, :, " << z << ", " << s << ") = " << std::endl;
+            for (y=0; y<(*dimensions_)[1]; y++) 
+            {
+                os << "y " << y << "\t";
+                for (x=0; x<(*dimensions_)[0]; x++)
+                {
+                    os << (*this)(x,y,z,s) << "\t";
+                }
+                os << std::endl;
+            }
+        }
+    }
+    os << "-------------------------------------------" << std::endl;
+}
+
+}
diff --git a/toolboxes/core/cpu/ho5DArray.h b/toolboxes/core/cpu/ho5DArray.h
new file mode 100644
index 0000000..50a4ed4
--- /dev/null
+++ b/toolboxes/core/cpu/ho5DArray.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include "hoNDArray.h"
+
+namespace Gadgetron{
+
+template <class T> class ho5DArray : public hoNDArray<T>
+{
+public:
+
+    typedef hoNDArray<T> BaseClass;
+
+    ho5DArray();
+    ho5DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp);
+    explicit ho5DArray(std::vector<size_t> *dimensions);
+    ho5DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+    ho5DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, T* data, bool delete_data_on_destruct = false);
+    ho5DArray(boost::shared_ptr< std::vector<size_t> > dimensions);
+    ho5DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual ~ho5DArray();
+
+    ho5DArray(const ho5DArray<T>& a);
+    ho5DArray<T>& operator=(const ho5DArray<T>& rhs);
+
+    virtual void create(std::vector<size_t>& dimensions);
+    virtual void create(std::vector<size_t> *dimensions);
+    virtual void create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual bool createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp);
+    virtual bool createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, T* data, bool delete_data_on_destruct = false);
+
+    T& operator()(size_t x, size_t y, size_t z, size_t s, size_t p);
+    const T& operator()(size_t x , size_t y, size_t z, size_t s, size_t p) const;
+
+    virtual void print(std::ostream& os) const;
+
+protected:
+
+    using BaseClass::dimensions_;
+    using BaseClass::offsetFactors_;
+    using BaseClass::data_;
+    using BaseClass::elements_;
+    using BaseClass::delete_data_on_destruct_;
+
+    bool init_accesser();
+    bool release_accesser();
+
+    T***** accesser_;
+};
+
+}
+
+#include <ho5DArray.hxx>
diff --git a/toolboxes/core/cpu/ho5DArray.hxx b/toolboxes/core/cpu/ho5DArray.hxx
new file mode 100644
index 0000000..1f284b4
--- /dev/null
+++ b/toolboxes/core/cpu/ho5DArray.hxx
@@ -0,0 +1,345 @@
+
+namespace Gadgetron{
+
+template <typename T> 
+ho5DArray<T>::ho5DArray()
+: BaseClass(), accesser_(NULL)
+{
+}
+
+template <typename T> 
+ho5DArray<T>::ho5DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp)
+: accesser_(NULL)
+{
+    std::vector<size_t> dim(5);
+    dim[0] = sx;
+    dim[1] = sy;
+    dim[2] = sz;
+    dim[3] = ss;
+    dim[4] = sp;
+
+    this->create(&dim);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho5DArray<T>::ho5DArray(std::vector<size_t> *dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==5);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho5DArray<T>::ho5DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==5);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho5DArray<T>::ho5DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, T* data, bool delete_data_on_destruct)
+: BaseClass(sx, sy, sz, ss, sp, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho5DArray<T>::ho5DArray(boost::shared_ptr< std::vector<size_t> > dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==5);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho5DArray<T>::ho5DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==5);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho5DArray<T>::~ho5DArray()
+{
+    GADGET_CHECK_THROW(release_accesser());
+}
+
+template <typename T> 
+ho5DArray<T>::ho5DArray(const ho5DArray<T>& a)
+: BaseClass(a), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho5DArray<T>& ho5DArray<T>::operator=(const ho5DArray<T>& rhs)
+{
+    if ( &rhs == this ) return *this;
+
+    if ( rhs.get_number_of_elements() == 0 )
+    {
+        this->clear();
+        GADGET_CHECK_THROW(init_accesser());
+        return *this;
+    }
+
+    if (this->dimensions_equal(&rhs)) 
+    {
+        memcpy(this->data_, rhs.data_, this->elements_*sizeof(T));
+    }
+    else
+    {
+        this->deallocate_memory();
+        this->data_ = 0;
+        this->dimensions_ = rhs.dimensions_;
+        this->allocate_memory();
+        memcpy( this->data_, rhs.data_, this->elements_*sizeof(T) );
+
+        GADGET_CHECK_THROW(init_accesser());
+    }
+
+    return *this;
+}
+
+template <typename T> 
+void ho5DArray<T>::create(std::vector<size_t>& dimensions)
+{
+    BaseClass::create(dimensions);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+void ho5DArray<T>::create(std::vector<size_t> *dimensions)
+{
+    BaseClass::create(dimensions);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+void ho5DArray<T>::create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+{
+    BaseClass::create(dimensions, data, delete_data_on_destruct);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+bool ho5DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp)
+{
+    try
+    {
+        std::vector<size_t> dim(5);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = ss;
+        dim[4] = sp;
+
+        if ( !this->dimensions_equal(&dim) )
+        {
+            this->create(&dim);
+            GADGET_CHECK_RETURN_FALSE(init_accesser());
+        }
+        else
+        {
+            memset(this->data_, 0, sizeof(T)*this->elements_);
+        }
+    }
+    catch(...)
+    {
+        GADGET_THROW("ho5DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp) ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool ho5DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, T* data, bool delete_data_on_destruct)
+{
+    try
+    {
+        std::vector<size_t> dim(5);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = ss;
+        dim[4] = sp;
+
+        this->create(&dim, data, delete_data_on_destruct);
+        GADGET_CHECK_RETURN_FALSE(init_accesser());
+    }
+    catch(...)
+    {
+        GADGET_THROW("ho5DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, T* data, bool delete_data_on_destruct) ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+inline T& ho5DArray<T>::operator()(size_t x, size_t y, size_t z, size_t s, size_t p)
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1] && z<(*dimensions_)[2] && s<(*dimensions_)[3] && p<(*dimensions_)[4]);
+    return accesser_[p][s][z][y][x];
+}
+
+template <typename T> 
+inline const T& ho5DArray<T>::operator()(size_t x, size_t y, size_t z, size_t s, size_t p) const
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1] && z<(*dimensions_)[2] && s<(*dimensions_)[3] && p<(*dimensions_)[4]);
+    return accesser_[p][s][z][y][x];
+}
+
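+// init_accesser() builds a five-level pointer table over the flat data_ buffer so that
+// accesser_[p][s][z][y][x] resolves to data_[x + sx*(y + sy*(z + sz*(s + ss*p)))].
+// Each level allocates a single array of pointers (sizes sp, ss*sp, sz*ss*sp, sy*sz*ss*sp);
+// the innermost pointers point directly into data_, so no element data is copied.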
+template <typename T> 
+bool ho5DArray<T>::init_accesser()
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(release_accesser());
+
+        if ( elements_ > 0 )
+        {
+            size_t sx = (*dimensions_)[0];
+            size_t sy = (*dimensions_)[1];
+            size_t sz = (*dimensions_)[2];
+            size_t ss = (*dimensions_)[3];
+            size_t sp = (*dimensions_)[4];
+
+            size_t y, z, s, p;
+
+            accesser_ = new T****[sp];
+            if( accesser_ == NULL) return false;
+
+            accesser_[0] = new T***[ss*sp];
+            if( accesser_[0] == NULL)
+            {
+                delete [] accesser_;
+                return false;
+            }
+
+            for (p=1; p<sp; p++)
+            {
+                accesser_[p] = accesser_[p-1] + ss;
+            }
+
+            accesser_[0][0] = new T**[sz*ss*sp];
+            if (accesser_[0][0] == NULL)
+            {
+                delete [] accesser_[0];
+                delete [] accesser_;
+                return false;
+            }
+
+            for (p=0; p<sp; p++)
+            {
+                for (s=0; s<ss; s++)
+                {
+                    accesser_[p][s] = accesser_[0][0] + p*ss*sz + s*sz;
+                }
+            }
+
+            accesser_[0][0][0] = new T*[sy*sz*ss*sp];
+            if (accesser_[0][0][0] == NULL)
+            {
+                delete [] accesser_[0][0];
+                delete [] accesser_[0];
+                delete [] accesser_;
+                return false;
+            }
+
+            for (p=0; p<sp; p++)
+            {
+                for (s=0; s<ss; s++)
+                {
+                    for (z=0; z<sz; z++)
+                    {
+                        accesser_[p][s][z] = accesser_[0][0][0] + p*ss*sz*sy + s*sz*sy + z*sy;
+                    }
+                }
+            }
+
+            accesser_[0][0][0][0] = data_;
+            for (p=0; p<sp; p++)
+            {
+                for (s=0; s<ss; s++)
+                {
+                    for (z=0; z<sz; z++)
+                    {
+                        for (y=0; y<sy; y++)
+                        {
+                            accesser_[p][s][z][y] = accesser_[0][0][0][0] + p*ss*sz*sy*sx + s*sz*sy*sx + z*sy*sx+y*sx;
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            accesser_ = NULL;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in ho5DArray<T>::init_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool ho5DArray<T>::release_accesser()
+{
+    try
+    {
+        if (accesser_ != NULL)
+        {
+            delete [] accesser_[0][0][0];
+            delete [] accesser_[0][0];
+            delete [] accesser_[0];
+            delete [] accesser_;
+        }
+        accesser_ = NULL;
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in ho5DArray<T>::release_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+void ho5DArray<T>::print(std::ostream& os) const
+{
+    BaseClass::print(os);
+    size_t x, y, z, s, p;
+    os << "-------------------------------------------" << std::endl;
+    for (p=0; p<(*dimensions_)[4]; p++) 
+    {
+        for (s=0; s<(*dimensions_)[3]; s++) 
+        {
+            for (z=0; z<(*dimensions_)[2]; z++) 
+            {
+                os << "ho5DArray (:, :, " << z << ", " << s << ", " << p << ") = " << std::endl;
+                for (y=0; y<(*dimensions_)[1]; y++) 
+                {
+                    os << "y " << y << "\t";
+                    for (x=0; x<(*dimensions_)[0]; x++)
+                    {
+                        os << (*this)(x,y,z,s,p) << "\t";
+                    }
+                    os << std::endl;
+                }
+            }
+        }
+    }
+    os << "-------------------------------------------" << std::endl;
+}
+
+}
diff --git a/toolboxes/core/cpu/ho6DArray.h b/toolboxes/core/cpu/ho6DArray.h
new file mode 100644
index 0000000..2001237
--- /dev/null
+++ b/toolboxes/core/cpu/ho6DArray.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include "hoNDArray.h"
+
+namespace Gadgetron{
+
+template <class T> class ho6DArray : public hoNDArray<T>
+{
+public:
+
+    typedef hoNDArray<T> BaseClass;
+
+    ho6DArray();
+    ho6DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr);
+    explicit ho6DArray(std::vector<size_t> *dimensions);
+    ho6DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+    ho6DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, T* data, bool delete_data_on_destruct = false);
+    ho6DArray(boost::shared_ptr< std::vector<size_t> > dimensions);
+    ho6DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual ~ho6DArray();
+
+    ho6DArray(const ho6DArray<T>& a);
+    ho6DArray<T>& operator=(const ho6DArray<T>& rhs);
+
+    virtual void create(std::vector<size_t>& dimensions);
+    virtual void create(std::vector<size_t> *dimensions);
+    virtual void create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual bool createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr);
+    virtual bool createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, T* data, bool delete_data_on_destruct = false);
+
+    T& operator()(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r);
+    const T& operator()(size_t x , size_t y, size_t z, size_t s, size_t p, size_t r) const;
+
+    virtual void print(std::ostream& os) const;
+
+protected:
+
+    using BaseClass::dimensions_;
+    using BaseClass::offsetFactors_;
+    using BaseClass::data_;
+    using BaseClass::elements_;
+    using BaseClass::delete_data_on_destruct_;
+
+    bool init_accesser();
+    bool release_accesser();
+
+    T****** accesser_;
+};
+
+}
+
+#include <ho6DArray.hxx>
diff --git a/toolboxes/core/cpu/ho6DArray.hxx b/toolboxes/core/cpu/ho6DArray.hxx
new file mode 100644
index 0000000..2c191ec
--- /dev/null
+++ b/toolboxes/core/cpu/ho6DArray.hxx
@@ -0,0 +1,392 @@
+
+namespace Gadgetron{
+
+template <typename T> 
+ho6DArray<T>::ho6DArray()
+: BaseClass(), accesser_(NULL)
+{
+}
+
+template <typename T> 
+ho6DArray<T>::ho6DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr)
+: accesser_(NULL)
+{
+    std::vector<size_t> dim(6);
+    dim[0] = sx;
+    dim[1] = sy;
+    dim[2] = sz;
+    dim[3] = ss;
+    dim[4] = sp;
+    dim[5] = sr;
+
+    this->create(&dim);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho6DArray<T>::ho6DArray(std::vector<size_t> *dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==6);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho6DArray<T>::ho6DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==6);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho6DArray<T>::ho6DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, T* data, bool delete_data_on_destruct)
+: BaseClass(sx, sy, sz, ss, sp, sr, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions_->size()==6);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho6DArray<T>::ho6DArray(boost::shared_ptr< std::vector<size_t> > dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==6);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho6DArray<T>::ho6DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==6);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho6DArray<T>::~ho6DArray()
+{
+    GADGET_CHECK_THROW(release_accesser());
+}
+
+template <typename T> 
+ho6DArray<T>::ho6DArray(const ho6DArray<T>& a)
+: BaseClass(a), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho6DArray<T>& ho6DArray<T>::operator=(const ho6DArray<T>& rhs)
+{
+    if ( &rhs == this ) return *this;
+
+    if ( rhs.get_number_of_elements() == 0 )
+    {
+        this->clear();
+        GADGET_CHECK_THROW(init_accesser());
+        return *this;
+    }
+
+    if (this->dimensions_equal(&rhs)) 
+    {
+        memcpy(this->data_, rhs.data_, this->elements_*sizeof(T));
+    }
+    else
+    {
+        this->deallocate_memory();
+        this->data_ = 0;
+        this->dimensions_ = rhs.dimensions_;
+        this->allocate_memory();
+        memcpy( this->data_, rhs.data_, this->elements_*sizeof(T) );
+
+        GADGET_CHECK_THROW(init_accesser());
+    }
+
+    return *this;
+}
+
+template <typename T> 
+void ho6DArray<T>::create(std::vector<size_t>& dimensions)
+{
+    BaseClass::create(dimensions);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+void ho6DArray<T>::create(std::vector<size_t> *dimensions)
+{
+    BaseClass::create(dimensions);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+void ho6DArray<T>::create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+{
+    BaseClass::create(dimensions, data, delete_data_on_destruct);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+bool ho6DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr)
+{
+    try
+    {
+        std::vector<size_t> dim(6);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = ss;
+        dim[4] = sp;
+        dim[5] = sr;
+
+        if ( !this->dimensions_equal(&dim) )
+        {
+            this->create(&dim);
+            GADGET_CHECK_RETURN_FALSE(init_accesser());
+        }
+        else
+        {
+            memset(this->data_, 0, sizeof(T)*this->elements_);
+        }
+    }
+    catch(...)
+    {
+        GADGET_THROW("ho6DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr) ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool ho6DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, T* data, bool delete_data_on_destruct)
+{
+    try
+    {
+        std::vector<size_t> dim(6);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = ss;
+        dim[4] = sp;
+        dim[5] = sr;
+
+        this->create(&dim, data, delete_data_on_destruct);
+        GADGET_CHECK_RETURN_FALSE(init_accesser());
+    }
+    catch(...)
+    {
+        GADGET_THROW("ho6DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, T* data, bool delete_data_on_destruct) ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+inline T& ho6DArray<T>::operator()(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r)
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1] && z<(*dimensions_)[2] && s<(*dimensions_)[3] && p<(*dimensions_)[4] && r<(*dimensions_)[5]);
+    return accesser_[r][p][s][z][y][x];
+}
+
+template <typename T> 
+inline const T& ho6DArray<T>::operator()(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r) const
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1] && z<(*dimensions_)[2] && s<(*dimensions_)[3] && p<(*dimensions_)[4] && r<(*dimensions_)[5]);
+    return accesser_[r][p][s][z][y][x];
+}
+
+template <typename T> 
+bool ho6DArray<T>::init_accesser()
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(release_accesser());
+
+        if ( elements_ > 0 )
+        {
+            size_t sx = (*dimensions_)[0];
+            size_t sy = (*dimensions_)[1];
+            size_t sz = (*dimensions_)[2];
+            size_t ss = (*dimensions_)[3];
+            size_t sp = (*dimensions_)[4];
+            size_t sr = (*dimensions_)[5];
+
+            size_t y, z, s, p, r;
+
+            accesser_ = new T*****[sr];
+            if( accesser_ == NULL) return false;
+
+            accesser_[0] = new T****[sp*sr];
+            if( accesser_[0] == NULL)
+            {
+                delete [] accesser_;
+                return false;
+            }
+            for (r=1; r<sr; r++)
+            {
+                accesser_[r] = accesser_[r-1] + sp;
+            }
+
+            accesser_[0][0] = new T***[ss*sp*sr];
+            if (accesser_[0][0] == NULL)
+            {
+                delete [] accesser_[0];
+                delete [] accesser_;
+                return false;
+            }
+
+            for (r=0; r<sr; r++)
+            {
+                for (p=0; p<sp; p++)
+                {
+                    accesser_[r][p] = accesser_[0][0] + r*sp*ss + p*ss;
+                }
+            }
+
+            accesser_[0][0][0] = new T**[sz*ss*sp*sr];
+            if (accesser_[0][0][0] == NULL)
+            {
+                delete [] accesser_[0][0];
+                delete [] accesser_[0];
+                delete [] accesser_;
+                return false;
+            }
+
+            for (r=0; r<sr; r++)
+            {
+                for (p=0; p<sp; p++)
+                {
+                    for (s=0; s<ss; s++)
+                    {
+                        accesser_[r][p][s] = accesser_[0][0][0] 
+                                                + r*sp*ss*sz 
+                                                + p*ss*sz 
+                                                + s*sz;
+                    }
+                }
+            }
+
+            accesser_[0][0][0][0] = new T*[sy*sz*ss*sp*sr];
+            if (accesser_[0][0][0][0] == NULL)
+            {
+                delete [] accesser_[0][0][0];
+                delete [] accesser_[0][0];
+                delete [] accesser_[0];
+                delete [] accesser_;
+                return false;
+            }
+
+            for (r=0; r<sr; r++)
+            {
+                for (p=0; p<sp; p++)
+                {
+                    for (s=0; s<ss; s++)
+                    {
+                        for (z=0; z<sz; z++)
+                        {
+                            accesser_[r][p][s][z] = accesser_[0][0][0][0] 
+                                                        + r*sp*ss*sz*sy 
+                                                        + p*ss*sz*sy 
+                                                        + s*sz*sy 
+                                                        + z*sy;
+                        }
+                    }
+                }
+            }
+
+            accesser_[0][0][0][0][0] = data_;
+            for (r=0; r<sr; r++)
+            {
+                for (p=0; p<sp; p++)
+                {
+                    for (s=0; s<ss; s++)
+                    {
+                        for (z=0; z<sz; z++)
+                        {
+                            for (y=0; y<sy; y++)
+                            {
+                                accesser_[r][p][s][z][y] = accesser_[0][0][0][0][0] 
+                                                                + r*sp*ss*sz*sy*sx 
+                                                                + p*ss*sz*sy*sx 
+                                                                + s*sz*sy*sx 
+                                                                + z*sy*sx 
+                                                                + y*sx;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            accesser_ = NULL;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in ho6DArray<T>::init_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool ho6DArray<T>::release_accesser()
+{
+    try
+    {
+        if (accesser_ != NULL)
+        {
+            delete [] accesser_[0][0][0][0];
+            delete [] accesser_[0][0][0];
+            delete [] accesser_[0][0];
+            delete [] accesser_[0];
+            delete [] accesser_;
+        }
+        accesser_ = NULL;
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in ho6DArray<T>::release_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+void ho6DArray<T>::print(std::ostream& os) const
+{
+    BaseClass::print(os);
+    size_t x, y, z, s, p, r;
+    os << "-------------------------------------------" << std::endl;
+    for (r=0; r<(*dimensions_)[5]; r++) 
+    {
+        for (p=0; p<(*dimensions_)[4]; p++) 
+        {
+            for (s=0; s<(*dimensions_)[3]; s++) 
+            {
+                for (z=0; z<(*dimensions_)[2]; z++) 
+                {
+                    os << "ho6DArray (:, :, " << z << ", " << s << ", " << p << ", " << r << ") = " << std::endl;
+                    for (y=0; y<(*dimensions_)[1]; y++) 
+                    {
+                        os << "y " << y << "\t";
+                        for (x=0; x<(*dimensions_)[0]; x++)
+                        {
+                            os << (*this)(x,y,z,s,p,r) << "\t";
+                        }
+                        os << std::endl;
+                    }
+                }
+            }
+        }
+    }
+    os << "-------------------------------------------" << std::endl;
+}
+
+}
diff --git a/toolboxes/core/cpu/ho7DArray.h b/toolboxes/core/cpu/ho7DArray.h
new file mode 100644
index 0000000..728441b
--- /dev/null
+++ b/toolboxes/core/cpu/ho7DArray.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include "hoNDArray.h"
+
+namespace Gadgetron{
+
+template <class T> class ho7DArray : public hoNDArray<T>
+{
+public:
+
+    typedef hoNDArray<T> BaseClass;
+
+    ho7DArray();
+    ho7DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, size_t sa);
+    explicit ho7DArray(std::vector<size_t> *dimensions);
+    ho7DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+    ho7DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, size_t sa, T* data, bool delete_data_on_destruct = false);
+    ho7DArray(boost::shared_ptr< std::vector<size_t> > dimensions);
+    ho7DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual ~ho7DArray();
+
+    ho7DArray(const ho7DArray<T>& a);
+    ho7DArray<T>& operator=(const ho7DArray<T>& rhs);
+
+    virtual void create(std::vector<size_t>& dimensions);
+    virtual void create(std::vector<size_t> *dimensions);
+    virtual void create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual bool createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, size_t sa);
+    virtual bool createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, size_t sa, T* data, bool delete_data_on_destruct = false);
+
+    T& operator()(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a);
+    const T& operator()(size_t x , size_t y, size_t z, size_t s, size_t p, size_t r, size_t a) const;
+
+    virtual void print(std::ostream& os) const;
+
+protected:
+
+    using BaseClass::dimensions_;
+    using BaseClass::offsetFactors_;
+    using BaseClass::data_;
+    using BaseClass::elements_;
+    using BaseClass::delete_data_on_destruct_;
+
+    bool init_accesser();
+    bool release_accesser();
+
+    T******* accesser_;
+};
+
+}
+
+#include <ho7DArray.hxx>
diff --git a/toolboxes/core/cpu/ho7DArray.hxx b/toolboxes/core/cpu/ho7DArray.hxx
new file mode 100644
index 0000000..5661a8b
--- /dev/null
+++ b/toolboxes/core/cpu/ho7DArray.hxx
@@ -0,0 +1,427 @@
+
+namespace Gadgetron{
+
+template <typename T> 
+ho7DArray<T>::ho7DArray()
+: BaseClass(), accesser_(NULL)
+{
+}
+
+template <typename T> 
+ho7DArray<T>::ho7DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, size_t sa)
+: accesser_(NULL)
+{
+    std::vector<size_t> dim(7);
+    dim[0] = sx;
+    dim[1] = sy;
+    dim[2] = sz;
+    dim[3] = ss;
+    dim[4] = sp;
+    dim[5] = sr;
+    dim[6] = sa;
+
+    this->create(&dim);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho7DArray<T>::ho7DArray(std::vector<size_t> *dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==7);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho7DArray<T>::ho7DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==7);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho7DArray<T>::ho7DArray(boost::shared_ptr< std::vector<size_t> > dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==7);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho7DArray<T>::ho7DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==7);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho7DArray<T>::ho7DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, size_t sa, T* data, bool delete_data_on_destruct)
+: BaseClass(sx, sy, sz, ss, sp, sr, sa, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions_->size()==7);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho7DArray<T>::~ho7DArray()
+{
+    GADGET_CHECK_THROW(release_accesser());
+}
+
+template <typename T> 
+ho7DArray<T>::ho7DArray(const ho7DArray<T>& a)
+: BaseClass(a), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho7DArray<T>& ho7DArray<T>::operator=(const ho7DArray<T>& rhs)
+{
+    if ( &rhs == this ) return *this;
+
+    if ( rhs.get_number_of_elements() == 0 )
+    {
+        this->clear();
+        GADGET_CHECK_THROW(init_accesser());
+        return *this;
+    }
+
+    if (this->dimensions_equal(&rhs)) 
+    {
+        memcpy(this->data_, rhs.data_, this->elements_*sizeof(T));
+    }
+    else
+    {
+        this->deallocate_memory();
+        this->data_ = 0;
+        this->dimensions_ = rhs.dimensions_;
+        this->allocate_memory();
+        memcpy( this->data_, rhs.data_, this->elements_*sizeof(T) );
+
+        GADGET_CHECK_THROW(init_accesser());
+    }
+
+    return *this;
+}
+
+template <typename T> 
+void ho7DArray<T>::create(std::vector<size_t>& dimensions)
+{
+    BaseClass::create(dimensions);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+void ho7DArray<T>::create(std::vector<size_t> *dimensions)
+{
+    BaseClass::create(dimensions);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+void ho7DArray<T>::create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+{
+    BaseClass::create(dimensions, data, delete_data_on_destruct);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+bool ho7DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, size_t sa)
+{
+    try
+    {
+        std::vector<size_t> dim(7);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = ss;
+        dim[4] = sp;
+        dim[5] = sr;
+        dim[6] = sa;
+
+        if ( !this->dimensions_equal(&dim) )
+        {
+            this->create(&dim);
+            GADGET_CHECK_RETURN_FALSE(init_accesser());
+        }
+        else
+        {
+            memset(this->data_, 0, sizeof(T)*this->elements_);
+        }
+    }
+    catch(...)
+    {
+        GADGET_THROW("ho7DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, size_t sa) ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool ho7DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, size_t sa, T* data, bool delete_data_on_destruct)
+{
+    try
+    {
+        std::vector<size_t> dim(7);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = ss;
+        dim[4] = sp;
+        dim[5] = sr;
+        dim[6] = sa;
+
+        this->create(&dim, data, delete_data_on_destruct);
+        GADGET_CHECK_RETURN_FALSE(init_accesser());
+    }
+    catch(...)
+    {
+        GADGET_THROW("ho7DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, size_t sa, T* data, bool delete_data_on_destruct) ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+inline T& ho7DArray<T>::operator()(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a)
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1] && z<(*dimensions_)[2] && s<(*dimensions_)[3] && p<(*dimensions_)[4] && r<(*dimensions_)[5] && a<(*dimensions_)[6]);
+    return accesser_[a][r][p][s][z][y][x];
+}
+
+template <typename T> 
+inline const T& ho7DArray<T>::operator()(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a) const
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1] && z<(*dimensions_)[2] && s<(*dimensions_)[3] && p<(*dimensions_)[4] && r<(*dimensions_)[5] && a<(*dimensions_)[6]);
+    return accesser_[a][r][p][s][z][y][x];
+}
+
+template <typename T> 
+bool ho7DArray<T>::init_accesser()
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(release_accesser());
+
+        if ( elements_ > 0 )
+        {
+            size_t sx = (*dimensions_)[0];
+            size_t sy = (*dimensions_)[1];
+            size_t sz = (*dimensions_)[2];
+            size_t ss = (*dimensions_)[3];
+            size_t sp = (*dimensions_)[4];
+            size_t sr = (*dimensions_)[5];
+            size_t sa = (*dimensions_)[6];
+
+            size_t y, z, s, p, r, a;
+
+            accesser_ = new T******[sa];
+            if( accesser_ == NULL) return false;
+
+            accesser_[0] = new T*****[sr*sa];
+            if( accesser_[0] == NULL)
+            {
+                delete [] accesser_;
+                return false;
+            }
+            for (a=1; a<sa; a++)
+            {
+                accesser_[a] = accesser_[a-1] + sr;
+            }
+
+            accesser_[0][0] = new T****[sp*sr*sa];
+            if (accesser_[0][0] == NULL)
+            {
+                delete [] accesser_[0];
+                delete [] accesser_;
+                return false;
+            }
+
+            for (a=0; a<sa; a++)
+            {
+                for (r=0; r<sr; r++)
+                {
+                    accesser_[a][r] = accesser_[0][0] + a*sr*sp + r*sp;
+                }
+            }
+
+            accesser_[0][0][0] = new T***[ss*sp*sr*sa];
+            if (accesser_[0][0][0] == NULL)
+            {
+                delete [] accesser_[0][0];
+                delete [] accesser_[0];
+                delete [] accesser_;
+                return false;
+            }
+
+            for (a=0; a<sa; a++)
+            {
+                for (r=0; r<sr; r++)
+                {
+                    for (p=0; p<sp; p++)
+                    {
+                        accesser_[a][r][p] = accesser_[0][0][0] 
+                                                + a*sr*sp*ss 
+                                                + r*sp*ss 
+                                                + p*ss;
+                    }
+                }
+            }
+
+            accesser_[0][0][0][0] = new T**[sz*ss*sp*sr*sa];
+            if (accesser_[0][0][0][0] == NULL)
+            {
+                delete [] accesser_[0][0][0];
+                delete [] accesser_[0][0];
+                delete [] accesser_[0];
+                delete [] accesser_;
+                return false;
+            }
+
+            for (a=0; a<sa; a++)
+            {
+                for (r=0; r<sr; r++)
+                {
+                    for (p=0; p<sp; p++)
+                    {
+                        for (s=0; s<ss; s++)
+                        {
+                            accesser_[a][r][p][s] = accesser_[0][0][0][0] 
+                                                        + a*sr*sp*ss*sz 
+                                                        + r*sp*ss*sz 
+                                                        + p*ss*sz 
+                                                        + s*sz;
+                        }
+                    }
+                }
+            }
+
+            accesser_[0][0][0][0][0] = new T*[sy*sz*ss*sp*sr*sa];
+            for (a=0; a<sa; a++)
+            {
+                for (r=0; r<sr; r++)
+                {
+                    for (p=0; p<sp; p++)
+                    {
+                        for (s=0; s<ss; s++)
+                        {
+                            for (z=0; z<sz; z++)
+                            {
+                                accesser_[a][r][p][s][z] = accesser_[0][0][0][0][0] 
+                                                                + a*sr*sp*ss*sz*sy 
+                                                                + r*sp*ss*sz*sy 
+                                                                + p*ss*sz*sy 
+                                                                + s*sz*sy 
+                                                                + z*sy;
+                            }
+                        }
+                    }
+                }
+            }
+
+            accesser_[0][0][0][0][0][0] = data_;
+            for (a=0; a<sa; a++)
+            {
+                for (r=0; r<sr; r++)
+                {
+                    for (p=0; p<sp; p++)
+                    {
+                        for (s=0; s<ss; s++)
+                        {
+                            for (z=0; z<sz; z++)
+                            {
+                                for (y=0; y<sy; y++)
+                                {
+                                    accesser_[a][r][p][s][z][y] = accesser_[0][0][0][0][0][0] 
+                                                                    + a*sr*sp*ss*sz*sy*sx 
+                                                                    + r*sp*ss*sz*sy*sx 
+                                                                    + p*ss*sz*sy*sx 
+                                                                    + s*sz*sy*sx 
+                                                                    + z*sy*sx
+                                                                    + y*sx;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            accesser_ = NULL;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in ho7DArray<T>::init_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool ho7DArray<T>::release_accesser()
+{
+    try
+    {
+        if (accesser_ != NULL)
+        {
+            delete [] accesser_[0][0][0][0][0];
+            delete [] accesser_[0][0][0][0];
+            delete [] accesser_[0][0][0];
+            delete [] accesser_[0][0];
+            delete [] accesser_[0];
+            delete [] accesser_;
+        }
+        accesser_ = NULL;
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in ho7DArray<T>::release_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+void ho7DArray<T>::print(std::ostream& os) const
+{
+    BaseClass::print(os);
+    size_t x, y, z, s, p, r, a;
+    os << "-------------------------------------------" << std::endl;
+    for (a=0; a<(*dimensions_)[6]; a++) 
+    {
+        for (r=0; r<(*dimensions_)[5]; r++) 
+        {
+            for (p=0; p<(*dimensions_)[4]; p++) 
+            {
+                for (s=0; s<(*dimensions_)[3]; s++) 
+                {
+                    for (z=0; z<(*dimensions_)[2]; z++) 
+                    {
+                        os << "ho7DArray (:, :, " << z << ", " << s << ", " << p << ", " << r << ", " << a << ") = " << std::endl;
+                        for (y=0; y<(*dimensions_)[1]; y++) 
+                        {
+                            os << "y " << y << "\t";
+                            for (x=0; x<(*dimensions_)[0]; x++)
+                            {
+                                os << (*this)(x,y,z,s,p,r,a) << "\t";
+                            }
+                            os << std::endl;
+                        }
+                    }
+                }
+            }
+        }
+    }
+    os << "-------------------------------------------" << std::endl;
+}
+
+}
diff --git a/toolboxes/core/cpu/hoMatrix.cpp b/toolboxes/core/cpu/hoMatrix.cpp
new file mode 100644
index 0000000..cb2f3ff
--- /dev/null
+++ b/toolboxes/core/cpu/hoMatrix.cpp
@@ -0,0 +1,416 @@
+
+#include "hoMatrix.h"
+
+namespace Gadgetron
+{
+
+// C = op(A)*op(B): a naive triple-loop reference implementation. op() is the transpose
+// when transA/transB is set; all matrices are assumed to be stored column-major.
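+//
+// Minimal usage sketch (illustrative only; assumes hoNDArray<float>::create(rows, cols)
+// as used below in this file):
+//
+//   hoNDArray<float> A, B, C;
+//   A.create(4, 3);
+//   B.create(3, 5);
+//   // ... fill A and B ...
+//   GeneralMatrixProduct(C, A, false, B, false);   // C is (re)created as 4 x 5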
+bool GeneralMatrixProduct(hoNDArray<float>& C, const hoNDArray<float>& A, bool transA, const hoNDArray<float>& B, bool transB)
+{
+    try
+    {
+        typedef float T;
+
+        size_t M = A.get_size(0);
+        size_t K = A.get_size(1);
+        if ( transA )
+        { 
+            M = A.get_size(1);
+            K = A.get_size(0);
+        }
+
+        size_t K2 = B.get_size(0);
+        size_t N = B.get_size(1);
+        if ( transB )
+        {
+            K2 = B.get_size(1);
+            N = B.get_size(0);
+        }
+
+        GADGET_CHECK_RETURN_FALSE(K==K2);
+        if ( (C.get_size(0)!=M) || (C.get_size(1)!=N) )
+        {
+            C.create(M, N);
+        }
+
+        const T* pA = A.begin();
+        const T* pB = B.begin();
+        T* pC = C.begin();
+
+        size_t m, n, k;
+
+        if ( !transA && !transB )
+        {
+            for ( m=0; m<M; m++ )
+            {
+                for ( n=0; n<N; n++ )
+                {
+                    pC[m+n*M] = 0;
+                    for ( k=0; k<K; k++ )
+                    {
+                        pC[m+n*M] += pA[m+k*M]*pB[k+n*K];
+                    }
+                }
+            }
+        }
+
+        if ( transA && !transB )
+        {
+            for ( m=0; m<M; m++ )
+            {
+                for ( n=0; n<N; n++ )
+                {
+                    pC[m+n*M] = 0;
+                    for ( k=0; k<K; k++ )
+                    {
+                        pC[m+n*M] += pA[k+m*K]*pB[k+n*K];
+                    }
+                }
+            }
+        }
+
+        if ( !transA && transB )
+        {
+            for ( m=0; m<M; m++ )
+            {
+                for ( n=0; n<N; n++ )
+                {
+                    pC[m+n*M] = 0;
+                    for ( k=0; k<K; k++ )
+                    {
+                        pC[m+n*M] += pA[m+k*M]*pB[n+k*K];
+                    }
+                }
+            }
+        }
+
+        if ( transA && transB )
+        {
+            for ( m=0; m<M; m++ )
+            {
+                for ( n=0; n<N; n++ )
+                {
+                    pC[m+n*M] = 0;
+                    for ( k=0; k<K; k++ )
+                    {
+                        pC[m+n*M] += pA[k+m*K]*pB[n+k*K];
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GeneralMatrixProduct(hoNDArray<float>& C, const hoNDArray<float>& A, bool transA, const hoNDArray<float>& B, bool transB) ...");
+        return false;
+    }
+    return true;
+}
+
+bool GeneralMatrixProduct(hoNDArray<double>& C, const hoNDArray<double>& A, bool transA, const hoNDArray<double>& B, bool transB)
+{
+    try
+    {
+        typedef double T;
+
+        size_t M = A.get_size(0);
+        size_t K = A.get_size(1);
+        if ( transA )
+        { 
+            M = A.get_size(1);
+            K = A.get_size(0);
+        }
+
+        size_t K2 = B.get_size(0);
+        size_t N = B.get_size(1);
+        if ( transB )
+        {
+            K2 = B.get_size(1);
+            N = B.get_size(0);
+        }
+
+        GADGET_CHECK_RETURN_FALSE(K==K2);
+        if ( (C.get_size(0)!=M) || (C.get_size(1)!=N) )
+        {
+            C.create(M, N);
+        }
+
+        const T* pA = A.begin();
+        const T* pB = B.begin();
+        T* pC = C.begin();
+
+        size_t m, n, k;
+
+        if ( !transA && !transB )
+        {
+            for ( m=0; m<M; m++ )
+            {
+                for ( n=0; n<N; n++ )
+                {
+                    pC[m+n*M] = 0;
+                    for ( k=0; k<K; k++ )
+                    {
+                        pC[m+n*M] += pA[m+k*M]*pB[k+n*K];
+                    }
+                }
+            }
+        }
+
+        if ( transA && !transB )
+        {
+            for ( m=0; m<M; m++ )
+            {
+                for ( n=0; n<N; n++ )
+                {
+                    pC[m+n*M] = 0;
+                    for ( k=0; k<K; k++ )
+                    {
+                        pC[m+n*M] += pA[k+m*K]*pB[k+n*K];
+                    }
+                }
+            }
+        }
+
+        if ( !transA && transB )
+        {
+            for ( m=0; m<M; m++ )
+            {
+                for ( n=0; n<N; n++ )
+                {
+                    pC[m+n*M] = 0;
+                    for ( k=0; k<K; k++ )
+                    {
+                        pC[m+n*M] += pA[m+k*M]*pB[n+k*K];
+                    }
+                }
+            }
+        }
+
+        if ( transA && transB )
+        {
+            for ( m=0; m<M; m++ )
+            {
+                for ( n=0; n<N; n++ )
+                {
+                    pC[m+n*M] = 0;
+                    for ( k=0; k<K; k++ )
+                    {
+                        pC[m+n*M] += pA[k+m*K]*pB[n+k*K];
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GeneralMatrixProduct(hoNDArray<double>& C, const hoNDArray<double>& A, bool transA, const hoNDArray<double>& B, bool transB) ...");
+        return false;
+    }
+    return true;
+}
+
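+// Note: for the complex-valued overloads below, transA/transB request the conjugate
+// (Hermitian) transpose, i.e. conj(A)^T and conj(B)^T, rather than the plain transpose.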
+bool GeneralMatrixProduct(hoNDArray< std::complex<float> >& C, const hoNDArray< std::complex<float> >& A, bool transA, const hoNDArray< std::complex<float> >& B, bool transB)
+{
+    try
+    {
+        typedef  std::complex<float>  T;
+
+        size_t M = A.get_size(0);
+        size_t K = A.get_size(1);
+        if ( transA )
+        { 
+            M = A.get_size(1);
+            K = A.get_size(0);
+        }
+
+        size_t K2 = B.get_size(0);
+        size_t N = B.get_size(1);
+        if ( transB )
+        {
+            K2 = B.get_size(1);
+            N = B.get_size(0);
+        }
+
+        GADGET_CHECK_RETURN_FALSE(K==K2);
+        if ( (C.get_size(0)!=M) || (C.get_size(1)!=N) )
+        {
+            C.create(M, N);
+        }
+
+        const T* pA = A.begin();
+        const T* pB = B.begin();
+        T* pC = C.begin();
+
+        size_t m, n, k;
+
+        if ( !transA && !transB )
+        {
+            for ( m=0; m<M; m++ )
+            {
+                for ( n=0; n<N; n++ )
+                {
+                    pC[m+n*M] = 0;
+                    for ( k=0; k<K; k++ )
+                    {
+                        pC[m+n*M] += pA[m+k*M]*pB[k+n*K];
+                    }
+                }
+            }
+        }
+
+        if ( transA && !transB )
+        {
+            for ( m=0; m<M; m++ )
+            {
+                for ( n=0; n<N; n++ )
+                {
+                    pC[m+n*M] = 0;
+                    for ( k=0; k<K; k++ )
+                    {
+                        pC[m+n*M] += std::conj(pA[k+m*K])*pB[k+n*K];
+                    }
+                }
+            }
+        }
+
+        if ( !transA && transB )
+        {
+            for ( m=0; m<M; m++ )
+            {
+                for ( n=0; n<N; n++ )
+                {
+                    pC[m+n*M] = 0;
+                    for ( k=0; k<K; k++ )
+                    {
+                        pC[m+n*M] += pA[m+k*M]*std::conj(pB[n+k*K]);
+                    }
+                }
+            }
+        }
+
+        if ( transA && transB )
+        {
+            for ( m=0; m<M; m++ )
+            {
+                for ( n=0; n<N; n++ )
+                {
+                    pC[m+n*M] = 0;
+                    for ( k=0; k<K; k++ )
+                    {
+                        pC[m+n*M] += std::conj(pA[k+m*K])*std::conj(pB[n+k*K]);
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GeneralMatrixProduct(hoNDArray< std::complex<float> >& C, const hoNDArray< std::complex<float> >& A, bool transA, const hoNDArray< std::complex<float> >& B, bool transB) ...");
+        return false;
+    }
+    return true;
+}
+
+bool GeneralMatrixProduct(hoNDArray< std::complex<double> >& C, const hoNDArray< std::complex<double> >& A, bool transA, const hoNDArray< std::complex<double> >& B, bool transB)
+{
+    try
+    {
+        typedef  std::complex<double>  T;
+
+        size_t M = A.get_size(0);
+        size_t K = A.get_size(1);
+        if ( transA )
+        { 
+            M = A.get_size(1);
+            K = A.get_size(0);
+        }
+
+        size_t K2 = B.get_size(0);
+        size_t N = B.get_size(1);
+        if ( transB )
+        {
+            K2 = B.get_size(1);
+            N = B.get_size(0);
+        }
+
+        GADGET_CHECK_RETURN_FALSE(K==K2);
+        if ( (C.get_size(0)!=M) || (C.get_size(1)!=N) )
+        {
+            C.create(M, N);
+        }
+
+        const T* pA = A.begin();
+        const T* pB = B.begin();
+        T* pC = C.begin();
+
+        size_t m, n, k;
+
+        if ( !transA && !transB )
+        {
+            for ( m=0; m<M; m++ )
+            {
+                for ( n=0; n<N; n++ )
+                {
+                    pC[m+n*M] = 0;
+                    for ( k=0; k<K; k++ )
+                    {
+                        pC[m+n*M] += pA[m+k*M]*pB[k+n*K];
+                    }
+                }
+            }
+        }
+
+        if ( transA && !transB )
+        {
+            for ( m=0; m<M; m++ )
+            {
+                for ( n=0; n<N; n++ )
+                {
+                    pC[m+n*M] = 0;
+                    for ( k=0; k<K; k++ )
+                    {
+                        pC[m+n*M] += std::conj(pA[k+m*K])*pB[k+n*K];
+                    }
+                }
+            }
+        }
+
+        if ( !transA && transB )
+        {
+            for ( m=0; m<M; m++ )
+            {
+                for ( n=0; n<N; n++ )
+                {
+                    pC[m+n*M] = 0;
+                    for ( k=0; k<K; k++ )
+                    {
+                        pC[m+n*M] += pA[m+k*M]*std::conj(pB[n+k*K]);
+                    }
+                }
+            }
+        }
+
+        if ( transA && transB )
+        {
+            for ( m=0; m<M; m++ )
+            {
+                for ( n=0; n<N; n++ )
+                {
+                    pC[m+n*M] = 0;
+                    for ( k=0; k<K; k++ )
+                    {
+                        pC[m+n*M] += std::conj(pA[k+m*K])*std::conj(pB[n+k*K]);
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GeneralMatrixProduct(hoNDArray< std::complex<double> >& C, const hoNDArray< std::complex<double> >& A, bool transA, const hoNDArray< std::complex<double> >& B, bool transB) ...");
+        return false;
+    }
+    return true;
+}
+
+}
diff --git a/toolboxes/core/cpu/hoMatrix.h b/toolboxes/core/cpu/hoMatrix.h
new file mode 100644
index 0000000..45801e6
--- /dev/null
+++ b/toolboxes/core/cpu/hoMatrix.h
@@ -0,0 +1,109 @@
+#pragma once
+
+#include "cpucore_export.h"
+#include "ho2DArray.h"
+#include "complext.h"
+#include <algorithm>
+#include <iomanip>
+#include <cmath>
+#include <cfloat>
+
+namespace Gadgetron{
+
+// hoMatrix uses column-wise (column-major) storage: the first dimension indexes the rows
+// of a column, so the elements of each column are contiguous in memory
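+//
+// Illustrative sketch (not part of the original interface documentation):
+//
+//   hoMatrix<float> m(3, 4);   // 3 rows, 4 columns, zero-initialized
+//   m(1, 2) = 5.0f;            // row 1, column 2
+//   // element (r, c) is stored at data_[r + c*rows()], so each column is contiguous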
+template <class T> class  hoMatrix : public ho2DArray<T>
+{
+public:
+
+    typedef hoMatrix<T> Self;
+    typedef ho2DArray<T> BaseClass;
+
+    hoMatrix();
+    hoMatrix(size_t rows, size_t cols);
+    hoMatrix(size_t rows, size_t cols, T* data, bool delete_data_on_destruct = false);
+
+    virtual ~hoMatrix();
+
+    hoMatrix(const hoMatrix<T>& a);
+    hoMatrix<T>& operator=(const hoMatrix& rhs);
+
+    virtual bool createMatrix(size_t rows, size_t cols);
+    virtual bool createMatrix(size_t rows, size_t cols, T* data, bool delete_data_on_destruct = false);
+
+    T& operator()(size_t r , size_t c);
+    const T& operator()(size_t r , size_t c) const;
+
+    size_t rows() const;
+    size_t cols() const;
+
+    // set every element of the upper/lower triangular part (excluding the diagonal) to a fixed value
+    bool upperTri(const T& v);
+    bool lowerTri(const T& v);
+
+    // copy upper triangle to the lower
+    bool copyUpperTriToLower();
+    bool copyLowerTriToUpper();
+
+    // sum along row or col
+    bool sumOverRow(hoNDArray<T>& res) const;
+    bool sumOverCol(hoNDArray<T>& res) const;
+
+    // get the sub matrix
+    bool subMatrix(Self& res, size_t startR, size_t endR, size_t startC, size_t endC) const;
+
+    // set the matrix to be identity
+    bool setIdentity();
+
+    // normalize the matrix so that its L2 (Frobenius) norm is 1
+    bool normalize();
+
+    bool operator == (const Self& m) const;
+    bool operator != (const Self& m) const;
+
+    virtual void print(std::ostream& os) const;
+
+protected:
+
+    using BaseClass::dimensions_;
+    using BaseClass::offsetFactors_;
+    using BaseClass::data_;
+    using BaseClass::elements_;
+    using BaseClass::delete_data_on_destruct_;
+    using BaseClass::accesser_;
+    
+};
+
+/// for real matrix
+template <class T> class hoMatrixReal : public hoMatrix<T>
+{
+public:
+
+    typedef hoMatrixReal<T> Self;
+    typedef hoMatrix<T> BaseClass;
+
+    hoMatrixReal();
+    hoMatrixReal(size_t rows, size_t cols);
+    hoMatrixReal(size_t rows, size_t cols, T* data, bool delete_data_on_destruct = false);
+
+    virtual ~hoMatrixReal();
+
+    hoMatrixReal(const hoMatrixReal<T>& a);
+
+    /// sort along the row direction (sort along the 1st dimension)
+    bool sort_ascending_along_row();
+
+    /// sort along the column direction (sort along the 2nd dimension)
+    bool sort_ascending_along_column();
+
+protected:
+
+    using BaseClass::dimensions_;
+    using BaseClass::offsetFactors_;
+    using BaseClass::data_;
+    using BaseClass::elements_;
+    using BaseClass::delete_data_on_destruct_;
+    using BaseClass::accesser_;
+};
+
+}
+
+#include <hoMatrix.hxx>
diff --git a/toolboxes/core/cpu/hoMatrix.hxx b/toolboxes/core/cpu/hoMatrix.hxx
new file mode 100644
index 0000000..d3c2e72
--- /dev/null
+++ b/toolboxes/core/cpu/hoMatrix.hxx
@@ -0,0 +1,732 @@
+
+namespace Gadgetron
+{
+
+template <typename T> 
+hoMatrix<T>::hoMatrix() : BaseClass(1, 1)
+{
+}
+
+template <typename T> 
+hoMatrix<T>::hoMatrix(size_t rows, size_t cols) : BaseClass(rows, cols)
+{
+    this->fill(T(0));
+}
+
+template <typename T> 
+hoMatrix<T>::hoMatrix(size_t rows, size_t cols, T* data, bool delete_data_on_destruct)
+{
+    std::vector<size_t> dim(2);
+    dim[0] = rows;
+    dim[1] = cols;
+    this->create(&dim,data,delete_data_on_destruct);
+    GADGET_CHECK_THROW(this->init_accesser());
+}
+
+template <typename T> 
+hoMatrix<T>::~hoMatrix()
+{
+
+}
+
+template <typename T> 
+hoMatrix<T>::hoMatrix(const hoMatrix<T>& a) : BaseClass(a)
+{
+}
+
+template <typename T> 
+hoMatrix<T>& hoMatrix<T>::operator=(const hoMatrix& rhs)
+{
+    if ( this == &rhs ) return *this;
+    BaseClass::operator=(rhs);
+    return *this;
+}
+
+template <typename T> 
+bool hoMatrix<T>::createMatrix(size_t rows, size_t cols)
+{
+    return this->createArray(rows, cols);
+}
+
+template <typename T> 
+bool hoMatrix<T>::createMatrix(size_t rows, size_t cols, T* data, bool delete_data_on_destruct)
+{
+    return this->createArray(rows, cols, data, delete_data_on_destruct);
+}
+
+template <typename T> 
+inline T& hoMatrix<T>::operator()(size_t r, size_t c)
+{
+    GADGET_DEBUG_CHECK_THROW(c>=0 && r>=0 && r<(*dimensions_)[0] && c<(*dimensions_)[1]);
+    return accesser_[c][r];
+}
+
+template <typename T> 
+inline const T& hoMatrix<T>::operator()(size_t r, size_t c) const
+{
+    GADGET_DEBUG_CHECK_THROW(c>=0 && r>=0 && r<(*dimensions_)[0] && c<(*dimensions_)[1]);
+    return accesser_[c][r];
+}
+
+template <typename T> 
+inline size_t hoMatrix<T>::rows() const
+{
+    if ( dimensions_->empty() ) return 0;
+    return (*dimensions_)[0];
+}
+
+template <typename T> 
+inline size_t hoMatrix<T>::cols() const
+{
+    if ( dimensions_->empty() ) return 0;
+    return (*dimensions_)[1];
+}
+
+template <typename T> 
+bool hoMatrix<T>::upperTri(const T& v)
+{
+    try
+    {
+        size_t r, c;
+        for (r=0; r<(*dimensions_)[0]; r++)
+        {
+            for (c=r+1; c<(*dimensions_)[1]; c++)
+            {
+                (*this)(r, c) = v;
+            }
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in hoMatrix<T>::upperTri(const T& v) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool hoMatrix<T>::lowerTri(const T& v)
+{
+    try
+    {
+        size_t r, c;
+        for (c=0; c<(*dimensions_)[1]; c++)
+        {
+            for (r=c+1; r<(*dimensions_)[0]; r++)
+            {
+                (*this)(r, c) = v;
+            }
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in hoMatrix<T>::lowerTri(const T& v) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool hoMatrix<T>::copyUpperTriToLower()
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE((*dimensions_)[0]==(*dimensions_)[1]);
+
+        size_t r, c;
+        for (r=0; r<(*dimensions_)[0]; r++)
+        {
+            for (c=r+1; c<(*dimensions_)[1]; c++)
+            {
+                (*this)(c, r)= (*this)(r, c);
+            }
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in hoMatrix<T>::copyUpperTriToLower() ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool hoMatrix<T>::copyLowerTriToUpper()
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE((*dimensions_)[0]==(*dimensions_)[1]);
+
+        size_t r, c;
+        for (c=0; c<(*dimensions_)[1]; c++)
+        {
+            for (r=c+1; r<(*dimensions_)[0]; r++)
+            {
+                (*this)(c, r)= (*this)(r, c);
+            }
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in hoMatrix<T>::copyUpperTriToLower() ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool hoMatrix<T>::sumOverRow(hoNDArray<T>& res) const
+{
+    try
+    {
+        size_t ROW = rows();
+        size_t COL = cols();
+
+        if ( res.get_number_of_elements() != ROW )
+        {
+            res.create(ROW);
+        }
+
+        T* pRes = res.begin();
+
+        size_t r, c;
+
+        for ( r=0; r<ROW; r++ )
+        {
+            pRes[r] = 0;
+        }
+
+        for ( c=0; c<COL; c++ )
+        {
+            for ( r=0; r<ROW; r++ )
+            {
+                // res(r) += (*this)(r, c);
+                pRes[r] += this->data_[r+c*ROW];
+            }
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in hoMatrix<T>::sumOverRow(hoNDArray<T>& r) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool hoMatrix<T>::sumOverCol(hoNDArray<T>& res) const
+{
+    try
+    {
+        size_t ROW = rows();
+        size_t COL = cols();
+
+        if ( res.get_number_of_elements() != COL )
+        {
+            res.create(COL);
+        }
+
+        T* pRes = res.begin();
+
+        size_t r;
+        long long c;
+
+        for ( c=0; c<(long long)COL; c++ )
+        {
+            pRes[c] = 0;
+        }
+
+        //for ( r=0; r<ROW; r++ )
+        //{
+        //    for ( c=0; c<COL; c++ )
+        //    {
+        //        // res(c) += (*this)(r, c);
+        //        pRes[c] += this->data_[r+c*ROW];
+        //    }
+        //}
+
+        T* pCurr = NULL;
+        T v(0);
+        // #pragma omp parallel for default(none) private(c, r) shared(COL, ROW, pRes) if ( COL > 16 )
+        for ( c=0; c<(long long)COL; c++ )
+        {
+            v = 0;
+            pCurr = this->data_ + c*ROW;
+            for ( r=0; r<ROW; r++ )
+            {
+                v += pCurr[r];
+            }
+            pRes[c] = v;
+        }
+
+        //size_t r, c;
+        //for ( c=0; c<COL; c++ )
+        //{
+        //    T v = (*this)(0, c);
+        //    for ( r=1; r<ROW; r++ )
+        //    {
+        //        v += (*this)(r, c);
+        //        //v += this->data_[r+c*ROW];
+        //    }
+        //    res(c) = v;
+        //}
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in hoMatrix<T>::sumOverCol(hoNDArray<T>& r) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool hoMatrix<T>::subMatrix(Self& res, size_t startR, size_t endR, size_t startC, size_t endC) const
+{
+    try
+    {
+        size_t ROW = rows();
+        size_t COL = cols();
+
+        GADGET_CHECK_RETURN_FALSE(startR<ROW);
+        GADGET_CHECK_RETURN_FALSE(startC<COL);
+        GADGET_CHECK_RETURN_FALSE(endR<ROW);
+        GADGET_CHECK_RETURN_FALSE(endC<COL);
+        GADGET_CHECK_RETURN_FALSE(endR>=startR);
+        GADGET_CHECK_RETURN_FALSE(endC>=startC);
+
+        GADGET_CHECK_RETURN_FALSE(res.createMatrix(endR-startR+1, endC-startC+1));
+
+        size_t r, c;
+        for ( r=startR; r<=endR; r++ )
+        {
+            for ( c=startC; c<=endC; c++ )
+            {
+                res(r-startR, c-startC) = (*this)(r, c);
+            }
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in hoMatrix<T>::subMatrix(Self& res, size_t startR, size_t endR, size_t startC, size_t endC) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool hoMatrix<T>::setIdentity()
+{
+    try
+    {
+        size_t ROW = this->rows();
+        size_t COL = this->cols();
+
+        size_t N = std::min(ROW, COL);
+
+        this->fill(T(0));
+
+        size_t r;
+        for ( r=0; r<N; r++ )
+        {
+            (*this)(r, r) = T(1.0);
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in hoMatrix<T>::setIdentity() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool hoMatrix<T>::normalize()
+{
+    try
+    {
+        T dist = std::abs(this->data_[0]);
+        dist *= dist;
+
+        size_t ii;
+        for ( ii=1; ii<this->elements_; ii++ )
+        {
+            T v = std::abs(this->data_[ii]);
+            dist += v*v;
+        }
+
+        dist = std::sqrt(dist);
+
+        if ( std::abs(dist) < DBL_EPSILON ) return false;
+
+        for ( ii=0; ii<this->elements_; ii++ )
+        {
+            this->data_[ii] /= dist;
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in hoMatrix<T>::normalize() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool hoMatrix<T>::operator == (const Self& m) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->dimensions_equal(&m));
+    for ( size_t i=0; i<elements_; i++ )
+    { 
+        if (std::abs(data_[i]-m.data_[i])>DBL_EPSILON)
+        {
+            return false;
+        }
+    }
+    return true;
+}
+
+template <typename T> 
+bool hoMatrix<T>::operator != (const Self& m) const
+{
+    return !(*this==m);
+}
+
+template <typename T> 
+void hoMatrix<T>::print(std::ostream& os) const
+{
+    using namespace std;
+    os.unsetf(std::ios::scientific);
+
+    os << "hoMatrix (row X col): " << this->rows() << " X " << this->cols() << " : " << std::string(typeid(T).name()) << endl;
+    size_t r, c;
+    for (r=0; r<(*dimensions_)[0]; r++) 
+    {
+        os << "r " << r << ":\t";
+        for (c=0; c<(*dimensions_)[1]; c++)
+        {
+            os << setprecision(10) << (*this)(r,c) << "\t";
+        }
+        os << endl; 
+    }
+}
+
+// --------------------------------------------------------------------------------------------------------
+
+template <typename T> 
+hoMatrixReal<T>::hoMatrixReal() : BaseClass()
+{
+}
+
+template <typename T> 
+hoMatrixReal<T>::hoMatrixReal(size_t rows, size_t cols) : BaseClass(rows, cols)
+{
+}
+
+template <typename T> 
+hoMatrixReal<T>::hoMatrixReal(size_t rows, size_t cols, T* data, bool delete_data_on_destruct) : BaseClass(rows, cols, data, delete_data_on_destruct)
+{
+}
+
+template <typename T> 
+hoMatrixReal<T>::~hoMatrixReal()
+{
+}
+
+template <typename T> 
+hoMatrixReal<T>::hoMatrixReal(const hoMatrixReal<T>& a) : BaseClass(a)
+{
+}
+
+template <typename T> 
+bool hoMatrixReal<T>::sort_ascending_along_row()
+{
+    try
+    {
+        size_t R = this->rows();
+        size_t C = this->cols();
+
+        size_t col;
+        for(col=0; col<C; col++) 
+        {
+            std::sort(data_+col*R, data_+(col+1)*R);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in hoMatrixReal<T>::sort_ascending_along_row() ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool hoMatrixReal<T>::sort_ascending_along_column()
+{
+    try
+    {
+        size_t R = this->rows();
+        size_t C = this->cols();
+
+        std::vector<T> buf(C);
+
+        size_t col, row;
+        for(row=0; row<R; row++) 
+        {
+            for(col=0; col<C; col++)
+            {
+                buf[col] = data_[row + col*R];
+            }
+
+            std::sort(buf.begin(), buf.end());
+
+            for(col=0; col<C; col++)
+            {
+                data_[row + col*R] = buf[col];
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in hoMatrixReal<T>::sort_ascending_along_column() ... ");
+        return false;
+    }
+    return true;
+}
+
+// --------------------------------------------------------------------------------------------------------
+
+template <typename T> 
+bool copyL2U(hoMatrix<T>& A)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(A.rows()==A.cols());
+
+        size_t R = A.rows();
+        size_t C = A.cols();
+
+        size_t row, col;
+        for(row=0; row<R; row++) 
+        {
+            for(col=0; col<row; col++ )
+            {
+                A(col, row) = A(row, col);
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in copyL2U(hoMatrix<T>& A) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool copyL2U(hoMatrix<T>& A, bool conj)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(A.rows()==A.cols());
+
+        size_t R = A.rows();
+        size_t row, col;
+
+        if ( conj )
+        {
+            for(row=0; row<R; row++) 
+            {
+                for(col=0; col<row; col++ )
+                {
+                    A(col, row) = std::conj(A(row, col));
+                }
+            }
+        }
+        else
+        {
+            for(row=0; row<R; row++) 
+            {
+                for(col=0; col<row; col++ )
+                {
+                    A(col, row) = A(row, col);
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in copyL2U(hoMatrix<T>& A, bool conj) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool copyU2L(hoMatrix<T>& A)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(A.rows()==A.cols());
+
+        size_t R = A.rows();
+        size_t C = A.cols();
+
+        size_t row, col;
+        for(row=0; row<R; row++) 
+        {
+            for(col=row+1; col<C; col++ )
+            {
+                A(col, row) = A(row, col);
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in copyU2L(hoMatrix<T>& A) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool copyU2L(hoMatrix<T>& A, bool conj)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(A.rows()==A.cols());
+
+        size_t R = A.rows();
+        size_t C = A.cols();
+
+        size_t row, col;
+
+        if ( conj )
+        {
+            for(row=0; row<R; row++) 
+            {
+                for(col=row+1; col<C; col++ )
+                {
+                    A(col, row) = std::conj(A(row, col));
+                }
+            }
+        }
+        else
+        {
+            for(row=0; row<R; row++) 
+            {
+                for(col=row+1; col<C; col++ )
+                {
+                    A(col, row) = A(row, col);
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in copyU2L(hoMatrix<T>& A, bool conj) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool trans(const hoMatrix<T>& A, hoMatrix<T>& AT)
+{
+    try
+    {
+        if ( A.get_number_of_elements() == 0 ) return true;
+
+        if ( AT.rows()!=A.cols() || AT.cols()!=A.rows() )
+        {
+            AT.createMatrix(A.cols(), A.rows());
+        }
+
+        long long r, c;
+        #pragma omp parallel for default(none) private(r, c) shared(A, AT)
+        for ( c=0; c<(long long)A.cols(); c++ )
+        {
+            for ( r=0; r<(long long)A.rows(); r++ )
+            {
+                AT(c,r) = A(r,c);
+            }
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in trans(const hoMatrix<T>& A, hoMatrix<T>& AT) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool conjugatetrans(const hoMatrix<T>& A, hoMatrix<T>& AH)
+{
+    try
+    {
+        if ( A.get_number_of_elements() == 0 ) return true;
+
+        if ( AH.rows()!=A.cols() || AH.cols()!=A.rows() )
+        {
+            AH.createMatrix(A.cols(), A.rows());
+        }
+
+        long long r, c;
+        #pragma omp parallel for default(none) private(r, c) shared(A, AH)
+        for ( c=0; c<(long long)A.cols(); c++ )
+        {
+            for ( r=0; r<(long long)A.rows(); r++ )
+            {
+                AH(c,r) = std::conj(A(r,c));
+            }
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in conjugatetrans(const hoMatrix<T>& A, hoMatrix<T>& AH) ... ");
+        return false;
+    }
+    return true;
+}
+
+inline bool conjugatetrans(const hoMatrix<float>& A, hoMatrix<float>& AH)
+{
+    return trans(A, AH);
+}
+
+inline bool conjugatetrans(const hoMatrix<double>& A, hoMatrix<double>& AH)
+{
+    return trans(A, AH);
+}
+
+// C = A*B
+EXPORTCPUCORE bool GeneralMatrixProduct(hoNDArray<float>& C, const hoNDArray<float>& A, bool transA, const hoNDArray<float>& B, bool transB);
+EXPORTCPUCORE bool GeneralMatrixProduct(hoNDArray<double>& C, const hoNDArray<double>& A, bool transA, const hoNDArray<double>& B, bool transB);
+EXPORTCPUCORE bool GeneralMatrixProduct(hoNDArray< std::complex<float> >& C, const hoNDArray< std::complex<float> >& A, bool transA, const hoNDArray< std::complex<float> >& B, bool transB);
+EXPORTCPUCORE bool GeneralMatrixProduct(hoNDArray< std::complex<double> >& C, const hoNDArray< std::complex<double> >& A, bool transA, const hoNDArray< std::complex<double> >& B, bool transB);
+
+template<typename T> 
+bool GeneralMatrixProduct(hoMatrix<T>& C, const hoMatrix<T>& A, bool transA, const hoMatrix<T>& B, bool transB)
+{
+    try
+    {
+        hoNDArray<T> mC(C.get_dimensions(), C.begin(), false);
+        hoNDArray<T> mA(A.get_dimensions(), const_cast<T*>(A.begin()), false);
+        hoNDArray<T> mB(B.get_dimensions(), const_cast<T*>(B.begin()), false);
+
+        Gadgetron::GeneralMatrixProduct(mC, mA, transA, mB, transB);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in GeneralMatrixProduct(hoMatrix<T>& C, const hoMatrix<T>& A, bool transA, const hoMatrix<T>& B, bool transB) ...");
+        return false;
+    }
+    return true;
+}
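+
+// Example usage of the template overload above (illustrative sketch only):
+//
+//   hoMatrix<float> A(3, 4), B(4, 2), C(3, 2);
+//   A.setIdentity();                               // ones on the leading diagonal
+//   B.fill(1.0f);
+//   GeneralMatrixProduct(C, A, false, B, false);   // C = A*B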
+
+}
diff --git a/toolboxes/core/cpu/hoNDArray.h b/toolboxes/core/cpu/hoNDArray.h
new file mode 100644
index 0000000..1a7ccc7
--- /dev/null
+++ b/toolboxes/core/cpu/hoNDArray.h
@@ -0,0 +1,212 @@
+/** \file hoNDArray.h
+    \brief CPU-based N-dimensional array (data container)
+*/
+
+#pragma once
+
+#include "NDArray.h"
+#include "complext.h"
+#include "vector_td.h"
+
+#include "cpucore_export.h"
+
+#include <string.h>
+#include <float.h>
+#include <boost/shared_ptr.hpp>
+#include <stdexcept>
+
+namespace Gadgetron{
+
+  template <typename T> class hoNDArray : public NDArray<T>
+  {
+  public:
+
+    typedef NDArray<T> BaseClass;
+    typedef float coord_type;
+    typedef T value_type;
+
+    hoNDArray();
+
+    explicit hoNDArray(std::vector<size_t> &dimensions);
+    explicit hoNDArray(std::vector<size_t> *dimensions);
+    explicit hoNDArray(boost::shared_ptr< std::vector<size_t> > dimensions);
+
+    hoNDArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+    hoNDArray(std::vector<size_t> &dimensions, T* data, bool delete_data_on_destruct = false);
+    hoNDArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct = false);
+
+#if __cplusplus > 199711L
+    hoNDArray(std::initializer_list<size_t> dimensions);
+    hoNDArray(std::initializer_list<size_t> dimensions,T* data, bool delete_data_on_destruct = false);
+#endif
+
+    explicit hoNDArray(size_t len);
+    hoNDArray(size_t sx, size_t sy);
+    hoNDArray(size_t sx, size_t sy, size_t sz);
+    hoNDArray(size_t sx, size_t sy, size_t sz, size_t st);
+    hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp);
+    hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq);
+    hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr);
+    hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss);
+
+    hoNDArray(size_t len, T* data, bool delete_data_on_destruct = false);
+    hoNDArray(size_t sx, size_t sy, T* data, bool delete_data_on_destruct = false);
+    hoNDArray(size_t sx, size_t sy, size_t sz, T* data, bool delete_data_on_destruct = false);
+    hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, T* data, bool delete_data_on_destruct = false);
+    hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, T* data, bool delete_data_on_destruct = false);
+    hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, T* data, bool delete_data_on_destruct = false);
+    hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, T* data, bool delete_data_on_destruct = false);
+    hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, T* data, bool delete_data_on_destruct = false);
+
+    virtual ~hoNDArray();
+
+    // Copy constructors
+    hoNDArray(const hoNDArray<T> &a);
+    explicit hoNDArray(const hoNDArray<T> *a);
+
+#if __cplusplus > 199711L
+    //Move constructors
+    hoNDArray(hoNDArray<T>&& a);
+    hoNDArray& operator=(hoNDArray&& rhs);
+
+#endif
+
+    // Assignment operator
+    hoNDArray& operator=(const hoNDArray& rhs);
+
+    virtual void create(std::vector<size_t>& dimensions);
+    virtual void create(std::vector<size_t> *dimensions);
+    virtual void create(boost::shared_ptr< std::vector<size_t> > dimensions);
+
+#if __cplusplus > 199711L
+    virtual void create(std::initializer_list<size_t> dimensions);
+    virtual void create(std::initializer_list<size_t> dimensions,T* data, bool delete_data_on_destruct = false);
+#endif
+
+    virtual void create(std::vector<size_t> &dimensions, T* data, bool delete_data_on_destruct = false);
+    virtual void create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+    virtual void create(boost::shared_ptr<std::vector<size_t>  > dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual void create(size_t len);
+    virtual void create(size_t sx, size_t sy);
+    virtual void create(size_t sx, size_t sy, size_t sz);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, size_t su);
+
+    virtual void create(size_t len, T* data, bool delete_data_on_destruct = false);
+    virtual void create(size_t sx, size_t sy, T* data, bool delete_data_on_destruct = false);
+    virtual void create(size_t sx, size_t sy, size_t sz, T* data, bool delete_data_on_destruct = false);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, T* data, bool delete_data_on_destruct = false);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, T* data, bool delete_data_on_destruct = false);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, T* data, bool delete_data_on_destruct = false);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, T* data, bool delete_data_on_destruct = false);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, T* data, bool delete_data_on_destruct = false);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, size_t su, T* data, bool delete_data_on_destruct = false);
+
+    void fill(T value);
+
+    T* begin();
+    const T* begin() const;
+
+    T* end();
+    const T* end() const;
+
+    T& at( size_t idx );
+    const T& at( size_t idx ) const;
+
+    T& operator[]( size_t idx );
+
+    //T& operator()( size_t idx );
+    //const T& operator()( size_t idx ) const;
+
+    //T& operator()( const std::vector<size_t>& ind );
+    //const T& operator()( const std::vector<size_t>& ind ) const;
+
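+    // Element-wise copy from an array with a (possibly) different element type;
+    // each value is converted with static_cast<T>, and the array is re-created
+    // first if the dimensions differ.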
+    template<typename T2> 
+    bool copyFrom(const hoNDArray<T2>& aArray) // Should be a void function
+    {
+      if ( !this->dimensions_equal(&aArray) )
+      {
+        this->create(aArray.get_dimensions());
+      }
+
+      long long i;
+      #pragma omp parallel for default(none) private(i) shared(aArray)
+      for ( i=0; i<(long long)elements_; i++ )
+      {
+        data_[i] = static_cast<T>(aArray(i));
+      }
+      return true;
+    }
+
+    void get_sub_array(const std::vector<size_t>& start, std::vector<size_t>& size, hoNDArray<T>& out);
+
+    virtual void print(std::ostream& os) const;
+    virtual void printContent(std::ostream& os) const;
+
+    virtual bool serialize(char*& buf, size_t& len) const;
+    virtual bool deserialize(char* buf, size_t& len);
+
+  protected:
+
+    using BaseClass::dimensions_;
+    using BaseClass::offsetFactors_;
+    using BaseClass::data_;
+    using BaseClass::elements_;
+    using BaseClass::delete_data_on_destruct_;
+
+    virtual void allocate_memory();
+    virtual void deallocate_memory();
+
+    // Generic allocator / deallocator
+    //
+
+    template<class X> void _allocate_memory( size_t size, X** data )
+    {
+      *data = new (std::nothrow) X[size];
+    }
+
+    template<class X> void _deallocate_memory( X* data )
+    {
+      delete [] data;
+    }
+
+    // Overload these instances to avoid invoking the element class constructor/destructor
+    //
+
+    virtual void _allocate_memory( size_t size, float** data );
+    virtual void _deallocate_memory( float* data );
+
+    virtual void _allocate_memory( size_t size, double** data );
+    virtual void _deallocate_memory( double* data );
+
+    virtual void _allocate_memory( size_t size, std::complex<float>** data );
+    virtual void _deallocate_memory( std::complex<float>* data );
+
+    virtual void _allocate_memory( size_t size, std::complex<double>** data );
+    virtual void _deallocate_memory( std::complex<double>* data );
+
+    virtual void _allocate_memory( size_t size, float_complext** data );
+    virtual void _deallocate_memory( float_complext* data );
+
+    virtual void _allocate_memory( size_t size, double_complext** data );
+    virtual void _deallocate_memory( double_complext* data );
+
+    template<class TYPE, unsigned int D> void _allocate_memory( size_t size, vector_td<TYPE,D>** data )
+    {
+      *data = (vector_td<TYPE,D>*) malloc( size*sizeof(vector_td<TYPE,D>) );
+    }
+
+    template<class TYPE, unsigned int D>  void _deallocate_memory( vector_td<TYPE,D>* data )
+    {
+      free( data );
+    }
+  };
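+
+  // Example usage (illustrative sketch only):
+  //
+  //   hoNDArray<float> img(128, 128, 32);        // allocate a 128x128x32 volume
+  //   img.fill(0.0f);
+  //   float* p = img.begin();                    // pointer to the first element
+  //   size_t n = img.get_number_of_elements();   // 128*128*32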
+}
+
+#include "hoNDArray.hxx"
diff --git a/toolboxes/core/cpu/hoNDArray.hxx b/toolboxes/core/cpu/hoNDArray.hxx
new file mode 100644
index 0000000..2f301ca
--- /dev/null
+++ b/toolboxes/core/cpu/hoNDArray.hxx
@@ -0,0 +1,1029 @@
+// This file is to be included only by hoNDArray.h
+// Contains the "private" implementation of the container
+//
+
+namespace Gadgetron
+{
+    template <typename T> 
+    hoNDArray<T>::hoNDArray() : NDArray<T>::NDArray() 
+    {
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(std::vector<size_t> *dimensions) : NDArray<T>::NDArray()
+    {
+        this->create(dimensions);
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(std::vector<size_t> &dimensions) : NDArray<T>::NDArray()
+    {
+        this->create(dimensions);
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(boost::shared_ptr< std::vector<size_t> > dimensions) : NDArray<T>::NDArray()
+    {
+        this->create(dimensions);
+    }
+
+#if __cplusplus > 199711L
+    template<class T> hoNDArray<T>::hoNDArray(std::initializer_list<size_t> dimensions){
+    	this->create(dimensions);
+    }
+    template<class T> hoNDArray<T>::hoNDArray(std::initializer_list<size_t> dimensions,T* data, bool delete_data_on_destruct ){
+    	this->create(dimensions,data,delete_data_on_destruct);
+    }
+#endif
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(size_t len) : NDArray<T>::NDArray()
+    {
+        std::vector<size_t> dim(1);
+        dim[0] = len;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(size_t sx, size_t sy) : NDArray<T>::NDArray()
+    {
+        std::vector<size_t> dim(2);
+        dim[0] = sx;
+        dim[1] = sy;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz) : NDArray<T>::NDArray()
+    {
+        std::vector<size_t> dim(3);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz, size_t st) : NDArray<T>::NDArray()
+    {
+        std::vector<size_t> dim(4);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp) : NDArray<T>::NDArray()
+    {
+        std::vector<size_t> dim(5);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq) : NDArray<T>::NDArray()
+    {
+        std::vector<size_t> dim(6);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr) : NDArray<T>::NDArray()
+    {
+        std::vector<size_t> dim(7);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        dim[6] = sr;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss) : NDArray<T>::NDArray()
+    {
+        std::vector<size_t> dim(8);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        dim[6] = sr;
+        dim[7] = ss;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct) : NDArray<T>::NDArray()
+    {
+        this->create(dimensions,data,delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(std::vector<size_t> &dimensions, T* data, bool delete_data_on_destruct) : NDArray<T>::NDArray()
+    {
+        this->create(dimensions,data,delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct) : NDArray<T>::NDArray()
+    {
+        this->create(dimensions,data,delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(size_t len, T* data, bool delete_data_on_destruct) : NDArray<T>::NDArray()
+    {
+        std::vector<size_t> dim(1);
+        dim[0] = len;
+        this->create(&dim,data,delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(size_t sx, size_t sy, T* data, bool delete_data_on_destruct) : NDArray<T>::NDArray()
+    {
+        std::vector<size_t> dim(2);
+        dim[0] = sx;
+        dim[1] = sy;
+        this->create(&dim,data,delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz, T* data, bool delete_data_on_destruct) : NDArray<T>::NDArray()
+    {
+        std::vector<size_t> dim(3);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        this->create(&dim,data,delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, T* data, bool delete_data_on_destruct) : NDArray<T>::NDArray()
+    {
+        std::vector<size_t> dim(4);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        this->create(&dim,data,delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, T* data, bool delete_data_on_destruct) : NDArray<T>::NDArray()
+    {
+        std::vector<size_t> dim(5);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        this->create(&dim,data,delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, T* data, bool delete_data_on_destruct) : NDArray<T>::NDArray()
+    {
+        std::vector<size_t> dim(6);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        this->create(&dim,data,delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, T* data, bool delete_data_on_destruct) : NDArray<T>::NDArray()
+    {
+        std::vector<size_t> dim(7);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        dim[6] = sr;
+        this->create(&dim,data,delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, T* data, bool delete_data_on_destruct) : NDArray<T>::NDArray()
+    {
+        std::vector<size_t> dim(8);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        dim[6] = sr;
+        dim[7] = ss;
+        this->create(&dim,data,delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    hoNDArray<T>::~hoNDArray()
+    {
+        if (this->delete_data_on_destruct_){
+            deallocate_memory();
+        }
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(const hoNDArray<T>  *a)
+    {
+        if(!a) throw std::runtime_error("hoNDArray<T>::hoNDArray(): 0x0 pointer provided");
+        this->data_ = 0;
+
+        std::vector<size_t>* tmp = new std::vector<size_t>;
+        this->dimensions_ = boost::shared_ptr< std::vector<size_t> >(tmp);
+        *(this->dimensions_) = *(a->dimensions_);
+
+        tmp = new std::vector<size_t>;
+        this->offsetFactors_ = boost::shared_ptr< std::vector<size_t> >(tmp);
+        *(this->offsetFactors_) = *(a->offsetFactors_);
+
+        if ( !this->dimensions_->empty() )
+        {
+            allocate_memory();
+            memcpy( this->data_, a->data_, this->elements_*sizeof(T) );
+        }
+        else
+        {
+            this->elements_ = 0;
+        }
+    }
+
+    template <typename T> 
+    hoNDArray<T>::hoNDArray(const hoNDArray<T> &a)
+    {
+        this->data_ = 0;
+
+        std::vector<size_t>* tmp = new std::vector<size_t>;
+        this->dimensions_ = boost::shared_ptr< std::vector<size_t> >(tmp);
+        *(this->dimensions_) = *(a.dimensions_);
+
+        tmp = new std::vector<size_t>;
+        this->offsetFactors_ = boost::shared_ptr< std::vector<size_t> >(tmp);
+        *(this->offsetFactors_) = *(a.offsetFactors_);
+
+        if ( !this->dimensions_->empty() )
+        {
+            allocate_memory();
+            memcpy( this->data_, a.data_, this->elements_*sizeof(T) );
+        }
+        else
+        {
+            this->elements_ = 0;
+        }
+    }
+
+#if __cplusplus > 199711L
+    template <typename T>
+    hoNDArray<T>::hoNDArray(hoNDArray<T>&& a) : NDArray<T>::NDArray(){
+    	data_ = a.data_;
+    	*this->dimensions_ = *a.dimensions_;
+    	this->elements_ = a.elements_;
+    	a.dimensions_.reset();
+    	a.data_ = nullptr;
+    	this->offsetFactors_ = a.offsetFactors_;
+    	a.offsetFactors_.reset();
+    }
+#endif
+    template <typename T> 
+    hoNDArray<T>& hoNDArray<T>::operator=(const hoNDArray<T>& rhs)
+    {
+        if ( &rhs == this ) return *this;
+
+        if ( rhs.get_number_of_elements() == 0 ){
+            this->clear();
+            return *this;
+        }
+
+        // Are the dimensions the same? Then we can just memcpy
+        if (this->dimensions_equal(&rhs)){
+            memcpy(this->data_, rhs.data_, this->elements_*sizeof(T));
+        }
+        else{
+            deallocate_memory();
+            this->data_ = 0;
+            *(this->dimensions_) = *(rhs.dimensions_);
+            *(this->offsetFactors_) = *(rhs.offsetFactors_);
+            allocate_memory();
+            memcpy( this->data_, rhs.data_, this->elements_*sizeof(T) );
+        }
+        return *this;
+    }
+
+#if __cplusplus > 199711L
+    template <typename T>
+    hoNDArray<T>& hoNDArray<T>::operator=(hoNDArray<T>&& rhs)
+    {
+        if ( &rhs == this ) return *this;
+
+        this->clear();
+        *this->dimensions_ = *rhs.dimensions_;
+        *this->offsetFactors_ = *rhs.offsetFactors_;
+        this->elements_ = rhs.elements_;
+        rhs.dimensions_.reset();
+        rhs.offsetFactors_.reset();
+        data_ = rhs.data_;
+        rhs.data_ = nullptr;
+        return *this;
+    }
+#endif
+
+    template <typename T> 
+    void hoNDArray<T>::create(std::vector<size_t>& dimensions)
+    {
+        if ( this->dimensions_equal(&dimensions) )
+        {
+            return;
+        }
+
+        this->clear();
+        BaseClass::create(dimensions);
+    }
+
+    template <typename T> 
+    void hoNDArray<T>::create(std::vector<size_t> *dimensions)
+    {
+        if ( this->dimensions_equal(dimensions) )
+        {
+            return;
+        }
+        this->clear();
+        BaseClass::create(dimensions);
+    }
+
+    template <typename T> 
+    void hoNDArray<T>::create(boost::shared_ptr< std::vector<size_t> > dimensions)
+    {
+        if ( this->dimensions_equal(dimensions.get()) )
+        {
+            return;
+        }
+        this->clear();
+        BaseClass::create(dimensions);
+    }
+
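+    // Re-use externally managed memory: if the requested dimensions already match,
+    // any owned buffer is released and only the data pointer and ownership flag are
+    // updated; otherwise the base class re-creates the array around the new buffer.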
+    template <typename T> 
+    void hoNDArray<T>::create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct) 
+    {
+        if(!dimensions) throw std::runtime_error("hoNDArray<T>::create(): 0x0 pointer provided");
+        if(!data) throw std::runtime_error("hoNDArray<T>::create(): 0x0 pointer provided");
+
+        if ( this->dimensions_equal(dimensions) )
+        {
+            if ( this->delete_data_on_destruct_ )
+            {
+                this->deallocate_memory();
+            }
+
+            this->data_ = data;
+            this->delete_data_on_destruct_ = delete_data_on_destruct;
+        }
+        else
+        {
+            if ( this->delete_data_on_destruct_ )
+            {
+                this->deallocate_memory();
+                this->data_ = NULL;
+            }
+
+            BaseClass::create(dimensions, data, delete_data_on_destruct);
+        }
+    }
+
+    template <typename T> 
+    void hoNDArray<T>::create(std::vector<size_t> &dimensions, T* data, bool delete_data_on_destruct) 
+    {
+        if(!data) throw std::runtime_error("hoNDArray<T>::create(): 0x0 pointer provided");
+
+        if ( this->dimensions_equal(&dimensions) )
+        {
+            if ( this->delete_data_on_destruct_ )
+            {
+                this->deallocate_memory();
+            }
+
+            this->data_ = data;
+            this->delete_data_on_destruct_ = delete_data_on_destruct;
+        }
+        else
+        {
+            if ( this->delete_data_on_destruct_ )
+            {
+                this->deallocate_memory();
+                this->data_ = NULL;
+            }
+
+            BaseClass::create(dimensions, data, delete_data_on_destruct);
+        }
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::create(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct)
+    {
+        this->create(dimensions.get(), data, delete_data_on_destruct);
+    }
+
+
+
+#if __cplusplus > 199711L
+    template<class T> void hoNDArray<T>::create(std::initializer_list<size_t> dimensions){
+    	std::vector<size_t> dims(dimensions);
+    	this->create(dims);
+    }
+    template<class T> void hoNDArray<T>::create(std::initializer_list<size_t> dimensions,T* data, bool delete_data_on_destruct ){
+    	std::vector<size_t> dims(dimensions);
+    	this->create(dims,data,delete_data_on_destruct);
+    }
+#endif
+
+
+    template <typename T> 
+    inline void hoNDArray<T>::create(size_t len)
+    {
+        std::vector<size_t> dim(1);
+        dim[0] = len;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::create(size_t sx, size_t sy)
+    {
+        std::vector<size_t> dim(2);
+        dim[0] = sx;
+        dim[1] = sy;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz)
+    {
+        std::vector<size_t> dim(3);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st)
+    {
+        std::vector<size_t> dim(4);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp)
+    {
+        std::vector<size_t> dim(5);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq)
+    {
+        std::vector<size_t> dim(6);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr)
+    {
+        std::vector<size_t> dim(7);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        dim[6] = sr;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss)
+    {
+        std::vector<size_t> dim(8);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        dim[6] = sr;
+        dim[7] = ss;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, size_t su)
+    {
+        std::vector<size_t> dim(9);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        dim[6] = sr;
+        dim[7] = ss;
+        dim[8] = su;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::create(size_t len, T* data, bool delete_data_on_destruct)
+    {
+        std::vector<size_t> dim(1);
+        dim[0] = len;
+        this->create(&dim, data, delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::create(size_t sx, size_t sy, T* data, bool delete_data_on_destruct)
+    {
+        std::vector<size_t> dim(2);
+        dim[0] = sx;
+        dim[1] = sy;
+        this->create(&dim, data, delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, T* data, bool delete_data_on_destruct)
+    {
+        std::vector<size_t> dim(3);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        this->create(&dim, data, delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, T* data, bool delete_data_on_destruct)
+    {
+        std::vector<size_t> dim(4);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        this->create(&dim, data, delete_data_on_destruct);
+    }
+    template <typename T> 
+    inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, T* data, bool delete_data_on_destruct)
+    {
+        std::vector<size_t> dim(5);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        this->create(&dim, data, delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, T* data, bool delete_data_on_destruct)
+    {
+        std::vector<size_t> dim(6);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        this->create(&dim, data, delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, T* data, bool delete_data_on_destruct)
+    {
+        std::vector<size_t> dim(7);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        dim[6] = sr;
+        this->create(&dim, data, delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, T* data, bool delete_data_on_destruct)
+    {
+        std::vector<size_t> dim(8);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        dim[6] = sr;
+        dim[7] = ss;
+        this->create(&dim, data, delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, size_t su, T* data, bool delete_data_on_destruct)
+    {
+        std::vector<size_t> dim(9);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        dim[6] = sr;
+        dim[7] = ss;
+        dim[8] = su;
+        this->create(&dim, data, delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    void hoNDArray<T>::fill(T value)
+    {
+        std::fill(this->get_data_ptr(), this->get_data_ptr()+this->get_number_of_elements(), value);
+    }
+
+    template <typename T> 
+    inline T* hoNDArray<T>::begin()
+    {
+        return this->data_;
+    }
+
+    template <typename T> 
+    inline const T* hoNDArray<T>::begin() const
+    {
+        return this->data_;
+    }
+
+    template <typename T> 
+    inline T* hoNDArray<T>::end()
+    {
+        return (this->data_+this->elements_);
+    }
+
+    template <typename T> 
+    inline const T* hoNDArray<T>::end() const
+    {
+        return (this->data_+this->elements_);
+    }
+
+    template <typename T> 
+    inline T& hoNDArray<T>::at( size_t idx )
+    {
+        /*if( idx >= this->get_number_of_elements() )
+        {
+        BOOST_THROW_EXCEPTION( runtime_error("hoNDArray::at(): index out of range."));
+        }*/
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->get_data_ptr()[idx];
+    }
+
+    template <typename T> 
+    inline const T& hoNDArray<T>::at( size_t idx ) const
+    {
+        /*if( idx >= this->get_number_of_elements() )
+        {
+        BOOST_THROW_EXCEPTION( runtime_error("hoNDArray::at(): index out of range."));
+        }*/
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->get_data_ptr()[idx];
+    }
+
+    template <typename T> 
+    inline T& hoNDArray<T>::operator[]( size_t idx )
+    {
+        /*if( idx >= this->get_number_of_elements() )
+        {
+        BOOST_THROW_EXCEPTION( runtime_error("hoNDArray::operator[]: index out of range."));
+        }*/
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->get_data_ptr()[idx];
+    }
+
+    //template <typename T> 
+    //inline T& hoNDArray<T>::operator()( size_t idx )
+    //{
+    //    /*if( idx >= this->get_number_of_elements() )
+    //    {
+    //    BOOST_THROW_EXCEPTION( runtime_error("hoNDArray::operator(): index out of range."));
+    //    }*/
+    //    GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+    //    return this->get_data_ptr()[idx];
+    //}
+
+    //template <typename T> 
+    //inline const T& hoNDArray<T>::operator()( size_t idx ) const
+    //{
+    //    /*if( idx >= this->get_number_of_elements() )
+    //    {
+    //    BOOST_THROW_EXCEPTION( runtime_error("hoNDArray::operator(): index out of range."));
+    //    }*/
+    //    GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+    //    return this->get_data_ptr()[idx];
+    //}
+
+    //template <typename T> 
+    //inline T& hoNDArray<T>::operator()( const std::vector<size_t>& ind )
+    //{
+    //    size_t idx = this->calculate_offset(ind);
+    //    /*if( idx >= this->get_number_of_elements() )
+    //    {
+    //    BOOST_THROW_EXCEPTION( runtime_error("hoNDArray::operator(): index out of range."));
+    //    }*/
+    //    GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+    //    return this->get_data_ptr()[idx];
+    //}
+
+    //template <typename T> 
+    //inline const T& hoNDArray<T>::operator()( const std::vector<size_t>& ind ) const
+    //{
+    //    size_t idx = this->calculate_offset(ind);
+    //    /*if( idx >= this->get_number_of_elements() )
+    //    {
+    //    BOOST_THROW_EXCEPTION( runtime_error("hoNDArray::operator(): index out of range."));
+    //    }*/
+    //    GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+    //    return this->get_data_ptr()[idx];
+    //}
+
+    template <typename T> 
+    void hoNDArray<T>::get_sub_array(const std::vector<size_t>& start, std::vector<size_t>& size, hoNDArray<T>& out)
+    {
+        if ( start.size() != size.size() ){
+            BOOST_THROW_EXCEPTION( runtime_error("hoNDArray<>::get_sub_array failed"));
+        }
+
+        if ( start.size() != (*dimensions_).size() ){
+            BOOST_THROW_EXCEPTION( runtime_error("hoNDArray<>::get_sub_array failed"));
+        }
+
+        out.create(&size);
+
+        if ( out.get_number_of_elements() == this->get_number_of_elements() ){
+            out = *this;
+            return;
+        }
+
+        std::vector<size_t> end(start.size());
+
+        size_t ii;
+        for ( ii=0; ii<start.size(); ii++ ){
+            end[ii] = start[ii] + size[ii] - 1;
+            if ( end[ii] >= (*dimensions_)[ii] ){
+                BOOST_THROW_EXCEPTION( runtime_error("hoNDArray<>::get_sub_array failed"));
+            }
+        }
+
+        // copy the requested region element by element
+        std::vector<size_t> ind(start.size(), 0);
+        std::vector<size_t> src(start.size(), 0);
+
+        T* pOut = out.get_data_ptr();
+        const T* pIn = this->get_data_ptr();
+
+        size_t n;
+        for ( n=0; n<out.get_number_of_elements(); n++ ){
+            for ( ii=0; ii<start.size(); ii++ ){
+                src[ii] = start[ii] + ind[ii];
+            }
+            pOut[n] = pIn[this->calculate_offset(src)];
+
+            // advance the multi-index over the output array
+            for ( ii=0; ii<ind.size(); ii++ ){
+                if ( ++ind[ii] < size[ii] ) break;
+                ind[ii] = 0;
+            }
+        }
+    }
+
+    template <typename T> 
+    void hoNDArray<T>::printContent(std::ostream& os) const
+    {
+        using namespace std;
+
+        os.unsetf(std::ios::scientific);
+        os.setf(ios::fixed);
+
+        size_t i;
+
+        os << "Number of dimensions is : " << dimensions_->size() << endl;
+
+        os << "Array size is : ";
+        for (i=0; i<dimensions_->size(); i++ ) 
+            os << (*dimensions_)[i] << " "; 
+        os << endl;
+
+        int elemTypeSize = sizeof(T);
+        std::string elemTypeName = std::string(typeid(T).name());
+
+        os << "Array data type is : " << elemTypeName << std::endl;
+        os << "Bytes per element is : " << elemTypeSize << std::endl;
+        os << "Total array size in bytes is : ";
+        os << elements_*elemTypeSize << std::endl;
+        os << "Delete data on destruction flag is : " << this->delete_data_on_destruct_ << endl;
+
+        //os << "-------------------------------------------" << std::endl;
+        //size_t numOfPrints = 20;
+        //if ( this->elements_ < numOfPrints ) numOfPrints = this->elements_;
+        //for (i=0; i<numOfPrints; i++) 
+        //{
+        //    os << i << " = " << (*this)(i) << std::endl;
+        //}
+        //os << "-------------------------------------------" << std::endl;
+
+        os << std::endl;
+    }
+
+    template <typename T> 
+    void hoNDArray<T>::print(std::ostream& os) const
+    {
+        using namespace std;
+
+        os.unsetf(std::ios::scientific);
+        os.setf(ios::fixed);
+
+        os << "--------------Gadgetron ND Array -------------" << endl;
+        this->printContent(os);
+    }
+
+    template <typename T> 
+    void hoNDArray<T>::allocate_memory()
+    {
+        deallocate_memory();
+
+        if ( !this->dimensions_->empty() )
+        {
+            this->elements_ = (*this->dimensions_)[0];
+            for (size_t i = 1; i < this->dimensions_->size(); i++)
+            {
+                this->elements_ *= (*this->dimensions_)[i];
+            }
+
+            if ( this->elements_ > 0 )
+            {
+                this->_allocate_memory(this->elements_, &this->data_);
+
+                if( this->data_ == 0x0 )
+                {
+                    BOOST_THROW_EXCEPTION( bad_alloc("hoNDArray<>::allocate memory failed"));
+                }
+
+                this->delete_data_on_destruct_ = true;
+
+                // memset(this->data_, 0, sizeof(T)*this->elements_);
+            }
+        }
+        else
+        {
+            this->elements_ = 0;
+        }
+    }
+
+    template <typename T> 
+    void hoNDArray<T>::deallocate_memory()
+    {
+        if (!(this->delete_data_on_destruct_)) {
+             throw std::runtime_error("You don't own this data.  You cannot deallocate its memory.");
+        }
+        
+        if( this->data_ ){
+            this->_deallocate_memory( this->data_ );
+            this->data_ = 0x0;
+        }
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::_allocate_memory( size_t size, float** data )
+    {
+        *data = (float*) malloc( size*sizeof(float) );
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::_deallocate_memory( float* data )
+    {
+        free(data);
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::_allocate_memory( size_t size, double** data )
+    {
+        *data = (double*) malloc( size*sizeof(double) );
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::_deallocate_memory( double* data )
+    {
+        free(data);
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::_allocate_memory( size_t size, std::complex<float>** data )
+    {
+        *data = (std::complex<float>*) malloc( size*sizeof(std::complex<float>) );
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::_deallocate_memory( std::complex<float>* data )
+    {
+        free(data);
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::_allocate_memory( size_t size, std::complex<double>** data )
+    {
+        *data = (std::complex<double>*) malloc( size*sizeof(std::complex<double>) );
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::_deallocate_memory( std::complex<double>* data )
+    {
+        free(data);
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::_allocate_memory( size_t size, float_complext** data )
+    {
+        *data = (float_complext*) malloc( size*sizeof(float_complext) );
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::_deallocate_memory( float_complext* data )
+    {
+        free( data );
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::_allocate_memory( size_t size, double_complext** data )
+    {
+        *data = (double_complext*) malloc( size*sizeof(double_complext) );
+    }
+
+    template <typename T> 
+    inline void hoNDArray<T>::_deallocate_memory( double_complext* data )
+    {
+        free( data );
+    }
+
+    template <typename T> 
+    bool hoNDArray<T>::serialize(char*& buf, size_t& len) const 
+    {
+        if ( buf != NULL ) delete[] buf;
+
+        size_t NDim = dimensions_->size();
+
+        // number of dimensions + dimension vector + contents
+        len = sizeof(size_t) + sizeof(size_t)*NDim + sizeof(T)*elements_;
+
+        buf = new char[len];
+
+        memcpy(buf, &NDim, sizeof(size_t));
+        if ( NDim > 0 )
+        {
+            memcpy(buf+sizeof(size_t), &((*dimensions_)[0]), sizeof(size_t)*NDim);
+            memcpy(buf+sizeof(size_t)+sizeof(size_t)*NDim, this->data_, sizeof(T)*elements_);
+        }
+
+        return true; // Temporary. Should not be a boolean function.
+    }
+
+    template <typename T> 
+    bool hoNDArray<T>::deserialize(char* buf, size_t& len)
+    {
+        size_t NDim;
+        memcpy(&NDim, buf, sizeof(size_t));
+
+        if ( NDim > 0 )
+        {
+            std::vector<size_t> dimensions(NDim);
+            memcpy(&dimensions[0], buf+sizeof(size_t), sizeof(size_t)*NDim);
+
+            // allocate memory
+            this->create(&dimensions);
+
+            // copy the content
+            memcpy(this->data_, buf+sizeof(size_t)+sizeof(size_t)*NDim, sizeof(T)*elements_);
+        }
+        else
+        {
+            this->clear();
+        }
+
+        len = sizeof(size_t)+sizeof(size_t)*NDim+sizeof(T)*elements_;
+        return true; // Temporary. Should not be a boolean function.
+    }
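+
+    // Example round trip (illustrative sketch only):
+    //
+    //   hoNDArray<float> a(16, 16), b;
+    //   char* buf = NULL; size_t len = 0;
+    //   a.serialize(buf, len);      // buf layout: [NDim][dims...][raw elements]
+    //   b.deserialize(buf, len);
+    //   delete [] buf;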
+}
diff --git a/toolboxes/core/cpu/hoNDArray_fileio.h b/toolboxes/core/cpu/hoNDArray_fileio.h
new file mode 100644
index 0000000..7802213
--- /dev/null
+++ b/toolboxes/core/cpu/hoNDArray_fileio.h
@@ -0,0 +1,66 @@
+#ifndef HONDARRAY_FILEIO_H
+#define HONDARRAY_FILEIO_H
+#pragma once
+
+#include "hoNDArray.h"
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string.h>
+#include <boost/shared_ptr.hpp>
+
+namespace Gadgetron{
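+// On-disk layout used by write_nd_array/read_nd_array below:
+// [number of dimensions : int][each dimension size : int][raw element data of type T]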
+template<class T> int write_nd_array(hoNDArray<T> *a, const char* filename)
+{
+  int* header = new int[a->get_number_of_dimensions()+1];
+
+  header[0] = static_cast<int>(a->get_number_of_dimensions());
+  for (int i = 0; i < header[0]; i++)
+  {
+    header[i+1] = static_cast<int>(a->get_size(i));
+  }
+
+  std::fstream f(filename,std::ios::out | std::ios::binary);
+
+  if( !f.is_open() ){
+    GDEBUG_STREAM("ERROR: Cannot write file " << filename << std::endl);
+    delete [] header;
+    return -1;
+  }
+
+  f.write(reinterpret_cast<char*>(header),sizeof(int)*(a->get_number_of_dimensions()+1));
+  f.write(reinterpret_cast<char*>(a->get_data_ptr()),sizeof(T)*a->get_number_of_elements());
+  
+  f.close();
+
+  delete [] header;
+  
+  return 0;
+}
+
+template <class T> boost::shared_ptr< hoNDArray<T> > read_nd_array(const char* filename)
+{
+  int dimensions,tmp;
+  std::vector<size_t> dim_array;
+  std::fstream f(filename,std::ios::in | std::ios::binary);
+
+  if( !f.is_open() ){
+    GDEBUG_STREAM("ERROR: Cannot open file " << filename << std::endl);
+    return boost::shared_ptr< hoNDArray<T> >();
+  }
+
+  f.read(reinterpret_cast<char*>(&dimensions),sizeof(int));
+  for (int i = 0; i < dimensions; i++)
+  {
+    f.read(reinterpret_cast<char*>(&tmp),sizeof(int));
+    dim_array.push_back(static_cast<size_t>(tmp));
+  }
+
+  boost::shared_ptr< hoNDArray<T> > out( new hoNDArray<T>(&dim_array) );
+  f.read(reinterpret_cast<char*>(out->get_data_ptr()),sizeof(T)*out->get_number_of_elements());
+  
+  return out;
+}
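+
+// Example (illustrative sketch only; the file name is arbitrary):
+//
+//   hoNDArray<float> a(64, 64);
+//   a.fill(1.0f);
+//   write_nd_array(&a, "/tmp/a.real");
+//   boost::shared_ptr< hoNDArray<float> > b = read_nd_array<float>("/tmp/a.real");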
+}
+#endif
diff --git a/toolboxes/core/cpu/hoNDArray_utils.h b/toolboxes/core/cpu/hoNDArray_utils.h
new file mode 100644
index 0000000..271f27e
--- /dev/null
+++ b/toolboxes/core/cpu/hoNDArray_utils.h
@@ -0,0 +1,763 @@
+#pragma once
+
+#include "hoNDArray.h"
+#include "vector_td_utilities.h"
+
+#ifdef USE_OMP
+#include <omp.h>
+#endif
+
+namespace Gadgetron {
+
+  class ArrayIterator
+  {
+  public:
+
+    ArrayIterator(std::vector<size_t> *dimensions, std::vector<size_t> *order)
+    {
+      dimensions_  = boost::shared_ptr< std::vector<size_t> > (new std::vector<size_t>);
+      order_       = boost::shared_ptr< std::vector<size_t> > (new std::vector<size_t>);
+      current_     = boost::shared_ptr< std::vector<size_t> > (new std::vector<size_t>);
+      block_sizes_ = boost::shared_ptr< std::vector<size_t> > (new std::vector<size_t>);
+
+      block_sizes_->push_back(1);
+      for (size_t i = 0; i < order->size(); i++) {
+        dimensions_->push_back((*dimensions)[i]);
+        order_->push_back((*order)[i]);
+        current_->push_back(0);
+        if (i > 0) {
+          block_sizes_->push_back((*block_sizes_)[i-1]*(*dimensions_)[i-1]);
+        }
+      }
+      current_idx_ = 0;
+    }
+
+    inline size_t advance()
+    {
+      size_t order_index = 0;
+      (*current_)[(*order_)[order_index]]++;
+      while ((*current_)[(*order_)[order_index]] >= (*dimensions_)[(*order_)[order_index]]) {
+        (*current_)[(*order_)[order_index]] = 0;
+        order_index = (order_index+1)%dimensions_->size();
+        (*current_)[(*order_)[order_index]]++;
+      }
+
+      current_idx_ = 0;
+      for (size_t i = 0; i < dimensions_->size(); i++) {
+        current_idx_ += (*current_)[i]*(*block_sizes_)[i];
+      }	
+      return current_idx_;
+    }
+
+    inline size_t get_current_idx() {
+      return current_idx_;
+    }
+
+    boost::shared_ptr< std::vector<size_t> > get_current_sub() {
+      return current_;
+    }
+
+  protected:
+    boost::shared_ptr< std::vector<size_t> > dimensions_;
+    boost::shared_ptr< std::vector<size_t> > order_;
+    boost::shared_ptr< std::vector<size_t> > current_;
+    boost::shared_ptr< std::vector<size_t> > block_sizes_;
+    size_t current_idx_;
+  };
+
+  template<class T> boost::shared_ptr< hoNDArray<T> > shift_dim( hoNDArray<T> *in, int shift )  
+  {
+    if( in == 0x0 ) {
+      throw std::runtime_error("shift_dim(): invalid input pointer provided");;
+    }    
+    std::vector<size_t> order;
+    long long ndim = static_cast<long long>(in->get_number_of_dimensions());
+    for (long long i = 0; i < ndim; i++) {
+      // use signed arithmetic so that negative shifts do not wrap around
+      order.push_back(static_cast<size_t>(((i+shift)%ndim+ndim)%ndim));
+    }
+    return permute(in,&order);
+  }
+
+  template<class T> void shift_dim( hoNDArray<T> *in, hoNDArray<T> *out, int shift )
+  {
+    if( in == 0x0 || out == 0x0 ) {
+      throw std::runtime_error("shift_dim(): invalid pointer provided");;
+    }    
+    std::vector<size_t> order;
+    long long ndim = static_cast<long long>(in->get_number_of_dimensions());
+    for (long long i = 0; i < ndim; i++) {
+      // use signed arithmetic so that negative shifts do not wrap around
+      order.push_back(static_cast<size_t>(((i+shift)%ndim+ndim)%ndim));
+    }
+    permute(in,out,&order);
+  }
+
+  template<class T> boost::shared_ptr< hoNDArray<T> > 
+  permute( hoNDArray<T> *in, std::vector<size_t> *dim_order, int shift_mode = 0) 
+  {
+    if( in == 0x0 || dim_order == 0x0 ) {
+      throw std::runtime_error("permute(): invalid pointer provided");;
+    }    
+
+    std::vector<size_t> dims;
+    for (size_t i = 0; i < dim_order->size(); i++)
+      dims.push_back(in->get_dimensions()->at(dim_order->at(i)));
+    boost::shared_ptr< hoNDArray<T> > out( new hoNDArray<T>() );    
+    out->create(&dims);
+    permute( in, out.get(), dim_order, shift_mode );
+    return out;
+  }
+
+  template<class T> void 
+  permute( hoNDArray<T> *in, hoNDArray<T> *out, std::vector<size_t> *dim_order, int shift_mode = 0) 
+  {
+    if( in == 0x0 || out == 0x0 || dim_order == 0x0 ) {
+      throw std::runtime_error("permute(): invalid pointer provided");;
+    }    
+
+    if( in == out ){
+      throw std::runtime_error("permute(): in-place permutation not supported");;
+    }   
+
+    // Check ordering array
+    if (dim_order->size() > in->get_number_of_dimensions()) {
+      throw std::runtime_error("hoNDArray::permute - Invalid length of dimension ordering array");;
+    }
+
+    std::vector<size_t> dim_count(in->get_number_of_dimensions(),0);
+    for (size_t i = 0; i < dim_order->size(); i++) {
+      if ((*dim_order)[i] >= in->get_number_of_dimensions()) {
+        throw std::runtime_error("hoNDArray::permute - Invalid dimension order array");;
+      }
+      dim_count[(*dim_order)[i]]++;
+    }
+
+    // Create an internal array to store the dimensions
+    std::vector<size_t> dim_order_int;
+
+    // Check that there are no duplicate dimensions
+    for (size_t i = 0; i < dim_order->size(); i++) {
+      if (dim_count[(*dim_order)[i]] != 1) {
+        throw std::runtime_error("hoNDArray::permute - Invalid dimension order array (duplicates)");;
+
+      }
+      dim_order_int.push_back((*dim_order)[i]);
+    }
+
+    for (size_t i = 0; i < dim_order_int.size(); i++) {
+      if ((*in->get_dimensions())[dim_order_int[i]] != out->get_size(i)) {
+        throw std::runtime_error("permute(): dimensions of output array do not match the input array");;
+      }
+    }
+
+    // Pad the dimension order array with any dimensions not mentioned in the order array
+    if (dim_order_int.size() < in->get_number_of_dimensions()) {
+      for (size_t i = 0; i < dim_count.size(); i++) {
+        if (dim_count[i] == 0) {
+          dim_order_int.push_back(i);
+        }
+      }
+    }
+
+    T* o = out->get_data_ptr();
+
+    // determine how many leading dimensions stay in place, so that contiguous blocks can be copied with memcpy
+    size_t stride = 1;
+    size_t num_dim_memcpy = 0;
+    for (size_t i = 0; i < dim_order_int.size(); i++) {
+        if (dim_order_int[i]==i){
+            stride *= in->get_size(i);
+            num_dim_memcpy = i;
+        }
+        else{
+            break;
+        }
+    }
+
+    if (stride == 1) {
+        // point by point assignment is needed
+        ArrayIterator it(in->get_dimensions().get(), &dim_order_int);
+        for (size_t i = 0; i < in->get_number_of_elements(); i++) {
+            o[i] = in->get_data_ptr()[it.get_current_idx()];
+            it.advance();
+        }
+    }
+    else {
+        // memcpy can be used
+
+        size_t nDim = in->get_number_of_dimensions();
+        size_t num_memcpy = in->get_number_of_elements() / stride;
+
+        if (num_dim_memcpy == nDim - 1){
+            memcpy(out->begin(), in->begin(), in->get_number_of_bytes());
+            return;
+        }
+
+        // for the array index calculation
+        std::vector<size_t> dim_permute(nDim-num_dim_memcpy-1);
+        for (size_t i = num_dim_memcpy+1; i < dim_order_int.size(); i++) {
+            dim_permute[i - num_dim_memcpy - 1] = in->get_size(i);
+        }
+
+        long long n;
+
+        hoNDArray<T> permuteArray(dim_permute, in->begin(), false);
+
+        // starting index for in and out array for every permute memcpy operation
+        std::vector<size_t> ind_permute_in(dim_permute.size(), 0), ind_in(nDim, 0), ind_out(nDim, 0);
+
+        for (n = 0; n < (long long)num_memcpy; n++) {
+            permuteArray.calculate_index(n, ind_permute_in);
+            memcpy(&ind_in[0] + num_dim_memcpy + 1, &ind_permute_in[0], sizeof(size_t)*ind_permute_in.size());
+
+            // permute the indexes
+            for (size_t i = 0; i < nDim; i++) {
+                ind_out[i] = ind_in[dim_order_int[i]];
+            }
+
+            size_t offset_in = in->calculate_offset(ind_in);
+            size_t offset_out = out->calculate_offset(ind_out);
+
+            memcpy(o + offset_out, in->begin() + offset_in, sizeof(T)*stride);
+        }
+    }
+  }
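
As a concrete illustration of the ordering convention used by permute(): output dimension i takes its size (and data) from input dimension dim_order[i]. A small sketch with illustrative sizes:

    std::vector<size_t> dims(2); dims[0] = 3; dims[1] = 4;
    hoNDArray<float> in(&dims);
    // ... fill 'in' ...
    std::vector<size_t> order(2); order[0] = 1; order[1] = 0;    // swap the two dimensions
    boost::shared_ptr< hoNDArray<float> > out = permute(&in, &order);
    // out is 4x3 and out(j,i) corresponds to in(i,j); shift_dim() above is a thin wrapper
    // that builds such an order vector by cyclically shifting the dimension indices
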
+
+  // Expand array to new dimension
+  template<class T> boost::shared_ptr<hoNDArray<T> > 
+  expand(hoNDArray<T> *in, size_t new_dim_size )
+  {
+    if( in == 0x0 ){
+      throw std::runtime_error("expand(): illegal input pointer.");;
+    }
+
+    const size_t number_of_elements_in = in->get_number_of_elements();    
+
+    std::vector<size_t> dims = *in->get_dimensions(); 
+    dims.push_back(new_dim_size);
+
+    boost::shared_ptr< hoNDArray<T> > out(new hoNDArray<T>(&dims));
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+    for( long long idx=0; idx<(long long)(number_of_elements_in*new_dim_size); idx++ ){
+      (*out)[idx] = in->at(idx%number_of_elements_in);
+    }
+    return out;
+  }
+  
+  // Sum over a dimension
+  // Note: the indexing below assumes 'dim' is the last (slowest-varying) dimension of the array.
+  template<class T> boost::shared_ptr<hoNDArray<T> > 
+  sum(hoNDArray<T> *in, size_t dim )
+  {
+    if( in == 0x0 ){
+      throw std::runtime_error("sum(): illegal input pointer.");;
+    }
+
+    if( !(in->get_number_of_dimensions()>1) ){
+      throw std::runtime_error("sum(): underdimensioned.");;
+    }
+
+    if( dim > in->get_number_of_dimensions()-1 ){
+      throw std::runtime_error( "sum(): dimension out of range.");;
+    }
+
+    size_t number_of_batches = in->get_size(dim);
+    size_t number_of_elements = in->get_number_of_elements()/number_of_batches;
+    std::vector<size_t> dims = *in->get_dimensions(); dims.pop_back();
+
+    boost::shared_ptr< hoNDArray<T> > out(new hoNDArray<T>());
+    out->create(&dims);
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+    for( long long idx=0; idx<(long long)number_of_elements; idx++ ){
+      T val(0);
+      for( size_t j=0; j<number_of_batches; j++ ){
+        size_t in_idx = j*number_of_elements+idx;
+        val += in->get_data_ptr()[in_idx];      
+      }
+      out->get_data_ptr()[idx] = val;
+    }
+    return out;
+  } 
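
A tiny sketch of expand() and sum() used together (sizes are illustrative; note the last-dimension assumption of sum() mentioned above):

    std::vector<size_t> dims(1, 8);
    hoNDArray<float> v(&dims);
    for (size_t i = 0; i < v.get_number_of_elements(); i++) v.get_data_ptr()[i] = 1.0f;
    // replicate the 8-element vector 4 times along a new trailing dimension -> 8x4
    boost::shared_ptr< hoNDArray<float> > rep = expand(&v, 4);
    // sum over that trailing dimension (dim 1) -> 8 elements, each equal to 4
    boost::shared_ptr< hoNDArray<float> > s = sum(rep.get(), 1);
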
+
+  /**
+  * @param[in] crop_offset Starting position of the crop
+  * @param[in] crop_size Size of the cropped array
+  * @param[in] in Input array
+  * @param[out] out Output array after cropping
+  */
+  template<class T, unsigned int D> void
+  crop(const vector_td<size_t, D>& crop_offset, const vector_td<size_t, D>& crop_size, hoNDArray<T> *in, hoNDArray<T> *out)
+  {
+      if (in == 0x0){
+          throw std::runtime_error("crop: 0x0 array provided");;
+      }
+
+      if (in->get_number_of_dimensions() < D){
+          std::stringstream ss;
+          ss << "crop: number of image dimensions should be at least " << D;
+          throw std::runtime_error(ss.str());;
+      }
+
+      if (out == 0x0){
+          throw std::runtime_error("crop: 0x0 array provided");;
+      }
+
+      std::vector<size_t> dims = to_std_vector(crop_size);
+      for (unsigned int d = D; d<in->get_number_of_dimensions(); d++){
+          dims.push_back(in->get_size(d));
+      }
+
+      if (!out->dimensions_equal(&dims)){
+          out->create(dims);
+      }
+
+      typename uint64d<D>::Type matrix_size_in = from_std_vector<size_t, D>(*in->get_dimensions());
+      typename uint64d<D>::Type matrix_size_out = from_std_vector<size_t, D>(*out->get_dimensions());
+
+      if (weak_greater(crop_offset + matrix_size_out, matrix_size_in)){
+          throw std::runtime_error("crop: cropping size mismatch");;
+      }
+
+      size_t len = out->get_size(0);
+      size_t num = out->get_number_of_elements() / len;
+
+      long long k;
+
+      T *in_ptr = in->get_data_ptr();
+      T *out_ptr = out->get_data_ptr();
+
+      #pragma omp parallel default(none) private(k) shared(in_ptr, out_ptr, num, len, in, out, crop_offset)
+      {
+          std::vector<size_t> ind;
+
+      #pragma omp for 
+          for (k = 0; k < (long long)num; k++){
+              ind = out->calculate_index(k*len);
+              for (unsigned int d = 0; d < D; d++){
+                  ind[d] += crop_offset[d];
+              }
+
+              T* in_ptr_curr = in_ptr + in->calculate_offset(ind);
+              memcpy(out_ptr + k*len, in_ptr_curr, sizeof(T)*len);
+          }
+      }
+  }
+
+  /**
+  * @param[in] crop_size Size of cropped array
+  * @param[in] in input array
+  * @param[out] out Output array after cropping
+
+  * Crop the input array around its center N/2; that is, the center pixel of the input array becomes the center pixel of the output array
+  */
+  template<class T, unsigned int D> void
+  crop(const vector_td<size_t, D>& crop_size, hoNDArray<T> *in, hoNDArray<T> *out)
+  {
+    // compute crop offset, preserving the center
+    if (in == 0x0){
+        throw std::runtime_error("crop: 0x0 array provided");;
+    }
+
+    vector_td<size_t, D> crop_offset;
+
+    unsigned int d;
+    for (d = 0; d < D; d++)
+    {
+        crop_offset[d] = in->get_size(d) / 2 - crop_size[d] / 2;
+    }
+
+    crop(crop_offset, crop_size, in, out);
+  }
+
+  template<class T> void
+  crop(size_t x, hoNDArray<T> *in, hoNDArray<T> *out)
+  {
+      vector_td<size_t, 1> crop_size(x);
+      crop(crop_size, in, out);
+  }
+
+  template<class T> void
+  crop(size_t x, size_t y, hoNDArray<T> *in, hoNDArray<T> *out)
+  {
+      vector_td<size_t, 2> crop_size(x, y);
+      crop(crop_size, in, out);
+  }
+
+  template<class T> void
+  crop(size_t x, size_t y, size_t z, hoNDArray<T> *in, hoNDArray<T> *out)
+  {
+      vector_td<size_t, 3> crop_size(x, y, z);
+      crop(crop_size, in, out);
+  }
+
+  template<class T, unsigned int D> boost::shared_ptr< hoNDArray<T> >
+  crop( const vector_td<size_t, D>& crop_offset, const vector_td<size_t, D>& crop_size, hoNDArray<T> *in )
+  {
+    boost::shared_ptr< hoNDArray<T> > out( new hoNDArray<T>() );
+    crop(crop_offset, crop_size, in, out.get());
+    return out;
+  }
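
A brief usage sketch for the crop overloads above (all sizes are illustrative):

    std::vector<size_t> dims(2, 256);
    hoNDArray<float> img(&dims);
    hoNDArray<float> roi;

    // center crop via the convenience wrapper: keep the central 128x128 region
    crop(128, 128, &img, &roi);

    // equivalent explicit form; the offset is 256/2 - 128/2 = 64 in each dimension
    vector_td<size_t, 2> crop_size(128, 128);
    vector_td<size_t, 2> crop_offset(64, 64);
    crop(crop_offset, crop_size, &img, &roi);
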
+
+  /**
+   * @param[in]     size    Size of the output array
+   * @param[in]     in      Input array
+   * @param[out]    out     Output array after padding
+   * @param[in]     preset_out_with_val if true, out array will be filled with val before padding
+   * @param[in]     val     Value to use for padding
+
+   * The padding operation keeps the center of the array unchanged, i.e. the center remains at N/2
+   */
+  template<class T, unsigned int D> void
+  pad(const typename uint64d<D>::Type& size, hoNDArray<T> *in, hoNDArray<T>* out, bool preset_out_with_val = true, T val = T(0))
+  {
+      if (in == 0x0){
+          throw std::runtime_error("pad: 0x0 array provided");;
+      }
+
+      if (out == 0x0){
+          throw std::runtime_error("pad: 0x0 array provided");;
+      }
+
+      if (in->get_number_of_dimensions() < D){
+          std::stringstream ss;
+          ss << "pad: number of image dimensions should be at least " << D;
+          throw std::runtime_error(ss.str());;
+      }
+
+      unsigned int d;
+
+      std::vector<size_t> dims = to_std_vector(size);
+      for (d = D; d<in->get_number_of_dimensions(); d++){
+          dims.push_back(in->get_size(d));
+      }
+
+      if (!out->dimensions_equal(&dims)){
+          out->create(dims);
+      }
+
+      if (in->dimensions_equal(&dims)){
+          memcpy(out->begin(), in->begin(), in->get_number_of_bytes());
+          return;
+      }
+
+      T *in_ptr = in->get_data_ptr();
+      T *out_ptr = out->get_data_ptr();
+
+      if (preset_out_with_val){
+          if (val == T(0)){
+              memset(out_ptr, 0, out->get_number_of_bytes());
+          }
+          else{
+                size_t N = out->get_number_of_elements();
+                long long n;
+                #pragma omp parallel for default(none) private(n) shared(N, out_ptr, val)
+                for (n = 0; n<(long long)N; n++)
+                {
+                    out_ptr[n] = val;
+                }
+          }
+      }
+
+      typename uint64d<D>::Type matrix_size_in = from_std_vector<size_t, D>(*in->get_dimensions());
+      typename uint64d<D>::Type matrix_size_out = from_std_vector<size_t, D>(*out->get_dimensions());
+
+      if (weak_greater(matrix_size_in, matrix_size_out)){
+          throw std::runtime_error("pad: size mismatch, cannot expand");
+      }
+
+      typename uint64d<D>::Type offset(D);
+      for (d = 0; d<D; d++){
+          offset[d] = matrix_size_out[d]/2 - matrix_size_in[d]/2;
+      }
+
+      size_t len = in->get_size(0);
+      size_t num = in->get_number_of_elements() / len;
+
+      long long k;
+
+#pragma omp parallel default(none) private(k, d) shared(in_ptr, out_ptr, num, len, in, out, offset)
+      {
+          std::vector<size_t> ind;
+
+#pragma omp for 
+          for (k = 0; k < (long long)num; k++){
+              ind = in->calculate_index(k*len);
+              for (d = 0; d < D; d++){
+                  ind[d] += offset[d];
+              }
+
+              T* out_ptr_curr = out_ptr + out->calculate_offset(ind);
+              memcpy(out_ptr_curr, in_ptr + k*len, sizeof(T)*len);
+          }
+      }
+  }
+
+  template<class T> void
+  pad(size_t x, hoNDArray<T> *in, hoNDArray<T>* out, bool preset_out_with_val = true, T val = T(0))
+  {
+      typename uint64d<1>::Type padSize(x);
+      pad<T, 1>(padSize, in, out, preset_out_with_val, val);
+  }
+
+  template<class T> void
+  pad(size_t x, size_t y, hoNDArray<T> *in, hoNDArray<T>* out, bool preset_out_with_val = true, T val = T(0))
+  {
+      typename uint64d<2>::Type padSize(x, y);
+      pad<T, 2>(padSize, in, out, preset_out_with_val, val);
+  }
+
+  template<class T> void
+  pad(size_t x, size_t y, size_t z, hoNDArray<T> *in, hoNDArray<T>* out, bool preset_out_with_val = true, T val = T(0))
+  {
+      typename uint64d<3>::Type padSize(x, y, z);
+      pad<T, 3>(padSize, in, out, preset_out_with_val, val);
+  }
+
+  /**
+  * @param[in] size Size of the output array
+  * @param[in] in Input array
+  * @param[in] val Value to use for padding
+  * @returns New array of the specified size, containing the original input array in the center and val outside.
+  */
+  template<class T, unsigned int D> boost::shared_ptr< hoNDArray<T> >
+  pad(const typename uint64d<D>::Type& size, hoNDArray<T> *in, T val = T(0))
+  {
+    boost::shared_ptr< hoNDArray<T> > out(new hoNDArray<T>());
+    pad<T,D>(size, in, out.get(), true, val);
+    return out;
+  }
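
A short sketch of the center-preserving pad (the sizes and the zero fill value are illustrative):

    std::vector<size_t> dims(2, 100);
    hoNDArray<float> arr(&dims);
    uint64d<2>::Type padded_size(128, 128);
    boost::shared_ptr< hoNDArray<float> > padded = pad<float, 2>(padded_size, &arr, 0.0f);
    // padded is 128x128; arr is copied in starting at offset (14, 14) = (128/2 - 100/2, ...),
    // and the remaining elements are preset to the fill value 0
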
+
+  /// Copy the sub-array x(:, indLastDim) to all other positions along the last dimension
+  template<typename T> 
+  bool repmatLastDimension(hoNDArray<T>& x, size_t indLastDim)
+  {
+    try
+      {
+        size_t NDim = x.get_number_of_dimensions();
+        size_t lastDim = x.get_size(NDim-1);
+        GADGET_CHECK_RETURN_FALSE( indLastDim < lastDim );
+
+        std::vector<size_t> ind(NDim, 0);
+        ind[NDim-1] = indLastDim;
+        size_t offsetIndLastDim = x.calculate_offset(ind);
+
+        size_t N = x.get_number_of_elements() / lastDim;
+
+        long long l;
+        // 'ind' is firstprivate: each thread modifies its own copy of the index vector inside the loop
+#pragma omp parallel for default(none) private(l) firstprivate(ind) shared(lastDim, offsetIndLastDim, x, indLastDim, N, NDim)
+        for ( l=0; l<(long long)lastDim; l++ )
+        {
+            if ( l==indLastDim ) continue;
+            ind[NDim-1] = l;
+            size_t offsetInd = x.calculate_offset(ind);
+
+            memcpy(x.begin()+offsetInd, x.begin()+offsetIndLastDim, sizeof(T)*N);
+        }
+      }
+    catch (...)
+      {
+        GERROR_STREAM("Errors in repmatLastDimension(hoNDArray<T>& x, size_t indLastDim) ... ");
+        return false;
+      }
+    return true;
+  }
+
+  // Utility to check whether all neighbors required for the linear interpolation exist
+  // ... dimensions of size 1 are not considered
+
+  template<class REAL, unsigned int D> inline bool
+  is_border_pixel( vector_td<size_t,D> co, vector_td<size_t,D> dims )
+  {
+    for( size_t dim=0; dim<D; dim++ ){
+      if( dims[dim] > 1 && ( co[dim] == 0 || co[dim] == (dims[dim]-1) ) )
+	return true;
+    }
+    return false;
+  }
+
+  // Downsample
+  template<class REAL, unsigned int D> 
+  boost::shared_ptr< hoNDArray<REAL> > downsample( hoNDArray<REAL> *_in )
+  {
+    // A few sanity checks 
+
+    if( _in == 0x0 ){
+      throw std::runtime_error( "downsample(): illegal input provided.");
+    }
+    
+    if( _in->get_number_of_dimensions() < D ){
+      throw std::runtime_error( "downsample(): the number of array dimensions should be at least D");
+    }
+    
+    for( size_t d=0; d<D; d++ ){
+      if( (_in->get_size(d)%2) == 1 && _in->get_size(d) != 1 ){
+	throw std::runtime_error( "downsample(): uneven array dimensions larger than one not accepted");
+      }
+    }
+    
+    typename uint64d<D>::Type matrix_size_in = from_std_vector<size_t,D>( *_in->get_dimensions() );
+    typename uint64d<D>::Type matrix_size_out = matrix_size_in >> 1;
+
+    for( size_t d=0; d<D; d++ ){
+      if( matrix_size_out[d] == 0 ) 
+	matrix_size_out[d] = 1;
+    }
+  
+    size_t num_elements = prod(matrix_size_out);
+    size_t num_batches = 1;
+
+    for( size_t d=D; d<_in->get_number_of_dimensions(); d++ ){
+      num_batches *= _in->get_size(d);
+    }
+  
+    std::vector<size_t> dims = to_std_vector(matrix_size_out);
+    for( size_t d=D; d<_in->get_number_of_dimensions(); d++ ){
+      dims.push_back(_in->get_size(d));
+    }
+  
+    REAL *in = _in->get_data_ptr();
+
+    boost::shared_ptr< hoNDArray<REAL> > _out( new hoNDArray<REAL>(&dims) );
+    REAL *out = _out->get_data_ptr();
+    
+    typedef vector_td<size_t,D> uint64d;
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+    for( long long idx=0; idx < (long long)(num_elements*num_batches); idx++ ){
+
+      const size_t frame_offset = idx/num_elements;
+      const uint64d co_out = idx_to_co<D>( idx-frame_offset*num_elements, matrix_size_out );
+      const uint64d co_in = co_out << 1;
+      const uint64d twos(2);
+      const size_t num_adds = 1 << D;
+
+      size_t actual_adds = 0;
+      REAL res = REAL(0);
+
+      for( size_t i=0; i<num_adds; i++ ){
+	const uint64d local_co = idx_to_co<D>( i, twos );
+	if( weak_greater_equal( local_co, matrix_size_out ) ) continue; // To allow array dimensions of size 1
+	const size_t in_idx = co_to_idx<D>(co_in+local_co, matrix_size_in)+frame_offset*prod(matrix_size_in);
+	actual_adds++;
+	res += in[in_idx];
+      }    
+      out[idx] = res/REAL(actual_adds);
+    }
+
+    return _out;
+  }
+
+  // Linear interpolation upsampling
+  template<class REAL, unsigned int D> boost::shared_ptr< hoNDArray<REAL> >
+  upsample( hoNDArray<REAL> *_in )
+  {
+    // A few sanity checks 
+
+    if( _in == 0x0 ){
+      throw std::runtime_error("upsample(): illegal input provided.");
+    }
+
+    if( _in->get_number_of_dimensions() < D ){
+      throw std::runtime_error( "upsample(): the number of array dimensions should be at least D");
+    }
+    
+    typename uint64d<D>::Type matrix_size_in = from_std_vector<size_t,D>( *_in->get_dimensions() );
+    typename uint64d<D>::Type matrix_size_out = matrix_size_in << 1;
+
+    for( size_t d=0; d<D; d++ ){
+      if( matrix_size_in[d] == 1 )
+	matrix_size_out[d] = 1;
+    }
+  
+    size_t num_elements = prod(matrix_size_out);
+    size_t num_batches = 1;
+
+    for( size_t d=D; d<_in->get_number_of_dimensions(); d++ ){
+      num_batches *= _in->get_size(d);
+    }
+  
+    std::vector<size_t> dims = to_std_vector(matrix_size_out);
+    for( size_t d=D; d<_in->get_number_of_dimensions(); d++ ){
+      dims.push_back(_in->get_size(d));
+    }
+
+    REAL *in = _in->get_data_ptr();
+
+    boost::shared_ptr< hoNDArray<REAL> > _out( new hoNDArray<REAL>(&dims) );
+    REAL *out = _out->get_data_ptr();
+    
+    typedef vector_td<size_t,D> uint64d;
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+    for( long long idx=0; idx < (long long)(num_elements*num_batches); idx++ ){
+      
+      REAL res = REAL(0);
+
+      const size_t num_neighbors = 1 << D;
+      const size_t frame_idx = idx/num_elements;
+      const uint64d co_out = idx_to_co<D>( idx-frame_idx*num_elements, matrix_size_out );
+
+      // We will only proceed if all neighbours exist (this adds a zero-boundary to the upsampled image/vector field)
+      //
+    
+      if( !is_border_pixel<REAL,D>(co_out, matrix_size_out) ){
+      
+	for( size_t i=0; i<num_neighbors; i++ ){
+	
+	  // Determine coordinate of neighbor in input
+	  //
+
+	  const uint64d twos(2);
+	  const uint64d stride = idx_to_co<D>( i, twos );
+
+	  if( weak_greater_equal( stride, matrix_size_out ) ) continue; // To allow array dimensions of 1
+
+	  // Be careful about dimensions of size 1
+	  uint64d ones(1);
+	  for( size_t d=0; d<D; d++ ){
+	    if( matrix_size_out[d] == 1 )
+	      ones[d] = 0;
+	  }
+	  uint64d co_in = ((co_out-ones)>>1)+stride;
+	
+	  // Read corresponding pixel value
+	  //
+	
+	  const size_t in_idx = co_to_idx<D>(co_in, matrix_size_in)+frame_idx*prod(matrix_size_in);
+	  REAL value = in[in_idx];
+	
+	  // Determine weight
+	  //
+	
+	  REAL weight = REAL(1);
+	
+	  for( size_t dim=0; dim<D; dim++ ){	  
+	    if( matrix_size_in[dim] > 1 ){
+	      if( stride.vec[dim] == (co_out.vec[dim]%2) ) {
+		weight *= REAL(0.25);
+	      }
+	      else{
+		weight *= REAL(0.75);
+	      }
+	    }
+	  }
+	
+	  // Accumulate result
+	  //
+	
+	  res += weight*value;
+	}
+      }
+      out[idx] = res;
+    }
+    
+    return _out;
+  }
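
A short sketch of the factor-of-two downsample/upsample pair above (sizes are illustrative):

    std::vector<size_t> dims(2, 256);
    hoNDArray<float> img(&dims);
    // 256x256 -> 128x128: each output pixel is the average of a 2x2 input block
    boost::shared_ptr< hoNDArray<float> > low = downsample<float, 2>(&img);
    // 128x128 -> 256x256 by linear interpolation; border pixels of the result are left at zero
    boost::shared_ptr< hoNDArray<float> > high = upsample<float, 2>(low.get());
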
+
+}
diff --git a/toolboxes/core/cpu/hoNDBoundaryHandler.h b/toolboxes/core/cpu/hoNDBoundaryHandler.h
new file mode 100644
index 0000000..8836868
--- /dev/null
+++ b/toolboxes/core/cpu/hoNDBoundaryHandler.h
@@ -0,0 +1,276 @@
+/** \file       hoNDBoundaryHandler.h
+    \brief      N-dimensional boundary condition handler
+
+                Designed to work with hoNDArray and hoNDImage
+
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "hoNDArray.h"
+#include "hoNDImage.h"
+
+namespace Gadgetron
+{
+    // define the boundary condition
+    enum GT_BOUNDARY_CONDITION
+    {
+        GT_BOUNDARY_CONDITION_FIXEDVALUE=34568, // a magic number
+        GT_BOUNDARY_CONDITION_BORDERVALUE,
+        GT_BOUNDARY_CONDITION_PERIODIC,
+        GT_BOUNDARY_CONDITION_MIRROR
+    };
+
+    inline std::string getBoundaryHandlerName(GT_BOUNDARY_CONDITION bh)
+    {
+        std::string name;
+
+        switch (bh)
+        {
+            case GT_BOUNDARY_CONDITION_FIXEDVALUE:
+                name = "FixedValue";
+                break;
+
+            case GT_BOUNDARY_CONDITION_BORDERVALUE:
+                name = "BorderValue";
+                break;
+
+            case GT_BOUNDARY_CONDITION_PERIODIC:
+                name = "Periodic";
+                break;
+
+            case GT_BOUNDARY_CONDITION_MIRROR:
+                name = "Mirror";
+                break;
+
+            default:
+                GERROR_STREAM("Unrecognized boundary handler type : " << bh);
+        }
+
+        return name;
+    }
+
+    inline GT_BOUNDARY_CONDITION getBoundaryHandlerType(const std::string& bh_name)
+    {
+        // default so an unrecognized name does not yield an uninitialized return value
+        GT_BOUNDARY_CONDITION bh = GT_BOUNDARY_CONDITION_FIXEDVALUE;
+
+        if ( bh_name == "FixedValue" )
+        {
+            bh = GT_BOUNDARY_CONDITION_FIXEDVALUE;
+        }
+        else if ( bh_name == "BorderValue" )
+        {
+            bh = GT_BOUNDARY_CONDITION_BORDERVALUE;
+        }
+        else if ( bh_name == "Periodic" )
+        {
+            bh = GT_BOUNDARY_CONDITION_PERIODIC;
+        }
+        else if ( bh_name == "Mirror" )
+        {
+            bh = GT_BOUNDARY_CONDITION_MIRROR;
+        }
+        else
+        {
+            GERROR_STREAM("Unrecognized boundary handler name : " << bh_name);
+        }
+
+        return bh;
+    }
+
+    template <typename ArrayType>
+    class hoNDBoundaryHandler
+    {
+    public:
+
+        typedef hoNDBoundaryHandler<ArrayType> Self;
+        typedef typename ArrayType::value_type T;
+
+        // enum { D = ArrayType::D }
+
+        hoNDBoundaryHandler() { array_ = NULL; }
+        hoNDBoundaryHandler(ArrayType& a) { array_ = &a; }
+        virtual ~hoNDBoundaryHandler() { array_ = NULL ; }
+
+        /// access the pixel value
+        virtual T operator()( const std::vector<long long>& ind ) = 0;
+        virtual T operator()( long long x ) = 0;
+        virtual T operator()( long long x, long long y ) = 0;
+        virtual T operator()( long long x, long long y, long long z ) = 0;
+        virtual T operator()( long long x, long long y, long long z, long long s ) = 0;
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p ) = 0;
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p, long long r ) = 0;
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a ) = 0;
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a, long long q ) = 0;
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a, long long q, long long u ) = 0;
+
+        void setArray(ArrayType& a) { array_ = &a; };
+
+        /// return a modulo b, mapped into [0, b) when b > 0
+        inline long long mod(long long a, long long b)
+        {
+            a %= b;
+
+            if ( a<0 && b>0 )
+            {
+                a += b;
+            }
+
+            return a;
+        }
+
+    protected:
+
+        ArrayType* array_;
+    };
+
+    template <typename ArrayType>
+    class hoNDBoundaryHandlerFixedValue : public hoNDBoundaryHandler<ArrayType>
+    {
+    public:
+
+        typedef hoNDBoundaryHandler<ArrayType> BaseClass;
+        typedef hoNDBoundaryHandlerFixedValue<ArrayType> Self;
+        typedef typename BaseClass::T T;
+
+        hoNDBoundaryHandlerFixedValue(T v=0) : BaseClass(), value_(v) {}
+        hoNDBoundaryHandlerFixedValue(ArrayType& a, T v=T(0)) : BaseClass(a), value_(v) {}
+        virtual ~hoNDBoundaryHandlerFixedValue() {}
+
+        /// access the pixel value
+        virtual T operator()( const std::vector<long long>& ind );
+        virtual T operator()( long long x );
+        virtual T operator()( long long x, long long y );
+        virtual T operator()( long long x, long long y, long long z );
+        virtual T operator()( long long x, long long y, long long z, long long s );
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p );
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p, long long r );
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a );
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a, long long q );
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a, long long q, long long u );
+
+    protected:
+        using BaseClass::array_;
+        T value_;
+    };
+
+    template <typename ArrayType>
+    class hoNDBoundaryHandlerBorderValue : public hoNDBoundaryHandler<ArrayType>
+    {
+    public:
+
+        typedef hoNDBoundaryHandler<ArrayType> BaseClass;
+        typedef hoNDBoundaryHandlerBorderValue<ArrayType> Self;
+        typedef typename BaseClass::T T;
+
+        hoNDBoundaryHandlerBorderValue() : BaseClass() {}
+        hoNDBoundaryHandlerBorderValue(ArrayType& a) : BaseClass(a) {}
+        virtual ~hoNDBoundaryHandlerBorderValue() {}
+
+        /// access the pixel value
+        virtual T operator()( const std::vector<long long>& ind );
+        virtual T operator()( long long x );
+        virtual T operator()( long long x, long long y );
+        virtual T operator()( long long x, long long y, long long z );
+        virtual T operator()( long long x, long long y, long long z, long long s );
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p );
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p, long long r );
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a );
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a, long long q );
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a, long long q, long long u );
+
+    protected:
+        using BaseClass::array_;
+    };
+
+    template <typename ArrayType>
+    class hoNDBoundaryHandlerPeriodic : public hoNDBoundaryHandler<ArrayType>
+    {
+    public:
+
+        typedef hoNDBoundaryHandler<ArrayType> BaseClass;
+        typedef hoNDBoundaryHandlerPeriodic<ArrayType> Self;
+        typedef typename BaseClass::T T;
+
+        hoNDBoundaryHandlerPeriodic() : BaseClass() {}
+        hoNDBoundaryHandlerPeriodic(ArrayType& a) : BaseClass(a) {}
+        virtual ~hoNDBoundaryHandlerPeriodic() {}
+
+        /// access the pixel value
+        virtual T operator()( const std::vector<long long>& ind );
+        virtual T operator()( long long x );
+        virtual T operator()( long long x, long long y );
+        virtual T operator()( long long x, long long y, long long z );
+        virtual T operator()( long long x, long long y, long long z, long long s );
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p );
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p, long long r );
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a );
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a, long long q );
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a, long long q, long long u );
+
+    protected:
+        using BaseClass::array_;
+    };
+
+    template <typename ArrayType>
+    class hoNDBoundaryHandlerMirror : public hoNDBoundaryHandler<ArrayType>
+    {
+    public:
+
+        typedef hoNDBoundaryHandler<ArrayType> BaseClass;
+        typedef hoNDBoundaryHandlerMirror<ArrayType> Self;
+        typedef typename BaseClass::T T;
+
+        hoNDBoundaryHandlerMirror() : BaseClass() {}
+        hoNDBoundaryHandlerMirror(ArrayType& a) : BaseClass(a) {}
+        virtual ~hoNDBoundaryHandlerMirror() {}
+
+        /// access the pixel value
+        virtual T operator()( const std::vector<long long>& ind );
+        virtual T operator()( long long x );
+        virtual T operator()( long long x, long long y );
+        virtual T operator()( long long x, long long y, long long z );
+        virtual T operator()( long long x, long long y, long long z, long long s );
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p );
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p, long long r );
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a );
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a, long long q );
+        virtual T operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a, long long q, long long u );
+
+    protected:
+        using BaseClass::array_;
+    };
+
+    template <typename ArrayType> 
+    hoNDBoundaryHandler<ArrayType>* createBoundaryHandler(GT_BOUNDARY_CONDITION bh)
+    {
+        hoNDBoundaryHandler<ArrayType>* res=NULL;
+
+        switch (bh)
+        {
+            case GT_BOUNDARY_CONDITION_FIXEDVALUE:
+                res = new hoNDBoundaryHandlerFixedValue<ArrayType>();
+                break;
+
+            case GT_BOUNDARY_CONDITION_BORDERVALUE:
+                res = new hoNDBoundaryHandlerBorderValue<ArrayType>();
+                break;
+
+            case GT_BOUNDARY_CONDITION_PERIODIC:
+                res = new hoNDBoundaryHandlerPeriodic<ArrayType>();
+                break;
+
+            case GT_BOUNDARY_CONDITION_MIRROR:
+                res = new hoNDBoundaryHandlerMirror<ArrayType>();
+                break;
+
+            default:
+                GERROR_STREAM("Unrecognized boundary handler type : " << bh);
+        }
+
+        return res;
+    }
+}
+
+#include "hoNDBoundaryHandler.hxx"
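
A minimal usage sketch for the handlers declared above, assuming a 2-D hoNDArray<float> as the ArrayType (it must provide the get_size() and element-access operators the handlers call; the indices below are illustrative):

    std::vector<size_t> dims(2, 16);
    hoNDArray<float> img(&dims);

    // out-of-range reads are clamped to the nearest border pixel
    hoNDBoundaryHandlerBorderValue< hoNDArray<float> > border(img);
    float v0 = border(-3, 5);     // same element as img(0, 5)

    // or create a handler by enum through the factory (the caller owns the returned pointer)
    hoNDBoundaryHandler< hoNDArray<float> >* bh =
        createBoundaryHandler< hoNDArray<float> >(GT_BOUNDARY_CONDITION_PERIODIC);
    bh->setArray(img);
    float v1 = (*bh)(17, 5);      // wraps around: same element as img(1, 5)
    delete bh;
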
diff --git a/toolboxes/core/cpu/hoNDBoundaryHandler.hxx b/toolboxes/core/cpu/hoNDBoundaryHandler.hxx
new file mode 100644
index 0000000..62405e8
--- /dev/null
+++ b/toolboxes/core/cpu/hoNDBoundaryHandler.hxx
@@ -0,0 +1,497 @@
+/** \file       hoNDBoundaryHandler.hxx
+    \brief      N-dimensional boundary condition handler
+
+                Designed to work with hoNDArray and hoNDImage
+
+    \author     Hui Xue
+*/
+
+namespace Gadgetron
+{
+    /// hoNDBoundaryHandlerFixedValue
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerFixedValue<ArrayType>::T hoNDBoundaryHandlerFixedValue<ArrayType>::operator()( const std::vector<long long>& ind )
+    {
+        size_t D = ind.size();
+
+        bool inside = true;
+
+        size_t d;
+        for ( d=0; d<D; d++)
+        {
+            if ( (ind[d]<0) || (ind[d]>=(long long)array_->get_size(d)) )
+            {
+                inside = false;
+                break;
+            }
+        }
+
+        if (inside)
+        {
+            size_t offset = ind[0];
+            for ( d=1; d<D; d++ )
+            {
+                offset += ind[d]*array_->get_offset_factor(d);
+            }
+
+            return (*array_)(offset);
+        }
+        else
+        {
+            return value_;
+        }
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerFixedValue<ArrayType>::T hoNDBoundaryHandlerFixedValue<ArrayType>::operator()( long long x )
+    {
+        return ((x >= 0) && array_->point_in_range( size_t(x) )) ? (*array_)( size_t(x) ) : value_;
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerFixedValue<ArrayType>::T hoNDBoundaryHandlerFixedValue<ArrayType>::operator()( long long x, long long y )
+    {
+        return ((x >=0 ) && (y >= 0) && array_->point_in_range(size_t(x), size_t(y) )) ? (*array_)(size_t(x), size_t(y)) : value_;
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerFixedValue<ArrayType>::T hoNDBoundaryHandlerFixedValue<ArrayType>::operator()( long long x, long long y, long long z )
+    {
+        return ((x >= 0) && (y >= 0) && (z >= 0) && array_->point_in_range(size_t(x), size_t(y), size_t(z))) ? (*array_)(size_t(x), size_t(y), size_t(z)) : value_;
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerFixedValue<ArrayType>::T hoNDBoundaryHandlerFixedValue<ArrayType>::operator()( long long x, long long y, long long z, long long s )
+    {
+        return ((x >= 0) && (y >= 0) && (z >= 0) && (s >= 0) && array_->point_in_range(size_t(x), size_t(y), size_t(z), size_t(s))) ? (*array_)(size_t(x), size_t(y), size_t(z), size_t(s)) : value_;
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerFixedValue<ArrayType>::T hoNDBoundaryHandlerFixedValue<ArrayType>::operator()( long long x, long long y, long long z, long long s, long long p )
+    {
+        return ((x >= 0) && (y >= 0) && (z >= 0) && (s >= 0) && (p >= 0) && array_->point_in_range(size_t(x), size_t(y), size_t(z), size_t(s), size_t(p))) ? (*array_)(size_t(x), size_t(y), size_t(z), size_t(s), size_t(p)) : value_;
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerFixedValue<ArrayType>::T hoNDBoundaryHandlerFixedValue<ArrayType>::operator()( long long x, long long y, long long z, long long s, long long p, long long r )
+    {
+        return ((x >= 0) && (y >= 0) && (z >= 0) && (s >= 0) && (p >= 0) && (r >= 0) && array_->point_in_range(size_t(x), size_t(y), size_t(z), size_t(s), size_t(p), size_t(r))) ? (*array_)(size_t(x), size_t(y), size_t(z), size_t(s), size_t(p), size_t(r)) : value_;
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerFixedValue<ArrayType>::T hoNDBoundaryHandlerFixedValue<ArrayType>::operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a )
+    {
+        return ((x >= 0) && (y >= 0) && (z >= 0) && (s >= 0) && (p >= 0) && (r >= 0) && (a >= 0) && array_->point_in_range(size_t(x), size_t(y), size_t(z), size_t(s), size_t(p), size_t(r), size_t(a))) ? (*array_)(size_t(x), size_t(y), size_t(z), size_t(s), size_t(p), size_t(r), size_t(a)) : value_;
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerFixedValue<ArrayType>::T hoNDBoundaryHandlerFixedValue<ArrayType>::operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a, long long q )
+    {
+        return ((x >= 0) && (y >= 0) && (z >= 0) && (s >= 0) && (p >= 0) && (r >= 0) && (a >= 0) && (q >= 0) && array_->point_in_range(size_t(x), size_t(y), size_t(z), size_t(s), size_t(p), size_t(r), size_t(a), size_t(q))) ? (*array_)(size_t(x), size_t(y), size_t(z), size_t(s), size_t(p), size_t(r), size_t(a), size_t(q)) : value_;
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerFixedValue<ArrayType>::T hoNDBoundaryHandlerFixedValue<ArrayType>::operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a, long long q, long long u )
+    {
+        return ((x >= 0) && (y >= 0) && (z >= 0) && (s >= 0) && (p >= 0) && (r >= 0) && (a >= 0) && (q >= 0) && (u >= 0) && array_->point_in_range(size_t(x), size_t(y), size_t(z), size_t(s), size_t(p), size_t(r), size_t(a), size_t(q), size_t(u))) ? (*array_)(size_t(x), size_t(y), size_t(z), size_t(s), size_t(p), size_t(r), size_t(a), size_t(q), size_t(u)) : value_;
+    }
+
+    /// hoNDBoundaryHandlerBorderValue
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerBorderValue<ArrayType>::T hoNDBoundaryHandlerBorderValue<ArrayType>::operator()( const std::vector<long long>& ind )
+    {
+        std::vector<size_t> indInside(array_->get_number_of_dimensions());
+        unsigned int D = array_->get_number_of_dimensions();
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            if ( ind[ii] < 0 )
+            {
+                indInside[ii] = 0;
+            }
+            else if ( ind[ii] >= (long long)array_->get_size(ii) )
+            {
+                indInside[ii] = array_->get_size(ii)-1;
+            }
+            else
+            {
+                indInside[ii] = ind[ii];
+            }
+        }
+        return (*array_)(indInside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerBorderValue<ArrayType>::T hoNDBoundaryHandlerBorderValue<ArrayType>::operator()( long long x )
+    {
+        size_t x_inside = (x<0) ? 0 : ( (x>=(long long)array_->get_size(0)) ? array_->get_size(0)-1 : x );
+
+        return (*array_)(x_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerBorderValue<ArrayType>::T hoNDBoundaryHandlerBorderValue<ArrayType>::operator()( long long x, long long y )
+    {
+        size_t x_inside = (x<0) ? 0 : ( (x>=(long long)array_->get_size(0)) ? array_->get_size(0)-1 : x );
+        size_t y_inside = (y<0) ? 0 : ( (y>=(long long)array_->get_size(1)) ? array_->get_size(1)-1 : y );
+
+        return (*array_)(x_inside, y_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerBorderValue<ArrayType>::T hoNDBoundaryHandlerBorderValue<ArrayType>::operator()( long long x, long long y, long long z )
+    {
+        size_t x_inside = (x<0) ? 0 : ( (x>=(long long)array_->get_size(0)) ? array_->get_size(0)-1 : x );
+        size_t y_inside = (y<0) ? 0 : ( (y>=(long long)array_->get_size(1)) ? array_->get_size(1)-1 : y );
+        size_t z_inside = (z<0) ? 0 : ( (z>=(long long)array_->get_size(2)) ? array_->get_size(2)-1 : z );
+
+        return (*array_)(x_inside, y_inside, z_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerBorderValue<ArrayType>::T hoNDBoundaryHandlerBorderValue<ArrayType>::operator()( long long x, long long y, long long z, long long s )
+    {
+        size_t x_inside = (x<0) ? 0 : ( (x>=(long long)array_->get_size(0)) ? array_->get_size(0)-1 : x );
+        size_t y_inside = (y<0) ? 0 : ( (y>=(long long)array_->get_size(1)) ? array_->get_size(1)-1 : y );
+        size_t z_inside = (z<0) ? 0 : ( (z>=(long long)array_->get_size(2)) ? array_->get_size(2)-1 : z );
+        size_t s_inside = (s<0) ? 0 : ( (s>=(long long)array_->get_size(3)) ? array_->get_size(3)-1 : s );
+
+        return (*array_)(x_inside, y_inside, z_inside, s_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerBorderValue<ArrayType>::T hoNDBoundaryHandlerBorderValue<ArrayType>::operator()( long long x, long long y, long long z, long long s, long long p )
+    {
+        size_t x_inside = (x<0) ? 0 : ( (x>=(long long)array_->get_size(0)) ? array_->get_size(0)-1 : x );
+        size_t y_inside = (y<0) ? 0 : ( (y>=(long long)array_->get_size(1)) ? array_->get_size(1)-1 : y );
+        size_t z_inside = (z<0) ? 0 : ( (z>=(long long)array_->get_size(2)) ? array_->get_size(2)-1 : z );
+        size_t s_inside = (s<0) ? 0 : ( (s>=(long long)array_->get_size(3)) ? array_->get_size(3)-1 : s );
+        size_t p_inside = (p<0) ? 0 : ( (p>=(long long)array_->get_size(4)) ? array_->get_size(4)-1 : p );
+
+        return (*array_)(x_inside, y_inside, z_inside, s_inside, p_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerBorderValue<ArrayType>::T hoNDBoundaryHandlerBorderValue<ArrayType>::operator()( long long x, long long y, long long z, long long s, long long p, long long r )
+    {
+        size_t x_inside = (x<0) ? 0 : ( (x>=(long long)array_->get_size(0)) ? array_->get_size(0)-1 : x );
+        size_t y_inside = (y<0) ? 0 : ( (y>=(long long)array_->get_size(1)) ? array_->get_size(1)-1 : y );
+        size_t z_inside = (z<0) ? 0 : ( (z>=(long long)array_->get_size(2)) ? array_->get_size(2)-1 : z );
+        size_t s_inside = (s<0) ? 0 : ( (s>=(long long)array_->get_size(3)) ? array_->get_size(3)-1 : s );
+        size_t p_inside = (p<0) ? 0 : ( (p>=(long long)array_->get_size(4)) ? array_->get_size(4)-1 : p );
+        size_t r_inside = (r<0) ? 0 : ( (r>=(long long)array_->get_size(5)) ? array_->get_size(5)-1 : r );
+
+        return (*array_)(x_inside, y_inside, z_inside, s_inside, p_inside, r_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerBorderValue<ArrayType>::T hoNDBoundaryHandlerBorderValue<ArrayType>::operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a )
+    {
+        size_t x_inside = (x<0) ? 0 : ( (x>=(long long)array_->get_size(0)) ? array_->get_size(0)-1 : x );
+        size_t y_inside = (y<0) ? 0 : ( (y>=(long long)array_->get_size(1)) ? array_->get_size(1)-1 : y );
+        size_t z_inside = (z<0) ? 0 : ( (z>=(long long)array_->get_size(2)) ? array_->get_size(2)-1 : z );
+        size_t s_inside = (s<0) ? 0 : ( (s>=(long long)array_->get_size(3)) ? array_->get_size(3)-1 : s );
+        size_t p_inside = (p<0) ? 0 : ( (p>=(long long)array_->get_size(4)) ? array_->get_size(4)-1 : p );
+        size_t r_inside = (r<0) ? 0 : ( (r>=(long long)array_->get_size(5)) ? array_->get_size(5)-1 : r );
+        size_t a_inside = (a<0) ? 0 : ( (a>=(long long)array_->get_size(6)) ? array_->get_size(6)-1 : a );
+
+        return (*array_)(x_inside, y_inside, z_inside, s_inside, p_inside, r_inside, a_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerBorderValue<ArrayType>::T hoNDBoundaryHandlerBorderValue<ArrayType>::operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a, long long q )
+    {
+        size_t x_inside = (x<0) ? 0 : ( (x>=(long long)array_->get_size(0)) ? array_->get_size(0)-1 : x );
+        size_t y_inside = (y<0) ? 0 : ( (y>=(long long)array_->get_size(1)) ? array_->get_size(1)-1 : y );
+        size_t z_inside = (z<0) ? 0 : ( (z>=(long long)array_->get_size(2)) ? array_->get_size(2)-1 : z );
+        size_t s_inside = (s<0) ? 0 : ( (s>=(long long)array_->get_size(3)) ? array_->get_size(3)-1 : s );
+        size_t p_inside = (p<0) ? 0 : ( (p>=(long long)array_->get_size(4)) ? array_->get_size(4)-1 : p );
+        size_t r_inside = (r<0) ? 0 : ( (r>=(long long)array_->get_size(5)) ? array_->get_size(5)-1 : r );
+        size_t a_inside = (a<0) ? 0 : ( (a>=(long long)array_->get_size(6)) ? array_->get_size(6)-1 : a );
+        size_t q_inside = (q<0) ? 0 : ( (q>=(long long)array_->get_size(7)) ? array_->get_size(7)-1 : q );
+
+        return (*array_)(x_inside, y_inside, z_inside, s_inside, p_inside, r_inside, a_inside, q_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerBorderValue<ArrayType>::T hoNDBoundaryHandlerBorderValue<ArrayType>::operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a, long long q, long long u )
+    {
+        size_t x_inside = (x<0) ? 0 : ( (x>=(long long)array_->get_size(0)) ? array_->get_size(0)-1 : x );
+        size_t y_inside = (y<0) ? 0 : ( (y>=(long long)array_->get_size(1)) ? array_->get_size(1)-1 : y );
+        size_t z_inside = (z<0) ? 0 : ( (z>=(long long)array_->get_size(2)) ? array_->get_size(2)-1 : z );
+        size_t s_inside = (s<0) ? 0 : ( (s>=(long long)array_->get_size(3)) ? array_->get_size(3)-1 : s );
+        size_t p_inside = (p<0) ? 0 : ( (p>=(long long)array_->get_size(4)) ? array_->get_size(4)-1 : p );
+        size_t r_inside = (r<0) ? 0 : ( (r>=(long long)array_->get_size(5)) ? array_->get_size(5)-1 : r );
+        size_t a_inside = (a<0) ? 0 : ( (a>=(long long)array_->get_size(6)) ? array_->get_size(6)-1 : a );
+        size_t q_inside = (q<0) ? 0 : ( (q>=(long long)array_->get_size(7)) ? array_->get_size(7)-1 : q );
+        size_t u_inside = (u<0) ? 0 : ( (u>=(long long)array_->get_size(8)) ? array_->get_size(8)-1 : u );
+
+        return (*array_)(x_inside, y_inside, z_inside, s_inside, p_inside, r_inside, a_inside, q_inside, u_inside);
+    }
+
+    /// hoNDBoundaryHandlerPeriodic
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerPeriodic<ArrayType>::T hoNDBoundaryHandlerPeriodic<ArrayType>::operator()( const std::vector<long long>& ind )
+    {
+        unsigned int D = (unsigned int)array_->get_number_of_dimensions();
+        std::vector<size_t> indInside(D);
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            if ( (ind[ii]<0) || (ind[ii]>=(long long)array_->get_size(ii)) )
+            {
+                indInside[ii] = this->mod(ind[ii], array_->get_size(ii));
+            }
+            else
+            {
+                indInside[ii] = ind[ii];
+            }
+        }
+        return (*array_)(indInside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerPeriodic<ArrayType>::T hoNDBoundaryHandlerPeriodic<ArrayType>::operator()( long long x )
+    {
+        size_t x_inside = (x<0 || x>=(long long)array_->get_size(0)) ? (this->mod(x, (long long)array_->get_size(0))) : x;
+
+        return (*array_)(x_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerPeriodic<ArrayType>::T hoNDBoundaryHandlerPeriodic<ArrayType>::operator()( long long x, long long y )
+    {
+        size_t x_inside = (x<0 || x>=(long long)array_->get_size(0)) ? (this->mod(x, (long long)array_->get_size(0))) : x;
+        size_t y_inside = (y<0 || y>=(long long)array_->get_size(1)) ? (this->mod(y, (long long)array_->get_size(1))) : y;
+
+        return (*array_)(x_inside, y_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerPeriodic<ArrayType>::T hoNDBoundaryHandlerPeriodic<ArrayType>::operator()( long long x, long long y, long long z )
+    {
+        size_t x_inside = (x<0 || x>=(long long)array_->get_size(0)) ? (this->mod(x, (long long)array_->get_size(0))) : x;
+        size_t y_inside = (y<0 || y>=(long long)array_->get_size(1)) ? (this->mod(y, (long long)array_->get_size(1))) : y;
+        size_t z_inside = (z<0 || z>=(long long)array_->get_size(2)) ? (this->mod(z, (long long)array_->get_size(2))) : z;
+
+        return (*array_)(x_inside, y_inside, z_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerPeriodic<ArrayType>::T hoNDBoundaryHandlerPeriodic<ArrayType>::operator()( long long x, long long y, long long z, long long s )
+    {
+        size_t x_inside = (x<0 || x>=(long long)array_->get_size(0)) ? (this->mod(x, (long long)array_->get_size(0))) : x;
+        size_t y_inside = (y<0 || y>=(long long)array_->get_size(1)) ? (this->mod(y, (long long)array_->get_size(1))) : y;
+        size_t z_inside = (z<0 || z>=(long long)array_->get_size(2)) ? (this->mod(z, (long long)array_->get_size(2))) : z;
+        size_t s_inside = (s<0 || s>=(long long)array_->get_size(3)) ? (this->mod(s, (long long)array_->get_size(3))) : s;
+
+        return (*array_)(x_inside, y_inside, z_inside, s_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerPeriodic<ArrayType>::T hoNDBoundaryHandlerPeriodic<ArrayType>::operator()( long long x, long long y, long long z, long long s, long long p )
+    {
+        size_t x_inside = (x<0 || x>=(long long)array_->get_size(0)) ? (this->mod(x, (long long)array_->get_size(0))) : x;
+        size_t y_inside = (y<0 || y>=(long long)array_->get_size(1)) ? (this->mod(y, (long long)array_->get_size(1))) : y;
+        size_t z_inside = (z<0 || z>=(long long)array_->get_size(2)) ? (this->mod(z, (long long)array_->get_size(2))) : z;
+        size_t s_inside = (s<0 || s>=(long long)array_->get_size(3)) ? (this->mod(s, (long long)array_->get_size(3))) : s;
+        size_t p_inside = (p<0 || p>=(long long)array_->get_size(4)) ? (this->mod(p, (long long)array_->get_size(4))) : p;
+
+        return (*array_)(x_inside, y_inside, z_inside, s_inside, p_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerPeriodic<ArrayType>::T hoNDBoundaryHandlerPeriodic<ArrayType>::operator()( long long x, long long y, long long z, long long s, long long p, long long r )
+    {
+        size_t x_inside = (x<0 || x>=(long long)array_->get_size(0)) ? (this->mod(x, (long long)array_->get_size(0))) : x;
+        size_t y_inside = (y<0 || y>=(long long)array_->get_size(1)) ? (this->mod(y, (long long)array_->get_size(1))) : y;
+        size_t z_inside = (z<0 || z>=(long long)array_->get_size(2)) ? (this->mod(z, (long long)array_->get_size(2))) : z;
+        size_t s_inside = (s<0 || s>=(long long)array_->get_size(3)) ? (this->mod(s, (long long)array_->get_size(3))) : s;
+        size_t p_inside = (p<0 || p>=(long long)array_->get_size(4)) ? (this->mod(p, (long long)array_->get_size(4))) : p;
+        size_t r_inside = (r<0 || r>=(long long)array_->get_size(5)) ? (this->mod(r, (long long)array_->get_size(5))) : r;
+
+        return (*array_)(x_inside, y_inside, z_inside, s_inside, p_inside, r_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerPeriodic<ArrayType>::T hoNDBoundaryHandlerPeriodic<ArrayType>::operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a )
+    {
+        size_t x_inside = (x<0 || x>=(long long)array_->get_size(0)) ? (this->mod(x, (long long)array_->get_size(0))) : x;
+        size_t y_inside = (y<0 || y>=(long long)array_->get_size(1)) ? (this->mod(y, (long long)array_->get_size(1))) : y;
+        size_t z_inside = (z<0 || z>=(long long)array_->get_size(2)) ? (this->mod(z, (long long)array_->get_size(2))) : z;
+        size_t s_inside = (s<0 || s>=(long long)array_->get_size(3)) ? (this->mod(s, (long long)array_->get_size(3))) : s;
+        size_t p_inside = (p<0 || p>=(long long)array_->get_size(4)) ? (this->mod(p, (long long)array_->get_size(4))) : p;
+        size_t r_inside = (r<0 || r>=(long long)array_->get_size(5)) ? (this->mod(r, (long long)array_->get_size(5))) : r;
+        size_t a_inside = (a<0 || a>=(long long)array_->get_size(6)) ? (this->mod(a, (long long)array_->get_size(6))) : a;
+
+        return (*array_)(x_inside, y_inside, z_inside, s_inside, p_inside, r_inside, a_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerPeriodic<ArrayType>::T hoNDBoundaryHandlerPeriodic<ArrayType>::operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a, long long q )
+    {
+        size_t x_inside = (x<0 || x>=(long long)array_->get_size(0)) ? (this->mod(x, (long long)array_->get_size(0))) : x;
+        size_t y_inside = (y<0 || y>=(long long)array_->get_size(1)) ? (this->mod(y, (long long)array_->get_size(1))) : y;
+        size_t z_inside = (z<0 || z>=(long long)array_->get_size(2)) ? (this->mod(z, (long long)array_->get_size(2))) : z;
+        size_t s_inside = (s<0 || s>=(long long)array_->get_size(3)) ? (this->mod(s, (long long)array_->get_size(3))) : s;
+        size_t p_inside = (p<0 || p>=(long long)array_->get_size(4)) ? (this->mod(p, (long long)array_->get_size(4))) : p;
+        size_t r_inside = (r<0 || r>=(long long)array_->get_size(5)) ? (this->mod(r, (long long)array_->get_size(5))) : r;
+        size_t a_inside = (a<0 || a>=(long long)array_->get_size(6)) ? (this->mod(a, (long long)array_->get_size(6))) : a;
+        size_t q_inside = (q<0 || q>=(long long)array_->get_size(7)) ? (this->mod(q, (long long)array_->get_size(7))) : q;
+
+        return (*array_)(x_inside, y_inside, z_inside, s_inside, p_inside, r_inside, a_inside, q_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerPeriodic<ArrayType>::T hoNDBoundaryHandlerPeriodic<ArrayType>::operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a, long long q, long long u )
+    {
+        size_t x_inside = (x<0 || x>=(long long)array_->get_size(0)) ? (this->mod(x, (long long)array_->get_size(0))) : x;
+        size_t y_inside = (y<0 || y>=(long long)array_->get_size(1)) ? (this->mod(y, (long long)array_->get_size(1))) : y;
+        size_t z_inside = (z<0 || z>=(long long)array_->get_size(2)) ? (this->mod(z, (long long)array_->get_size(2))) : z;
+        size_t s_inside = (s<0 || s>=(long long)array_->get_size(3)) ? (this->mod(s, (long long)array_->get_size(3))) : s;
+        size_t p_inside = (p<0 || p>=(long long)array_->get_size(4)) ? (this->mod(p, (long long)array_->get_size(4))) : p;
+        size_t r_inside = (r<0 || r>=(long long)array_->get_size(5)) ? (this->mod(r, (long long)array_->get_size(5))) : r;
+        size_t a_inside = (a<0 || a>=(long long)array_->get_size(6)) ? (this->mod(a, (long long)array_->get_size(6))) : a;
+        size_t q_inside = (q<0 || q>=(long long)array_->get_size(7)) ? (this->mod(q, (long long)array_->get_size(7))) : q;
+        size_t u_inside = (u<0 || u>=(long long)array_->get_size(8)) ? (this->mod(u, (long long)array_->get_size(8))) : u;
+
+        return (*array_)(x_inside, y_inside, z_inside, s_inside, p_inside, r_inside, a_inside, q_inside, u_inside);
+    }
+
+    /// hoNDBoundaryHandlerMirror
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerMirror<ArrayType>::T hoNDBoundaryHandlerMirror<ArrayType>::operator()( const std::vector<long long>& ind )
+    {
+        unsigned int D = array_->get_number_of_dimensions();
+        std::vector<size_t> indInside(D);
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
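+            // reflect out-of-range indices about the first/last sample:
+            // ind < 0 maps to -ind, and ind >= N maps to 2*N - ind - 2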
+            if ( ind[ii] < 0 )
+            {
+                indInside[ii] = -ind[ii];
+            }
+            else if ( ind[ii] >= (long long)array_->get_size(ii) )
+            {
+                indInside[ii] = 2*(long long)array_->get_size(ii) - ind[ii] -2;
+            }
+            else
+            {
+                indInside[ii] = ind[ii];
+            }
+        }
+        return (*array_)(indInside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerMirror<ArrayType>::T hoNDBoundaryHandlerMirror<ArrayType>::operator()( long long x )
+    {
+        size_t x_inside = (x<0) ? -x : ( (x>=(long long)array_->get_size(0)) ? (2*array_->get_size(0)-x-2) : x );
+
+        return (*array_)(x_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerMirror<ArrayType>::T hoNDBoundaryHandlerMirror<ArrayType>::operator()( long long x, long long y )
+    {
+        size_t x_inside = (x<0) ? -x : ( (x>=(long long)array_->get_size(0)) ? (2*array_->get_size(0)-x-2) : x );
+        size_t y_inside = (y<0) ? -y : ( (y>=(long long)array_->get_size(1)) ? (2*array_->get_size(1)-y-2) : y );
+
+        return (*array_)(x_inside, y_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerMirror<ArrayType>::T hoNDBoundaryHandlerMirror<ArrayType>::operator()( long long x, long long y, long long z )
+    {
+        size_t x_inside = (x<0) ? -x : ( (x>=(long long)array_->get_size(0)) ? (2*array_->get_size(0)-x-2) : x );
+        size_t y_inside = (y<0) ? -y : ( (y>=(long long)array_->get_size(1)) ? (2*array_->get_size(1)-y-2) : y );
+        size_t z_inside = (z<0) ? -z : ( (z>=(long long)array_->get_size(2)) ? (2*array_->get_size(2)-z-2) : z );
+
+        return (*array_)(x_inside, y_inside, z_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerMirror<ArrayType>::T hoNDBoundaryHandlerMirror<ArrayType>::operator()( long long x, long long y, long long z, long long s )
+    {
+        size_t x_inside = (x<0) ? -x : ( (x>=(long long)array_->get_size(0)) ? (2*array_->get_size(0)-x-2) : x );
+        size_t y_inside = (y<0) ? -y : ( (y>=(long long)array_->get_size(1)) ? (2*array_->get_size(1)-y-2) : y );
+        size_t z_inside = (z<0) ? -z : ( (z>=(long long)array_->get_size(2)) ? (2*array_->get_size(2)-z-2) : z );
+        size_t s_inside = (s<0) ? -s : ( (s>=(long long)array_->get_size(3)) ? (2*array_->get_size(3)-s-2) : s );
+
+        return (*array_)(x_inside, y_inside, z_inside, s_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerMirror<ArrayType>::T hoNDBoundaryHandlerMirror<ArrayType>::operator()( long long x, long long y, long long z, long long s, long long p )
+    {
+        size_t x_inside = (x<0) ? -x : ( (x>=(long long)array_->get_size(0)) ? (2*array_->get_size(0)-x-2) : x );
+        size_t y_inside = (y<0) ? -y : ( (y>=(long long)array_->get_size(1)) ? (2*array_->get_size(1)-y-2) : y );
+        size_t z_inside = (z<0) ? -z : ( (z>=(long long)array_->get_size(2)) ? (2*array_->get_size(2)-z-2) : z );
+        size_t s_inside = (s<0) ? -s : ( (s>=(long long)array_->get_size(3)) ? (2*array_->get_size(3)-s-2) : s );
+        size_t p_inside = (p<0) ? -p : ( (p>=(long long)array_->get_size(4)) ? (2*array_->get_size(4)-p-2) : p );
+
+        return (*array_)(x_inside, y_inside, z_inside, s_inside, p_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerMirror<ArrayType>::T hoNDBoundaryHandlerMirror<ArrayType>::operator()( long long x, long long y, long long z, long long s, long long p, long long r )
+    {
+        size_t x_inside = (x<0) ? -x : ( (x>=(long long)array_->get_size(0)) ? (2*array_->get_size(0)-x-2) : x );
+        size_t y_inside = (y<0) ? -y : ( (y>=(long long)array_->get_size(1)) ? (2*array_->get_size(1)-y-2) : y );
+        size_t z_inside = (z<0) ? -z : ( (z>=(long long)array_->get_size(2)) ? (2*array_->get_size(2)-z-2) : z );
+        size_t s_inside = (s<0) ? -s : ( (s>=(long long)array_->get_size(3)) ? (2*array_->get_size(3)-s-2) : s );
+        size_t p_inside = (p<0) ? -p : ( (p>=(long long)array_->get_size(4)) ? (2*array_->get_size(4)-p-2) : p );
+        size_t r_inside = (r<0) ? -r : ( (r>=(long long)array_->get_size(5)) ? (2*array_->get_size(5)-r-2) : r );
+
+        return (*array_)(x_inside, y_inside, z_inside, s_inside, p_inside, r_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerMirror<ArrayType>::T hoNDBoundaryHandlerMirror<ArrayType>::operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a )
+    {
+        size_t x_inside = (x<0) ? -x : ( (x>=(long long)array_->get_size(0)) ? (2*array_->get_size(0)-x-2) : x );
+        size_t y_inside = (y<0) ? -y : ( (y>=(long long)array_->get_size(1)) ? (2*array_->get_size(1)-y-2) : y );
+        size_t z_inside = (z<0) ? -z : ( (z>=(long long)array_->get_size(2)) ? (2*array_->get_size(2)-z-2) : z );
+        size_t s_inside = (s<0) ? -s : ( (s>=(long long)array_->get_size(3)) ? (2*array_->get_size(3)-s-2) : s );
+        size_t p_inside = (p<0) ? -p : ( (p>=(long long)array_->get_size(4)) ? (2*array_->get_size(4)-p-2) : p );
+        size_t r_inside = (r<0) ? -r : ( (r>=(long long)array_->get_size(5)) ? (2*array_->get_size(5)-r-2) : r );
+        size_t a_inside = (a<0) ? -a : ( (a>=(long long)array_->get_size(6)) ? (2*array_->get_size(6)-a-2) : a );
+
+        return (*array_)(x_inside, y_inside, z_inside, s_inside, p_inside, r_inside, a_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerMirror<ArrayType>::T hoNDBoundaryHandlerMirror<ArrayType>::operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a, long long q )
+    {
+        size_t x_inside = (x<0) ? -x : ( (x>=(long long)array_->get_size(0)) ? (2*array_->get_size(0)-x-2) : x );
+        size_t y_inside = (y<0) ? -y : ( (y>=(long long)array_->get_size(1)) ? (2*array_->get_size(1)-y-2) : y );
+        size_t z_inside = (z<0) ? -z : ( (z>=(long long)array_->get_size(2)) ? (2*array_->get_size(2)-z-2) : z );
+        size_t s_inside = (s<0) ? -s : ( (s>=(long long)array_->get_size(3)) ? (2*array_->get_size(3)-s-2) : s );
+        size_t p_inside = (p<0) ? -p : ( (p>=(long long)array_->get_size(4)) ? (2*array_->get_size(4)-p-2) : p );
+        size_t r_inside = (r<0) ? -r : ( (r>=(long long)array_->get_size(5)) ? (2*array_->get_size(5)-r-2) : r );
+        size_t a_inside = (a<0) ? -a : ( (a>=(long long)array_->get_size(6)) ? (2*array_->get_size(6)-a-2) : a );
+        size_t q_inside = (q<0) ? -q : ( (q>=(long long)array_->get_size(7)) ? (2*array_->get_size(7)-q-2) : q );
+
+        return (*array_)(x_inside, y_inside, z_inside, s_inside, p_inside, r_inside, a_inside, q_inside);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDBoundaryHandlerMirror<ArrayType>::T hoNDBoundaryHandlerMirror<ArrayType>::operator()( long long x, long long y, long long z, long long s, long long p, long long r, long long a, long long q, long long u )
+    {
+        size_t x_inside = (x<0) ? -x : ( (x>=(long long)array_->get_size(0)) ? (2*array_->get_size(0)-x-2) : x );
+        size_t y_inside = (y<0) ? -y : ( (y>=(long long)array_->get_size(1)) ? (2*array_->get_size(1)-y-2) : y );
+        size_t z_inside = (z<0) ? -z : ( (z>=(long long)array_->get_size(2)) ? (2*array_->get_size(2)-z-2) : z );
+        size_t s_inside = (s<0) ? -s : ( (s>=(long long)array_->get_size(3)) ? (2*array_->get_size(3)-s-2) : s );
+        size_t p_inside = (p<0) ? -p : ( (p>=(long long)array_->get_size(4)) ? (2*array_->get_size(4)-p-2) : p );
+        size_t r_inside = (r<0) ? -r : ( (r>=(long long)array_->get_size(5)) ? (2*array_->get_size(5)-r-2) : r );
+        size_t a_inside = (a<0) ? -a : ( (a>=(long long)array_->get_size(6)) ? (2*array_->get_size(6)-a-2) : a );
+        size_t q_inside = (q<0) ? -q : ( (q>=(long long)array_->get_size(7)) ? (2*array_->get_size(7)-q-2) : q );
+        size_t u_inside = (u<0) ? -u : ( (u>=(long long)array_->get_size(8)) ? (2*array_->get_size(8)-u-2) : u );
+
+        return (*array_)(x_inside, y_inside, z_inside, s_inside, p_inside, r_inside, a_inside, q_inside, u_inside);
+    }
+}
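
For orientation, a minimal self-contained sketch (illustrative only, not part
of the imported file) of how the two wrap rules implemented above map an
out-of-range one-dimensional index; the periodic handler's mod() helper is
assumed to return a non-negative remainder, and the mirror rule is the
2*N - x - 2 reflection used above:

    #include <cstdio>

    int main()
    {
        const long long N = 5;
        const long long samples[4] = { -2, -1, 5, 6 };

        for (int i = 0; i < 4; ++i)
        {
            long long x = samples[i];
            long long periodic = ((x % N) + N) % N;                           // wrap around
            long long mirror   = (x < 0) ? -x : ((x >= N) ? 2*N - x - 2 : x); // reflect at the edges
            std::printf("%lld -> periodic %lld, mirror %lld\n", x, periodic, mirror);
        }
        return 0;
    }
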
diff --git a/toolboxes/core/cpu/hoNDInterpolator.h b/toolboxes/core/cpu/hoNDInterpolator.h
new file mode 100644
index 0000000..b7a8ba2
--- /dev/null
+++ b/toolboxes/core/cpu/hoNDInterpolator.h
@@ -0,0 +1,307 @@
+/** \file       hoNDInterpolator.h
+    \brief      N-dimensional interpolator
+
+                Designed to work with hoNDArray and hoNDImage
+
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "hoNDArray.h"
+#include "hoNDImage.h"
+#include "hoNDBoundaryHandler.h"
+#include "hoNDBSpline.h"
+
+namespace Gadgetron
+{
+    // define the image interpolation methods
+    enum GT_IMAGE_INTERPOLATOR
+    {
+        GT_IMAGE_INTERPOLATOR_NEARESTNEIGHBOR=35642, // a magic number
+        GT_IMAGE_INTERPOLATOR_LINEAR,
+        GT_IMAGE_INTERPOLATOR_BSPLINE
+    };
+
+    inline std::string getInterpolatorName(GT_IMAGE_INTERPOLATOR interp)
+    {
+        std::string name;
+
+        switch (interp)
+        {
+            case GT_IMAGE_INTERPOLATOR_NEARESTNEIGHBOR:
+                name = "NearestNeighbor";
+                break;
+
+            case GT_IMAGE_INTERPOLATOR_LINEAR:
+                name = "Linear";
+                break;
+
+            case GT_IMAGE_INTERPOLATOR_BSPLINE:
+                name = "BSpline";
+                break;
+
+            default:
+                GERROR_STREAM("Unrecognized interpolator type : " << interp);
+        }
+
+        return name;
+    }
+
+    inline GT_IMAGE_INTERPOLATOR getInterpolatorType(const std::string& interp_name)
+    {
+        GT_IMAGE_INTERPOLATOR interp = GT_IMAGE_INTERPOLATOR_NEARESTNEIGHBOR; // fall-back so an unrecognized name does not return an uninitialized value
+
+        if ( interp_name == "NearestNeighbor" )
+        {
+            interp = GT_IMAGE_INTERPOLATOR_NEARESTNEIGHBOR;
+        }
+        else if ( interp_name == "Linear" )
+        {
+            interp = GT_IMAGE_INTERPOLATOR_LINEAR;
+        }
+        else if ( interp_name == "BSpline" )
+        {
+            interp = GT_IMAGE_INTERPOLATOR_BSPLINE;
+        }
+        else
+        {
+            GERROR_STREAM("Unrecognized interpolator name : " << interp_name);
+        }
+
+        return interp;
+    }
+
+    /// all interpolation calls must be made thread-safe
+    template <typename ArrayType>
+    class hoNDInterpolator
+    {
+    public:
+
+        typedef hoNDInterpolator<ArrayType> Self;
+        typedef typename ArrayType::value_type T;
+        typedef hoNDBoundaryHandler<ArrayType> BoundHanlderType;
+        typedef typename ArrayType::coord_type coord_type;
+
+        hoNDInterpolator() : array_(NULL), data_(NULL), bh_(NULL), sx_(0), sy_(0), sz_(0), st_(0) {}
+
+        hoNDInterpolator(ArrayType& a, BoundHanlderType& bh)
+        {
+            array_ = &a;
+            data_ = array_->begin();
+            bh_ = &bh; bh_->setArray(a);
+
+            sx_ = array_->get_size(0);
+            sy_ = array_->get_size(1);
+            sz_ = array_->get_size(2);
+            st_ = array_->get_size(3);
+        }
+
+        virtual ~hoNDInterpolator() { array_ = NULL; bh_ = NULL; }
+
+        virtual void setArray(ArrayType& a)
+        {
+            array_ = &a;
+            data_ = array_->begin();
+
+            sx_ = array_->get_size(0);
+            sy_ = array_->get_size(1);
+            sz_ = array_->get_size(2);
+            st_ = array_->get_size(3);
+        }
+
+        virtual void setBoundaryHandler(BoundHanlderType& bh) { bh_ = &bh; if ( array_!=NULL ) bh_->setArray(*array_); }
+
+        /// access the pixel value
+        virtual T operator()( const coord_type* pos ) = 0;
+        virtual T operator()( const std::vector<coord_type>& pos ) = 0;
+        virtual T operator()( coord_type x ) = 0;
+        virtual T operator()( coord_type x, coord_type y ) = 0;
+        virtual T operator()( coord_type x, coord_type y, coord_type z ) = 0;
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s ) = 0;
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p ) = 0;
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r ) = 0;
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a ) = 0;
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a, coord_type q ) = 0;
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a, coord_type q, coord_type u ) = 0;
+
+    protected:
+
+        ArrayType* array_;
+        T* data_;
+        BoundHanlderType* bh_;
+
+        size_t sx_;
+        size_t sy_;
+        size_t sz_;
+        size_t st_;
+    };
+
+    template <typename ArrayType>
+    class hoNDInterpolatorNearestNeighbor : public hoNDInterpolator<ArrayType>
+    {
+    public:
+
+        typedef hoNDInterpolator<ArrayType> BaseClass;
+        typedef hoNDInterpolatorNearestNeighbor<ArrayType> Self;
+        typedef typename BaseClass::T T;
+        typedef typename BaseClass::coord_type coord_type;
+        typedef typename BaseClass::BoundHanlderType BoundHanlderType;
+
+        hoNDInterpolatorNearestNeighbor() : BaseClass() {}
+        hoNDInterpolatorNearestNeighbor(ArrayType& a, BoundHanlderType& bh) : BaseClass(a, bh) {}
+        virtual ~hoNDInterpolatorNearestNeighbor() {}
+
+        /// access the pixel value
+        virtual T operator()( const coord_type* pos );
+        virtual T operator()( const std::vector<coord_type>& pos );
+        virtual T operator()( coord_type x );
+        virtual T operator()( coord_type x, coord_type y );
+        virtual T operator()( coord_type x, coord_type y, coord_type z );
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s );
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p );
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r );
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a );
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a, coord_type q );
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a, coord_type q, coord_type u );
+
+    protected:
+
+        using BaseClass::array_;
+        using BaseClass::data_;
+        using BaseClass::bh_;
+
+        using BaseClass::sx_;
+        using BaseClass::sy_;
+        using BaseClass::sz_;
+        using BaseClass::st_;
+    };
+
+    template <typename ArrayType>
+    class hoNDInterpolatorLinear : public hoNDInterpolator<ArrayType>
+    {
+    public:
+
+        typedef hoNDInterpolator<ArrayType> BaseClass;
+        typedef hoNDInterpolatorLinear<ArrayType> Self;
+        typedef typename BaseClass::T T;
+        typedef typename BaseClass::coord_type coord_type;
+        typedef typename BaseClass::BoundHanlderType BoundHanlderType;
+
+        hoNDInterpolatorLinear() : BaseClass(), number_of_points_(0) {}
+
+        hoNDInterpolatorLinear(ArrayType& a, BoundHanlderType& bh) : BaseClass(a, bh)
+        {
+            number_of_points_ = 1<<a.get_number_of_dimensions();
+        }
+
+        virtual ~hoNDInterpolatorLinear() {}
+
+        virtual void setArray(ArrayType& a)
+        {
+            BaseClass::setArray(a);
+            number_of_points_ = 1<<a.get_number_of_dimensions();
+        }
+
+        /// access the pixel value
+        virtual T operator()( const coord_type* pos );
+        virtual T operator()( const std::vector<coord_type>& pos );
+        virtual T operator()( coord_type x );
+        virtual T operator()( coord_type x, coord_type y );
+        virtual T operator()( coord_type x, coord_type y, coord_type z );
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s );
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p );
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r );
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a );
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a, coord_type q );
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a, coord_type q, coord_type u );
+
+    protected:
+
+        using BaseClass::array_;
+        using BaseClass::data_;
+        using BaseClass::bh_;
+
+        using BaseClass::sx_;
+        using BaseClass::sy_;
+        using BaseClass::sz_;
+        using BaseClass::st_;
+
+        // number of points involved in interpolation
+        unsigned int number_of_points_;
+    };
+
+    template <typename ArrayType, unsigned int D>
+    class hoNDInterpolatorBSpline : public hoNDInterpolator<ArrayType>
+    {
+    public:
+
+        typedef hoNDInterpolator<ArrayType> BaseClass;
+        typedef hoNDInterpolatorBSpline<ArrayType, D> Self;
+        typedef typename BaseClass::T T;
+        typedef typename BaseClass::coord_type coord_type;
+        typedef typename BaseClass::BoundHanlderType BoundHanlderType;
+
+        hoNDInterpolatorBSpline(unsigned int order=5) : BaseClass(), order_(order) { derivative_.resize(D, 0); }
+        hoNDInterpolatorBSpline(ArrayType& a, BoundHanlderType& bh, unsigned int order=5);
+        virtual ~hoNDInterpolatorBSpline();
+
+        virtual void setArray(ArrayType& a);
+
+        void setDerivative(const std::vector<unsigned int>& derivative) { GADGET_CHECK_THROW(derivative.size()>=D); derivative_ = derivative; }
+
+        /// access the pixel value
+        virtual T operator()( const coord_type* pos );
+        virtual T operator()( const std::vector<coord_type>& pos );
+        virtual T operator()( coord_type x );
+        virtual T operator()( coord_type x, coord_type y );
+        virtual T operator()( coord_type x, coord_type y, coord_type z );
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s );
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p );
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r );
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a );
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a, coord_type q );
+        virtual T operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a, coord_type q, coord_type u );
+
+     protected:
+
+        using BaseClass::array_;
+        using BaseClass::data_;
+        using BaseClass::bh_;
+
+        using BaseClass::sx_;
+        using BaseClass::sy_;
+        using BaseClass::sz_;
+        using BaseClass::st_;
+
+        hoNDBSpline<T, D> bspline_;
+        std::vector<size_t> dimension_;
+        std::vector<unsigned int> derivative_;
+        unsigned int order_;
+        hoNDArray<T> coeff_;
+    };
+
+    template <typename ArrayType, unsigned int D>
+    inline hoNDInterpolator<ArrayType>* createInterpolator(GT_IMAGE_INTERPOLATOR interp)
+    {
+        hoNDInterpolator<ArrayType>* res = NULL;
+
+        switch (interp)
+        {
+            case GT_IMAGE_INTERPOLATOR_NEARESTNEIGHBOR:
+                res = new hoNDInterpolatorNearestNeighbor<ArrayType>();
+                break;
+
+            case GT_IMAGE_INTERPOLATOR_LINEAR:
+                res = new hoNDInterpolatorLinear<ArrayType>();
+                break;
+
+            case GT_IMAGE_INTERPOLATOR_BSPLINE:
+                res = new hoNDInterpolatorBSpline<ArrayType, D>();
+                break;
+
+            default:
+                GERROR_STREAM("Unrecognized interpolator type : " << interp);
+        }
+
+        return res;
+    }
+}
+
+#include "hoNDInterpolatorNearestNeighbor.hxx"
+#include "hoNDInterpolatorLinear.hxx"
+#include "hoNDInterpolatorBSpline.hxx"
diff --git a/toolboxes/core/cpu/hoNDInterpolatorBSpline.hxx b/toolboxes/core/cpu/hoNDInterpolatorBSpline.hxx
new file mode 100644
index 0000000..1db1ce9
--- /dev/null
+++ b/toolboxes/core/cpu/hoNDInterpolatorBSpline.hxx
@@ -0,0 +1,339 @@
+/** \file       hoNDInterpolatorBSpline.hxx
+    \brief      N-dimensional BSpline interpolator
+
+                Designed to work with hoNDArray and hoNDImage
+
+    \author     Hui Xue
+*/
+
+namespace Gadgetron
+{
+    template <typename ArrayType, unsigned int D> 
+    hoNDInterpolatorBSpline<ArrayType, D>::hoNDInterpolatorBSpline(ArrayType& a, BoundHanlderType& bh, unsigned int order) : BaseClass(a, bh), order_(order)
+    {
+        bspline_.computeBSplineCoefficients(a, order_, this->coeff_);
+
+        dimension_.resize(D);
+
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            dimension_[ii] = a.get_size(ii);
+        }
+
+        derivative_.resize(D, 0);
+    }
+
+    template <typename ArrayType, unsigned int D> 
+    hoNDInterpolatorBSpline<ArrayType, D>::~hoNDInterpolatorBSpline()
+    {
+    }
+
+    template <typename ArrayType, unsigned int D> 
+    void hoNDInterpolatorBSpline<ArrayType, D>::setArray(ArrayType& a)
+    {
+        this->array_ = &a;
+
+        dimension_.resize(D);
+
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            dimension_[ii] = a.get_size(ii);
+        }
+
+        bspline_.computeBSplineCoefficients(a, this->order_, this->coeff_);
+    }
+
+    template <typename ArrayType, unsigned int D> 
+    inline typename hoNDInterpolatorBSpline<ArrayType, D>::T hoNDInterpolatorBSpline<ArrayType, D>::operator()( const coord_type* pos )
+    {
+        std::vector<long long> anchor(D);
+
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            anchor[ii] = static_cast<long long>(std::floor(pos[ii]));
+        }
+
+        bool inRange = true;
+        for ( ii=0; ii<D; ii++ )
+        {
+            if ( anchor[ii]<0 || anchor[ii]>=array_->get_size(ii) )
+            {
+                inRange = false;
+                break;
+            }
+        }
+
+        if( inRange )
+        {
+            return bspline_.evaluateBSpline(coeff_.begin(), dimension_, order_, derivative_, pos);
+        }
+        else
+        {
+            return (*bh_)(anchor);
+        }
+    }
+
+    template <typename ArrayType, unsigned int D> 
+    inline typename hoNDInterpolatorBSpline<ArrayType, D>::T hoNDInterpolatorBSpline<ArrayType, D>::operator()( const std::vector<coord_type>& pos )
+    {
+        return this->operator()(&pos[0]);
+    }
+
+    template <typename ArrayType, unsigned int D> 
+    inline typename hoNDInterpolatorBSpline<ArrayType, D>::T hoNDInterpolatorBSpline<ArrayType, D>::operator()( coord_type x )
+    {
+        long long ix = static_cast<long long>(std::floor(x));
+
+        if ( ix>=0 && ix<(long long)array_->get_size(0)-1 )
+        {
+            return bspline_.evaluateBSpline(coeff_.begin(), dimension_[0], order_, derivative_[0], x);
+        }
+        else
+        {
+            return (*bh_)(ix);
+        }
+    }
+
+    template <typename ArrayType, unsigned int D> 
+    inline typename hoNDInterpolatorBSpline<ArrayType, D>::T hoNDInterpolatorBSpline<ArrayType, D>::operator()( coord_type x, coord_type y )
+    {
+        long long ix = static_cast<long long>(std::floor(x));
+        long long iy = static_cast<long long>(std::floor(y));
+
+        /*x = (x<0) ? 0 : x;
+        x = (x>array_->get_size(0)-1) ? array_->get_size(0)-1 : x;
+
+        y = (y<0) ? 0 : y;
+        y = (y>array_->get_size(1)-1) ? array_->get_size(1)-1 : y;*/
+
+        /*if ( ix>=0 && ix<(long long)array_->get_size(0)-1 && iy>=0 && iy<(long long)array_->get_size(1)-1 )
+        {
+            return bspline_.evaluateBSpline(coeff_.begin(), dimension_[0], dimension_[1], order_, derivative_[0], derivative_[1], x, y);
+        }
+        else
+        {
+            return (*bh_)(ix, iy);
+        }*/
+
+        if ( ix>=0 && ix<sx_-1 && iy>=0 && iy<sy_-1 )
+        {
+            return bspline_.evaluateBSpline(coeff_.begin(), dimension_[0], dimension_[1], order_, derivative_[0], derivative_[1], x, y);
+        }
+        else
+        {
+            return (*bh_)(ix, iy);
+        }
+    }
+
+    template <typename ArrayType, unsigned int D> 
+    inline typename hoNDInterpolatorBSpline<ArrayType, D>::T hoNDInterpolatorBSpline<ArrayType, D>::operator()( coord_type x, coord_type y, coord_type z )
+    {
+        long long ix = static_cast<long long>(std::floor(x));
+        long long iy = static_cast<long long>(std::floor(y));
+        long long iz = static_cast<long long>(std::floor(z));
+
+        if ( ix>=0 && ix<(long long)array_->get_size(0)-1 
+            && iy>=0 && iy<(long long)array_->get_size(1)-1 
+            && iz>=0 && iz<(long long)array_->get_size(2)-1 )
+        {
+            return bspline_.evaluateBSpline(coeff_.begin(), 
+                dimension_[0], dimension_[1], dimension_[2], 
+                order_, 
+                derivative_[0], derivative_[1], derivative_[2], 
+                x, y, z);
+        }
+        else
+        {
+            return (*bh_)(ix, iy, iz);
+        }
+    }
+
+    template <typename ArrayType, unsigned int D> 
+    inline typename hoNDInterpolatorBSpline<ArrayType, D>::T hoNDInterpolatorBSpline<ArrayType, D>::operator()( coord_type x, coord_type y, coord_type z, coord_type s )
+    {
+        long long ix = static_cast<long long>(std::floor(x));
+        long long iy = static_cast<long long>(std::floor(y));
+        long long iz = static_cast<long long>(std::floor(z));
+        long long is = static_cast<long long>(std::floor(s));
+
+        if ( ix>=0 && ix<(long long)array_->get_size(0)-1 
+            && iy>=0 && iy<(long long)array_->get_size(1)-1 
+            && iz>=0 && iz<(long long)array_->get_size(2)-1 
+            && is>=0 && is<(long long)array_->get_size(3)-1 )
+        {
+            return bspline_.evaluateBSpline(coeff_.begin(), 
+                dimension_[0], dimension_[1], dimension_[2], dimension_[3], 
+                order_, 
+                derivative_[0], derivative_[1], derivative_[2], derivative_[3], 
+                x, y, z, s);
+        }
+        else
+        {
+            return (*bh_)(ix, iy, iz, is);
+        }
+    }
+
+    template <typename ArrayType, unsigned int D> 
+    inline typename hoNDInterpolatorBSpline<ArrayType, D>::T hoNDInterpolatorBSpline<ArrayType, D>::operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p )
+    {
+        long long ix = static_cast<long long>(std::floor(x));
+        long long iy = static_cast<long long>(std::floor(y));
+        long long iz = static_cast<long long>(std::floor(z));
+        long long is = static_cast<long long>(std::floor(s));
+        long long ip = static_cast<long long>(std::floor(p));
+
+        if ( ix>=0 && ix<(long long)array_->get_size(0)-1 
+            && iy>=0 && iy<(long long)array_->get_size(1)-1 
+            && iz>=0 && iz<(long long)array_->get_size(2)-1 
+            && is>=0 && is<(long long)array_->get_size(3)-1 
+            && ip>=0 && ip<(long long)array_->get_size(4)-1 )
+        {
+            return bspline_.evaluateBSpline(coeff_.begin(), 
+                dimension_[0], dimension_[1], dimension_[2], dimension_[3], dimension_[4], 
+                order_, 
+                derivative_[0], derivative_[1], derivative_[2], derivative_[3], derivative_[4], 
+                x, y, z, s, p);
+        }
+        else
+        {
+            return (*bh_)(ix, iy, iz, is, ip);
+        }
+    }
+
+    template <typename ArrayType, unsigned int D> 
+    inline typename hoNDInterpolatorBSpline<ArrayType, D>::T hoNDInterpolatorBSpline<ArrayType, D>::operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r )
+    {
+        long long ix = static_cast<long long>(std::floor(x));
+        long long iy = static_cast<long long>(std::floor(y));
+        long long iz = static_cast<long long>(std::floor(z));
+        long long is = static_cast<long long>(std::floor(s));
+        long long ip = static_cast<long long>(std::floor(p));
+        long long ir = static_cast<long long>(std::floor(r));
+
+        if ( ix>=0 && ix<(long long)array_->get_size(0)-1 
+            && iy>=0 && iy<(long long)array_->get_size(1)-1 
+            && iz>=0 && iz<(long long)array_->get_size(2)-1 
+            && is>=0 && is<(long long)array_->get_size(3)-1 
+            && ip>=0 && ip<(long long)array_->get_size(4)-1 
+            && ir>=0 && ir<(long long)array_->get_size(5)-1 )
+        {
+            return bspline_.evaluateBSpline(coeff_.begin(), 
+                dimension_[0], dimension_[1], dimension_[2], dimension_[3], dimension_[4], dimension_[5], 
+                order_, 
+                derivative_[0], derivative_[1], derivative_[2], derivative_[3], derivative_[4], derivative_[5], 
+                x, y, z, s, p, r);
+        }
+        else
+        {
+            return (*bh_)(ix, iy, iz, is, ip, ir);
+        }
+    }
+
+    template <typename ArrayType, unsigned int D> 
+    inline typename hoNDInterpolatorBSpline<ArrayType, D>::T hoNDInterpolatorBSpline<ArrayType, D>::operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a )
+    {
+        long long anchor[7];
+
+        anchor[0] = static_cast<long long>(std::floor(x));
+        anchor[1] = static_cast<long long>(std::floor(y));
+        anchor[2] = static_cast<long long>(std::floor(z));
+        anchor[3] = static_cast<long long>(std::floor(s));
+        anchor[4] = static_cast<long long>(std::floor(p));
+        anchor[5] = static_cast<long long>(std::floor(r));
+        anchor[6] = static_cast<long long>(std::floor(a));
+
+        if ( anchor[0]>=0 && anchor[0]<(long long)array_->get_size(0)-1 
+            && anchor[1]>=0 && anchor[1]<(long long)array_->get_size(1)-1 
+            && anchor[2]>=0 && anchor[2]<(long long)array_->get_size(2)-1 
+            && anchor[3]>=0 && anchor[3]<(long long)array_->get_size(3)-1 
+            && anchor[4]>=0 && anchor[4]<(long long)array_->get_size(4)-1 
+            && anchor[5]>=0 && anchor[5]<(long long)array_->get_size(5)-1
+            && anchor[6]>=0 && anchor[6]<(long long)array_->get_size(6)-1 )
+        {
+            return bspline_.evaluateBSpline(coeff_.begin(), 
+                dimension_[0], dimension_[1], dimension_[2], dimension_[3], dimension_[4], dimension_[5], dimension_[6], 
+                order_, 
+                derivative_[0], derivative_[1], derivative_[2], derivative_[3], derivative_[4], derivative_[5], derivative_[6], 
+                x, y, z, s, p, r, a);
+        }
+        else
+        {
+            return (*bh_)(anchor[0], anchor[1], anchor[2], anchor[3], anchor[4], anchor[5], anchor[6]);
+        }
+    }
+
+    template <typename ArrayType, unsigned int D> 
+    inline typename hoNDInterpolatorBSpline<ArrayType, D>::T hoNDInterpolatorBSpline<ArrayType, D>::operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a, coord_type q )
+    {
+        long long anchor[8];
+
+        anchor[0] = static_cast<long long>(std::floor(x));
+        anchor[1] = static_cast<long long>(std::floor(y));
+        anchor[2] = static_cast<long long>(std::floor(z));
+        anchor[3] = static_cast<long long>(std::floor(s));
+        anchor[4] = static_cast<long long>(std::floor(p));
+        anchor[5] = static_cast<long long>(std::floor(r));
+        anchor[6] = static_cast<long long>(std::floor(a));
+        anchor[7] = static_cast<long long>(std::floor(q));
+
+        if ( anchor[0]>=0 && anchor[0]<(long long)array_->get_size(0)-1 
+            && anchor[1]>=0 && anchor[1]<(long long)array_->get_size(1)-1 
+            && anchor[2]>=0 && anchor[2]<(long long)array_->get_size(2)-1 
+            && anchor[3]>=0 && anchor[3]<(long long)array_->get_size(3)-1 
+            && anchor[4]>=0 && anchor[4]<(long long)array_->get_size(4)-1 
+            && anchor[5]>=0 && anchor[5]<(long long)array_->get_size(5)-1 
+            && anchor[6]>=0 && anchor[6]<(long long)array_->get_size(6)-1 
+            && anchor[7]>=0 && anchor[7]<(long long)array_->get_size(7)-1 )
+        {
+            return bspline_.evaluateBSpline(coeff_.begin(), 
+                dimension_[0], dimension_[1], dimension_[2], dimension_[3], dimension_[4], dimension_[5], dimension_[6], dimension_[7], 
+                order_, 
+                derivative_[0], derivative_[1], derivative_[2], derivative_[3], derivative_[4], derivative_[5], derivative_[6], derivative_[7], 
+                x, y, z, s, p, r, a, q);
+        }
+        else
+        {
+            return (*bh_)(anchor[0], anchor[1], anchor[2], anchor[3], anchor[4], anchor[5], anchor[6], anchor[7]);
+        }
+    }
+
+    template <typename ArrayType, unsigned int D> 
+    inline typename hoNDInterpolatorBSpline<ArrayType, D>::T hoNDInterpolatorBSpline<ArrayType, D>::operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a, coord_type q, coord_type u )
+    {
+        long long anchor[9];
+
+        anchor[0] = static_cast<long long>(std::floor(x));
+        anchor[1] = static_cast<long long>(std::floor(y));
+        anchor[2] = static_cast<long long>(std::floor(z));
+        anchor[3] = static_cast<long long>(std::floor(s));
+        anchor[4] = static_cast<long long>(std::floor(p));
+        anchor[5] = static_cast<long long>(std::floor(r));
+        anchor[6] = static_cast<long long>(std::floor(a));
+        anchor[7] = static_cast<long long>(std::floor(q));
+        anchor[8] = static_cast<long long>(std::floor(u));
+
+        if ( anchor[0]>=0 && anchor[0]<(long long)array_->get_size(0)-1 
+            && anchor[1]>=0 && anchor[1]<(long long)array_->get_size(1)-1 
+            && anchor[2]>=0 && anchor[2]<(long long)array_->get_size(2)-1 
+            && anchor[3]>=0 && anchor[3]<(long long)array_->get_size(3)-1 
+            && anchor[4]>=0 && anchor[4]<(long long)array_->get_size(4)-1 
+            && anchor[5]>=0 && anchor[5]<(long long)array_->get_size(5)-1 
+            && anchor[6]>=0 && anchor[6]<(long long)array_->get_size(6)-1 
+            && anchor[7]>=0 && anchor[7]<(long long)array_->get_size(7)-1 
+            && anchor[8]>=0 && anchor[8]<(long long)array_->get_size(8)-1 )
+        {
+            return bspline_.evaluateBSpline(coeff_.begin(), 
+                dimension_[0], dimension_[1], dimension_[2], dimension_[3], dimension_[4], dimension_[5], dimension_[6], dimension_[7], dimension_[8], 
+                order_, 
+                derivative_[0], derivative_[1], derivative_[2], derivative_[3], derivative_[4], derivative_[5], derivative_[6], derivative_[7], derivative_[8], 
+                x, y, z, s, p, r, a, q, u);
+        }
+        else
+        {
+            return (*bh_)(anchor[0], anchor[1], anchor[2], anchor[3], anchor[4], anchor[5], anchor[6], anchor[7], anchor[8]);
+        }
+    }
+}
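
As a companion to the implementation above, a minimal sketch (illustrative
only, not part of the imported sources) of evaluating a value and a first
derivative along x with the 2D B-spline interpolator; setDerivative() simply
forwards the per-dimension derivative orders to evaluateBSpline():

    #include <vector>
    #include "hoNDInterpolator.h"

    using namespace Gadgetron;

    template <typename ImageType>
    void bspline_value_and_dx(ImageType& img,
                              hoNDBoundaryHandler<ImageType>& bh,
                              typename ImageType::coord_type x,
                              typename ImageType::coord_type y,
                              typename ImageType::value_type& value,
                              typename ImageType::value_type& dx)
    {
        // 5th-order 2D B-spline; the coefficients are computed in the constructor
        hoNDInterpolatorBSpline<ImageType, 2> interp(img, bh, 5);

        value = interp(x, y);

        std::vector<unsigned int> deriv(2, 0);
        deriv[0] = 1;                 // first derivative along dimension 0
        interp.setDerivative(deriv);
        dx = interp(x, y);
    }
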
diff --git a/toolboxes/core/cpu/hoNDInterpolatorLinear.hxx b/toolboxes/core/cpu/hoNDInterpolatorLinear.hxx
new file mode 100644
index 0000000..db7e583
--- /dev/null
+++ b/toolboxes/core/cpu/hoNDInterpolatorLinear.hxx
@@ -0,0 +1,874 @@
+/** \file       hoNDInterpolatorLinear.hxx
+    \brief      N-dimensional linear interpolator
+
+                Designed to work with hoNDArray and hoNDImage
+
+    \author     Hui Xue
+*/
+
+#ifdef _WIN32
+    #include "malloc.h"
+#else
+    #include "alloca.h"
+#endif // _WIN32
+
+namespace Gadgetron
+{
+    /// hoNDInterpolatorLinear
+
+    template <typename ArrayType> 
+    typename hoNDInterpolatorLinear<ArrayType>::T hoNDInterpolatorLinear<ArrayType>::operator()( const coord_type* pos )
+    {
+        unsigned int D = array_->get_number_of_dimensions();
+
+        long long* anchor = reinterpret_cast<long long*>(alloca(D * sizeof(long long)));
+        coord_type* weights = reinterpret_cast<coord_type*>(alloca(D * sizeof(coord_type)));
+        coord_type* weightsMinusOne = reinterpret_cast<coord_type*>(alloca(D * sizeof(coord_type)));
+
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            anchor[ii] = static_cast<long long>(std::floor(pos[ii]));
+            weights[ii] = pos[ii] - anchor[ii];
+            weightsMinusOne[ii] = coord_type(1.0) - weights[ii];
+        }
+
+        T res(0);
+
+        coord_type weightAll(1.0);
+
+        bool inRange = true;
+        for ( ii=0; ii<D; ii++ )
+        {
+            if ( anchor[ii]<0 || anchor[ii]>=array_->get_size(ii)-1 )
+            {
+                inRange = false;
+                break;
+            }
+        }
+
+        if( inRange )
+        {
+            std::vector<size_t> ind(D);
+
+            unsigned int n;
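+            // each n in [0, 2^D) addresses one corner of the surrounding cell:
+            // bit ii of n selects anchor[ii]+1 (weight) or anchor[ii] (1 - weight) along dimension ii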
+            for ( n=0; n<number_of_points_; n++ )
+            {
+                unsigned int lastDigit = n;
+                weightAll = coord_type(1.0);
+
+                for ( ii=0; ii<D; ii++ )
+                {
+                    if ( lastDigit & 1 )
+                    {
+                        ind[ii] = anchor[ii]+1;
+                        weightAll *= weights[ii];
+                    }
+                    else
+                    {
+                        ind[ii] = anchor[ii];
+                        weightAll *= weightsMinusOne[ii];
+                    }
+
+                    // shift one digit
+                    lastDigit >>= 1;
+                }
+
+                res += weightAll * (*array_)(ind);
+            }
+        }
+        else
+        {
+            std::vector<long long> ind(D);
+
+            unsigned int n;
+            for ( n=0; n<number_of_points_; n++ )
+            {
+                unsigned int lastDigit = n;
+                weightAll = coord_type(1.0);
+
+                for ( ii=0; ii<D; ii++ )
+                {
+                    if ( lastDigit & 1 )
+                    {
+                        ind[ii] = anchor[ii]+1;
+                        weightAll *= weights[ii];
+                    }
+                    else
+                    {
+                        ind[ii] = anchor[ii];
+                        weightAll *= weightsMinusOne[ii];
+                    }
+
+                    // shift one digit
+                    lastDigit >>= 1;
+                }
+
+                res += weightAll * (*bh_)(ind);
+            }
+        }
+
+        return res;
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDInterpolatorLinear<ArrayType>::T hoNDInterpolatorLinear<ArrayType>::operator()( const std::vector<coord_type>& pos )
+    {
+        return this->operator()(&pos[0]);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDInterpolatorLinear<ArrayType>::T hoNDInterpolatorLinear<ArrayType>::operator()( coord_type x )
+    {
+        long long ix = static_cast<long long>(std::floor(x));
+        coord_type dx = x - ix;
+
+        if ( ix>=0 && ix<(long long)sx_-1 )
+        {
+            return ( (*array_)( size_t(ix) )*(1-dx) + (*array_)( size_t(ix)+1 )*dx );
+        }
+        else
+        {
+            return ( (*bh_)(ix)*(1-dx) + (*bh_)(ix+1)*dx );
+        }
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDInterpolatorLinear<ArrayType>::T hoNDInterpolatorLinear<ArrayType>::operator()( coord_type x, coord_type y )
+    {
+        long long ix = static_cast<long long>(std::floor(x));
+        coord_type dx = x - ix;
+        coord_type dx_prime = coord_type(1.0)-dx;
+
+        long long iy = static_cast<long long>(std::floor(y));
+        coord_type dy = y - iy;
+        coord_type dy_prime = coord_type(1.0)-dy;
+
+        if ( ix>=0 && ix<sx_-1 && iy>=0 && iy<sy_-1 )
+        {
+            size_t offset = ix + iy*sx_;
+            T* data = array_->begin();
+
+            //return (    ( data_[offset]   *   dx_prime     *dy_prime 
+            //        +   data_[offset+1]   *   dx           *dy_prime)
+            //        +   (data_[offset+sx_]   *   dx_prime     *dy
+            //        +   data_[offset+sx_+1]   *   dx           *dy) );
+
+            /*return (    ((*array_)(size_t(ix), size_t(iy)       )   *   dx_prime     *dy_prime
+                    +   (*array_)(size_t(ix)+1, size_t(iy)      )   *   dx           *dy_prime)
+                    +   ((*array_)(size_t(ix), size_t(iy)+1     )   *   dx_prime     *dy
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1    )   *   dx           *dy) );*/
+
+            return (    (data[offset]       *   dx_prime     *dy_prime
+                    +   data[offset+1]      *   dx           *dy_prime)
+                    +   (data[offset+sx_]   *   dx_prime     *dy
+                    +   data[offset+sx_+1]  *   dx           *dy) );
+        }
+        else
+        {
+            return (    ((*bh_)(ix, iy       )   *   dx_prime    *dy_prime 
+                    +   (*bh_)(ix+1, iy      )   *   dx          *dy_prime)
+                    +   ((*bh_)(ix, iy+1     )   *   dx_prime    *dy
+                    +   (*bh_)(ix+1, iy+1    )   *   dx          *dy) );
+        }
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDInterpolatorLinear<ArrayType>::T hoNDInterpolatorLinear<ArrayType>::operator()( coord_type x, coord_type y, coord_type z )
+    {
+        long long ix = static_cast<long long>(std::floor(x));
+        coord_type dx = x - ix;
+        coord_type dx_prime = coord_type(1.0)-dx;
+
+        long long iy = static_cast<long long>(std::floor(y));
+        coord_type dy = y - iy;
+        coord_type dy_prime = coord_type(1.0)-dy;
+
+        long long iz = static_cast<long long>(std::floor(z));
+        coord_type dz = z - iz;
+        coord_type dz_prime = coord_type(1.0)-dz;
+
+        if ( ix>=0 && ix<sx_-1 
+            && iy>=0 && iy<sy_-1 
+            && iz>=0 && iz<sz_-1 )
+        {
+            /*return (    ((*array_)(size_t(ix),   size_t(iy),     size_t(iz)   )   *   dx_prime     *dy_prime   *dz_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz)    )   *   dx           *dy_prime   *dz_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz)   )   *   dx_prime     *dy         *dz_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz)    )   *   dx           *dy         *dz_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz)+1 )   *   dx_prime     *dy_prime   *dz 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz)+1  )   *   dx           *dy_prime   *dz) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz)+1 )   *   dx_prime     *dy         *dz 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz)+1  )   *   dx           *dy         *dz) );*/
+
+            size_t offset = ix + iy*sx_ + iz*sx_*sy_;
+
+            return (    (data_[offset]              *   dx_prime     *dy_prime   *dz_prime 
+                    +   data_[offset+1]             *   dx           *dy_prime   *dz_prime) 
+                    +   (data_[offset+sx_]          *   dx_prime     *dy         *dz_prime 
+                    +   data_[offset+sx_+1]         *   dx           *dy         *dz_prime) 
+                    +   (data_[offset+sx_*sy_]      *   dx_prime     *dy_prime   *dz 
+                    +   data_[offset+sx_*sy_+1]     *   dx           *dy_prime   *dz) 
+                    +   (data_[offset+sx_*sy_+sx_]  *   dx_prime     *dy         *dz 
+                    +   data_[offset+sx_*sy_+sx_+1] *   dx           *dy         *dz) );
+        }
+        else
+        {
+            return (    ((*bh_)(ix,   iy,     iz   )   *   dx_prime     *dy_prime   *dz_prime 
+                    +   (*bh_)(ix+1, iy,     iz    )   *   dx           *dy_prime   *dz_prime) 
+                    +   ((*bh_)(ix,   iy+1,   iz   )   *   dx_prime     *dy         *dz_prime 
+                    +   (*bh_)(ix+1, iy+1,   iz    )   *   dx           *dy         *dz_prime) 
+                    +   ((*bh_)(ix,   iy,     iz+1 )   *   dx_prime     *dy_prime   *dz 
+                    +   (*bh_)(ix+1, iy,     iz+1  )   *   dx           *dy_prime   *dz) 
+                    +   ((*bh_)(ix,   iy+1,   iz+1 )   *   dx_prime     *dy         *dz 
+                    +   (*bh_)(ix+1, iy+1,   iz+1  )   *   dx           *dy         *dz) );
+        }
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDInterpolatorLinear<ArrayType>::T hoNDInterpolatorLinear<ArrayType>::operator()( coord_type x, coord_type y, coord_type z, coord_type s )
+    {
+        long long ix = static_cast<long long>(std::floor(x));
+        coord_type dx = x - ix;
+        coord_type dx_prime = coord_type(1.0)-dx;
+
+        long long iy = static_cast<long long>(std::floor(y));
+        coord_type dy = y - iy;
+        coord_type dy_prime = coord_type(1.0)-dy;
+
+        long long iz = static_cast<long long>(std::floor(z));
+        coord_type dz = z - iz;
+        coord_type dz_prime = coord_type(1.0)-dz;
+
+        long long is = static_cast<long long>(std::floor(s));
+        coord_type ds = s - is;
+        coord_type ds_prime = coord_type(1.0)-ds;
+
+        if ( ix>=0 && ix<(long long)array_->get_size(0)-1 
+            && iy>=0 && iy<(long long)array_->get_size(1)-1 
+            && iz>=0 && iz<(long long)array_->get_size(2)-1 
+            && is>=0 && is<(long long)array_->get_size(3)-1 )
+        {
+            return (    ((*array_)(size_t(ix),   size_t(iy),     size_t(iz),    size_t(is) )   *   dx_prime     *dy_prime   *dz_prime   *ds_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz),     size_t(is) )   *   dx           *dy_prime   *dz_prime   *ds_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz),    size_t(is) )   *   dx_prime     *dy         *dz_prime   *ds_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz),     size_t(is) )   *   dx           *dy         *dz_prime   *ds_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz)+1,  size_t(is) )   *   dx_prime     *dy_prime   *dz         *ds_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz)+1,   size_t(is) )   *   dx           *dy_prime   *dz         *ds_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz)+1,  size_t(is) )   *   dx_prime     *dy         *dz         *ds_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz)+1,   size_t(is) )   *   dx           *dy         *dz         *ds_prime)
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz),    size_t(is)+1 )   *   dx_prime     *dy_prime   *dz_prime   *ds 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz),     size_t(is)+1 )   *   dx           *dy_prime   *dz_prime   *ds) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz),    size_t(is)+1 )   *   dx_prime     *dy         *dz_prime   *ds 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz),     size_t(is)+1 )   *   dx           *dy         *dz_prime   *ds) 
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz)+1,  size_t(is)+1 )   *   dx_prime     *dy_prime   *dz         *ds 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz)+1,   size_t(is)+1 )   *   dx           *dy_prime   *dz         *ds) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz)+1,  size_t(is)+1 )   *   dx_prime     *dy         *dz         *ds 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz)+1,   size_t(is)+1 )   *   dx           *dy         *dz         *ds) );
+        }
+        else
+        {
+            return (    ((*bh_)(ix,   iy,     iz,    is )   *   dx_prime     *dy_prime   *dz_prime   *ds_prime 
+                    +   (*bh_)(ix+1, iy,     iz,     is )   *   dx           *dy_prime   *dz_prime   *ds_prime) 
+                    +   ((*bh_)(ix,   iy+1,   iz,    is )   *   dx_prime     *dy         *dz_prime   *ds_prime 
+                    +   (*bh_)(ix+1, iy+1,   iz,     is )   *   dx           *dy         *dz_prime   *ds_prime) 
+                    +   ((*bh_)(ix,   iy,     iz+1,  is )   *   dx_prime     *dy_prime   *dz         *ds_prime 
+                    +   (*bh_)(ix+1, iy,     iz+1,   is )   *   dx           *dy_prime   *dz         *ds_prime) 
+                    +   ((*bh_)(ix,   iy+1,   iz+1,  is )   *   dx_prime     *dy         *dz         *ds_prime 
+                    +   (*bh_)(ix+1, iy+1,   iz+1,   is )   *   dx           *dy         *dz         *ds_prime)
+                    +   ((*bh_)(ix,   iy,     iz,    is+1 )   *   dx_prime     *dy_prime   *dz_prime   *ds 
+                    +   (*bh_)(ix+1, iy,     iz,     is+1 )   *   dx           *dy_prime   *dz_prime   *ds) 
+                    +   ((*bh_)(ix,   iy+1,   iz,    is+1 )   *   dx_prime     *dy         *dz_prime   *ds 
+                    +   (*bh_)(ix+1, iy+1,   iz,     is+1 )   *   dx           *dy         *dz_prime   *ds) 
+                    +   ((*bh_)(ix,   iy,     iz+1,  is+1 )   *   dx_prime     *dy_prime   *dz         *ds 
+                    +   (*bh_)(ix+1, iy,     iz+1,   is+1 )   *   dx           *dy_prime   *dz         *ds) 
+                    +   ((*bh_)(ix,   iy+1,   iz+1,  is+1 )   *   dx_prime     *dy         *dz         *ds 
+                    +   (*bh_)(ix+1, iy+1,   iz+1,   is+1 )   *   dx           *dy         *dz         *ds) );
+        }
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDInterpolatorLinear<ArrayType>::T hoNDInterpolatorLinear<ArrayType>::operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p )
+    {
+        long long ix = static_cast<long long>(std::floor(x));
+        coord_type dx = x - ix;
+        coord_type dx_prime = coord_type(1.0)-dx;
+
+        long long iy = static_cast<long long>(std::floor(y));
+        coord_type dy = y - iy;
+        coord_type dy_prime = coord_type(1.0)-dy;
+
+        long long iz = static_cast<long long>(std::floor(z));
+        coord_type dz = z - iz;
+        coord_type dz_prime = coord_type(1.0)-dz;
+
+        long long is = static_cast<long long>(std::floor(s));
+        coord_type ds = s - is;
+        coord_type ds_prime = coord_type(1.0)-ds;
+
+        long long ip = static_cast<long long>(std::floor(p));
+        coord_type dp = p - ip;
+        coord_type dp_prime = coord_type(1.0)-dp;
+
+        if ( ix>=0 && ix<(long long)array_->get_size(0)-1 
+            && iy>=0 && iy<(long long)array_->get_size(1)-1 
+            && iz>=0 && iz<(long long)array_->get_size(2)-1 
+            && is>=0 && is<(long long)array_->get_size(3)-1 
+            && ip>=0 && ip<(long long)array_->get_size(4)-1 )
+        {
+            return (    ((*array_)(size_t(ix),   size_t(iy),     size_t(iz),    size_t(is),     size_t(ip) )   *   dx_prime     *dy_prime   *dz_prime   *ds_prime   *dp_prime
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz),     size_t(is),     size_t(ip) )   *   dx           *dy_prime   *dz_prime   *ds_prime   *dp_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz),    size_t(is),     size_t(ip) )   *   dx_prime     *dy         *dz_prime   *ds_prime   *dp_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz),     size_t(is),     size_t(ip) )   *   dx           *dy         *dz_prime   *ds_prime   *dp_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz)+1,  size_t(is),     size_t(ip) )   *   dx_prime     *dy_prime   *dz         *ds_prime   *dp_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz)+1,   size_t(is),     size_t(ip) )   *   dx           *dy_prime   *dz         *ds_prime   *dp_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz)+1,  size_t(is),     size_t(ip) )   *   dx_prime     *dy         *dz         *ds_prime   *dp_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz)+1,   size_t(is),     size_t(ip) )   *   dx           *dy         *dz         *ds_prime   *dp_prime)
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz),    size_t(is)+1,   size_t(ip) )   *   dx_prime     *dy_prime   *dz_prime   *ds   *dp_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz),     size_t(is)+1,   size_t(ip) )   *   dx           *dy_prime   *dz_prime   *ds   *dp_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz),    size_t(is)+1,   size_t(ip) )   *   dx_prime     *dy         *dz_prime   *ds   *dp_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz),     size_t(is)+1,   size_t(ip) )   *   dx           *dy         *dz_prime   *ds   *dp_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz)+1,  size_t(is)+1,   size_t(ip) )   *   dx_prime     *dy_prime   *dz         *ds   *dp_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz)+1,   size_t(is)+1,   size_t(ip) )   *   dx           *dy_prime   *dz         *ds   *dp_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz)+1,  size_t(is)+1,   size_t(ip) )   *   dx_prime     *dy         *dz         *ds   *dp_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz)+1,   size_t(is)+1,   size_t(ip) )   *   dx           *dy         *dz         *ds   *dp_prime)
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz),    size_t(is),     size_t(ip)+1 )   *   dx_prime     *dy_prime   *dz_prime   *ds_prime   *dp
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz),     size_t(is),     size_t(ip)+1 )   *   dx           *dy_prime   *dz_prime   *ds_prime   *dp) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz),    size_t(is),     size_t(ip)+1 )   *   dx_prime     *dy         *dz_prime   *ds_prime   *dp 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz),     size_t(is),     size_t(ip)+1 )   *   dx           *dy         *dz_prime   *ds_prime   *dp) 
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz)+1,  size_t(is),     size_t(ip)+1 )   *   dx_prime     *dy_prime   *dz         *ds_prime   *dp 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz)+1,   size_t(is),     size_t(ip)+1 )   *   dx           *dy_prime   *dz         *ds_prime   *dp) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz)+1,  size_t(is),     size_t(ip)+1 )   *   dx_prime     *dy         *dz         *ds_prime   *dp 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz)+1,   size_t(is),     size_t(ip)+1 )   *   dx           *dy         *dz         *ds_prime   *dp)
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz),    size_t(is)+1,     size_t(ip)+1 )   *   dx_prime     *dy_prime   *dz_prime   *ds   *dp 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz),     size_t(is)+1,     size_t(ip)+1 )   *   dx           *dy_prime   *dz_prime   *ds   *dp) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz),    size_t(is)+1,     size_t(ip)+1 )   *   dx_prime     *dy         *dz_prime   *ds   *dp 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz),     size_t(is)+1,     size_t(ip)+1 )   *   dx           *dy         *dz_prime   *ds   *dp) 
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz)+1,  size_t(is)+1,     size_t(ip)+1 )   *   dx_prime     *dy_prime   *dz         *ds   *dp 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz)+1,   size_t(is)+1,     size_t(ip)+1 )   *   dx           *dy_prime   *dz         *ds   *dp) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz)+1,  size_t(is)+1,     size_t(ip)+1 )   *   dx_prime     *dy         *dz         *ds   *dp 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz)+1,   size_t(is)+1,     size_t(ip)+1 )   *   dx           *dy         *dz         *ds   *dp) );
+        }
+        else
+        {
+            return (    ((*bh_)(ix,   iy,     iz,    is,     ip )   *   dx_prime     *dy_prime   *dz_prime   *ds_prime   *dp_prime
+                    +   (*bh_)(ix+1, iy,     iz,     is,     ip )   *   dx           *dy_prime   *dz_prime   *ds_prime   *dp_prime) 
+                    +   ((*bh_)(ix,   iy+1,   iz,    is,     ip )   *   dx_prime     *dy         *dz_prime   *ds_prime   *dp_prime 
+                    +   (*bh_)(ix+1, iy+1,   iz,     is,     ip )   *   dx           *dy         *dz_prime   *ds_prime   *dp_prime) 
+                    +   ((*bh_)(ix,   iy,     iz+1,  is,     ip )   *   dx_prime     *dy_prime   *dz         *ds_prime   *dp_prime 
+                    +   (*bh_)(ix+1, iy,     iz+1,   is,     ip )   *   dx           *dy_prime   *dz         *ds_prime   *dp_prime) 
+                    +   ((*bh_)(ix,   iy+1,   iz+1,  is,     ip )   *   dx_prime     *dy         *dz         *ds_prime   *dp_prime 
+                    +   (*bh_)(ix+1, iy+1,   iz+1,   is,     ip )   *   dx           *dy         *dz         *ds_prime   *dp_prime)
+                    +   ((*bh_)(ix,   iy,     iz,    is+1,   ip )   *   dx_prime     *dy_prime   *dz_prime   *ds   *dp_prime 
+                    +   (*bh_)(ix+1, iy,     iz,     is+1,   ip )   *   dx           *dy_prime   *dz_prime   *ds   *dp_prime) 
+                    +   ((*bh_)(ix,   iy+1,   iz,    is+1,   ip )   *   dx_prime     *dy         *dz_prime   *ds   *dp_prime 
+                    +   (*bh_)(ix+1, iy+1,   iz,     is+1,   ip )   *   dx           *dy         *dz_prime   *ds   *dp_prime) 
+                    +   ((*bh_)(ix,   iy,     iz+1,  is+1,   ip )   *   dx_prime     *dy_prime   *dz         *ds   *dp_prime 
+                    +   (*bh_)(ix+1, iy,     iz+1,   is+1,   ip )   *   dx           *dy_prime   *dz         *ds   *dp_prime) 
+                    +   ((*bh_)(ix,   iy+1,   iz+1,  is+1,   ip )   *   dx_prime     *dy         *dz         *ds   *dp_prime 
+                    +   (*bh_)(ix+1, iy+1,   iz+1,   is+1,   ip )   *   dx           *dy         *dz         *ds   *dp_prime)
+                    +   ((*bh_)(ix,   iy,     iz,    is,     ip+1 )   *   dx_prime     *dy_prime   *dz_prime   *ds_prime   *dp
+                    +   (*bh_)(ix+1, iy,     iz,     is,     ip+1 )   *   dx           *dy_prime   *dz_prime   *ds_prime   *dp) 
+                    +   ((*bh_)(ix,   iy+1,   iz,    is,     ip+1 )   *   dx_prime     *dy         *dz_prime   *ds_prime   *dp 
+                    +   (*bh_)(ix+1, iy+1,   iz,     is,     ip+1 )   *   dx           *dy         *dz_prime   *ds_prime   *dp) 
+                    +   ((*bh_)(ix,   iy,     iz+1,  is,     ip+1 )   *   dx_prime     *dy_prime   *dz         *ds_prime   *dp 
+                    +   (*bh_)(ix+1, iy,     iz+1,   is,     ip+1 )   *   dx           *dy_prime   *dz         *ds_prime   *dp) 
+                    +   ((*bh_)(ix,   iy+1,   iz+1,  is,     ip+1 )   *   dx_prime     *dy         *dz         *ds_prime   *dp 
+                    +   (*bh_)(ix+1, iy+1,   iz+1,   is,     ip+1 )   *   dx           *dy         *dz         *ds_prime   *dp)
+                    +   ((*bh_)(ix,   iy,     iz,    is+1,   ip+1 )   *   dx_prime     *dy_prime   *dz_prime   *ds   *dp 
+                    +   (*bh_)(ix+1, iy,     iz,     is+1,   ip+1 )   *   dx           *dy_prime   *dz_prime   *ds   *dp) 
+                    +   ((*bh_)(ix,   iy+1,   iz,    is+1,   ip+1 )   *   dx_prime     *dy         *dz_prime   *ds   *dp 
+                    +   (*bh_)(ix+1, iy+1,   iz,     is+1,   ip+1 )   *   dx           *dy         *dz_prime   *ds   *dp) 
+                    +   ((*bh_)(ix,   iy,     iz+1,  is+1,   ip+1 )   *   dx_prime     *dy_prime   *dz         *ds   *dp 
+                    +   (*bh_)(ix+1, iy,     iz+1,   is+1,   ip+1 )   *   dx           *dy_prime   *dz         *ds   *dp) 
+                    +   ((*bh_)(ix,   iy+1,   iz+1,  is+1,   ip+1 )   *   dx_prime     *dy         *dz         *ds   *dp 
+                    +   (*bh_)(ix+1, iy+1,   iz+1,   is+1,   ip+1 )   *   dx           *dy         *dz         *ds   *dp) );
+        }
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDInterpolatorLinear<ArrayType>::T hoNDInterpolatorLinear<ArrayType>::operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r )
+    {
+        long long ix = static_cast<long long>(std::floor(x));
+        coord_type dx = x - ix;
+        coord_type dx_prime = coord_type(1.0)-dx;
+
+        long long iy = static_cast<long long>(std::floor(y));
+        coord_type dy = y - iy;
+        coord_type dy_prime = coord_type(1.0)-dy;
+
+        long long iz = static_cast<long long>(std::floor(z));
+        coord_type dz = z - iz;
+        coord_type dz_prime = coord_type(1.0)-dz;
+
+        long long is = static_cast<long long>(std::floor(s));
+        coord_type ds = s - is;
+        coord_type ds_prime = coord_type(1.0)-ds;
+
+        long long ip = static_cast<long long>(std::floor(p));
+        coord_type dp = p - ip;
+        coord_type dp_prime = coord_type(1.0)-dp;
+
+        long long ir = static_cast<long long>(std::floor(r));
+        coord_type dr = r - ir;
+        coord_type dr_prime = coord_type(1.0)-dr;
+
+        if ( ix>=0 && ix<(long long)array_->get_size(0)-1 
+            && iy>=0 && iy<(long long)array_->get_size(1)-1 
+            && iz>=0 && iz<(long long)array_->get_size(2)-1 
+            && is>=0 && is<(long long)array_->get_size(3)-1 
+            && ip>=0 && ip<(long long)array_->get_size(4)-1 
+            && ir>=0 && ir<(long long)array_->get_size(5)-1 )
+        {
+            return (    ((*array_)(size_t(ix),   size_t(iy),     size_t(iz),    size_t(is),     size_t(ip),     size_t(ir) )   *   dx_prime     *dy_prime   *dz_prime   *ds_prime   *dp_prime   *dr_prime
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz),     size_t(is),     size_t(ip),     size_t(ir) )   *   dx           *dy_prime   *dz_prime   *ds_prime   *dp_prime   *dr_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz),    size_t(is),     size_t(ip),     size_t(ir) )   *   dx_prime     *dy         *dz_prime   *ds_prime   *dp_prime   *dr_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz),     size_t(is),     size_t(ip),     size_t(ir) )   *   dx           *dy         *dz_prime   *ds_prime   *dp_prime   *dr_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz)+1,  size_t(is),     size_t(ip),     size_t(ir) )   *   dx_prime     *dy_prime   *dz         *ds_prime   *dp_prime   *dr_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz)+1,   size_t(is),     size_t(ip),     size_t(ir) )   *   dx           *dy_prime   *dz         *ds_prime   *dp_prime   *dr_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz)+1,  size_t(is),     size_t(ip),     size_t(ir) )   *   dx_prime     *dy         *dz         *ds_prime   *dp_prime   *dr_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz)+1,   size_t(is),     size_t(ip),     size_t(ir) )   *   dx           *dy         *dz         *ds_prime   *dp_prime   *dr_prime)
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz),    size_t(is)+1,   size_t(ip),     size_t(ir) )   *   dx_prime     *dy_prime   *dz_prime   *ds         *dp_prime   *dr_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz),     size_t(is)+1,   size_t(ip),     size_t(ir) )   *   dx           *dy_prime   *dz_prime   *ds         *dp_prime   *dr_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz),    size_t(is)+1,   size_t(ip),     size_t(ir) )   *   dx_prime     *dy         *dz_prime   *ds         *dp_prime   *dr_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz),     size_t(is)+1,   size_t(ip),     size_t(ir) )   *   dx           *dy         *dz_prime   *ds         *dp_prime   *dr_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz)+1,  size_t(is)+1,   size_t(ip),     size_t(ir) )   *   dx_prime     *dy_prime   *dz         *ds         *dp_prime   *dr_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz)+1,   size_t(is)+1,   size_t(ip),     size_t(ir) )   *   dx           *dy_prime   *dz         *ds         *dp_prime   *dr_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz)+1,  size_t(is)+1,   size_t(ip),     size_t(ir) )   *   dx_prime     *dy         *dz         *ds         *dp_prime   *dr_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz)+1,   size_t(is)+1,   size_t(ip),     size_t(ir) )   *   dx           *dy         *dz         *ds         *dp_prime   *dr_prime)
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz),    size_t(is),     size_t(ip)+1,     size_t(ir) ) *   dx_prime     *dy_prime   *dz_prime   *ds_prime   *dp         *dr_prime
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz),     size_t(is),     size_t(ip)+1,     size_t(ir) ) *   dx           *dy_prime   *dz_prime   *ds_prime   *dp         *dr_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz),    size_t(is),     size_t(ip)+1,     size_t(ir) ) *   dx_prime     *dy         *dz_prime   *ds_prime   *dp         *dr_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz),     size_t(is),     size_t(ip)+1,     size_t(ir) ) *   dx           *dy         *dz_prime   *ds_prime   *dp         *dr_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz)+1,  size_t(is),     size_t(ip)+1,     size_t(ir) ) *   dx_prime     *dy_prime   *dz         *ds_prime   *dp         *dr_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz)+1,   size_t(is),     size_t(ip)+1,     size_t(ir) ) *   dx           *dy_prime   *dz         *ds_prime   *dp         *dr_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz)+1,  size_t(is),     size_t(ip)+1,     size_t(ir) ) *   dx_prime     *dy         *dz         *ds_prime   *dp         *dr_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz)+1,   size_t(is),     size_t(ip)+1,     size_t(ir) ) *   dx           *dy         *dz         *ds_prime   *dp         *dr_prime)
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz),    size_t(is)+1,   size_t(ip)+1,     size_t(ir) ) *   dx_prime     *dy_prime   *dz_prime   *ds         *dp         *dr_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz),     size_t(is)+1,   size_t(ip)+1,     size_t(ir) ) *   dx           *dy_prime   *dz_prime   *ds         *dp         *dr_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz),    size_t(is)+1,   size_t(ip)+1,     size_t(ir) ) *   dx_prime     *dy         *dz_prime   *ds         *dp         *dr_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz),     size_t(is)+1,   size_t(ip)+1,     size_t(ir) ) *   dx           *dy         *dz_prime   *ds         *dp         *dr_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz)+1,  size_t(is)+1,   size_t(ip)+1,     size_t(ir) ) *   dx_prime     *dy_prime   *dz         *ds         *dp         *dr_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz)+1,   size_t(is)+1,   size_t(ip)+1,     size_t(ir) ) *   dx           *dy_prime   *dz         *ds         *dp         *dr_prime) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz)+1,  size_t(is)+1,   size_t(ip)+1,     size_t(ir) ) *   dx_prime     *dy         *dz         *ds         *dp         *dr_prime 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz)+1,   size_t(is)+1,   size_t(ip)+1,     size_t(ir) ) *   dx           *dy         *dz         *ds         *dp         *dr_prime)
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz),    size_t(is),     size_t(ip),     size_t(ir)+1 )   *   dx_prime     *dy_prime   *dz_prime   *ds_prime   *dp_prime   *dr
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz),     size_t(is),     size_t(ip),     size_t(ir)+1 )   *   dx           *dy_prime   *dz_prime   *ds_prime   *dp_prime   *dr) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz),    size_t(is),     size_t(ip),     size_t(ir)+1 )   *   dx_prime     *dy         *dz_prime   *ds_prime   *dp_prime   *dr 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz),     size_t(is),     size_t(ip),     size_t(ir)+1 )   *   dx           *dy         *dz_prime   *ds_prime   *dp_prime   *dr) 
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz)+1,  size_t(is),     size_t(ip),     size_t(ir)+1 )   *   dx_prime     *dy_prime   *dz         *ds_prime   *dp_prime   *dr 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz)+1,   size_t(is),     size_t(ip),     size_t(ir)+1 )   *   dx           *dy_prime   *dz         *ds_prime   *dp_prime   *dr) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz)+1,  size_t(is),     size_t(ip),     size_t(ir)+1 )   *   dx_prime     *dy         *dz         *ds_prime   *dp_prime   *dr 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz)+1,   size_t(is),     size_t(ip),     size_t(ir)+1 )   *   dx           *dy         *dz         *ds_prime   *dp_prime   *dr)
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz),    size_t(is)+1,   size_t(ip),     size_t(ir)+1 )   *   dx_prime     *dy_prime   *dz_prime   *ds         *dp_prime   *dr 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz),     size_t(is)+1,   size_t(ip),     size_t(ir)+1 )   *   dx           *dy_prime   *dz_prime   *ds         *dp_prime   *dr) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz),    size_t(is)+1,   size_t(ip),     size_t(ir)+1 )   *   dx_prime     *dy         *dz_prime   *ds         *dp_prime   *dr 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz),     size_t(is)+1,   size_t(ip),     size_t(ir)+1 )   *   dx           *dy         *dz_prime   *ds         *dp_prime   *dr) 
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz)+1,  size_t(is)+1,   size_t(ip),     size_t(ir)+1 )   *   dx_prime     *dy_prime   *dz         *ds         *dp_prime   *dr 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz)+1,   size_t(is)+1,   size_t(ip),     size_t(ir)+1 )   *   dx           *dy_prime   *dz         *ds         *dp_prime   *dr) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz)+1,  size_t(is)+1,   size_t(ip),     size_t(ir)+1 )   *   dx_prime     *dy         *dz         *ds         *dp_prime   *dr 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz)+1,   size_t(is)+1,   size_t(ip),     size_t(ir)+1 )   *   dx           *dy         *dz         *ds         *dp_prime   *dr)
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz),    size_t(is),     size_t(ip)+1,     size_t(ir)+1 ) *   dx_prime     *dy_prime   *dz_prime   *ds_prime   *dp         *dr
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz),     size_t(is),     size_t(ip)+1,     size_t(ir)+1 ) *   dx           *dy_prime   *dz_prime   *ds_prime   *dp         *dr) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz),    size_t(is),     size_t(ip)+1,     size_t(ir)+1 ) *   dx_prime     *dy         *dz_prime   *ds_prime   *dp         *dr 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz),     size_t(is),     size_t(ip)+1,     size_t(ir)+1 ) *   dx           *dy         *dz_prime   *ds_prime   *dp         *dr) 
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz)+1,  size_t(is),     size_t(ip)+1,     size_t(ir)+1 ) *   dx_prime     *dy_prime   *dz         *ds_prime   *dp         *dr 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz)+1,   size_t(is),     size_t(ip)+1,     size_t(ir)+1 ) *   dx           *dy_prime   *dz         *ds_prime   *dp         *dr) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz)+1,  size_t(is),     size_t(ip)+1,     size_t(ir)+1 ) *   dx_prime     *dy         *dz         *ds_prime   *dp         *dr 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz)+1,   size_t(is),     size_t(ip)+1,     size_t(ir)+1 ) *   dx           *dy         *dz         *ds_prime   *dp         *dr)
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz),    size_t(is)+1,   size_t(ip)+1,     size_t(ir)+1 ) *   dx_prime     *dy_prime   *dz_prime   *ds         *dp         *dr 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz),     size_t(is)+1,   size_t(ip)+1,     size_t(ir)+1 ) *   dx           *dy_prime   *dz_prime   *ds         *dp         *dr) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz),    size_t(is)+1,   size_t(ip)+1,     size_t(ir)+1 ) *   dx_prime     *dy         *dz_prime   *ds         *dp         *dr 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz),     size_t(is)+1,   size_t(ip)+1,     size_t(ir)+1 ) *   dx           *dy         *dz_prime   *ds         *dp         *dr) 
+                    +   ((*array_)(size_t(ix),   size_t(iy),     size_t(iz)+1,  size_t(is)+1,   size_t(ip)+1,     size_t(ir)+1 ) *   dx_prime     *dy_prime   *dz         *ds         *dp         *dr 
+                    +   (*array_)(size_t(ix)+1, size_t(iy),     size_t(iz)+1,   size_t(is)+1,   size_t(ip)+1,     size_t(ir)+1 ) *   dx           *dy_prime   *dz         *ds         *dp         *dr) 
+                    +   ((*array_)(size_t(ix),   size_t(iy)+1,   size_t(iz)+1,  size_t(is)+1,   size_t(ip)+1,     size_t(ir)+1 ) *   dx_prime     *dy         *dz         *ds         *dp         *dr 
+                    +   (*array_)(size_t(ix)+1, size_t(iy)+1,   size_t(iz)+1,   size_t(is)+1,   size_t(ip)+1,     size_t(ir)+1 ) *   dx           *dy         *dz         *ds         *dp         *dr) );
+        }
+        else
+        {
+            return (    ((*bh_)(ix,   iy,     iz,    is,     ip,     ir )   *   dx_prime     *dy_prime   *dz_prime   *ds_prime   *dp_prime   *dr_prime
+                    +   (*bh_)(ix+1, iy,     iz,     is,     ip,     ir )   *   dx           *dy_prime   *dz_prime   *ds_prime   *dp_prime   *dr_prime) 
+                    +   ((*bh_)(ix,   iy+1,   iz,    is,     ip,     ir )   *   dx_prime     *dy         *dz_prime   *ds_prime   *dp_prime   *dr_prime 
+                    +   (*bh_)(ix+1, iy+1,   iz,     is,     ip,     ir )   *   dx           *dy         *dz_prime   *ds_prime   *dp_prime   *dr_prime) 
+                    +   ((*bh_)(ix,   iy,     iz+1,  is,     ip,     ir )   *   dx_prime     *dy_prime   *dz         *ds_prime   *dp_prime   *dr_prime 
+                    +   (*bh_)(ix+1, iy,     iz+1,   is,     ip,     ir )   *   dx           *dy_prime   *dz         *ds_prime   *dp_prime   *dr_prime) 
+                    +   ((*bh_)(ix,   iy+1,   iz+1,  is,     ip,     ir )   *   dx_prime     *dy         *dz         *ds_prime   *dp_prime   *dr_prime 
+                    +   (*bh_)(ix+1, iy+1,   iz+1,   is,     ip,     ir )   *   dx           *dy         *dz         *ds_prime   *dp_prime   *dr_prime)
+                    +   ((*bh_)(ix,   iy,     iz,    is+1,   ip,     ir )   *   dx_prime     *dy_prime   *dz_prime   *ds         *dp_prime   *dr_prime 
+                    +   (*bh_)(ix+1, iy,     iz,     is+1,   ip,     ir )   *   dx           *dy_prime   *dz_prime   *ds         *dp_prime   *dr_prime) 
+                    +   ((*bh_)(ix,   iy+1,   iz,    is+1,   ip,     ir )   *   dx_prime     *dy         *dz_prime   *ds         *dp_prime   *dr_prime 
+                    +   (*bh_)(ix+1, iy+1,   iz,     is+1,   ip,     ir )   *   dx           *dy         *dz_prime   *ds         *dp_prime   *dr_prime) 
+                    +   ((*bh_)(ix,   iy,     iz+1,  is+1,   ip,     ir )   *   dx_prime     *dy_prime   *dz         *ds         *dp_prime   *dr_prime 
+                    +   (*bh_)(ix+1, iy,     iz+1,   is+1,   ip,     ir )   *   dx           *dy_prime   *dz         *ds         *dp_prime   *dr_prime) 
+                    +   ((*bh_)(ix,   iy+1,   iz+1,  is+1,   ip,     ir )   *   dx_prime     *dy         *dz         *ds         *dp_prime   *dr_prime 
+                    +   (*bh_)(ix+1, iy+1,   iz+1,   is+1,   ip,     ir )   *   dx           *dy         *dz         *ds         *dp_prime   *dr_prime)
+                    +   ((*bh_)(ix,   iy,     iz,    is,     ip+1,     ir ) *   dx_prime     *dy_prime   *dz_prime   *ds_prime   *dp         *dr_prime
+                    +   (*bh_)(ix+1, iy,     iz,     is,     ip+1,     ir ) *   dx           *dy_prime   *dz_prime   *ds_prime   *dp         *dr_prime) 
+                    +   ((*bh_)(ix,   iy+1,   iz,    is,     ip+1,     ir ) *   dx_prime     *dy         *dz_prime   *ds_prime   *dp         *dr_prime 
+                    +   (*bh_)(ix+1, iy+1,   iz,     is,     ip+1,     ir ) *   dx           *dy         *dz_prime   *ds_prime   *dp         *dr_prime) 
+                    +   ((*bh_)(ix,   iy,     iz+1,  is,     ip+1,     ir ) *   dx_prime     *dy_prime   *dz         *ds_prime   *dp         *dr_prime 
+                    +   (*bh_)(ix+1, iy,     iz+1,   is,     ip+1,     ir ) *   dx           *dy_prime   *dz         *ds_prime   *dp         *dr_prime) 
+                    +   ((*bh_)(ix,   iy+1,   iz+1,  is,     ip+1,     ir ) *   dx_prime     *dy         *dz         *ds_prime   *dp         *dr_prime 
+                    +   (*bh_)(ix+1, iy+1,   iz+1,   is,     ip+1,     ir ) *   dx           *dy         *dz         *ds_prime   *dp         *dr_prime)
+                    +   ((*bh_)(ix,   iy,     iz,    is+1,   ip+1,     ir ) *   dx_prime     *dy_prime   *dz_prime   *ds         *dp         *dr_prime 
+                    +   (*bh_)(ix+1, iy,     iz,     is+1,   ip+1,     ir ) *   dx           *dy_prime   *dz_prime   *ds         *dp         *dr_prime) 
+                    +   ((*bh_)(ix,   iy+1,   iz,    is+1,   ip+1,     ir ) *   dx_prime     *dy         *dz_prime   *ds         *dp         *dr_prime 
+                    +   (*bh_)(ix+1, iy+1,   iz,     is+1,   ip+1,     ir ) *   dx           *dy         *dz_prime   *ds         *dp         *dr_prime) 
+                    +   ((*bh_)(ix,   iy,     iz+1,  is+1,   ip+1,     ir ) *   dx_prime     *dy_prime   *dz         *ds         *dp         *dr_prime 
+                    +   (*bh_)(ix+1, iy,     iz+1,   is+1,   ip+1,     ir ) *   dx           *dy_prime   *dz         *ds         *dp         *dr_prime) 
+                    +   ((*bh_)(ix,   iy+1,   iz+1,  is+1,   ip+1,     ir ) *   dx_prime     *dy         *dz         *ds         *dp         *dr_prime 
+                    +   (*bh_)(ix+1, iy+1,   iz+1,   is+1,   ip+1,     ir ) *   dx           *dy         *dz         *ds         *dp         *dr_prime)
+                    +   ((*bh_)(ix,   iy,     iz,    is,     ip,     ir+1 )   *   dx_prime     *dy_prime   *dz_prime   *ds_prime   *dp_prime   *dr
+                    +   (*bh_)(ix+1, iy,     iz,     is,     ip,     ir+1 )   *   dx           *dy_prime   *dz_prime   *ds_prime   *dp_prime   *dr) 
+                    +   ((*bh_)(ix,   iy+1,   iz,    is,     ip,     ir+1 )   *   dx_prime     *dy         *dz_prime   *ds_prime   *dp_prime   *dr 
+                    +   (*bh_)(ix+1, iy+1,   iz,     is,     ip,     ir+1 )   *   dx           *dy         *dz_prime   *ds_prime   *dp_prime   *dr) 
+                    +   ((*bh_)(ix,   iy,     iz+1,  is,     ip,     ir+1 )   *   dx_prime     *dy_prime   *dz         *ds_prime   *dp_prime   *dr 
+                    +   (*bh_)(ix+1, iy,     iz+1,   is,     ip,     ir+1 )   *   dx           *dy_prime   *dz         *ds_prime   *dp_prime   *dr) 
+                    +   ((*bh_)(ix,   iy+1,   iz+1,  is,     ip,     ir+1 )   *   dx_prime     *dy         *dz         *ds_prime   *dp_prime   *dr 
+                    +   (*bh_)(ix+1, iy+1,   iz+1,   is,     ip,     ir+1 )   *   dx           *dy         *dz         *ds_prime   *dp_prime   *dr)
+                    +   ((*bh_)(ix,   iy,     iz,    is+1,   ip,     ir+1 )   *   dx_prime     *dy_prime   *dz_prime   *ds         *dp_prime   *dr 
+                    +   (*bh_)(ix+1, iy,     iz,     is+1,   ip,     ir+1 )   *   dx           *dy_prime   *dz_prime   *ds         *dp_prime   *dr) 
+                    +   ((*bh_)(ix,   iy+1,   iz,    is+1,   ip,     ir+1 )   *   dx_prime     *dy         *dz_prime   *ds         *dp_prime   *dr 
+                    +   (*bh_)(ix+1, iy+1,   iz,     is+1,   ip,     ir+1 )   *   dx           *dy         *dz_prime   *ds         *dp_prime   *dr) 
+                    +   ((*bh_)(ix,   iy,     iz+1,  is+1,   ip,     ir+1 )   *   dx_prime     *dy_prime   *dz         *ds         *dp_prime   *dr 
+                    +   (*bh_)(ix+1, iy,     iz+1,   is+1,   ip,     ir+1 )   *   dx           *dy_prime   *dz         *ds         *dp_prime   *dr) 
+                    +   ((*bh_)(ix,   iy+1,   iz+1,  is+1,   ip,     ir+1 )   *   dx_prime     *dy         *dz         *ds         *dp_prime   *dr 
+                    +   (*bh_)(ix+1, iy+1,   iz+1,   is+1,   ip,     ir+1 )   *   dx           *dy         *dz         *ds         *dp_prime   *dr)
+                    +   ((*bh_)(ix,   iy,     iz,    is,     ip+1,     ir+1 ) *   dx_prime     *dy_prime   *dz_prime   *ds_prime   *dp         *dr
+                    +   (*bh_)(ix+1, iy,     iz,     is,     ip+1,     ir+1 ) *   dx           *dy_prime   *dz_prime   *ds_prime   *dp         *dr) 
+                    +   ((*bh_)(ix,   iy+1,   iz,    is,     ip+1,     ir+1 ) *   dx_prime     *dy         *dz_prime   *ds_prime   *dp         *dr 
+                    +   (*bh_)(ix+1, iy+1,   iz,     is,     ip+1,     ir+1 ) *   dx           *dy         *dz_prime   *ds_prime   *dp         *dr) 
+                    +   ((*bh_)(ix,   iy,     iz+1,  is,     ip+1,     ir+1 ) *   dx_prime     *dy_prime   *dz         *ds_prime   *dp         *dr 
+                    +   (*bh_)(ix+1, iy,     iz+1,   is,     ip+1,     ir+1 ) *   dx           *dy_prime   *dz         *ds_prime   *dp         *dr) 
+                    +   ((*bh_)(ix,   iy+1,   iz+1,  is,     ip+1,     ir+1 ) *   dx_prime     *dy         *dz         *ds_prime   *dp         *dr 
+                    +   (*bh_)(ix+1, iy+1,   iz+1,   is,     ip+1,     ir+1 ) *   dx           *dy         *dz         *ds_prime   *dp         *dr)
+                    +   ((*bh_)(ix,   iy,     iz,    is+1,   ip+1,     ir+1 ) *   dx_prime     *dy_prime   *dz_prime   *ds         *dp         *dr 
+                    +   (*bh_)(ix+1, iy,     iz,     is+1,   ip+1,     ir+1 ) *   dx           *dy_prime   *dz_prime   *ds         *dp         *dr) 
+                    +   ((*bh_)(ix,   iy+1,   iz,    is+1,   ip+1,     ir+1 ) *   dx_prime     *dy         *dz_prime   *ds         *dp         *dr 
+                    +   (*bh_)(ix+1, iy+1,   iz,     is+1,   ip+1,     ir+1 ) *   dx           *dy         *dz_prime   *ds         *dp         *dr) 
+                    +   ((*bh_)(ix,   iy,     iz+1,  is+1,   ip+1,     ir+1 ) *   dx_prime     *dy_prime   *dz         *ds         *dp         *dr 
+                    +   (*bh_)(ix+1, iy,     iz+1,   is+1,   ip+1,     ir+1 ) *   dx           *dy_prime   *dz         *ds         *dp         *dr) 
+                    +   ((*bh_)(ix,   iy+1,   iz+1,  is+1,   ip+1,     ir+1 ) *   dx_prime     *dy         *dz         *ds         *dp         *dr 
+                    +   (*bh_)(ix+1, iy+1,   iz+1,   is+1,   ip+1,     ir+1 ) *   dx           *dy         *dz         *ds         *dp         *dr) );
+        }
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDInterpolatorLinear<ArrayType>::T hoNDInterpolatorLinear<ArrayType>::operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a )
+    {
+        long long anchor[7];
+        coord_type d[7];
+        coord_type d_prime[7];
+
+        anchor[0] = static_cast<long long>(std::floor(x));
+        anchor[1] = static_cast<long long>(std::floor(y));
+        anchor[2] = static_cast<long long>(std::floor(z));
+        anchor[3] = static_cast<long long>(std::floor(s));
+        anchor[4] = static_cast<long long>(std::floor(p));
+        anchor[5] = static_cast<long long>(std::floor(r));
+        anchor[6] = static_cast<long long>(std::floor(a));
+
+        d[0] = x - anchor[0];
+        d[1] = y - anchor[1];
+        d[2] = z - anchor[2];
+        d[3] = s - anchor[3];
+        d[4] = p - anchor[4];
+        d[5] = r - anchor[5];
+        d[6] = a - anchor[6];
+
+        unsigned int ii;
+        for ( ii=0; ii<7; ii++ )
+        {
+            d_prime[ii] = coord_type(1.0)-d[ii];
+        }
+
+        T res(0);
+
+        coord_type weightAll(1.0);
+
+        unsigned int n;
+
+        if ( anchor[0]>=0 && anchor[0]<(long long)array_->get_size(0)-1 
+            && anchor[1]>=0 && anchor[1]<(long long)array_->get_size(1)-1 
+            && anchor[2]>=0 && anchor[2]<(long long)array_->get_size(2)-1 
+            && anchor[3]>=0 && anchor[3]<(long long)array_->get_size(3)-1 
+            && anchor[4]>=0 && anchor[4]<(long long)array_->get_size(4)-1 
+            && anchor[5]>=0 && anchor[5]<(long long)array_->get_size(5)-1
+            && anchor[6]>=0 && anchor[6]<(long long)array_->get_size(6)-1 )
+        {
+            std::vector<size_t> ind(7);
+
+            for ( n=0; n<number_of_points_; n++ )
+            {
+                unsigned int lastDigit = n;
+                weightAll = coord_type(1.0);
+
+                for ( ii=0; ii<7; ii++ )
+                {
+                    if ( lastDigit & 1 )
+                    {
+                        ind[ii] = anchor[ii]+1;
+                        weightAll *= d[ii];
+                    }
+                    else
+                    {
+                        ind[ii] = anchor[ii];
+                        weightAll *= d_prime[ii];
+                    }
+
+                    // shift one digit
+                    lastDigit >>= 1;
+                }
+
+                res += weightAll * (*array_)(ind);
+            }
+        }
+        else
+        {
+            std::vector<long long> ind(7);
+
+            for ( n=0; n<number_of_points_; n++ )
+            {
+                unsigned int lastDigit = n;
+                weightAll = coord_type(1.0);
+
+                for ( ii=0; ii<7; ii++ )
+                {
+                    if ( lastDigit & 1 )
+                    {
+                        ind[ii] = anchor[ii]+1;
+                        weightAll *= d[ii];
+                    }
+                    else
+                    {
+                        ind[ii] = anchor[ii];
+                        weightAll *= d_prime[ii];
+                    }
+
+                    // shift one digit
+                    lastDigit >>= 1;
+                }
+
+                res += weightAll * (*bh_)(ind);
+            }
+        }
+
+        return res;
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDInterpolatorLinear<ArrayType>::T hoNDInterpolatorLinear<ArrayType>::operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a, coord_type q )
+    {
+        long long anchor[8];
+        coord_type d[8];
+        coord_type d_prime[8];
+
+        anchor[0] = static_cast<long long>(std::floor(x));
+        anchor[1] = static_cast<long long>(std::floor(y));
+        anchor[2] = static_cast<long long>(std::floor(z));
+        anchor[3] = static_cast<long long>(std::floor(s));
+        anchor[4] = static_cast<long long>(std::floor(p));
+        anchor[5] = static_cast<long long>(std::floor(r));
+        anchor[6] = static_cast<long long>(std::floor(a));
+        anchor[7] = static_cast<long long>(std::floor(q));
+
+        d[0] = x - anchor[0];
+        d[1] = y - anchor[1];
+        d[2] = z - anchor[2];
+        d[3] = s - anchor[3];
+        d[4] = p - anchor[4];
+        d[5] = r - anchor[5];
+        d[6] = a - anchor[6];
+        d[7] = q - anchor[7];
+
+        unsigned int ii;
+        for ( ii=0; ii<8; ii++ )
+        {
+            d_prime[ii] = coord_type(1.0)-d[ii];
+        }
+
+        T res(0);
+
+        coord_type weightAll(1.0);
+
+        unsigned int n;
+
+        if ( anchor[0]>=0 && anchor[0]<(long long)array_->get_size(0)-1 
+            && anchor[1]>=0 && anchor[1]<(long long)array_->get_size(1)-1 
+            && anchor[2]>=0 && anchor[2]<(long long)array_->get_size(2)-1 
+            && anchor[3]>=0 && anchor[3]<(long long)array_->get_size(3)-1 
+            && anchor[4]>=0 && anchor[4]<(long long)array_->get_size(4)-1 
+            && anchor[5]>=0 && anchor[5]<(long long)array_->get_size(5)-1
+            && anchor[6]>=0 && anchor[6]<(long long)array_->get_size(6)-1
+            && anchor[7]>=0 && anchor[7]<(long long)array_->get_size(7)-1 )
+        {
+            std::vector<size_t> ind(8);
+
+            for ( n=0; n<number_of_points_; n++ )
+            {
+                unsigned int lastDigit = n;
+                weightAll = coord_type(1.0);
+
+                for ( ii=0; ii<8; ii++ )
+                {
+                    if ( lastDigit & 1 )
+                    {
+                        ind[ii] = anchor[ii]+1;
+                        weightAll *= d[ii];
+                    }
+                    else
+                    {
+                        ind[ii] = anchor[ii];
+                        weightAll *= d_prime[ii];
+                    }
+
+                    // shift one digit
+                    lastDigit >>= 1;
+                }
+
+                res += weightAll * (*array_)(ind);
+            }
+        }
+        else
+        {
+            std::vector<long long> ind(8);
+
+            for ( n=0; n<number_of_points_; n++ )
+            {
+                unsigned int lastDigit = n;
+                weightAll = coord_type(1.0);
+
+                for ( ii=0; ii<8; ii++ )
+                {
+                    if ( lastDigit & 1 )
+                    {
+                        ind[ii] = anchor[ii]+1;
+                        weightAll *= d[ii];
+                    }
+                    else
+                    {
+                        ind[ii] = anchor[ii];
+                        weightAll *= d_prime[ii];
+                    }
+
+                    // shift one digit
+                    lastDigit >>= 1;
+                }
+
+                res += weightAll * (*bh_)(ind);
+            }
+        }
+
+        return res;
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDInterpolatorLinear<ArrayType>::T hoNDInterpolatorLinear<ArrayType>::operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a, coord_type q, coord_type u )
+    {
+        long long anchor[9];
+        coord_type d[9];
+        coord_type d_prime[9];
+
+        anchor[0] = static_cast<long long>(std::floor(x));
+        anchor[1] = static_cast<long long>(std::floor(y));
+        anchor[2] = static_cast<long long>(std::floor(z));
+        anchor[3] = static_cast<long long>(std::floor(s));
+        anchor[4] = static_cast<long long>(std::floor(p));
+        anchor[5] = static_cast<long long>(std::floor(r));
+        anchor[6] = static_cast<long long>(std::floor(a));
+        anchor[7] = static_cast<long long>(std::floor(q));
+        anchor[8] = static_cast<long long>(std::floor(u));
+
+        d[0] = x - anchor[0];
+        d[1] = y - anchor[1];
+        d[2] = z - anchor[2];
+        d[3] = s - anchor[3];
+        d[4] = p - anchor[4];
+        d[5] = r - anchor[5];
+        d[6] = a - anchor[6];
+        d[7] = q - anchor[7];
+        d[8] = u - anchor[8];
+
+        unsigned int ii;
+        for ( ii=0; ii<9; ii++ )
+        {
+            d_prime[ii] = coord_type(1.0)-d[ii];
+        }
+
+        T res(0);
+
+        coord_type weightAll(1.0);
+
+        unsigned int n;
+
+        if ( anchor[0]>=0 && anchor[0]<(long long)array_->get_size(0)-1 
+            && anchor[1]>=0 && anchor[1]<(long long)array_->get_size(1)-1 
+            && anchor[2]>=0 && anchor[2]<(long long)array_->get_size(2)-1 
+            && anchor[3]>=0 && anchor[3]<(long long)array_->get_size(3)-1 
+            && anchor[4]>=0 && anchor[4]<(long long)array_->get_size(4)-1 
+            && anchor[5]>=0 && anchor[5]<(long long)array_->get_size(5)-1
+            && anchor[6]>=0 && anchor[6]<(long long)array_->get_size(6)-1
+            && anchor[7]>=0 && anchor[7]<(long long)array_->get_size(7)-1
+            && anchor[8]>=0 && anchor[8]<(long long)array_->get_size(8)-1 )
+        {
+            std::vector<size_t> ind(9);
+
+            for ( n=0; n<number_of_points_; n++ )
+            {
+                unsigned int lastDigit = n;
+                weightAll = coord_type(1.0);
+
+                for ( ii=0; ii<9; ii++ )
+                {
+                    if ( lastDigit & 1 )
+                    {
+                        ind[ii] = anchor[ii]+1;
+                        weightAll *= d[ii];
+                    }
+                    else
+                    {
+                        ind[ii] = anchor[ii];
+                        weightAll *= d_prime[ii];
+                    }
+
+                    // shift one digit
+                    lastDigit >>= 1;
+                }
+
+                res += weightAll * (*array_)(ind);
+            }
+        }
+        else
+        {
+            std::vector<long long> ind(9);
+
+            for ( n=0; n<number_of_points_; n++ )
+            {
+                unsigned int lastDigit = n;
+                weightAll = coord_type(1.0);
+
+                for ( ii=0; ii<9; ii++ )
+                {
+                    if ( lastDigit & 1 )
+                    {
+                        ind[ii] = anchor[ii]+1;
+                        weightAll *= d[ii];
+                    }
+                    else
+                    {
+                        ind[ii] = anchor[ii];
+                        weightAll *= d_prime[ii];
+                    }
+
+                    // shift one digit
+                    lastDigit >>= 1;
+                }
+
+                res += weightAll * (*bh_)(ind);
+            }
+        }
+
+        return res;
+    }
+}
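
The fixed-arity overloads above spell out every corner term by hand; from the 7D overload onward the same interpolation is written as a loop over the 2^D corners of the surrounding cell (number_of_points_), selecting each corner with the bits of the loop counter. A minimal standalone sketch of that corner-enumeration scheme follows; it assumes only a callable mapping an integer index vector to a sample value, and the helper name and the ValueFn parameter are illustrative, not part of the Gadgetron API:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Sketch of N-linear interpolation via bit-mask corner enumeration, mirroring
    // the 7D/8D/9D operator() overloads above. ValueFn stands in for the in-bounds
    // array access or the boundary handler.
    template <typename ValueFn>
    double interpolate_linear_nd(const std::vector<double>& pos, ValueFn value)
    {
        const std::size_t D = pos.size();
        std::vector<long long> anchor(D);
        std::vector<double> d(D), d_prime(D);

        for (std::size_t ii = 0; ii < D; ++ii)
        {
            anchor[ii] = static_cast<long long>(std::floor(pos[ii]));
            d[ii] = pos[ii] - anchor[ii];   // fractional offset in dimension ii
            d_prime[ii] = 1.0 - d[ii];      // complementary weight
        }

        double res = 0.0;
        const std::size_t corners = std::size_t(1) << D;   // 2^D cell corners

        for (std::size_t n = 0; n < corners; ++n)
        {
            std::vector<long long> ind(D);
            double weight = 1.0;
            std::size_t bits = n;

            for (std::size_t ii = 0; ii < D; ++ii)
            {
                if (bits & 1) { ind[ii] = anchor[ii] + 1; weight *= d[ii]; }
                else          { ind[ii] = anchor[ii];     weight *= d_prime[ii]; }
                bits >>= 1;   // consume this dimension's bit
            }

            res += weight * value(ind);
        }

        return res;
    }

With D = 1 and value returning samples v0 and v1, this reduces to v0*(1-dx) + v1*dx, i.e. ordinary linear interpolation between the two neighbouring samples.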
diff --git a/toolboxes/core/cpu/hoNDInterpolatorNearestNeighbor.hxx b/toolboxes/core/cpu/hoNDInterpolatorNearestNeighbor.hxx
new file mode 100644
index 0000000..43d501c
--- /dev/null
+++ b/toolboxes/core/cpu/hoNDInterpolatorNearestNeighbor.hxx
@@ -0,0 +1,94 @@
+/** \file       hoNDInterpolatorNearestNeighbor.hxx
+    \brief      N-dimensional nearest neighbor interpolator
+
+                Designed to work with hoNDArray and hoNDImage
+
+    \author     Hui Xue
+*/
+
+namespace Gadgetron
+{
+    /// hoNDInterpolatorNearestNeighbor
+
+    template <typename ArrayType> 
+    inline typename hoNDInterpolatorNearestNeighbor<ArrayType>::T hoNDInterpolatorNearestNeighbor<ArrayType>::operator()( const coord_type* pos )
+    {
+        unsigned int D = array_->get_number_of_dimensions();
+        std::vector<long long> ind(D);
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            ind[ii] = static_cast<long long>(pos[ii]+0.5);
+        }
+
+        return (*bh_)(ind);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDInterpolatorNearestNeighbor<ArrayType>::T hoNDInterpolatorNearestNeighbor<ArrayType>::operator()( const std::vector<coord_type>& pos )
+    {
+        std::vector<long long> ind(pos.size());
+        unsigned int ii;
+        unsigned int D = array_->get_number_of_dimensions();
+        for ( ii=0; ii<D; ii++ )
+        {
+            ind[ii] = static_cast<long long>(pos[ii]+0.5);
+        }
+
+        return (*bh_)(ind);
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDInterpolatorNearestNeighbor<ArrayType>::T hoNDInterpolatorNearestNeighbor<ArrayType>::operator()( coord_type x )
+    {
+        return (*bh_)(static_cast<long long>(x+0.5));
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDInterpolatorNearestNeighbor<ArrayType>::T hoNDInterpolatorNearestNeighbor<ArrayType>::operator()( coord_type x, coord_type y )
+    {
+        return (*bh_)(static_cast<long long>(x+0.5), static_cast<long long>(y+0.5));
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDInterpolatorNearestNeighbor<ArrayType>::T hoNDInterpolatorNearestNeighbor<ArrayType>::operator()( coord_type x, coord_type y, coord_type z )
+    {
+        return (*bh_)(static_cast<long long>(x+0.5), static_cast<long long>(y+0.5), static_cast<long long>(z+0.5));
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDInterpolatorNearestNeighbor<ArrayType>::T hoNDInterpolatorNearestNeighbor<ArrayType>::operator()( coord_type x, coord_type y, coord_type z, coord_type s )
+    {
+        return (*bh_)(static_cast<long long>(x+0.5), static_cast<long long>(y+0.5), static_cast<long long>(z+0.5), static_cast<long long>(s+0.5));
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDInterpolatorNearestNeighbor<ArrayType>::T hoNDInterpolatorNearestNeighbor<ArrayType>::operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p )
+    {
+        return (*bh_)(static_cast<long long>(x+0.5), static_cast<long long>(y+0.5), static_cast<long long>(z+0.5), static_cast<long long>(s+0.5), static_cast<long long>(p+0.5));
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDInterpolatorNearestNeighbor<ArrayType>::T hoNDInterpolatorNearestNeighbor<ArrayType>::operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r )
+    {
+        return (*bh_)(static_cast<long long>(x+0.5), static_cast<long long>(y+0.5), static_cast<long long>(z+0.5), static_cast<long long>(s+0.5), static_cast<long long>(p+0.5), static_cast<long long>(r+0.5));
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDInterpolatorNearestNeighbor<ArrayType>::T hoNDInterpolatorNearestNeighbor<ArrayType>::operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a )
+    {
+        return (*bh_)(static_cast<long long>(x+0.5), static_cast<long long>(y+0.5), static_cast<long long>(z+0.5), static_cast<long long>(s+0.5), static_cast<long long>(p+0.5), static_cast<long long>(r+0.5), static_cast<long long>(a+0.5));
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDInterpolatorNearestNeighbor<ArrayType>::T hoNDInterpolatorNearestNeighbor<ArrayType>::operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a, coord_type q )
+    {
+        return (*bh_)(static_cast<long long>(x+0.5), static_cast<long long>(y+0.5), static_cast<long long>(z+0.5), static_cast<long long>(s+0.5), static_cast<long long>(p+0.5), static_cast<long long>(r+0.5), static_cast<long long>(a+0.5), static_cast<long long>(q+0.5));
+    }
+
+    template <typename ArrayType> 
+    inline typename hoNDInterpolatorNearestNeighbor<ArrayType>::T hoNDInterpolatorNearestNeighbor<ArrayType>::operator()( coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a, coord_type q, coord_type u )
+    {
+        return (*bh_)(static_cast<long long>(x+0.5), static_cast<long long>(y+0.5), static_cast<long long>(z+0.5), static_cast<long long>(s+0.5), static_cast<long long>(p+0.5), static_cast<long long>(r+0.5), static_cast<long long>(a+0.5), static_cast<long long>(q+0.5), static_cast<long long>(u+0.5));
+    }
+}
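
Every overload above applies the same selection rule: add 0.5 to each coordinate and truncate, then hand the resulting integer index to the boundary handler bh_. A small sketch of the rule in isolation (the function name is illustrative, not part of the Gadgetron API):

    // Nearest-neighbour index selection as used above: add 0.5 and truncate
    // toward zero. For coordinates below -0.5 the cast truncates toward zero
    // rather than rounding to the nearest grid point; such positions are out of
    // range anyway and are resolved by the boundary handler.
    inline long long nearest_index(float x)
    {
        return static_cast<long long>(x + 0.5f);
    }

    // e.g. nearest_index(2.49f) == 2, nearest_index(2.5f) == 3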
diff --git a/toolboxes/core/cpu/hoNDObjectArray.h b/toolboxes/core/cpu/hoNDObjectArray.h
new file mode 100644
index 0000000..6091203
--- /dev/null
+++ b/toolboxes/core/cpu/hoNDObjectArray.h
@@ -0,0 +1,200 @@
+/** \file   hoNDObjectArray.h
+\brief  CPU-based N-dimensional array for object pointers
+If delete_data_on_destruct == true, the stored objects are deleted on destruction; otherwise, only the pointer array memory is released
+\author Hui Xue
+*/
+
+#pragma once
+
+#include "hoNDArray.h"
+
+namespace Gadgetron
+{
+
+    template <typename TObjectType> class hoNDObjectArray : public hoNDArray<TObjectType*>
+    {
+    public:
+
+        typedef hoNDArray<TObjectType*> BaseClass;
+        typedef float coord_type;
+        typedef typename BaseClass::value_type value_type;
+
+        hoNDObjectArray();
+
+        explicit hoNDObjectArray(std::vector<size_t> &dimensions);
+        explicit hoNDObjectArray(std::vector<size_t> *dimensions);
+        explicit hoNDObjectArray(boost::shared_ptr< std::vector<size_t> > dimensions);
+
+        virtual ~hoNDObjectArray();
+
+        // Copy constructors
+        hoNDObjectArray(const hoNDObjectArray<TObjectType> &a);
+        explicit hoNDObjectArray(const hoNDObjectArray<TObjectType> *a);
+
+        // Assignment operator
+        hoNDObjectArray& operator=(const hoNDObjectArray& rhs);
+
+        virtual void create(std::vector<size_t>& dimensions);
+        virtual void create(std::vector<size_t> *dimensions);
+        virtual void create(boost::shared_ptr< std::vector<size_t> > dimensions);
+
+        void get_sub_array(const std::vector<size_t>& start, std::vector<size_t>& size, hoNDObjectArray<TObjectType>& out);
+
+        virtual void print(std::ostream& os) const;
+
+    protected:
+
+        using BaseClass::dimensions_;
+        using BaseClass::offsetFactors_;
+        using BaseClass::data_;
+        using BaseClass::elements_;
+        using BaseClass::delete_data_on_destruct_;
+    };
+
+    template <typename TObjectType> 
+    hoNDObjectArray<TObjectType>::hoNDObjectArray() : BaseClass() 
+    {
+    }
+
+    template <typename TObjectType> 
+    hoNDObjectArray<TObjectType>::hoNDObjectArray(std::vector<size_t> *dimensions) : BaseClass(dimensions)
+    {
+        this->create(dimensions);
+    }
+
+    template <typename TObjectType> 
+    hoNDObjectArray<TObjectType>::hoNDObjectArray(std::vector<size_t> &dimensions) : BaseClass(dimensions)
+    {
+        this->create(dimensions);
+    }
+
+    template <typename TObjectType> 
+    hoNDObjectArray<TObjectType>::hoNDObjectArray(boost::shared_ptr< std::vector<size_t> > dimensions) : BaseClass(dimensions)
+    {
+        this->create(dimensions);
+    }
+
+    template <typename TObjectType> 
+    hoNDObjectArray<TObjectType>::~hoNDObjectArray()
+    {
+        if (this->delete_data_on_destruct_)
+        {
+            size_t n;
+            for ( n=0; n<this->elements_; n++ )
+            {
+                if ( this->data_[n] != NULL )
+                {
+                    delete this->data_[n];
+                    this->data_[n] = NULL;
+                }
+            }
+
+            this->deallocate_memory();
+        }
+    }
+
+    template <typename TObjectType> 
+    hoNDObjectArray<TObjectType>::hoNDObjectArray(const hoNDObjectArray<TObjectType>  *a) : BaseClass(a)
+    {
+        this->delete_data_on_destruct_ = false;
+    }
+
+    template <typename TObjectType> 
+    hoNDObjectArray<TObjectType>::hoNDObjectArray(const hoNDObjectArray<TObjectType> &a) : BaseClass(a)
+    {
+        this->delete_data_on_destruct_ = false;
+    }
+
+    template <typename TObjectType> 
+    hoNDObjectArray<TObjectType>& hoNDObjectArray<TObjectType>::operator=(const hoNDObjectArray<TObjectType>& rhs)
+    {
+        if ( &rhs == this ) return *this;
+
+        BaseClass::operator=(rhs);
+
+        this->delete_data_on_destruct_ = false;
+
+        return *this;
+    }
+
+    template <typename TObjectType> 
+    void hoNDObjectArray<TObjectType>::create(std::vector<size_t>& dimensions)
+    {
+        BaseClass::create(dimensions);
+
+        for ( size_t n=0; n<this->elements_; n++ )
+        {
+            this->data_[n] = NULL;
+        }
+    }
+
+    template <typename TObjectType> 
+    void hoNDObjectArray<TObjectType>::create(std::vector<size_t> *dimensions)
+    {
+        BaseClass::create(dimensions);
+
+        for ( size_t n=0; n<this->elements_; n++ )
+        {
+            this->data_[n] = NULL;
+        }
+    }
+
+    template <typename TObjectType> 
+    void hoNDObjectArray<TObjectType>::create(boost::shared_ptr< std::vector<size_t> > dimensions)
+    {
+        BaseClass::create(dimensions);
+
+        for ( size_t n=0; n<this->elements_; n++ )
+        {
+            this->data_[n] = NULL;
+        }
+    }
+
+    template <typename TObjectType> 
+    void hoNDObjectArray<TObjectType>::get_sub_array(const std::vector<size_t>& start, std::vector<size_t>& size, hoNDObjectArray<TObjectType>& out)
+    {
+        if ( start.size() != size.size() )
+        {
+            BOOST_THROW_EXCEPTION( runtime_error("hoNDObjectArray<>::get_sub_array failed: start and size must have the same length"));
+        }
+
+        if ( start.size() != (*dimensions_).size() )
+        {
+            BOOST_THROW_EXCEPTION( runtime_error("hoNDObjectArray<>::get_sub_array failed: start does not match the array dimensionality"));
+        }
+
+        out.create(&size);
+
+        if ( out.get_number_of_elements() == this->get_number_of_elements() )
+        {
+            out = *this;
+            return;
+        }
+
+        std::vector<size_t> end(start.size());
+
+        size_t ii;
+        for ( ii=0; ii<start.size(); ii++ )
+        {
+            end[ii] = start[ii] + size[ii] - 1;
+            if ( end[ii] >= (*dimensions_)[ii] )
+            {
+                BOOST_THROW_EXCEPTION( runtime_error("hoNDObjectArray<>::get_sub_array failed: requested sub-array exceeds the array bounds"));
+            }
+        }
+
+        out.delete_data_on_destruct(false);
+    }
+
+    template <typename TObjectType> 
+    void hoNDObjectArray<TObjectType>::print(std::ostream& os) const
+    {
+        using namespace std;
+
+        os.unsetf(std::ios::scientific);
+        os.setf(ios::fixed);
+
+        os << "-------------- Gagdgetron ND Object Array -------------" << endl;
+        this->printContent(os);
+    }
+}
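
A short ownership sketch for hoNDObjectArray, using the constructors shown above and the delete_data_on_destruct(bool) setter inherited from hoNDArray (as used in get_sub_array); the MyObject type and the single-index operator() element access are illustrative assumptions:

    #include <vector>
    #include "hoNDObjectArray.h"

    struct MyObject { int id; };   // illustrative payload type

    void ownership_example()
    {
        std::vector<size_t> dims(2);
        dims[0] = 3; dims[1] = 2;

        // create() initialises every slot to NULL
        Gadgetron::hoNDObjectArray<MyObject> arr(dims);
        arr(0) = new MyObject();               // arr now references one object

        arr.delete_data_on_destruct(true);     // arr will delete the objects it holds

        // copies never own the pointed-to objects (delete_data_on_destruct_ = false)
        Gadgetron::hoNDObjectArray<MyObject> view(arr);
    }   // arr's destructor deletes the MyObject; view leaves it alone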
diff --git a/toolboxes/core/cpu/hoNDPoint.h b/toolboxes/core/cpu/hoNDPoint.h
new file mode 100644
index 0000000..dd31d0a
--- /dev/null
+++ b/toolboxes/core/cpu/hoNDPoint.h
@@ -0,0 +1,337 @@
+/** \file       hoNDPoint.h
+    \brief      N-dimensional point
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "GadgetronException.h"
+
+#include <new>
+#include <vector>
+#include <iostream>
+#include <stdexcept>
+#include <cmath>
+#include <cstring>      // memcpy
+#include <limits>       // std::numeric_limits
+
+#include "float.h"
+
+namespace Gadgetron
+{
+
+    template <typename T, unsigned int D>
+    class hoNDPoint
+    {
+    public:
+
+        typedef hoNDPoint<T, D> Self;
+        typedef T value_type;
+
+        hoNDPoint();
+        hoNDPoint(const Self& p);
+
+        ~hoNDPoint();
+
+        Self& operator=(const Self& p);
+
+        void fill(const T& v);
+
+        T* begin() { return this->data_; }
+        const T* begin() const { return this->data_; }
+
+        T& operator[]( size_t idx );
+        const T& operator[]( size_t idx ) const;
+
+        T& operator()( size_t idx );
+        const T& operator()( size_t idx ) const;
+
+        bool operator==(const Self& p) const;
+        bool operator!=(const Self& p) const;
+
+        template<typename T2> 
+        void copyFrom(const hoNDPoint<T2, D>& aArray)
+        {
+            unsigned int ii;
+            for ( ii=0; ii<D; ii++ )
+            {
+                this->data_[ii] = static_cast<T>(aArray(ii));
+            }
+        }
+
+        Self& operator += (const Self& p);
+        Self& operator -= (const Self& p);
+        Self& operator *= (const Self& p);
+        Self& operator /= (const Self& p);
+
+        Self& operator += (const T& p);
+        Self& operator -= (const T& p);
+        Self& operator *= (const T& p);
+        Self& operator /= (const T& p);
+
+        // dot product
+        void dot(const Self& p, T& r);
+
+        // the magnitude of the point vector
+        T abs();
+
+        // normalize the point to unit magnitude
+        void normalize();
+
+        virtual void print(std::ostream& os) const;
+
+    protected:
+
+        T data_[D];
+    };
+
+    template <typename T, unsigned int D>
+    hoNDPoint<T, D>::hoNDPoint()
+    {
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            this->data_[ii] = T(0);
+        }
+    }
+
+    template <typename T, unsigned int D>
+    hoNDPoint<T, D>::hoNDPoint(const Self& p)
+    {
+        memcpy(this->data_, p.data_, sizeof(T)*D);
+    }
+
+    template <typename T, unsigned int D>
+    hoNDPoint<T, D>::~hoNDPoint()
+    {
+
+    }
+
+    template <typename T, unsigned int D>
+    inline hoNDPoint<T, D>& hoNDPoint<T, D>::operator=(const Self& p)
+    {
+        if ( this == &p ) return *this;
+        memcpy(this->data_, p.data_, sizeof(T)*D);
+        return *this;
+    }
+
+    template <typename T, unsigned int D>
+    inline void hoNDPoint<T, D>::fill(const T& v)
+    {
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            this->data_[ii] = v;
+        }
+    }
+
+    template <typename T, unsigned int D>
+    inline T& hoNDPoint<T, D>::operator[]( size_t idx )
+    {
+        GADGET_DEBUG_CHECK_THROW(idx < D);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D>
+    inline const T& hoNDPoint<T, D>::operator[]( size_t idx ) const
+    {
+        GADGET_DEBUG_CHECK_THROW(idx < D);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D>
+    inline T& hoNDPoint<T, D>::operator()( size_t idx )
+    {
+        GADGET_DEBUG_CHECK_THROW(idx < D);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D>
+    inline const T& hoNDPoint<T, D>::operator()( size_t idx ) const
+    {
+        GADGET_DEBUG_CHECK_THROW(idx < D);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D>
+    inline bool hoNDPoint<T, D>::operator==(const Self& p) const
+    {
+        T minV = std::numeric_limits<T>::epsilon();
+
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            if ( std::abs(this->data_[ii] - p.data_[ii]) > minV ) return false;
+        }
+
+        return true;
+    }
+
+    template <typename T, unsigned int D>
+    inline bool hoNDPoint<T, D>::operator!=(const Self& p) const
+    {
+        return !(*this==p);
+    }
+
+    template <typename T, unsigned int D>
+    inline hoNDPoint<T, D>& hoNDPoint<T, D>::operator += (const Self& p)
+    {
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            this->data_[ii] += p.data_[ii];
+        }
+
+        return *this;
+    }
+
+    template <typename T, unsigned int D>
+    inline hoNDPoint<T, D>& hoNDPoint<T, D>::operator -= (const Self& p)
+    {
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            this->data_[ii] -= p.data_[ii];
+        }
+
+        return *this;
+    }
+
+    template <typename T, unsigned int D>
+    inline hoNDPoint<T, D>& hoNDPoint<T, D>::operator *= (const Self& p)
+    {
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            this->data_[ii] *= p.data_[ii];
+        }
+
+        return *this;
+    }
+
+    template <typename T, unsigned int D>
+    inline hoNDPoint<T, D>& hoNDPoint<T, D>::operator /= (const Self& p)
+    {
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            if ( std::abs(p.data_[ii]) < DBL_EPSILON )
+            {
+                this->data_[ii] /= (p.data_[ii]+DBL_EPSILON);
+            }
+            else
+            {
+                this->data_[ii] /= p.data_[ii];
+            }
+        }
+
+        return *this;
+    }
+
+    template <typename T, unsigned int D>
+    inline hoNDPoint<T, D>& hoNDPoint<T, D>::operator += (const T& p)
+    {
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            this->data_[ii] += p;
+        }
+
+        return *this;
+    }
+
+    template <typename T, unsigned int D>
+    inline hoNDPoint<T, D>& hoNDPoint<T, D>::operator -= (const T& p)
+    {
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            this->data_[ii] -= p;
+        }
+
+        return *this;
+    }
+
+    template <typename T, unsigned int D>
+    inline hoNDPoint<T, D>& hoNDPoint<T, D>::operator *= (const T& p)
+    {
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            this->data_[ii] *= p;
+        }
+
+        return *this;
+    }
+
+    template <typename T, unsigned int D>
+    inline hoNDPoint<T, D>& hoNDPoint<T, D>::operator /= (const T& p)
+    {
+        T pTmp = p;
+        if ( std::abs(p) < DBL_EPSILON ) pTmp += DBL_EPSILON;
+
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            this->data_[ii] /= pTmp;
+        }
+
+        return *this;
+    }
+
+    template <typename T, unsigned int D>
+    inline void hoNDPoint<T, D>::dot(const Self& p, T& r)
+    {
+        r = this->data_[0]*p.data_[0];
+
+        unsigned int ii;
+        for ( ii=1; ii<D; ii++ )
+        {
+            r += (this->data_[ii]*p.data_[ii]);
+        }
+    }
+
+    template <typename T, unsigned int D>
+    inline T hoNDPoint<T, D>::abs()
+    {
+        T dist = this->data_[0]*this->data_[0];
+
+        unsigned int ii;
+        for ( ii=1; ii<D; ii++ )
+        {
+            dist += (this->data_[ii]*this->data_[ii]);
+        }
+
+        dist = std::sqrt(dist);
+
+        return dist;
+    }
+
+    template <typename T, unsigned int D>
+    inline void hoNDPoint<T, D>::normalize()
+    {
+        T dist = this->abs();
+        if ( std::abs(dist) < DBL_EPSILON ) return;
+
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            this->data_[ii] /= dist;
+        }
+    }
+
+    template <typename T, unsigned int D>
+    void hoNDPoint<T, D>::print(std::ostream& os) const
+    {
+        using namespace std;
+
+        os.unsetf(std::ios::scientific);
+        os.setf(ios::fixed);
+
+        os << "[";
+        unsigned int ii;
+        for ( ii=0; ii<D-1; ii++ )
+        {
+            os << this->data_[ii] << ",";
+        }
+        os << this->data_[D-1] << "]";
+    }
+}
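
As a quick illustration of the hoNDPoint interface defined above, a minimal usage sketch (assuming the header builds stand-alone with its own includes; the values are purely illustrative):

    #include "hoNDPoint.h"
    #include <iostream>

    int main()
    {
        Gadgetron::hoNDPoint<float, 3> p, q;
        p[0] = 3.0f; p[1] = 4.0f; p[2] = 0.0f;   // set components via operator[]
        q.fill(1.0f);                            // fill all components with 1

        float d = 0;
        p.dot(q, d);                             // d = 3 + 4 + 0 = 7
        std::cout << "dot = " << d << ", |p| = " << p.abs() << std::endl;

        p.normalize();                           // p becomes (0.6, 0.8, 0)
        p.print(std::cout);
        std::cout << std::endl;
        return 0;
    }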
diff --git a/toolboxes/core/cpu/hostutils/CMakeLists.txt b/toolboxes/core/cpu/hostutils/CMakeLists.txt
new file mode 100644
index 0000000..f2816db
--- /dev/null
+++ b/toolboxes/core/cpu/hostutils/CMakeLists.txt
@@ -0,0 +1,20 @@
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_HOSTUTILS__)
+endif (WIN32)
+
+add_library(gadgetron_toolbox_hostutils SHARED 
+  parameterparser.cpp
+  )
+
+set_target_properties(gadgetron_toolbox_hostutils PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+install(TARGETS 
+  gadgetron_toolbox_hostutils 
+  DESTINATION lib COMPONENT main)
+
+install(FILES 
+  hostutils_export.h 
+  parameterparser.h 
+  url_encode.h 
+  FileInfo.h 
+  DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
diff --git a/toolboxes/core/cpu/hostutils/FileInfo.h b/toolboxes/core/cpu/hostutils/FileInfo.h
new file mode 100644
index 0000000..a706e7a
--- /dev/null
+++ b/toolboxes/core/cpu/hostutils/FileInfo.h
@@ -0,0 +1,54 @@
+#ifndef FILEINFO_H_
+#define FILEINFO_H_
+
+#include <string>
+#include <fstream>
+
+namespace Gadgetron {
+
+/**
+ *  Simple wrapper class for getting file info (existence, length, etc.) before accessing the file
+ *
+ */
+class FileInfo
+{
+public:
+
+	/**
+	 *   Constructor. After construction, the file_exists_ flag indicates whether the file could be opened
+	 */
+	FileInfo(std::string filename)
+	{
+		filename_ = filename;
+		std::ifstream ifile(filename_.c_str());
+		file_exists_ = ifile.good();
+	}
+
+	virtual ~FileInfo() {}
+
+	/**
+	 *  Does the file exist (can be opened)
+	 */
+	bool exists() {
+		return file_exists_;
+	}
+
+	/**
+	 *  File length in bytes; returns the maximum size_t value if the file does not exist
+	 */
+	size_t length() {
+		size_t length = 0;
+		if (file_exists_) {
+			std::ifstream ifile(filename_.c_str());
+			ifile.seekg(0,std::ios::end);
+			length = ifile.tellg();
+		} else {
+			return static_cast<size_t>(-1); // explicit; size_t is unsigned, so -1 wraps to the maximum value
+		}
+		return length;
+	}
+
+protected:
+	bool file_exists_;
+	std::string filename_;
+};
+}
+
+#endif /* FILEINFO_H_ */
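
For illustration, a minimal sketch of FileInfo in use (the path is purely hypothetical):

    #include "FileInfo.h"
    #include <iostream>

    int main()
    {
        Gadgetron::FileInfo info("/tmp/example.dat");  // hypothetical file
        if (info.exists()) {
            std::cout << "size = " << info.length() << " bytes" << std::endl;
        } else {
            std::cout << "file not found" << std::endl;
        }
        return 0;
    }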
diff --git a/toolboxes/core/cpu/hostutils/hostutils_export.h b/toolboxes/core/cpu/hostutils/hostutils_export.h
new file mode 100644
index 0000000..2c732db
--- /dev/null
+++ b/toolboxes/core/cpu/hostutils/hostutils_export.h
@@ -0,0 +1,22 @@
+/*
+ * hostutils_export.h
+ *
+ *  Created on: Nov 18, 2011
+ *      Author: Michael S. Hansen
+ */
+
+#ifndef HOSTUTILS_EXPORT_H_
+#define HOSTUTILS_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_HOSTUTILS__) || defined (hostutils_EXPORTS)
+#define EXPORTHOSTUTILS __declspec(dllexport)
+#else
+#define EXPORTHOSTUTILS __declspec(dllimport)
+#endif
+#else
+#define EXPORTHOSTUTILS
+#endif
+
+
+#endif /* HOSTUTILS_EXPORT_H_ */
diff --git a/toolboxes/core/cpu/hostutils/parameterparser.cpp b/toolboxes/core/cpu/hostutils/parameterparser.cpp
new file mode 100644
index 0000000..883a768
--- /dev/null
+++ b/toolboxes/core/cpu/hostutils/parameterparser.cpp
@@ -0,0 +1,330 @@
+#include "parameterparser.h"
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <iomanip>
+#include <stdexcept>
+
+namespace Gadgetron {
+
+  CommandLineParameter::CommandLineParameter(char com_switch,CommandLineParameterType type, unsigned int nr_values, const char* desc, bool required)
+  {
+    m_type = type;
+    m_switch = com_switch;
+    m_nr_values = nr_values;
+    m_desc = std::string(desc);
+    m_is_required = required;
+    m_is_set = false;
+
+    if (m_nr_values > 0){
+      m_int_value = new int[m_nr_values];
+      m_float_value = new float[m_nr_values];
+      m_string_value = new std::string[m_nr_values];
+    }
+    else{
+      m_int_value = 0;
+      m_float_value = 0;
+      m_string_value = 0;
+    }
+  }
+
+  CommandLineParameter::~CommandLineParameter()
+  {
+    if (m_int_value != 0) delete [] m_int_value;
+    if (m_float_value != 0) delete [] m_float_value;
+    if (m_string_value != 0) delete [] m_string_value;
+  } 
+
+  const char* CommandLineParameter::get_string_value(unsigned int i)
+  {
+    if (i < m_nr_values){
+      return m_string_value[i].c_str();
+    }
+    else{
+      return 0;
+    }
+  }
+
+  int CommandLineParameter::get_int_value(unsigned int i)
+  {
+    if (i < m_nr_values){
+      return m_int_value[i];
+    }
+    else{
+      return 0;
+    }
+  }
+
+  float CommandLineParameter::get_float_value(unsigned int i)
+  {
+    if (i < m_nr_values){
+      return m_float_value[i];
+    }
+    else{
+      return 0.0f;
+    }
+  }
+
+  bool CommandLineParameter::get_is_set()
+  {
+    return m_is_set;
+  }
+
+  bool CommandLineParameter::get_is_required()
+  {
+    return m_is_required;
+  }
+
+  bool CommandLineParameter::is_switch_equal_to(char com_switch)
+  {
+    return (m_switch == com_switch);
+  }
+
+  char** CommandLineParameter::set_value(char** argv)
+  {
+    int args = 0; 
+    for (unsigned int i = 0; i < m_nr_values;i++){
+      m_string_value[i] = std::string(argv[i]);
+      if (m_type == COMMAND_LINE_FLOAT || m_type == COMMAND_LINE_INT){
+        std::stringstream ss (std::stringstream::in | std::stringstream::out);
+        ss << m_string_value[i];
+        ss >> m_float_value[i];
+        m_int_value[i] = static_cast<int>(m_float_value[i]);
+      }
+      else{
+        m_int_value[i] = 1;
+        m_float_value[i] = 1.0f;
+      }
+      args++;
+    }
+    m_is_set = true;
+
+    return (argv+args);
+  }
+
+  int CommandLineParameter::get_number_of_values()
+  {
+    return m_nr_values;
+  }
+
+  char CommandLineParameter::get_switch()
+  {
+    return m_switch;
+  }
+
+  std::string CommandLineParameter::get_desc()
+  {
+    return m_desc;
+  }
+
+  ParameterParser::ParameterParser(int list_size, int list_increment)
+  {
+    m_list_size = list_size;
+    m_list_increment = list_increment;
+    m_parameter_list = new CommandLineParameter*[m_list_size];
+    m_number_of_parameters = 0;
+    m_max_desc_length = 0;
+    m_max_number_values = 0;
+  }
+
+  ParameterParser::~ParameterParser()
+  {
+    delete_list();
+  }
+
+  void ParameterParser::expand_list()
+  {
+    int new_list_size = m_list_size + m_list_increment;
+    CommandLineParameter **new_list = new CommandLineParameter*[new_list_size];
+
+    for (int i = 0; i < m_number_of_parameters; i++){
+      new_list[i] = m_parameter_list[i];
+    }
+
+    delete [] m_parameter_list;
+    m_parameter_list = new_list;
+    m_list_size = new_list_size; // keep the recorded capacity in sync with the reallocated list
+  }
+
+  void ParameterParser::delete_list()
+  {
+    for (int i = 0; i < m_number_of_parameters; i++){
+      delete m_parameter_list[i];
+    }
+    delete [] m_parameter_list;
+  }
+
+  int ParameterParser::add_parameter(char com_switch,CommandLineParameterType type, unsigned int nr_values, 
+                                     const char* desc, bool required, const char* def)
+  {
+    char** argv = new char*[nr_values];
+    std::string *arg_list = new std::string[nr_values];
+
+    add_parameter(com_switch, type, nr_values, desc, required);
+
+    std::stringstream ss (std::stringstream::in | std::stringstream::out);
+    ss << def;
+
+    unsigned int args = 0; 
+    while (args < nr_values){
+      ss >> arg_list[args];
+      argv[args] = (char*)arg_list[args].c_str();
+      args++;
+    }
+
+    m_parameter_list[m_number_of_parameters-1]->set_value(argv);
+ 
+    delete [] argv;
+    delete [] arg_list;
+
+    return 0;
+  }
+
+  int ParameterParser::add_parameter(char com_switch,CommandLineParameterType type, unsigned int nr_values, const char* desc, bool required)
+  {
+    CommandLineParameter *p = new CommandLineParameter(com_switch, type, nr_values, desc, required);
+    for (int i = 0; i < m_number_of_parameters; i++){
+      if (m_parameter_list[i]->is_switch_equal_to(com_switch)){
+        std::cout << "ParameterParser: Attempt to add parameter twice: " << com_switch << std::endl;
+        delete p;
+        return -1;
+      }
+    }
+    if (m_number_of_parameters >= m_list_size) expand_list();
+    m_parameter_list[m_number_of_parameters++] = p;
+    if ((int)p->get_desc().length() > m_max_desc_length){
+      m_max_desc_length = p->get_desc().length();
+    }
+    if ((int)p->get_number_of_values() > m_max_number_values) {
+      m_max_number_values = p->get_number_of_values();
+    }
+    return 0;
+  }
+
+  int ParameterParser::parse_parameter_list(int argc, char** argv)
+  {
+    int a = 0;
+    int ret = 0;
+    m_command_name = std::string(argv[a++]);
+    bool argument_found;
+    while (a < argc){
+      if (argv[a][0] != '-'){
+        std::cout << "ParameterParser: malformed argument list" << std::endl;
+        print_usage();
+      }
+
+      argument_found = false;
+      for (int i = 0; i < m_number_of_parameters; i++){
+        if (m_parameter_list[i]->is_switch_equal_to(argv[a][1])){
+          if (m_parameter_list[i]->get_number_of_values() <= argc-a-1){
+            m_parameter_list[i]->set_value((argv+a+1));
+            a += m_parameter_list[i]->get_number_of_values()+1;
+            argument_found = true;
+            break;
+          }
+          else{
+            std::cout << std::endl << "ParameterParser: malformed argument list: -" << argv[a][1] << std::endl;
+            //print_usage();
+            argument_found = true;
+            a++;
+            while (a < argc && argv[a][0] != '-') a++;
+            ret = -1;
+            break;
+          }
+        }
+      }
+
+      if (!argument_found){
+        std::cout << std::endl << "ParameterParser: unknown argument: -" << argv[a][1] << std::endl;
+        //print_usage();
+        ret = -1;
+        a++;
+        while (a < argc && argv[a][0] != '-') a++;
+      }
+    }
+    return ret;
+  }
+ 
+  void ParameterParser::print_usage()
+  {
+    int space_fill = 0;
+
+    std::cout << "---------------------------------------------------- " << std::endl;
+    std::cout << "Usage: " << m_command_name << " -[";
+    for (int i = 0; i < m_number_of_parameters; i++){
+      std::cout << m_parameter_list[i]->get_switch();
+    }
+    std::cout << "]" << std::endl;
+ 
+    for (int i = 0; i < m_number_of_parameters; i++){
+        std::cout << " -" << m_parameter_list[i]->get_switch() << " ";
+        if (m_max_number_values > 1){
+          if (m_parameter_list[i]->get_number_of_values() > 1){
+            std::cout << m_parameter_list[i]->get_number_of_values() << "x "; 
+          }
+          else{
+            std::cout << " "; 
+          }
+        }
+        if (m_parameter_list[i]->get_number_of_values() > 0){
+          std::cout << "[" << m_parameter_list[i]->get_desc() << "]";
+          space_fill = (m_max_desc_length - m_parameter_list[i]->get_desc().length())+2;
+        }
+        else{
+          space_fill = m_max_desc_length+2+2;
+        }
+        std::cout << std::endl;
+      }
+    std::cout << "---------------------------------------------------- " << std::endl; 
+  }
+
+  void ParameterParser::print_parameter_list()
+  {
+    std::cout << "---------------------------------------------------- " << std::endl;
+    for (int i = 0; i < m_number_of_parameters; i++){
+      std::cout << " ";
+      std::cout << "(-" << m_parameter_list[i]->get_switch() << ") ";
+      std::cout << std::setw(m_max_desc_length+2) << std::setiosflags(std::ios::left);
+      std::cout << m_parameter_list[i]->get_desc() << ": ";
+      if (m_parameter_list[i]->get_is_set()){
+        for (int j = 0; j < m_parameter_list[i]->get_number_of_values(); j++){
+          std::cout << m_parameter_list[i]->get_string_value(j) << " ";
+          if (!m_parameter_list[i]->get_is_required()){
+            std::cout << "(optional)";
+          }
+        }
+      }
+      else{
+        if (m_parameter_list[i]->get_is_required()){
+          std::cout << "(missing)";
+        }
+        else{
+          std::cout << "N/A (optional)";
+        }
+      }
+      std::cout << std::endl;
+    }
+    std::cout << "---------------------------------------------------- " << std::endl;
+  }
+ 
+  bool ParameterParser::all_required_parameters_set()
+  {
+    for (int i = 0; i < m_number_of_parameters; i++){
+      if (!m_parameter_list[i]->get_is_set() && m_parameter_list[i]->get_is_required())
+        return false;
+    }
+    return true;
+  }
+
+  CommandLineParameter* ParameterParser::get_parameter(char com_switch)
+  {
+    for (int i = 0; i < m_number_of_parameters; i++){
+      if (m_parameter_list[i]->is_switch_equal_to(com_switch)){
+        return m_parameter_list[i];
+      }
+    }
+    std::stringstream ss;
+    ss << "Parameter " << com_switch << " is undefined";
+    throw std::runtime_error(ss.str());
+  }
+}
diff --git a/toolboxes/core/cpu/hostutils/parameterparser.h b/toolboxes/core/cpu/hostutils/parameterparser.h
new file mode 100644
index 0000000..7bb5e22
--- /dev/null
+++ b/toolboxes/core/cpu/hostutils/parameterparser.h
@@ -0,0 +1,81 @@
+#pragma once
+#include "hostutils_export.h"
+
+#include <vector>
+#include <string>
+
+namespace Gadgetron {
+
+  typedef enum 
+    {
+      COMMAND_LINE_STRING,
+      COMMAND_LINE_INT,
+      COMMAND_LINE_FLOAT,
+      COMMAND_LINE_NO_ARG
+    } CommandLineParameterType; 
+
+  class EXPORTHOSTUTILS CommandLineParameter
+  {
+  public:
+    CommandLineParameter(char com_switch, CommandLineParameterType type, unsigned int nr_values, const char* desc, bool required);
+    ~CommandLineParameter();
+
+    bool is_switch_equal_to(char com_switch);
+
+    char** set_value(char** argv);
+
+    int get_number_of_values();
+    char get_switch();
+
+    const char* get_string_value(unsigned int i = 0);
+    int get_int_value(unsigned int i = 0);
+    float get_float_value(unsigned int i = 0);
+
+    bool get_is_set();
+    bool get_is_required();
+    std::string get_desc();
+
+  private:
+    CommandLineParameterType  m_type;
+    char                      m_switch;
+    unsigned int              m_nr_values;
+    std::string               m_desc;
+    bool                      m_is_set;
+    bool                      m_is_required;
+    int                      *m_int_value;
+    std::string              *m_string_value;
+    float                    *m_float_value;
+  };
+
+  class EXPORTHOSTUTILS ParameterParser
+  {
+  public:
+    ParameterParser(int list_size = 10, int list_increment = 10);
+    ~ParameterParser();
+
+    int add_parameter(char com_switch,CommandLineParameterType type,  unsigned int nr_values, const char* desc, bool required);
+    int add_parameter(char com_switch,CommandLineParameterType type,  unsigned int nr_values, const char* desc, bool required, const char* def);
+
+    int parse_parameter_list(int argc, char** argv);
+
+    int get_number_of_parameters();
+
+    void print_usage();
+    void print_parameter_list();
+
+    bool all_required_parameters_set();
+
+    CommandLineParameter* get_parameter(char com_switch);
+
+  private:
+    CommandLineParameter** m_parameter_list;
+    int m_number_of_parameters;
+    int m_list_size;
+    int m_list_increment;
+    int m_max_desc_length;
+    int m_max_number_values;
+    std::string m_command_name;
+    void expand_list();
+    void delete_list();
+  };
+}
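
A minimal command-line sketch using the ParameterParser interface declared above (the switches, descriptions and default value are illustrative, not taken from any Gadgetron app):

    #include "parameterparser.h"
    #include <iostream>

    int main(int argc, char** argv)
    {
        Gadgetron::ParameterParser parms;
        parms.add_parameter('f', Gadgetron::COMMAND_LINE_STRING, 1, "input file", true);
        parms.add_parameter('n', Gadgetron::COMMAND_LINE_INT, 1, "iterations", false, "10");

        parms.parse_parameter_list(argc, argv);
        if (!parms.all_required_parameters_set()) {
            parms.print_usage();
            return -1;
        }

        std::cout << "file: " << parms.get_parameter('f')->get_string_value()
                  << ", iterations: " << parms.get_parameter('n')->get_int_value() << std::endl;
        return 0;
    }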
diff --git a/toolboxes/core/cpu/hostutils/url_encode.h b/toolboxes/core/cpu/hostutils/url_encode.h
new file mode 100644
index 0000000..c772740
--- /dev/null
+++ b/toolboxes/core/cpu/hostutils/url_encode.h
@@ -0,0 +1,47 @@
+#ifndef URLENCODE_H
+#define URLENCODE_H
+
+#include "log.h"
+
+namespace Gadgetron {
+
+/**
+   Simple utility function for escaping spaces (as %20) and converting backslashes
+   to forward slashes in URLs. It is used in various places to ensure proper
+   encoding of schemaLocation URIs.
+*/
+inline std::string url_encode(const std::string& in) {
+	char* tmp = new char[in.size()*4 + 1]; // worst case every character expands to three bytes, plus the terminator
+	if (!tmp) {
+		GDEBUG_STREAM("Failed to allocate temporary space for string in url_encode" << std::endl);
+		return in;
+	}
+
+	char* optr = tmp;
+	char* iptr = (char*)in.c_str();
+
+	unsigned int counter = 0;
+	while (counter < in.size()) {
+		if (*iptr == ' ') {
+			*optr++ = '%';
+			*optr++ = '2';
+			*optr++ = '0';
+		} else if (*iptr == '\\') {
+			*optr++ = '/';
+		} else {
+			*optr++ = *iptr;
+		}
+		iptr++;
+		counter++;
+	}
+	*optr = '\0';
+
+	std::string ret(tmp);
+
+	delete [] tmp;
+
+	return ret;
+}
+}
+
+#endif //URLENCODE_H
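
For illustration, a minimal sketch of url_encode (the input string is made up; log.h must be on the include path for GDEBUG_STREAM):

    #include "url_encode.h"
    #include <iostream>

    int main()
    {
        std::string raw = "C:\\schemas\\ismrmrd file.xsd";   // illustrative input
        // prints: C:/schemas/ismrmrd%20file.xsd
        std::cout << Gadgetron::url_encode(raw) << std::endl;
        return 0;
    }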
diff --git a/toolboxes/core/cpu/image/hoNDImage.h b/toolboxes/core/cpu/image/hoNDImage.h
new file mode 100644
index 0000000..67955cc
--- /dev/null
+++ b/toolboxes/core/cpu/image/hoNDImage.h
@@ -0,0 +1,513 @@
+/** \file       hoNDImage.h
+    \brief      N-dimensional image class for gadgetron
+
+                The default N-dimensional image is defined by its origin (the first pixel, indexed by [0 0 0 ...]),
+                the pixel size and the axis for every coordinate. This defines a Euclidean space.
+
+                If this N-dimensional image is used with another coordinate system, e.g. a polar coordinate system, then the axes
+                should not be used to compute the image-to-world transformation.
+
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "hoNDPoint.h"
+#include "hoMatrix.h"
+#include "ismrmrd/meta.h"
+
+namespace Gadgetron
+{
+
+    template <typename T, unsigned int D>
+    class hoNDImage : public hoNDArray<T>
+    {
+    public:
+
+        typedef hoNDArray<T> BaseClass;
+        typedef hoNDImage<T, D> Self;
+
+        typedef T element_type;
+        typedef T value_type;
+        typedef float coord_type;
+
+        typedef hoNDPoint<coord_type, D> a_axis_type;
+        typedef std::vector<a_axis_type> axis_type;
+
+        typedef hoNDPoint<coord_type, 3> a_axis_image_patient_type;
+
+        enum { NDIM = D };
+
+        void* operator new (size_t bytes)
+        {
+            return ::new char[bytes];
+        }
+
+        void operator delete (void *ptr)
+        {
+            delete [] static_cast <char *> (ptr);
+        } 
+
+        void * operator new(size_t s, void * p)
+        {
+            return p;
+        }
+
+        /// constructors
+        hoNDImage ();
+        hoNDImage (const std::vector<size_t>& dimensions);
+        hoNDImage (boost::shared_ptr< std::vector<size_t> > dimensions);
+        hoNDImage (const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize);
+        hoNDImage (const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin);
+        hoNDImage (const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin, const axis_type& axis);
+
+        hoNDImage(size_t len);
+        hoNDImage(size_t sx, size_t sy);
+        hoNDImage(size_t sx, size_t sy, size_t sz);
+        hoNDImage(size_t sx, size_t sy, size_t sz, size_t st);
+        hoNDImage(size_t sx, size_t sy, size_t sz, size_t st, size_t sp);
+        hoNDImage(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq);
+        hoNDImage(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr);
+        hoNDImage(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss);
+
+        /// attach memory constructors
+        hoNDImage (const std::vector<size_t>& dimensions, T* data, bool delete_data_on_destruct = false);
+        hoNDImage (const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, T* data, bool delete_data_on_destruct = false);
+        hoNDImage (const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin, T* data, bool delete_data_on_destruct = false);
+        hoNDImage (const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin, const axis_type& axis, T* data, bool delete_data_on_destruct = false);
+
+        hoNDImage(size_t len, T* data, bool delete_data_on_destruct = false);
+        hoNDImage(size_t sx, size_t sy, T* data, bool delete_data_on_destruct = false);
+        hoNDImage(size_t sx, size_t sy, size_t sz, T* data, bool delete_data_on_destruct = false);
+        hoNDImage(size_t sx, size_t sy, size_t sz, size_t st, T* data, bool delete_data_on_destruct = false);
+        hoNDImage(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, T* data, bool delete_data_on_destruct = false);
+        hoNDImage(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, T* data, bool delete_data_on_destruct = false);
+        hoNDImage(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, T* data, bool delete_data_on_destruct = false);
+        hoNDImage(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, T* data, bool delete_data_on_destruct = false);
+
+        hoNDImage(const hoNDArray<T>& a);
+
+        hoNDImage(const Self& a);
+        Self& operator=(const Self& rhs);
+
+        virtual ~hoNDImage();
+
+        /// clear the image, release all memory it holds, and reset the pixel size/axis/origin to their default state
+        void clear();
+
+        /// create the image, called by constructors
+        virtual void create(const std::vector<size_t>& dimensions);
+        virtual void create(boost::shared_ptr< std::vector<size_t> > dimensions);
+        virtual void create(const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize);
+        virtual void create(const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin);
+        virtual void create(const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin, const axis_type& axis);
+
+        /// create the image from another image
+        /// without copying its content
+        template<typename T2> 
+        void createFrom(const hoNDImage<T2, D>& im)
+        {
+            this->clear();
+
+            std::vector<size_t> dim;
+            im.get_dimensions(dim);
+
+            std::vector<coord_type> pixelSize;
+            im.get_pixel_size(pixelSize);
+
+            std::vector<coord_type> origin;
+            im.get_origin(origin);
+
+            axis_type axis;
+            im.get_axis(axis);
+
+            this->create(dim, pixelSize, origin, axis);
+
+            this->attrib_ = im.attrib_;
+        }
+
+        /// create the image from another image,
+        /// copying its content
+        template<typename T2> 
+        void create(const hoNDImage<T2, D>& im)
+        {
+            this->createFrom(im);
+
+            size_t ii;
+            size_t N = this->get_number_of_elements();
+            for ( ii=0; ii<N; ii++ )
+            {
+                this->data_[ii] = static_cast<T>(im.get_data_ptr()[ii]);
+            }
+        }
+
+        template<typename T2> 
+        inline void copyImageInfo(const hoNDImage<T2, D>& im)
+        {
+            this->createFrom(im);
+        }
+
+        template<typename T2> 
+        inline void copyImageInfoAndContent(const hoNDImage<T2, D>& im)
+        {
+            this->create(im);
+        }
+
+        template<typename T2> 
+        inline void copyImageInfoWithoutImageSize(const hoNDImage<T2, D>& im)
+        {
+            std::vector<coord_type> pixelSize;
+            im.get_pixel_size(pixelSize);
+
+            std::vector<coord_type> origin;
+            im.get_origin(origin);
+
+            axis_type axis;
+            im.get_axis(axis);
+
+            this->set_pixel_size(pixelSize);
+            this->set_origin(origin);
+            this->set_axis(axis);
+
+            this->attrib_ = im.attrib_;
+        }
+
+        virtual void create(const std::vector<size_t>& dimensions,
+                            T* data, 
+                            bool delete_data_on_destruct = false);
+
+        virtual void create(const std::vector<size_t>& dimensions, 
+                            const std::vector<coord_type>& pixelSize, 
+                            T* data, 
+                            bool delete_data_on_destruct = false);
+
+        virtual void create(const std::vector<size_t>& dimensions, 
+                            const std::vector<coord_type>& pixelSize, 
+                            const std::vector<coord_type>& origin, 
+                            T* data, 
+                            bool delete_data_on_destruct = false);
+
+        virtual void create(const std::vector<size_t>& dimensions, 
+                            const std::vector<coord_type>& pixelSize, 
+                            const std::vector<coord_type>& origin, 
+                            const axis_type& axis, 
+                            T* data, 
+                            bool delete_data_on_destruct = false);
+
+        /// convert from/to hoNDArray
+        void from_NDArray(const hoNDArray<T>& a);
+        void to_NDArray(hoNDArray<T>& a) const;
+
+        /// whether two images have the same size
+        bool dimensions_equal(const std::vector<size_t>& dimensions) const;
+
+        template<class S> 
+        bool dimensions_equal(const hoNDArray<S>& im) const
+        {
+            std::vector<size_t> dim;
+            im.get_dimensions(dim);
+
+            return this->dimensions_equal(dim);
+        }
+
+        template<class S> 
+        bool dimensions_equal(const hoNDImage<S, D>& im) const
+        {
+            std::vector<size_t> dim;
+            im.get_dimensions(dim);
+
+            return this->dimensions_equal(dim);
+        }
+
+        template<class S> 
+        bool dimensions_equal(const hoNDImage<S, D>* im) const
+        {
+            return this->dimensions_equal(*im);
+        }
+
+        template<class S> 
+        bool pixel_size_equal(const hoNDImage<S, D>& im) const
+        {
+            unsigned int ii;
+            for ( ii=0; ii<D; ii++ )
+            {
+                if ( std::abs(this->pixelSize_[ii] - im.pixelSize_[ii]) > FLT_EPSILON ) return false;
+            }
+
+            return true;
+        }
+
+        template<class S> 
+        bool axis_equal(const hoNDImage<S, D>& im) const
+        {
+            unsigned int ii, jj;
+            for ( ii=0; ii<D; ii++ )
+            {
+                if ( this->axis_[ii] != im.axis_[ii] ) return false;
+            }
+
+            return true;
+        }
+
+        /// get the pixel size
+        coord_type get_pixel_size(size_t dimension) const;
+        void get_pixel_size(std::vector<coord_type>& pixelSize) const;
+
+        void set_pixel_size(size_t dimension, coord_type v);
+        void set_pixel_size(const std::vector<coord_type>& pixelSize);
+
+        /// get origin
+        coord_type get_origin(size_t dimension) const;
+        void get_origin(std::vector<coord_type>& origin) const;
+
+        void set_origin(size_t dimension, coord_type v);
+        void set_origin(const std::vector<coord_type>& origin);
+
+        /// get axis
+        coord_type get_axis(size_t dimension, size_t elem) const;
+        a_axis_type get_axis(size_t dimension) const;
+        void get_axis(axis_type& axis) const;
+
+        void set_axis(size_t dimension, size_t elem, coord_type v);
+        void set_axis(size_t dimension, const a_axis_type& v);
+        void set_axis(const axis_type& axis);
+
+        /// get image position patient
+        void get_image_position(coord_type pos[3]) const;
+        void get_image_position(unsigned int d, coord_type& pos) const;
+        void get_image_position(a_axis_image_patient_type& pos) const;
+
+        void set_image_position(coord_type pos[3]);
+        void set_image_position(unsigned int d, coord_type pos);
+        void set_image_position(const a_axis_image_patient_type& pos);
+
+        /// get image orientation patient
+        void get_image_orientation(unsigned int d, coord_type ori[3]) const;
+        void get_image_orientation(unsigned int d, a_axis_image_patient_type& ori) const;
+        
+        /// for dimension d and index ind
+        void get_image_orientation(unsigned int d, unsigned int ind, coord_type& ori) const;
+        /// get image orientation as a quaternion
+        void get_image_orientation(coord_type quat[4]) const;
+
+        void set_image_orientation(unsigned int d, coord_type ori[3]);
+        void set_image_orientation(unsigned int d, const a_axis_image_patient_type& ori);
+        void set_image_orientation(unsigned int d, unsigned int ind, coord_type ori);
+        void set_image_orientation(coord_type quat[4]);
+
+        size_t get_number_of_dimensions() const { return D; }
+
+        size_t calculate_offset(const size_t* ind) const;
+        size_t calculate_offset(const std::vector<size_t>& ind) const;
+
+        size_t calculate_offset(size_t x, size_t y) const;
+        size_t calculate_offset(size_t x, size_t y, size_t z) const;
+        size_t calculate_offset(size_t x, size_t y, size_t z, size_t s) const;
+        size_t calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p) const;
+        size_t calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r) const;
+        size_t calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a) const;
+        size_t calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q) const;
+        size_t calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u) const;
+
+        /// given the 1D offset, compute the corresponding indexes
+        std::vector<size_t> calculate_index( size_t offset ) const;
+        void calculate_index( size_t offset, size_t* index ) const;
+        void calculate_index( size_t offset, std::vector<size_t>& index ) const;
+        void calculate_index( size_t offset, coord_type* index ) const;
+        void calculate_index( size_t offset, size_t& x, size_t& y ) const;
+        void calculate_index( size_t offset, size_t& x, size_t& y, size_t& z ) const;
+        void calculate_index( size_t offset, size_t& x, size_t& y, size_t& z, size_t& s ) const;
+        void calculate_index( size_t offset, size_t& x, size_t& y, size_t& z, size_t& s, size_t& p ) const;
+        void calculate_index( size_t offset, size_t& x, size_t& y, size_t& z, size_t& s, size_t& p, size_t& r ) const;
+        void calculate_index( size_t offset, size_t& x, size_t& y, size_t& z, size_t& s, size_t& p, size_t& r, size_t& a ) const;
+        void calculate_index( size_t offset, size_t& x, size_t& y, size_t& z, size_t& s, size_t& p, size_t& r, size_t& a, size_t& q ) const;
+        void calculate_index( size_t offset, size_t& x, size_t& y, size_t& z, size_t& s, size_t& p, size_t& r, size_t& a, size_t& q, size_t& u ) const;
+
+        /// access the pixel value
+        T& operator()( const size_t* ind );
+        const T& operator()( const size_t* ind ) const;
+
+        T& operator()( const std::vector<size_t>& ind );
+        const T& operator()( const std::vector<size_t>& ind ) const;
+
+        T& operator[]( size_t x );
+        const T& operator[]( size_t x ) const;
+
+        T& operator()( size_t x );
+        const T& operator()( size_t x ) const;
+
+        T& operator()( size_t x, size_t y );
+        const T& operator()( size_t x, size_t y ) const;
+
+        T& operator()( size_t x, size_t y, size_t z );
+        const T& operator()( size_t x, size_t y, size_t z ) const;
+
+        T& operator()( size_t x, size_t y, size_t z, size_t s );
+        const T& operator()( size_t x, size_t y, size_t z, size_t s ) const;
+
+        T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p );
+        const T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p ) const;
+
+        T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r );
+        const T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r ) const;
+
+        T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a );
+        const T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a ) const;
+
+        T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q );
+        const T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q ) const;
+
+        T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u );
+        const T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u ) const;
+
+        /// fill the image with a value
+        void fill(T value);
+
+        template<typename T2> 
+        void copyFrom(const hoNDImage<T2, D>& aIm)
+        {
+            this->create(aIm);
+        }
+
+        /// image pixel index to world coordinate
+        void image_to_world(const coord_type* ind, coord_type* coord) const;
+        void image_to_world(const std::vector<coord_type>& ind, std::vector<coord_type>& coord) const;
+
+        void image_to_world(coord_type x, coord_type& cx) const;
+
+        void image_to_world(coord_type x, coord_type y, 
+                            coord_type& cx, coord_type& cy) const;
+
+        void image_to_world(coord_type x, coord_type y, coord_type z,
+                            coord_type& cx, coord_type& cy, coord_type& cz) const;
+
+        void image_to_world(coord_type x, coord_type y, coord_type z, coord_type s,
+                            coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs) const;
+
+        void image_to_world(coord_type x, coord_type y, coord_type z, coord_type s, coord_type p,
+                            coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs, coord_type& cp) const;
+
+        void image_to_world(coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r,
+                            coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs, coord_type& cp, coord_type& cr) const;
+
+        void image_to_world(coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a,
+                            coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs, coord_type& cp, coord_type& cr, coord_type& ca) const;
+
+        void image_to_world(coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a, coord_type q,
+                            coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs, coord_type& cp, coord_type& cr, coord_type& ca, coord_type& cq) const;
+
+        void image_to_world(coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a, coord_type q, coord_type u,
+                            coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs, coord_type& cp, coord_type& cr, coord_type& ca, coord_type& cq, coord_type& cu) const;
+
+        /// for integer pixel indexes
+        void image_to_world(const size_t* ind, coord_type* coord) const;
+        void image_to_world(const std::vector<size_t>& ind, std::vector<coord_type>& coord) const;
+
+        void image_to_world(size_t x, coord_type& cx) const;
+
+        void image_to_world(size_t x, size_t y, 
+                            coord_type& cx, coord_type& cy) const;
+
+        void image_to_world(size_t x, size_t y, size_t z,
+                            coord_type& cx, coord_type& cy, coord_type& cz) const;
+
+        void image_to_world(size_t x, size_t y, size_t z, size_t s,
+                            coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs) const;
+
+        void image_to_world(size_t x, size_t y, size_t z, size_t s, size_t p,
+                            coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs, coord_type& cp) const;
+
+        void image_to_world(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r,
+                            coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs, coord_type& cp, coord_type& cr) const;
+
+        void image_to_world(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a,
+                            coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs, coord_type& cp, coord_type& cr, coord_type& ca) const;
+
+        void image_to_world(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q,
+                            coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs, coord_type& cp, coord_type& cr, coord_type& ca, coord_type& cq) const;
+
+        void image_to_world(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u,
+                            coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs, coord_type& cp, coord_type& cr, coord_type& ca, coord_type& cq, coord_type& cu) const;
+
+        /// get the image-to-world transformation matrix
+        /// the homogeneous coordinate transformation matrix is computed
+        void image_to_world_matrix(hoMatrix<coord_type>& image2world) const;
+        void set_image_to_world_matrix(const hoMatrix<coord_type>& image2world);
+
+        /// world coordinate to image pixel index
+        void world_to_image(const coord_type* coord, coord_type* ind) const;
+        void world_to_image(const std::vector<coord_type>& coord, std::vector<coord_type>& ind) const;
+
+        void world_to_image(coord_type cx, coord_type& x) const;
+
+        void world_to_image(coord_type cx, coord_type cy, 
+                            coord_type& x, coord_type& y) const;
+
+        void world_to_image(coord_type cx, coord_type cy, coord_type cz,
+                            coord_type& x, coord_type& y, coord_type& z) const;
+
+        void world_to_image(coord_type cx, coord_type cy, coord_type cz, coord_type cs,
+                            coord_type& x, coord_type& y, coord_type& z, coord_type& s) const;
+
+        void world_to_image(coord_type cx, coord_type cy, coord_type cz, coord_type cs, coord_type cp,
+                            coord_type& x, coord_type& y, coord_type& z, coord_type& s, coord_type& p) const;
+
+        void world_to_image(coord_type cx, coord_type cy, coord_type cz, coord_type cs, coord_type cp, coord_type cr,
+                            coord_type& x, coord_type& y, coord_type& z, coord_type& s, coord_type& p, coord_type& r) const;
+
+        void world_to_image(coord_type cx, coord_type cy, coord_type cz, coord_type cs, coord_type cp, coord_type cr, coord_type ca,
+                            coord_type& x, coord_type& y, coord_type& z, coord_type& s, coord_type& p, coord_type& r, coord_type& a) const;
+
+        void world_to_image(coord_type cx, coord_type cy, coord_type cz, coord_type cs, coord_type cp, coord_type cr, coord_type ca, coord_type cq,
+                            coord_type& x, coord_type& y, coord_type& z, coord_type& s, coord_type& p, coord_type& r, coord_type& a, coord_type& q) const;
+
+        void world_to_image(coord_type cx, coord_type cy, coord_type cz, coord_type cs, coord_type cp, coord_type cr, coord_type ca, coord_type cq, coord_type cu,
+                            coord_type& x, coord_type& y, coord_type& z, coord_type& s, coord_type& p, coord_type& r, coord_type& a, coord_type& q, coord_type& u) const;
+
+        /// get the world_to_image transformation matrix
+        /// the homogeneous coordinate transformation matrix is computed
+        void world_to_image_matrix(hoMatrix<coord_type>& world2image) const;
+        void set_world_to_image_matrix(const hoMatrix<coord_type>& world2image);
+
+        /// check whether the sub-region lies within the image
+        bool in_image_region(const std::vector<size_t>& start, std::vector<size_t>& size);
+
+        /// get the sub image
+        void get_sub_image(const std::vector<size_t>& start, std::vector<size_t>& size, Self& out);
+
+        /// meta attributes
+        ISMRMRD::MetaContainer attrib_;
+
+        /// serialize/deserialize image content
+        virtual bool serializeImage(char*& buf, size_t& len) const;
+        virtual bool deserializeImage(char* buf, size_t& len);
+
+        /// serialize/deserialize image content and meta attributes
+        virtual bool serialize(char*& buf, size_t& len) const;
+        virtual bool deserialize(char* buf, size_t& len);
+
+        /// print out the image information
+        virtual void print(std::ostream& os) const;
+        virtual void printContent(std::ostream& os) const;
+
+    protected:
+
+        using BaseClass::dimensions_;
+        using BaseClass::offsetFactors_;
+        using BaseClass::data_;
+        using BaseClass::elements_;
+        using BaseClass::delete_data_on_destruct_;
+
+        coord_type pixelSize_[D];
+        coord_type pixelSize_reciprocal_[D];
+        coord_type origin_[D];
+        hoNDPoint<coord_type, D> axis_[D];
+
+        /// for the DICOM coordinate system
+        a_axis_image_patient_type image_position_patient_;
+        /// image orientation for row/column/slice directions
+        a_axis_image_patient_type image_orientation_patient_[3];
+    };
+}
+
+#include "hoNDImage.hxx"
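
A minimal sketch of the hoNDImage interface above: constructing a 2D image with a given pixel size and mapping a pixel index to world coordinates (sizes, pixel spacing and indices are illustrative; the class relies on hoNDArray and the ISMRMRD headers being available):

    #include "hoNDImage.h"
    #include <iostream>

    int main()
    {
        std::vector<size_t> dims(2);
        dims[0] = 128; dims[1] = 128;
        std::vector<float> pixel_size(2, 1.5f);              // 1.5 mm isotropic, illustrative

        Gadgetron::hoNDImage<float, 2> im(dims, pixel_size);
        im.fill(0.0f);
        im(64, 64) = 1.0f;                                   // set one pixel

        float cx = 0, cy = 0;
        im.image_to_world(size_t(64), size_t(64), cx, cy);   // integer-index overload
        std::cout << "world coordinate of (64,64): " << cx << ", " << cy << std::endl;
        return 0;
    }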
diff --git a/toolboxes/core/cpu/image/hoNDImage.hxx b/toolboxes/core/cpu/image/hoNDImage.hxx
new file mode 100644
index 0000000..916245d
--- /dev/null
+++ b/toolboxes/core/cpu/image/hoNDImage.hxx
@@ -0,0 +1,2955 @@
+/** \file       hoNDImage.hxx
+    \brief      Implementation of N-dimensional image class for gadgetron
+    \author     Hui Xue
+*/
+
+#include "hoNDImage.h"
+
+namespace Gadgetron
+{
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage () : BaseClass()
+    {
+        dimensions_->resize(D, 0);
+        offsetFactors_->resize(D, 0);
+
+        unsigned int ii;
+        for (ii=0;ii<D; ii++)
+        {
+            pixelSize_[ii] = 1;
+            pixelSize_reciprocal_[ii] = 1;
+            origin_[ii] = 0;
+            axis_[ii].fill(0);
+            axis_[ii][ii] = coord_type(1.0);
+        }
+
+        image_position_patient_[0] = 0;
+        image_position_patient_[1] = 0;
+        image_position_patient_[2] = 0;
+
+        image_orientation_patient_[0][0] = 1; image_orientation_patient_[0][1] = 0; image_orientation_patient_[0][2] = 0;
+        image_orientation_patient_[1][0] = 0; image_orientation_patient_[1][1] = 1; image_orientation_patient_[1][2] = 0;
+        image_orientation_patient_[2][0] = 0; image_orientation_patient_[2][1] = 0; image_orientation_patient_[2][2] = 1;
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage (const std::vector<size_t>& dimensions) : BaseClass( const_cast<std::vector<size_t>& >(dimensions) )
+    {
+        this->create(dimensions);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage (boost::shared_ptr< std::vector<size_t> > dimensions) : BaseClass( dimensions )
+    {
+        this->create( *dimensions );
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage (const std::vector<size_t>& dimensions, 
+        const std::vector<coord_type>& pixelSize) : BaseClass( const_cast<std::vector<size_t>& >(dimensions) )
+    {
+        this->create(dimensions, pixelSize);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage (const std::vector<size_t>& dimensions, 
+        const std::vector<coord_type>& pixelSize, 
+        const std::vector<coord_type>& origin) : BaseClass( const_cast<std::vector<size_t>& >(dimensions) )
+    {
+        this->create(dimensions, pixelSize, origin);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage (const std::vector<size_t>& dimensions, 
+                                const std::vector<coord_type>& pixelSize, 
+                                const std::vector<coord_type>& origin, 
+                                const axis_type& axis) : BaseClass( const_cast<std::vector<size_t>& >(dimensions) )
+    {
+        this->create(dimensions, pixelSize, origin, axis);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage(size_t len) : BaseClass(len)
+    {
+        std::vector<size_t> dimension(1, len);
+        this->create(dimension);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage(size_t sx, size_t sy) : BaseClass(sx, sy)
+    {
+        std::vector<size_t> dimension(2);
+        dimension[0] = sx;
+        dimension[1] = sy;
+        this->create(dimension);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage(size_t sx, size_t sy, size_t sz) : BaseClass(sx, sy, sz)
+    {
+        std::vector<size_t> dimension(3);
+        dimension[0] = sx;
+        dimension[1] = sy;
+        dimension[2] = sz;
+        this->create(dimension);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage(size_t sx, size_t sy, size_t sz, size_t st) : BaseClass(sx, sy, sz, st)
+    {
+        std::vector<size_t> dimension(4);
+        dimension[0] = sx;
+        dimension[1] = sy;
+        dimension[2] = sz;
+        dimension[3] = st;
+        this->create(dimension);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage(size_t sx, size_t sy, size_t sz, size_t st, size_t sp) : BaseClass(sx, sy, sz, st, sp)
+    {
+        std::vector<size_t> dimension(5);
+        dimension[0] = sx;
+        dimension[1] = sy;
+        dimension[2] = sz;
+        dimension[3] = st;
+        dimension[4] = sp;
+        this->create(dimension);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq) : BaseClass(sx, sy, sz, st, sp, sq)
+    {
+        std::vector<size_t> dimension(6);
+        dimension[0] = sx;
+        dimension[1] = sy;
+        dimension[2] = sz;
+        dimension[3] = st;
+        dimension[4] = sp;
+        dimension[5] = sq;
+        this->create(dimension);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr) : BaseClass(sx, sy, sz, st, sp, sq, sr)
+    {
+        std::vector<size_t> dimension(7);
+        dimension[0] = sx;
+        dimension[1] = sy;
+        dimension[2] = sz;
+        dimension[3] = st;
+        dimension[4] = sp;
+        dimension[5] = sq;
+        dimension[6] = sr;
+        this->create(dimension);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss) : BaseClass(sx, sy, sz, st, sp, sq, sr, ss)
+    {
+        std::vector<size_t> dimension(8);
+        dimension[0] = sx;
+        dimension[1] = sy;
+        dimension[2] = sz;
+        dimension[3] = st;
+        dimension[4] = sp;
+        dimension[5] = sq;
+        dimension[6] = sr;
+        dimension[7] = ss;
+        this->create(dimension);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage (const std::vector<size_t>& dimensions, T* data, bool delete_data_on_destruct) : BaseClass(dimensions, data, delete_data_on_destruct)
+    {
+        this->create(dimensions, data, delete_data_on_destruct);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage (const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, T* data, bool delete_data_on_destruct) : BaseClass(dimensions, data, delete_data_on_destruct)
+    {
+        this->create(dimensions, pixelSize, data, delete_data_on_destruct);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage (const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin, T* data, bool delete_data_on_destruct) : BaseClass(dimensions, data, delete_data_on_destruct)
+    {
+        this->create(dimensions, pixelSize, origin, data, delete_data_on_destruct);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage (const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin, const axis_type& axis, T* data, bool delete_data_on_destruct) : BaseClass(dimensions, data, delete_data_on_destruct)
+    {
+        this->create(dimensions, pixelSize, origin, axis, data, delete_data_on_destruct);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage(size_t len, T* data, bool delete_data_on_destruct) : BaseClass(len, data, delete_data_on_destruct)
+    {
+        std::vector<size_t> dimension(1, len);
+        this->create(dimension, data, delete_data_on_destruct);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage(size_t sx, size_t sy, T* data, bool delete_data_on_destruct) : BaseClass(sx, sy, data, delete_data_on_destruct)
+    {
+        std::vector<size_t> dimension(2);
+        dimension[0] = sx;
+        dimension[1] = sy;
+        this->create(dimension, data, delete_data_on_destruct);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage(size_t sx, size_t sy, size_t sz, T* data, bool delete_data_on_destruct) : BaseClass(sx, sy, sz, data, delete_data_on_destruct)
+    {
+        std::vector<size_t> dimension(3);
+        dimension[0] = sx;
+        dimension[1] = sy;
+        dimension[2] = sz;
+        this->create(dimension, data, delete_data_on_destruct);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage(size_t sx, size_t sy, size_t sz, size_t st, T* data, bool delete_data_on_destruct) : BaseClass(sx, sy, sz, st, data, delete_data_on_destruct)
+    {
+        std::vector<size_t> dimension(4);
+        dimension[0] = sx;
+        dimension[1] = sy;
+        dimension[2] = sz;
+        dimension[3] = st;
+        this->create(dimension, data, delete_data_on_destruct);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, T* data, bool delete_data_on_destruct) : BaseClass(sx, sy, sz, st, sp, data, delete_data_on_destruct)
+    {
+        std::vector<size_t> dimension(5);
+        dimension[0] = sx;
+        dimension[1] = sy;
+        dimension[2] = sz;
+        dimension[3] = st;
+        dimension[4] = sp;
+        this->create(dimension, data, delete_data_on_destruct);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, T* data, bool delete_data_on_destruct) : BaseClass(sx, sy, sz, st, sp, sq, data, delete_data_on_destruct)
+    {
+        std::vector<size_t> dimension(6);
+        dimension[0] = sx;
+        dimension[1] = sy;
+        dimension[2] = sz;
+        dimension[3] = st;
+        dimension[4] = sp;
+        dimension[5] = sq;
+        this->create(dimension, data, delete_data_on_destruct);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, T* data, bool delete_data_on_destruct) : BaseClass(sx, sy, sz, st, sp, sq, sr, data, delete_data_on_destruct)
+    {
+        std::vector<size_t> dimension(7);
+        dimension[0] = sx;
+        dimension[1] = sy;
+        dimension[2] = sz;
+        dimension[3] = st;
+        dimension[4] = sp;
+        dimension[5] = sq;
+        dimension[6] = sr;
+        this->create(dimension, data, delete_data_on_destruct);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, T* data, bool delete_data_on_destruct) : BaseClass(sx, sy, sz, st, sp, sq, sr, ss, data, delete_data_on_destruct)
+    {
+        std::vector<size_t> dimension(8);
+        dimension[0] = sx;
+        dimension[1] = sy;
+        dimension[2] = sz;
+        dimension[3] = st;
+        dimension[4] = sp;
+        dimension[5] = sq;
+        dimension[6] = sr;
+        dimension[7] = ss;
+        this->create(dimension, data, delete_data_on_destruct);
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage(const hoNDArray<T>& a) : BaseClass(a)
+    {
+         boost::shared_ptr< std::vector<size_t> > dim = a.get_dimensions();
+         this->create(*dim);
+         memcpy(this->data_, a.begin(), this->get_number_of_bytes());
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::hoNDImage(const Self& a) : BaseClass()
+    {
+        *this = a;
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>& hoNDImage<T, D>::operator=(const Self& rhs)
+    {
+        if ( &rhs == this ) return *this;
+
+        if ( rhs.get_number_of_elements() == 0 )
+        {
+            this->clear();
+            return *this;
+        }
+
+        if ( !this->dimensions_ ) this->dimensions_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+        if ( !this->offsetFactors_ ) this->offsetFactors_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+
+        if ( this->dimensions_equal(rhs) && this->data_!=NULL )
+        {
+            memcpy(this->data_, rhs.data_, rhs.elements_*sizeof(T));
+        }
+        else
+        {
+            this->deallocate_memory();
+            this->data_ = 0;
+
+            *(this->dimensions_) = *(rhs.dimensions_);
+            this->allocate_memory();
+            this->calculate_offset_factors( *(this->dimensions_) );
+            memcpy( this->data_, rhs.data_, this->elements_*sizeof(T) );
+        }
+
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            this->pixelSize_[ii] = rhs.pixelSize_[ii];
+            this->pixelSize_reciprocal_[ii] = rhs.pixelSize_reciprocal_[ii];
+            this->origin_[ii] = rhs.origin_[ii];
+            this->axis_[ii] = rhs.axis_[ii];
+        }
+
+        this->image_position_patient_ = rhs.image_position_patient_;
+        this->image_orientation_patient_[0] = rhs.image_orientation_patient_[0];
+        this->image_orientation_patient_[1] = rhs.image_orientation_patient_[1];
+        this->image_orientation_patient_[2] = rhs.image_orientation_patient_[2];
+
+        this->attrib_ = rhs.attrib_;
+
+        return *this;
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImage<T, D>::~hoNDImage()
+    {
+        if (this->delete_data_on_destruct_)
+        {
+            this->deallocate_memory();
+        }
+    }
+
+    template <typename T, unsigned int D> 
+    void hoNDImage<T, D>::clear()
+    {
+        if ( this->delete_data_on_destruct_ )
+        {
+            this->deallocate_memory();
+        }
+        this->data_ = 0;
+        this->elements_ = 0;
+        this->delete_data_on_destruct_ = true;
+
+        unsigned int ii;
+
+        dimensions_->clear();
+        offsetFactors_->clear();
+
+        for (ii=0;ii<D; ii++)
+        {
+            pixelSize_[ii] = 1;
+            pixelSize_reciprocal_[ii] = 1;
+            origin_[ii] = 0;
+            axis_[ii].fill(0);
+            axis_[ii][ii] = coord_type(1.0);
+        }
+
+        image_position_patient_[0] = 0;
+        image_position_patient_[1] = 0;
+        image_position_patient_[2] = 0;
+
+        image_orientation_patient_[0][0] = 1; image_orientation_patient_[0][1] = 0; image_orientation_patient_[0][2] = 0;
+        image_orientation_patient_[1][0] = 0; image_orientation_patient_[1][1] = 1; image_orientation_patient_[1][2] = 0;
+        image_orientation_patient_[2][0] = 0; image_orientation_patient_[2][1] = 0; image_orientation_patient_[2][2] = 1;
+
+        this->attrib_ = ISMRMRD::MetaContainer();
+    }
+
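+    // The create() overloads re-allocate storage only when the requested
+    // dimensions differ from the current ones; the geometry is always reset
+    // (unit pixel size, zero origin, identity axes and patient orientation),
+    // or taken from the supplied pixelSize/origin/axis arguments where given.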
+    template <typename T, unsigned int D> 
+    void hoNDImage<T, D>::create(const std::vector<size_t>& dimensions)
+    {
+        if ( !this->dimensions_equal(dimensions) )
+        {
+            if ( !dimensions_ )
+            {
+                dimensions_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+            }
+
+            if ( !offsetFactors_ )
+            {
+                offsetFactors_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+            }
+
+            *dimensions_ = dimensions;
+            this->allocate_memory();
+            this->calculate_offset_factors(dimensions);
+        }
+
+        unsigned int ii;
+        for (ii=0;ii<D; ii++)
+        {
+            pixelSize_[ii] = 1;
+            pixelSize_reciprocal_[ii] = 1;
+            origin_[ii] = 0;
+            axis_[ii].fill(0);
+            axis_[ii][ii] = coord_type(1.0);
+        }
+
+        image_position_patient_[0] = 0;
+        image_position_patient_[1] = 0;
+        image_position_patient_[2] = 0;
+
+        image_orientation_patient_[0][0] = 1; image_orientation_patient_[0][1] = 0; image_orientation_patient_[0][2] = 0;
+        image_orientation_patient_[1][0] = 0; image_orientation_patient_[1][1] = 1; image_orientation_patient_[1][2] = 0;
+        image_orientation_patient_[2][0] = 0; image_orientation_patient_[2][1] = 0; image_orientation_patient_[2][2] = 1;
+    }
+
+    template <typename T, unsigned int D> 
+    void hoNDImage<T, D>::create(boost::shared_ptr< std::vector<size_t> > dimensions)
+    {
+        this->create(*dimensions);
+    }
+
+    template <typename T, unsigned int D> 
+    void hoNDImage<T, D>::create(const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize)
+    {
+        if ( !this->dimensions_equal(dimensions) )
+        {
+            if ( !dimensions_ )
+            {
+                dimensions_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+            }
+
+            if ( !offsetFactors_ )
+            {
+                offsetFactors_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+            }
+
+            *dimensions_ = dimensions;
+            this->allocate_memory();
+            this->calculate_offset_factors(dimensions);
+        }
+
+        unsigned int ii;
+        for (ii=0;ii<D; ii++)
+        {
+            pixelSize_[ii] = pixelSize[ii];
+            pixelSize_reciprocal_[ii] = coord_type(1.0)/pixelSize_[ii];
+            origin_[ii] = 0;
+            axis_[ii].fill(0);
+            axis_[ii][ii] = coord_type(1.0);
+        }
+
+        image_position_patient_[0] = 0;
+        image_position_patient_[1] = 0;
+        image_position_patient_[2] = 0;
+
+        image_orientation_patient_[0][0] = 1; image_orientation_patient_[0][1] = 0; image_orientation_patient_[0][2] = 0;
+        image_orientation_patient_[1][0] = 0; image_orientation_patient_[1][1] = 1; image_orientation_patient_[1][2] = 0;
+        image_orientation_patient_[2][0] = 0; image_orientation_patient_[2][1] = 0; image_orientation_patient_[2][2] = 1;
+    }
+
+    template <typename T, unsigned int D> 
+    void hoNDImage<T, D>::create(const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin)
+    {
+        if ( !this->dimensions_equal(dimensions) )
+        {
+            if ( !dimensions_ )
+            {
+                dimensions_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+            }
+
+            if ( !offsetFactors_ )
+            {
+                offsetFactors_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+            }
+
+            *dimensions_ = dimensions;
+            this->allocate_memory();
+            this->calculate_offset_factors(dimensions);
+        }
+
+        unsigned int ii;
+        for (ii=0;ii<D; ii++)
+        {
+            pixelSize_[ii] = pixelSize[ii];
+            pixelSize_reciprocal_[ii] = coord_type(1.0)/pixelSize_[ii];
+            origin_[ii] = origin[ii];
+            axis_[ii].fill(0);
+            axis_[ii][ii] = coord_type(1.0);
+        }
+
+        image_position_patient_[0] = 0;
+        image_position_patient_[1] = 0;
+        image_position_patient_[2] = 0;
+
+        if ( D==1 )
+        {
+            image_position_patient_[0] = origin[0];
+        }
+        else if ( D == 2 )
+        {
+            image_position_patient_[0] = origin[0];
+            image_position_patient_[1] = origin[1];
+        }
+        else
+        {
+            image_position_patient_[0] = origin[0];
+            image_position_patient_[1] = origin[1];
+            image_position_patient_[2] = origin[2];
+        }
+
+        image_orientation_patient_[0][0] = 1; image_orientation_patient_[0][1] = 0; image_orientation_patient_[0][2] = 0;
+        image_orientation_patient_[1][0] = 0; image_orientation_patient_[1][1] = 1; image_orientation_patient_[1][2] = 0;
+        image_orientation_patient_[2][0] = 0; image_orientation_patient_[2][1] = 0; image_orientation_patient_[2][2] = 1;
+    }
+
+    template <typename T, unsigned int D> 
+    void hoNDImage<T, D>::create(const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin, const axis_type& axis)
+    {
+        if ( !this->dimensions_equal(dimensions) )
+        {
+            if ( !dimensions_ )
+            {
+                dimensions_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+            }
+
+            if ( !offsetFactors_ )
+            {
+                offsetFactors_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+            }
+
+            *dimensions_ = dimensions;
+            this->allocate_memory();
+            this->calculate_offset_factors(dimensions);
+        }
+
+        unsigned int ii;
+        for (ii=0;ii<D; ii++)
+        {
+            pixelSize_[ii] = pixelSize[ii];
+            pixelSize_reciprocal_[ii] = coord_type(1.0)/pixelSize_[ii];
+            origin_[ii] = origin[ii];
+            axis_[ii] = axis[ii];
+        }
+
+        image_position_patient_[0] = 0;
+        image_position_patient_[1] = 0;
+        image_position_patient_[2] = 0;
+
+        image_orientation_patient_[0][0] = 1; image_orientation_patient_[0][1] = 0; image_orientation_patient_[0][2] = 0;
+        image_orientation_patient_[1][0] = 0; image_orientation_patient_[1][1] = 1; image_orientation_patient_[1][2] = 0;
+        image_orientation_patient_[2][0] = 0; image_orientation_patient_[2][1] = 0; image_orientation_patient_[2][2] = 1;
+
+        if ( D==1 )
+        {
+            image_position_patient_[0] = origin[0];
+        }
+        else if ( D == 2 )
+        {
+            image_position_patient_[0] = origin[0];
+            image_position_patient_[1] = origin[1];
+        }
+        else
+        {
+            image_position_patient_[0] = origin[0];
+            image_position_patient_[1] = origin[1];
+            image_position_patient_[2] = origin[2];
+
+            image_orientation_patient_[0][0] = axis[0][0]; image_orientation_patient_[0][1] = axis[0][1]; image_orientation_patient_[0][2] = axis[0][2];
+            image_orientation_patient_[1][0] = axis[1][0]; image_orientation_patient_[1][1] = axis[1][1]; image_orientation_patient_[1][2] = axis[1][2];
+            image_orientation_patient_[2][0] = axis[2][0]; image_orientation_patient_[2][1] = axis[2][1]; image_orientation_patient_[2][2] = axis[2][2];
+        }
+    }
+
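+    // The following create() overloads wrap an externally supplied buffer: memory
+    // owned by the image is released first, data_ then points at the caller's
+    // buffer, and delete_data_on_destruct decides whether this image takes
+    // ownership of it.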
+    template <typename T, unsigned int D> 
+    void hoNDImage<T, D>::create(const std::vector<size_t>& dimensions, T* data, bool delete_data_on_destruct)
+    {
+        if ( this->delete_data_on_destruct_ )
+        {
+            this->deallocate_memory();
+            this->data_ = NULL;
+        }
+
+        this->data_ = data;
+        this->delete_data_on_destruct_ = delete_data_on_destruct;
+
+        if ( !dimensions_ )
+        {
+            dimensions_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+        }
+
+        if ( !offsetFactors_ )
+        {
+            offsetFactors_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+        }
+
+        *dimensions_ = dimensions;
+
+        unsigned int ii;
+
+        this->elements_ = 1;
+        for (ii=0; ii<D; ii++)
+        {
+            this->elements_ *= (*dimensions_)[ii];
+        }
+        this->calculate_offset_factors(dimensions);
+
+        for (ii=0;ii<D; ii++)
+        {
+            pixelSize_[ii] = 1.0;
+            pixelSize_reciprocal_[ii] = 1.0;
+            origin_[ii] = 0;
+            axis_[ii].fill(0);
+            axis_[ii][ii] = coord_type(1.0);
+        }
+
+        image_position_patient_[0] = 0;
+        image_position_patient_[1] = 0;
+        image_position_patient_[2] = 0;
+
+        image_orientation_patient_[0][0] = 1; image_orientation_patient_[0][1] = 0; image_orientation_patient_[0][2] = 0;
+        image_orientation_patient_[1][0] = 0; image_orientation_patient_[1][1] = 1; image_orientation_patient_[1][2] = 0;
+        image_orientation_patient_[2][0] = 0; image_orientation_patient_[2][1] = 0; image_orientation_patient_[2][2] = 1;
+    }
+
+    template <typename T, unsigned int D> 
+    void hoNDImage<T, D>::create(const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, T* data, bool delete_data_on_destruct)
+    {
+        if ( this->delete_data_on_destruct_ )
+        {
+            this->deallocate_memory();
+            this->data_ = NULL;
+        }
+
+        this->data_ = data;
+        this->delete_data_on_destruct_ = delete_data_on_destruct;
+
+        if ( !dimensions_ )
+        {
+            dimensions_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+        }
+
+        if ( !offsetFactors_ )
+        {
+            offsetFactors_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+        }
+
+        *dimensions_ = dimensions;
+
+        unsigned int ii;
+
+        this->elements_ = 1;
+        for (ii=0; ii<D; ii++)
+        {
+            this->elements_ *= (*dimensions_)[ii];
+        }
+        this->calculate_offset_factors(dimensions);
+
+        for (ii=0;ii<D; ii++)
+        {
+            pixelSize_[ii] = pixelSize[ii];
+            pixelSize_reciprocal_[ii] = coord_type(1.0)/pixelSize_[ii];
+            origin_[ii] = 0;
+            axis_[ii].fill(0);
+            axis_[ii][ii] = coord_type(1.0);
+        }
+
+        image_position_patient_[0] = 0;
+        image_position_patient_[1] = 0;
+        image_position_patient_[2] = 0;
+
+        image_orientation_patient_[0][0] = 1; image_orientation_patient_[0][1] = 0; image_orientation_patient_[0][2] = 0;
+        image_orientation_patient_[1][0] = 0; image_orientation_patient_[1][1] = 1; image_orientation_patient_[1][2] = 0;
+        image_orientation_patient_[2][0] = 0; image_orientation_patient_[2][1] = 0; image_orientation_patient_[2][2] = 1;
+    }
+
+    template <typename T, unsigned int D> 
+    void hoNDImage<T, D>::create(const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin, T* data, bool delete_data_on_destruct)
+    {
+        if ( this->delete_data_on_destruct_ )
+        {
+            this->deallocate_memory();
+            this->data_ = NULL;
+        }
+
+        this->data_ = data;
+        this->delete_data_on_destruct_ = delete_data_on_destruct;
+
+        if ( !dimensions_ )
+        {
+            dimensions_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+        }
+
+        if ( !offsetFactors_ )
+        {
+            offsetFactors_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+        }
+
+        *dimensions_ = dimensions;
+
+        unsigned int ii;
+
+        this->elements_ = 1;
+        for (ii=0; ii<D; ii++)
+        {
+            this->elements_ *= (*dimensions_)[ii];
+        }
+        this->calculate_offset_factors(dimensions);
+
+        for (ii=0;ii<D; ii++)
+        {
+            pixelSize_[ii] = pixelSize[ii];
+            pixelSize_reciprocal_[ii] = coord_type(1.0)/pixelSize_[ii];
+            origin_[ii] = origin[ii];
+            axis_[ii].fill(0);
+            axis_[ii][ii] = coord_type(1.0);
+        }
+
+        image_position_patient_[0] = 0;
+        image_position_patient_[1] = 0;
+        image_position_patient_[2] = 0;
+
+        if ( D==1 )
+        {
+            image_position_patient_[0] = origin[0];
+        }
+        else if ( D == 2 )
+        {
+            image_position_patient_[0] = origin[0];
+            image_position_patient_[1] = origin[1];
+        }
+        else
+        {
+            image_position_patient_[0] = origin[0];
+            image_position_patient_[1] = origin[1];
+            image_position_patient_[2] = origin[2];
+        }
+
+        image_orientation_patient_[0][0] = 1; image_orientation_patient_[0][1] = 0; image_orientation_patient_[0][2] = 0;
+        image_orientation_patient_[1][0] = 0; image_orientation_patient_[1][1] = 1; image_orientation_patient_[1][2] = 0;
+        image_orientation_patient_[2][0] = 0; image_orientation_patient_[2][1] = 0; image_orientation_patient_[2][2] = 1;
+    }
+
+    template <typename T, unsigned int D> 
+    void hoNDImage<T, D>::create(const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin, const axis_type& axis, T* data, bool delete_data_on_destruct)
+    {
+        if ( this->delete_data_on_destruct_ )
+        {
+            this->deallocate_memory();
+            this->data_ = NULL;
+        }
+
+        this->data_ = data;
+        this->delete_data_on_destruct_ = delete_data_on_destruct;
+
+        if ( !dimensions_ )
+        {
+            dimensions_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+        }
+
+        if ( !offsetFactors_ )
+        {
+            offsetFactors_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+        }
+
+        *dimensions_ = dimensions;
+
+        unsigned int ii;
+
+        this->elements_ = 1;
+        for (ii=0; ii<D; ii++)
+        {
+            this->elements_ *= (*dimensions_)[ii];
+        }
+        this->calculate_offset_factors(dimensions);
+
+        for (ii=0;ii<D; ii++)
+        {
+            pixelSize_[ii] = pixelSize[ii];
+            pixelSize_reciprocal_[ii] = coord_type(1.0)/pixelSize_[ii];
+            origin_[ii] = origin[ii];
+            axis_[ii] = axis[ii];
+        }
+
+        image_position_patient_[0] = 0;
+        image_position_patient_[1] = 0;
+        image_position_patient_[2] = 0;
+
+        image_orientation_patient_[0][0] = 1; image_orientation_patient_[0][1] = 0; image_orientation_patient_[0][2] = 0;
+        image_orientation_patient_[1][0] = 0; image_orientation_patient_[1][1] = 1; image_orientation_patient_[1][2] = 0;
+        image_orientation_patient_[2][0] = 0; image_orientation_patient_[2][1] = 0; image_orientation_patient_[2][2] = 1;
+
+        if ( D==1 )
+        {
+            image_position_patient_[0] = origin[0];
+        }
+        else if ( D == 2 )
+        {
+            image_position_patient_[0] = origin[0];
+            image_position_patient_[1] = origin[1];
+        }
+        else
+        {
+            image_position_patient_[0] = origin[0];
+            image_position_patient_[1] = origin[1];
+            image_position_patient_[2] = origin[2];
+
+            image_orientation_patient_[0][0] = axis[0][0]; image_orientation_patient_[0][1] = axis[0][1]; image_orientation_patient_[0][2] = axis[0][2];
+            image_orientation_patient_[1][0] = axis[1][0]; image_orientation_patient_[1][1] = axis[1][1]; image_orientation_patient_[1][2] = axis[1][2];
+            image_orientation_patient_[2][0] = axis[2][0]; image_orientation_patient_[2][1] = axis[2][1]; image_orientation_patient_[2][2] = axis[2][2];
+        }
+    }
+
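+    // Copies pixel data from an hoNDArray. If the array has fewer than D
+    // dimensions the missing ones are treated as size 1; if it has more, only the
+    // leading D dimensions are kept, so only the corresponding leading portion of
+    // the array data is copied.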
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::from_NDArray(const hoNDArray<T>& a)
+    {
+        boost::shared_ptr< std::vector<size_t> > dim = a.get_dimensions();
+
+        size_t ii;
+
+        if ( dim->size() < D )
+        {
+            std::vector<size_t> dimUsed(D, 1);
+            for ( ii=0; ii<dim->size(); ii++ )
+            {
+                dimUsed[ii] = (*dim)[ii];
+            }
+
+            if ( !this->dimensions_equal(dimUsed) )
+            {
+                this->create(dimUsed);
+            }
+        }
+        else if ( dim->size() > D )
+        {
+            std::vector<size_t> dimUsed(D, 1);
+            for ( ii=0; ii<D; ii++ )
+            {
+                dimUsed[ii] = (*dim)[ii];
+            }
+
+            if ( !this->dimensions_equal(dimUsed) )
+            {
+                this->create(dimUsed);
+            }
+        }
+        else
+        {
+            if ( !this->dimensions_equal(*dim) )
+            {
+                this->create(*dim);
+            }
+        }
+
+        memcpy(this->data_, a.begin(), this->get_number_of_bytes());
+    }
+
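+    // Exports the pixel data into an hoNDArray with the same dimensions,
+    // re-creating the array only when its dimensions differ.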
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::to_NDArray(hoNDArray<T>& a) const
+    {
+        std::vector<size_t> dim;
+        this->get_dimensions(dim);
+
+        if ( !a.dimensions_equal(&dim) )
+        {
+            a.create(&dim);
+        }
+
+        memcpy(a.begin(), this->data_, a.get_number_of_bytes());
+    }
+
+    template <typename T, unsigned int D> 
+    inline bool hoNDImage<T, D>::dimensions_equal(const std::vector<size_t>& dimensions) const
+    {
+        if ( (!dimensions_) || (dimensions.size() != D) || ( dimensions_->size() != dimensions.size() ) ) return false;
+
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            if ( (*dimensions_)[ii] != dimensions[ii] ) return false;
+        }
+
+        return true;
+    }
+
+    template <typename T, unsigned int D> 
+    inline typename hoNDImage<T, D>::coord_type hoNDImage<T, D>::get_pixel_size(size_t dimension) const
+    {
+        GADGET_DEBUG_CHECK_THROW(dimension < D);
+        return this->pixelSize_[dimension];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::get_pixel_size(std::vector<coord_type>& pixelSize) const
+    {
+        pixelSize.resize(D);
+        memcpy(&pixelSize[0], this->pixelSize_, sizeof(coord_type)*D);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::set_pixel_size(size_t dimension, coord_type v)
+    {
+        GADGET_DEBUG_CHECK_THROW(dimension < D);
+        this->pixelSize_[dimension] = v;
+        this->pixelSize_reciprocal_[dimension] = coord_type(1.0)/v;
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::set_pixel_size(const std::vector<coord_type>& pixelSize)
+    {
+        GADGET_DEBUG_CHECK_THROW(pixelSize.size() >= D);
+        memcpy(this->pixelSize_, &pixelSize[0], sizeof(coord_type)*D);
+
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            this->pixelSize_reciprocal_[ii] = coord_type(1.0)/this->pixelSize_[ii];
+        }
+    }
+
+    template <typename T, unsigned int D> 
+    inline typename hoNDImage<T, D>::coord_type hoNDImage<T, D>::get_origin(size_t dimension) const
+    {
+        GADGET_DEBUG_CHECK_THROW(dimension < D);
+        return this->origin_[dimension];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::get_origin(std::vector<coord_type>& origin) const
+    {
+        origin.resize(D);
+        memcpy(&origin[0], this->origin_, sizeof(coord_type)*D);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::set_origin(size_t dimension, coord_type v)
+    {
+        GADGET_DEBUG_CHECK_THROW(dimension < D);
+        this->origin_[dimension] = v;
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::set_origin(const std::vector<coord_type>& origin)
+    {
+        GADGET_DEBUG_CHECK_THROW(origin.size() >= D);
+        memcpy(this->origin_, &origin[0], sizeof(coord_type)*D);
+    }
+
+    template <typename T, unsigned int D> 
+    inline typename hoNDImage<T, D>::coord_type hoNDImage<T, D>::get_axis(size_t dimension, size_t elem) const
+    {
+        GADGET_DEBUG_CHECK_THROW(dimension<D && elem<D);
+        return this->axis_[dimension][elem];
+    }
+
+    template <typename T, unsigned int D> 
+    inline typename hoNDImage<T, D>::a_axis_type hoNDImage<T, D>::get_axis(size_t dimension) const
+    {
+        GADGET_DEBUG_CHECK_THROW(dimension < D);
+        return this->axis_[dimension];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::get_axis(axis_type& axis) const
+    {
+        axis.resize(D);
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            axis[ii] = this->axis_[ii];
+        }
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::set_axis(size_t dimension, size_t elem, coord_type v)
+    {
+        GADGET_DEBUG_CHECK_THROW(dimension<D && elem<D);
+        this->axis_[dimension][elem] = v;
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::set_axis(size_t dimension, const a_axis_type& v)
+    {
+        GADGET_DEBUG_CHECK_THROW(dimension < D);
+        this->axis_[dimension] = v;
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::set_axis(const axis_type& axis)
+    {
+        GADGET_DEBUG_CHECK_THROW(axis.size() >= D);
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            this->axis_[ii] = axis[ii];
+        }
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::get_image_position(coord_type pos[3]) const 
+    {
+        pos[0] = image_position_patient_[0];
+        pos[1] = image_position_patient_[1];
+        pos[2] = image_position_patient_[2];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::get_image_position(unsigned int d, coord_type& pos) const 
+    {
+        GADGET_DEBUG_CHECK_THROW(d<3);
+        pos = image_position_patient_[d];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::get_image_position(a_axis_image_patient_type& pos) const 
+    {
+        pos = image_position_patient_;
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::set_image_position(coord_type pos[3])
+    {
+        image_position_patient_[0] = pos[0];
+        image_position_patient_[1] = pos[1];
+        image_position_patient_[2] = pos[2];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::set_image_position(unsigned int d, coord_type pos)
+    {
+        GADGET_DEBUG_CHECK_THROW(d<3);
+        image_position_patient_[d] = pos;
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::set_image_position(const a_axis_image_patient_type& pos)
+    {
+        image_position_patient_ = pos;
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::get_image_orientation(unsigned int d, coord_type ori[3]) const 
+    {
+        GADGET_DEBUG_CHECK_THROW(d<3);
+        ori[0] = image_orientation_patient_[d][0];
+        ori[1] = image_orientation_patient_[d][1];
+        ori[2] = image_orientation_patient_[d][2];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::get_image_orientation(unsigned int d, a_axis_image_patient_type& ori) const 
+    {
+        GADGET_DEBUG_CHECK_THROW(d<3);
+        ori = image_orientation_patient_[d];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::get_image_orientation(unsigned int d, unsigned int ind, coord_type& ori) const 
+    {
+        GADGET_DEBUG_CHECK_THROW(d<3);
+        GADGET_DEBUG_CHECK_THROW(ind<3);
+        ori = image_orientation_patient_[d][ind];
+    }
+
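+    // Converts the three stored direction-cosine vectors
+    // (image_orientation_patient_) into a unit quaternion (a, b, c, d). If the
+    // implied rotation has a negative determinant the third direction is negated
+    // first; the conversion itself follows the trace-based formula referenced in
+    // the link below.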
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::get_image_orientation(coord_type quat[4]) const 
+    {
+        coord_type r11 = image_orientation_patient_[0][0], r12 = image_orientation_patient_[1][0], r13 = image_orientation_patient_[2][0];
+        coord_type r21 = image_orientation_patient_[0][1], r22 = image_orientation_patient_[1][1], r23 = image_orientation_patient_[2][1];
+        coord_type r31 = image_orientation_patient_[0][2], r32 = image_orientation_patient_[1][2], r33 = image_orientation_patient_[2][2];
+
+        double a = 1, b = 0, c = 0, d = 0, s = 0;
+        double trace = 0;
+        double xd, yd, zd;
+
+        /* verify the sign of the rotation*/
+        coord_type deti = (r11 * r22 * r33) + (r12 * r23 * r31) + (r21 * r32 * r13) -
+            (r13 * r22 * r31) - (r12 * r21 * r33) - (r11 * r23 * r32);
+
+        if (deti < 0)
+        {
+            /* flip 3rd column */
+            r13 = -r13;
+            r23 = -r23;
+            r33 = -r33;
+        }
+
+        /* Compute quaternion parameters */
+        /* http://www.cs.princeton.edu/~gewang/projects/darth/stuff/quat_faq.html#Q55 */
+        trace = 1.0l + r11 + r22 + r33;
+        if (trace > 0.00001l)
+        {                /* simplest case */
+            s = std::sqrt(trace) * 2;
+            a = (r32 - r23) / s;
+            b = (r13 - r31) / s;
+            c = (r21 - r12) / s;
+            d = 0.25l * s;
+        }
+        else
+        {
+            /* trickier case...
+             * determine which major diagonal element has
+             * the greatest value... */
+            xd = 1.0 + r11 - (r22 + r33);  /* 4**b**b */
+            yd = 1.0 + r22 - (r11 + r33);  /* 4**c**c */
+            zd = 1.0 + r33 - (r11 + r22);  /* 4**d**d */
+            /* if r11 is the greatest */
+            if (xd > 1.0)
+            {
+                s = 2.0 * std::sqrt(xd);
+                a = 0.25l * s;
+                b = (r21 + r12) / s;
+                c = (r31 + r13) / s;
+                d = (r32 - r23) / s;
+            }
+            /* else if r22 is the greatest */
+            else if (yd > 1.0)
+            {
+                s = 2.0 * std::sqrt(yd);
+                a = (r21 + r12) / s;
+                b = 0.25l * s;
+                c = (r32 + r23) / s;
+                d = (r13 - r31) / s;
+            }
+            /* else, r33 must be the greatest */
+            else
+            {
+                s = 2.0 * std::sqrt(zd);
+                a = (r13 + r31) / s;
+                b = (r23 + r32) / s;
+                c = 0.25l * s;
+                d = (r21 - r12) / s;
+            }
+
+            if (a < 0.0l)
+            {
+                b = -b;
+                c = -c;
+                d = -d;
+                a = -a;
+            }
+        }
+
+        quat[0] = (coord_type)a; 
+        quat[1] = (coord_type)b; 
+        quat[2] = (coord_type)c; 
+        quat[3] = (coord_type)d;
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::set_image_orientation(unsigned int d, coord_type ori[3])
+    {
+        GADGET_DEBUG_CHECK_THROW(d<3);
+        image_orientation_patient_[d][0] = ori[0];
+        image_orientation_patient_[d][1] = ori[1];
+        image_orientation_patient_[d][2] = ori[2];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::set_image_orientation(unsigned int d, const a_axis_image_patient_type& ori)
+    {
+        GADGET_DEBUG_CHECK_THROW(d<3);
+        image_orientation_patient_[d] = ori;
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::set_image_orientation(unsigned int d, unsigned int ind, coord_type ori)
+    {
+        GADGET_DEBUG_CHECK_THROW(d<3);
+        GADGET_DEBUG_CHECK_THROW(ind<3);
+        image_orientation_patient_[d][ind] = ori;
+    }
+
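+    // Inverse of the quaternion getter above: rebuilds the direction-cosine
+    // vectors from a quaternion (a, b, c, d). The input is assumed to already be
+    // a unit quaternion; no normalisation is performed here.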
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::set_image_orientation(coord_type quat[4])
+    {
+        coord_type a = quat[0], b = quat[1], c = quat[2], d = quat[3];
+
+        image_orientation_patient_[0][0] = 1 - 2*( b*b + c*c );
+        image_orientation_patient_[1][0] = 2*( a*b - c*d );
+        image_orientation_patient_[2][0] = 2*( a*c + b*d );
+
+        image_orientation_patient_[0][1] = 2*( a*b + c*d );
+        image_orientation_patient_[1][1] = 1 - 2*( a*a + c*c );
+        image_orientation_patient_[2][1] = 2*( b*c - a*d );
+
+        image_orientation_patient_[0][2] = 2*( a*c - b*d );
+        image_orientation_patient_[1][2] = 2*( b*c + a*d );
+        image_orientation_patient_[2][2] = 1 - 2*( a*a + b*b );
+    }
+
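+    // Linear indexing: offsetFactors_[i] (filled in by calculate_offset_factors)
+    // is expected to hold the product of the first i dimensions, so the offset of
+    // an index vector is ind[0] + ind[1]*dim[0] + ind[2]*dim[0]*dim[1] + ...
+    // with the first dimension varying fastest. The fixed-arity overloads below
+    // are shortcuts for the common 2D..9D cases.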
+    template <typename T, unsigned int D> 
+    inline size_t hoNDImage<T, D>::calculate_offset(const size_t* ind) const
+    {
+        GADGET_DEBUG_CHECK_THROW(ind!=NULL);
+
+        size_t offset = ind[0];
+        for( size_t i = 1; i < D; i++ )
+            offset += ind[i] * (*offsetFactors_)[i];
+        return offset;
+    }
+
+    template <typename T, unsigned int D> 
+    inline size_t hoNDImage<T, D>::calculate_offset(const std::vector<size_t>& ind) const
+    {
+        return this->calculate_offset(&ind[0]);
+    }
+
+    template <typename T, unsigned int D> 
+    inline size_t hoNDImage<T, D>::calculate_offset(size_t x, size_t y) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==2);
+        return x + y * (*offsetFactors_)[1];
+    }
+
+    template <typename T, unsigned int D> 
+    inline size_t hoNDImage<T, D>::calculate_offset(size_t x, size_t y, size_t z) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==3);
+        return x + (y * (*offsetFactors_)[1]) + (z * (*offsetFactors_)[2]);
+    }
+
+    template <typename T, unsigned int D> 
+    inline size_t hoNDImage<T, D>::calculate_offset(size_t x, size_t y, size_t z, size_t s) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==4);
+        return x + (y * (*offsetFactors_)[1]) + (z * (*offsetFactors_)[2]) + (s * (*offsetFactors_)[3]);
+    }
+
+    template <typename T, unsigned int D> 
+    inline size_t hoNDImage<T, D>::calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==5);
+        return x + (y * (*offsetFactors_)[1]) + (z * (*offsetFactors_)[2]) + (s * (*offsetFactors_)[3]) + (p * (*offsetFactors_)[4]);
+    }
+
+    template <typename T, unsigned int D> 
+    inline size_t hoNDImage<T, D>::calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==6);
+        return x + (y * (*offsetFactors_)[1]) + (z * (*offsetFactors_)[2]) + (s * (*offsetFactors_)[3]) + (p * (*offsetFactors_)[4]) + (r * (*offsetFactors_)[5]);
+    }
+
+    template <typename T, unsigned int D> 
+    inline size_t hoNDImage<T, D>::calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==7);
+        return x + (y * (*offsetFactors_)[1]) + (z * (*offsetFactors_)[2]) + (s * (*offsetFactors_)[3]) + (p * (*offsetFactors_)[4]) + (r * (*offsetFactors_)[5]) + (a * (*offsetFactors_)[6]);
+    }
+
+    template <typename T, unsigned int D> 
+    inline size_t hoNDImage<T, D>::calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==8);
+        return x + (y * (*offsetFactors_)[1]) + (z * (*offsetFactors_)[2]) + (s * (*offsetFactors_)[3]) + (p * (*offsetFactors_)[4]) + (r * (*offsetFactors_)[5]) + (a * (*offsetFactors_)[6]) + (q * (*offsetFactors_)[7]);
+    }
+
+    template <typename T, unsigned int D> 
+    inline size_t hoNDImage<T, D>::calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==9);
+        return x + (y * (*offsetFactors_)[1]) + (z * (*offsetFactors_)[2]) + (s * (*offsetFactors_)[3]) + (p * (*offsetFactors_)[4]) + (r * (*offsetFactors_)[5]) + (a * (*offsetFactors_)[6]) + (q * (*offsetFactors_)[7]) + (u * (*offsetFactors_)[8]);
+    }
+
+    template <typename T, unsigned int D> 
+    inline std::vector<size_t> hoNDImage<T, D>::calculate_index( size_t offset ) const
+    {
+        std::vector<size_t> index(D, 0);
+        this->calculate_index(offset, index);
+        return index;
+    }
+
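+    // Inverse of calculate_offset(): recovers the multi-dimensional index from a
+    // linear offset by repeated division by the offset factors, starting from the
+    // slowest-varying dimension.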
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::calculate_index( size_t offset, size_t* index ) const
+    {
+        GADGET_DEBUG_CHECK_THROW(index!=NULL);
+
+        unsigned int i;
+        for( i=D-1; i>0; i-- )
+        {
+            index[i] = offset / (*offsetFactors_)[i];
+            offset %= (*offsetFactors_)[i];
+        }
+        index[0] = offset;
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::calculate_index( size_t offset, std::vector<size_t>& index ) const
+    {
+        index.resize(D, 0);
+        this->calculate_index(offset, &index[0]);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::calculate_index( size_t offset, coord_type* index ) const
+    {
+        unsigned int i;
+        for( i=D-1; i>0; i-- )
+        {
+            index[i] = offset / (*offsetFactors_)[i];
+            offset %= (*offsetFactors_)[i];
+        }
+        index[0] = offset;
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::calculate_index( size_t offset, size_t& x, size_t& y ) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==2);
+        y = offset / (*offsetFactors_)[1];
+        x = offset % (*offsetFactors_)[1];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::calculate_index( size_t offset, size_t& x, size_t& y, size_t& z ) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==3);
+
+        z = offset / (*offsetFactors_)[2];
+        offset %= (*offsetFactors_)[2];
+
+        y = offset / (*offsetFactors_)[1];
+        x = offset % (*offsetFactors_)[1];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::calculate_index( size_t offset, size_t& x, size_t& y, size_t& z, size_t& s ) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==4);
+
+        s = offset / (*offsetFactors_)[3];
+        offset %= (*offsetFactors_)[3];
+
+        z = offset / (*offsetFactors_)[2];
+        offset %= (*offsetFactors_)[2];
+
+        y = offset / (*offsetFactors_)[1];
+        x = offset % (*offsetFactors_)[1];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::calculate_index( size_t offset, size_t& x, size_t& y, size_t& z, size_t& s, size_t& p ) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==5);
+
+        p = offset / (*offsetFactors_)[4];
+        offset %= (*offsetFactors_)[4];
+
+        s = offset / (*offsetFactors_)[3];
+        offset %= (*offsetFactors_)[3];
+
+        z = offset / (*offsetFactors_)[2];
+        offset %= (*offsetFactors_)[2];
+
+        y = offset / (*offsetFactors_)[1];
+        x = offset % (*offsetFactors_)[1];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::calculate_index( size_t offset, size_t& x, size_t& y, size_t& z, size_t& s, size_t& p, size_t& r ) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==6);
+
+        r = offset / (*offsetFactors_)[5];
+        offset %= (*offsetFactors_)[5];
+
+        p = offset / (*offsetFactors_)[4];
+        offset %= (*offsetFactors_)[4];
+
+        s = offset / (*offsetFactors_)[3];
+        offset %= (*offsetFactors_)[3];
+
+        z = offset / (*offsetFactors_)[2];
+        offset %= (*offsetFactors_)[2];
+
+        y = offset / (*offsetFactors_)[1];
+        x = offset % (*offsetFactors_)[1];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::calculate_index( size_t offset, size_t& x, size_t& y, size_t& z, size_t& s, size_t& p, size_t& r, size_t& a ) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==7);
+
+        a = offset / (*offsetFactors_)[6];
+        offset %= (*offsetFactors_)[6];
+
+        r = offset / (*offsetFactors_)[5];
+        offset %= (*offsetFactors_)[5];
+
+        p = offset / (*offsetFactors_)[4];
+        offset %= (*offsetFactors_)[4];
+
+        s = offset / (*offsetFactors_)[3];
+        offset %= (*offsetFactors_)[3];
+
+        z = offset / (*offsetFactors_)[2];
+        offset %= (*offsetFactors_)[2];
+
+        y = offset / (*offsetFactors_)[1];
+        x = offset % (*offsetFactors_)[1];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::calculate_index( size_t offset, size_t& x, size_t& y, size_t& z, size_t& s, size_t& p, size_t& r, size_t& a, size_t& q ) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==8);
+
+        q = offset / (*offsetFactors_)[7];
+        offset %= (*offsetFactors_)[7];
+
+        a = offset / (*offsetFactors_)[6];
+        offset %= (*offsetFactors_)[6];
+
+        r = offset / (*offsetFactors_)[5];
+        offset %= (*offsetFactors_)[5];
+
+        p = offset / (*offsetFactors_)[4];
+        offset %= (*offsetFactors_)[4];
+
+        s = offset / (*offsetFactors_)[3];
+        offset %= (*offsetFactors_)[3];
+
+        z = offset / (*offsetFactors_)[2];
+        offset %= (*offsetFactors_)[2];
+
+        y = offset / (*offsetFactors_)[1];
+        x = offset % (*offsetFactors_)[1];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::calculate_index( size_t offset, size_t& x, size_t& y, size_t& z, size_t& s, size_t& p, size_t& r, size_t& a, size_t& q, size_t& u ) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==9);
+
+        u = offset / (*offsetFactors_)[8];
+        offset %= (*offsetFactors_)[8];
+
+        q = offset / (*offsetFactors_)[7];
+        offset %= (*offsetFactors_)[7];
+
+        a = offset / (*offsetFactors_)[6];
+        offset %= (*offsetFactors_)[6];
+
+        r = offset / (*offsetFactors_)[5];
+        offset %= (*offsetFactors_)[5];
+
+        p = offset / (*offsetFactors_)[4];
+        offset %= (*offsetFactors_)[4];
+
+        s = offset / (*offsetFactors_)[3];
+        offset %= (*offsetFactors_)[3];
+
+        z = offset / (*offsetFactors_)[2];
+        offset %= (*offsetFactors_)[2];
+
+        y = offset / (*offsetFactors_)[1];
+        x = offset % (*offsetFactors_)[1];
+    }
+
+    template <typename T, unsigned int D> 
+    inline T& hoNDImage<T, D>::operator()( const size_t* ind )
+    {
+        size_t idx = this->calculate_offset(ind);
+        GADGET_DEBUG_CHECK_THROW(idx < this->elements_);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D> 
+    inline const T& hoNDImage<T, D>::operator()( const size_t* ind ) const
+    {
+        size_t idx = this->calculate_offset(ind);
+        GADGET_DEBUG_CHECK_THROW(idx < this->elements_);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D> 
+    inline T& hoNDImage<T, D>::operator()( const std::vector<size_t>& ind )
+    {
+        size_t idx = this->calculate_offset(ind);
+        GADGET_DEBUG_CHECK_THROW(idx < this->elements_);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D> 
+    inline const T& hoNDImage<T, D>::operator()( const std::vector<size_t>& ind ) const
+    {
+        size_t idx = this->calculate_offset(ind);
+        GADGET_DEBUG_CHECK_THROW(idx < this->elements_);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D> 
+    inline T& hoNDImage<T, D>::operator[]( size_t x )
+    {
+        GADGET_DEBUG_CHECK_THROW(x < this->elements_);
+        return this->data_[x];
+    }
+
+    template <typename T, unsigned int D> 
+    inline const T& hoNDImage<T, D>::operator[]( size_t x ) const
+    {
+        GADGET_DEBUG_CHECK_THROW(x < this->elements_);
+        return this->data_[x];
+    }
+
+    template <typename T, unsigned int D> 
+    inline T& hoNDImage<T, D>::operator()( size_t x )
+    {
+        GADGET_DEBUG_CHECK_THROW(x < this->elements_);
+        return this->data_[x];
+    }
+
+    template <typename T, unsigned int D> 
+    inline const T& hoNDImage<T, D>::operator()( size_t x ) const
+    {
+        GADGET_DEBUG_CHECK_THROW(x < this->elements_);
+        return this->data_[x];
+    }
+
+    template <typename T, unsigned int D> 
+    inline T& hoNDImage<T, D>::operator()( size_t x, size_t y )
+    {
+        size_t idx = this->calculate_offset(x, y);
+        GADGET_DEBUG_CHECK_THROW(idx < this->elements_);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D> 
+    inline const T& hoNDImage<T, D>::operator()( size_t x, size_t y ) const
+    {
+        size_t idx = this->calculate_offset(x, y);
+        GADGET_DEBUG_CHECK_THROW(idx < this->elements_);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D> 
+    inline T& hoNDImage<T, D>::operator()( size_t x, size_t y, size_t z )
+    {
+        size_t idx = this->calculate_offset(x, y, z);
+        GADGET_DEBUG_CHECK_THROW(idx < this->elements_);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D> 
+    inline const T& hoNDImage<T, D>::operator()( size_t x, size_t y, size_t z ) const
+    {
+        size_t idx = this->calculate_offset(x, y, z);
+        GADGET_DEBUG_CHECK_THROW(idx < this->elements_);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D> 
+    inline T& hoNDImage<T, D>::operator()( size_t x, size_t y, size_t z, size_t s )
+    {
+        size_t idx = this->calculate_offset(x, y, z, s);
+        GADGET_DEBUG_CHECK_THROW(idx < this->elements_);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D> 
+    inline const T& hoNDImage<T, D>::operator()( size_t x, size_t y, size_t z, size_t s ) const
+    {
+        size_t idx = this->calculate_offset(x, y, z, s);
+        GADGET_DEBUG_CHECK_THROW(idx < this->elements_);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D> 
+    inline T& hoNDImage<T, D>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p )
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p);
+        GADGET_DEBUG_CHECK_THROW(idx < this->elements_);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D> 
+    inline const T& hoNDImage<T, D>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p ) const
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p);
+        GADGET_DEBUG_CHECK_THROW(idx < this->elements_);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D> 
+    inline T& hoNDImage<T, D>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r )
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r);
+        GADGET_DEBUG_CHECK_THROW(idx < this->elements_);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D> 
+    inline const T& hoNDImage<T, D>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r ) const
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r);
+        GADGET_DEBUG_CHECK_THROW(idx < this->elements_);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D> 
+    inline T& hoNDImage<T, D>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a )
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r, a);
+        GADGET_DEBUG_CHECK_THROW(idx < this->elements_);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D> 
+    inline const T& hoNDImage<T, D>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a ) const
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r, a);
+        GADGET_DEBUG_CHECK_THROW(idx < this->elements_);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D> 
+    inline T& hoNDImage<T, D>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q )
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r, a, q);
+        GADGET_DEBUG_CHECK_THROW(idx < this->elements_);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D> 
+    inline const T& hoNDImage<T, D>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q ) const
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r, a, q);
+        GADGET_DEBUG_CHECK_THROW(idx < this->elements_);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D> 
+    inline T& hoNDImage<T, D>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u )
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r, a, q, u);
+        GADGET_DEBUG_CHECK_THROW(idx < this->elements_);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D> 
+    inline const T& hoNDImage<T, D>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u ) const
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r, a, q, u);
+        GADGET_DEBUG_CHECK_THROW(idx < this->elements_);
+        return this->data_[idx];
+    }
+
+    template <typename T, unsigned int D> 
+    void hoNDImage<T, D>::fill(T value)
+    {
+        std::fill(this->get_data_ptr(), this->get_data_ptr()+this->get_number_of_elements(), value);
+    }
+
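+    // image_to_world maps a (possibly fractional) pixel index to world
+    // coordinates: each component is scaled by the pixel size, the scaled vector
+    // is rotated through the axis_ direction vectors and the origin_ is added,
+    // i.e. world = origin_ + sum_j axis_[j] * (ind[j] * pixelSize_[j]).
+    // The fixed-arity overloads below unroll the same formula for small D.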
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world(const coord_type* ind, coord_type* coord) const
+    {
+        unsigned int ii, jj;
+        for(ii=0; ii<D; ii++)
+        {
+            coord[ii] = 0;
+
+            for(jj=0; jj<D; jj++)
+            {
+                coord[ii] += this->axis_[jj][ii] * ( ind[jj] * this->pixelSize_[jj] );
+            }
+
+            coord[ii] += this->origin_[ii];
+        }
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world(const std::vector<coord_type>& ind, std::vector<coord_type>& coord) const
+    {
+        GADGET_DEBUG_CHECK_THROW(ind.size() >= D);
+
+        if ( coord.size() < D ) coord.resize(D);
+
+        this->image_to_world(&ind[0], &coord[0]);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world(coord_type x, coord_type& cx) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==1);
+        cx = this->axis_[0][0] * ( x * this->pixelSize_[0] ) + this->origin_[0];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world(coord_type x, coord_type y, coord_type& cx, coord_type& cy) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==2);
+
+        coord_type sx = x*this->pixelSize_[0];
+        coord_type sy = y*this->pixelSize_[1];
+
+        cx =    this->axis_[0][0] * sx 
+              + this->axis_[1][0] * sy 
+              + this->origin_[0];
+
+        cy =    this->axis_[0][1] * sx 
+              + this->axis_[1][1] * sy 
+              + this->origin_[1];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world(coord_type x, coord_type y, coord_type z, coord_type& cx, coord_type& cy, coord_type& cz) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==3);
+
+        coord_type sx = x*this->pixelSize_[0];
+        coord_type sy = y*this->pixelSize_[1];
+        coord_type sz = z*this->pixelSize_[2];
+
+        cx =    (this->axis_[0][0] * sx 
+              + this->axis_[1][0] * sy) 
+              + (this->axis_[2][0] * sz 
+              + this->origin_[0]);
+
+        cy =    (this->axis_[0][1] * sx 
+              + this->axis_[1][1] * sy) 
+              + (this->axis_[2][1] * sz 
+              + this->origin_[1]);
+
+        cz =    (this->axis_[0][2] * sx 
+              + this->axis_[1][2] * sy) 
+              + (this->axis_[2][2] * sz 
+              + this->origin_[2]);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world(coord_type x, coord_type y, coord_type z, coord_type s, coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==4);
+
+        coord_type sx = x*this->pixelSize_[0];
+        coord_type sy = y*this->pixelSize_[1];
+        coord_type sz = z*this->pixelSize_[2];
+        coord_type ss = s*this->pixelSize_[3];
+
+        cx =    (this->axis_[0][0] * sx 
+              + this->axis_[1][0] * sy) 
+              + (this->axis_[2][0] * sz 
+              + this->axis_[3][0] * ss) 
+              + this->origin_[0];
+
+        cy =    (this->axis_[0][1] * sx 
+              + this->axis_[1][1] * sy) 
+              + (this->axis_[2][1] * sz 
+              + this->axis_[3][1] * ss) 
+              + this->origin_[1];
+
+        cz =    (this->axis_[0][2] * sx 
+              + this->axis_[1][2] * sy) 
+              + (this->axis_[2][2] * sz 
+              + this->axis_[3][2] * ss) 
+              + this->origin_[2];
+
+        cs =    (this->axis_[0][3] * sx 
+              + this->axis_[1][3] * sy) 
+              + (this->axis_[2][3] * sz 
+              + this->axis_[3][3] * ss) 
+              + this->origin_[3];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world(coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs, coord_type& cp) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==5);
+
+        coord_type sx = x*this->pixelSize_[0];
+        coord_type sy = y*this->pixelSize_[1];
+        coord_type sz = z*this->pixelSize_[2];
+        coord_type ss = s*this->pixelSize_[3];
+        coord_type sp = p*this->pixelSize_[4];
+
+        cx =    (this->axis_[0][0] * sx 
+              + this->axis_[1][0] * sy) 
+              + (this->axis_[2][0] * sz 
+              + this->axis_[3][0] * ss) 
+              + (this->axis_[4][0] * sp 
+              + this->origin_[0]);
+
+        cy =    (this->axis_[0][1] * sx 
+              + this->axis_[1][1] * sy) 
+              + (this->axis_[2][1] * sz 
+              + this->axis_[3][1] * ss) 
+              + (this->axis_[4][1] * sp 
+              + this->origin_[1]);
+
+        cz =    (this->axis_[0][2] * sx 
+              + this->axis_[1][2] * sy) 
+              + (this->axis_[2][2] * sz 
+              + this->axis_[3][2] * ss) 
+              + (this->axis_[4][2] * sp 
+              + this->origin_[2]);
+
+        cs =    (this->axis_[0][3] * sx 
+              + this->axis_[1][3] * sy) 
+              + (this->axis_[2][3] * sz 
+              + this->axis_[3][3] * ss) 
+              + (this->axis_[4][3] * sp 
+              + this->origin_[3]);
+
+        cp =    (this->axis_[0][4] * sx 
+              + this->axis_[1][4] * sy) 
+              + (this->axis_[2][4] * sz 
+              + this->axis_[3][4] * ss) 
+              + (this->axis_[4][4] * sp 
+              + this->origin_[4]);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world(coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs, coord_type& cp, coord_type& cr) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==6);
+
+        coord_type sx = x*this->pixelSize_[0];
+        coord_type sy = y*this->pixelSize_[1];
+        coord_type sz = z*this->pixelSize_[2];
+        coord_type ss = s*this->pixelSize_[3];
+        coord_type sp = p*this->pixelSize_[4];
+        coord_type sr = r*this->pixelSize_[5];
+
+        cx =    (this->axis_[0][0] * sx 
+              + this->axis_[1][0] * sy) 
+              + (this->axis_[2][0] * sz 
+              + this->axis_[3][0] * ss) 
+              + (this->axis_[4][0] * sp 
+              + this->axis_[5][0] * sr) 
+              + this->origin_[0];
+
+        cy =    (this->axis_[0][1] * sx 
+              + this->axis_[1][1] * sy) 
+              + (this->axis_[2][1] * sz 
+              + this->axis_[3][1] * ss) 
+              + (this->axis_[4][1] * sp 
+              + this->axis_[5][1] * sr) 
+              + this->origin_[1];
+
+        cz =    (this->axis_[0][2] * sx 
+              + this->axis_[1][2] * sy) 
+              + (this->axis_[2][2] * sz 
+              + this->axis_[3][2] * ss) 
+              + (this->axis_[4][2] * sp 
+              + this->axis_[5][2] * sr) 
+              + this->origin_[2];
+
+        cs =    (this->axis_[0][3] * sx 
+              + this->axis_[1][3] * sy) 
+              + (this->axis_[2][3] * sz 
+              + this->axis_[3][3] * ss) 
+              + (this->axis_[4][3] * sp 
+              + this->axis_[5][3] * sr) 
+              + this->origin_[3];
+
+        cp =    (this->axis_[0][4] * sx 
+              + this->axis_[1][4] * sy) 
+              + (this->axis_[2][4] * sz 
+              + this->axis_[3][4] * ss) 
+              + (this->axis_[4][4] * sp 
+              + this->axis_[5][4] * sr) 
+              + this->origin_[4];
+
+        cr =    (this->axis_[0][5] * sx 
+              + this->axis_[1][5] * sy) 
+              + (this->axis_[2][5] * sz 
+              + this->axis_[3][5] * ss) 
+              + (this->axis_[4][5] * sp 
+              + this->axis_[5][5] * sr) 
+              + this->origin_[5];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world(coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a, coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs, coord_type& cp, coord_type& cr, coord_type& ca) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==7);
+
+        coord_type sx = x*this->pixelSize_[0];
+        coord_type sy = y*this->pixelSize_[1];
+        coord_type sz = z*this->pixelSize_[2];
+        coord_type ss = s*this->pixelSize_[3];
+        coord_type sp = p*this->pixelSize_[4];
+        coord_type sr = r*this->pixelSize_[5];
+        coord_type sa = a*this->pixelSize_[6];
+
+        cx =    (this->axis_[0][0] * sx 
+              + this->axis_[1][0] * sy) 
+              + (this->axis_[2][0] * sz 
+              + this->axis_[3][0] * ss) 
+              + (this->axis_[4][0] * sp 
+              + this->axis_[5][0] * sr) 
+              + (this->axis_[6][0] * sa 
+              + this->origin_[0]);
+
+        cy =    (this->axis_[0][1] * sx 
+              + this->axis_[1][1] * sy) 
+              + (this->axis_[2][1] * sz 
+              + this->axis_[3][1] * ss) 
+              + (this->axis_[4][1] * sp 
+              + this->axis_[5][1] * sr) 
+              + (this->axis_[6][1] * sa 
+              + this->origin_[1]);
+
+        cz =    (this->axis_[0][2] * sx 
+              + this->axis_[1][2] * sy) 
+              + (this->axis_[2][2] * sz 
+              + this->axis_[3][2] * ss) 
+              + (this->axis_[4][2] * sp 
+              + this->axis_[5][2] * sr) 
+              + (this->axis_[6][2] * sa 
+              + this->origin_[2]);
+
+        cs =    (this->axis_[0][3] * sx 
+              + this->axis_[1][3] * sy) 
+              + (this->axis_[2][3] * sz 
+              + this->axis_[3][3] * ss) 
+              + (this->axis_[4][3] * sp 
+              + this->axis_[5][3] * sr) 
+              + (this->axis_[6][3] * sa 
+              + this->origin_[3]);
+
+        cp =    (this->axis_[0][4] * sx 
+              + this->axis_[1][4] * sy) 
+              + (this->axis_[2][4] * sz 
+              + this->axis_[3][4] * ss) 
+              + (this->axis_[4][4] * sp 
+              + this->axis_[5][4] * sr) 
+              + (this->axis_[6][4] * sa 
+              + this->origin_[4]);
+
+        cr =    (this->axis_[0][5] * sx 
+              + this->axis_[1][5] * sy) 
+              + (this->axis_[2][5] * sz 
+              + this->axis_[3][5] * ss) 
+              + (this->axis_[4][5] * sp 
+              + this->axis_[5][5] * sr) 
+              + (this->axis_[6][5] * sa 
+              + this->origin_[5]);
+
+        ca =    (this->axis_[0][6] * sx 
+              + this->axis_[1][6] * sy) 
+              + (this->axis_[2][6] * sz 
+              + this->axis_[3][6] * ss) 
+              + (this->axis_[4][6] * sp 
+              + this->axis_[5][6] * sr) 
+              + (this->axis_[6][6] * sa 
+              + this->origin_[6]);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world(coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a, coord_type q, coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs, coord_type& cp, coord_type& cr, coord_type& ca, coord_type& cq) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==8);
+
+        coord_type sx = x*this->pixelSize_[0];
+        coord_type sy = y*this->pixelSize_[1];
+        coord_type sz = z*this->pixelSize_[2];
+        coord_type ss = s*this->pixelSize_[3];
+        coord_type sp = p*this->pixelSize_[4];
+        coord_type sr = r*this->pixelSize_[5];
+        coord_type sa = a*this->pixelSize_[6];
+        coord_type sq = q*this->pixelSize_[7];
+
+        cx =    (this->axis_[0][0] * sx 
+              + this->axis_[1][0] * sy) 
+              + (this->axis_[2][0] * sz 
+              + this->axis_[3][0] * ss) 
+              + (this->axis_[4][0] * sp 
+              + this->axis_[5][0] * sr) 
+              + (this->axis_[6][0] * sa 
+              + this->axis_[7][0] * sq) 
+              + this->origin_[0];
+
+        cy =    (this->axis_[0][1] * sx 
+              + this->axis_[1][1] * sy) 
+              + (this->axis_[2][1] * sz 
+              + this->axis_[3][1] * ss) 
+              + (this->axis_[4][1] * sp 
+              + this->axis_[5][1] * sr) 
+              + (this->axis_[6][1] * sa 
+              + this->axis_[7][1] * sq) 
+              + this->origin_[1];
+
+        cz =    (this->axis_[0][2] * sx 
+              + this->axis_[1][2] * sy) 
+              + (this->axis_[2][2] * sz 
+              + this->axis_[3][2] * ss) 
+              + (this->axis_[4][2] * sp 
+              + this->axis_[5][2] * sr) 
+              + (this->axis_[6][2] * sa 
+              + this->axis_[7][2] * sq) 
+              + this->origin_[2];
+
+        cs =    (this->axis_[0][3] * sx 
+              + this->axis_[1][3] * sy) 
+              + (this->axis_[2][3] * sz 
+              + this->axis_[3][3] * ss) 
+              + (this->axis_[4][3] * sp 
+              + this->axis_[5][3] * sr) 
+              + (this->axis_[6][3] * sa 
+              + this->axis_[7][3] * sq) 
+              + this->origin_[3];
+
+        cp =    (this->axis_[0][4] * sx 
+              + this->axis_[1][4] * sy) 
+              + (this->axis_[2][4] * sz 
+              + this->axis_[3][4] * ss) 
+              + (this->axis_[4][4] * sp 
+              + this->axis_[5][4] * sr) 
+              + (this->axis_[6][4] * sa 
+              + this->axis_[7][4] * sq) 
+              + this->origin_[4];
+
+        cr =    (this->axis_[0][5] * sx 
+              + this->axis_[1][5] * sy) 
+              + (this->axis_[2][5] * sz 
+              + this->axis_[3][5] * ss) 
+              + (this->axis_[4][5] * sp 
+              + this->axis_[5][5] * sr) 
+              + (this->axis_[6][5] * sa 
+              + this->axis_[7][5] * sq) 
+              + this->origin_[5];
+
+        ca =    (this->axis_[0][6] * sx 
+              + this->axis_[1][6] * sy) 
+              + (this->axis_[2][6] * sz 
+              + this->axis_[3][6] * ss) 
+              + (this->axis_[4][6] * sp 
+              + this->axis_[5][6] * sr) 
+              + (this->axis_[6][6] * sa 
+              + this->axis_[7][6] * sq) 
+              + this->origin_[6];
+
+        cq =    (this->axis_[0][7] * sx 
+              + this->axis_[1][7] * sy) 
+              + (this->axis_[2][7] * sz 
+              + this->axis_[3][7] * ss) 
+              + (this->axis_[4][7] * sp 
+              + this->axis_[5][7] * sr) 
+              + (this->axis_[6][7] * sa 
+              + this->axis_[7][7] * sq) 
+              + this->origin_[7];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world(coord_type x, coord_type y, coord_type z, coord_type s, coord_type p, coord_type r, coord_type a, coord_type q, coord_type u, coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs, coord_type& cp, coord_type& cr, coord_type& ca, coord_type& cq, coord_type& cu) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==9);
+
+        coord_type sx = x*this->pixelSize_[0];
+        coord_type sy = y*this->pixelSize_[1];
+        coord_type sz = z*this->pixelSize_[2];
+        coord_type ss = s*this->pixelSize_[3];
+        coord_type sp = p*this->pixelSize_[4];
+        coord_type sr = r*this->pixelSize_[5];
+        coord_type sa = a*this->pixelSize_[6];
+        coord_type sq = q*this->pixelSize_[7];
+        coord_type su = u*this->pixelSize_[8];
+
+        cx =    (this->axis_[0][0] * sx 
+              + this->axis_[1][0] * sy) 
+              + (this->axis_[2][0] * sz 
+              + this->axis_[3][0] * ss) 
+              + (this->axis_[4][0] * sp 
+              + this->axis_[5][0] * sr) 
+              + (this->axis_[6][0] * sa 
+              + this->axis_[7][0] * sq) 
+              + (this->axis_[8][0] * su 
+              + this->origin_[0]);
+
+        cy =    (this->axis_[0][1] * sx 
+              + this->axis_[1][1] * sy) 
+              + (this->axis_[2][1] * sz 
+              + this->axis_[3][1] * ss) 
+              + (this->axis_[4][1] * sp 
+              + this->axis_[5][1] * sr) 
+              + (this->axis_[6][1] * sa 
+              + this->axis_[7][1] * sq) 
+              + (this->axis_[8][1] * su 
+              + this->origin_[1]);
+
+        cz =    (this->axis_[0][2] * sx 
+              + this->axis_[1][2] * sy) 
+              + (this->axis_[2][2] * sz 
+              + this->axis_[3][2] * ss) 
+              + (this->axis_[4][2] * sp 
+              + this->axis_[5][2] * sr) 
+              + (this->axis_[6][2] * sa 
+              + this->axis_[7][2] * sq) 
+              + (this->axis_[8][2] * su 
+              + this->origin_[2]);
+
+        cs =    (this->axis_[0][3] * sx 
+              + this->axis_[1][3] * sy) 
+              + (this->axis_[2][3] * sz 
+              + this->axis_[3][3] * ss) 
+              + (this->axis_[4][3] * sp 
+              + this->axis_[5][3] * sr) 
+              + (this->axis_[6][3] * sa 
+              + this->axis_[7][3] * sq) 
+              + (this->axis_[8][3] * su 
+              + this->origin_[3]);
+
+        cp =    (this->axis_[0][4] * sx 
+              + this->axis_[1][4] * sy) 
+              + (this->axis_[2][4] * sz 
+              + this->axis_[3][4] * ss) 
+              + (this->axis_[4][4] * sp 
+              + this->axis_[5][4] * sr) 
+              + (this->axis_[6][4] * sa 
+              + this->axis_[7][4] * sq) 
+              + (this->axis_[8][4] * su 
+              + this->origin_[4]);
+
+        cr =    (this->axis_[0][5] * sx 
+              + this->axis_[1][5] * sy) 
+              + (this->axis_[2][5] * sz 
+              + this->axis_[3][5] * ss) 
+              + (this->axis_[4][5] * sp 
+              + this->axis_[5][5] * sr) 
+              + (this->axis_[6][5] * sa 
+              + this->axis_[7][5] * sq) 
+              + (this->axis_[8][5] * su 
+              + this->origin_[5]);
+
+        ca =    (this->axis_[0][6] * sx 
+              + this->axis_[1][6] * sy) 
+              + (this->axis_[2][6] * sz 
+              + this->axis_[3][6] * ss) 
+              + (this->axis_[4][6] * sp 
+              + this->axis_[5][6] * sr) 
+              + (this->axis_[6][6] * sa 
+              + this->axis_[7][6] * sq) 
+              + (this->axis_[8][6] * su 
+              + this->origin_[6]);
+
+        cq =    (this->axis_[0][7] * sx 
+              + this->axis_[1][7] * sy) 
+              + (this->axis_[2][7] * sz 
+              + this->axis_[3][7] * ss) 
+              + (this->axis_[4][7] * sp 
+              + this->axis_[5][7] * sr) 
+              + (this->axis_[6][7] * sa 
+              + this->axis_[7][7] * sq) 
+              + (this->axis_[8][7] * su 
+              + this->origin_[7]);
+
+        cu =    (this->axis_[0][8] * sx 
+              + this->axis_[1][8] * sy) 
+              + (this->axis_[2][8] * sz 
+              + this->axis_[3][8] * ss) 
+              + (this->axis_[4][8] * sp 
+              + this->axis_[5][8] * sr) 
+              + (this->axis_[6][8] * sa 
+              + this->axis_[7][8] * sq) 
+              + (this->axis_[8][8] * su 
+              + this->origin_[8]);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world(const size_t* ind, coord_type* coord) const
+    {
+        unsigned int ii, jj;
+        for(ii=0; ii<D; ii++)
+        {
+            coord[ii] = 0;
+
+            for(jj=0; jj<D; jj++)
+            {
+                coord[ii] += this->axis_[jj][ii] * ( ind[jj] * this->pixelSize_[jj] );
+            }
+
+            coord[ii] += this->origin_[ii];
+        }
+    }
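+
+    // Editorial sketch (not part of the upstream source): the generic overload above computes
+    // coord[i] = sum_j axis_[j][i] * (ind[j] * pixelSize_[j]) + origin_[i]. As an illustration,
+    // assume D==2, identity axes, pixelSize_ = {1.5, 1.5} and origin_ = {10, 20} (made-up values):
+    //
+    //     size_t     ind[2]   = {4, 2};
+    //     coord_type coord[2] = {0, 0};
+    //     im.image_to_world(ind, coord);   // coord == {4*1.5 + 10, 2*1.5 + 20} == {16, 23}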
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world(const std::vector<size_t>& ind, std::vector<coord_type>& coord) const
+    {
+        GADGET_DEBUG_CHECK_THROW(ind.size() >= D);
+
+        if ( coord.size() < D ) coord.resize(D);
+
+        this->image_to_world(&ind[0], &coord[0]);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world(size_t x, coord_type& cx) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==1);
+        cx = this->axis_[0][0] * ( x * this->pixelSize_[0] ) + this->origin_[0];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world(size_t x, size_t y, coord_type& cx, coord_type& cy) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==2);
+
+        coord_type sx = x*this->pixelSize_[0];
+        coord_type sy = y*this->pixelSize_[1];
+
+        cx =    this->axis_[0][0] * sx 
+              + this->axis_[1][0] * sy 
+              + this->origin_[0];
+
+        cy =    this->axis_[0][1] * sx 
+              + this->axis_[1][1] * sy 
+              + this->origin_[1];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world(size_t x, size_t y, size_t z, coord_type& cx, coord_type& cy, coord_type& cz) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==3);
+
+        coord_type sx = x*this->pixelSize_[0];
+        coord_type sy = y*this->pixelSize_[1];
+        coord_type sz = z*this->pixelSize_[2];
+
+        cx =    (this->axis_[0][0] * sx 
+              + this->axis_[1][0] * sy) 
+              + (this->axis_[2][0] * sz 
+              + this->origin_[0]);
+
+        cy =    (this->axis_[0][1] * sx 
+              + this->axis_[1][1] * sy) 
+              + (this->axis_[2][1] * sz 
+              + this->origin_[1]);
+
+        cz =    (this->axis_[0][2] * sx 
+              + this->axis_[1][2] * sy) 
+              + (this->axis_[2][2] * sz 
+              + this->origin_[2]);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world(size_t x, size_t y, size_t z, size_t s, coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==4);
+
+        coord_type sx = x*this->pixelSize_[0];
+        coord_type sy = y*this->pixelSize_[1];
+        coord_type sz = z*this->pixelSize_[2];
+        coord_type ss = s*this->pixelSize_[3];
+
+        cx =    (this->axis_[0][0] * sx 
+              + this->axis_[1][0] * sy) 
+              + (this->axis_[2][0] * sz 
+              + this->axis_[3][0] * ss) 
+              + this->origin_[0];
+
+        cy =    (this->axis_[0][1] * sx 
+              + this->axis_[1][1] * sy) 
+              + (this->axis_[2][1] * sz 
+              + this->axis_[3][1] * ss) 
+              + this->origin_[1];
+
+        cz =    (this->axis_[0][2] * sx 
+              + this->axis_[1][2] * sy) 
+              + (this->axis_[2][2] * sz 
+              + this->axis_[3][2] * ss) 
+              + this->origin_[2];
+
+        cs =    (this->axis_[0][3] * sx 
+              + this->axis_[1][3] * sy) 
+              + (this->axis_[2][3] * sz 
+              + this->axis_[3][3] * ss) 
+              + this->origin_[3];
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world(size_t x, size_t y, size_t z, size_t s, size_t p,
+                            coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs, coord_type& cp) const
+    {
+        this->image_to_world(static_cast<coord_type>(x), static_cast<coord_type>(y), static_cast<coord_type>(z), static_cast<coord_type>(s), static_cast<coord_type>(p), cx, cy, cz, cs, cp);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r,
+                        coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs, coord_type& cp, coord_type& cr) const
+    {
+        this->image_to_world(static_cast<coord_type>(x), static_cast<coord_type>(y), static_cast<coord_type>(z), static_cast<coord_type>(s), static_cast<coord_type>(p), static_cast<coord_type>(r), cx, cy, cz, cs, cp, cr);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a,
+                        coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs, coord_type& cp, coord_type& cr, coord_type& ca) const
+    {
+        this->image_to_world(static_cast<coord_type>(x), static_cast<coord_type>(y), static_cast<coord_type>(z), static_cast<coord_type>(s), static_cast<coord_type>(p), static_cast<coord_type>(r), static_cast<coord_type>(a), cx, cy, cz, cs, cp, cr, ca);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q,
+                        coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs, coord_type& cp, coord_type& cr, coord_type& ca, coord_type& cq) const
+    {
+        this->image_to_world(static_cast<coord_type>(x), static_cast<coord_type>(y), static_cast<coord_type>(z), static_cast<coord_type>(s), static_cast<coord_type>(p), static_cast<coord_type>(r), static_cast<coord_type>(a), static_cast<coord_type>(q), cx, cy, cz, cs, cp, cr, ca, cq);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u,
+                        coord_type& cx, coord_type& cy, coord_type& cz, coord_type& cs, coord_type& cp, coord_type& cr, coord_type& ca, coord_type& cq, coord_type& cu) const
+    {
+        this->image_to_world(static_cast<coord_type>(x), static_cast<coord_type>(y), static_cast<coord_type>(z), static_cast<coord_type>(s), static_cast<coord_type>(p), static_cast<coord_type>(r), static_cast<coord_type>(a), static_cast<coord_type>(q), static_cast<coord_type>(u), cx, cy, cz, cs, cp, cr, ca, cq, cu);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::image_to_world_matrix(hoMatrix<coord_type>& image2world) const
+    {
+        // image to world matrix = translation * rotation * pixelSize_Scaling
+        image2world.createMatrix(D+1, D+1);
+
+        // rotation matrix
+        hoMatrix<coord_type> rotation(D+1, D+1);
+        rotation.setIdentity();
+
+        unsigned int ii, jj;
+        for ( jj=0; jj<D; jj++ )
+        {
+            for ( ii=0; ii<D; ii++ )
+            {
+                rotation(ii, jj) = this->axis_[jj][ii];
+            }
+        }
+
+        // pixel scaling matrix
+        hoMatrix<coord_type> scaling(D+1, D+1);
+        scaling.setIdentity();
+        for ( ii=0; ii<D; ii++ )
+        {
+            scaling(ii, ii) = this->pixelSize_[ii];
+        }
+
+        // translation matrix
+        hoMatrix<coord_type> translation(D+1, D+1);
+        translation.setIdentity();
+        for ( ii=0; ii<D; ii++ )
+        {
+            translation(ii, D) = this->origin_[ii];
+        }
+        Gadgetron::GeneralMatrixProduct(image2world, rotation, false, scaling, false);
+        Gadgetron::GeneralMatrixProduct(rotation, translation, false, image2world, false);
+        image2world = rotation;
+    }
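+
+    // Editorial sketch (not part of the upstream source): in homogeneous coordinates the matrix
+    // assembled above is translation * rotation * scaling. For D==2 with identity axes this gives
+    //
+    //     [ px  0  ox ]   [ 1 0 ox ]   [ 1 0 0 ]   [ px 0  0 ]
+    //     [ 0  py  oy ] = [ 0 1 oy ] * [ 0 1 0 ] * [ 0  py 0 ]
+    //     [ 0   0   1 ]   [ 0 0  1 ]   [ 0 0 1 ]   [ 0  0  1 ]
+    //
+    // so image2world * [x y 1]^T = [x*px + ox, y*py + oy, 1]^T, consistent with image_to_world(x, y, ...).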
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::set_image_to_world_matrix(const hoMatrix<coord_type>& image2world)
+    {
+        GADGET_DEBUG_CHECK_THROW(D+1==image2world.rows());
+        GADGET_DEBUG_CHECK_THROW(D+1==image2world.cols());
+
+        // origin
+        hoMatrix<coord_type> pt(D+1, 1);
+        pt(D, 0) = 1.0;
+
+        hoMatrix<coord_type> res(D+1, 1);
+
+        Gadgetron::GeneralMatrixProduct(res, image2world, false, pt, false);
+        memcpy(this->origin_, res.begin(), sizeof(coord_type)*D);
+
+        // rotation
+        unsigned int ii, jj;
+        for ( ii=0; ii<D; ii++ )
+        {
+            memset(pt.get_data_ptr(), 0, sizeof(coord_type)*(D+1));
+            pt(D, 0) = 1.0;
+            pt(ii, 0) = 1.0;
+
+            Gadgetron::GeneralMatrixProduct(res, image2world, false, pt, false);
+            for ( jj=0; jj<D; jj++ )
+            {
+                this->axis_[ii][jj] = res(jj, 0) - this->origin_[jj];
+            }
+
+            this->pixelSize_[ii] = this->axis_[ii].abs();
+            this->axis_[ii].normalize();
+        }
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::world_to_image(const coord_type* coord, coord_type* ind) const
+    {
+        unsigned int ii, jj;
+        for(ii=0; ii<D; ii++)
+        {
+            ind[ii] = 0;
+            for(jj=0; jj<D; jj++)
+            {
+                ind[ii] += this->axis_[ii][jj] * ( coord[jj] - this->origin_[jj] );
+            }
+
+            ind[ii] *= this->pixelSize_reciprocal_[ii];
+        }
+    }
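+
+    // Editorial note (not part of the upstream source): this generic overload is the inverse of
+    // image_to_world above, ind[i] = (1/pixelSize_[i]) * sum_j axis_[i][j] * (coord[j] - origin_[j]);
+    // the two mappings cancel exactly when the axes form an orthonormal basis.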
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::world_to_image(const std::vector<coord_type>& coord, std::vector<coord_type>& ind) const
+    {
+        GADGET_DEBUG_CHECK_THROW(coord.size()>=D);
+
+        if ( ind.size() < D ) ind.resize(D);
+
+        this->world_to_image(&coord[0], &ind[0]);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::world_to_image(coord_type cx, coord_type& x) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==1);
+        x = this->pixelSize_reciprocal_[0] * this->axis_[0][0] * ( cx - this->origin_[0] );
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::world_to_image(coord_type cx, coord_type cy, coord_type& x, coord_type& y) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==2);
+
+        coord_type sx = cx - this->origin_[0];
+        coord_type sy = cy - this->origin_[1];
+
+        x = this->pixelSize_reciprocal_[0] * (this->axis_[0][0]*sx + this->axis_[0][1]*sy);
+        y = this->pixelSize_reciprocal_[1] * (this->axis_[1][0]*sx + this->axis_[1][1]*sy);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::world_to_image(coord_type cx, coord_type cy, coord_type cz, coord_type& x, coord_type& y, coord_type& z) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==3);
+
+        coord_type sx = cx - this->origin_[0];
+        coord_type sy = cy - this->origin_[1];
+        coord_type sz = cz - this->origin_[2];
+
+        x = this->pixelSize_reciprocal_[0] * (this->axis_[0][0]*sx + this->axis_[0][1]*sy + this->axis_[0][2]*sz);
+        y = this->pixelSize_reciprocal_[1] * (this->axis_[1][0]*sx + this->axis_[1][1]*sy + this->axis_[1][2]*sz);
+        z = this->pixelSize_reciprocal_[2] * (this->axis_[2][0]*sx + this->axis_[2][1]*sy + this->axis_[2][2]*sz);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::world_to_image(coord_type cx, coord_type cy, coord_type cz, coord_type cs, coord_type& x, coord_type& y, coord_type& z, coord_type& s) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==4);
+
+        coord_type sx = cx - this->origin_[0];
+        coord_type sy = cy - this->origin_[1];
+        coord_type sz = cz - this->origin_[2];
+        coord_type ss = cs - this->origin_[3];
+
+        x = this->pixelSize_reciprocal_[0] * ((this->axis_[0][0]*sx + this->axis_[0][1]*sy) + (this->axis_[0][2]*sz + this->axis_[0][3]*ss));
+        y = this->pixelSize_reciprocal_[1] * ((this->axis_[1][0]*sx + this->axis_[1][1]*sy) + (this->axis_[1][2]*sz + this->axis_[1][3]*ss));
+        z = this->pixelSize_reciprocal_[2] * ((this->axis_[2][0]*sx + this->axis_[2][1]*sy) + (this->axis_[2][2]*sz + this->axis_[2][3]*ss));
+        s = this->pixelSize_reciprocal_[3] * ((this->axis_[3][0]*sx + this->axis_[3][1]*sy) + (this->axis_[3][2]*sz + this->axis_[3][3]*ss));
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::world_to_image(coord_type cx, coord_type cy, coord_type cz, coord_type cs, coord_type cp, coord_type& x, coord_type& y, coord_type& z, coord_type& s, coord_type& p) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==5);
+
+        coord_type sx = cx - this->origin_[0];
+        coord_type sy = cy - this->origin_[1];
+        coord_type sz = cz - this->origin_[2];
+        coord_type ss = cs - this->origin_[3];
+        coord_type sp = cp - this->origin_[4];
+
+        x = this->pixelSize_reciprocal_[0] * ((this->axis_[0][0]*sx + this->axis_[0][1]*sy) + (this->axis_[0][2]*sz + this->axis_[0][3]*ss) + this->axis_[0][4]*sp);
+        y = this->pixelSize_reciprocal_[1] * ((this->axis_[1][0]*sx + this->axis_[1][1]*sy) + (this->axis_[1][2]*sz + this->axis_[1][3]*ss) + this->axis_[1][4]*sp);
+        z = this->pixelSize_reciprocal_[2] * ((this->axis_[2][0]*sx + this->axis_[2][1]*sy) + (this->axis_[2][2]*sz + this->axis_[2][3]*ss) + this->axis_[2][4]*sp);
+        s = this->pixelSize_reciprocal_[3] * ((this->axis_[3][0]*sx + this->axis_[3][1]*sy) + (this->axis_[3][2]*sz + this->axis_[3][3]*ss) + this->axis_[3][4]*sp);
+        p = this->pixelSize_reciprocal_[4] * ((this->axis_[4][0]*sx + this->axis_[4][1]*sy) + (this->axis_[4][2]*sz + this->axis_[4][3]*ss) + this->axis_[4][4]*sp);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::world_to_image(coord_type cx, coord_type cy, coord_type cz, coord_type cs, coord_type cp, coord_type cr, coord_type& x, coord_type& y, coord_type& z, coord_type& s, coord_type& p, coord_type& r) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==6);
+
+        coord_type sx = cx - this->origin_[0];
+        coord_type sy = cy - this->origin_[1];
+        coord_type sz = cz - this->origin_[2];
+        coord_type ss = cs - this->origin_[3];
+        coord_type sp = cp - this->origin_[4];
+        coord_type sr = cr - this->origin_[5];
+
+        x = this->pixelSize_reciprocal_[0] * ((this->axis_[0][0]*sx + this->axis_[0][1]*sy) + (this->axis_[0][2]*sz + this->axis_[0][3]*ss) + (this->axis_[0][4]*sp + this->axis_[0][5]*sr));
+        y = this->pixelSize_reciprocal_[1] * ((this->axis_[1][0]*sx + this->axis_[1][1]*sy) + (this->axis_[1][2]*sz + this->axis_[1][3]*ss) + (this->axis_[1][4]*sp + this->axis_[1][5]*sr));
+        z = this->pixelSize_reciprocal_[2] * ((this->axis_[2][0]*sx + this->axis_[2][1]*sy) + (this->axis_[2][2]*sz + this->axis_[2][3]*ss) + (this->axis_[2][4]*sp + this->axis_[2][5]*sr));
+        s = this->pixelSize_reciprocal_[3] * ((this->axis_[3][0]*sx + this->axis_[3][1]*sy) + (this->axis_[3][2]*sz + this->axis_[3][3]*ss) + (this->axis_[3][4]*sp + this->axis_[3][5]*sr));
+        p = this->pixelSize_reciprocal_[4] * ((this->axis_[4][0]*sx + this->axis_[4][1]*sy) + (this->axis_[4][2]*sz + this->axis_[4][3]*ss) + (this->axis_[4][4]*sp + this->axis_[4][5]*sr));
+        r = this->pixelSize_reciprocal_[5] * ((this->axis_[5][0]*sx + this->axis_[5][1]*sy) + (this->axis_[5][2]*sz + this->axis_[5][3]*ss) + (this->axis_[5][4]*sp + this->axis_[5][5]*sr));
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::world_to_image(coord_type cx, coord_type cy, coord_type cz, coord_type cs, coord_type cp, coord_type cr, coord_type ca, coord_type& x, coord_type& y, coord_type& z, coord_type& s, coord_type& p, coord_type& r, coord_type& a) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==7);
+
+        coord_type sx = cx - this->origin_[0];
+        coord_type sy = cy - this->origin_[1];
+        coord_type sz = cz - this->origin_[2];
+        coord_type ss = cs - this->origin_[3];
+        coord_type sp = cp - this->origin_[4];
+        coord_type sr = cr - this->origin_[5];
+        coord_type sa = ca - this->origin_[6];
+
+        x = this->pixelSize_reciprocal_[0] * ((this->axis_[0][0]*sx + this->axis_[0][1]*sy) + (this->axis_[0][2]*sz + this->axis_[0][3]*ss) + (this->axis_[0][4]*sp + this->axis_[0][5]*sr) + this->axis_[0][6]*sa);
+        y = this->pixelSize_reciprocal_[1] * ((this->axis_[1][0]*sx + this->axis_[1][1]*sy) + (this->axis_[1][2]*sz + this->axis_[1][3]*ss) + (this->axis_[1][4]*sp + this->axis_[1][5]*sr) + this->axis_[1][6]*sa);
+        z = this->pixelSize_reciprocal_[2] * ((this->axis_[2][0]*sx + this->axis_[2][1]*sy) + (this->axis_[2][2]*sz + this->axis_[2][3]*ss) + (this->axis_[2][4]*sp + this->axis_[2][5]*sr) + this->axis_[2][6]*sa);
+        s = this->pixelSize_reciprocal_[3] * ((this->axis_[3][0]*sx + this->axis_[3][1]*sy) + (this->axis_[3][2]*sz + this->axis_[3][3]*ss) + (this->axis_[3][4]*sp + this->axis_[3][5]*sr) + this->axis_[3][6]*sa);
+        p = this->pixelSize_reciprocal_[4] * ((this->axis_[4][0]*sx + this->axis_[4][1]*sy) + (this->axis_[4][2]*sz + this->axis_[4][3]*ss) + (this->axis_[4][4]*sp + this->axis_[4][5]*sr) + this->axis_[4][6]*sa);
+        r = this->pixelSize_reciprocal_[5] * ((this->axis_[5][0]*sx + this->axis_[5][1]*sy) + (this->axis_[5][2]*sz + this->axis_[5][3]*ss) + (this->axis_[5][4]*sp + this->axis_[5][5]*sr) + this->axis_[5][6]*sa);
+        a = this->pixelSize_reciprocal_[6] * ((this->axis_[6][0]*sx + this->axis_[6][1]*sy) + (this->axis_[6][2]*sz + this->axis_[6][3]*ss) + (this->axis_[6][4]*sp + this->axis_[6][5]*sr) + this->axis_[6][6]*sa);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::world_to_image(coord_type cx, coord_type cy, coord_type cz, coord_type cs, coord_type cp, coord_type cr, coord_type ca, coord_type cq, coord_type& x, coord_type& y, coord_type& z, coord_type& s, coord_type& p, coord_type& r, coord_type& a, coord_type& q) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==8);
+
+        coord_type sx = cx - this->origin_[0];
+        coord_type sy = cy - this->origin_[1];
+        coord_type sz = cz - this->origin_[2];
+        coord_type ss = cs - this->origin_[3];
+        coord_type sp = cp - this->origin_[4];
+        coord_type sr = cr - this->origin_[5];
+        coord_type sa = ca - this->origin_[6];
+        coord_type sq = cq - this->origin_[7];
+
+        x = this->pixelSize_reciprocal_[0] * ((this->axis_[0][0]*sx + this->axis_[0][1]*sy) + (this->axis_[0][2]*sz + this->axis_[0][3]*ss) + (this->axis_[0][4]*sp + this->axis_[0][5]*sr) + (this->axis_[0][6]*sa + this->axis_[0][7]*sq));
+        y = this->pixelSize_reciprocal_[1] * ((this->axis_[1][0]*sx + this->axis_[1][1]*sy) + (this->axis_[1][2]*sz + this->axis_[1][3]*ss) + (this->axis_[1][4]*sp + this->axis_[1][5]*sr) + (this->axis_[1][6]*sa + this->axis_[1][7]*sq));
+        z = this->pixelSize_reciprocal_[2] * ((this->axis_[2][0]*sx + this->axis_[2][1]*sy) + (this->axis_[2][2]*sz + this->axis_[2][3]*ss) + (this->axis_[2][4]*sp + this->axis_[2][5]*sr) + (this->axis_[2][6]*sa + this->axis_[2][7]*sq));
+        s = this->pixelSize_reciprocal_[3] * ((this->axis_[3][0]*sx + this->axis_[3][1]*sy) + (this->axis_[3][2]*sz + this->axis_[3][3]*ss) + (this->axis_[3][4]*sp + this->axis_[3][5]*sr) + (this->axis_[3][6]*sa + this->axis_[3][7]*sq));
+        p = this->pixelSize_reciprocal_[4] * ((this->axis_[4][0]*sx + this->axis_[4][1]*sy) + (this->axis_[4][2]*sz + this->axis_[4][3]*ss) + (this->axis_[4][4]*sp + this->axis_[4][5]*sr) + (this->axis_[4][6]*sa + this->axis_[4][7]*sq));
+        r = this->pixelSize_reciprocal_[5] * ((this->axis_[5][0]*sx + this->axis_[5][1]*sy) + (this->axis_[5][2]*sz + this->axis_[5][3]*ss) + (this->axis_[5][4]*sp + this->axis_[5][5]*sr) + (this->axis_[5][6]*sa + this->axis_[5][7]*sq));
+        a = this->pixelSize_reciprocal_[6] * ((this->axis_[6][0]*sx + this->axis_[6][1]*sy) + (this->axis_[6][2]*sz + this->axis_[6][3]*ss) + (this->axis_[6][4]*sp + this->axis_[6][5]*sr) + (this->axis_[6][6]*sa + this->axis_[6][7]*sq));
+        q = this->pixelSize_reciprocal_[7] * ((this->axis_[7][0]*sx + this->axis_[7][1]*sy) + (this->axis_[7][2]*sz + this->axis_[7][3]*ss) + (this->axis_[7][4]*sp + this->axis_[7][5]*sr) + (this->axis_[7][6]*sa + this->axis_[7][7]*sq));
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::world_to_image(coord_type cx, coord_type cy, coord_type cz, coord_type cs, coord_type cp, coord_type cr, coord_type ca, coord_type cq, coord_type cu, coord_type& x, coord_type& y, coord_type& z, coord_type& s, coord_type& p, coord_type& r, coord_type& a, coord_type& q, coord_type& u) const
+    {
+        GADGET_DEBUG_CHECK_THROW(D==9);
+
+        coord_type sx = cx - this->origin_[0];
+        coord_type sy = cy - this->origin_[1];
+        coord_type sz = cz - this->origin_[2];
+        coord_type ss = cs - this->origin_[3];
+        coord_type sp = cp - this->origin_[4];
+        coord_type sr = cr - this->origin_[5];
+        coord_type sa = ca - this->origin_[6];
+        coord_type sq = cq - this->origin_[7];
+        coord_type su = cu - this->origin_[8];
+
+        x = this->pixelSize_reciprocal_[0] * ((this->axis_[0][0]*sx + this->axis_[0][1]*sy) + (this->axis_[0][2]*sz + this->axis_[0][3]*ss) + (this->axis_[0][4]*sp + this->axis_[0][5]*sr) + (this->axis_[0][6]*sa + this->axis_[0][7]*sq) + this->axis_[0][8]*su);
+        y = this->pixelSize_reciprocal_[1] * ((this->axis_[1][0]*sx + this->axis_[1][1]*sy) + (this->axis_[1][2]*sz + this->axis_[1][3]*ss) + (this->axis_[1][4]*sp + this->axis_[1][5]*sr) + (this->axis_[1][6]*sa + this->axis_[1][7]*sq) + this->axis_[1][8]*su);
+        z = this->pixelSize_reciprocal_[2] * ((this->axis_[2][0]*sx + this->axis_[2][1]*sy) + (this->axis_[2][2]*sz + this->axis_[2][3]*ss) + (this->axis_[2][4]*sp + this->axis_[2][5]*sr) + (this->axis_[2][6]*sa + this->axis_[2][7]*sq) + this->axis_[2][8]*su);
+        s = this->pixelSize_reciprocal_[3] * ((this->axis_[3][0]*sx + this->axis_[3][1]*sy) + (this->axis_[3][2]*sz + this->axis_[3][3]*ss) + (this->axis_[3][4]*sp + this->axis_[3][5]*sr) + (this->axis_[3][6]*sa + this->axis_[3][7]*sq) + this->axis_[3][8]*su);
+        p = this->pixelSize_reciprocal_[4] * ((this->axis_[4][0]*sx + this->axis_[4][1]*sy) + (this->axis_[4][2]*sz + this->axis_[4][3]*ss) + (this->axis_[4][4]*sp + this->axis_[4][5]*sr) + (this->axis_[4][6]*sa + this->axis_[4][7]*sq) + this->axis_[4][8]*su);
+        r = this->pixelSize_reciprocal_[5] * ((this->axis_[5][0]*sx + this->axis_[5][1]*sy) + (this->axis_[5][2]*sz + this->axis_[5][3]*ss) + (this->axis_[5][4]*sp + this->axis_[5][5]*sr) + (this->axis_[5][6]*sa + this->axis_[5][7]*sq) + this->axis_[5][8]*su);
+        a = this->pixelSize_reciprocal_[6] * ((this->axis_[6][0]*sx + this->axis_[6][1]*sy) + (this->axis_[6][2]*sz + this->axis_[6][3]*ss) + (this->axis_[6][4]*sp + this->axis_[6][5]*sr) + (this->axis_[6][6]*sa + this->axis_[6][7]*sq) + this->axis_[6][8]*su);
+        q = this->pixelSize_reciprocal_[7] * ((this->axis_[7][0]*sx + this->axis_[7][1]*sy) + (this->axis_[7][2]*sz + this->axis_[7][3]*ss) + (this->axis_[7][4]*sp + this->axis_[7][5]*sr) + (this->axis_[7][6]*sa + this->axis_[7][7]*sq) + this->axis_[7][8]*su);
+        u = this->pixelSize_reciprocal_[8] * ((this->axis_[8][0]*sx + this->axis_[8][1]*sy) + (this->axis_[8][2]*sz + this->axis_[8][3]*ss) + (this->axis_[8][4]*sp + this->axis_[8][5]*sr) + (this->axis_[8][6]*sa + this->axis_[8][7]*sq) + this->axis_[8][8]*su);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::world_to_image_matrix(hoMatrix<coord_type>& world2image) const
+    {
+        // world to image matrix = inv(pixelSize_Scaling) * inv(rotation) * inv(translation)
+        world2image.createMatrix(D+1, D+1);
+
+        // rotation matrix
+        hoMatrix<coord_type> rotation(D+1, D+1);
+        rotation.setIdentity();
+
+        unsigned int ii, jj;
+        for ( jj=0; jj<D; jj++ )
+        {
+            for ( ii=0; ii<D; ii++ )
+            {
+                rotation(jj, ii) = this->axis_[jj][ii];
+            }
+        }
+
+        // pixel scaling matrix
+        hoMatrix<coord_type> scaling(D+1, D+1);
+        scaling.setIdentity();
+        for ( ii=0; ii<D; ii++ )
+        {
+            scaling(ii, ii) = this->pixelSize_reciprocal_[ii];
+        }
+
+        // translation matrix
+        hoMatrix<coord_type> translation(D+1, D+1);
+        translation.setIdentity();
+        for ( ii=0; ii<D; ii++ )
+        {
+            translation(ii, D) = -this->origin_[ii];
+        }
+
+        Gadgetron::GeneralMatrixProduct(world2image, rotation, false, translation, false);
+        Gadgetron::GeneralMatrixProduct(rotation, scaling, false, world2image, false);
+
+        world2image = rotation;
+    }
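+
+    // Editorial note (not part of the upstream source): with orthonormal axes the transposed
+    // rotation block used here equals inv(rotation), so world2image = inv(scaling) * rotation^T * inv(translation),
+    // and world2image * image2world is the identity up to floating-point rounding.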
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::set_world_to_image_matrix(const hoMatrix<coord_type>& world2image)
+    {
+        GADGET_THROW("This function is not implemented ... ");
+    }
+
+    template <typename T, unsigned int D> 
+    inline bool hoNDImage<T, D>::in_image_region(const std::vector<size_t>& start, std::vector<size_t>& size)
+    {
+        GADGET_DEBUG_CHECK_THROW(start.size()>=D);
+        GADGET_DEBUG_CHECK_THROW(size.size()>=D);
+
+        if ( !this->point_in_range(start) ) return false;
+
+        std::vector<size_t> end(D);
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            end[ii] = start[ii]+size[ii]-1;
+        }
+
+        if ( !this->point_in_range(end) ) return false;
+
+        return true;
+    }
+
+    template <typename T, unsigned int D> 
+    void hoNDImage<T, D>::get_sub_image(const std::vector<size_t>& start, std::vector<size_t>& size, Self& out)
+    {
+        GADGET_DEBUG_CHECK_THROW(start.size()>=D);
+        GADGET_DEBUG_CHECK_THROW(size.size()>=D);
+
+        if ( !this->in_image_region(start, size) )
+        {
+            GWARN_STREAM("Sub-image region is not in the image ... ");
+            return;
+        }
+
+        out.create(size);
+
+        memcpy(out.pixelSize_, this->pixelSize_, sizeof(coord_type)*D);
+        memcpy(out.pixelSize_reciprocal_, this->pixelSize_reciprocal_, sizeof(coord_type)*D);
+
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            out.axis_[ii] = this->axis_[ii];
+        }
+
+        size_t N = out.get_number_of_elements() / size[0];
+
+        long long t;
+
+        #pragma omp parallel default(none) private(t) shared(N, size, out, start)
+        {
+            std::vector<size_t> indOut(D), ind(D);
+
+            #pragma omp for
+            for ( t=0; t<(long long)N; t++ )
+            {
+                out.calculate_index(t*size[0], indOut);
+
+                unsigned int ii;
+                for ( ii=0; ii<D; ii++ )
+                {
+                    ind[ii] = indOut[ii]+start[ii];
+                }
+
+                size_t offset = this->calculate_offset(ind);
+
+                memcpy(out.begin()+t*size[0], this->data_+offset, sizeof(T)*size[0]);
+            }
+        }
+
+        std::vector<coord_type> origin_out(D);
+        this->image_to_world(start, origin_out);
+
+        memcpy(out.origin_, &origin_out[0], sizeof(coord_type)*D);
+    }
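+
+    // Editorial sketch (not part of the upstream source): extracting a 32x32 patch starting at
+    // index (16, 16) from a 2D image with the function above (illustrative sizes only):
+    //
+    //     std::vector<size_t> start(2, 16), size(2, 32);
+    //     hoNDImage<float, 2> patch;
+    //     im.get_sub_image(start, size, patch);   // patch copies pixel size and axes; its origin is remapped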
+
+    template <typename T, unsigned int D> 
+    bool hoNDImage<T, D>::serializeImage(char*& buf, size_t& len) const 
+    {
+        try
+        {
+            if ( buf != NULL ) delete[] buf;
+
+            // number of dimensions + dimension vector + pixel size + origin + axis + contents
+            len = sizeof(unsigned int) + sizeof(size_t)*D 
+                + sizeof(coord_type)*D + sizeof(coord_type)*D + sizeof(coord_type)*D*D 
+                + sizeof(T)*this->elements_;
+
+            buf = new char[len];
+            GADGET_CHECK_RETURN_FALSE(buf!=NULL);
+
+            unsigned int NDim=D;
+
+            size_t offset = 0;
+            memcpy(buf, &NDim, sizeof(unsigned int));
+            offset += sizeof(unsigned int);
+
+            if ( NDim > 0 )
+            {
+                memcpy(buf+offset, &((*dimensions_)[0]), sizeof(size_t)*D);
+                offset += sizeof(size_t)*D;
+
+                memcpy(buf+offset, this->pixelSize_, sizeof(coord_type)*D);
+                offset += sizeof(coord_type)*D;
+
+                memcpy(buf+offset, this->origin_, sizeof(coord_type)*D);
+                offset += sizeof(coord_type)*D;
+
+                unsigned int ii;
+                for ( ii=0; ii<D; ii++ )
+                {
+                    memcpy(buf+offset, this->axis_[ii].begin(), sizeof(coord_type)*D);
+                    offset += sizeof(coord_type)*D;
+                }
+
+                memcpy(buf+offset, this->data_, sizeof(T)*elements_);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImage<T, D>::serializeImage(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
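+
+    // Editorial note (not part of the upstream source): the buffer written above is laid out as
+    //
+    //     [ unsigned int D | size_t dims[D] | coord_type pixelSize[D] | coord_type origin[D]
+    //       | coord_type axis[D][D] | T data[elements_] ]
+    //
+    // and deserializeImage below reads the fields back in exactly this order.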
+
+    template <typename T, unsigned int D> 
+    bool hoNDImage<T, D>::deserializeImage(char* buf, size_t& len)
+    {
+        try
+        {
+            unsigned int NDim;
+            memcpy(&NDim, buf, sizeof(unsigned int));
+            if ( NDim != D )
+            {
+                GERROR_STREAM("hoNDImage<T, D>::deserialize(...) : number of image dimensions does not match ... ");
+                return false;
+            }
+
+            size_t offset = sizeof(unsigned int);
+
+            unsigned int ii;
+
+            if ( NDim > 0 )
+            {
+                std::vector<size_t> dimensions(NDim);
+
+                memcpy(&dimensions[0], buf+offset, sizeof(size_t)*D);
+                offset += sizeof(size_t)*D;
+
+                this->create(dimensions);
+
+                memcpy(this->pixelSize_, buf+offset, sizeof(coord_type)*D);
+                offset += sizeof(coord_type)*D;
+
+                for ( ii=0; ii<D; ii++ )
+                {
+                    this->pixelSize_reciprocal_[ii] = coord_type(1.0)/this->pixelSize_[ii];
+                }
+
+                memcpy(this->origin_, buf+offset, sizeof(coord_type)*D);
+                offset += sizeof(coord_type)*D;
+
+                for ( ii=0; ii<D; ii++ )
+                {
+                    memcpy(this->axis_[ii].begin(), buf+offset, sizeof(coord_type)*D);
+                    offset += sizeof(coord_type)*D;
+                }
+
+                // copy the content
+                memcpy(this->data_, buf+offset, sizeof(T)*elements_);
+                offset += sizeof(T)*elements_;
+            }
+            else
+            {
+                this->clear();
+            }
+
+            len = offset;
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImage<T, D>::deserializeImage(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T, unsigned int D> 
+    bool hoNDImage<T, D>::serialize(char*& buf, size_t& len) const 
+    {
+        char* bufImage = NULL;
+        char* bufAttrib = NULL;
+
+        try
+        {
+            size_t lenImage(0);
+            GADGET_CHECK_THROW(this->serializeImage(bufImage, lenImage));
+
+            unsigned long long lenAttrib(0);
+
+            std::stringstream str;
+            ISMRMRD::serialize( const_cast<ISMRMRD::MetaContainer&>(attrib_), str);
+            std::string attribContent = str.str();
+            lenAttrib = attribContent.length()+1;
+
+            bufAttrib = new char[lenAttrib];
+            GADGET_CHECK_THROW(bufAttrib != NULL);
+
+            memset(bufAttrib, '\0', sizeof(char)*lenAttrib);
+            memcpy(bufAttrib, attribContent.c_str(), lenAttrib-1);
+
+            len = sizeof(unsigned long long) + lenImage + sizeof(unsigned long long) + lenAttrib;
+
+            if ( buf != NULL )
+            {
+                delete [] buf;
+                buf = NULL;
+            }
+
+            buf = new char[len];
+            GADGET_CHECK_THROW(buf != NULL);
+
+            size_t offset = 0;
+            memcpy(buf, &lenImage, sizeof(size_t));
+            offset += sizeof(size_t);
+
+            memcpy(buf+offset, bufImage, lenImage);
+            offset += lenImage;
+
+            memcpy(buf+offset, &lenAttrib, sizeof(size_t));
+            offset += sizeof(size_t);
+
+            memcpy(buf+offset, bufAttrib, lenAttrib);
+            offset += lenAttrib;
+
+            if ( bufImage != NULL ) delete [] bufImage;
+            if ( bufAttrib != NULL ) delete [] bufAttrib;
+        }
+        catch(...)
+        {
+            if ( bufImage != NULL ) delete [] bufImage;
+            if ( bufAttrib != NULL ) delete [] bufAttrib;
+
+            GERROR_STREAM("Errors happened in hoNDImage<T, D>::serialize(char*& buf, size_t& len) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T, unsigned int D> 
+    bool hoNDImage<T, D>::deserialize(char* buf, size_t& len)
+    {
+        try
+        {
+            size_t lenImage(0);
+            unsigned long long lenAttrib(0);
+
+            size_t offset = 0;
+            memcpy(&lenImage, buf, sizeof(size_t));
+            offset += sizeof(size_t);
+
+            GADGET_CHECK_RETURN_FALSE(this->deserializeImage(buf+offset, lenImage));
+            offset += lenImage;
+
+            memcpy(&lenAttrib, buf+offset, sizeof(size_t));
+            offset += sizeof(size_t);
+
+            ISMRMRD::deserialize(buf+offset, attrib_);
+            offset += lenAttrib;
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImage<T, D>::deserialize(char* buf, size_t& len) ... ");
+            return false;
+        }
+
+        return true;
+    }
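+
+    // Editorial sketch (not part of the upstream source): a hypothetical round trip through
+    // serialize/deserialize above; serialize allocates buf with new[], so the caller frees it:
+    //
+    //     hoNDImage<float, 3> src(64, 64, 20), dst;
+    //     char* buf = NULL; size_t len = 0;
+    //     if ( src.serialize(buf, len) && dst.deserialize(buf, len) ) { /* dst now matches src */ }
+    //     delete [] buf;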
+
+    template <typename T, unsigned int D> 
+    void hoNDImage<T, D>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "-------------- Gagdgetron ND Image -------------" << endl;
+        this->printContent(os);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImage<T, D>::printContent(std::ostream& os) const
+    {
+        using namespace std;
+
+        os.unsetf(std::ios::scientific);
+        os.setf(ios::fixed);
+
+        size_t i, j;
+
+        os << "Image dimension is : " << D << endl;
+
+        os << "Image size is : ";
+        for (i=0; i<D; i++ ) 
+            os << (*dimensions_)[i] << " "; 
+        os << endl;
+
+        int elemTypeSize = sizeof(T);
+        std::string elemTypeName = std::string(typeid(T).name());
+
+        os << "Image data type is : " << elemTypeName << std::endl;
+        os << "Byte number for each element is : " << elemTypeSize << std::endl;
+        os << "Number of array size in bytes is : ";
+        os << elements_*elemTypeSize << std::endl;
+
+        os << "Pixel size is : ";
+        for (i=0; i<D; i++ ) 
+            os << this->pixelSize_[i] << " "; 
+        os << endl;
+
+        os << "Origin is : ";
+        for (i=0; i<D; i++ ) 
+            os << this->origin_[i] << " "; 
+        os << endl;
+
+        for (i=0; i<D; i++ )
+        {
+            os << "Axis " << i << " : [ ";
+            for (j=0; j<D; j++ )
+            {
+                os << this->axis_[i][j] << " "; 
+            }
+            os << "] " << endl;
+        }
+        os << endl << ends;
+
+        ISMRMRD::serialize( const_cast<ISMRMRD::MetaContainer&>(this->attrib_), os);
+    }
+}
diff --git a/toolboxes/core/cpu/image/hoNDImageAttrib.h b/toolboxes/core/cpu/image/hoNDImageAttrib.h
new file mode 100644
index 0000000..805b595
--- /dev/null
+++ b/toolboxes/core/cpu/image/hoNDImageAttrib.h
@@ -0,0 +1,329 @@
+/** \file       hoNDImageAttrib.h
+    \brief      N-dimensional image class for gadgetron with meta attributes
+
+                The serialize and deserialize functions include the meta attribute structure as well.
+                The image data are serialized first, followed by the XML meta attribute representation.
+
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "hoNDImage.h"
+#include "hoNDMetaAttributes.h"
+
+namespace Gadgetron
+{
+    template <typename T, unsigned int D>
+    class hoNDImageAttrib : public hoNDImage<T, D>
+    {
+    public:
+
+        typedef hoNDImage<T, D> BaseClass;
+        typedef hoNDImageAttrib<T, D> Self;
+
+        typedef T element_type;
+        typedef T value_type;
+        typedef float coord_type;
+
+        typedef typename BaseClass::a_axis_type a_axis_type;
+        typedef typename BaseClass::axis_type axis_type;
+
+        /// constructors
+        hoNDImageAttrib ();
+        hoNDImageAttrib (const std::vector<size_t>& dimensions);
+        hoNDImageAttrib (const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize);
+        hoNDImageAttrib (const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin);
+        hoNDImageAttrib (const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin, const axis_type& axis);
+
+        hoNDImageAttrib(size_t len);
+        hoNDImageAttrib(size_t sx, size_t sy);
+        hoNDImageAttrib(size_t sx, size_t sy, size_t sz);
+        hoNDImageAttrib(size_t sx, size_t sy, size_t sz, size_t st);
+        hoNDImageAttrib(size_t sx, size_t sy, size_t sz, size_t st, size_t sp);
+        hoNDImageAttrib(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq);
+        hoNDImageAttrib(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr);
+        hoNDImageAttrib(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss);
+
+        /// attach memory constructors
+        hoNDImageAttrib (const std::vector<size_t>& dimensions, T* data, bool delete_data_on_destruct = false);
+        hoNDImageAttrib (const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, T* data, bool delete_data_on_destruct = false);
+        hoNDImageAttrib (const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin, T* data, bool delete_data_on_destruct = false);
+        hoNDImageAttrib (const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin, const axis_type& axis, T* data, bool delete_data_on_destruct = false);
+
+        hoNDImageAttrib(size_t len, T* data, bool delete_data_on_destruct = false);
+        hoNDImageAttrib(size_t sx, size_t sy, T* data, bool delete_data_on_destruct = false);
+        hoNDImageAttrib(size_t sx, size_t sy, size_t sz, T* data, bool delete_data_on_destruct = false);
+        hoNDImageAttrib(size_t sx, size_t sy, size_t sz, size_t st, T* data, bool delete_data_on_destruct = false);
+        hoNDImageAttrib(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, T* data, bool delete_data_on_destruct = false);
+        hoNDImageAttrib(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, T* data, bool delete_data_on_destruct = false);
+        hoNDImageAttrib(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, T* data, bool delete_data_on_destruct = false);
+        hoNDImageAttrib(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, T* data, bool delete_data_on_destruct = false);
+
+        hoNDImageAttrib(const hoNDArray<T>& a);
+        hoNDImageAttrib(const Self& a);
+
+        virtual ~hoNDImageAttrib();
+
+        /// meta attributes
+        GtImageAttribType attrib_;
+
+        /// serialize/deserialize
+        virtual bool serialize(char*& buf, size_t& len);
+        virtual bool deserialize(char* buf, size_t& len);
+
+        /// print out the image information
+        virtual void print(std::ostream& os) const;
+        virtual void printContent(std::ostream& os) const;
+
+    protected:
+
+        using BaseClass::dimensions_;
+        using BaseClass::offsetFactors_;
+        using BaseClass::pixelSize_;
+        using BaseClass::pixelSize_reciprocal_;
+        using BaseClass::origin_;
+        using BaseClass::axis_;
+        using BaseClass::data_;
+        using BaseClass::elements_;
+        using BaseClass::delete_data_on_destruct_;
+    };
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib () : BaseClass()
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib (const std::vector<size_t>& dimensions) : BaseClass(dimensions)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib (const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize) : BaseClass(dimensions, pixelSize)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib (const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin) : BaseClass(dimensions, pixelSize, origin)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib (const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin, const axis_type& axis) : BaseClass(dimensions, pixelSize, origin, axis)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib(size_t len) : BaseClass(len)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib(size_t sx, size_t sy) : BaseClass(sx, sy)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib(size_t sx, size_t sy, size_t sz) : BaseClass(sx, sy, sz)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib(size_t sx, size_t sy, size_t sz, size_t st) : BaseClass(sx, sy, sz, st)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib(size_t sx, size_t sy, size_t sz, size_t st, size_t sp) : BaseClass(sx, sy, sz, st, sp)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq) : BaseClass(sx, sy, sz, st, sp, sq)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr) : BaseClass(sx, sy, sz, st, sp, sq, sr)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss) : BaseClass(sx, sy, sz, st, sp, sq, sr, ss)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib (const std::vector<size_t>& dimensions, T* data, bool delete_data_on_destruct) : BaseClass(dimensions, data, delete_data_on_destruct)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib (const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, T* data, bool delete_data_on_destruct) : BaseClass(dimensions, pixelSize, data, delete_data_on_destruct)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib (const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin, T* data, bool delete_data_on_destruct) : BaseClass(dimensions, pixelSize, origin, data, delete_data_on_destruct)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib (const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin, const axis_type& axis, T* data, bool delete_data_on_destruct) : BaseClass(dimensions, pixelSize, origin, axis, data, delete_data_on_destruct)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib(size_t len, T* data, bool delete_data_on_destruct) : BaseClass(len, data, delete_data_on_destruct)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib(size_t sx, size_t sy, T* data, bool delete_data_on_destruct) : BaseClass(sx, sy, data, delete_data_on_destruct)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib(size_t sx, size_t sy, size_t sz, T* data, bool delete_data_on_destruct) : BaseClass(sx, sy, sz, data, delete_data_on_destruct)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib(size_t sx, size_t sy, size_t sz, size_t st, T* data, bool delete_data_on_destruct) : BaseClass(sx, sy, sz, st, data, delete_data_on_destruct)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, T* data, bool delete_data_on_destruct) : BaseClass(sx, sy, sz, st, sp, data, delete_data_on_destruct)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, T* data, bool delete_data_on_destruct) : BaseClass(sx, sy, sz, st, sp, sq, data, delete_data_on_destruct)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, T* data, bool delete_data_on_destruct) : BaseClass(sx, sy, sz, st, sp, sq, sr, data, delete_data_on_destruct)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, T* data, bool delete_data_on_destruct) : BaseClass(sx, sy, sz, st, sp, sq, sr, ss, data, delete_data_on_destruct)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib(const hoNDArray<T>& a) : BaseClass(a)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::hoNDImageAttrib(const Self& a) : BaseClass(a)
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    hoNDImageAttrib<T, D>::~hoNDImageAttrib()
+    {
+    }
+
+    template <typename T, unsigned int D> 
+    bool hoNDImageAttrib<T, D>::serialize(char*& buf, size_t& len) 
+    {
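+        // serialized layout for reference: [lenImage][image buffer][lenAttrib][attribute buffer],
+        // with both length fields stored as size_t values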
+        char* bufImage = NULL;
+        char* bufAttrib = NULL;
+
+        try
+        {
+            size_t lenImage(0);
+            GADGET_CHECK_THROW(BaseClass::serialize(bufImage, lenImage));
+
+            size_t lenAttrib(0);
+            GADGET_CHECK_THROW(attrib_.serialize(bufAttrib, lenAttrib));
+
+            len = sizeof(size_t) + lenImage + sizeof(size_t) + lenAttrib;
+
+            if ( buf != NULL )
+            {
+                delete [] buf;
+                buf = NULL;
+            }
+
+            buf = new char[len];
+            GADGET_CHECK_THROW(buf != NULL);
+
+            size_t offset = 0;
+            memcpy(buf, &lenImage, sizeof(size_t));
+            offset += sizeof(size_t);
+
+            memcpy(buf+offset, bufImage, lenImage);
+            offset += lenImage;
+
+            memcpy(buf+offset, &lenAttrib, sizeof(size_t));
+            offset += sizeof(size_t);
+
+            memcpy(buf+offset, bufAttrib, lenAttrib);
+            offset += lenAttrib;
+
+            if ( bufImage != NULL ) delete [] bufImage;
+            if ( bufAttrib != NULL ) delete [] bufAttrib;
+        }
+        catch(...)
+        {
+            if ( bufImage != NULL ) delete [] bufImage;
+            if ( bufAttrib != NULL ) delete [] bufAttrib;
+
+            GERROR_STREAM("Errors happened in hoNDImageAttrib<T, D>::serialize(char*& buf, size_t& len) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T, unsigned int D> 
+    bool hoNDImageAttrib<T, D>::deserialize(char* buf, size_t& len)
+    {
+        try
+        {
+            size_t lenImage(0);
+            size_t lenAttrib(0);
+
+            size_t offset = 0;
+            memcpy(&lenImage, buf, sizeof(size_t));
+            offset += sizeof(size_t);
+
+            GADGET_CHECK_RETURN_FALSE(BaseClass::deserialize(buf+offset, lenImage));
+            offset += lenImage;
+
+            memcpy(&lenAttrib, buf+offset, sizeof(size_t));
+            offset += sizeof(size_t);
+
+            GADGET_CHECK_RETURN_FALSE(attrib_.deserialize(buf+offset, lenAttrib));
+            offset += lenAttrib;
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageAttrib<T, D>::deserialize(char* buf, size_t& len) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImageAttrib<T, D>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "-------------- Gagdgetron ND Image with meta attributes -------------" << endl;
+        this->printContent(os);
+    }
+
+    template <typename T, unsigned int D> 
+    inline void hoNDImageAttrib<T, D>::printContent(std::ostream& os) const
+    {
+        BaseClass::printContent(os);
+        attrib_.print(os);
+    }
+}
diff --git a/toolboxes/core/cpu/image/hoNDImageContainer2D.h b/toolboxes/core/cpu/image/hoNDImageContainer2D.h
new file mode 100644
index 0000000..7df6100
--- /dev/null
+++ b/toolboxes/core/cpu/image/hoNDImageContainer2D.h
@@ -0,0 +1,1223 @@
+/** \file       hoNDImageContainer2D.h
+    \brief      a container class to store a matrix of hoNDImages
+
+                The name "container2D" does not refer to 2D images; it means the container uses a 2D array as its storage logic.
+
+                Pointers to the images are stored in this container. The images themselves are deleted on destruction if delete_data_on_destruct_==true.
+                The images are stored as a 2D array, but every row can have a different number of images (or columns), so it is not exactly an
+                image matrix.
+
+    \author     Hui Xue
+*/
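+
+/*
+    Usage sketch (illustrative only; the image type and sizes are assumptions, not part of this header):
+
+        typedef hoNDImage<float, 2> Image2DType;
+        hoNDImageContainer2D<Image2DType> container;
+
+        std::vector<size_t> col(3);
+        col[0] = 4; col[1] = 2; col[2] = 5;      // three rows holding 4, 2 and 5 images
+        container.create(col);                   // images are allocated and owned by the container
+
+        Image2DType& im = container(1, 0);       // first image of the second row
+*/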
+
+#pragma once
+
+#include "hoNDImage.h"
+#include "hoNDArray_elemwise.h"
+
+namespace Gadgetron
+{
+
+    template <typename ImageType>
+    class hoNDImageContainer2D
+    {
+    public:
+
+        typedef hoNDImageContainer2D<ImageType> Self;
+
+        typedef typename ImageType::value_type value_type;
+        typedef typename ImageType::coord_type coord_type;
+        typedef typename ImageType::a_axis_type a_axis_type;
+        typedef typename ImageType::axis_type axis_type;
+
+        /// constructors
+        hoNDImageContainer2D(bool delete_data_on_destruct=true);
+        hoNDImageContainer2D(const hoNDImageContainer2D<ImageType>& a);
+
+        Self& operator=(const Self& rhs);
+
+        virtual ~hoNDImageContainer2D();
+
+        /// create a container with images
+        bool create(const std::vector<size_t>& col, bool createImage=true);
+
+        /// create a container with images of certain sizes/pixel sizes/axes
+        /// the images will not be filled with zeros
+        bool create(const std::vector<size_t>& col, const std::vector<size_t>& dimensions);
+        bool create(const std::vector<size_t>& col, const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize);
+        bool create(const std::vector<size_t>& col, const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin);
+        bool create(const std::vector<size_t>& col, const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin, const axis_type& axis);
+
+        /// create a container from a chunk of memory
+        /// dim.size() equals ImageType::NDIM (a single image) or ImageType::NDIM+1
+        /// e.g., a 3D memory chunk [RO E1 N] is used to allocate N [RO E1] images
+        /// the container will have 1 row and N columns
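+        /// e.g. (illustrative) given a float buffer holding 10 images of size 192x144 stored back-to-back:
+        ///     std::vector<size_t> dim(3); dim[0] = 192; dim[1] = 144; dim[2] = 10;
+        ///     container.create(buf, dim);  // one row with 10 columns; the images do not own the memory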
+        bool create(value_type* buf, const std::vector<size_t>& dim);
+
+        /// clear the matrix, if delete_data_on_destruct_==true, delete all stored images
+        bool clear();
+
+        /// copy from a container 2D, deep copy image content
+        bool copyFrom(const Self& a);
+
+        /// fill all images with zeros
+        bool fillWithZeros();
+
+        /// whether two containers have the same size
+        template <typename ImageType2> 
+        bool dimensions_equal_container(const hoNDImageContainer2D<ImageType2>& a) const
+        {
+            if ( this->rows() != a.rows() ) return false;
+
+            unsigned int row;
+            for ( row=0; row<this->rows(); row++ )
+            {
+                if ( this->cols(row) != a.cols(row) )
+                {
+                    return false;
+                }
+            }
+
+            return true;
+        }
+
+        /// add one image to a row at end
+        bool push_back(ImageType& im, size_t row);
+
+        /// add one image to a row at head
+        bool push_front(ImageType& im, size_t row);
+
+        /// add one image to a row
+        bool insert(ImageType& im, size_t row, size_t col);
+
+        /// pop an image from a row end
+        bool pop_back(ImageType*& im, size_t row);
+
+        /// pop an image from a row head
+        bool pop_front(ImageType*& im, size_t row);
+
+        /// remove an image from the storage
+        bool remove(ImageType*& im, size_t row, size_t col);
+
+        /// if delete_data_on_destruct_==true, the image will be deleted
+        bool remove(size_t row, size_t col);
+
+        /// expand the container by certain number of rows
+        bool expand(size_t newRows);
+
+        /// insert one row
+        bool insert(std::vector<ImageType*>& im_array, size_t row);
+
+        /// remove one row
+        bool remove(std::vector<ImageType*>& im_array, size_t row);
+        /// if delete_data_on_destruct_==true, the images in the row will be deleted
+        bool remove(size_t row);
+
+        /// get image pointers
+        ImageType& get(size_t row, size_t col);
+        const ImageType& get(size_t row, size_t col) const;
+
+        ImageType& operator() (size_t row, size_t col);
+        const ImageType& operator() (size_t row, size_t col) const;
+
+        /// get one row
+        bool get(std::vector<ImageType*>& im_array, size_t row) const;
+
+        /// get number of all images in the container
+        size_t get_number_of_all_images();
+
+        /// get all images
+        bool get_all_images(std::vector<ImageType*>& im_array);
+
+        /// set image pointer
+        bool set(ImageType* pImage, size_t row, size_t col);
+
+        /// convert one row to a hoNDArray
+        /// all images in this row should have the same dimensions; if not, return false
+        bool to_NDArray(size_t row, hoNDArray<value_type>& a) const;
+
+        /// whether to delete the memory on destruction
+        bool delete_data_on_destruct() const;
+        void delete_data_on_destruct(bool d);
+
+        /// get number of row and column
+        size_t rows() const;
+        size_t cols(size_t row) const;
+        std::vector<size_t> cols() const;
+
+        /// check whether all images in a row have the same dimensions/pixelSizes/axes
+        bool has_identical_dimensions(unsigned int row) const;
+        bool has_identical_pixel_size(unsigned int row) const;
+        bool has_identical_axis(unsigned int row) const;
+        bool has_identical_image_geometry(unsigned int row) const;
+
+        /// serialize/deserialize
+        virtual bool serialize(char*& buf, size_t& len) const;
+        virtual bool deserialize(char* buf, size_t& len);
+
+        /// print out the image container information
+        virtual void print(std::ostream& os) const;
+
+    protected:
+
+        std::vector< std::vector<ImageType*> > image_container_;
+
+        bool delete_data_on_destruct_;
+    };
+
+    template <typename ImageType> 
+    hoNDImageContainer2D<ImageType>::hoNDImageContainer2D(bool delete_data_on_destruct) : delete_data_on_destruct_(delete_data_on_destruct)
+    {
+    }
+
+    template <typename ImageType> 
+    hoNDImageContainer2D<ImageType>::hoNDImageContainer2D(const hoNDImageContainer2D<ImageType>& a) : delete_data_on_destruct_(false)
+    {
+        *this = a;
+    }
+
+    template <typename ImageType> 
+    hoNDImageContainer2D<ImageType>& hoNDImageContainer2D<ImageType>::operator=(const Self& rhs)
+    {
+        if ( this == &rhs ) return *this;
+
+        this->clear();
+        size_t row = rhs.rows();
+
+        size_t ii;
+        for ( ii=0; ii<row; ii++ )
+        {
+            std::vector<ImageType*> a_row;
+            rhs.get(a_row, ii);
+            this->image_container_.push_back(a_row);
+        }
+
+        this->delete_data_on_destruct_ = false;
+
+        return *this;
+    }
+
+    template <typename ImageType> 
+    hoNDImageContainer2D<ImageType>::~hoNDImageContainer2D()
+    {
+        this->clear();
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::create(const std::vector<size_t>& col, bool createImage)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(this->clear());
+            if ( createImage )
+            {
+                this->delete_data_on_destruct(true);
+            }
+            else
+            {
+                this->delete_data_on_destruct(false);
+            }
+
+            size_t row = col.size();
+            image_container_.resize(row);
+
+            unsigned int r, c;
+            for ( r=0; r<row; r++ )
+            {
+                image_container_[r].resize(col[r], NULL);
+
+                if ( createImage )
+                {
+                    for ( c=0; c<col[r]; c++ )
+                    {
+                        image_container_[r][c] = new ImageType();
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::create(const std::vector<size_t>& col) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::create(const std::vector<size_t>& col, const std::vector<size_t>& dimensions)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(this->clear());
+            this->delete_data_on_destruct(true);
+
+            size_t row = col.size();
+            image_container_.resize(row);
+
+            unsigned int r, c;
+            for ( r=0; r<row; r++ )
+            {
+                image_container_[r].resize(col[r], NULL);
+
+                for ( c=0; c<col[r]; c++ )
+                {
+                    image_container_[r][c] = new ImageType(dimensions);
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::create(const std::vector<size_t>& col, const std::vector<size_t>& dimensions) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::create(const std::vector<size_t>& col, const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(this->clear());
+            this->delete_data_on_destruct(true);
+
+            size_t row = col.size();
+            image_container_.resize(row);
+
+            unsigned int r, c;
+            for ( r=0; r<row; r++ )
+            {
+                image_container_[r].resize(col[r], NULL);
+
+                for ( c=0; c<col[r]; c++ )
+                {
+                    image_container_[r][c] = new ImageType(dimensions, pixelSize);
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::create(const std::vector<size_t>& col, const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::create(const std::vector<size_t>& col, const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(this->clear());
+            this->delete_data_on_destruct(true);
+
+            size_t row = col.size();
+            image_container_.resize(row);
+
+            unsigned int r, c;
+            for ( r=0; r<row; r++ )
+            {
+                image_container_[r].resize(col[r], NULL);
+
+                for ( c=0; c<col[r]; c++ )
+                {
+                    image_container_[r][c] = new ImageType(dimensions, pixelSize, origin);
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::create(const std::vector<size_t>& col, const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::create(const std::vector<size_t>& col, const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin, const axis_type& axis)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(this->clear());
+            this->delete_data_on_destruct(true);
+
+            size_t row = col.size();
+            image_container_.resize(row);
+
+            unsigned int r, c;
+            for ( r=0; r<row; r++ )
+            {
+                image_container_[r].resize(col[r], NULL);
+
+                for ( c=0; c<col[r]; c++ )
+                {
+                    image_container_[r][c] = new ImageType(dimensions, pixelSize, origin, axis);
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::create(const std::vector<size_t>& col, const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin, const axis_type& axis) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::create(value_type* buf, const std::vector<size_t>& dim)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE( (dim.size()==ImageType::NDIM) || (dim.size()==ImageType::NDIM+1) );
+
+            GADGET_CHECK_RETURN_FALSE(this->clear());
+            this->delete_data_on_destruct(true);
+
+            unsigned int ii;
+            size_t col;
+            std::vector<size_t> dim_im;
+            if ( dim.size()==ImageType::NDIM )
+            {
+                dim_im = dim;
+                col = 1;
+            }
+            else
+            {
+                dim_im.resize(ImageType::NDIM);
+                memcpy(&dim_im[0], &dim[0], sizeof(size_t)*ImageType::NDIM);
+                col = dim[ImageType::NDIM];
+            }
+
+            size_t row = 1;
+            image_container_.resize(row);
+            image_container_[0].resize(col);
+
+            size_t numOfPixels = 1;
+            for ( ii=0; ii<dim_im.size(); ii++ )
+            {
+                numOfPixels *= dim_im[ii];
+            }
+
+            unsigned int c;
+            for ( c=0; c<col; c++ )
+            {
+                image_container_[0][c] = new ImageType();
+                image_container_[0][c]->create(dim_im, buf+c*numOfPixels, false);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::create(value_type* buf, const std::vector<size_t>& dim) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::clear()
+    {
+        try
+        {
+            if ( delete_data_on_destruct_ )
+            {
+                size_t row = this->rows();
+
+                unsigned int ii, jj;
+                for ( ii=0; ii<row; ii++ )
+                {
+                    size_t col = this->cols(ii);
+                    for ( jj=0; jj<col; jj++ )
+                    {
+                        ImageType* pImg = image_container_[ii][jj];
+                        if ( pImg != NULL )
+                        {
+                            delete pImg;
+                            image_container_[ii][jj] = NULL;
+                        }
+                    }
+                }
+            }
+
+            image_container_.clear();
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::clear() ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::copyFrom(const Self& a)
+    {
+        try
+        {
+            if ( !this->dimensions_equal_container(a) )
+            {
+                GADGET_CHECK_RETURN_FALSE(this->clear());
+                this->delete_data_on_destruct(true);
+
+                GADGET_CHECK_RETURN_FALSE(this->create(a.cols()));
+            }
+
+            size_t row = this->rows();
+
+            unsigned int ii, jj;
+            for ( ii=0; ii<row; ii++ )
+            {
+                size_t col = this->cols(ii);
+                for ( jj=0; jj<col; jj++ )
+                {
+                    image_container_[ii][jj]->copyImageInfoAndContent(a(ii, jj));
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::copyFrom(const Self& a) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::fillWithZeros()
+    {
+        try
+        {
+            size_t row = this->rows();
+
+            unsigned int ii, jj;
+            for ( ii=0; ii<row; ii++ )
+            {
+                size_t col = this->cols(ii);
+                for ( jj=0; jj<col; jj++ )
+                {
+                    memset(image_container_[ii][jj]->begin(), 0, image_container_[ii][jj]->get_number_of_bytes());
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::fillWithZeros() ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::push_back(ImageType& im, size_t row)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(row<this->rows());
+            image_container_[row].push_back(&im);
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::push_back(ImageType& im, size_t row) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::push_front(ImageType& im, size_t row)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(row<this->rows());
+            image_container_[row].insert(image_container_[row].begin(), &im);
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::push_front(ImageType& im, size_t row) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::insert(ImageType& im, size_t row, size_t col)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(row<this->rows());
+            GADGET_CHECK_RETURN_FALSE(col<this->cols(row));
+
+            image_container_[row].insert(image_container_[row].begin()+col, &im);
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::insert(ImageType& im, size_t row, size_t col) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::pop_back(ImageType*& im, size_t row)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(row<this->rows());
+
+            if ( this->cols(row) == 0 )
+            {
+                im = NULL;
+                return true;
+            }
+
+            im = image_container_[row][this->cols(row)-1];
+            image_container_[row].pop_back();
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::pop_back(ImageType*& im, size_t row) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::pop_front(ImageType*& im, size_t row)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(row<this->rows());
+
+            if ( this->cols(row) == 0 )
+            {
+                im = NULL;
+                return true;
+            }
+
+            im = image_container_[row][0];
+            image_container_[row].erase(image_container_[row].begin());
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::pop_front(ImageType*& im, size_t row) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::remove(ImageType*& im, size_t row, size_t col)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(row<this->rows());
+            GADGET_CHECK_RETURN_FALSE(col<this->cols(row));
+
+            im = image_container_[row][col];
+
+            image_container_[row].erase(image_container_[row].begin()+col);
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::remove(ImageType*& im, size_t row, size_t col) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::remove(size_t row, size_t col)
+    {
+        try
+        {
+            ImageType* im = NULL;
+            GADGET_CHECK_RETURN_FALSE(this->remove(im, row, col));
+            if( delete_data_on_destruct_ ) delete im;
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::remove(size_t row, size_t col) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::expand(size_t newRows)
+    {
+        try
+        {
+            size_t row = this->rows();
+            if ( newRows > 0 )
+            {
+                image_container_.resize(row+newRows);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::expand(size_t newRows) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::insert(std::vector<ImageType*>& im_array, size_t row)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(row<this->rows());
+            image_container_.insert(image_container_.begin()+row, im_array);
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::insert(std::vector<ImageType*>& im_array, size_t row) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::remove(std::vector<ImageType*>& im_array, size_t row)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(row<this->rows());
+            im_array = image_container_[row];
+            image_container_.erase(image_container_.begin()+row);
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::remove(std::vector<ImageType*>& im_array, size_t row) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::remove(size_t row)
+    {
+        try
+        {
+            std::vector<ImageType*> im_array;
+            GADGET_CHECK_RETURN_FALSE(this->remove(im_array, row));
+
+            if( delete_data_on_destruct_ )
+            {
+                size_t N = im_array.size();
+                unsigned int ii;
+                for ( ii=0; ii<N; ii++ )
+                {
+                    if ( im_array[ii] != NULL )
+                    {
+                        delete im_array[ii];
+                        im_array[ii] = NULL;
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::remove(size_t row) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    /// get image pointers
+    template <typename ImageType> 
+    inline ImageType& hoNDImageContainer2D<ImageType>::get(size_t row, size_t col)
+    {
+        GADGET_DEBUG_CHECK_THROW(row<this->rows());
+        GADGET_DEBUG_CHECK_THROW(col<this->cols(row));
+
+        return *(image_container_[row][col]);
+    }
+
+    template <typename ImageType> 
+    inline const ImageType& hoNDImageContainer2D<ImageType>::get(size_t row, size_t col) const
+    {
+        GADGET_DEBUG_CHECK_THROW(row<this->rows());
+        GADGET_DEBUG_CHECK_THROW(col<this->cols(row));
+
+        return *(image_container_[row][col]);
+    }
+
+    template <typename ImageType> 
+    inline ImageType& hoNDImageContainer2D<ImageType>::operator() (size_t row, size_t col)
+    {
+        GADGET_DEBUG_CHECK_THROW(row<this->rows());
+        GADGET_DEBUG_CHECK_THROW(col<this->cols(row));
+
+        return *(image_container_[row][col]);
+    }
+
+    template <typename ImageType> 
+    inline const ImageType& hoNDImageContainer2D<ImageType>::operator() (size_t row, size_t col) const
+    {
+        GADGET_DEBUG_CHECK_THROW(row<this->rows());
+        GADGET_DEBUG_CHECK_THROW(col<this->cols(row));
+
+        return *(image_container_[row][col]);
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::get(std::vector<ImageType*>& im_array, size_t row) const
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(row<this->rows());
+            im_array = image_container_[row];
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::get(std::vector<ImageType*>& im_array, size_t row) const ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline size_t hoNDImageContainer2D<ImageType>::get_number_of_all_images()
+    {
+        try
+        {
+            size_t num = 0;
+
+            size_t row = this->rows();
+            if ( row == 0 ) return num;
+
+            unsigned int r;
+            for ( r=0; r<row; r++ )
+            {
+                num += this->cols(r);
+            }
+
+            return num;
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::get(std::vector<ImageType*>& im_array, size_t row) ... ");
+            return false;
+        }
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::get_all_images(std::vector<ImageType*>& im_array)
+    {
+        try
+        {
+            im_array.clear();
+
+            size_t row = this->rows();
+            if ( row == 0 ) return true;
+
+            size_t num = this->get_number_of_all_images();
+
+            im_array.resize(num, NULL);
+
+            unsigned int r, c, ind(0);
+            for ( r=0; r<row; r++ )
+            {
+                for ( c=0; c<this->cols(r); c++ )
+                {
+                    im_array[ind] = image_container_[r][c];
+                    ind++;
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::get(std::vector<ImageType*>& im_array, size_t row) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::set(ImageType* pImage, size_t row, size_t col)
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE(row<this->rows());
+        GADGET_DEBUG_CHECK_RETURN_FALSE(col<this->cols(row));
+
+        if ( image_container_[row][col] != NULL )
+        {
+            if ( this->delete_data_on_destruct() ) delete image_container_[row][col];
+        }
+
+        image_container_[row][col] = pImage;
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::to_NDArray(size_t row, hoNDArray<value_type>& a) const
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(row<this->rows());
+
+            size_t col = this->cols(row);
+            if ( col == 0 ) return true;
+
+            GADGET_CHECK_RETURN_FALSE(this->has_identical_dimensions( (unsigned int)row));
+
+            std::vector<size_t> dim;
+            image_container_[row][0]->get_dimensions(dim);
+
+            size_t numOfElements = image_container_[row][0]->get_number_of_elements();
+            size_t numOfBytes = image_container_[row][0]->get_number_of_bytes();
+
+            std::vector<size_t> dim_out(dim.size()+1);
+            memcpy(&dim_out[0], &dim[0], sizeof(size_t)*dim.size());
+            dim_out[ dim.size() ] = col;
+
+            a.create(dim_out);
+
+            unsigned int c;
+            for ( c=0; c<col; c++ )
+            {
+                memcpy(a.begin()+c*numOfElements, image_container_[row][c]->begin(), numOfBytes);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::to_NDArray(size_t row, hoNDArray<value_type>& a) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::delete_data_on_destruct() const
+    {
+        return this->delete_data_on_destruct_;
+    }
+
+    template <typename ImageType> 
+    inline void hoNDImageContainer2D<ImageType>::delete_data_on_destruct(bool d)
+    {
+        this->delete_data_on_destruct_ = d;
+    }
+
+    template <typename ImageType> 
+    inline size_t hoNDImageContainer2D<ImageType>::rows() const
+    {
+        return image_container_.size();
+    }
+
+    template <typename ImageType> 
+    inline size_t hoNDImageContainer2D<ImageType>::cols(size_t row) const
+    {
+        GADGET_DEBUG_CHECK_THROW(row<this->rows());
+        return image_container_[row].size();
+    }
+
+    template <typename ImageType> 
+    inline std::vector<size_t> hoNDImageContainer2D<ImageType>::cols() const
+    {
+        std::vector<size_t> col(this->rows(), 0);
+        unsigned int row;
+        for ( row=0; row<this->rows(); row++ )
+        {
+            col[row] = this->cols(row);
+        }
+        return col;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::has_identical_dimensions(unsigned int row) const
+    {
+        GADGET_CHECK_RETURN_FALSE(row<this->rows());
+
+        size_t col = this->cols(row);
+        if ( col == 0 ) return true;
+
+        unsigned int c;
+        for ( c=1; c<col; c++ )
+        {
+            if ( !image_container_[row][0]->dimensions_equal( *image_container_[row][c] ) )
+            {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::has_identical_pixel_size(unsigned int row) const
+    {
+        GADGET_CHECK_RETURN_FALSE(row<this->rows());
+
+        size_t col = this->cols(row);
+        if ( col == 0 ) return true;
+
+        unsigned int c;
+        for ( c=1; c<col; c++ )
+        {
+            if ( !image_container_[row][0]->pixel_size_equal( *image_container_[row][c] ) )
+            {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::has_identical_axis(unsigned int row) const
+    {
+        GADGET_CHECK_RETURN_FALSE(row<this->rows());
+
+        size_t col = this->cols(row);
+        if ( col == 0 ) return true;
+
+        unsigned int c;
+        for ( c=1; c<col; c++ )
+        {
+            if ( !image_container_[row][0]->axis_equal( *image_container_[row][c] ) )
+            {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::has_identical_image_geometry(unsigned int row) const
+    {
+        GADGET_CHECK_RETURN_FALSE(row<this->rows());
+
+        if ( !this->has_identical_dimensions(row) ) return false;
+        if ( !this->has_identical_pixel_size(row) ) return false;
+        if ( !this->has_identical_axis(row) ) return false;
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::serialize(char*& buf, size_t& totalLen) const 
+    {
+        try
+        {
+            // memory layout
+            // number of rows, number of cols for row 1 (col1), number of cols for row 2, ..., number of cols for row n
+            // offset for image[0][0], len of buffer for image[0][0], offset for image[0][1], len of buffer for image[0][1], ..., offset for image[0][col1-1], len of buffer for image[0][col1-1],
+            // ...
+            // offset for image[row-1][0], len of buffer for image[row-1][0], ..., offset for image[row-1][coln-1], len of buffer for image[row-1][coln-1],
+            // content for image[0][0], ..., image[0][col1-1], image[1][0], ..., image[row-1][coln-1]
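+            // e.g. (illustrative) for 2 rows holding 1 and 2 images with serialized lengths L00, L10 and L11:
+            //   [2][1][2] [off00][L00] [off10][L10] [off11][L11] [image(0,0) bytes][image(1,0) bytes][image(1,1) bytes]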
+
+            // starting for image content
+            size_t offsetImage = sizeof(size_t) + this->rows()*sizeof(size_t);
+
+            std::vector<size_t> col(this->rows());
+
+            std::vector< std::vector<size_t> > offset(this->rows());
+            std::vector< std::vector<size_t> > len(this->rows());
+            std::vector< std::vector<char*> > bufIm(this->rows());
+
+            size_t row, c;
+            for ( row=0; row<this->rows(); row++ )
+            {
+                col[row] = this->cols(row);
+                offset[row].resize(col[row], 0);
+                len[row].resize(col[row], 0);
+
+                offsetImage += sizeof(size_t)*col[row]*2;
+
+                bufIm[row].resize(col[row], NULL);
+            }
+
+            totalLen = offsetImage;
+
+            // running offset into the image content section; every image starts right after the previous one
+            size_t offsetCurr = offsetImage;
+
+            for ( row=0; row<this->rows(); row++ )
+            {
+                for ( c=0; c<col[row]; c++ )
+                {
+                    ImageType* im = image_container_[row][c];
+                    if ( im != NULL )
+                    {
+                        char* bufImCurr=NULL;
+                        size_t lenIm;
+
+                        im->serialize(bufImCurr, lenIm);
+
+                        bufIm[row][c] = bufImCurr;
+                        len[row][c] = lenIm;
+                    }
+                    else
+                    {
+                        len[row][c] = 0;
+                    }
+
+                    offset[row][c] = offsetCurr;
+                    offsetCurr += len[row][c];
+                    totalLen += len[row][c];
+                }
+            }
+
+            buf = new char[totalLen];
+            GADGET_CHECK_RETURN_FALSE(buf!=NULL);
+
+            size_t offsetBuf = 0;
+
+            size_t numOfRows = this->rows();
+            memcpy(buf+offsetBuf, &numOfRows, sizeof(size_t));
+            offsetBuf += sizeof(size_t);
+
+            memcpy(buf+offsetBuf, &col[0], sizeof(size_t)*numOfRows);
+            offsetBuf += sizeof(size_t)*numOfRows;
+
+            for ( row=0; row<this->rows(); row++ )
+            {
+                for ( c=0; c<col[row]; c++ )
+                {
+                    size_t v = offset[row][c];
+                    size_t lv = len[row][c];
+
+                    memcpy(buf+offsetBuf, &v, sizeof(size_t));
+                    offsetBuf += sizeof(size_t);
+
+                    memcpy(buf+offsetBuf, &lv, sizeof(size_t));
+                    offsetBuf += sizeof(size_t);
+                }
+            }
+
+            for ( row=0; row<this->rows(); row++ )
+            {
+                for ( c=0; c<col[row]; c++ )
+                {
+                    if ( bufIm[row][c] != NULL )
+                    {
+                        memcpy(buf+offsetBuf, bufIm[row][c], len[row][c]);
+                        offsetBuf += len[row][c];
+
+                        delete [] bufIm[row][c];
+                        bufIm[row][c] = NULL;
+                    }
+                }
+            }
+
+            GADGET_CHECK_RETURN_FALSE(totalLen == offsetBuf);
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::serialize(char*& buf, size_t& len) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline bool hoNDImageContainer2D<ImageType>::deserialize(char* buf, size_t& len)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(this->clear());
+            this->delete_data_on_destruct(true);
+
+            size_t offsetBuf = 0;
+
+            size_t numOfRows(0);
+            memcpy(&numOfRows, buf+offsetBuf, sizeof(size_t));
+            offsetBuf += sizeof(size_t);
+
+            if ( numOfRows == 0 ) return true;
+
+            image_container_.resize(numOfRows);
+
+            std::vector<size_t> col(numOfRows);
+
+            memcpy(&col[0], buf+offsetBuf, sizeof(size_t)*numOfRows);
+            offsetBuf += sizeof(size_t)*numOfRows;
+
+            size_t row, c;
+            for ( row=0; row<this->rows(); row++ )
+            {
+                image_container_[row].resize(col[row], NULL);
+
+                for ( c=0; c<col[row]; c++ )
+                {
+                    size_t offsetCurr, lenCurr;
+
+                    memcpy(&offsetCurr, buf+offsetBuf, sizeof(size_t));
+                    offsetBuf += sizeof(size_t);
+
+                    memcpy(&lenCurr, buf+offsetBuf, sizeof(size_t));
+                    offsetBuf += sizeof(size_t);
+
+                    image_container_[row][c] = new ImageType();
+                    GADGET_CHECK_RETURN_FALSE(image_container_[row][c]!=NULL);
+
+                    if ( lenCurr > 0 )
+                    {
+                        GADGET_CHECK_RETURN_FALSE(image_container_[row][c]->deserialize(buf+offsetCurr, lenCurr));
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoNDImageContainer2D<ImageType>::deserialize(char* buf, size_t& len) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ImageType> 
+    inline void hoNDImageContainer2D<ImageType>::print(std::ostream& os) const
+    {
+        using namespace std;
+
+        os.unsetf(std::ios::scientific);
+        os.setf(ios::fixed);
+
+        size_t r, c;
+
+        os << "--------------Gagdgetron Image Container 2D -------------" << endl;
+        os << "Image type is : " << std::string(typeid(ImageType).name()) << endl;
+        os << "Number of stored image rows is : " << this->rows() << endl;
+        for ( r=0; r<this->rows(); r++ )
+        {
+            os << "Row " << r << " has " << this->cols(r) << " images " << endl;
+        }
+        os << "---------------------------------------------------------" << endl;
+        for ( r=0; r<this->rows(); r++ )
+        {
+            os << "Row " << r << " : "<< endl;
+            os << "=========================================================" << endl;
+            for ( c=0; c<this->cols(r); c++ )
+            {
+                if ( c > 2 ) break;
+
+                if ( image_container_[r][c] != NULL )
+                {
+                    os << "--> Image " << c << " : "<< endl;
+                    image_container_[r][c]->print(os);
+                    os << "=========================================================" << endl;
+                }
+            }
+        }
+    }
+}
diff --git a/toolboxes/core/cpu/math/CMakeLists.txt b/toolboxes/core/cpu/math/CMakeLists.txt
new file mode 100644
index 0000000..7c48960
--- /dev/null
+++ b/toolboxes/core/cpu/math/CMakeLists.txt
@@ -0,0 +1,78 @@
+if (WIN32)
+    ADD_DEFINITIONS(-D__BUILD_GADGETRON_CPUCORE_MATH__)
+endif (WIN32)
+
+include_directories(
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/algorithm
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+    ${ARMADILLO_INCLUDE_DIRS} 
+    ${ISMRMRD_INCLUDE_DIR} )
+
+if (MKL_FOUND)
+    INCLUDE_DIRECTORIES( ${MKL_INCLUDE_DIR} )
+    LINK_DIRECTORIES( ${MKL_LIB_DIR} ${MKL_COMPILER_LIB_DIR} )
+endif (MKL_FOUND)
+
+set(cpucore_math_header_files
+    cpucore_math_export.h
+    # hoNDArray_math_util.h
+    hoNDArray_math.h
+    hoNDImage_util.h
+    hoNDImage_util.hxx
+    hoNDImage_util_instantiate.hxx 
+    hoNDArray_linalg.h )
+
+set(cpucore_math_src_files 
+    # hoNDArray_math_util.cpp
+    hoNDImage_util.cpp 
+    hoNDArray_linalg.cpp )
+
+if (ARMADILLO_FOUND)
+
+    set(cpucore_math_header_files 
+        ${cpucore_math_header_files}
+        hoNDArray_reductions.h
+        hoArmadillo.h
+        hoNDArray_elemwise.h
+         )
+
+    set(cpucore_math_src_files 
+        ${cpucore_math_src_files} 
+        hoNDArray_reductions.cpp
+        hoNDArray_elemwise.cpp
+        )
+
+endif (ARMADILLO_FOUND)
+
+add_library(gadgetron_toolbox_cpucore_math SHARED ${cpucore_math_header_files} ${cpucore_math_src_files})
+set_target_properties(gadgetron_toolbox_cpucore_math PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+if (MKL_FOUND)
+    target_link_libraries(gadgetron_toolbox_cpucore_math 
+        gadgetron_toolbox_cpucore
+        gadgetron_toolbox_log
+        ${ARMADILLO_LIBRARIES}
+        ${ISMRMRD_LIBRARIES}
+        ${FFTW3_LIBRARIES} 
+        ${MKL_LIBRARIES}
+    )
+else (MKL_FOUND)
+    target_link_libraries(gadgetron_toolbox_cpucore_math 
+        gadgetron_toolbox_cpucore
+        gadgetron_toolbox_log
+        ${ARMADILLO_LIBRARIES}
+        ${ISMRMRD_LIBRARIES}
+        ${BLAS_LIBRARIES}
+        ${LAPACK_LIBRARIES}
+        ${FFTW3_LIBRARIES}
+    )
+endif (MKL_FOUND)
+
+install(TARGETS gadgetron_toolbox_cpucore_math DESTINATION lib COMPONENT main)
+
+install(FILES 
+    ${cpucore_math_header_files}
+    DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
diff --git a/toolboxes/core/cpu/math/cpucore_math_export.h b/toolboxes/core/cpu/math/cpucore_math_export.h
new file mode 100644
index 0000000..1da820e
--- /dev/null
+++ b/toolboxes/core/cpu/math/cpucore_math_export.h
@@ -0,0 +1,18 @@
+/** \file cpucore_math_export.h
+    \brief Required definitions for Windows, importing/exporting dll symbols 
+*/
+
+#ifndef CPUCORE_MATH_EXPORT_H_
+#define CPUCORE_MATH_EXPORT_H_
+
+#if defined (WIN32)
+    #if defined (__BUILD_GADGETRON_CPUCORE_MATH__) || defined (gadgetron_toolbox_cpucore_math_EXPORTS)
+        #define EXPORTCPUCOREMATH __declspec(dllexport)
+    #else
+        #define EXPORTCPUCOREMATH __declspec(dllimport)
+    #endif
+#else
+#define EXPORTCPUCOREMATH
+#endif
+
+#endif /* CPUCORE_MATH_EXPORT_H_ */
diff --git a/toolboxes/core/cpu/math/hoArmadillo.h b/toolboxes/core/cpu/math/hoArmadillo.h
new file mode 100644
index 0000000..87d65bd
--- /dev/null
+++ b/toolboxes/core/cpu/math/hoArmadillo.h
@@ -0,0 +1,89 @@
+#pragma once
+#define ARMA_64BIT_WORD
+#include "hoNDArray.h"
+
+#ifdef USE_ARMADILLO
+
+#include <armadillo>
+
+/** \file hoArmadillo.h
+\brief Utilities to create an Armadillo matrix or column vector from an hoNDArray.
+
+Utilities to create an Armadillo matrix or column vector from an hoNDArray.
+A helper function that creates an hoNDArray from an Armadillo matrix or vector is deliberately omitted:
+the recommended approach to using Armadillo's functionality while providing an hoNDArray of the result is to
+1) create an hoNDArray to hold the result, 
+2) convert this array to an Armadillo matrix or vector using the utilities provided in this header,
+3) assign the desired Armadillo computation to this array.
+This approach ensures that the Gadgetron -- and not Armadillo -- is responsible for subsequent memory handling.
+We refer to hoNDArray_math.h for some specific examples on how to use this Armadillo interface.
+*/
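+
+/*
+    Example (a sketch of the recommended pattern above; the array sizes and the size-based
+    hoNDArray constructors used here are assumptions):
+
+        hoNDArray<float> A_arr(64, 32), x_arr(32), r_arr(64);   // 1) allocate the result (and inputs) as hoNDArrays
+        arma::Mat<float> A = as_arma_matrix(&A_arr);
+        arma::Col<float> x = as_arma_col(&x_arr);
+        arma::Col<float> r = as_arma_col(&r_arr);               // 2) map the result array to Armadillo
+        r = A * x;                                              // 3) Armadillo writes into r_arr's memory
+*/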
+
+namespace Gadgetron{
+
+  /**
+   * @brief Creates an Armadillo matrix from a two-dimensional hoNDArray.
+   * @param[in] x Input array.
+   * @return An Armadillo array mapped to the data pointer of the hoNDArray.
+   */
+  template<class T> arma::Mat<typename stdType<T>::Type> as_arma_matrix( hoNDArray<T> *x )
+  {
+    if( x->get_number_of_dimensions() != 2 )
+      throw std::runtime_error("Wrong number of dimensions. Cannot convert hoNDArray to matrix");
+    return arma::Mat<typename stdType<T>::Type>( (typename stdType<T>::Type*) x->get_data_ptr(), x->get_size(0), x->get_size(1), false, true );
+  }
+
+  /**
+   * @brief Creates an Armadillo matrix from a two-dimensional hoNDArray.
+   * @param[in] x Input array.
+   * @return An Armadillo array mapped to the data pointer of the hoNDArray.
+   */
+  template<class T> const arma::Mat<typename stdType<T>::Type> as_arma_matrix( const hoNDArray<T> *x )
+  {
+    if( x->get_number_of_dimensions() != 2 )
+      throw std::runtime_error("Wrong number of dimensions. Cannot convert hoNDArray to matrix");
+    return arma::Mat<typename stdType<T>::Type>( (typename stdType<T>::Type*) x->get_data_ptr(), x->get_size(0), x->get_size(1), false, true );
+  }
+  
+  /**
+   * @brief Creates an Armadillo column vector from an arbitrary-dimensional hoNDArray.
+   * @param[in] x Input array.
+   * @return An Armadillo array mapped to the data pointer of the hoNDArray.
+   */
+  template<class T> arma::Col<typename stdType<T>::Type > as_arma_col( hoNDArray<T> *x )
+  {
+    return arma::Col<typename stdType<T>::Type>( (typename stdType<T>::Type*) x->get_data_ptr(), x->get_number_of_elements(), false, true );
+  }
+
+  /**
+   * @brief Creates an Armadillo column vector from an arbitrary-dimensional hoNDArray.
+   * @param[in] x Input array.
+   * @return An Armadillo array mapped to the data pointer of the hoNDArray.
+   */
+  template<class T> const arma::Col<typename stdType<T>::Type > as_arma_col( const hoNDArray<T> *x )
+  {
+    return arma::Col<typename stdType<T>::Type>( (typename stdType<T>::Type*) x->get_data_ptr(), x->get_number_of_elements(), false, true );
+  }
+
+  /**
+     * @brief Creates an Armadillo row vector from an arbitrary-dimensional hoNDArray.
+     * @param[in] x Input array.
+     * @return An Armadillo array mapped to the data pointer of the hoNDArray.
+     */
+    template<class T> arma::Row<typename stdType<T>::Type > as_arma_row( hoNDArray<T> *x )
+    {
+      return arma::Row<typename stdType<T>::Type>( (typename stdType<T>::Type*) x->get_data_ptr(), x->get_number_of_elements(), false, true );
+    }
+
+    /**
+     * @brief Creates an Armadillo row vector from an arbitrary-dimensional hoNDArray.
+     * @param[in] x Input array.
+     * @return An Armadillo array mapped to the data pointer of the hoNDArray.
+     */
+    template<class T> const arma::Row<typename stdType<T>::Type > as_arma_row( const hoNDArray<T> *x )
+    {
+      return arma::Row<typename stdType<T>::Type>( (typename stdType<T>::Type*) x->get_data_ptr(), x->get_number_of_elements(), false, true );
+    }
+}
+
+#endif // USE_ARMADILLO
diff --git a/toolboxes/core/cpu/math/hoNDArray_elemwise.cpp b/toolboxes/core/cpu/math/hoNDArray_elemwise.cpp
new file mode 100644
index 0000000..ec9f03f
--- /dev/null
+++ b/toolboxes/core/cpu/math/hoNDArray_elemwise.cpp
@@ -0,0 +1,2936 @@
+#include "hoNDArray_elemwise.h"
+#include "hoNDArray_reductions.h"
+#include "complext.h"
+#include "hoArmadillo.h"
+
+#ifdef USE_OMP
+    #include <omp.h>
+#endif
+
+#ifndef lapack_int
+    #define lapack_int int
+#endif // lapack_int
+
+#ifndef lapack_complex_float
+    #define lapack_complex_float  std::complex<float> 
+#endif // lapack_complex_float
+
+#ifndef lapack_complex_double
+    #define lapack_complex_double  std::complex<double> 
+#endif // #ifndef lapack_complex_double
+
+#define NumElementsUseThreading 64*1024
+
+namespace Gadgetron{
+
+  //
+  // Math internal complex types
+  // this replaces std::complex<T> with complext<T>
+  //
+  template <class T> struct mathInternalType {};
+  template <class T> struct mathInternalType<std::complex<T> > {typedef complext<T> type;};
+  template <> struct mathInternalType<float> {typedef float type;};
+  template <> struct mathInternalType<double> {typedef double type;};
+  template <> struct mathInternalType<complext<float> > {typedef complext<float> type;};
+  template <> struct mathInternalType<complext<double> > {typedef complext<double> type;};
+
+  // --------------------------------------------------------------------------------
+
+  // internal low level function for element-wise addition of two arrays
+  template <class T, class S>
+  void add_impl(size_t sizeX, size_t sizeY, const T* x, const S* y, typename mathReturnType<T,S>::type * r)
+  {
+
+    // cast to internal types
+    const typename mathInternalType<T>::type * a = reinterpret_cast<const typename mathInternalType<T>::type *>(x);
+    const typename mathInternalType<S>::type * b = reinterpret_cast<const typename mathInternalType<S>::type *>(y);
+    typename mathInternalType<typename mathReturnType<T,S>::type >::type * c = reinterpret_cast<typename mathInternalType<typename mathReturnType<T,S>::type >::type *>(r);
+
+    if (sizeY>sizeX) {
+        throw std::runtime_error("Add cannot broadcast when the size of x is less than the size of y.");
+    }
+
+    if (sizeX==sizeY) {
+        // No Broadcasting
+        long long loopsize = sizeX;
+        long long n;
+#ifdef USE_OMP
+#pragma omp parallel for default(none) private(n) shared(loopsize, c, a, b) if (loopsize>NumElementsUseThreading)
+#endif
+        for (n=0; n< loopsize; n++ )
+          {
+            c[n] = a[n]+b[n];
+          }
+    } else {
+        // Broadcasting
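+        // broadcasting here means y (sizeY elements) is added to each of the sizeX/sizeY
+        // consecutive chunks of x; add() has already checked compatibility via compatible_dimensions()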
+        long long outerloopsize = sizeX/sizeY;
+        long long innerloopsize = sizeX/outerloopsize;
+        if (sizeX<NumElementsUseThreading) {
+            // No OMP at All
+            for (long long outer=0; outer<outerloopsize; outer++) {
+                size_t offset = outer * innerloopsize;
+                const typename mathInternalType<T>::type * ai= &a[offset];
+                typename mathInternalType<typename mathReturnType<T,S>::type >::type * ci = &c[offset];
+                for (long long n=0; n< innerloopsize; n++ )
+                  {
+                    ci[n] = ai[n]+b[n];
+                  }
+            }
+        } else if (innerloopsize>NumElementsUseThreading) {
+            // OMP in the inner loop
+            for (long long outer=0; outer<outerloopsize; outer++) {
+                size_t offset = outer * innerloopsize;
+                const typename mathInternalType<T>::type * ai= &a[offset];
+                typename mathInternalType<typename mathReturnType<T,S>::type >::type * ci = &c[offset];
+                long long n;
+#ifdef USE_OMP
+#pragma omp parallel for default(none) private(n) shared(innerloopsize, ci, ai, b)
+#endif
+                for (n=0; n< innerloopsize; n++ )
+                  {
+                    ci[n] = ai[n]+b[n];
+                  }
+            }
+        } else {
+            // OMP in the outer loop
+            long long outer;
+#ifdef USE_OMP
+#pragma omp parallel for default(none) private(outer) shared(outerloopsize, c, a, b, innerloopsize)
+#endif
+            for (outer=0; outer<outerloopsize; outer++) {
+                size_t offset = outer * innerloopsize;
+                const typename mathInternalType<T>::type * ai = &a[offset];
+                typename mathInternalType<typename mathReturnType<T,S>::type >::type * ci = &c[offset];
+                for (long long n=0; n< innerloopsize; n++ )
+                  {
+                    ci[n] = ai[n]+b[n];
+                  }
+            }
+        }
+    }
+
+  }
+
+  template <class T, class S>
+  void add(const hoNDArray<T>& x, const hoNDArray<S>& y, hoNDArray<typename mathReturnType<T,S>::type >& r)
+  {
+    // Check the dimensions of x and y for broadcasting.
+    if (!compatible_dimensions<T,S>(x,y)) {
+        throw std::runtime_error("add: x and y have incompatible dimensions.");
+    }
+
+    //Resize r if necessary
+    size_t sx = x.get_number_of_elements();
+    size_t sy = y.get_number_of_elements();
+    size_t sr = r.get_number_of_elements();
+    if (sx>=sy) {
+        // x is bigger than y or they have the same size
+        if (sx!=sr) {
+          r.create(x.get_dimensions());
+        }
+    }
+    else {
+        // y is bigger than x
+        if (sy!=sr) {
+          r.create(y.get_dimensions());
+        }
+    }
+
+    add_impl(x.get_number_of_elements(), y.get_number_of_elements(), x.begin(), y.begin(), r.begin());
+  }
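+
+  // Example (illustrative; the size-based hoNDArray constructors used here are assumptions):
+  //   hoNDArray<float> x(128, 128, 8);   // eight 128x128 images stored contiguously
+  //   hoNDArray<float> y(128, 128);      // a single 128x128 image
+  //   hoNDArray<float> r;
+  //   add(x, y, r);                      // broadcasting: every 128x128 chunk of x has y added to it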
+
+  // Instantiations
+  template EXPORTCPUCOREMATH void add(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r);
+  template EXPORTCPUCOREMATH void add(const hoNDArray<float>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+  template EXPORTCPUCOREMATH void add(const hoNDArray<double>& x, const hoNDArray<float>& y, hoNDArray<double>& r);
+  template EXPORTCPUCOREMATH void add(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+
+  template EXPORTCPUCOREMATH void add(const hoNDArray< complext<float> >& x, const hoNDArray< float >& y, hoNDArray< complext<float> >& r);
+  template EXPORTCPUCOREMATH void add(const hoNDArray< float >& x, const hoNDArray< complext<float> >& y, hoNDArray< complext<float> >& r);
+  template EXPORTCPUCOREMATH void add(const hoNDArray< complext<float> >& x, const hoNDArray< complext<float> >& y, hoNDArray< complext<float> >& r);
+
+  template EXPORTCPUCOREMATH void add(const hoNDArray< complext<double> >& x, const hoNDArray< double >& y, hoNDArray< complext<double> >& r);
+  template EXPORTCPUCOREMATH void add(const hoNDArray< double >& x, const hoNDArray< complext<double> >& y, hoNDArray< complext<double> >& r);
+  template EXPORTCPUCOREMATH void add(const hoNDArray< complext<double> >& x, const hoNDArray< complext<double> >& y, hoNDArray< complext<double> >& r);
+
+  template EXPORTCPUCOREMATH void add(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y, hoNDArray< std::complex<float> >& r);
+  template EXPORTCPUCOREMATH void add(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y, hoNDArray< std::complex<double> >& r);
+
+    // --------------------------------------------------------------------------------
+
+
+  // internal low level function for element-wise subtraction of two arrays
+  template <class T, class S>
+  void subtract_impl(size_t sizeX, size_t sizeY, const T* x, const S* y, typename mathReturnType<T,S>::type * r)
+  {
+
+    // cast to internal types
+    const typename mathInternalType<T>::type * a = reinterpret_cast<const typename mathInternalType<T>::type *>(x);
+    const typename mathInternalType<S>::type * b = reinterpret_cast<const typename mathInternalType<S>::type *>(y);
+    typename mathInternalType<typename mathReturnType<T,S>::type >::type * c = reinterpret_cast<typename mathInternalType<typename mathReturnType<T,S>::type >::type *>(r);
+
+    if (sizeY>sizeX) {
+        throw std::runtime_error("Subtract cannot broadcast when the size of x is less than the size of y.");
+    }
+
+    if (sizeX==sizeY) {
+        // No Broadcasting
+        long long loopsize = sizeX;
+        long long n;
+#ifdef USE_OMP
+#pragma omp parallel for default(none) private(n) shared(loopsize, c, a, b) if (loopsize>NumElementsUseThreading)
+#endif
+        for (n=0; n< loopsize; n++ )
+          {
+            c[n] = a[n]-b[n];
+          }
+    } else {
+        // Broadcasting
+        long long outerloopsize = sizeX/sizeY;
+        long long innerloopsize = sizeX/outerloopsize;
+        if (sizeX<NumElementsUseThreading) {
+            // No OMP at All
+            for (long long outer=0; outer<outerloopsize; outer++) {
+                size_t offset = outer * innerloopsize;
+                const typename mathInternalType<T>::type * ai= &a[offset];
+                typename mathInternalType<typename mathReturnType<T,S>::type >::type * ci = &c[offset];
+                for (long long n=0; n< innerloopsize; n++ )
+                  {
+                    ci[n] = ai[n]-b[n];
+                  }
+            }
+        } else if (innerloopsize>NumElementsUseThreading) {
+            // OMP in the inner loop
+            for (long long outer=0; outer<outerloopsize; outer++) {
+                size_t offset = outer * innerloopsize;
+                const typename mathInternalType<T>::type * ai= &a[offset];
+                typename mathInternalType<typename mathReturnType<T,S>::type >::type * ci = &c[offset];
+                long long n;
+#ifdef USE_OMP
+#pragma omp parallel for default(none) private(n) shared(innerloopsize, ci, ai, b)
+#endif
+                for (n=0; n< innerloopsize; n++ )
+                  {
+                    ci[n] = ai[n]-b[n];
+                  }
+            }
+        } else {
+            // OMP in the outer loop
+            long long outer;
+#ifdef USE_OMP
+#pragma omp parallel for default(none) private(outer) shared(outerloopsize, c, a, b, innerloopsize)
+#endif
+            for (outer=0; outer<outerloopsize; outer++) {
+                size_t offset = outer * innerloopsize;
+                const typename mathInternalType<T>::type * ai = &a[offset];
+                typename mathInternalType<typename mathReturnType<T,S>::type >::type * ci = &c[offset];
+                for (long long n=0; n< innerloopsize; n++ )
+                  {
+                    ci[n] = ai[n]-b[n];
+                  }
+            }
+        }
+    }
+
+  }
+
+  template <class T, class S>
+  void subtract(const hoNDArray<T>& x, const hoNDArray<S>& y, hoNDArray<typename mathReturnType<T,S>::type >& r)
+  {
+    //Check the dimensions of x and y for broadcasting.
+    if (!compatible_dimensions<T,S>(x,y)) {
+        throw std::runtime_error("subtract: x and y have incompatible dimensions.");
+    }
+
+    //Resize r if necessary
+    size_t sx = x.get_number_of_elements();
+    size_t sy = y.get_number_of_elements();
+    size_t sr = r.get_number_of_elements();
+    if (sx>=sy) {
+        // x is bigger than y or they have the same size
+        if (sx!=sr) {
+          r.create(x.get_dimensions());
+        }
+    }
+    else {
+        // y is bigger than x
+        if (sy!=sr) {
+          r.create(y.get_dimensions());
+        }
+    }
+
+    subtract_impl(x.get_number_of_elements(), y.get_number_of_elements(), x.begin(), y.begin(), r.begin());
+  }
+
+  // Instantiations
+  template EXPORTCPUCOREMATH void subtract(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r);
+  template EXPORTCPUCOREMATH void subtract(const hoNDArray<float>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+  template EXPORTCPUCOREMATH void subtract(const hoNDArray<double>& x, const hoNDArray<float>& y, hoNDArray<double>& r);
+  template EXPORTCPUCOREMATH void subtract(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+
+  template EXPORTCPUCOREMATH void subtract(const hoNDArray< complext<float> >& x, const hoNDArray< float >& y, hoNDArray< complext<float> >& r);
+  template EXPORTCPUCOREMATH void subtract(const hoNDArray< float >& x, const hoNDArray< complext<float> >& y, hoNDArray< complext<float> >& r);
+  template EXPORTCPUCOREMATH void subtract(const hoNDArray< complext<float> >& x, const hoNDArray< complext<float> >& y, hoNDArray< complext<float> >& r);
+
+  template EXPORTCPUCOREMATH void subtract(const hoNDArray< complext<double> >& x, const hoNDArray< double >& y, hoNDArray< complext<double> >& r);
+  template EXPORTCPUCOREMATH void subtract(const hoNDArray< double >& x, const hoNDArray< complext<double> >& y, hoNDArray< complext<double> >& r);
+  template EXPORTCPUCOREMATH void subtract(const hoNDArray< complext<double> >& x, const hoNDArray< complext<double> >& y, hoNDArray< complext<double> >& r);
+
+  template EXPORTCPUCOREMATH void subtract(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y, hoNDArray< std::complex<float> >& r);
+  template EXPORTCPUCOREMATH void subtract(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y, hoNDArray< std::complex<double> >& r);
+
+
+    // --------------------------------------------------------------------------------
+
+    // internal low level function for element-wise multiplication of two arrays
+    template <class T, class S>
+    void multiply_impl(size_t sizeX, size_t sizeY, const T* x, const S* y, typename mathReturnType<T,S>::type * r)
+    {
+
+      // cast to internal types
+      const typename mathInternalType<T>::type * a = reinterpret_cast<const typename mathInternalType<T>::type *>(x);
+      const typename mathInternalType<S>::type * b = reinterpret_cast<const typename mathInternalType<S>::type *>(y);
+      typename mathInternalType<typename mathReturnType<T,S>::type >::type * c = reinterpret_cast<typename mathInternalType<typename mathReturnType<T,S>::type >::type *>(r);
+
+      if (sizeY>sizeX) {
+          throw std::runtime_error("Multiply cannot broadcast when the size of x is less than the size of y.");
+      }
+
+      if (sizeX==sizeY) {
+          // No Broadcasting
+          long long loopsize = sizeX;
+          long long n;
+#ifdef USE_OMP
+#pragma omp parallel for default(none) private(n) shared(loopsize, c, a, b) if (loopsize>NumElementsUseThreading)
+#endif
+          for (n=0; n< loopsize; n++ )
+            {
+              c[n] = a[n]*b[n];
+            }
+      } else {
+          // Broadcasting
+          long long outerloopsize = sizeX/sizeY;
+          long long innerloopsize = sizeX/outerloopsize;
+          if (sizeX<NumElementsUseThreading) {
+              // No OMP at All
+              for (long long outer=0; outer<outerloopsize; outer++) {
+                  size_t offset = outer * innerloopsize;
+                  const typename mathInternalType<T>::type * ai= &a[offset];
+                  typename mathInternalType<typename mathReturnType<T,S>::type >::type * ci = &c[offset];
+                  for (long long n=0; n< innerloopsize; n++ )
+                    {
+                      ci[n] = ai[n]*b[n];
+                    }
+              }
+          } else if (innerloopsize>NumElementsUseThreading) {
+              // OMP in the inner loop
+              for (long long outer=0; outer<outerloopsize; outer++) {
+                  size_t offset = outer * innerloopsize;
+                  const typename mathInternalType<T>::type * ai= &a[offset];
+                  typename mathInternalType<typename mathReturnType<T,S>::type >::type * ci = &c[offset];
+
+                  long long n;
+#ifdef USE_OMP
+#pragma omp parallel for default(none) private(n) shared(innerloopsize, ci, ai, b)
+#endif
+                  for (n=0; n< innerloopsize; n++ )
+                    {
+                      ci[n] = ai[n]*b[n];
+                    }
+              }
+          } else {
+              // OMP in the outer loop
+              long long outer;
+#ifdef USE_OMP
+#pragma omp parallel for default(none) private(outer) shared(outerloopsize, c, a, b, innerloopsize)
+#endif
+              for (outer=0; outer<outerloopsize; outer++) {
+                  size_t offset = outer * innerloopsize;
+                  const typename mathInternalType<T>::type * ai = &a[offset];
+                  typename mathInternalType<typename mathReturnType<T,S>::type >::type * ci = &c[offset];
+                  for (long long n=0; n< innerloopsize; n++ )
+                    {
+                      ci[n] = ai[n]*b[n];
+                    }
+              }
+          }
+      }
+
+    }
+
+    template <class T, class S>
+    void multiply(const hoNDArray<T>& x, const hoNDArray<S>& y, hoNDArray<typename mathReturnType<T,S>::type >& r)
+    {
+      //Check the dimensions of x and y for broadcasting.
+      if (!compatible_dimensions<T,S>(x,y)) {
+          throw std::runtime_error("multiply: x and y have incompatible dimensions.");
+      }
+
+      //Resize r if necessary
+      size_t sx = x.get_number_of_elements();
+      size_t sy = y.get_number_of_elements();
+      size_t sr = r.get_number_of_elements();
+      if (sx>=sy) {
+          // x is bigger than y or they have the same size
+          if (sx!=sr) {
+            r.create(x.get_dimensions());
+          }
+      }
+      else {
+          // y is bigger than x
+          if (sy!=sr) {
+            r.create(y.get_dimensions());
+          }
+      }
+
+      multiply_impl(x.get_number_of_elements(), y.get_number_of_elements(), x.begin(), y.begin(), r.begin());
+    }
+
+    // Instantiations
+    template EXPORTCPUCOREMATH void multiply(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH void multiply(const hoNDArray<float>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH void multiply(const hoNDArray<double>& x, const hoNDArray<float>& y, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH void multiply(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+
+    template EXPORTCPUCOREMATH void multiply(const hoNDArray< complext<float> >& x, const hoNDArray< float >& y, hoNDArray< complext<float> >& r);
+    template EXPORTCPUCOREMATH void multiply(const hoNDArray< float >& x, const hoNDArray< complext<float> >& y, hoNDArray< complext<float> >& r);
+    template EXPORTCPUCOREMATH void multiply(const hoNDArray< complext<float> >& x, const hoNDArray< complext<float> >& y, hoNDArray< complext<float> >& r);
+
+    template EXPORTCPUCOREMATH void multiply(const hoNDArray< complext<double> >& x, const hoNDArray< double >& y, hoNDArray< complext<double> >& r);
+    template EXPORTCPUCOREMATH void multiply(const hoNDArray< double >& x, const hoNDArray< complext<double> >& y, hoNDArray< complext<double> >& r);
+    template EXPORTCPUCOREMATH void multiply(const hoNDArray< complext<double> >& x, const hoNDArray< complext<double> >& y, hoNDArray< complext<double> >& r);
+
+    template EXPORTCPUCOREMATH void multiply(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y, hoNDArray< std::complex<float> >& r);
+    template EXPORTCPUCOREMATH void multiply(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y, hoNDArray< std::complex<double> >& r);
+
+    // --------------------------------------------------------------------------------
+
+    // internal low level function for element-wise division of two arrays
+    template <class T, class S>
+    void divide_impl(size_t sizeX, size_t sizeY, const T* x, const S* y, typename mathReturnType<T,S>::type * r)
+    {
+
+      // cast to internal types
+      const typename mathInternalType<T>::type * a = reinterpret_cast<const typename mathInternalType<T>::type *>(x);
+      const typename mathInternalType<S>::type * b = reinterpret_cast<const typename mathInternalType<S>::type *>(y);
+      typename mathInternalType<typename mathReturnType<T,S>::type >::type * c = reinterpret_cast<typename mathInternalType<typename mathReturnType<T,S>::type >::type *>(r);
+
+      if (sizeY>sizeX) {
+          throw std::runtime_error("Divide cannot broadcast when the size of x is less than the size of y.");
+      }
+
+      if (sizeX==sizeY) {
+          // No Broadcasting
+          long long loopsize = sizeX;
+          long long n;
+#ifdef USE_OMP
+#pragma omp parallel for default(none) private(n) shared(loopsize, c, a, b) if (loopsize>NumElementsUseThreading)
+#endif
+          for (n=0; n< loopsize; n++ )
+            {
+              c[n] = a[n]/b[n];
+            }
+      } else {
+          // Broadcasting
+          long long outerloopsize = sizeX/sizeY;
+          long long innerloopsize = sizeX/outerloopsize;
+          if (sizeX<NumElementsUseThreading) {
+              // No OMP at All
+              for (long long outer=0; outer<outerloopsize; outer++) {
+                  size_t offset = outer * innerloopsize;
+                  const typename mathInternalType<T>::type * ai= &a[offset];
+                  typename mathInternalType<typename mathReturnType<T,S>::type >::type * ci = &c[offset];
+                  for (long long n=0; n< innerloopsize; n++ )
+                    {
+                      ci[n] = ai[n]/b[n];
+                    }
+              }
+          } else if (innerloopsize>NumElementsUseThreading) {
+              // OMP in the inner loop
+              for (long long outer=0; outer<outerloopsize; outer++) {
+                  size_t offset = outer * innerloopsize;
+                  const typename mathInternalType<T>::type * ai= &a[offset];
+                  typename mathInternalType<typename mathReturnType<T,S>::type >::type * ci = &c[offset];
+                  long long n;
+#ifdef USE_OMP
+#pragma omp parallel for default(none) private(n) shared(innerloopsize, ci, ai, b)
+#endif
+                  for (n=0; n< innerloopsize; n++ )
+                    {
+                      ci[n] = ai[n]/b[n];
+                    }
+              }
+          } else {
+              // OMP in the outer loop
+              long long outer;
+#ifdef USE_OMP
+#pragma omp parallel for default(none) private(outer) shared(outerloopsize, c, a, b, innerloopsize)
+#endif
+              for (outer=0; outer<outerloopsize; outer++) {
+                  size_t offset = outer * innerloopsize;
+                  const typename mathInternalType<T>::type * ai = &a[offset];
+                  typename mathInternalType<typename mathReturnType<T,S>::type >::type * ci = &c[offset];
+                  for (long long n=0; n< innerloopsize; n++ )
+                    {
+                      ci[n] = ai[n]/b[n];
+                    }
+              }
+          }
+      }
+
+    }
+
+    template <class T, class S>
+    void divide(const hoNDArray<T>& x, const hoNDArray<S>& y, hoNDArray<typename mathReturnType<T,S>::type >& r)
+    {
+      //Check the dimensions of x and y for broadcasting.
+      if (!compatible_dimensions<T,S>(x,y)) {
+          throw std::runtime_error("divide: x and y have incompatible dimensions.");
+      }
+
+      //Resize r if necessary
+      size_t sx = x.get_number_of_elements();
+      size_t sy = y.get_number_of_elements();
+      size_t sr = r.get_number_of_elements();
+      if (sx>=sy) {
+          // x is bigger than y or they have the same size
+          if (sx!=sr) {
+            r.create(x.get_dimensions());
+          }
+      }
+      else {
+          // y is bigger than x
+          if (sy!=sr) {
+            r.create(y.get_dimensions());
+          }
+      }
+
+      divide_impl(x.get_number_of_elements(), y.get_number_of_elements(), x.begin(), y.begin(), r.begin());
+    }
+
+    // Instantiations
+    template EXPORTCPUCOREMATH void divide(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH void divide(const hoNDArray<float>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH void divide(const hoNDArray<double>& x, const hoNDArray<float>& y, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH void divide(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+
+    template EXPORTCPUCOREMATH void divide(const hoNDArray< complext<float> >& x, const hoNDArray< float >& y, hoNDArray< complext<float> >& r);
+    template EXPORTCPUCOREMATH void divide(const hoNDArray< float >& x, const hoNDArray< complext<float> >& y, hoNDArray< complext<float> >& r);
+    template EXPORTCPUCOREMATH void divide(const hoNDArray< complext<float> >& x, const hoNDArray< complext<float> >& y, hoNDArray< complext<float> >& r);
+
+    template EXPORTCPUCOREMATH void divide(const hoNDArray< complext<double> >& x, const hoNDArray< double >& y, hoNDArray< complext<double> >& r);
+    template EXPORTCPUCOREMATH void divide(const hoNDArray< double >& x, const hoNDArray< complext<double> >& y, hoNDArray< complext<double> >& r);
+    template EXPORTCPUCOREMATH void divide(const hoNDArray< complext<double> >& x, const hoNDArray< complext<double> >& y, hoNDArray< complext<double> >& r);
+
+    template EXPORTCPUCOREMATH void divide(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y, hoNDArray< std::complex<float> >& r);
+    template EXPORTCPUCOREMATH void divide(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y, hoNDArray< std::complex<double> >& r);
+
+    // --------------------------------------------------------------------------------
+
+
+    // internal low level function for element-wise multiplication of two arrays
+    template <class T, class S>
+    void multiplyConj_impl(size_t sizeX, size_t sizeY, const T* x, const S* y, typename mathReturnType<T,S>::type * r)
+    {
+
+      // cast to internal types
+      const typename mathInternalType<T>::type * a = reinterpret_cast<const typename mathInternalType<T>::type *>(x);
+      const typename mathInternalType<S>::type * b = reinterpret_cast<const typename mathInternalType<S>::type *>(y);
+      typename mathInternalType<typename mathReturnType<T,S>::type >::type * c = reinterpret_cast<typename mathInternalType<typename mathReturnType<T,S>::type >::type *>(r);
+
+      if (sizeY>sizeX) {
+          throw std::runtime_error("MultiplyConj cannot broadcast when the size of x is less than the size of y.");
+      }
+
+      if (sizeX==sizeY) {
+          // No Broadcasting
+          long long loopsize = sizeX;
+          long long n;
+#ifdef USE_OMP
+#pragma omp parallel for default(none) private(n) shared(loopsize, c, a, b) if (loopsize>NumElementsUseThreading)
+#endif
+          for (n=0; n< loopsize; n++ )
+            {
+              c[n] = a[n]*conj(b[n]);
+            }
+      } else {
+          // Broadcasting
+          long long outerloopsize = sizeX/sizeY;
+          long long innerloopsize = sizeX/outerloopsize;
+          if (sizeX<NumElementsUseThreading) {
+              // No OMP at All
+              for (long long outer=0; outer<outerloopsize; outer++) {
+                  size_t offset = outer * innerloopsize;
+                  const typename mathInternalType<T>::type * ai= &a[offset];
+                  typename mathInternalType<typename mathReturnType<T,S>::type >::type * ci = &c[offset];
+                  for (long long n=0; n< innerloopsize; n++ )
+                    {
+                      ci[n] = ai[n]*conj(b[n]);
+                    }
+              }
+          } else if (innerloopsize>NumElementsUseThreading) {
+              // OMP in the inner loop
+              for (long long outer=0; outer<outerloopsize; outer++) {
+                  size_t offset = outer * innerloopsize;
+                  const typename mathInternalType<T>::type * ai= &a[offset];
+                  typename mathInternalType<typename mathReturnType<T,S>::type >::type * ci = &c[offset];
+                  long long n;
+#ifdef USE_OMP
+#pragma omp parallel for default(none) private(n) shared(innerloopsize, ci, ai, b)
+#endif
+                  for (n=0; n< innerloopsize; n++ )
+                    {
+                      ci[n] = ai[n]*conj(b[n]);
+                    }
+              }
+          } else {
+              // OMP in the outer loop
+              long long outer;
+#ifdef USE_OMP
+#pragma omp parallel for default(none) private(outer) shared(outerloopsize, c, a, b, innerloopsize)
+#endif
+              for (outer=0; outer<outerloopsize; outer++) {
+                  size_t offset = outer * innerloopsize;
+                  const typename mathInternalType<T>::type * ai = &a[offset];
+                  typename mathInternalType<typename mathReturnType<T,S>::type >::type * ci = &c[offset];
+                  for (long long n=0; n< innerloopsize; n++ )
+                    {
+                      ci[n] = ai[n]*conj(b[n]);
+                    }
+              }
+          }
+      }
+
+    }
+
+    template <class T, class S>
+    void multiplyConj(const hoNDArray<T>& x, const hoNDArray<S>& y, hoNDArray<typename mathReturnType<T,S>::type >& r)
+    {
+      //Check the dimensions of x and y for broadcasting.
+      if (!compatible_dimensions<T,S>(x,y)) {
+          throw std::runtime_error("multiplyConj: x and y have incompatible dimensions.");
+      }
+
+      //Resize r if necessary
+      size_t sx = x.get_number_of_elements();
+      size_t sy = y.get_number_of_elements();
+      size_t sr = r.get_number_of_elements();
+      if (sx>=sy) {
+          // x is bigger than y or they have the same size
+          if (sx!=sr) {
+            r.create(x.get_dimensions());
+          }
+      }
+      else {
+          // y is bigger than x
+          if (sy!=sr) {
+            r.create(y.get_dimensions());
+          }
+      }
+
+      multiplyConj_impl(x.get_number_of_elements(), y.get_number_of_elements(), x.begin(), y.begin(), r.begin());
+    }
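+
+    // Note: multiplyConj follows the same broadcasting rules as multiply but
+    // conjugates the second argument, i.e. r[n] = x[n] * conj(y[n]).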
+
+    // Instantiations
+    template EXPORTCPUCOREMATH void multiplyConj(const hoNDArray< float >& x, const hoNDArray< complext<float> >& y, hoNDArray< complext<float> >& r);
+    template EXPORTCPUCOREMATH void multiplyConj(const hoNDArray< complext<float> >& x, const hoNDArray< complext<float> >& y, hoNDArray< complext<float> >& r);
+    template EXPORTCPUCOREMATH void multiplyConj(const hoNDArray< double >& x, const hoNDArray< complext<double> >& y, hoNDArray< complext<double> >& r);
+    template EXPORTCPUCOREMATH void multiplyConj(const hoNDArray< complext<double> >& x, const hoNDArray< complext<double> >& y, hoNDArray< complext<double> >& r);
+
+    template EXPORTCPUCOREMATH void multiplyConj(const hoNDArray< float >& x, const hoNDArray< std::complex<float> >& y, hoNDArray< std::complex<float> >& r);
+    template EXPORTCPUCOREMATH void multiplyConj(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y, hoNDArray< std::complex<float> >& r);
+    template EXPORTCPUCOREMATH void multiplyConj(const hoNDArray< double >& x, const hoNDArray< std::complex<double> >& y, hoNDArray< std::complex<double> >& r);
+    template EXPORTCPUCOREMATH void multiplyConj(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y, hoNDArray< std::complex<double> >& r);
+
+
+    // --------------------------------------------------------------------------------
+
+    inline void conjugate(size_t N, const  std::complex<float> * x,  std::complex<float> * r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, r) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            reinterpret_cast<float(&)[2]>(r[n])[0] = reinterpret_cast< const float(&)[2]>(x[n])[0];
+            reinterpret_cast<float(&)[2]>(r[n])[1] = -(reinterpret_cast< const float(&)[2]>(x[n])[1]);
+        }
+    }
+
+    inline void conjugate(size_t N, const  std::complex<double> * x,  std::complex<double> * r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, r) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            reinterpret_cast<double(&)[2]>(r[n])[0] = reinterpret_cast< const double(&)[2]>(x[n])[0];
+            reinterpret_cast<double(&)[2]>(r[n])[1] = -(reinterpret_cast<const double(&)[2]>(x[n])[1]);
+        }
+    }
+
+    template <typename T> 
+    void conjugate(const hoNDArray<T>& x, hoNDArray<T>& r)
+    {
+        if ( r.get_number_of_elements()!=x.get_number_of_elements())
+        {
+            r.create(x.get_dimensions());
+        }
+
+        conjugate(x.get_number_of_elements(), x.begin(), r.begin());
+    }
+
+    template EXPORTCPUCOREMATH void conjugate(const hoNDArray< std::complex<float> >& x, hoNDArray< std::complex<float> >& r);
+    template EXPORTCPUCOREMATH void conjugate(const hoNDArray< std::complex<double> >& x, hoNDArray< std::complex<double> >& r);
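+
+    // Note: the conjugate helpers above copy the real part and negate the
+    // imaginary part through a reinterpret_cast to float[2]/double[2], which is
+    // equivalent to r[n] = std::conj(x[n]).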
+
+    // --------------------------------------------------------------------------------
+
+    template <typename T> 
+    inline void addEpsilon(size_t N, T* x)
+    {
+        typename realType<T>::Type eps = std::numeric_limits<typename realType<T>::Type>::epsilon();
+
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, eps) if (N>NumElementsUseThreading)
+        for (n=0; n<(long long)N; n++ )
+        {
+            if ( std::abs(x[n]) < eps )
+            {
+                x[n] += eps;
+            }
+        }
+    }
+
+    inline void addEpsilon(size_t N,  std::complex<float> * x)
+    {
+        const float eps = std::numeric_limits<float>::epsilon();
+
+        long long n;
+
+        #pragma omp parallel for private(n) if (N>NumElementsUseThreading)
+        for (n=0; n<(long long)N; n++ )
+        {
+            if ( std::abs(x[n]) < eps )
+            {
+                reinterpret_cast<float(&)[2]>(x[n])[0] += eps;
+            }
+        }
+    }
+
+    inline void addEpsilon(size_t N,  std::complex<double> * x)
+    {
+        const double eps = std::numeric_limits<double>::epsilon();
+
+        long long n;
+
+        #pragma omp parallel for private(n) if (N>NumElementsUseThreading)
+        for (n=0; n<(long long)N; n++ )
+        {
+            if ( std::abs(x[n]) < eps )
+            {
+                reinterpret_cast<double(&)[2]>(x[n])[0] += eps;
+            }
+        }
+    }
+
+    template <typename T> 
+    void addEpsilon(hoNDArray<T>& x)
+    {
+        addEpsilon(x.get_number_of_elements(), x.begin());
+    }
+
+    template EXPORTCPUCOREMATH void addEpsilon(hoNDArray<float>& x);
+    template EXPORTCPUCOREMATH void addEpsilon(hoNDArray<double>& x);
+    template EXPORTCPUCOREMATH void addEpsilon(hoNDArray< std::complex<float> >& x);
+    template EXPORTCPUCOREMATH void addEpsilon(hoNDArray< std::complex<double> >& x);
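+
+    // Note: addEpsilon shifts every element whose magnitude is below machine
+    // epsilon by epsilon (only the real part for complex types); this can be
+    // used, for example, to guard a subsequent divide() or inv() call against
+    // division by zero.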
+
+    // --------------------------------------------------------------------------------
+
+    template <typename T> 
+    void argument(const hoNDArray<T>& x, hoNDArray<typename realType<T>::Type>& r)
+    {
+        if ( r.get_number_of_elements()!=x.get_number_of_elements())
+        {
+            r.create(x.get_dimensions());
+        }
+
+        size_t N = x.get_number_of_elements();
+        const T* pX = x.begin();
+        typename realType<T>::Type* pR = r.begin();
+
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, pX, pR) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            pR[n] = std::arg( pX[n] );
+        }
+    }
+
+    template EXPORTCPUCOREMATH void argument(const hoNDArray< std::complex<float> >& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH void argument(const hoNDArray< std::complex<double> >& x, hoNDArray<double>& r);
+
+    // --------------------------------------------------------------------------------
+
+    template <typename T> 
+    void inv(const hoNDArray<T>& x, hoNDArray<T>& r)
+    {
+        if ( !r.dimensions_equal(&x) )
+        {
+            r = x;
+        }
+
+        size_t N = x.get_number_of_elements();
+        const T* pX = x.begin();
+        T* pR = r.begin();
+
+        T v(1.0);
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, pX, pR, v) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            pR[n] = v/pX[n];
+        }
+    }
+
+    template EXPORTCPUCOREMATH void inv(const hoNDArray<float>& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH void inv(const hoNDArray<double>& x, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH void inv(const hoNDArray< std::complex<float> >& x, hoNDArray< std::complex<float> >& r);
+    template EXPORTCPUCOREMATH void inv(const hoNDArray< std::complex<double> >& x, hoNDArray< std::complex<double> >& r);
+
+    // --------------------------------------------------------------------------------
+
+    template <typename T> 
+    void abs(size_t N, const T* x, typename realType<T>::Type* r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, r) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            r[n]= std::abs(x[n]);
+        }
+    }
+
+    inline void abs(size_t N, const  std::complex<float> * x, float* r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, r) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            const  std::complex<float> & c = x[n];
+            const float re = c.real();
+            const float im = c.imag();
+            r[n]= std::sqrt( (re*re) + (im * im) );
+        }
+    }
+
+    inline void abs(size_t N, const  std::complex<double> * x, double* r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, r) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            const  std::complex<double> & c = x[n];
+            const double re = c.real();
+            const double im = c.imag();
+            r[n]= std::sqrt( (re*re) + (im * im) );
+        }
+    }
+
+    void abs(size_t N, const complext<float> * x, float* r)
+    {
+        long long n;
+
+        #pragma omp parallel for private(n) shared(N, x, r) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            const  complext<float> & c = x[n];
+            const float re = c.real();
+            const float im = c.imag();
+            r[n]= std::sqrt( (re*re) + (im * im) );
+        }
+    }
+
+    void abs(size_t N, const complext<double> * x, double* r)
+    {
+        long long n;
+
+        #pragma omp parallel for private(n) shared(N, x, r) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            const  complext<double> & c = x[n];
+            const double re = c.real();
+            const double im = c.imag();
+            r[n]= std::sqrt( (re*re) + (im * im) );
+        }
+    }
+
+    template <typename T> 
+    void abs(const hoNDArray<T>& x, hoNDArray<typename realType<T>::Type>& r)
+    {
+        if ( r.get_number_of_elements()!=x.get_number_of_elements())
+        {
+            r.create(x.get_dimensions());
+        }
+
+        abs(x.get_number_of_elements(), x.begin(), r.begin());
+    }
+
+    template EXPORTCPUCOREMATH void abs(const hoNDArray<float>& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH void abs(const hoNDArray<double>& x, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH void abs(const hoNDArray< std::complex<float> >& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH void abs(const hoNDArray< std::complex<double> >& x, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH void abs(const hoNDArray< complext<float> >& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH void abs(const hoNDArray< complext<double> >& x, hoNDArray<double>& r);
+
+    inline void abs(size_t N, const std::complex<float>* x, std::complex<float>* r)
+    {
+        try
+        {
+            long long n;
+
+            #pragma omp parallel for default(none) private(n) shared(N, x, r) if (N>NumElementsUseThreading)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                const std::complex<float>& c = x[n];
+                const float re = c.real();
+                const float im = c.imag();
+
+                reinterpret_cast<float(&)[2]>(r[n])[0] = std::sqrt( (re*re) + (im * im) );
+                reinterpret_cast<float(&)[2]>(r[n])[1] = 0;
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Error happened in abs(size_t N, const std::complex<float>* x, std::complex<float>* r) ... ");
+        }
+    }
+
+    inline void abs(size_t N, const std::complex<double>* x, std::complex<double>* r)
+    {
+        try
+        {
+            long long n;
+
+            #pragma omp parallel for default(none) private(n) shared(N, x, r) if (N>NumElementsUseThreading)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                const std::complex<double>& c = x[n];
+                const double re = c.real();
+                const double im = c.imag();
+
+                reinterpret_cast<double(&)[2]>(r[n])[0] = std::sqrt( (re*re) + (im * im) );
+                reinterpret_cast<double(&)[2]>(r[n])[1] = 0;
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Error happened in abs(size_t N, const std::complex<double>* x, std::complex<double>* r) ... ");
+        }
+    }
+
+    template <typename T> 
+    void abs(const hoNDArray< std::complex<T> >& x, hoNDArray< std::complex<T> >& r)
+    {
+        if ( r.get_number_of_elements()!=x.get_number_of_elements())
+        {
+            r.create(x.get_dimensions());
+        }
+
+        abs(x.get_number_of_elements(), x.begin(), r.begin());
+    }
+
+    template EXPORTCPUCOREMATH void abs(const hoNDArray< std::complex<float> >& x, hoNDArray< std::complex<float> >& r);
+    template EXPORTCPUCOREMATH void abs(const hoNDArray< std::complex<double> >& x, hoNDArray< std::complex<double> >& r);
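+
+    // Note: the complex-to-complex abs overloads above store the magnitude in
+    // the real part and set the imaginary part to zero, i.e. r[n] = (|x[n]|, 0).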
+
+    template<class T> boost::shared_ptr< hoNDArray<typename realType<T>::Type> > abs( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::abs(): Invalid input array");
+
+        boost::shared_ptr< hoNDArray<typename realType<T>::Type> > result(new hoNDArray<typename realType<T>::Type>());
+        result->create(x->get_dimensions());
+        abs(*x, *result);
+        return result;
+    }
+
+    template<class T> void abs_inplace( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::abs_inplace(): Invalid input array");
+
+        abs(*x, *x);
+    }
+
+    template<class T> boost::shared_ptr< hoNDArray<typename realType<T>::Type> > abs_square( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::abs_square(): Invalid input array");
+
+        boost::shared_ptr< hoNDArray<typename realType<T>::Type> > result(new hoNDArray<typename realType<T>::Type>());
+        result->create(x->get_dimensions());
+        abs(*x, *result);
+        multiply(*result, *result, *result);
+        return result;
+    }
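+
+    // Note: abs_square composes abs() and multiply() to return a real-valued
+    // array holding |x[n]|^2 element-wise.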
+
+    // --------------------------------------------------------------------------------
+
+    template <typename T> 
+    void sqrt(const hoNDArray<T>& x, hoNDArray<T>& r)
+    {
+        if ( r.get_number_of_elements()!=x.get_number_of_elements())
+        {
+            r.create(x.get_dimensions());
+        }
+
+        size_t N = x.get_number_of_elements();
+        const T* pX = x.begin();
+        T* pR = r.begin();
+
+        long long n;
+        #pragma omp parallel for default(none) private(n) shared(N, pX, pR) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            pR[n] = std::sqrt(pX[n]);
+        }
+    }
+
+    template <typename T> 
+    void sqrt(const hoNDArray< complext<T> >& x, hoNDArray< complext<T> >& r)
+    {
+        if ( r.get_number_of_elements()!=x.get_number_of_elements())
+        {
+            r.create(x.get_dimensions());
+        }
+
+        size_t N = x.get_number_of_elements();
+        const complext<T>* pX = x.begin();
+        complext<T>* pR = r.begin();
+
+        long long n;
+        #pragma omp parallel for default(none) private(n) shared(N, pX, pR) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            pR[n] = Gadgetron::sqrt(pX[n]);
+        }
+    }
+
+    template EXPORTCPUCOREMATH void sqrt(const hoNDArray<float>& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH void sqrt(const hoNDArray<double>& x, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH void sqrt(const hoNDArray< std::complex<float> >& x, hoNDArray< std::complex<float> >& r);
+    template EXPORTCPUCOREMATH void sqrt(const hoNDArray< std::complex<double> >& x, hoNDArray< std::complex<double> >& r);
+    template EXPORTCPUCOREMATH void sqrt(const hoNDArray< complext<float> >& x, hoNDArray< complext<float> >& r);
+    template EXPORTCPUCOREMATH void sqrt(const hoNDArray< complext<double> >& x, hoNDArray< complext<double> >& r);
+
+    template<class T> boost::shared_ptr< hoNDArray<T> > sqrt( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::sqrt(): Invalid input array");
+
+        boost::shared_ptr< hoNDArray<T> > result(new hoNDArray<T>());
+        result->create(x->get_dimensions());
+        sqrt(*x, *result);
+        return result;
+    }
+
+    template<class T> void sqrt_inplace( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::sqrt_inplace(): Invalid input array");
+
+        sqrt(*x, *x);
+    }
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> boost::shared_ptr< hoNDArray<T> > square( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::square(): Invalid input array");
+
+        boost::shared_ptr< hoNDArray<T> > result(new hoNDArray<T>());
+        result->create(x->get_dimensions());
+        /*arma::Col<typename stdType<T>::Type> aRes = as_arma_col(result.get());
+        aRes = arma::square(as_arma_col(x));*/
+        multiply(*x, *x, *result);
+        return result;
+    }
+
+    template<class T> void square_inplace( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::square_inplace(): Invalid input array");
+
+        /*arma::Col<typename stdType<T>::Type> aRes = as_arma_col(x);
+        aRes = arma::square(aRes);*/
+
+        multiply(*x, *x, *x);
+    }
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> boost::shared_ptr< hoNDArray<T> > reciprocal( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::reciprocal(): Invalid input array");
+
+        /*arma::Col<typename stdType<T>::Type> ones(x->get_number_of_elements());
+        ones.ones();*/
+        boost::shared_ptr< hoNDArray<T> > result(new hoNDArray<T>());
+        result->create(x->get_dimensions());
+        /*arma::Col<typename stdType<T>::Type> aRes = as_arma_col(result.get());
+        aRes = ones/as_arma_col(x);*/
+        inv(*x, *result);
+        return result;
+    }
+
+    template<class T> void reciprocal_inplace( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::reciprocal_inplace(): Invalid input array");
+
+        /*arma::Col<typename stdType<T>::Type> aRes = as_arma_col(x);
+        arma::Col<typename stdType<T>::Type> ones(x->get_number_of_elements());
+        ones.ones();
+        aRes = ones/aRes;*/
+
+        inv(*x, *x);
+    }
+
+    template<class T> boost::shared_ptr< hoNDArray<T> > reciprocal_sqrt( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::reciprocal_sqrt(): Invalid input array");
+
+        /*arma::Col<typename stdType<T>::Type> ones(x->get_number_of_elements());
+        ones.ones();*/
+        boost::shared_ptr< hoNDArray<T> > result(new hoNDArray<T>());
+        result->create(x->get_dimensions());
+        /*arma::Col<typename stdType<T>::Type> aRes = as_arma_col(result.get());
+        aRes = ones/arma::sqrt(as_arma_col(x));*/
+
+        sqrt(*x, *result);
+        inv(*result, *result);
+        return result;
+    }
+
+    template<class T> void reciprocal_sqrt_inplace( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::reciprocal_sqrt_inplace(): Invalid input array");
+
+        /*arma::Col<typename stdType<T>::Type> ones(x->get_number_of_elements());
+        ones.ones();
+        arma::Col<typename stdType<T>::Type> aRes = as_arma_col(x);
+        aRes = ones/arma::sqrt(aRes);*/
+
+        sqrt(*x, *x);
+        inv(*x, *x);
+    }
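+
+    // Note: reciprocal and reciprocal_sqrt are thin wrappers over inv() and
+    // sqrt(): reciprocal(x)[n] = 1/x[n] and reciprocal_sqrt(x)[n] = 1/sqrt(x[n]),
+    // with the _inplace variants overwriting x.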
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> boost::shared_ptr< hoNDArray<T> > sgn( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::sgn(): Invalid input array");
+
+        boost::shared_ptr< hoNDArray<T> > res( new hoNDArray<T>() );
+        res->create(x->get_dimensions());
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+        for( long long i = 0; i < (long long)res->get_number_of_elements(); i++ ){
+            res->get_data_ptr()[i] = sgn(x->get_data_ptr()[i]);
+        }
+        return res;
+    }
+
+    template<class T> void sgn_inplace( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::sgn_inplace(): Invalid input array");
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+        for( long long i = 0; i < (long long)x->get_number_of_elements(); i++ )
+            x->get_data_ptr()[i] = sgn(x->get_data_ptr()[i]);
+    }
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> boost::shared_ptr< hoNDArray<typename realType<T>::Type> > real( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::real(): Invalid input array");
+
+        boost::shared_ptr< hoNDArray<typename realType<T>::Type> > result(new hoNDArray<typename realType<T>::Type>());
+        result->create(x->get_dimensions());
+        arma::Col<typename realType<T>::Type> aRes = as_arma_col(result.get());
+        aRes = arma::real(as_arma_col(x));
+        return result;
+    }
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> boost::shared_ptr< hoNDArray<typename realType<T>::Type> > imag( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::imag(): Invalid input array");
+
+        boost::shared_ptr< hoNDArray<typename realType<T>::Type> > result(new hoNDArray<typename realType<T>::Type>());
+        result->create(x->get_dimensions());
+        arma::Col<typename realType<T>::Type> aRes = as_arma_col(result.get());
+        aRes = arma::imag(as_arma_col(x));
+        return result;
+    }
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> boost::shared_ptr< hoNDArray<T> > conj( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::conj(): Invalid input array");
+
+        boost::shared_ptr< hoNDArray<T> > result(new hoNDArray<T>());
+        result->create(x->get_dimensions());
+        arma::Col<typename stdType<T>::Type> aRes = as_arma_col(result.get());
+        aRes = arma::conj(as_arma_col(x));
+        return result;
+    }
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> boost::shared_ptr< hoNDArray<T> > real_to_complex( hoNDArray<typename realType<T>::Type> *x )
+    {
+        if( x == 0x0 )
+            BOOST_THROW_EXCEPTION(runtime_error("Gadgetron::real_to_complex(): Invalid input array"));
+
+        boost::shared_ptr< hoNDArray<T> > result(new hoNDArray<T>());
+        result->create(x->get_dimensions());
+        arma::Col<typename stdType<T>::Type> aRes = as_arma_col(result.get());
+        aRes = arma::Col<typename stdType<T>::Type>(as_arma_col(x), arma::Col<typename realType<T>::Type>(x->get_number_of_elements()).zeros());
+        return result;
+    }
+
+    template<class T> boost::shared_ptr< hoNDArray<T> > real_imag_to_complex( hoNDArray<typename realType<T>::Type>* real, hoNDArray<typename realType<T>::Type>* imag )
+    {
+        if( real==0x0 || imag==0x0 )
+            BOOST_THROW_EXCEPTION(runtime_error("Gadgetron::real_imag_to_complex(): Invalid input array"));
+
+        if( real->get_number_of_elements() != imag->get_number_of_elements() )
+            BOOST_THROW_EXCEPTION(runtime_error("Gadgetron::real_imag_to_complex(): Invalid input array"));
+
+        boost::shared_ptr< hoNDArray<T> > result(new hoNDArray<T>());
+        result->create(real->get_dimensions());
+
+        T* pRes = result->begin();
+
+        size_t N = real->get_number_of_elements();
+        for ( size_t n=0; n<N; n++ )
+        {
+            pRes[n] = T(real->at(n), imag->at(n));
+        }
+
+        return result;
+    }
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> 
+    void real_imag_to_complex(const hoNDArray<typename realType<T>::Type>& real, const hoNDArray<typename realType<T>::Type>& imag, hoNDArray<T>& cplx)
+    {
+        try
+        {
+            GADGET_CHECK_THROW(real.dimensions_equal(&imag));
+
+            if ( !cplx.dimensions_equal(&real) )
+            {
+                cplx.create(real.get_dimensions());
+            }
+
+            T* pRes = cplx.begin();
+            const typename realType<T>::Type* pReal = real.begin();
+            const typename realType<T>::Type* pImag = imag.begin();
+
+            size_t N = real.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for private(n) shared(N, pRes, pReal, pImag)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                pRes[n] = T(pReal[n], pImag[n]);
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in real_imag_to_complex(...) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void real_imag_to_complex(const hoNDArray<float>& real, const hoNDArray<float>& imag, hoNDArray< std::complex<float> >& cplx);
+    template EXPORTCPUCOREMATH void real_imag_to_complex(const hoNDArray<double>& real, const hoNDArray<double>& imag, hoNDArray< std::complex<double> >& cplx);
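+
+    // Usage sketch (illustrative only; the one-dimensional hoNDArray
+    // constructor is an assumption):
+    //
+    //   hoNDArray<float> re(64), im(64);
+    //   hoNDArray< std::complex<float> > c;
+    //   real_imag_to_complex(re, im, c);   // c[n] = complex(re[n], im[n])
+    //   complex_to_real_imag(c, re, im);   // inverse operation, defined just below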
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> 
+    void complex_to_real_imag(const hoNDArray<T>& cplx, hoNDArray<typename realType<T>::Type>& real, hoNDArray<typename realType<T>::Type>& imag)
+    {
+        try
+        {
+            if ( !real.dimensions_equal(&cplx) )
+            {
+                real.create(cplx.get_dimensions());
+            }
+
+            if ( !imag.dimensions_equal(&cplx) )
+            {
+                imag.create(cplx.get_dimensions());
+            }
+
+            const T* pRes = cplx.begin();
+            typename realType<T>::Type* pReal = real.begin();
+            typename realType<T>::Type* pImag = imag.begin();
+
+            size_t N = real.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for default(none) private(n) shared(N, pRes, pReal, pImag)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                pReal[n] = pRes[n].real();
+                pImag[n] = pRes[n].imag();
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in complex_to_real_imag(...) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void complex_to_real_imag(const hoNDArray< std::complex<float> >& cplx, hoNDArray<float>& real, hoNDArray<float>& imag);
+    template EXPORTCPUCOREMATH void complex_to_real_imag(const hoNDArray< std::complex<double> >& cplx, hoNDArray<double>& real, hoNDArray<double>& imag);
+
+    void complex_to_real_imag(const hoNDArray<float>& cplx, hoNDArray<float>& real, hoNDArray<float>& imag)
+    {
+        try
+        {
+            if ( !real.dimensions_equal(&cplx) )
+            {
+                real.create(cplx.get_dimensions());
+            }
+
+            if ( !imag.dimensions_equal(&cplx) )
+            {
+                imag.create(cplx.get_dimensions());
+            }
+
+            const float* pRes = cplx.begin();
+            float* pReal = real.begin();
+            float* pImag = imag.begin();
+
+            size_t N = real.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for default(none) private(n) shared(N, pRes, pReal, pImag)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                pReal[n] = pRes[n];
+                pImag[n] = 0;
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in complex_to_real_imag(...) ... ");
+        }
+    }
+
+    void complex_to_real_imag(const hoNDArray<double>& cplx, hoNDArray<double>& real, hoNDArray<double>& imag)
+    {
+        try
+        {
+            if ( !real.dimensions_equal(&cplx) )
+            {
+                real.create(cplx.get_dimensions());
+            }
+
+            if ( !imag.dimensions_equal(&cplx) )
+            {
+                imag.create(cplx.get_dimensions());
+            }
+
+            const double* pRes = cplx.begin();
+            double* pReal = real.begin();
+            double* pImag = imag.begin();
+
+            size_t N = real.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for default(none) private(n) shared(N, pRes, pReal, pImag)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                pReal[n] = pRes[n];
+                pImag[n] = 0;
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in complex_to_real_imag(...) ... ");
+        }
+    }
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> 
+    void complex_to_real(const hoNDArray<T>& cplx, hoNDArray<typename realType<T>::Type>& real)
+    {
+        try
+        {
+            if ( !real.dimensions_equal(&cplx) )
+            {
+                real.create(cplx.get_dimensions());
+            }
+
+            const T* pRes = cplx.begin();
+            typename realType<T>::Type* pReal = real.begin();
+
+            size_t N = real.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for default(none) private(n) shared(N, pRes, pReal)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                pReal[n] = pRes[n].real();
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in complex_to_real(...) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void complex_to_real(const hoNDArray< std::complex<float> >& cplx, hoNDArray<float>& real);
+    template EXPORTCPUCOREMATH void complex_to_real(const hoNDArray< std::complex<double> >& cplx, hoNDArray<double>& real);
+
+    template<class T> 
+    void complex_to_real(const hoNDArray<T>& cplx, hoNDArray<T>& real)
+    {
+        try
+        {
+            if ( !real.dimensions_equal(&cplx) )
+            {
+                real.create(cplx.get_dimensions());
+            }
+
+            const T* pRes = cplx.begin();
+            T* pReal = real.begin();
+
+            size_t N = real.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for private(n) shared(N, pRes, pReal)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                pReal[n] = T(pRes[n].real(), 0);
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in complex_to_real(...) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void complex_to_real(const hoNDArray< std::complex<float> >& cplx, hoNDArray< std::complex<float> >& real);
+    template EXPORTCPUCOREMATH void complex_to_real(const hoNDArray< std::complex<double> >& cplx, hoNDArray< std::complex<double> >& real);
+
+    template<class T> 
+    void complex_to_real(hoNDArray<T>& cplx)
+    {
+        try
+        {
+            T* pRes = cplx.begin();
+
+            size_t N = cplx.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for private(n) shared(N, pRes)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                pRes[n] = T(pRes[n].real(), 0);
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in complex_to_real(...) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void complex_to_real(hoNDArray< std::complex<float> >& cplx);
+    template EXPORTCPUCOREMATH void complex_to_real(hoNDArray< std::complex<double> >& cplx);
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> 
+    void complex_to_imag(const hoNDArray<T>& cplx, hoNDArray<typename realType<T>::Type>& imag)
+    {
+        try
+        {
+            if ( !imag.dimensions_equal(&cplx) )
+            {
+                imag.create(cplx.get_dimensions());
+            }
+
+            const T* pRes = cplx.begin();
+            typename realType<T>::Type* pImag = imag.begin();
+
+            size_t N = imag.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for default(none) private(n) shared(N, pRes, pImag)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                pImag[n] = pRes[n].imag();
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in complex_to_imag(...) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void complex_to_imag(const hoNDArray< std::complex<float> >& cplx, hoNDArray<float>& imag);
+    template EXPORTCPUCOREMATH void complex_to_imag(const hoNDArray< std::complex<double> >& cplx, hoNDArray<double>& imag);
+
+    template<class T> 
+    void complex_to_imag(const hoNDArray<T>& cplx, hoNDArray<T>& imag)
+    {
+        try
+        {
+            if ( !imag.dimensions_equal(&cplx) )
+            {
+                imag.create(cplx.get_dimensions());
+            }
+
+            const T* pRes = cplx.begin();
+            T* pImag = imag.begin();
+
+            size_t N = imag.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for private(n) shared(N, pRes, pImag)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                pImag[n] = T(0, pRes[n].imag());
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in complex_to_imag(...) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void complex_to_imag(const hoNDArray< std::complex<float> >& cplx, hoNDArray< std::complex<float> >& imag);
+    template EXPORTCPUCOREMATH void complex_to_imag(const hoNDArray< std::complex<double> >& cplx, hoNDArray< std::complex<double> >& imag);
+
+    template<class T> 
+    void complex_to_imag(hoNDArray<T>& cplx)
+    {
+        try
+        {
+            T* pRes = cplx.begin();
+
+            size_t N = cplx.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for private(n) shared(N, pRes)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                pRes[n] = T(0, pRes[n].imag());
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in complex_to_imag(...) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void complex_to_imag(hoNDArray< std::complex<float> >& cplx);
+    template EXPORTCPUCOREMATH void complex_to_imag(hoNDArray< std::complex<double> >& cplx);
+
+    // --------------------------------------------------------------------------------
+
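+    // real_to_complex(): build a complex array whose real part is taken from 'real' and whose
+    // imaginary part is zero.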
+    template<class T> 
+    void real_to_complex(const hoNDArray<typename realType<T>::Type>& real, hoNDArray<T>& cplx)
+    {
+        try
+        {
+            if ( !cplx.dimensions_equal(&real) )
+            {
+                cplx.create(real.get_dimensions());
+            }
+
+            const typename realType<T>::Type* pReal = real.begin();
+            T* pRes = cplx.begin();
+
+            size_t N = real.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for private(n) shared(N, pRes, pReal)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                pRes[n] = T(pReal[n], 0);
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in real_to_complex(...) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void real_to_complex(const hoNDArray< float >& real, hoNDArray< std::complex<float> >& cplx);
+    template EXPORTCPUCOREMATH void real_to_complex(const hoNDArray< double >& real, hoNDArray< std::complex<double> >& cplx);
+
+    // --------------------------------------------------------------------------------
+
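+    // fill(): set every element of x to val; parallelized with OpenMP for large arrays.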
+    template<typename T> void fill( hoNDArray<T>* x, T val)
+    {
+        size_t N = x->get_number_of_elements();
+        T* pX = x->begin();
+
+        long long n;
+        #pragma omp parallel for default(none) private(n) shared(N, pX, val) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            pX[n] = val;
+        }
+    }
+
+    template EXPORTCPUCOREMATH void fill( hoNDArray<float>* x, float val);
+    template EXPORTCPUCOREMATH void fill( hoNDArray<double>* x, double val);
+    template EXPORTCPUCOREMATH void fill( hoNDArray< std::complex<float> >* x,  std::complex<float>  val);
+    template EXPORTCPUCOREMATH void fill( hoNDArray< std::complex<double> >* x,  std::complex<double>  val);
+    template EXPORTCPUCOREMATH void fill( hoNDArray< complext<float> >* x,  complext<float>  val);
+    template EXPORTCPUCOREMATH void fill( hoNDArray< complext<double> >* x,  complext<double>  val);
+
+    // --------------------------------------------------------------------------------
+
+    template<typename T> void fill( hoNDArray<T>& x, T val )
+    {
+        Gadgetron::fill( &x, val);
+    }
+
+    template EXPORTCPUCOREMATH void fill( hoNDArray<float>& x, float val);
+    template EXPORTCPUCOREMATH void fill( hoNDArray<double>& x, double val);
+    template EXPORTCPUCOREMATH void fill( hoNDArray< std::complex<float> >& x,  std::complex<float>  val);
+    template EXPORTCPUCOREMATH void fill( hoNDArray< std::complex<double> >& x,  std::complex<double>  val);
+    template EXPORTCPUCOREMATH void fill( hoNDArray< complext<float> >& x,  complext<float>  val);
+    template EXPORTCPUCOREMATH void fill( hoNDArray< complext<double> >& x,  complext<double>  val);
+
+    // --------------------------------------------------------------------------------
+
+    //
+    // TODO:
+    // The clamp functions could (probably) be implemented much like the device versions use Thrust,
+    // i.e. using Armadillo's transform on the array.
+    // However, this requires a newer version of Armadillo than current Linux distributions provide...
+    //
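+    // A rough, untested sketch of that Armadillo route (assuming a real-valued element type T and
+    // an Armadillo version that provides the .transform() member):
+    //
+    //   arma::Col<T> aX = as_arma_col(x);
+    //   aX.transform( hoNDA_clamp<T>(min, max, min_val, max_val) );
+    //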
+
+    template<typename T> struct hoNDA_clamp //: public thrust::unary_function<T,T>
+    {
+        hoNDA_clamp( T _min, T _max, T _min_val, T _max_val ) : min(_min), max(_max), min_val(_min_val), max_val(_max_val) {}
+        T operator()(const T &x) const
+        {
+            if( x < min ) return min_val;
+            else if ( x >= max) return max_val;
+            else return x;
+        }
+        T min, max;
+        T min_val, max_val;
+    };
+
+    template<typename T> struct hoNDA_clamp< std::complex<T> > //: public thrust::unary_function< std::complex<T>, std::complex<T> >
+    {
+        hoNDA_clamp( T _min, T _max, std::complex<T> _min_val, std::complex<T> _max_val ) : min(_min), max(_max), min_val(_min_val), max_val(_max_val) {}
+        std::complex<T> operator()(const std::complex<T> &x) const
+        {
+            if( real(x) < min ) return min_val;
+            else if ( real(x) >= max) return max_val;
+            else return std::complex<T>(real(x));
+        }
+        T min, max;
+        std::complex<T> min_val, max_val;
+    };
+
+    template<typename T> struct hoNDA_clamp< complext<T> > //: public thrust::unary_function< complext<T>, complext<T> >
+    {
+        hoNDA_clamp( T _min, T _max, complext<T> _min_val, complext<T> _max_val ) : min(_min), max(_max), min_val(_min_val), max_val(_max_val) {}
+        complext<T> operator()(const complext<T> &x) const
+        {
+            if( real(x) < min ) return min_val;
+            else if ( real(x) >= max) return max_val;
+            else return complext<T>(real(x));
+        }
+        T min, max;
+        complext<T> min_val, max_val;
+    };
+
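+    // clamp(): elementwise clamping. Values below min are replaced by min_val and values at or
+    // above max by max_val; the complex specializations compare on (and keep only) the real part.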
+    template<class T> void clamp( hoNDArray<T> *x,
+        typename realType<T>::Type min, typename realType<T>::Type max, T min_val, T max_val )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::clamp(): Invalid input array");
+
+        hoNDA_clamp<T> functor(min, max, min_val, max_val);
+        std::transform(x->begin(),x->end(),x->begin(),functor);
+    }
+
+    template<class T> void clamp( hoNDArray<T> *x, typename realType<T>::Type min, typename realType<T>::Type max )
+    {
+        clamp(x,min,max,T(min),T(max));
+    }
+
+    template<typename T> struct hoNDA_clamp_min //: public thrust::unary_function<T,T>
+    {
+        hoNDA_clamp_min( T _min ) : min(_min) {}
+        T operator()(const T &x) const
+        {
+            if( x < min ) return min;
+            else return x;
+        }
+        T min;
+    };
+
+    template<typename T> struct hoNDA_clamp_min< std::complex<T> > //: public thrust::unary_function< std::complex<T>, std::complex<T> >
+    {
+        hoNDA_clamp_min( T _min ) : min(_min) {}
+        std::complex<T> operator()(const std::complex<T> &x) const
+        {
+            if( real(x) < min ) return std::complex<T>(min);
+            else return std::complex<T>(real(x));
+        }
+        T min;
+    };
+
+    template<typename T> struct hoNDA_clamp_min< complext<T> > //: public thrust::unary_function< complext<T>, complext<T> >
+    {
+        hoNDA_clamp_min( T _min ) : min(_min) {}
+        complext<T> operator()(const complext<T> &x) const
+        {
+            if( real(x) < min ) return complext<T>(min);
+            else return complext<T>(real(x));
+        }
+        T min;
+    };
+
+    template<class T> void clamp_min( hoNDArray<T> *x, typename realType<T>::Type min )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::clamp_min(): Invalid input array");
+
+        hoNDA_clamp_min<T> functor(min);
+        std::transform(x->begin(),x->end(),x->begin(),functor);
+    }
+
+    template<typename T> struct hoNDA_clamp_max //: public thrust::unary_function<T,T>
+    {
+        hoNDA_clamp_max( T _max ) : max(_max) {}
+        T operator()(const T &x) const
+        {
+            if( x > max ) return max;
+            else return x;
+        }
+        T max;
+    };
+
+    template<typename T> struct hoNDA_clamp_max< std::complex<T> > //: public thrust::unary_function< std::complex<T>, std::complex<T> >
+    {
+        hoNDA_clamp_max( T _max ) : max(_max) {}
+        std::complex<T> operator()(const std::complex<T> &x) const
+        {
+            if( real(x) > max ) return std::complex<T>(max);
+            else return std::complex<T>(real(x));
+        }
+        T max;
+    };
+
+    template<typename T> struct hoNDA_clamp_max< complext<T> > //: public thrust::unary_function< complext<T>, complext<T> >
+    {
+        hoNDA_clamp_max( T _max ) : max(_max) {}
+        complext<T> operator()(const complext<T> &x) const
+        {
+            if( real(x) > max ) return complext<T>(max);
+            else return complext<T>(real(x));
+        }
+        T max;
+    };
+
+    template<class T> void clamp_max( hoNDArray<T> *x, typename realType<T>::Type max )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::clamp_max(): Invalid input array");
+
+        hoNDA_clamp_max<T> functor(max);
+        std::transform(x->begin(),x->end(),x->begin(),functor);
+    }
+
+    template<class T> void normalize( hoNDArray<T> *x, typename realType<T>::Type val )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::normalize(): Invalid input array");
+
+        size_t max_idx = amax(x);
+        T max_val_before = x->get_data_ptr()[max_idx];
+        typename realType<T>::Type scale = val/abs(max_val_before);
+        *x *= scale;
+    }
+
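+    // shrink1(): soft-thresholding, out[i] = sgn(x[i]) * max(|x[i]| - gamma, 0).
+    // The result is written to out, or in place into x when out is NULL.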
+    template<class T> void shrink1( hoNDArray<T> *x, typename realType<T>::Type gamma, hoNDArray<T> *out )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::shrink1(): Invalid input array");
+
+        T *outPtr = (out==0x0) ? x->get_data_ptr() : out->get_data_ptr();
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+        for( long long i = 0; i < (long long)x->get_number_of_elements(); i++ ) {
+            T prev = x->get_data_ptr()[i];
+            typename realType<T>::Type absPrev = abs(prev);
+            T sgnPrev = (absPrev <= typename realType<T>::Type(0)) ? T(0) : prev/absPrev;
+            outPtr[i] = sgnPrev*std::max(absPrev-gamma, typename realType<T>::Type(0));
+        }
+    }
+
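+    // pshrink(): p-shrinkage, out[i] = sgn(x[i]) * max(|x[i]| - gamma*|x[i]|^(p-1), 0).
+    // The result is written to out, or in place into x when out is NULL.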
+    template<class T> void pshrink( hoNDArray<T> *x, typename realType<T>::Type gamma,typename realType<T>::Type p, hoNDArray<T> *out )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::pshrink(): Invalid input array");
+
+        T *outPtr = (out==0x0) ? x->get_data_ptr() : out->get_data_ptr();
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+        for( long long i = 0; i < (long long)x->get_number_of_elements(); i++ ) {
+            T prev = x->get_data_ptr()[i];
+            typename realType<T>::Type absPrev = abs(prev);
+            T sgnPrev = (absPrev <= typename realType<T>::Type(0)) ? T(0) : prev/absPrev;
+            outPtr[i] = sgnPrev*std::max(absPrev-gamma*std::pow(absPrev,p-1), typename realType<T>::Type(0));
+        }
+    }
+
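+    // shrinkd(): joint shrinkage against the magnitude array s,
+    // out[i] = x[i]/s[i] * (s[i] - gamma) when s[i] > gamma, and 0 otherwise.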
+    template<class T> void shrinkd ( hoNDArray<T> *_x, hoNDArray<typename realType<T>::Type> *_s, typename realType<T>::Type gamma, hoNDArray<T> *out )
+    {
+        if( _x == 0x0  || _s == 0 )
+            throw std::runtime_error("Gadgetron::shrinkd(): Invalid input array");
+
+        T *outPtr = (out==0x0) ? _x->get_data_ptr() : out->get_data_ptr();
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+        for( long long i = 0; i < (long long)_x->get_number_of_elements(); i++ ) {
+            T x = _x->get_data_ptr()[i];
+            typename realType<T>::Type s = _s->get_data_ptr()[i];
+            if (s > gamma)
+                outPtr[i] = x/s*(s-gamma);
+            else
+                outPtr[i] = 0;
+        }
+    }
+
+    template<class T> void pshrinkd( hoNDArray<T> *_x, hoNDArray<typename realType<T>::Type> *_s, typename realType<T>::Type gamma,typename realType<T>::Type p, hoNDArray<T> *out )
+    {
+        if( _x == 0x0 || _s == 0x0 )
+            throw std::runtime_error("Gadgetron::pshrinkd(): Invalid input array");
+
+        T *outPtr = (out==0x0) ? _x->get_data_ptr() : out->get_data_ptr();
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+        for( long long i = 0; i < (long long)_x->get_number_of_elements(); i++ )
+        {
+            T x = _x->get_data_ptr()[i];
+            typename realType<T>::Type s = _s->get_data_ptr()[i];
+            outPtr[i] = x/s*std::max(s-gamma*std::pow(s,p-1),typename realType<T>::Type(0));
+        }
+    }
+
+    // --------------------------------------------------------------------------------
+
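+    // axpy(): r[n] = a*x[n] + y[n]; the complex overloads expand the multiply-add into explicit
+    // real/imaginary arithmetic.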
+    inline void axpy(float a, size_t N, const float* x, const float* y, float* r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, r, a , x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            r[n] = a*x[n] + y[n];
+        }
+    }
+
+    inline void axpy(double a, size_t N, const double* x, const double* y, double* r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, r, a , x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            r[n] = a*x[n] + y[n];
+        }
+    }
+
+    inline void axpy( std::complex<float>  a, size_t N, const  std::complex<float> * x, const  std::complex<float> * y,  std::complex<float> * r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, r, a, x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            const  std::complex<float> & vx = x[n];
+            const float re1 = vx.real();
+            const float im1 = vx.imag();
+
+            const  std::complex<float> & vy = y[n];
+            const float re2 = vy.real();
+            const float im2 = vy.imag();
+
+            const float ar = a.real();
+            const float ai = a.imag();
+
+            reinterpret_cast<float(&)[2]>(r[n])[0] = re2 + ar*re1 - ai*im1;
+            reinterpret_cast<float(&)[2]>(r[n])[1] = im2 + ar*im1 + ai*re1;
+        }
+    }
+
+    inline void axpy( std::complex<double>  a, size_t N, const  std::complex<double> * x, const  std::complex<double> * y,  std::complex<double> * r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, r, a, x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            const  std::complex<double> & vx = x[n];
+            const double re1 = vx.real();
+            const double im1 = vx.imag();
+
+            const  std::complex<double> & vy = y[n];
+            const double re2 = vy.real();
+            const double im2 = vy.imag();
+
+            const double ar = a.real();
+            const double ai = a.imag();
+
+            reinterpret_cast<double(&)[2]>(r[n])[0] = re2 + ar*re1 - ai*im1;
+            reinterpret_cast<double(&)[2]>(r[n])[1] = im2 + ar*im1 + ai*re1;
+        }
+    }
+
+    template <typename T> 
+    void axpy( complext<T>  a, size_t N, const  complext<T> * x, const  complext<T> * y,  complext<T> * r)
+    {
+        long long n;
+
+        #pragma omp parallel for private(n) shared(N, r, a, x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            const  complext<T> & vx = x[n];
+            const T re1 = vx.real();
+            const T im1 = vx.imag();
+
+            const  complext<T> & vy = y[n];
+            const T re2 = vy.real();
+            const T im2 = vy.imag();
+
+            const T ar = a.real();
+            const T ai = a.imag();
+
+            reinterpret_cast<T(&)[2]>(r[n])[0] = re2 + ar*re1 - ai*im1;
+            reinterpret_cast<T(&)[2]>(r[n])[1] = im2 + ar*im1 + ai*re1;
+        }
+    }
+
+    template <typename T> 
+    void axpy(T a, const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& r)
+    {
+        GADGET_DEBUG_CHECK_THROW(x.get_number_of_elements()==y.get_number_of_elements());
+
+        if ( r.get_number_of_elements() != x.get_number_of_elements() )
+        {
+            r = y;
+        }
+        else
+        {
+            if ( &r != &y )
+            {
+                memcpy(r.begin(), y.begin(), r.get_number_of_bytes());
+            }
+        }
+
+        axpy(a, x.get_number_of_elements(), x.begin(), y.begin(), r.begin());
+    }
+
+    template EXPORTCPUCOREMATH void axpy(float a, const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH void axpy(double a, const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH void axpy( std::complex<float>  a, const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y, hoNDArray< std::complex<float> >& r);
+    template EXPORTCPUCOREMATH void axpy( std::complex<double>  a, const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y, hoNDArray< std::complex<double> >& r);
+    template EXPORTCPUCOREMATH void axpy( complext<float>  a, const hoNDArray< complext<float> >& x, const hoNDArray< complext<float> >& y, hoNDArray< complext<float> >& r);
+    template EXPORTCPUCOREMATH void axpy( complext<double>  a, const hoNDArray< complext<double> >& x, const hoNDArray< complext<double> >& y, hoNDArray< complext<double> >& r);
+
+    template<class T> void axpy( T a, hoNDArray<T> *x, hoNDArray<T> *y )
+    {
+        if( x == 0x0 || y == 0x0 )
+            throw std::runtime_error("Gadgetron::axpy(): Invalid input array");
+
+        if( x->get_number_of_elements() != y->get_number_of_elements() )
+            throw std::runtime_error("Gadgetron::axpy(): Array sizes mismatch");
+
+        axpy(a, *x, *y, *y);
+    }
+
+    // --------------------------------------------------------------------------------
+
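+    // scal(): scale x in place by a, i.e. x[n] *= a.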
+    inline void scal(size_t N, float a, float* x)
+    {
+        long long n;
+        #pragma omp parallel for default(none) private(n) shared(N, x, a) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            x[n] *= a;
+        }
+    }
+
+    inline void scal(size_t N, double a, double* x)
+    {
+        long long n;
+        #pragma omp parallel for default(none) private(n) shared(N, x, a) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            x[n] *= a;
+        }
+    }
+
+    inline void scal(size_t N,  std::complex<float>  a,  std::complex<float> * x)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, a) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            const  std::complex<float> & c = x[n];
+            const float re = c.real();
+            const float im = c.imag();
+
+            const float ar = a.real();
+            const float ai = a.imag();
+
+            reinterpret_cast<float(&)[2]>(x[n])[0] = re*ar-im*ai;
+            reinterpret_cast<float(&)[2]>(x[n])[1] = re*ai+im*ar;
+        }
+    }
+
+    inline void scal(size_t N,  std::complex<double>  a,  std::complex<double> * x)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, a) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            const  std::complex<double> & c = x[n];
+            const double re = c.real();
+            const double im = c.imag();
+
+            const double ar = a.real();
+            const double ai = a.imag();
+
+            reinterpret_cast<double(&)[2]>(x[n])[0] = re*ar-im*ai;
+            reinterpret_cast<double(&)[2]>(x[n])[1] = re*ai+im*ar;
+        }
+    }
+
+    inline void scal(size_t N,  complext<float>  a,  complext<float> * x)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, a) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            const  complext<float> & c = x[n];
+            const float re = c.real();
+            const float im = c.imag();
+
+            const float ar = a.real();
+            const float ai = a.imag();
+
+            reinterpret_cast<float(&)[2]>(x[n])[0] = re*ar-im*ai;
+            reinterpret_cast<float(&)[2]>(x[n])[1] = re*ai+im*ar;
+        }
+    }
+
+    inline void scal(size_t N,  complext<double>  a,  complext<double> * x)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, a) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            const  complext<double> & c = x[n];
+            const double re = c.real();
+            const double im = c.imag();
+
+            const double ar = a.real();
+            const double ai = a.imag();
+
+            reinterpret_cast<double(&)[2]>(x[n])[0] = re*ar-im*ai;
+            reinterpret_cast<double(&)[2]>(x[n])[1] = re*ai+im*ar;
+        }
+    }
+
+    template <typename T> 
+    void scal(T a, hoNDArray<T>& x)
+    {
+        scal(x.get_number_of_elements(), a, x.begin());
+    }
+
+    template EXPORTCPUCOREMATH void scal(float a, hoNDArray<float>& x);
+    template EXPORTCPUCOREMATH void scal(double a, hoNDArray<double>& x);
+    template EXPORTCPUCOREMATH void scal( std::complex<float>  a, hoNDArray< std::complex<float> >& x);
+    template EXPORTCPUCOREMATH void scal( std::complex<double>  a, hoNDArray< std::complex<double> >& x);
+    template EXPORTCPUCOREMATH void scal( complext<float>  a, hoNDArray< complext<float> >& x);
+    template EXPORTCPUCOREMATH void scal( complext<double>  a, hoNDArray< complext<double> >& x);
+
+    // --------------------------------------------------------------------------------
+
+    inline void scal(size_t N, float a,  std::complex<float> * x)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, a) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            const  std::complex<float> & c = x[n];
+            const float re = c.real();
+            const float im = c.imag();
+
+            reinterpret_cast<float(&)[2]>(x[n])[0] = re*a;
+            reinterpret_cast<float(&)[2]>(x[n])[1] = im*a;
+        }
+    }
+
+    inline void scal(size_t N, double a,  std::complex<double> * x)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, a) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            const  std::complex<double> & c = x[n];
+            const double re = c.real();
+            const double im = c.imag();
+
+            reinterpret_cast<double(&)[2]>(x[n])[0] = re*a;
+            reinterpret_cast<double(&)[2]>(x[n])[1] = im*a;
+        }
+    }
+
+    template <typename T> 
+    void scal(T a, hoNDArray< std::complex<T> >& x)
+    {
+        scal(x.get_number_of_elements(), a, x.begin());
+    }
+
+    template EXPORTCPUCOREMATH void scal(float a, hoNDArray< std::complex<float> >& x);
+    template EXPORTCPUCOREMATH void scal(double a, hoNDArray< std::complex<double> >& x);
+
+    template <typename T> 
+    void scal(T a, hoNDArray< complext<T> >& x)
+    {
+        scal(x.get_number_of_elements(), a, x.begin());
+    }
+
+    template EXPORTCPUCOREMATH void scal(float a, hoNDArray< complext<float> >& x);
+    template EXPORTCPUCOREMATH void scal(double a, hoNDArray< complext<double> >& x);
+
+    // --------------------------------------------------------------------------------
+
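+    // conv2(): 2D convolution of each RO x E1 slice of x with the kernel y, using periodic
+    // (wrap-around) boundary handling; the kernel is flipped first, and z has the size of x.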
+    template<typename T> 
+    void conv2(size_t RO, size_t E1, size_t num, const T* x, size_t kRO, size_t kE1, const T* y, T* z)
+    {
+        try
+        {
+            long long halfKRO = (long long)(kRO/2);
+            long long halfKE1 = (long long)(kE1/2);
+
+            hoNDArray<T> flipY(2*halfKRO+1, 2*halfKE1+1);
+            T* pKer = flipY.begin();
+
+            long long n;
+            long long ro, e1;
+
+            // flip the kernel
+            for ( e1=0; e1<(long long)kE1; e1++ )
+            {
+                long long flip_e1 = 2*halfKE1 - e1;
+
+                for ( ro=0; ro<(long long)kRO; ro++ )
+                {
+                    long long flip_ro = 2*halfKRO - ro;
+
+                    flipY(flip_ro, flip_e1) = y[ro+e1*kRO];
+                }
+            }
+
+            // perform the convolution
+            #pragma omp parallel for default(none) private(n, ro, e1) shared(num, x, RO, E1, z, halfKRO, halfKE1, pKer)
+            for ( n=0; n<(long long)num; n++ )
+            {
+                const T* pX = x + n*RO*E1;
+                T* pZ = z + n*RO*E1;
+
+                long long kro, ke1, dro, de1;
+
+                for ( e1=0; e1<(long long)E1; e1++ )
+                {
+                    for ( ro=0; ro<(long long)RO; ro++ )
+                    {
+                        pZ[ro + e1*RO] = 0;
+                        for ( ke1=-halfKE1; ke1<=halfKE1; ke1++ )
+                        {
+                            de1 = ke1 + e1;
+                            if ( de1 < 0 )
+                            {
+                                de1 += E1;
+                            }
+                            else if ( de1 >= (long long)E1 )
+                            {
+                                de1 -= E1;
+                            }
+
+                            for ( kro=-halfKRO; kro<=halfKRO; kro++ )
+                            {
+                                dro = kro + ro;
+                                if ( dro < 0 )
+                                {
+                                    dro += RO;
+                                }
+                                else if ( dro >= (long long)RO )
+                                {
+                                    dro -= RO;
+                                }
+
+                                pZ[ro + e1*RO] += pKer[ kro+halfKRO + (ke1+halfKE1) * (2*halfKRO+1) ] * pX[dro + de1*RO];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors happened in conv2(size_t RO, size_t E1, size_t num, const T* x, size_t kRO, size_t kE1, const T* y, T* z) ... ");
+        }
+    }
+
+    template<typename T> 
+    void conv2(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& z)
+    {
+        try
+        {
+            if ( !z.dimensions_equal(&x) )
+            {
+                z = x;
+            }
+
+            long long RO = (long long) x.get_size(0);
+            long long E1 = (long long) x.get_size(1);
+            long long num = ((long long) x.get_number_of_elements()) / (RO*E1);
+
+            long long kRO = (long long) y.get_size(0);
+            long long kE1 = (long long) y.get_size(1);
+
+            conv2(RO, E1, num, x.begin(), kRO, kE1, y.begin(), z.begin());
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors happened in conv2(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& z) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void conv2(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& z);
+    template EXPORTCPUCOREMATH void conv2(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& z);
+    template EXPORTCPUCOREMATH void conv2(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y, hoNDArray< std::complex<float> >& z);
+    template EXPORTCPUCOREMATH void conv2(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y, hoNDArray< std::complex<double> >& z);
+
+    // --------------------------------------------------------------------------------
+
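+    // conv3(): 3D convolution of each RO x E1 x E2 volume of x with the kernel y, using periodic
+    // (wrap-around) boundary handling; the kernel is flipped first, and z has the size of x.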
+    template<typename T> 
+    void conv3(size_t RO, size_t E1, size_t E2, size_t num, const T* x, size_t kRO, size_t kE1, size_t kE2, const T* y, T* z)
+    {
+        try
+        {
+            long long halfKRO = (long long)(kRO/2);
+            long long halfKE1 = (long long)(kE1/2);
+            long long halfKE2 = (long long)(kE2/2);
+
+            hoNDArray<T> flipY(2*halfKRO+1, 2*halfKE1+1, 2*halfKE2+1);
+            T* pKer = flipY.begin();
+
+            long long n, e2;
+            long long ro, e1;
+
+            // flip the kernel
+            for ( e2=0; e2<(long long)kE2; e2++ )
+            {
+                long long flip_e2 = 2*halfKE2 - e2;
+
+                for ( e1=0; e1<(long long)kE1; e1++ )
+                {
+                    long long flip_e1 = 2*halfKE1 - e1;
+
+                    for ( ro=0; ro<(long long)kRO; ro++ )
+                    {
+                        long long flip_ro = 2*halfKRO - ro;
+
+                        flipY(flip_ro, flip_e1, flip_e2) = y[ro+e1*kRO+e2*kRO*kE1];
+                    }
+                }
+            }
+
+            // perform the convolution
+            #pragma omp parallel for default(none) private(n) shared(num, x, RO, E1, E2, z, halfKRO, halfKE1, halfKE2, pKer) if ( num > 8 )
+            for ( n=0; n<(long long)num; n++ )
+            {
+                const T* pX = x + n*RO*E1*E2;
+                T* pZ = z + n*RO*E1*E2;
+
+                long long kro, ke1, ke2, dro, de1, de2;
+
+                #pragma omp parallel for default(none) private(ro, e1, e2, kro, ke1, ke2, dro, de1, de2) shared(pX, RO, E1, E2, pZ, halfKRO, halfKE1, halfKE2, pKer)
+                for ( e2=0; e2<(long long)E2; e2++ )
+                {
+                    for ( e1=0; e1<(long long)E1; e1++ )
+                    {
+                        for ( ro=0; ro<(long long)RO; ro++ )
+                        {
+                            pZ[ro + e1*RO + e2*RO*E1] = 0;
+                            for ( ke2=-halfKE2; ke2<=halfKE2; ke2++ )
+                            {
+                                de2 = ke2 + e2;
+                                if ( de2 < 0 )
+                                {
+                                    de2 += E2;
+                                }
+                                else if ( de2 >= (long long)E2 )
+                                {
+                                    de2 -= E2;
+                                }
+
+                                for ( ke1=-halfKE1; ke1<=halfKE1; ke1++ )
+                                {
+                                    de1 = ke1 + e1;
+                                    if ( de1 < 0 )
+                                    {
+                                        de1 += E1;
+                                    }
+                                    else if ( de1 >= (long long)E1 )
+                                    {
+                                        de1 -= E1;
+                                    }
+
+                                    for ( kro=-halfKRO; kro<=halfKRO; kro++ )
+                                    {
+                                        dro = kro + ro;
+                                        if ( dro < 0 )
+                                        {
+                                            dro += RO;
+                                        }
+                                        else if ( dro >= (long long)RO )
+                                        {
+                                            dro -= RO;
+                                        }
+
+                                        pZ[ro + e1*RO + e2*RO*E1] += pKer[ kro+halfKRO + (ke1+halfKE1)*(2*halfKRO+1) + (ke2+halfKE2)*(2*halfKRO+1)*(2*halfKE1+1) ] * pX[dro + de1*RO + de2*RO*E1];
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors happened in conv3(size_t RO, size_t E1, size_t E2, size_t num, const T* x, size_t kRO, size_t kE1, size_t kE2, const T* y, T* z) ... ");
+        }
+    }
+
+    template<typename T> 
+    void conv3(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& z)
+    {
+        try
+        {
+            if ( !z.dimensions_equal(&x) )
+            {
+                z = x;
+            }
+
+            long long RO = (long long) x.get_size(0);
+            long long E1 = (long long) x.get_size(1);
+            long long E2 = (long long) x.get_size(2);
+            long long num = ((long long)x.get_number_of_elements()) / (RO*E1*E2);
+
+            long long kRO = (long long) y.get_size(0);
+            long long kE1 = (long long) y.get_size(1);
+            long long kE2 = (long long) y.get_size(2);
+
+            conv3(RO, E1, E2, num, x.begin(), kRO, kE1, kE2, y.begin(), z.begin());
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors happened in conv3(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& z) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void conv3(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& z);
+    template EXPORTCPUCOREMATH void conv3(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& z);
+    template EXPORTCPUCOREMATH void conv3(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y, hoNDArray< std::complex<float> >& z);
+    template EXPORTCPUCOREMATH void conv3(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y, hoNDArray< std::complex<double> >& z);
+
+    // --------------------------------------------------------------------------------
+
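+    // sum_over_dimension(): sum x along dimension dim into r; the summed dimension is kept with
+    // size 1. If dim is out of range, r is simply a copy of x.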
+    template <typename T> 
+    void sum_over_dimension(const hoNDArray<T>& x, hoNDArray<T>& r, size_t dim)
+    {
+        try
+        {
+            size_t D = x.get_number_of_dimensions();
+            if (dim >= D)
+            {
+                r = x;
+                return;
+            }
+
+            std::vector<size_t> dimX, dimR;
+            x.get_dimensions(dimX);
+
+            dimR = dimX;
+            dimR[dim] = 1;
+
+            if (!r.dimensions_equal(&dimR))
+            {
+                r.create(dimR);
+            }
+
+            if (dim == 0)
+            {
+                size_t X = x.get_size(0);
+                size_t num = x.get_number_of_elements() / X;
+
+                const T* pX = x.begin();
+                T* pR = r.begin();
+
+                long long n;
+
+                #pragma omp parallel for default(none) private(n) shared(X, num, pX, pR)
+                for (n = 0; n<(long long)num; n++)
+                {
+                    T xsum = pX[n*X];
+                    for (size_t ro = 1; ro<X; ro++)
+                    {
+                        xsum += pX[n*X + ro];
+                    }
+
+                    pR[n] = xsum;
+                }
+            }
+            else
+            {
+                size_t strideX = x.get_size(0);
+                for (size_t d = 1; d <= dim; d++)
+                {
+                    strideX *= x.get_size(d);
+                }
+
+                size_t strideR = strideX / x.get_size(dim);
+                size_t num = x.get_number_of_elements() / strideX;
+                size_t nDim = x.get_size(dim);
+
+                const T* pX = x.begin();
+                T* pR = r.begin();
+
+                if (nDim == 1)
+                {
+                    memcpy(pR, pX, x.get_number_of_bytes());
+                    return;
+                }
+
+                long long n;
+
+                #pragma omp parallel for default(none) private(n) shared(strideX, strideR, num, nDim, pX, pR)
+                for (n = 0; n<(long long)num; n++)
+                {
+                    const T* pX_curr = pX + n*strideX;
+                    T* pR_curr = pR + n*strideR;
+
+                    memcpy(pR_curr, pX_curr, sizeof(T)*strideR);
+
+                    size_t p, c;
+                    for (p = 1; p<nDim; p++)
+                    {
+                        for (c = 0; c < strideR; c++)
+                        {
+                            pR_curr[c] += pX_curr[p*strideR+c];
+                        }
+                    }
+                }
+            }
+        }
+        catch (...)
+        {
+            GADGET_THROW("Errors happened in sum_over_dimension(const hoNDArray<T>& x, hoNDArray<T>& y, size_t dim) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void sum_over_dimension(const hoNDArray<float>& x, hoNDArray<float>& y, size_t dim);
+    template EXPORTCPUCOREMATH void sum_over_dimension(const hoNDArray<double>& x, hoNDArray<double>& y, size_t dim);
+    template EXPORTCPUCOREMATH void sum_over_dimension(const hoNDArray< std::complex<float> >& x, hoNDArray< std::complex<float> >& y, size_t dim);
+    template EXPORTCPUCOREMATH void sum_over_dimension(const hoNDArray< std::complex<double> >& x, hoNDArray< std::complex<double> >& y, size_t dim);
+
+    // --------------------------------------------------------------------------------
+
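+    // Compound assignment operators: apply the scalar y elementwise to every element of x.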
+    template<class T> hoNDArray<T>& operator+= (hoNDArray<T> &x, const T &y)
+    {
+        /*arma::Col<typename stdType<T>::Type> aRes = as_arma_col(&x);
+        typename stdType<T>::Type aY = *((typename stdType<T>::Type*)&y);
+        aRes += aY;*/
+
+        long long n;
+
+        size_t N = x.get_number_of_elements();
+        T* px = x.begin();
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            x[n] += y;
+        }
+
+        return x;
+    }
+
+    template<class T> hoNDArray< std::complex<T> >& operator+= (hoNDArray< std::complex<T> > &x, const T &y)
+    {
+        /*arma::Col< std::complex<T> > aRes = as_arma_col(&x);
+        std::complex<T> aY( y, T(0) );
+        aRes += aY;*/
+
+        long long n;
+
+        size_t N = x.get_number_of_elements();
+        std::complex<T>* px = x.begin();
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            x[n] += y;
+        }
+
+        return x;
+    }
+
+    template<class T> hoNDArray< complext<T> >& operator+= (hoNDArray< complext<T> > &x, const T &y)
+    {
+        /*arma::Col< std::complex<T> > aRes = as_arma_col(&x);
+        std::complex<T> aY( y, T(0) );
+        aRes += aY;*/
+
+        long long n;
+
+        size_t N = x.get_number_of_elements();
+        complext<T>* px = x.begin();
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            x[n] += y;
+        }
+
+        return x;
+    }
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> hoNDArray<T>& operator-= (hoNDArray<T> &x, const T &y)
+    {
+        /*arma::Col<typename stdType<T>::Type> aRes = as_arma_col(&x);
+        typename stdType<T>::Type aY = *((typename stdType<T>::Type*)&y);
+        aRes -= aY;*/
+
+        long long n;
+
+        size_t N = x.get_number_of_elements();
+        T* px = x.begin();
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            x[n] -= y;
+        }
+
+        return x;
+    }
+
+    template<class T> hoNDArray< std::complex<T> >& operator-= (hoNDArray< std::complex<T> > &x, const T &y)
+    {
+        /*arma::Col< std::complex<T> > aRes = as_arma_col(&x);
+        std::complex<T> aY( y, T(0) );
+        aRes -= aY;*/
+
+        long long n;
+
+        size_t N = x.get_number_of_elements();
+        std::complex<T>* px = x.begin();
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            x[n] -= y;
+        }
+
+        return x;
+    }
+
+    template<class T> hoNDArray< complext<T> >& operator-= (hoNDArray< complext<T> > &x, const T &y)
+    {
+        /*arma::Col< std::complex<T> > aRes = as_arma_col(&x);
+        std::complex<T> aY( y, T(0) );
+        aRes -= aY;*/
+
+        long long n;
+
+        size_t N = x.get_number_of_elements();
+        complext<T>* px = x.begin();
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            x[n] -= y;
+        }
+
+        return x;
+    }
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> hoNDArray<T>& operator*= (hoNDArray<T> &x, const T &y)
+    {
+        //arma::Col<typename stdType<T>::Type> aRes = as_arma_col(&x);
+        //typename stdType<T>::Type aY = *((typename stdType<T>::Type*)&y);
+        //aRes *= aY;
+
+        scal(x.get_number_of_elements(), y, x.begin());
+
+        return x;
+    }
+
+    template<class T> hoNDArray< std::complex<T> >& operator*= (hoNDArray< std::complex<T> > &x, const T &y)
+    {
+        /*arma::Col< std::complex<T> > aRes = as_arma_col(&x);
+        std::complex<T> aY( y, T(0) );
+        aRes *= aY;*/
+
+        scal(x.get_number_of_elements(), y, x.begin());
+
+        return x;
+    }
+
+    template<class T> hoNDArray< complext<T> >& operator*= (hoNDArray< complext<T> > &x, const T &y)
+    {
+        //arma::Col< std::complex<T> > aRes = as_arma_col(&x);
+        //std::complex<T> aY( y, T(0) );
+        //aRes *= aY;
+
+        scal(x.get_number_of_elements(), y, reinterpret_cast< std::complex<T>* >(x.begin()) );
+        return x;
+    }
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> hoNDArray<T>& operator/= (hoNDArray<T> &x, const T &y)
+    {
+        /*arma::Col<typename stdType<T>::Type> aRes = as_arma_col(&x);
+        typename stdType<T>::Type aY = *((typename stdType<T>::Type*)&y);
+        aRes /= aY;*/
+
+        T ry = T(1)/y;
+        scal(x.get_number_of_elements(), ry, x.begin());
+
+        return x;
+    }
+
+    template<class T> hoNDArray< std::complex<T> >& operator/= (hoNDArray< std::complex<T> > &x, const T &y)
+    {
+        /*arma::Col< std::complex<T> > aRes = as_arma_col(&x);
+        std::complex<T> aY( y, T(0) );
+        aRes /= aY;*/
+
+        T ry = T(1)/y;
+        scal(x.get_number_of_elements(), ry, x.begin());
+
+        return x;
+    }
+
+    template<class T> hoNDArray< complext<T> >& operator/= (hoNDArray< complext<T> > &x, const T &y)
+    {
+        /*arma::Col< std::complex<T> > aRes = as_arma_col(&x);
+        std::complex<T> aY( y, T(0) );
+        aRes /= aY;*/
+
+        T ry = T(1)/y;
+        scal(x.get_number_of_elements(), ry, reinterpret_cast< std::complex<T>* >(x.begin()) );
+
+        return x;
+    }
+
+    // --------------------------------------------------------------------------------
+
+    //
+    // Instantiation
+    //
+
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > abs<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void abs_inplace<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > abs_square<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > sqrt<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void sqrt_inplace<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > square<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void square_inplace<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > reciprocal<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void reciprocal_inplace<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > reciprocal_sqrt<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void reciprocal_sqrt_inplace<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > sgn<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void sgn_inplace<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void clamp<float>( hoNDArray<float>*, float, float );
+    template EXPORTCPUCOREMATH void clamp_min<float>( hoNDArray<float>*, float );
+    template EXPORTCPUCOREMATH void clamp_max<float>( hoNDArray<float>*, float );
+    template EXPORTCPUCOREMATH void normalize<float>( hoNDArray<float>*, float );
+    template EXPORTCPUCOREMATH void shrink1<float>( hoNDArray<float>*, float, hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void pshrink<float>( hoNDArray<float>*, float,float, hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void shrinkd<float> ( hoNDArray<float>*, hoNDArray<float>*, float, hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void pshrinkd<float> ( hoNDArray<float>*, hoNDArray<float>*, float, float, hoNDArray<float>* );
+
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > abs<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void abs_inplace<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > abs_square<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > sqrt<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void sqrt_inplace<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > square<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void square_inplace<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > reciprocal<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void reciprocal_inplace<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > reciprocal_sqrt<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void reciprocal_sqrt_inplace<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > sgn<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void sgn_inplace<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void clamp<double>( hoNDArray<double>*, double, double );
+    template EXPORTCPUCOREMATH void clamp_min<double>( hoNDArray<double>*, double );
+    template EXPORTCPUCOREMATH void clamp_max<double>( hoNDArray<double>*, double );
+    template EXPORTCPUCOREMATH void normalize<double>( hoNDArray<double>*, double );
+    template EXPORTCPUCOREMATH void shrink1<double>( hoNDArray<double>*, double, hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void pshrink<double>( hoNDArray<double>*, double,double, hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void shrinkd<double> ( hoNDArray<double>*, hoNDArray<double>*, double, hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void pshrinkd<double> ( hoNDArray<double>*, hoNDArray<double>*, double, double, hoNDArray<double>* );
+
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > abs< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > abs_square< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<float> > > sqrt< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH void sqrt_inplace< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<float> > > square< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH void square_inplace< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<float> > > reciprocal< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH void reciprocal_inplace< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<float> > > reciprocal_sqrt< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH void reciprocal_sqrt_inplace< std::complex<float> >( hoNDArray< std::complex<float> >* );
+
+    template EXPORTCPUCOREMATH void clamp< std::complex<float> >( hoNDArray< std::complex<float> >*, float, float );
+    template EXPORTCPUCOREMATH void clamp_min< std::complex<float> >( hoNDArray< std::complex<float> >*, float );
+    template EXPORTCPUCOREMATH void clamp_max<std::complex<float> >( hoNDArray< std::complex<float> >*, float );
+    template EXPORTCPUCOREMATH void normalize< std::complex<float> >( hoNDArray< std::complex<float> >*, float );
+    template EXPORTCPUCOREMATH void shrink1< std::complex<float> >( hoNDArray< std::complex<float> >*, float, hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH void pshrink< std::complex<float> >( hoNDArray< std::complex<float> >*, float,float, hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH void shrinkd< std::complex<float> > ( hoNDArray< std::complex<float> >*, hoNDArray<float>*, float, hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH void pshrinkd< std::complex<float> > ( hoNDArray< std::complex<float> >*, hoNDArray<float>*, float, float, hoNDArray< std::complex<float> >* );
+
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > abs< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > abs_square< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<double> > > sqrt< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH void sqrt_inplace< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<double> > > square< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH void square_inplace< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<double> > > reciprocal< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH void reciprocal_inplace< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<double> > > reciprocal_sqrt< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH void reciprocal_sqrt_inplace< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH void clamp< std::complex<double> >( hoNDArray< std::complex<double> >*, double, double );
+    template EXPORTCPUCOREMATH void clamp_min< std::complex<double> >( hoNDArray< std::complex<double> >*, double );
+    template EXPORTCPUCOREMATH void clamp_max<std::complex<double> >( hoNDArray< std::complex<double> >*, double );
+    template EXPORTCPUCOREMATH void normalize< std::complex<double> >( hoNDArray< std::complex<double> >*, double );
+    template EXPORTCPUCOREMATH void shrink1< std::complex<double> >( hoNDArray< std::complex<double> >*, double, hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH void pshrink< std::complex<double> >( hoNDArray< std::complex<double> >*, double,double, hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH void shrinkd< std::complex<double> > ( hoNDArray< std::complex<double> >*, hoNDArray<double>*, double, hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH void pshrinkd< std::complex<double> > ( hoNDArray< std::complex<double> >*, hoNDArray<double>*, double, double, hoNDArray< std::complex<double> >* );
+
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > abs< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > abs_square< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< complext<float> > > sqrt< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH void sqrt_inplace< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< complext<float> > > square< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH void square_inplace< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< complext<float> > > reciprocal< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH void reciprocal_inplace< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< complext<float> > > reciprocal_sqrt< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH void reciprocal_sqrt_inplace< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH void clamp< complext<float> >( hoNDArray< complext<float> >*, float, float );
+    template EXPORTCPUCOREMATH void clamp_min< complext<float> >( hoNDArray< complext<float> >*, float );
+    template EXPORTCPUCOREMATH void clamp_max<complext<float> >( hoNDArray< complext<float> >*, float );
+    template EXPORTCPUCOREMATH void normalize< complext<float> >( hoNDArray< complext<float> >*, float );
+    template EXPORTCPUCOREMATH void shrink1< complext<float> >( hoNDArray< complext<float> >*, float, hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH void pshrink< complext<float> >( hoNDArray< complext<float> >*, float,float, hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH void shrinkd< complext<float> > ( hoNDArray< complext<float> >*, hoNDArray<float>*, float, hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH void pshrinkd< complext<float> > ( hoNDArray< complext<float> >*, hoNDArray<float>*, float, float, hoNDArray< complext<float> >* );
+
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > abs< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > abs_square< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< complext<double> > > sqrt< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH void sqrt_inplace< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< complext<double> > > square< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH void square_inplace< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< complext<double> > > reciprocal< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH void reciprocal_inplace< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< complext<double> > > reciprocal_sqrt< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH void reciprocal_sqrt_inplace< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH void clamp< complext<double> >( hoNDArray< complext<double> >*, double, double );
+    template EXPORTCPUCOREMATH void clamp_min< complext<double> >( hoNDArray< complext<double> >*, double );
+    template EXPORTCPUCOREMATH void clamp_max<complext<double> >( hoNDArray< complext<double> >*, double );
+    template EXPORTCPUCOREMATH void normalize< complext<double> >( hoNDArray< complext<double> >*, double );
+    template EXPORTCPUCOREMATH void shrink1< complext<double> >( hoNDArray< complext<double> >*, double, hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH void pshrink< complext<double> >( hoNDArray< complext<double> >*, double,double, hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH void shrinkd< complext<double> > ( hoNDArray< complext<double> >*, hoNDArray<double>*, double, hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH void pshrinkd< complext<double> > ( hoNDArray< complext<double> >*, hoNDArray<double>*, double, double, hoNDArray< complext<double> >* );
+
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<float> > > real_to_complex< std::complex<float> >( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<float> > > real_imag_to_complex< std::complex<float> >( hoNDArray<float>*, hoNDArray<float>* );
+
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float_complext> > real_to_complex<float_complext>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float_complext> > real_imag_to_complex<float_complext>( hoNDArray<float>*, hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > real<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > real<std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > real<float_complext>( hoNDArray<float_complext>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > imag<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > imag<std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > imag<float_complext>( hoNDArray<float_complext>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > conj<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<std::complex<float> > > conj<std::complex<float> >( hoNDArray<std::complex<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float_complext> > conj<float_complext>( hoNDArray<float_complext>* );
+
+
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<double> > > real_to_complex< std::complex<double> >( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<double> > > real_imag_to_complex< std::complex<double> >( hoNDArray<double>*, hoNDArray<double>* );
+
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double_complext> > real_to_complex<double_complext>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double_complext> > real_imag_to_complex<double_complext>( hoNDArray<double>*, hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > real<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > real<std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > real<double_complext>( hoNDArray<double_complext>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > imag<std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > imag<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > imag<double_complext>( hoNDArray<double_complext>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > conj<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<std::complex<double> > > conj<std::complex<double> >( hoNDArray<std::complex<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double_complext> > conj<double_complext>( hoNDArray<double_complext>* );
+
+    template EXPORTCPUCOREMATH hoNDArray<float>& operator+=<float>(hoNDArray<float>&, const float&);
+    template EXPORTCPUCOREMATH hoNDArray<float>& operator-=<float>(hoNDArray<float>&, const float&);
+    template EXPORTCPUCOREMATH hoNDArray<float>& operator*=<float>(hoNDArray<float>&, const float&);
+    template EXPORTCPUCOREMATH hoNDArray<float>& operator/=<float>(hoNDArray<float>&, const float&);
+
+    template EXPORTCPUCOREMATH hoNDArray<double>& operator+=<double>(hoNDArray<double>&, const double&);
+    template EXPORTCPUCOREMATH hoNDArray<double>& operator-=<double>(hoNDArray<double>&, const double&);
+    template EXPORTCPUCOREMATH hoNDArray<double>& operator*=<double>(hoNDArray<double>&, const double&);
+    template EXPORTCPUCOREMATH hoNDArray<double>& operator/=<double>(hoNDArray<double>&, const double&);
+
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator+=< std::complex<float> >
+        (hoNDArray< std::complex<float> >&, const std::complex<float>&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator-=< std::complex<float> >
+        (hoNDArray< std::complex<float> >&, const std::complex<float>&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator*=< std::complex<float> >
+        (hoNDArray< std::complex<float> >&, const std::complex<float>&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator/=< std::complex<float> >
+        (hoNDArray< std::complex<float> >&, const std::complex<float>&);
+
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator+=< complext<float> >
+        (hoNDArray< complext<float> >&, const complext<float>&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator-=< complext<float> >
+        (hoNDArray< complext<float> >&, const complext<float>&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator*=< complext<float> >
+        (hoNDArray< complext<float> >&, const complext<float>&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator/=< complext<float> >
+        (hoNDArray< complext<float> >&, const complext<float>&);
+
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator+=<float>(hoNDArray< std::complex<float> >&, const float&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator-=<float>(hoNDArray< std::complex<float> >&, const float&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator*=<float>(hoNDArray< std::complex<float> >&, const float&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator/=<float>(hoNDArray< std::complex<float> >&, const float&);
+
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator+=<float>(hoNDArray< complext<float> >&, const float&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator-=<float>(hoNDArray< complext<float> >&, const float&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator*=<float>(hoNDArray< complext<float> >&, const float&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator/=<float>(hoNDArray< complext<float> >&, const float&);
+
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator+=< std::complex<double> >
+        (hoNDArray< std::complex<double> >&, const std::complex<double>&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator-=< std::complex<double> >
+        (hoNDArray< std::complex<double> >&, const std::complex<double>&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator*=< std::complex<double> >
+        (hoNDArray< std::complex<double> >&, const std::complex<double>&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator/=< std::complex<double> >
+        (hoNDArray< std::complex<double> >&, const std::complex<double>&);
+
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator+=< complext<double> >
+        (hoNDArray< complext<double> >&, const complext<double>&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator-=< complext<double> >
+        (hoNDArray< complext<double> >&, const complext<double>&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator*=< complext<double> >
+        (hoNDArray< complext<double> >&, const complext<double>&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator/=< complext<double> >
+        (hoNDArray< complext<double> >&, const complext<double>&);
+
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator+=<double>(hoNDArray< std::complex<double> >&, const double&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator-=<double>(hoNDArray< std::complex<double> >&, const double&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator*=<double>(hoNDArray< std::complex<double> >&, const double&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator/=<double>(hoNDArray< std::complex<double> >&, const double&);
+
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator+=<double>(hoNDArray< complext<double> >&, const double&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator-=<double>(hoNDArray< complext<double> >&, const double&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator*=<double>(hoNDArray< complext<double> >&, const double&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator/=<double>(hoNDArray< complext<double> >&, const double&);
+
+
+    template EXPORTCPUCOREMATH void axpy<float>( float, hoNDArray<float>*, hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void axpy<double>( double, hoNDArray<double>*, hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void axpy< std::complex<float> >( std::complex<float> , hoNDArray< std::complex<float> >*, hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH void axpy< std::complex<double> >( std::complex<double> , hoNDArray< std::complex<double> >*, hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH void axpy< complext<float> >( complext<float> , hoNDArray< complext<float> >*, hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH void axpy< complext<double> >( complext<double> , hoNDArray< complext<double> >*, hoNDArray< complext<double> >* );
+
+}
diff --git a/toolboxes/core/cpu/math/hoNDArray_elemwise.h b/toolboxes/core/cpu/math/hoNDArray_elemwise.h
new file mode 100644
index 0000000..e2252c8
--- /dev/null
+++ b/toolboxes/core/cpu/math/hoNDArray_elemwise.h
@@ -0,0 +1,694 @@
+/** \file   hoNDArray_elemwise.h
+    \brief  Element-wise math operations on the hoNDArray class.
+
+    hoNDArray_elemwise.h defines element-wise array operations on the hoNDArray class.
+    Many of the provided functions come in two flavours:
+    1) a function that returns a smart pointer to a new array holding the result of the element-wise operation, and
+    2) a function that performs the element-wise computation in place, replacing the input array.
+    When both versions are available the in-place version is suffixed _inplace.
+    Some functions (clear, fill, clamp, clamp_min, clamp_max, normalize, shrink1, shrinkd) are only provided as in-place operations,
+    and they do not carry the _inplace suffix in order to keep user code compact.
+    A few functions return a different type than their input array
+    (abs on complex data, real, imag, real_to_complex, real_imag_to_complex) and are consequently not offered as in-place operations.
+    The functions provided in hoNDArray_elemwise are deliberately placed outside the NDArray derived classes
+    - to allow the NDArray classes to be lightweight, header-only data containers for both the cpu and gpu instances
+    - to allow for external-library-optimized implementations of the element-wise functions without adding such dependencies to the core data container
+    The present cpu implementation is based on Armadillo (whenever suitable functions are available).
+    The implementation is purposely split into a header and an underlying implementation (.cpp)
+    as this allows explicit instantiation of the supported template types.
+    The supported types are float, double, std::complex<float>, std::complex<double>,
+    Gadgetron::complext<float> and Gadgetron::complext<double> -- with some deliberate omissions.
+
+    Note that many of the functions are also reimplemented using the Intel MKL, when available, to speed up the computation.
+ */
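+/*
+    Illustrative usage sketch of the two flavours described above (not part of the library;
+    the constructor form and the literal sizes below are assumptions made purely for the example):
+
+        std::vector<size_t> dims(1, 16);
+        hoNDArray<float> x(&dims);
+        fill(&x, 4.0f);                                       // in-place only, no _inplace suffix
+        boost::shared_ptr< hoNDArray<float> > y = sqrt(&x);   // flavour 1: returns a new array
+        sqrt_inplace(&x);                                     // flavour 2: overwrites x in place
+*/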
+
+#pragma once
+
+#include "hoNDArray.h"
+#include "cpucore_math_export.h"
+
+#include <complex>
+
+namespace Gadgetron{
+
+  //
+  // Math return types
+  //
+  template <class T, class I> struct mathReturnType {};
+
+  template <class T> struct mathReturnType<T,T> {typedef T type;};
+
+  template <class T> struct mathReturnType<complext<T>,T> {typedef complext<T> type;};
+  template <class T> struct mathReturnType<T,complext<T> > {typedef complext<T> type;};
+  template <class T> struct mathReturnType<complext<T>,complext<T> > {typedef complext<T> type;};
+
+  template <class T> struct mathReturnType<std::complex<T>,T> {typedef std::complex<T> type;};
+  template <class T> struct mathReturnType<T,std::complex<T> > {typedef std::complex<T> type;};
+  template <class T> struct mathReturnType<std::complex<T>,std::complex<T> > {typedef std::complex<T> type;};
+
+  template <class T, class S> struct mathReturnType<T, complext<S> > {typedef complext<typename mathReturnType<T,S>::type> type;};
+  template <class T, class S> struct mathReturnType<complext<T>, S> {typedef complext<typename mathReturnType<T,S>::type> type;};
+  template <class T, class S> struct mathReturnType<complext<T>, complext<S> > {typedef complext<typename mathReturnType<T,S>::type> type;};
+
+  template <class T, class S> struct mathReturnType<T, std::complex<S> > {typedef std::complex<typename mathReturnType<T,S>::type> type;};
+  template <class T, class S> struct mathReturnType<std::complex<T>, S> {typedef std::complex<typename mathReturnType<T,S>::type> type;};
+  template <class T, class S> struct mathReturnType<std::complex<T>, std::complex<S> > {typedef std::complex<typename mathReturnType<T,S>::type> type;};
+
+  template<> struct mathReturnType<unsigned int, int> {typedef int type;};
+  template<> struct mathReturnType<int, unsigned int> {typedef int type;};
+  template<> struct mathReturnType<int, bool> {typedef int type;};
+  template<> struct mathReturnType<bool,int> {typedef int type;};
+  template<> struct mathReturnType<unsigned int, bool> {typedef int type;};
+  template<> struct mathReturnType<bool,unsigned int> {typedef int type;};
+  template<> struct mathReturnType<float, unsigned int> {typedef float type;};
+  template<> struct mathReturnType<unsigned int, float> {typedef float type;};
+  template<> struct mathReturnType<float, int> {typedef float type;};
+  template<> struct mathReturnType<int, float> {typedef float type;};
+  template<> struct mathReturnType<float, bool> {typedef float type;};
+  template<> struct mathReturnType<bool, float> {typedef float type;};
+  template<> struct mathReturnType<double, unsigned int> {typedef double type;};
+  template<> struct mathReturnType<unsigned int, double> {typedef double type;};
+  template<> struct mathReturnType<double, int> {typedef double type;};
+  template<> struct mathReturnType<int, double> {typedef double type;};
+  template<> struct mathReturnType<double, bool> {typedef double type;};
+  template<> struct mathReturnType<bool, double> {typedef double type;};
+  template<> struct mathReturnType<double, float> {typedef double type;};
+  template<> struct mathReturnType<float,double> {typedef double type;};
+
+  // Utility to verify array dimensions for simple broadcasting.
+  // It "replaces" NDArray::dimensions_equal() to support batch mode.
+  // There is an identical function for all array instances (currently hoNDArray, cuNDArray, hoCuNDArray)
+  // !!! Remember to fix any bugs in all versions !!!
+  //
+  template<class T,class S> bool compatible_dimensions( const hoNDArray<T> &x, const hoNDArray<S> &y )
+  {
+      return ((x.get_number_of_elements()%y.get_number_of_elements())==0);
+  }
+
+  // Utility to verify if array dimensions are compatible for a binary operation
+  // that supports simple broadcasting, i.e. for f(x,y,r), there are three cases:
+  // 1) nr = nx = ny
+  // 2) nr = nx > ny, and nx is divisible by ny
+  // 3) nr = ny > nx, and ny is divisible by nx
+  //
+  template<class T,class S, class U> bool compatible_dimensions( const hoNDArray<T> &x, const hoNDArray<S> &y,
+              const hoNDArray<U> &r )
+  {
+      size_t nx = x.get_number_of_elements();
+      size_t ny = y.get_number_of_elements();
+      size_t nr = r.get_number_of_elements();
+      if (nx == ny) {
+          return (nx==nr);
+      }
+      if ((nx%ny)==0) {
+          return (nx==nr);
+      }
+      if ((ny%nx)==0) {
+          return (ny==nr);
+      }
+      return false;
+  }
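+
+  // Example of the broadcasting rule above (array sizes are illustrative):
+  //   x: 128x128x8 (nx = 131072), y: 128x128 (ny = 16384), r: 128x128x8 (nr = 131072)
+  //   nx > ny and nx % ny == 0, so the pair is compatible and y is reused across the 8 slices of x.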
+
+
+/**
+* @brief add two vectors of values, r = x + y
+  supports in-place computation, e.g. x==r or y==r
+  supports simple broadcasting
+*/
+template <class T, class S> EXPORTCPUCOREMATH
+void add(const hoNDArray<T>& x, const hoNDArray<S>& y, hoNDArray<typename mathReturnType<T,S>::type >& r);
+
+// Pointer version calls the reference version
+template <typename T, class S>
+void add(const hoNDArray<T>* x, const hoNDArray<S>* y, hoNDArray<typename mathReturnType<T,S>::type >* r)
+{
+  add(*x, *y, *r);
+}
+
+
+/**
+* @brief subtract two vectors of values, r = x - y
+  supports in-place computation, e.g. x==r
+  supports simple broadcasting
+*/
+template <class T, class S> EXPORTCPUCOREMATH
+void subtract(const hoNDArray<T>& x, const hoNDArray<S>& y, hoNDArray<typename mathReturnType<T,S>::type >& r);
+
+// Pointer version calls the reference version
+template <typename T, class S>
+void subtract(const hoNDArray<T>* x, const hoNDArray<S>* y, hoNDArray<typename mathReturnType<T,S>::type >* r)
+{
+  subtract(*x, *y, *r);
+}
+
+
+/**
+* @brief multiply two vectors of values, r = x * y
+  supports in-place computation, e.g. x==r or y==r
+  supports simple broadcasting
+*/
+template <class T, class S> EXPORTCPUCOREMATH
+void multiply(const hoNDArray<T>& x, const hoNDArray<S>& y, hoNDArray<typename mathReturnType<T,S>::type >& r);
+
+// Pointer version calls the reference version
+template <class T, class S>
+void multiply(const hoNDArray<T>* x, const hoNDArray<S>* y, hoNDArray<typename mathReturnType<T,S>::type >* r)
+{
+  multiply(*x, *y, *r);
+}
+
+
+/**
+* @brief divide two vectors of values, r = x / y
+  supports in-place computation, e.g. x==r
+  supports simple broadcasting
+  no check for y==0
+*/
+template <class T, class S> EXPORTCPUCOREMATH
+void divide(const hoNDArray<T>& x, const hoNDArray<S>& y, hoNDArray<typename mathReturnType<T,S>::type >& r);
+
+// Pointer version calls the reference version
+template <class T, class S>
+void divide(const hoNDArray<T>* x, const hoNDArray<S>* y, hoNDArray<typename mathReturnType<T,S>::type >* r)
+{
+  divide(*x, *y, *r);
+}
+
+
+/**
+* @brief r = x * conj(y)
+  supports in-place computation, e.g. x==r
+  supports simple broadcasting
+*/
+template <class T, class S> EXPORTCPUCOREMATH
+void multiplyConj(const hoNDArray<T>& x, const hoNDArray<S>& y, hoNDArray<typename mathReturnType<T,S>::type >& r);
+
+// Pointer version calls the reference version
+template <class T, class S>
+void multiplyConj(const hoNDArray<T>* x, const hoNDArray<S>* y, hoNDArray<typename mathReturnType<T,S>::type >* r)
+{
+  multiplyConj(*x, *y, *r);
+}
+
+
+/**
+* @brief r = conj(x)
+*/
+template <typename T> EXPORTCPUCOREMATH 
+void conjugate(const hoNDArray<T>& x, hoNDArray<T>& r);
+
+/**
+* @brief For each element of x whose magnitude is smaller than the epsilon of its numeric type,
+epsilon is added to that element.
+*/
+template <typename T> EXPORTCPUCOREMATH 
+void addEpsilon(hoNDArray<T>& x);
+
+/**
+* @brief r = angle(x)
+*/
+template <typename T> EXPORTCPUCOREMATH 
+void argument(const hoNDArray<T>& x, hoNDArray<typename realType<T>::Type>& r);
+
+/**
+* @brief r = 1/x
+*/
+template <typename T> EXPORTCPUCOREMATH 
+void inv(const hoNDArray<T>& x, hoNDArray<T>& r);
+
+/**
+ * @brief Calculates the element-wise absolute values (l2 norm) of the array entries
+ * @param[in] x Input array.
+ * @return A new array containing the element-wise absolute values of the input.
+ */
+template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<typename realType<T>::Type> > abs( hoNDArray<T> *x );
+template <typename T> EXPORTCPUCOREMATH void abs(const hoNDArray<T>& x, hoNDArray<typename realType<T>::Type>& r);
+template <typename T> EXPORTCPUCOREMATH void abs(const hoNDArray< std::complex<T> >& x, hoNDArray< std::complex<T> >& r);
+
+/**
+ * @brief Calculates the element-wise absolute values (l2 norm) of the array entries (in place).
+ * @param[in,out] x Input and output array.
+ */
+template<class T> EXPORTCPUCOREMATH void abs_inplace( hoNDArray<T> *x );
+
+/**
+ * @brief Calculates the element-wise squared absolute values of the array entries
+ * @param[in] x Input array.
+ * @return A new array containing the element-wise absolute values of the input.
+ */
+template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<typename realType<T>::Type> > abs_square( hoNDArray<T> *x );
+
+/**
+ * @brief Calculates the element-wise sqrt of the array entries.
+ * @param[in] x Input array.
+ * @return A new array containing the element-wise sqrt of the input.
+ */
+template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<T> > sqrt( hoNDArray<T> *x );
+
+template <typename T> EXPORTCPUCOREMATH void sqrt(const hoNDArray<T>& x, hoNDArray<T>& r);
+
+/**
+ * @brief Calculates the element-wise sqrt of the array entries (in place).
+ * @param[in,out] x Input and output array.
+ */
+template<class T> EXPORTCPUCOREMATH void sqrt_inplace( hoNDArray<T> *x );
+
+/**
+ * @brief Calculates the element-wise square of the array entries.
+ * @param[in] x Input array.
+ * @return A new array containing the element-wise square of the input.
+ *
+ * For real numbers this function is equivalent to abs_square().
+ * For complex arrays, however, square() and abs_square() differ.
+ */
+template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<T> > square( hoNDArray<T> *x );
+
+/**
+ * @brief Calculates the element-wise square of the array entries (in place).
+ * @param[in,out] x Input and output array.
+ */
+template<class T> EXPORTCPUCOREMATH void square_inplace( hoNDArray<T> *x );
+
+/**
+ * @brief Calculates the element-wise reciprocal of the array entries.
+ * @param[in] x Input array.
+ * @return A new array containing the element-wise reciprocal of the input.
+ */
+template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<T> > reciprocal( hoNDArray<T> *x );
+
+/**
+ * @brief Calculates the element-wise reciprocal of the array entries (in place).
+ * @param[in,out] x Input and output array.
+ */
+template<class T> EXPORTCPUCOREMATH void reciprocal_inplace( hoNDArray<T> *x );
+
+/**
+ * @brief Calculates the element-wise reciprocal sqrt of the array entries.
+ * @param[in] x Input array.
+ * @return A new array containing the element-wise reciprocal sqrt of the input.
+ */
+template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<T> > reciprocal_sqrt( hoNDArray<T> *x );
+
+/**
+ * @brief Calculates the element-wise reciprocal sqrt of the array entries (in place).
+ * @param[in,out] x Input and output array.
+ */
+template<class T> EXPORTCPUCOREMATH void reciprocal_sqrt_inplace( hoNDArray<T> *x );
+
+/**
+ * @brief Calculates the elementwise signum function on the array.
+ * @param[in] x Input array.
+ * @return A new array containing the element-wise sgn of the input.
+ */
+template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<T> > sgn( hoNDArray<T> *x );
+
+/**
+ * @brief Calculates the elementwise signum function on the array (in place).
+ * @param[in,out] x Input and output array.
+ */
+template<class T> EXPORTCPUCOREMATH void sgn_inplace( hoNDArray<T> *x );
+
+/**
+ * @brief Extract the real component from a complex array.
+ * @param[in] x Input array.
+ * @return A new array of the real component of the complex array.
+ */
+template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<typename realType<T>::Type> > real( hoNDArray<T> *x );
+
+/**
+ * @brief Extract the imaginary component from a complex array.
+ * @param[in] x Input array.
+ * @return A new array of the imaginary component of the complex array.
+ */
+template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<typename realType<T>::Type> > imag( hoNDArray<T> *x );
+
+/**
+ * @brief Create a new array of the complex conjugate of the input array. For real arrays a copy of the input array is returned.
+ * @param[in] x Input array.
+ * @return A new array of the complex conjugate of the input array.
+ */
+template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<T> > conj( hoNDArray<T> *x );
+
+/**
+ * @brief Construct a complex array from a real array.
+ * @param[in] x Input array.
+ * @return A new complex array containing the input array in the real component and zeros in the imaginary component.
+ */
+template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<T> >
+real_to_complex( hoNDArray<typename realType<T>::Type> *x );
+
+template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<T> >
+real_imag_to_complex( hoNDArray<typename realType<T>::Type> *real, hoNDArray<typename realType<T>::Type>* imag);
+
+/**
+* @brief real and imag to complex
+*/
+template<class T> EXPORTCPUCOREMATH 
+void real_imag_to_complex(const hoNDArray<typename realType<T>::Type>& real, const hoNDArray<typename realType<T>::Type>& imag, hoNDArray<T>& cplx);
+
+/**
+* @brief complex to real and imag
+*/
+template<class T> EXPORTCPUCOREMATH 
+void complex_to_real_imag(const hoNDArray<T>& cplx, hoNDArray<typename realType<T>::Type>& real, hoNDArray<typename realType<T>::Type>& imag);
+
+template<class T> EXPORTCPUCOREMATH 
+void complex_to_real_imag(const hoNDArray<T>& cplx, hoNDArray<T>& real, hoNDArray<T>& imag);
+
+/**
+* @brief get the real part of complex
+*/
+template<class T> EXPORTCPUCOREMATH 
+void complex_to_real(const hoNDArray<T>& cplx, hoNDArray<typename realType<T>::Type>& real);
+
+template<class T> EXPORTCPUCOREMATH 
+void complex_to_real(const hoNDArray<T>& cplx, hoNDArray<T>& real);
+
+template<class T> 
+void complex_to_real(hoNDArray<T>& cplx);
+
+/**
+* @brief get the imag part of complex
+*/
+template<class T> EXPORTCPUCOREMATH 
+void complex_to_imag(const hoNDArray<T>& cplx, hoNDArray<typename realType<T>::Type>& imag);
+
+template<class T> EXPORTCPUCOREMATH 
+void complex_to_imag(const hoNDArray<T>& cplx, hoNDArray<T>& imag);
+
+template<class T> EXPORTCPUCOREMATH 
+void complex_to_imag(hoNDArray<T>& cplx);
+
+/**
+* @brief get complex array whose real part is the input and imag part is zero
+*/
+template<class T> EXPORTCPUCOREMATH 
+void real_to_complex(const hoNDArray<typename realType<T>::Type>& real, hoNDArray<T>& cplx);
+
+/**
+ * @brief Clears the array to all zeros (in place). Faster than fill.
+ * @param[in,out] x Input and output array.
+ */
+template<class T> void clear( hoNDArray<T>* x )
+{
+    if ( x->get_number_of_elements() > 0 )
+    {
+        memset( x->get_data_ptr(), 0, x->get_number_of_elements()*sizeof(T));
+    }
+}
+
+template<class T> void clear( hoNDArray<T>& x )
+{
+    if ( x.get_number_of_elements() > 0 )
+    {
+        memset( x.get_data_ptr(), 0, x.get_number_of_elements()*sizeof(T));
+    }
+}
+
+/**
+ * @brief Fills the array with a user provided constant value (in place).
+ * @param[in,out] x Input and output array.
+ * @param[in] val Fill value.
+ */
+template <typename T> EXPORTCPUCOREMATH void fill( hoNDArray<T>* x, T val);
+template <typename T> EXPORTCPUCOREMATH void fill( hoNDArray<T>& x, T val );
+
+/**
+ * @brief Clamps all values in the array to the minimum and maximum values specified (in place).
+ * @param[in,out] x Input and output array.
+ * @param[in] min minimum value.
+ * @param[in] max maximum value.
+ * @param[in] min_val value to which everything below the minimum will be set
+ * @param[in] max_val value to which everything above the maximum will be set
+ */
+template<class T> EXPORTCPUCOREMATH void clamp( hoNDArray<T> *x, typename realType<T>::Type min, typename realType<T>::Type max, T min_val, T max_val );
+
+/**
+ * @brief Clamps all values in the array to the minimum and maximum values specified (in place).
+ * @param[in,out] x Input and output array.
+ * @param[in] min minimum value.
+ * @param[in] max maximum value.
+ */
+template<class T> EXPORTCPUCOREMATH void clamp( hoNDArray<T> *x, typename realType<T>::Type min, typename realType<T>::Type max );
+
+/**
+ * @brief Clamps all values in the array to a minimum value allowed (in place).
+ * @param[in,out] x Input and output array.
+ * @param[in] min Minimum value.
+ */
+template<class T> EXPORTCPUCOREMATH void clamp_min( hoNDArray<T> *x, typename realType<T>::Type min );
+
+/**
+ * @brief Clamps all values in the array to a maximum value allowed (in place).
+ * @param[in,out] x Input and output array.
+ * @param[in] max Maximum value.
+ */
+template<class T> EXPORTCPUCOREMATH void clamp_max( hoNDArray<T> *x, typename realType<T>::Type max );
+
+/**
+ * @brief In place normalization (scaling) to a new maximum absolute array value val.
+ * @param[in,out] x Input and output array.
+ * @param[in] val New maximum absolute array value (according to the l2-norm)
+ */
+template<class T> EXPORTCPUCOREMATH void normalize( hoNDArray<T> *x, typename realType<T>::Type val = typename realType<T>::Type(1) );
+
+/**
+ * @brief Shrinkage (soft thresholding), i.e. shrink(x,gamma) = x/abs(x)*max(abs(x)-gamma,0).
+ * @param[out] out Output array. Can be 0x0 in which case an in place transform is performed.
+ * @param[in,out] x Input array (and output array if out == 0x0).
+ * @param[in] gamma Shrinkage control parameter
+ */
+template<class T> EXPORTCPUCOREMATH void shrink1( hoNDArray<T> *x, typename realType<T>::Type gamma, hoNDArray<T> *out = 0x0 );
+
+/**
+ * @brief In place p-shrinkage (soft thresholding), i.e. pshrink(x,gamma,p) = x/abs(x)*max(abs(x)-gamma*abs(x)^(p-1),0).
+ * @param[out] out Output array. Can be 0x0 in which case an in place transform is performed.
+ * @param[in,out] x Input array (and output array if out == 0x0).
+ * @param[in] gamma Shrinkage control parameter
+ * @param[in] p p value of the shrinkage. Should be less than 1 and more than 0.
+ */
+template<class T> EXPORTCPUCOREMATH void pshrink( hoNDArray<T> *x, typename realType<T>::Type gamma,typename realType<T>::Type p, hoNDArray<T> *out = 0x0 );
+
+/**
+ * @brief Shrinkage (soft thresholding, multi-dimensional), i.e. shrink(x,gamma,s) = x/s*max(s-gamma,0).
+ * @param[out] out Output array. Can be 0x0 in which case an in place transform is performed.
+ * @param[in,out] x Input array (and output array if out == 0x0).
+ * @param[in] s Input array, normalization.
+ * @param[in] gamma Shrinkage control parameter
+ */
+template<class T> EXPORTCPUCOREMATH void shrinkd ( hoNDArray<T> *x, hoNDArray<typename realType<T>::Type> *s, typename realType<T>::Type gamma, hoNDArray<T> *out = 0x0 );
+
+/**
+ * @brief In place p-shrinkage (soft thresholding, multi-dimensional), i.e. pshrink(x,s,gamma,p) = x/s*max(s-gamma*s^(p-1),0).
+ * @param[out] out Output array. Can be 0x0 in which case an in place transform is performed.
+ * @param[in,out] x Input array (and output array if out == 0x0).
+ * @param[in] s Input array, normalization.
+ * @param[in] gamma Shrinkage control parameter
+ * @param[in] p p value of the shrinkage. Should be less than 1 and more than 0.
+ */
+template<class T> EXPORTCPUCOREMATH void pshrinkd ( hoNDArray<T> *x, hoNDArray<typename realType<T>::Type> *s, typename realType<T>::Type gamma,typename realType<T>::Type p, hoNDArray<T> *out = 0x0 );
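+
+// Worked example of the shrink1 formula above (values chosen purely for illustration):
+//   with gamma = 1, shrink1 maps 3.0 -> 2.0, -0.5 -> 0.0 and -4.0 -> -3.0,
+//   i.e. every entry is pulled towards zero by gamma and clipped at zero.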
+
+/**
+ * @brief Implementation of element-wise operator+= on two hoNDArrays.
+ * @param[in,out] x Input and output array.
+ * @param[in] y Input array.
+
+ * Let y be an n-dimensional array.
+ * Then the sizes of the first n array dimensions must match between x and y.
+ * If x contains further dimensions the operator is batched across those dimensions.
+ */
+template<class T, class S> hoNDArray<T>& operator+= (hoNDArray<T> &x, const hoNDArray<S> &y)
+{
+  if (compatible_dimensions<T,S>(x,y)) {
+      add(x, y, x);
+      return x;
+  } else {
+      throw std::runtime_error("+= incompatible dimensions.");
+  }
+}
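+
+// Batched example of the operator above (sizes are illustrative): if x is 64x64x10 and y is 64x64,
+// x += y adds y to each of the 10 64x64 slices of x, since 64*64*10 is divisible by 64*64.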
+
+/**
+ * @brief Implementation of element-wise operator+= on a hoNDArray with a scalar value.
+ * @param[in,out] x Input and output array.
+ * @param[in] y Input scalar.
+ */
+template<class T> EXPORTCPUCOREMATH hoNDArray<T>& operator+= (hoNDArray<T> &x, const T &y);
+
+/**
+ * @brief Implementation of element-wise operator+= on a hoNDArray with a scalar value.
+ * @param[in,out] x Input and output array.
+ * @param[in] y Input scalar.
+ */
+template<class T> EXPORTCPUCOREMATH hoNDArray< std::complex<T> >& operator+= (hoNDArray< std::complex<T> >&x, const T &y);
+
+/**
+ * @brief Implementation of element-wise operator+= on a hoNDArray with a scalar value.
+ * @param[in,out] x Input and output array.
+ * @param[in] y Input scalar.
+ */
+template<class T> EXPORTCPUCOREMATH hoNDArray< complext<T> >& operator+= (hoNDArray< complext<T> >&x, const T &y);
+
+/**
+ * @brief Implementation of element-wise operator-= on two hoNDArrays.
+ * @param[in,out] x Input and output array.
+ * @param[in] y Input array.
+
+ * Let y be an n-dimensional array.
+ * Then the sizes of the first n array dimensions must match between x and y.
+ * If x contains further dimensions the operator is batched across those dimensions.
+ */
+template<class T, class S> hoNDArray<T>& operator-= (hoNDArray<T> &x, const hoNDArray<S> &y)
+{
+  if (compatible_dimensions<T,S>(x,y)) {
+      subtract(x, y, x);
+      return x;
+  } else {
+      throw std::runtime_error("-= incompatible dimensions.");
+  }
+}
+
+
+/**
+ * @brief Implementation of element-wise operator-= on a hoNDArray with a scalar value.
+ * @param[in,out] x Input and output array.
+ * @param[in] y Input scalar.
+ */
+template<class T> EXPORTCPUCOREMATH hoNDArray<T>& operator-= (hoNDArray<T> &x, const T &y);
+
+/**
+ * @brief Implementation of element-wise operator-= on a hoNDArray with a scalar value.
+ * @param[in,out] x Input and output array.
+ * @param[in] y Input scalar.
+ */
+template<class T> EXPORTCPUCOREMATH hoNDArray< std::complex<T> >& operator-= (hoNDArray< std::complex<T> >&x, const T &y);
+
+/**
+ * @brief Implementation of element-wise operator-= on a hoNDArray with a scalar value.
+ * @param[in,out] x Input and output array.
+ * @param[in] y Input scalar.
+ */
+template<class T> EXPORTCPUCOREMATH hoNDArray< complext<T> >& operator-= (hoNDArray< complext<T> >&x, const T &y);
+
+/**
+ * @brief Implementation of element-wise operator*= on two hoNDArrays.
+ * @param[in,out] x Input and output array.
+ * @param[in] y Input array.
+
+ * Let y be an n-dimensional array.
+ * Then the sizes of the first n array dimensions must match between x and y.
+ * If x contains further dimensions the operator is batched across those dimensions.
+ */
+template<class T, class S> hoNDArray<T>& operator*= (hoNDArray<T> &x, const hoNDArray<S> &y)
+{
+  if (compatible_dimensions<T,S>(x,y)) {
+      multiply(x, y, x);
+      return x;
+  } else {
+      throw std::runtime_error("*= incompatible dimensions.");
+  }
+}
+
+/**
+ * @brief Implementation of element-wise operator*= on a hoNDArray with a scalar value.
+ * @param[in,out] x Input and output array.
+ * @param[in] y Input scalar.
+ */
+template<class T> EXPORTCPUCOREMATH hoNDArray<T>& operator*= (hoNDArray<T> &x, const T &y);
+
+/**
+ * @brief Implementation of element-wise operator*= on a hoNDArray with a scalar value.
+ * @param[in,out] x Input and output array.
+ * @param[in] y Input scalar.
+ */
+template<class T> EXPORTCPUCOREMATH hoNDArray< std::complex<T> >& operator*= (hoNDArray< std::complex<T> > &x, const T &y);
+
+/**
+ * @brief Implementation of element-wise operator*= on a hoNDArray with a scalar value.
+ * @param[in,out] x Input and output array.
+ * @param[in] y Input scalar.
+ */
+template<class T> EXPORTCPUCOREMATH hoNDArray< complext<T> >& operator*= (hoNDArray< complext<T> > &x, const T &y);
+
+/**
+ * @brief Implementation of element-wise operator/= on two hoNDArrays.
+ * @param[in,out] x Input and output array.
+ * @param[in] y Input array.
+
+ * Let y be an n-dimensional array.
+ * Then the sizes of the first n array dimensions must match between x and y.
+ * If x contains further dimensions the operator is batched across those dimensions.
+ */
+template<class T, class S> hoNDArray<T>& operator/= (hoNDArray<T> &x, const hoNDArray<S> &y)
+{
+  if (compatible_dimensions<T,S>(x,y)) {
+      divide(x, y, x);
+      return x;
+  } else {
+      throw std::runtime_error("/= incompatible dimensions.");
+  }
+}
+
+/**
+ * @brief Implementation of element-wise operator/= on a hoNDArray with a scalar value.
+ * @param[in,out] x Input and output array.
+ * @param[in] y Input scalar.
+ */
+template<class T> EXPORTCPUCOREMATH hoNDArray<T>& operator/= (hoNDArray<T> &x, const T &y);
+
+/**
+ * @brief Implementation of element-wise operator/= on a hoNDArray with a scalar value.
+ * @param[in,out] x Input and output array.
+ * @param[in] y Input scalar.
+ */
+template<class T> EXPORTCPUCOREMATH hoNDArray< std::complex<T> >& operator/= (hoNDArray< std::complex<T> > &x, const T &y);
+
+/**
+ * @brief Implementation of element-wise operator/= on a hoNDArray with a scalar value.
+ * @param[in,out] x Input and output array.
+ * @param[in] y Input scalar.
+ */
+template<class T> EXPORTCPUCOREMATH hoNDArray< complext<T> >& operator/= (hoNDArray< complext<T> > &x, const T &y);
+
+/**
+ * @brief Calculates y = a*x+y in which x and y are considered as vectors
+ * @param[in] a Scalar value
+ * @param[in] x Array
+ * @param[in,out] y Array
+ */
+template<class T> EXPORTCPUCOREMATH void axpy( T a, hoNDArray<T> *x, hoNDArray<T> *y );
+
+/**
+* @brief compute r = a*x + y
+*/
+template <typename T> EXPORTCPUCOREMATH void axpy(T a, const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& r);
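+
+// Example (illustrative values): with a = 2, x = [1, 2, 3] and y = [10, 10, 10],
+// axpy(a, &x, &y) leaves y = [12, 14, 16]; the four-argument overload writes the result to r instead of updating y.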
+
+/**
+* @brief compute x *= a
+*/
+template <typename T> EXPORTCPUCOREMATH void scal(T a, hoNDArray<T>& x);
+template <typename T> EXPORTCPUCOREMATH void scal(T a, hoNDArray< std::complex<T> >& x);
+template <typename T> EXPORTCPUCOREMATH void scal(T a, hoNDArray< complext<T> >& x);
+
+/**
+* @brief 2D convolution
+            x: input data, y: convolution kernel, z: output; each 2D slice is convolved
+*/
+template <typename T> EXPORTCPUCOREMATH 
+void conv2(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& z);
+
+/**
+* @brief 3D convolution
+            x: input data, y: convolution kernel, z: output; each 3D volume is convolved
+*/
+template <typename T> EXPORTCPUCOREMATH 
+void conv3(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& z);
+
+/**
+* @brief sum over a specific dimension
+            x: input array, y: output array, dim: dimension along which to sum
+            resulting in y.get_size(dim) == 1
+*/
+template <typename T> EXPORTCPUCOREMATH
+void sum_over_dimension(const hoNDArray<T>& x, hoNDArray<T>& y, size_t dim);
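+
+// Example (sizes illustrative): for x of size 12x8 and dim = 1, sum_over_dimension(x, y, 1)
+// yields y of size 12x1, each entry being the sum of the corresponding row of x over its 8 columns.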
+
+}
diff --git a/toolboxes/core/cpu/math/hoNDArray_linalg.cpp b/toolboxes/core/cpu/math/hoNDArray_linalg.cpp
new file mode 100644
index 0000000..73279e7
--- /dev/null
+++ b/toolboxes/core/cpu/math/hoNDArray_linalg.cpp
@@ -0,0 +1,1970 @@
+#include "log.h"
+#include "hoNDArray_linalg.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDArray_reductions.h"
+
+#ifdef USE_OMP
+    #include "omp.h"
+#endif // USE_OMP
+
+#ifndef lapack_complex_float
+    #define lapack_complex_float  std::complex<float> 
+#endif // lapack_complex_float
+
+#ifndef lapack_complex_double
+    #define lapack_complex_double  std::complex<double> 
+#endif // #ifndef lapack_complex_double
+
+extern "C" void sgemm_(const char *transa, const char *transb, const lapack_int *m, const lapack_int *n, const lapack_int *k,
+            const float *alpha, const float *a, const lapack_int *lda, const float *b, const lapack_int *ldb,
+            const float *beta, float *c, const lapack_int *ldc);
+
+extern "C" void dgemm_(const char *transa, const char *transb, const lapack_int *m, const lapack_int *n, const lapack_int *k,
+            const double *alpha, const double *a, const lapack_int *lda, const double *b, const lapack_int *ldb,
+            const double *beta, double *c, const lapack_int *ldc);
+
+extern "C" void cgemm_(const char *transa, const char *transb, const lapack_int *m, const lapack_int *n, const lapack_int *k,
+                    const lapack_complex_float *alpha, const lapack_complex_float *a, const lapack_int *lda,
+                    const lapack_complex_float *b, const lapack_int *ldb, const lapack_complex_float *beta,
+                    lapack_complex_float *c, const lapack_int *ldc);
+
+extern "C" void zgemm_(const char *transa, const char *transb, const lapack_int *m, const lapack_int *n, const lapack_int *k,
+            const lapack_complex_double *alpha, const lapack_complex_double *a, const lapack_int *lda,
+            const lapack_complex_double *b, const lapack_int *ldb, const lapack_complex_double *beta,
+            lapack_complex_double *c, const lapack_int *ldc);
+
+extern "C" void ssyrk_( const char* uplo, const char *trans, const lapack_int *n, const lapack_int *k, const float *alpha, const float *a, const lapack_int *lda, const float *beta, float *c, const lapack_int *ldc);
+extern "C" void dsyrk_( const char* uplo, const char *trans, const lapack_int *n, const lapack_int *k, const double *alpha, const double *a, const lapack_int *lda, const double *beta, double *c, const lapack_int *ldc);
+extern "C" void csyrk_( const char* uplo, const char *trans, const lapack_int *n, const lapack_int *k, const lapack_complex_float *alpha, const lapack_complex_float *a, const lapack_int *lda, const lapack_complex_float *beta, lapack_complex_float *c, const lapack_int *ldc);
+extern "C" void zsyrk_( const char* uplo, const char *trans, const lapack_int *n, const lapack_int *k, const lapack_complex_double *alpha, const lapack_complex_double *a, const lapack_int *lda, const lapack_complex_double *beta, lapack_complex_double *c, const lapack_int *ldc);
+
+extern "C" void cherk_( const char* uplo, const char *trans, const lapack_int *n, const lapack_int *k, const lapack_complex_float *alpha, const lapack_complex_float *a, const lapack_int *lda, const lapack_complex_float *beta, lapack_complex_float *c, const lapack_int *ldc);
+extern "C" void zherk_( const char* uplo, const char *trans, const lapack_int *n, const lapack_int *k, const lapack_complex_double *alpha, const lapack_complex_double *a, const lapack_int *lda, const lapack_complex_double *beta, lapack_complex_double *c, const lapack_int *ldc);
+
+extern "C" void spotrf_( const char* uplo, const lapack_int* n, float* a, const lapack_int* lda, lapack_int* info );
+extern "C" void dpotrf_( const char* uplo, const lapack_int* n, double* a, const lapack_int* lda, lapack_int* info );
+extern "C" void cpotrf_( const char* uplo, const lapack_int* n, lapack_complex_float* a, const lapack_int* lda, lapack_int* info );
+extern "C" void zpotrf_( const char* uplo, const lapack_int* n, lapack_complex_double* a, const lapack_int* lda, lapack_int* info );
+
+extern "C" void ssyev_( const char* jobz, const char* uplo, const lapack_int* n, float* a,
+        const lapack_int* lda, float* w, float* work, const lapack_int* lwork,
+        lapack_int* info );
+
+extern "C" void dsyev_( const char* jobz, const char* uplo, const lapack_int* n, double* a,
+        const lapack_int* lda, double* w, double* work, const lapack_int* lwork,
+        lapack_int* info );
+
+extern "C" void cheev_( const char* jobz, const char* uplo, const lapack_int* n,
+        lapack_complex_float* a, const lapack_int* lda, float* w, lapack_complex_float* work,
+        const lapack_int* lwork, float* rwork, lapack_int* info );
+
+extern "C" void zheev_( const char* jobz, const char* uplo, const lapack_int* n,
+        lapack_complex_double* a, const lapack_int* lda, double* w,
+        lapack_complex_double* work, const lapack_int* lwork, double* rwork,
+        lapack_int* info );
+
+extern "C" void spotrf_( const char* uplo, const lapack_int* n, float* a, const lapack_int* lda,
+        lapack_int* info );
+
+extern "C" void spotri_( const char* uplo, const lapack_int* n, float* a, const lapack_int* lda,
+        lapack_int* info );
+
+extern "C" void dpotrf_( const char* uplo, const lapack_int* n, double* a,
+        const lapack_int* lda, lapack_int* info );
+
+extern "C" void dpotri_( const char* uplo, const lapack_int* n, double* a,
+        const lapack_int* lda, lapack_int* info );
+
+extern "C" void cpotrf_( const char* uplo, const lapack_int* n, lapack_complex_float* a,
+        const lapack_int* lda, lapack_int* info );
+
+extern "C" void cpotri_( const char* uplo, const lapack_int* n, lapack_complex_float* a,
+        const lapack_int* lda, lapack_int* info );
+
+extern "C" void zpotrf_( const char* uplo, const lapack_int* n, lapack_complex_double* a,
+        const lapack_int* lda, lapack_int* info );
+
+extern "C" void zpotri_( const char* uplo, const lapack_int* n, lapack_complex_double* a,
+        const lapack_int* lda, lapack_int* info );
+
+extern "C" void strtri_( const char* uplo, const char* diag, const lapack_int* n, float* a,
+        const lapack_int* lda, lapack_int* info );
+
+extern "C" void dtrtri_( const char* uplo, const char* diag, const lapack_int* n, double* a,
+        const lapack_int* lda, lapack_int* info );
+
+extern "C" void ctrtri_( const char* uplo, const char* diag, const lapack_int* n,
+        lapack_complex_float* a, const lapack_int* lda, lapack_int* info );
+
+extern "C" void ztrtri_( const char* uplo, const char* diag, const lapack_int* n,
+        lapack_complex_double* a, const lapack_int* lda, lapack_int* info );
+
+extern "C" void sposv_( const char* uplo, const lapack_int* n, const lapack_int* nrhs, float* a,
+        const lapack_int* lda, float* b, const lapack_int* ldb, lapack_int* info );
+
+extern "C" void dposv_( const char* uplo, const lapack_int* n, const lapack_int* nrhs,
+        double* a, const lapack_int* lda, double* b, const lapack_int* ldb,
+        lapack_int* info );
+
+extern "C" void cposv_( const char* uplo, const lapack_int* n, const lapack_int* nrhs,
+        lapack_complex_float* a, const lapack_int* lda, lapack_complex_float* b,
+        const lapack_int* ldb, lapack_int* info );
+
+extern "C" void zposv_( const char* uplo, const lapack_int* n, const lapack_int* nrhs,
+        lapack_complex_double* a, const lapack_int* lda, lapack_complex_double* b,
+        const lapack_int* ldb, lapack_int* info );
+
+extern "C" void sgesv_( const lapack_int* n, const lapack_int* nrhs, float* a,
+        const lapack_int* lda, lapack_int* ipiv, float* b, const lapack_int* ldb, lapack_int* info );
+
+extern "C" void dgesv_( const lapack_int* n, const lapack_int* nrhs, double* a,
+        const lapack_int* lda, lapack_int* ipiv, double* b, const lapack_int* ldb, lapack_int* info );
+
+extern "C" void cgesv_( const lapack_int* n, const lapack_int* nrhs, lapack_complex_float* a,
+        const lapack_int* lda, lapack_int* ipiv, lapack_complex_float* b, const lapack_int* ldb, lapack_int* info );
+
+extern "C" void zgesv_( const lapack_int* n, const lapack_int* nrhs, lapack_complex_double* a,
+        const lapack_int* lda, lapack_int* ipiv, lapack_complex_double* b, const lapack_int* ldb, lapack_int* info );
+
+extern "C" void ssysv_( const char* uplo, const lapack_int* n, const lapack_int* nrhs, float* a,
+        const lapack_int* lda, lapack_int* ipiv, float* b, const lapack_int* ldb, float* work, lapack_int* lwork, lapack_int* info );
+
+extern "C" void dsysv_( const char* uplo, const lapack_int* n, const lapack_int* nrhs, double* a,
+        const lapack_int* lda, lapack_int* ipiv, double* b, const lapack_int* ldb, double* work, lapack_int* lwork, lapack_int* info );
+
+extern "C" void chesv_( const char* uplo, const lapack_int* n, const lapack_int* nrhs, lapack_complex_float* a,
+        const lapack_int* lda, lapack_int* ipiv, lapack_complex_float* b, const lapack_int* ldb, lapack_complex_float* work, lapack_int* lwork, lapack_int* info );
+
+extern "C" void zhesv_( const char* uplo, const lapack_int* n, const lapack_int* nrhs, lapack_complex_double* a,
+        const lapack_int* lda, lapack_int* ipiv, lapack_complex_double* b, const lapack_int* ldb, lapack_complex_double* work, lapack_int* lwork,  lapack_int* info );
+
+extern "C" void sgetrf_( const lapack_int* m, const lapack_int* n, float* a, const lapack_int* lda,
+        lapack_int* ipiv, lapack_int* info );
+
+extern "C" void dgetrf_( const lapack_int* m, const lapack_int* n, double* a,
+        const lapack_int* lda, lapack_int* ipiv, lapack_int* info );
+
+extern "C" void cgetrf_( const lapack_int* m, const lapack_int* n, lapack_complex_float* a,
+        const lapack_int* lda, lapack_int* ipiv, lapack_int* info );
+
+extern "C" void zgetrf_( const lapack_int* m, const lapack_int* n, lapack_complex_double* a,
+        const lapack_int* lda, lapack_int* ipiv, lapack_int* info );
+
+extern "C" void sgetri_( const lapack_int* n, float* a, const lapack_int* lda,
+        const lapack_int* ipiv, float* work, const lapack_int* lwork,
+        lapack_int* info );
+
+extern "C" void dgetri_( const lapack_int* n, double* a, const lapack_int* lda,
+        const lapack_int* ipiv, double* work, const lapack_int* lwork,
+        lapack_int* info );
+
+extern "C" void cgetri_( const lapack_int* n, lapack_complex_float* a, const lapack_int* lda,
+        const lapack_int* ipiv, lapack_complex_float* work, const lapack_int* lwork,
+        lapack_int* info );
+
+extern "C" void zgetri_( const lapack_int* n, lapack_complex_double* a, const lapack_int* lda,
+        const lapack_int* ipiv, lapack_complex_double* work, const lapack_int* lwork,
+        lapack_int* info );
+
+namespace Gadgetron
+{
+
+// The following matrix computations call BLAS/LAPACK routines (provided by MKL or another LAPACK implementation)
+#if defined(USE_MKL) || defined(USE_LAPACK)
+
+void gemm(hoNDArray< std::complex<float> >& C, const hoNDArray< std::complex<float> >& A, const hoNDArray< std::complex<float> >& B)
+{
+    typedef std::complex<float> T;
+    try
+    {
+        char TA, TB;
+
+        GADGET_CHECK_THROW( (&C!=&A) && (&C!=&B) && (&A!=&B) );
+
+        lapack_int lda = (lapack_int)A.get_size(0);
+        lapack_int ldb = (lapack_int)B.get_size(0);
+        const T* pA = A.begin(); 
+        const T* pB = B.begin(); 
+
+        lapack_int M = (lapack_int)A.get_size(0);
+        lapack_int K = (lapack_int)A.get_size(1);
+
+        lapack_int K2 = (lapack_int)B.get_size(0);
+        lapack_int N = (lapack_int)B.get_size(1);
+
+        GADGET_CHECK_THROW(K==K2);
+        if ( (C.get_size(0)!=M) || (C.get_size(1)!=N) )
+        {
+            C.create(M, N);
+        }
+
+        T* pC = C.begin();
+        lapack_int ldc = (lapack_int)C.get_size(0);
+
+         std::complex<float>  alpha(1), beta(0);
+
+        TA = 'N';
+        TB = 'N';
+
+        cgemm_(&TA, &TB, &M, &N, &K, reinterpret_cast<lapack_complex_float*>(&alpha), reinterpret_cast<const lapack_complex_float*>(pA), &lda, reinterpret_cast<const lapack_complex_float*>(pB), &ldb, reinterpret_cast<lapack_complex_float*>(&beta), reinterpret_cast<lapack_complex_float*>(pC), &ldc);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in gemm(hoNDArray< std::complex<float> >& C, const hoNDArray< std::complex<float> >& A, const hoNDArray< std::complex<float> >& B) ...");
+    }
+}
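+
+// Illustrative use of the routine above (the matrix sizes are assumptions made for the example):
+// with A of size 4x3 and B of size 3x5, both column-major hoNDArray< std::complex<float> > objects,
+// gemm(C, A, B) resizes C to 4x5 if needed and computes C = A * B through cgemm_.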
+
+template<> EXPORTCPUCOREMATH 
+void gemm(hoNDArray<float>& C, const hoNDArray<float>& A, bool transA, const hoNDArray<float>& B, bool transB)
+{
+    try
+    {
+        typedef float T;
+
+        GADGET_CHECK_THROW( (&C!=&A) && (&C!=&B) && (&A!=&B) );
+
+        char TA, TB;
+
+        lapack_int lda = (lapack_int)A.get_size(0);
+        lapack_int ldb = (lapack_int)B.get_size(0);
+        const T* pA = A.begin(); 
+        const T* pB = B.begin(); 
+
+        lapack_int M = (lapack_int)A.get_size(0);
+        lapack_int K = (lapack_int)A.get_size(1);
+        if ( transA )
+        { 
+            M = (lapack_int)A.get_size(1);
+            K = (lapack_int)A.get_size(0);
+        }
+
+        lapack_int K2 = (lapack_int)B.get_size(0);
+        lapack_int N = (lapack_int)B.get_size(1);
+        if ( transB )
+        {
+            K2 = (lapack_int)B.get_size(1);
+            N = (lapack_int)B.get_size(0);
+        }
+
+        GADGET_CHECK_THROW(K==K2);
+        if ( (C.get_size(0)!=M) || (C.get_size(1)!=N) )
+        {
+            C.create(M, N);
+        }
+
+        T* pC = C.begin();
+        lapack_int ldc = (lapack_int)C.get_size(0);
+
+        float alpha(1), beta(0);
+
+        if ( transA )
+        {
+            TA = 'T';
+        }
+        else
+        {
+            TA = 'N';
+        }
+
+        if ( transB )
+        {
+            TB = 'T';
+        }
+        else
+        {
+            TB = 'N';
+        }
+
+        sgemm_(&TA, &TB, &M, &N, &K, &alpha, reinterpret_cast<const float*>(pA), &lda, reinterpret_cast<const float*>(pB), &ldb, &beta, reinterpret_cast<float*>(pC), &ldc);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in gemm(hoNDArray<float>& C, const hoNDArray<float>& A, bool transA, const hoNDArray<float>& B, bool transB) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH 
+void gemm(hoNDArray<double>& C, const hoNDArray<double>& A, bool transA, const hoNDArray<double>& B, bool transB)
+{
+    try
+    {
+        typedef double T;
+
+        GADGET_CHECK_THROW( (&C!=&A) && (&C!=&B) && (&A!=&B) );
+
+        char TA, TB;
+
+        lapack_int lda = (lapack_int)A.get_size(0);
+        lapack_int ldb = (lapack_int)B.get_size(0);
+        const T* pA = A.begin(); 
+        const T* pB = B.begin(); 
+
+        lapack_int M = (lapack_int)A.get_size(0);
+        lapack_int K = (lapack_int)A.get_size(1);
+        if ( transA )
+        { 
+            M = (lapack_int)A.get_size(1);
+            K = (lapack_int)A.get_size(0);
+        }
+
+        lapack_int K2 = (lapack_int)B.get_size(0);
+        lapack_int N = (lapack_int)B.get_size(1);
+        if ( transB )
+        {
+            K2 = (lapack_int)B.get_size(1);
+            N = (lapack_int)B.get_size(0);
+        }
+
+        GADGET_CHECK_THROW(K==K2);
+        if ( (C.get_size(0)!=M) || (C.get_size(1)!=N) )
+        {
+            C.create(M, N);
+        }
+
+        T* pC = C.begin();
+        lapack_int ldc = (lapack_int)C.get_size(0);
+
+        double alpha(1), beta(0);
+
+        if ( transA )
+        {
+            TA = 'T';
+        }
+        else
+        {
+            TA = 'N';
+        }
+
+        if ( transB )
+        {
+            TB = 'T';
+        }
+        else
+        {
+            TB = 'N';
+        }
+
+        dgemm_(&TA, &TB, &M, &N, &K, &alpha, reinterpret_cast<const double*>(pA), &lda, reinterpret_cast<const double*>(pB), &ldb, &beta, reinterpret_cast<double*>(pC), &ldc);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in gemm(hoNDArray<double>& C, const hoNDArray<double>& A, bool transA, const hoNDArray<double>& B, bool transB) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH 
+void gemm(hoNDArray< std::complex<float> >& C, const hoNDArray< std::complex<float> >& A, bool transA, const hoNDArray< std::complex<float> >& B, bool transB)
+{
+    try
+    {
+        typedef  std::complex<float>  T;
+
+        GADGET_CHECK_THROW( (&C!=&A) && (&C!=&B) && (&A!=&B) );
+
+        char TA, TB;
+
+        lapack_int lda = (lapack_int)A.get_size(0);
+        lapack_int ldb = (lapack_int)B.get_size(0);
+        const T* pA = A.begin(); 
+        const T* pB = B.begin(); 
+
+        lapack_int M = (lapack_int)A.get_size(0);
+        lapack_int K = (lapack_int)A.get_size(1);
+        if ( transA )
+        { 
+            M = (lapack_int)A.get_size(1);
+            K = (lapack_int)A.get_size(0);
+        }
+
+        lapack_int K2 = (lapack_int)B.get_size(0);
+        lapack_int N = (lapack_int)B.get_size(1);
+        if ( transB )
+        {
+            K2 = (lapack_int)B.get_size(1);
+            N = (lapack_int)B.get_size(0);
+        }
+
+        GADGET_CHECK_THROW(K==K2);
+        if ( (C.get_size(0)!=M) || (C.get_size(1)!=N) )
+        {
+            C.create(M, N);
+        }
+
+        T* pC = C.begin();
+        lapack_int ldc = (lapack_int)C.get_size(0);
+
+         std::complex<float>  alpha(1), beta(0);
+
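+        // for complex data, a requested transpose maps to the conjugate (Hermitian) transpose 'C'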
+        if ( transA )
+        {
+            TA = 'C';
+        }
+        else
+        {
+            TA = 'N';
+        }
+
+        if ( transB )
+        {
+            TB = 'C';
+        }
+        else
+        {
+            TB = 'N';
+        }
+
+        cgemm_(&TA, &TB, &M, &N, &K, reinterpret_cast<lapack_complex_float*>(&alpha), reinterpret_cast<const lapack_complex_float*>(pA), &lda, reinterpret_cast<const lapack_complex_float*>(pB), &ldb, reinterpret_cast<lapack_complex_float*>(&beta), reinterpret_cast<lapack_complex_float*>(pC), &ldc);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in gemm(hoNDArray< std::complex<float> >& C, const hoNDArray< std::complex<float> >& A, bool transA, const hoNDArray< std::complex<float> >& B, bool transB) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH 
+void gemm(hoNDArray< complext<float> >& C, const hoNDArray< complext<float> >& A, bool transA, const hoNDArray< complext<float> >& B, bool transB)
+{
+    try
+    {
+        typedef hoNDArray< std::complex<float> > ArrayType;
+        gemm( reinterpret_cast<ArrayType&>(C), reinterpret_cast<const ArrayType&>(A), transA, reinterpret_cast<const ArrayType&>(B), transB );
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in gemm(hoNDArray< complext<float> >& C, const hoNDArray< complext<float> >& A, bool transA, const hoNDArray< complext<float> >& B, bool transB) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH 
+void gemm(hoNDArray< std::complex<double> >& C, const hoNDArray< std::complex<double> >& A, bool transA, const hoNDArray< std::complex<double> >& B, bool transB)
+{
+    try
+    {
+        typedef  std::complex<double>  T;
+
+        GADGET_CHECK_THROW( (&C!=&A) && (&C!=&B) && (&A!=&B) );
+
+        char TA, TB;
+
+        lapack_int lda = (lapack_int)A.get_size(0);
+        lapack_int ldb = (lapack_int)B.get_size(0);
+        const T* pA = A.begin(); 
+        const T* pB = B.begin(); 
+
+        lapack_int M = (lapack_int)A.get_size(0);
+        lapack_int K = (lapack_int)A.get_size(1);
+        if ( transA )
+        { 
+            M = (lapack_int)A.get_size(1);
+            K = (lapack_int)A.get_size(0);
+        }
+
+        lapack_int K2 = (lapack_int)B.get_size(0);
+        lapack_int N = (lapack_int)B.get_size(1);
+        if ( transB )
+        {
+            K2 = (lapack_int)B.get_size(1);
+            N = (lapack_int)B.get_size(0);
+        }
+
+        GADGET_CHECK_THROW(K==K2);
+        if ( (C.get_size(0)!=M) || (C.get_size(1)!=N) )
+        {
+            C.create(M, N);
+        }
+
+        T* pC = C.begin();
+        lapack_int ldc = (lapack_int)C.get_size(0);
+
+         std::complex<double>  alpha(1), beta(0);
+
+        if ( transA )
+        {
+            TA = 'C';
+        }
+        else
+        {
+            TA = 'N';
+        }
+
+        if ( transB )
+        {
+            TB = 'C';
+        }
+        else
+        {
+            TB = 'N';
+        }
+
+        zgemm_(&TA, &TB, &M, &N, &K, reinterpret_cast<lapack_complex_double*>(&alpha), reinterpret_cast<const lapack_complex_double*>(pA), &lda, reinterpret_cast<const lapack_complex_double*>(pB), &ldb, reinterpret_cast<lapack_complex_double*>(&beta), reinterpret_cast<lapack_complex_double*>(pC), &ldc);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in gemm(hoNDArray< std::complex<double> >& C, const hoNDArray< std::complex<double> >& A, bool transA, const hoNDArray< std::complex<double> >& B, bool transB) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH 
+void gemm(hoNDArray< complext<double> >& C, const hoNDArray< complext<double> >& A, bool transA, const hoNDArray< complext<double> >& B, bool transB)
+{
+    try
+    {
+        typedef hoNDArray< std::complex<double> > ArrayType;
+        gemm( reinterpret_cast<ArrayType&>(C), reinterpret_cast<const ArrayType&>(A), transA, reinterpret_cast<const ArrayType&>(B), transB );
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in gemm(hoNDArray< complext<double> >& C, const hoNDArray< complext<double> >& A, bool transA, const hoNDArray< complext<double> >& B, bool transB) ...");
+    }
+}
+
+/// ------------------------------------------------------------------------------------
+
+template<> EXPORTCPUCOREMATH 
+void syrk(hoNDArray<float>& C, const hoNDArray<float>& A, char uplo, bool isATA)
+{
+    try
+    {
+        typedef float T;
+
+        GADGET_CHECK_THROW( (&A!=&C) );
+
+        char TA;
+
+        lapack_int lda = (lapack_int)A.get_size(0);
+        const T* pA = A.begin(); 
+
+        lapack_int M = (lapack_int)A.get_size(0);
+        lapack_int K = (lapack_int)A.get_size(1);
+        if ( isATA )
+        { 
+            M = (lapack_int)A.get_size(1);
+            K = (lapack_int)A.get_size(0);
+        }
+
+        if ( (C.get_size(0)!=M) || (C.get_size(1)!=M) )
+        {
+            C.create(M, M);
+        }
+
+        T* pC = C.begin();
+        lapack_int ldc = (lapack_int)C.get_size(0);
+
+        float alpha(1), beta(0);
+
+        if ( isATA )
+        {
+            TA = 'T';
+        }
+        else
+        {
+            TA = 'N';
+        }
+
+        ssyrk_(&uplo, &TA, &M, &K, &alpha, pA, &lda, &beta, pC, &ldc);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in syrk(hoNDArray<float>& C, const hoNDArray<float>& A, char uplo, bool isATA) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH 
+void syrk(hoNDArray<double>& C, const hoNDArray<double>& A, char uplo, bool isATA)
+{
+    try
+    {
+        typedef double T;
+
+        GADGET_CHECK_THROW( (&A!=&C) );
+
+        char TA;
+
+        lapack_int lda = (lapack_int)A.get_size(0);
+        const T* pA = A.begin(); 
+
+        lapack_int M = (lapack_int)A.get_size(0);
+        lapack_int K = (lapack_int)A.get_size(1);
+        if ( isATA )
+        { 
+            M = (lapack_int)A.get_size(1);
+            K = (lapack_int)A.get_size(0);
+        }
+
+        if ( (C.get_size(0)!=M) || (C.get_size(1)!=M) )
+        {
+            C.create(M, M);
+        }
+
+        T* pC = C.begin();
+        lapack_int ldc = (lapack_int)C.get_size(0);
+
+        double alpha(1), beta(0);
+
+        if ( isATA )
+        {
+            TA = 'T';
+        }
+        else
+        {
+            TA = 'N';
+        }
+
+        dsyrk_(&uplo, &TA, &M, &K, &alpha, pA, &lda, &beta, pC, &ldc);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in syrk(hoNDArray<double>& C, const hoNDArray<double>& A, char uplo, bool isATA) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH 
+void syrk(hoNDArray< std::complex<float> >& C, const hoNDArray< std::complex<float> >& A, char uplo, bool isATA)
+{
+    try
+    {
+        typedef  std::complex<float>  T;
+
+        GADGET_CHECK_THROW( (&A!=&C) );
+
+        char TA;
+
+        lapack_int lda = (lapack_int)A.get_size(0);
+        const T* pA = A.begin(); 
+
+        lapack_int N = (lapack_int)A.get_size(0);
+        lapack_int K = (lapack_int)A.get_size(1);
+        if ( isATA )
+        { 
+            N = (lapack_int)A.get_size(1);
+            K = (lapack_int)A.get_size(0);
+        }
+
+        GADGET_CHECK_THROW ( (C.get_size(0)==N) && (C.get_size(1)==N) );
+
+        T* pC = C.begin();
+        lapack_int ldc = (lapack_int)C.get_size(0);
+
+        lapack_complex_float alpha(1), beta(0);
+
+        if ( isATA )
+        {
+            TA = 'T';
+        }
+        else
+        {
+            TA = 'N';
+        }
+
+        csyrk_(&uplo, &TA, &N, &K, &alpha, pA, &lda, &beta, pC, &ldc);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in syrk(hoNDArray< std::complex<float> >& C, const hoNDArray< std::complex<float> >& A, char uplo, bool isATA) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH 
+void syrk(hoNDArray< complext<float> >& C, const hoNDArray< complext<float> >& A, char uplo, bool isATA)
+{
+    try
+    {
+        typedef  hoNDArray< std::complex<float> > ArrayType;
+        syrk( reinterpret_cast<ArrayType&>(C), reinterpret_cast<const ArrayType&>(A), uplo, isATA);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in syrk(hoNDArray< complext<float> >& C, const hoNDArray< complext<float> >& A, char uplo, bool isATA) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH 
+void syrk(hoNDArray< std::complex<double> >& C, const hoNDArray< std::complex<double> >& A, char uplo, bool isATA)
+{
+    try
+    {
+        typedef  std::complex<double>  T;
+
+        GADGET_CHECK_THROW( (&A!=&C) );
+
+        char TA;
+
+        lapack_int lda = (lapack_int)A.get_size(0);
+        const T* pA = A.begin(); 
+
+        lapack_int M = (lapack_int)A.get_size(0);
+        lapack_int K = (lapack_int)A.get_size(1);
+        if ( isATA )
+        { 
+            M = (lapack_int)A.get_size(1);
+            K = (lapack_int)A.get_size(0);
+        }
+
+        if ( (C.get_size(0)!=M) || (C.get_size(1)!=M) )
+        {
+            C.create(M, M);
+        }
+
+        T* pC = C.begin();
+        lapack_int ldc = (lapack_int)C.get_size(0);
+
+        lapack_complex_double alpha(1), beta(0);
+
+        if ( isATA )
+        {
+            TA = 'T';
+        }
+        else
+        {
+            TA = 'N';
+        }
+
+        zsyrk_(&uplo, &TA, &M, &K, &alpha, pA, &lda, &beta, pC, &ldc);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in syrk(hoNDArray< std::complex<double> >& C, const hoNDArray< std::complex<double> >& A, char uplo, bool isATA) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH 
+void syrk(hoNDArray< complext<double> >& C, const hoNDArray< complext<double> >& A, char uplo, bool isATA)
+{
+    try
+    {
+        typedef  hoNDArray< std::complex<double> > ArrayType;
+        syrk( reinterpret_cast<ArrayType&>(C), reinterpret_cast<const ArrayType&>(A), uplo, isATA);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in syrk(hoNDArray< complext<double> >& C, const hoNDArray< complext<double> >& A, char uplo, bool isATA) ...");
+    }
+}
+
+/// ------------------------------------------------------------------------------------
+
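+// for real-valued arrays the Hermitian rank-k update coincides with the symmetric one, so herk simply forwards to syrk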
+template<> EXPORTCPUCOREMATH 
+void herk(hoNDArray<float>& C, const hoNDArray<float>& A, char uplo, bool isAHA)
+{
+    syrk(C, A, uplo, isAHA);
+}
+
+template<> EXPORTCPUCOREMATH 
+void herk(hoNDArray<double>& C, const hoNDArray<double>& A, char uplo, bool isAHA)
+{
+    syrk(C, A, uplo, isAHA);
+}
+
+template<> EXPORTCPUCOREMATH 
+void herk(hoNDArray< std::complex<float> >& C, const hoNDArray< std::complex<float> >& A, char uplo, bool isAHA)
+{
+    try
+    {
+        typedef  std::complex<float>  T;
+
+        GADGET_CHECK_THROW( (&A!=&C) );
+
+        char TA;
+
+        lapack_int lda = (lapack_int)A.get_size(0);
+        const T* pA = A.begin(); 
+
+        lapack_int N = (lapack_int)A.get_size(0);
+        lapack_int K = (lapack_int)A.get_size(1);
+        if ( isAHA )
+        { 
+            N = (lapack_int)A.get_size(1);
+            K = (lapack_int)A.get_size(0);
+        }
+
+        if ( (C.get_size(0)!=N) || (C.get_size(1)!=N) )
+        {
+            C.create(N, N);
+        }
+
+        T* pC = C.begin();
+        lapack_int ldc = (lapack_int)C.get_size(0);
+
+        lapack_complex_float alpha(1), beta(0);
+
+        if ( isAHA )
+        {
+            TA = 'C';
+        }
+        else
+        {
+            TA = 'N';
+        }
+
+        cherk_(&uplo, &TA, &N, &K, &alpha, pA, &lda, &beta, pC, &ldc);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in herk(hoNDArray< std::complex<float> >& C, const hoNDArray< std::complex<float> >& A, char uplo, bool isAHA) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH 
+void herk(hoNDArray< complext<float> >& C, const hoNDArray< complext<float> >& A, char uplo, bool isATA)
+{
+    try
+    {
+        typedef  hoNDArray< std::complex<float> > ArrayType;
+        herk( reinterpret_cast<ArrayType&>(C), reinterpret_cast<const ArrayType&>(A), uplo, isATA);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in herk(hoNDArray< complext<float> >& C, const hoNDArray< complext<float> >& A, char uplo, bool isATA) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH 
+void herk(hoNDArray< std::complex<double> >& C, const hoNDArray< std::complex<double> >& A, char uplo, bool isAHA)
+{
+    try
+    {
+        typedef  std::complex<double>  T;
+
+        GADGET_CHECK_THROW( (&A!=&C) );
+
+        char TA;
+
+        lapack_int lda = (lapack_int)A.get_size(0);
+        const T* pA = A.begin(); 
+
+        lapack_int N = (lapack_int)A.get_size(0);
+        lapack_int K = (lapack_int)A.get_size(1);
+        if ( isAHA )
+        { 
+            N = (lapack_int)A.get_size(1);
+            K = (lapack_int)A.get_size(0);
+        }
+
+        if ( (C.get_size(0)!=N) || (C.get_size(1)!=N) )
+        {
+            C.create(N, N);
+        }
+
+        T* pC = C.begin();
+        lapack_int ldc = (lapack_int)C.get_size(0);
+
+        lapack_complex_double alpha(1), beta(0);
+
+        if ( isAHA )
+        {
+            TA = 'C';
+        }
+        else
+        {
+            TA = 'N';
+        }
+
+        zherk_(&uplo, &TA, &N, &K, &alpha, pA, &lda, &beta, pC, &ldc);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in herk(hoNDArray< std::complex<double> >& C, const hoNDArray< std::complex<double> >& A, char uplo, bool isAHA) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH 
+void herk(hoNDArray< complext<double> >& C, const hoNDArray< complext<double> >& A, char uplo, bool isATA)
+{
+    try
+    {
+        typedef  hoNDArray< std::complex<double> > ArrayType;
+        herk( reinterpret_cast<ArrayType&>(C), reinterpret_cast<const ArrayType&>(A), uplo, isATA);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in herk(hoNDArray< complext<double> >& C, const hoNDArray< complext<double> >& A, char uplo, bool isATA) ...");
+    }
+}
+
+/// ------------------------------------------------------------------------------------
+
+template<typename T> 
+void potrf(hoNDArray<T>& A, char uplo)
+{
+    try
+    {
+        if( A.get_number_of_elements()==0 ) return;
+        GADGET_CHECK_THROW(A.get_size(0)==A.get_size(1));
+
+        lapack_int info;
+        lapack_int n = (lapack_int)(A.get_size(0));
+        T* pA = A.begin();
+        lapack_int lda = (lapack_int)(A.get_size(0));
+
+        if ( typeid(T)==typeid(float) )
+        {
+            spotrf_(&uplo, &n, reinterpret_cast<float*>(pA), &lda, &info);
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            dpotrf_(&uplo, &n, reinterpret_cast<double*>(pA), &lda, &info);
+        }
+        else if ( (typeid(T)==typeid( std::complex<float> )) || (typeid(T)==typeid( complext<float> )) )
+        {
+            cpotrf_(&uplo, &n, reinterpret_cast<lapack_complex_float*>(pA), &lda, &info);
+        }
+        else if ( (typeid(T)==typeid( std::complex<double> )) || (typeid(T)==typeid( complext<double> )) )
+        {
+            zpotrf_(&uplo, &n, reinterpret_cast<lapack_complex_double*>(pA), &lda, &info);
+        }
+        else
+        {
+            GADGET_THROW("potrf : unsupported type ... ");
+        }
+
+        GADGET_CHECK_THROW(info==0);
+
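+        // the factorization only writes the requested triangle; zero the other triangle so A holds a clean triangular factor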
+        if ( uplo == 'U' )
+        {
+            // GADGET_CHECK_THROW(A.lowerTri(0));
+
+            size_t r, c;
+            for (c=0; c<n; c++)
+            {
+                for (r=c+1; r<n; r++)
+                {
+                    pA[r + c*n] = 0;
+                }
+            }
+        }
+        else
+        {
+            // GADGET_CHECK_THROW(A.upperTri(0));
+
+            size_t r, c;
+            for (r=0; r<n; r++)
+            {
+                for (c=r+1; c<n; c++)
+                {
+                    pA[r + c*n] = 0;
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in potrf(hoNDArray<T>& A, char uplo) ...");
+    }
+}
+
+template EXPORTCPUCOREMATH void potrf(hoNDArray<float>& A, char uplo);
+template EXPORTCPUCOREMATH void potrf(hoNDArray<double>& A, char uplo);
+template EXPORTCPUCOREMATH void potrf(hoNDArray< std::complex<float> >& A, char uplo);
+template EXPORTCPUCOREMATH void potrf(hoNDArray< complext<float> >& A, char uplo);
+template EXPORTCPUCOREMATH void potrf(hoNDArray< std::complex<double> >& A, char uplo);
+template EXPORTCPUCOREMATH void potrf(hoNDArray< complext<double> >& A, char uplo);
+
+/// ------------------------------------------------------------------------------------
+
+template<typename T> 
+void heev(hoNDArray<T>& A, hoNDArray<typename realType<T>::Type>& eigenValue)
+{
+    try
+    {
+        lapack_int M = (lapack_int)A.get_size(0);
+        GADGET_CHECK_THROW(A.get_size(1) == M);
+
+        if ( (eigenValue.get_size(0)!=M) || (eigenValue.get_size(1)!=1) )
+        {
+            eigenValue.create(M, 1);
+        }
+
+        lapack_int info;
+        char jobz = 'V';
+        char uplo = 'L';
+        T* pA = A.begin();
+        typename realType<T>::Type* pEV = eigenValue.begin();
+
+        //if ( typeid(T)==typeid(float) )
+        //{
+        //    info = LAPACKE_ssyev(LAPACK_COL_MAJOR, jobz, uplo, M, reinterpret_cast<float*>(pA), M, reinterpret_cast<float*>(pEV));
+        //}
+        //else if ( typeid(T)==typeid(double) )
+        //{
+        //    info = LAPACKE_dsyev(LAPACK_COL_MAJOR, jobz, uplo, M, reinterpret_cast<double*>(pA), M, reinterpret_cast<double*>(pEV));
+        //}
+        //else if ( (typeid(T)==typeid( std::complex<float> )) || (typeid(T)==typeid( complext<float> )) )
+        //{
+        //    info = LAPACKE_cheev(LAPACK_COL_MAJOR, jobz, uplo, M, reinterpret_cast<lapack_complex_float*>(pA), M, reinterpret_cast<float*>(pEV));
+        //}
+        //else if ( (typeid(T)==typeid( std::complex<float> )) || (typeid(T)==typeid( complext<double> )) )
+        //{
+        //    info = LAPACKE_zheev(LAPACK_COL_MAJOR, jobz, uplo, M, reinterpret_cast<lapack_complex_double*>(pA), M, reinterpret_cast<double*>(pEV));
+        //}
+        //else
+        //{
+        //    GADGET_THROW("heev : unsupported type " << typeid(T).name());
+        //}
+
+        lapack_int lwork;
+        lwork = M*M;
+
+        if ( typeid(T)==typeid(float) )
+        {
+            hoNDArray<float> work(M, M);
+            ssyev_(&jobz, &uplo, &M, reinterpret_cast<float*>(pA), &M, reinterpret_cast<float*>(pEV), work.begin(), &lwork, &info);
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            hoNDArray<double> work(M, M);
+            dsyev_(&jobz, &uplo, &M, reinterpret_cast<double*>(pA), &M, reinterpret_cast<double*>(pEV), work.begin(), &lwork, &info);
+        }
+        else if ( (typeid(T)==typeid( std::complex<float> )) || (typeid(T)==typeid( complext<float> )) )
+        {
+            hoNDArray< std::complex<float> > work(M, M);
+            hoNDArray<float> rwork(3*M);
+            cheev_(&jobz, &uplo, &M, reinterpret_cast<lapack_complex_float*>(pA), &M, reinterpret_cast<float*>(pEV), reinterpret_cast<lapack_complex_float*>(work.begin()), &lwork, rwork.begin(), &info);
+        }
+        else if ( (typeid(T)==typeid( std::complex<double> )) || (typeid(T)==typeid( complext<double> )) )
+        {
+            hoNDArray< std::complex<double> > work(M, M);
+            hoNDArray<double> rwork(3*M);
+            zheev_(&jobz, &uplo, &M, reinterpret_cast<lapack_complex_double*>(pA), &M, reinterpret_cast<double*>(pEV), reinterpret_cast<lapack_complex_double*>(work.begin()), &lwork, rwork.begin(), &info);
+        }
+        else
+        {
+            GADGET_THROW("heev : unsupported type ... ");
+        }
+
+        GADGET_CHECK_THROW(info==0);
+    }
+    catch (...)
+    {
+        GADGET_THROW("Errors in heev(hoNDArray<T>& A, hoNDArray<typename realType<T>::Type>& eigenValue) ... ");
+    }
+}
+
+template EXPORTCPUCOREMATH void heev(hoNDArray<float>& A, hoNDArray<float>& eigenValue);
+template EXPORTCPUCOREMATH void heev(hoNDArray<double>& A, hoNDArray<double>& eigenValue);
+template EXPORTCPUCOREMATH void heev(hoNDArray< std::complex<float> >& A, hoNDArray<float>& eigenValue);
+template EXPORTCPUCOREMATH void heev(hoNDArray< complext<float> >& A, hoNDArray<float>& eigenValue);
+template EXPORTCPUCOREMATH void heev(hoNDArray< std::complex<double> >& A, hoNDArray<double>& eigenValue);
+template EXPORTCPUCOREMATH void heev(hoNDArray< complext<double> >& A, hoNDArray<double>& eigenValue);
+
+template<typename T> 
+void heev(hoNDArray< std::complex<T> >& A, hoNDArray< std::complex<T> >& eigenValue)
+{
+    try
+    {
+        long long M = (long long)A.get_size(0);
+        GADGET_CHECK_THROW(A.get_size(1) == M);
+
+        if ( (eigenValue.get_size(0)!=M) || (eigenValue.get_size(1)!=1) )
+        {
+            eigenValue.create(M, 1);
+        }
+
+        hoNDArray<typename realType<T>::Type> D(M, 1);
+        heev(A, D);
+        eigenValue.copyFrom(D);
+    }
+    catch (...)
+    {
+        GADGET_THROW("Errors in heev(hoNDArray< std::complex<T> >& A, hoNDArray< std::complex<T> >& eigenValue) ... ");
+    }
+}
+
+template EXPORTCPUCOREMATH void heev(hoNDArray< std::complex<float> >& A, hoNDArray< std::complex<float> >& eigenValue);
+template EXPORTCPUCOREMATH void heev(hoNDArray< std::complex<double> >& A, hoNDArray< std::complex<double> >& eigenValue);
+
+/// ------------------------------------------------------------------------------------
+
+template<typename T> 
+void potri(hoNDArray<T>& A)
+{
+    try
+    {
+        if( A.get_number_of_elements()==0 ) return;
+        GADGET_CHECK_THROW(A.get_size(0)==A.get_size(1));
+
+        lapack_int info;
+        char uplo = 'L';
+        lapack_int n = (lapack_int)A.get_size(0);
+        T* pA = A.begin();
+        lapack_int lda = (lapack_int)A.get_size(0);
+
+        //if ( typeid(T)==typeid(float) )
+        //{
+        //    info = LAPACKE_spotrf(LAPACK_COL_MAJOR, uplo, n, reinterpret_cast<float*>(pA), lda);
+        //    GADGET_CHECK_THROW(info==0);
+
+        //    info = LAPACKE_spotri(LAPACK_COL_MAJOR, uplo, n, reinterpret_cast<float*>(pA), lda);
+        //    GADGET_CHECK_THROW(info==0);
+        //}
+        //else if ( typeid(T)==typeid(double) )
+        //{
+        //    info = LAPACKE_dpotrf(LAPACK_COL_MAJOR, uplo, n, reinterpret_cast<double*>(pA), lda);
+        //    GADGET_CHECK_THROW(info==0);
+
+        //    info = LAPACKE_dpotri(LAPACK_COL_MAJOR, uplo, n, reinterpret_cast<double*>(pA), lda);
+        //    GADGET_CHECK_THROW(info==0);
+        //}
+        //else if ( (typeid(T)==typeid( std::complex<float> )) || (typeid(T)==typeid( complext<float> )) )
+        //{
+        //    info = LAPACKE_cpotrf(LAPACK_COL_MAJOR, uplo, n, reinterpret_cast<lapack_complex_float*>(pA), lda);
+        //    GADGET_CHECK_THROW(info==0);
+
+        //    info = LAPACKE_cpotri(LAPACK_COL_MAJOR, uplo, n, reinterpret_cast<lapack_complex_float*>(pA), lda);
+        //    GADGET_CHECK_THROW(info==0);
+        //}
+        //else if ( (typeid(T)==typeid( std::complex<float> )) || (typeid(T)==typeid( complext<double> )) )
+        //{
+        //    info = LAPACKE_zpotrf(LAPACK_COL_MAJOR, uplo, n, reinterpret_cast<lapack_complex_double*>(pA), lda);
+        //    GADGET_CHECK_THROW(info==0);
+
+        //    info = LAPACKE_zpotri(LAPACK_COL_MAJOR, uplo, n, reinterpret_cast<lapack_complex_double*>(pA), lda);
+        //    GADGET_CHECK_THROW(info==0);
+        //}
+        //else
+        //{
+        //    GADGET_THROW("potri : unsupported type " << typeid(T).name());
+        //}
+
+        if ( typeid(T)==typeid(float) )
+        {
+            spotrf_(&uplo, &n, reinterpret_cast<float*>(pA), &lda, &info);
+            GADGET_CHECK_THROW(info==0);
+
+            spotri_(&uplo, &n, reinterpret_cast<float*>(pA), &lda, &info);
+            GADGET_CHECK_THROW(info==0);
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            dpotrf_(&uplo, &n, reinterpret_cast<double*>(pA), &lda, &info);
+            GADGET_CHECK_THROW(info==0);
+
+            dpotri_(&uplo, &n, reinterpret_cast<double*>(pA), &lda, &info);
+            GADGET_CHECK_THROW(info==0);
+        }
+        else if ( (typeid(T)==typeid( std::complex<float> )) || (typeid(T)==typeid( complext<float> )) )
+        {
+            cpotrf_(&uplo, &n, reinterpret_cast<lapack_complex_float*>(pA), &lda, &info);
+            GADGET_CHECK_THROW(info==0);
+
+            cpotri_(&uplo, &n, reinterpret_cast<lapack_complex_float*>(pA), &lda, &info);
+            GADGET_CHECK_THROW(info==0);
+        }
+        else if ( (typeid(T)==typeid( std::complex<double> )) || (typeid(T)==typeid( complext<double> )) )
+        {
+            zpotrf_(&uplo, &n, reinterpret_cast<lapack_complex_double*>(pA), &lda, &info);
+            GADGET_CHECK_THROW(info==0);
+
+            zpotri_(&uplo, &n, reinterpret_cast<lapack_complex_double*>(pA), &lda, &info);
+            GADGET_CHECK_THROW(info==0);
+        }
+        else
+        {
+            GADGET_THROW("potri : unsupported type ... ");
+        }
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in potri(hoNDArray<T>& A) ...");
+    }
+}
+
+template EXPORTCPUCOREMATH void potri(hoNDArray<float>& A);
+template EXPORTCPUCOREMATH void potri(hoNDArray<double>& A);
+template EXPORTCPUCOREMATH void potri(hoNDArray< std::complex<float> >& A);
+template EXPORTCPUCOREMATH void potri(hoNDArray< complext<float> >& A);
+template EXPORTCPUCOREMATH void potri(hoNDArray< std::complex<double> >& A);
+template EXPORTCPUCOREMATH void potri(hoNDArray< complext<double> >& A);
+
+/// ------------------------------------------------------------------------------------
+
+template<typename T> 
+void trtri(hoNDArray<T>& A, char uplo)
+{
+    try
+    {
+        if( A.get_number_of_elements()==0 ) return;
+        GADGET_CHECK_THROW(A.get_size(0)==A.get_size(1));
+
+        lapack_int info;
+        char diag = 'N';
+        lapack_int n = (lapack_int)A.get_size(0);
+        T* pA = A.begin();
+        lapack_int lda = (lapack_int)A.get_size(0);
+
+        /*if ( typeid(T)==typeid(float) )
+        {
+            info = LAPACKE_strtri(LAPACK_COL_MAJOR, uplo, diag, n, reinterpret_cast<float*>(pA), lda);
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            info = LAPACKE_dtrtri(LAPACK_COL_MAJOR, uplo, diag, n, reinterpret_cast<double*>(pA), lda);
+        }
+        else if ( (typeid(T)==typeid( std::complex<float> )) || (typeid(T)==typeid( complext<float> )) )
+        {
+            info = LAPACKE_ctrtri(LAPACK_COL_MAJOR, uplo, diag, n, reinterpret_cast<lapack_complex_float*>(pA), lda);
+        }
+        else if ( (typeid(T)==typeid( std::complex<float> )) || (typeid(T)==typeid( complext<double> )) )
+        {
+            info = LAPACKE_ztrtri(LAPACK_COL_MAJOR, uplo, diag, n, reinterpret_cast<lapack_complex_double*>(pA), lda);
+        }
+        else
+        {
+            GADGET_THROW("trtri : unsupported type " << typeid(T).name());
+        }*/
+
+        if ( typeid(T)==typeid(float) )
+        {
+            strtri_(&uplo, &diag, &n, reinterpret_cast<float*>(pA), &lda, &info);
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            dtrtri_(&uplo, &diag, &n, reinterpret_cast<double*>(pA), &lda, &info);
+        }
+        else if ( (typeid(T)==typeid( std::complex<float> )) || (typeid(T)==typeid( complext<float> )) )
+        {
+            ctrtri_(&uplo, &diag, &n, reinterpret_cast<lapack_complex_float*>(pA), &lda, &info);
+        }
+        else if ( (typeid(T)==typeid( std::complex<double> )) || (typeid(T)==typeid( complext<double> )) )
+        {
+            ztrtri_(&uplo, &diag, &n, reinterpret_cast<lapack_complex_double*>(pA), &lda, &info);
+        }
+        else
+        {
+            GADGET_THROW("trtri : unsupported type ... ");
+        }
+
+        GADGET_CHECK_THROW(info==0);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in trtri(hoNDArray<T>& A, char uplo) ...");
+    }
+}
+
+template EXPORTCPUCOREMATH void trtri(hoNDArray<float>& A, char uplo);
+template EXPORTCPUCOREMATH void trtri(hoNDArray<double>& A, char uplo);
+template EXPORTCPUCOREMATH void trtri(hoNDArray< std::complex<float> >& A, char uplo);
+template EXPORTCPUCOREMATH void trtri(hoNDArray< complext<float> >& A, char uplo);
+template EXPORTCPUCOREMATH void trtri(hoNDArray< std::complex<double> >& A, char uplo);
+template EXPORTCPUCOREMATH void trtri(hoNDArray< complext<double> >& A, char uplo);
+
+/// ------------------------------------------------------------------------------------
+
+template<typename T>
+void posv(hoNDArray<T>& A, hoNDArray<T>& b)
+{
+    try
+    {
+        if( A.get_number_of_elements()==0 ) return;
+        if( b.get_number_of_elements()==0 ) return;
+        GADGET_CHECK_THROW(A.get_size(0)==b.get_size(0));
+
+        lapack_int info;
+        char uplo = 'L';
+        lapack_int n = (lapack_int)A.get_size(0);
+        lapack_int nrhs = (lapack_int)b.get_size(1);
+        T* pA = A.begin();
+        lapack_int lda = (lapack_int)A.get_size(0);
+        T* pB = b.begin();
+        lapack_int ldb = (lapack_int)b.get_size(0);
+
+        /*if ( typeid(T)==typeid(float) )
+        {
+            info = LAPACKE_sposv(LAPACK_COL_MAJOR, uplo, n, nrhs, reinterpret_cast<float*>(pA), lda, reinterpret_cast<float*>(pB), ldb);
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            info = LAPACKE_dposv(LAPACK_COL_MAJOR, uplo, n, nrhs, reinterpret_cast<double*>(pA), lda, reinterpret_cast<double*>(pB), ldb);
+        }
+        else if ( (typeid(T)==typeid( std::complex<float> )) || (typeid(T)==typeid( complext<float> )) )
+        {
+            info = LAPACKE_cposv(LAPACK_COL_MAJOR, uplo, n, nrhs, reinterpret_cast<lapack_complex_float*>(pA), lda, reinterpret_cast<lapack_complex_float*>(pB), ldb);
+        }
+        else if ( (typeid(T)==typeid( std::complex<float> )) || (typeid(T)==typeid( complext<double> )) )
+        {
+            info = LAPACKE_zposv(LAPACK_COL_MAJOR, uplo, n, nrhs, reinterpret_cast<lapack_complex_double*>(pA), lda, reinterpret_cast<lapack_complex_double*>(pB), ldb);
+        }
+        else
+        {
+            GADGET_THROW("posv : unsupported type ... ");
+        }*/
+
+        /*
+        We are switching off OpenMP threading before this call. There seems to be a bad interaction between OpenMP, CUDA, and BLAS.
+        This is a temporary fix that we should keep an eye on (a small sketch of this save/restore pattern follows the posv instantiations below).
+        */
+
+#ifdef USE_OMP
+        int num_threads = omp_get_num_threads();
+        omp_set_num_threads(1);
+#endif //USE_OMP
+
+        if ( typeid(T)==typeid(float) )
+        {
+            sposv_(&uplo, &n, &nrhs, reinterpret_cast<float*>(pA), &lda, reinterpret_cast<float*>(pB), &ldb, &info);
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            dposv_(&uplo, &n, &nrhs, reinterpret_cast<double*>(pA), &lda, reinterpret_cast<double*>(pB), &ldb, &info);
+        }
+        else if ( (typeid(T)==typeid( std::complex<float> )) || (typeid(T)==typeid( complext<float> )) )
+        {
+            cposv_(&uplo, &n, &nrhs, reinterpret_cast<lapack_complex_float*>(pA), &lda, reinterpret_cast<lapack_complex_float*>(pB), &ldb, &info);
+        }
+        else if ( (typeid(T)==typeid( std::complex<double> )) || (typeid(T)==typeid( complext<double> )) )
+        {
+            zposv_(&uplo, &n, &nrhs, reinterpret_cast<lapack_complex_double*>(pA), &lda, reinterpret_cast<lapack_complex_double*>(pB), &ldb, &info);
+        }
+        else
+        {
+#ifdef USE_OMP
+            omp_set_num_threads(num_threads);
+#endif //USE_OMP
+            GADGET_THROW("posv : unsupported type ... ");
+        }
+
+#ifdef USE_OMP
+        omp_set_num_threads(num_threads);
+#endif //USE_OMP
+
+        GADGET_CHECK_THROW(info==0);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in posv(hoNDArray<T>& A, hoNDArray<T>& b) ...");
+    }
+}
+
+template EXPORTCPUCOREMATH void posv(hoNDArray<float>& A, hoNDArray<float>& b);
+template EXPORTCPUCOREMATH void posv(hoNDArray<double>& A, hoNDArray<double>& b);
+template EXPORTCPUCOREMATH void posv(hoNDArray< std::complex<float> >& A, hoNDArray< std::complex<float> >& b);
+template EXPORTCPUCOREMATH void posv(hoNDArray< complext<float> >& A, hoNDArray< complext<float> >& b);
+template EXPORTCPUCOREMATH void posv(hoNDArray< std::complex<double> >& A, hoNDArray< std::complex<double> >& b);
+template EXPORTCPUCOREMATH void posv(hoNDArray< complext<double> >& A, hoNDArray< complext<double> >& b);
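As an aside on the OpenMP workaround in posv above: the explicit save/restore of the thread count can also be expressed as a small RAII guard, so the count is restored on every exit path including the throwing branch. A minimal sketch under the same USE_OMP guard; the class name is hypothetical and the snippet is an illustration rather than part of the upstream patch:

    #ifdef USE_OMP
    #include <omp.h>

    // restores the configured OpenMP thread count when the guard leaves scope
    class OmpSingleThreadScope
    {
    public:
        OmpSingleThreadScope() : saved_(omp_get_max_threads()) { omp_set_num_threads(1); }
        ~OmpSingleThreadScope() { omp_set_num_threads(saved_); }
    private:
        int saved_;
    };
    #endif // USE_OMP

Placed at the top of the solver body, a single "OmpSingleThreadScope guard;" would replace both omp_set_num_threads calls.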
+
+/// ------------------------------------------------------------------------------------
+
+template<> EXPORTCPUCOREMATH
+void hesv(hoNDArray< float >& A, hoNDArray< float >& b)
+{
+    typedef float T;
+    try
+    {
+        if( A.get_number_of_elements()==0 ) return;
+        if( b.get_number_of_elements()==0 ) return;
+        GADGET_CHECK_THROW(A.get_size(0)==b.get_size(0));
+
+        lapack_int info(0);
+        char uplo = 'L';
+        lapack_int n = (lapack_int)A.get_size(0);
+        lapack_int nrhs = (lapack_int)b.get_size(1);
+        T* pA = A.begin();
+        lapack_int lda = (lapack_int)A.get_size(0);
+        T* pB = b.begin();
+        lapack_int ldb = (lapack_int)b.get_size(0);
+
+        hoNDArray<lapack_int> ipiv_array(n);
+        Gadgetron::clear(ipiv_array);
+        lapack_int* ipiv = ipiv_array.begin();
+
+        lapack_int lwork(n*n);
+        hoNDArray<T> work_array(lwork);
+        Gadgetron::clear(work_array);
+        T* work = work_array.begin();
+
+        ssysv_(&uplo, &n, &nrhs, reinterpret_cast<float*>(pA), &lda, ipiv, reinterpret_cast<float*>(pB), &ldb, reinterpret_cast<float*>(work), &lwork, &info);
+
+        GADGET_CHECK_THROW(info==0);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in hesv(hoNDArray< float >& A, hoNDArray< float >& b) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH
+void hesv(hoNDArray< double >& A, hoNDArray< double >& b)
+{
+    typedef double T;
+    try
+    {
+        if( A.get_number_of_elements()==0 ) return;
+        if( b.get_number_of_elements()==0 ) return;
+        GADGET_CHECK_THROW(A.get_size(0)==b.get_size(0));
+
+        lapack_int info(0);
+        char uplo = 'L';
+        lapack_int n = (lapack_int)A.get_size(0);
+        lapack_int nrhs = (lapack_int)b.get_size(1);
+        T* pA = A.begin();
+        lapack_int lda = (lapack_int)A.get_size(0);
+        T* pB = b.begin();
+        lapack_int ldb = (lapack_int)b.get_size(0);
+
+        hoNDArray<lapack_int> ipiv_array(n);
+        Gadgetron::clear(ipiv_array);
+        lapack_int* ipiv = ipiv_array.begin();
+
+        lapack_int lwork(n*n);
+        hoNDArray<T> work_array(lwork);
+        Gadgetron::clear(work_array);
+        T* work = work_array.begin();
+
+        dsysv_(&uplo, &n, &nrhs, reinterpret_cast<double*>(pA), &lda, ipiv, reinterpret_cast<double*>(pB), &ldb, reinterpret_cast<double*>(work), &lwork, &info);
+
+        GADGET_CHECK_THROW(info==0);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in hesv(hoNDArray< double >& A, hoNDArray< double >& b) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH
+void hesv(hoNDArray< std::complex<float> >& A, hoNDArray< std::complex<float> >& b)
+{
+    typedef std::complex<float> T;
+    try
+    {
+        if( A.get_number_of_elements()==0 ) return;
+        if( b.get_number_of_elements()==0 ) return;
+        GADGET_CHECK_THROW(A.get_size(0)==b.get_size(0));
+
+        lapack_int info(0);
+        char uplo = 'L';
+        lapack_int n = (lapack_int)A.get_size(0);
+        lapack_int nrhs = (lapack_int)b.get_size(1);
+        T* pA = A.begin();
+        lapack_int lda = (lapack_int)A.get_size(0);
+        T* pB = b.begin();
+        lapack_int ldb = (lapack_int)b.get_size(0);
+
+        hoNDArray<lapack_int> ipiv_array(n);
+        Gadgetron::clear(ipiv_array);
+        lapack_int* ipiv = ipiv_array.begin();
+
+        lapack_int lwork(n*n);
+        hoNDArray<T> work_array(lwork);
+        Gadgetron::clear(work_array);
+        T* work = work_array.begin();
+
+        chesv_(&uplo, &n, &nrhs, reinterpret_cast<lapack_complex_float*>(pA), &lda, ipiv, reinterpret_cast<lapack_complex_float*>(pB), &ldb, reinterpret_cast<lapack_complex_float*>(work), &lwork, &info);
+
+        GADGET_CHECK_THROW(info==0);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in hesv(hoNDArray< std::complex<float> >& A, hoNDArray< std::complex<float> >& b) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH
+void hesv(hoNDArray< complext<float> >& A, hoNDArray< complext<float> >& b)
+{
+    typedef hoNDArray< std::complex<float> > ArrayType;
+    try
+    {
+        hesv( reinterpret_cast<ArrayType&>(A), reinterpret_cast<ArrayType&>(b) );
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in hesv(hoNDArray< complext<float> >& A, hoNDArray< complext<float> >& b) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH
+void hesv(hoNDArray< std::complex<double> >& A, hoNDArray< std::complex<double> >& b)
+{
+    typedef std::complex<double> T;
+    try
+    {
+        if( A.get_number_of_elements()==0 ) return;
+        if( b.get_number_of_elements()==0 ) return;
+        GADGET_CHECK_THROW(A.get_size(0)==b.get_size(0));
+
+        lapack_int info(0);
+        char uplo = 'L';
+        lapack_int n = (lapack_int)A.get_size(0);
+        lapack_int nrhs = (lapack_int)b.get_size(1);
+        T* pA = A.begin();
+        lapack_int lda = (lapack_int)A.get_size(0);
+        T* pB = b.begin();
+        lapack_int ldb = (lapack_int)b.get_size(0);
+
+        hoNDArray<lapack_int> ipiv_array(n);
+        Gadgetron::clear(ipiv_array);
+        lapack_int* ipiv = ipiv_array.begin();
+
+        lapack_int lwork(n*n);
+        hoNDArray<T> work_array(lwork);
+        Gadgetron::clear(work_array);
+        T* work = work_array.begin();
+
+        zhesv_(&uplo, &n, &nrhs, reinterpret_cast<lapack_complex_double*>(pA), &lda, ipiv, reinterpret_cast<lapack_complex_double*>(pB), &ldb, reinterpret_cast<lapack_complex_double*>(work), &lwork, &info);
+
+        GADGET_CHECK_THROW(info==0);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in hesv(hoNDArray< std::complex<double> >& A, hoNDArray< std::complex<double> >& b) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH
+void hesv(hoNDArray< complext<double> >& A, hoNDArray< complext<double> >& b)
+{
+    typedef hoNDArray< std::complex<double> > ArrayType;
+    try
+    {
+        hesv( reinterpret_cast<ArrayType&>(A), reinterpret_cast<ArrayType&>(b) );
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in hesv(hoNDArray< complext<double> >& A, hoNDArray< complext<double> >& b) ...");
+    }
+}
+
+/// ------------------------------------------------------------------------------------
+
+template<> EXPORTCPUCOREMATH
+void gesv(hoNDArray<float>& A, hoNDArray<float>& b)
+{
+    typedef float T;
+
+    try
+    {
+        if( A.get_number_of_elements()==0 ) return;
+        if( b.get_number_of_elements()==0 ) return;
+        GADGET_CHECK_THROW(A.get_size(0)==b.get_size(0));
+
+        lapack_int info(0);
+        lapack_int n = (lapack_int)A.get_size(0);
+        lapack_int nrhs = (lapack_int)b.get_size(1);
+        T* pA = A.begin();
+        lapack_int lda = (lapack_int)A.get_size(0);
+        T* pB = b.begin();
+        lapack_int ldb = (lapack_int)b.get_size(0);
+
+        hoNDArray<lapack_int> work(n);
+        Gadgetron::clear(work);
+        lapack_int* ipiv = work.begin();
+
+        sgesv_(&n, &nrhs, reinterpret_cast<float*>(pA), &lda, ipiv, reinterpret_cast<float*>(pB), &ldb, &info);
+
+        GADGET_CHECK_THROW(info==0);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in gesv(hoNDArray<float>& A, hoNDArray<float>& b) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH
+void gesv(hoNDArray<double>& A, hoNDArray<double>& b)
+{
+    typedef double T;
+
+    try
+    {
+        if( A.get_number_of_elements()==0 ) return;
+        if( b.get_number_of_elements()==0 ) return;
+        GADGET_CHECK_THROW(A.get_size(0)==b.get_size(0));
+
+        lapack_int info(0);
+        lapack_int n = (lapack_int)A.get_size(0);
+        lapack_int nrhs = (lapack_int)b.get_size(1);
+        T* pA = A.begin();
+        lapack_int lda = (lapack_int)A.get_size(0);
+        T* pB = b.begin();
+        lapack_int ldb = (lapack_int)b.get_size(0);
+
+        hoNDArray<lapack_int> work(n);
+        Gadgetron::clear(work);
+        lapack_int* ipiv = work.begin();
+
+        dgesv_(&n, &nrhs, reinterpret_cast<double*>(pA), &lda, ipiv, reinterpret_cast<double*>(pB), &ldb, &info);
+
+        GADGET_CHECK_THROW(info==0);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in gesv(hoNDArray<double>& A, hoNDArray<double>& b) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH
+void gesv(hoNDArray< std::complex<float> >& A, hoNDArray< std::complex<float> >& b)
+{
+    typedef std::complex<float> T;
+    try
+    {
+        if( A.get_number_of_elements()==0 ) return;
+        if( b.get_number_of_elements()==0 ) return;
+        GADGET_CHECK_THROW(A.get_size(0)==b.get_size(0));
+
+        lapack_int info(0);
+        lapack_int n = (lapack_int)A.get_size(0);
+        lapack_int nrhs = (lapack_int)b.get_size(1);
+        T* pA = A.begin();
+        lapack_int lda = (lapack_int)A.get_size(0);
+        T* pB = b.begin();
+        lapack_int ldb = (lapack_int)b.get_size(0);
+
+        hoNDArray<lapack_int> work(n);
+        Gadgetron::clear(work);
+        lapack_int* ipiv = work.begin();
+
+        cgesv_(&n, &nrhs, reinterpret_cast<lapack_complex_float*>(pA), &lda, ipiv, reinterpret_cast<lapack_complex_float*>(pB), &ldb, &info);
+
+        GADGET_CHECK_THROW(info==0);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in gesv(hoNDArray< std::complex<float> >& A, hoNDArray< std::complex<float> >& b) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH
+void gesv(hoNDArray< complext<float> >& A, hoNDArray< complext<float> >& b)
+{
+    typedef hoNDArray< std::complex<float> > ArrayType;
+    try
+    {
+        gesv( reinterpret_cast<ArrayType&>(A), reinterpret_cast<ArrayType&>(b) );
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in gesv(hoNDArray< complext<float> >& A, hoNDArray< complext<float> >& b) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH
+void gesv(hoNDArray< std::complex<double> >& A, hoNDArray< std::complex<double> >& b)
+{
+    typedef std::complex<double> T;
+    try
+    {
+        if( A.get_number_of_elements()==0 ) return;
+        if( b.get_number_of_elements()==0 ) return;
+        GADGET_CHECK_THROW(A.get_size(0)==b.get_size(0));
+
+        lapack_int info(0);
+        lapack_int n = (lapack_int)A.get_size(0);
+        lapack_int nrhs = (lapack_int)b.get_size(1);
+        T* pA = A.begin();
+        lapack_int lda = (lapack_int)A.get_size(0);
+        T* pB = b.begin();
+        lapack_int ldb = (lapack_int)b.get_size(0);
+
+        hoNDArray<lapack_int> work(n);
+        Gadgetron::clear(work);
+        lapack_int* ipiv = work.begin();
+
+        zgesv_(&n, &nrhs, reinterpret_cast<lapack_complex_double*>(pA), &lda, ipiv, reinterpret_cast<lapack_complex_double*>(pB), &ldb, &info);
+
+        GADGET_CHECK_THROW(info==0);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in gesv(hoNDArray< std::complex<double> >& A, hoNDArray< std::complex<double> >& b) ...");
+    }
+}
+
+template<> EXPORTCPUCOREMATH
+void gesv(hoNDArray< complext<double> >& A, hoNDArray< complext<double> >& b)
+{
+    typedef hoNDArray< std::complex<double> > ArrayType;
+    try
+    {
+        gesv( reinterpret_cast<ArrayType&>(A), reinterpret_cast<ArrayType&>(b) );
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in gesv(hoNDArray< complext<double> >& A, hoNDArray< complext<double> >& b) ...");
+    }
+}
+
+/// ------------------------------------------------------------------------------------
+
+/// Computes the LU factorization of a general m-by-n matrix
+/// this function is called by general matrix inversion
+template<typename T> 
+void getrf(hoNDArray<T>& A, hoNDArray<lapack_int>& ipiv)
+{
+    try
+    {
+        if( A.get_number_of_elements()==0 ) return;
+
+        lapack_int info;
+        lapack_int m = (lapack_int)A.get_size(0);
+        lapack_int n = (lapack_int)A.get_size(1);
+
+        T* pA = A.begin();
+        lapack_int lda = (lapack_int)A.get_size(0);
+
+        ipiv.create( std::min(m, n) );
+        lapack_int* pIPIV = ipiv.begin();
+
+        //if ( typeid(T)==typeid(float) )
+        //{
+        //    info = LAPACKE_sgetrf(LAPACK_COL_MAJOR, m, n, reinterpret_cast<float*>(pA), lda, reinterpret_cast<lapack_int*>(pIPIV));
+        //}
+        //else if ( typeid(T)==typeid(double) )
+        //{
+        //    info = LAPACKE_dgetrf(LAPACK_COL_MAJOR, m, n, reinterpret_cast<double*>(pA), lda, reinterpret_cast<lapack_int*>(pIPIV));
+        //}
+        //else if ( (typeid(T)==typeid( std::complex<float> )) || (typeid(T)==typeid( complext<float> )) )
+        //{
+        //    info = LAPACKE_cgetrf(LAPACK_COL_MAJOR, m, n, reinterpret_cast<lapack_complex_float*>(pA), lda, reinterpret_cast<lapack_int*>(pIPIV));
+        //}
+        //else if ( (typeid(T)==typeid( std::complex<float> )) || (typeid(T)==typeid( complext<double> )) )
+        //{
+        //    info = LAPACKE_zgetrf(LAPACK_COL_MAJOR, m, n, reinterpret_cast<lapack_complex_double*>(pA), lda, reinterpret_cast<lapack_int*>(pIPIV));
+        //}
+        //else
+        //{
+        //    GADGET_THROW("getrf : unsupported type " << typeid(T).name());
+        //}
+
+        if ( typeid(T)==typeid(float) )
+        {
+            sgetrf_(&m, &n, reinterpret_cast<float*>(pA), &lda, reinterpret_cast<lapack_int*>(pIPIV), &info);
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            dgetrf_(&m, &n, reinterpret_cast<double*>(pA), &lda, reinterpret_cast<lapack_int*>(pIPIV), &info);
+        }
+        else if ( (typeid(T)==typeid( std::complex<float> )) || (typeid(T)==typeid( complext<float> )) )
+        {
+            cgetrf_(&m, &n, reinterpret_cast<lapack_complex_float*>(pA), &lda, reinterpret_cast<lapack_int*>(pIPIV), &info);
+        }
+        else if ( (typeid(T)==typeid( std::complex<double> )) || (typeid(T)==typeid( complext<double> )) )
+        {
+            zgetrf_(&m, &n, reinterpret_cast<lapack_complex_double*>(pA), &lda, reinterpret_cast<lapack_int*>(pIPIV), &info);
+        }
+        else
+        {
+            GADGET_THROW("getrf : unsupported type ... ");
+        }
+
+        GADGET_CHECK_THROW(info==0);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in getrf(hoNDArray<T>& A, hoNDArray<lapack_int>& ipiv) ...");
+    }
+}
+
+template EXPORTCPUCOREMATH void getrf(hoNDArray<float>& A, hoNDArray<lapack_int>& ipiv);
+template EXPORTCPUCOREMATH void getrf(hoNDArray<double>& A, hoNDArray<lapack_int>& ipiv);
+template EXPORTCPUCOREMATH void getrf(hoNDArray< std::complex<float> >& A, hoNDArray<lapack_int>& ipiv);
+template EXPORTCPUCOREMATH void getrf(hoNDArray< complext<float> >& A, hoNDArray<lapack_int>& ipiv);
+template EXPORTCPUCOREMATH void getrf(hoNDArray< std::complex<double> >& A, hoNDArray<lapack_int>& ipiv);
+template EXPORTCPUCOREMATH void getrf(hoNDArray< complext<double> >& A, hoNDArray<lapack_int>& ipiv);
+
+/// ------------------------------------------------------------------------------------
+
+/// Computes the inverse of an LU-factored general matrix
+template<typename T> 
+void getri(hoNDArray<T>& A)
+{
+    try
+    {
+        if( A.get_number_of_elements()==0 ) return;
+
+        lapack_int info;
+        lapack_int m = (lapack_int)A.get_size(0);
+        lapack_int n = (lapack_int)A.get_size(1);
+        GADGET_CHECK_THROW(m==n);
+
+        T* pA = A.begin();
+        lapack_int lda = (lapack_int)A.get_size(0);
+
+        hoNDArray<lapack_int> ipiv;
+        getrf(A, ipiv);
+
+        lapack_int* pIPIV = ipiv.begin();
+
+        lapack_int lwork = m*m;
+
+        /*if ( typeid(T)==typeid(float) )
+        {
+            info = LAPACKE_sgetri(LAPACK_COL_MAJOR, m, reinterpret_cast<float*>(pA), lda, reinterpret_cast<lapack_int*>(pIPIV));
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            info = LAPACKE_dgetri(LAPACK_COL_MAJOR, m, reinterpret_cast<double*>(pA), lda, reinterpret_cast<lapack_int*>(pIPIV));
+        }
+        else if ( (typeid(T)==typeid( std::complex<float> )) || (typeid(T)==typeid( complext<float> )) )
+        {
+            info = LAPACKE_cgetri(LAPACK_COL_MAJOR, m, reinterpret_cast<lapack_complex_float*>(pA), lda, reinterpret_cast<lapack_int*>(pIPIV));
+        }
+        else if ( (typeid(T)==typeid( std::complex<float> )) || (typeid(T)==typeid( complext<double> )) )
+        {
+            info = LAPACKE_zgetri(LAPACK_COL_MAJOR, m, reinterpret_cast<lapack_complex_double*>(pA), lda, reinterpret_cast<lapack_int*>(pIPIV));
+        }
+        else
+        {
+            GADGET_THROW("getri : unsupported type " << typeid(T).name());
+        }*/
+
+        if ( typeid(T)==typeid(float) )
+        {
+            hoNDArray<float> work(m, m);
+            sgetri_(&m, reinterpret_cast<float*>(pA), &lda, reinterpret_cast<lapack_int*>(pIPIV), work.begin(), &lwork, &info);
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            hoNDArray<double> work(m, m);
+            dgetri_(&m, reinterpret_cast<double*>(pA), &lda, reinterpret_cast<lapack_int*>(pIPIV), work.begin(), &lwork, &info);
+        }
+        else if ( (typeid(T)==typeid( std::complex<float> )) || (typeid(T)==typeid( complext<float> )) )
+        {
+            hoNDArray< std::complex<float> > work(m, m);
+            cgetri_(&m, reinterpret_cast<lapack_complex_float*>(pA), &lda, reinterpret_cast<lapack_int*>(pIPIV), reinterpret_cast<lapack_complex_float*>(work.begin()), &lwork, &info);
+        }
+        else if ( (typeid(T)==typeid( std::complex<double> )) || (typeid(T)==typeid( complext<double> )) )
+        {
+            hoNDArray< std::complex<double> > work(m, m);
+            zgetri_(&m, reinterpret_cast<lapack_complex_double*>(pA), &lda, reinterpret_cast<lapack_int*>(pIPIV), reinterpret_cast<lapack_complex_double*>(work.begin()), &lwork, &info);
+        }
+        else
+        {
+            GADGET_THROW("getri : unsupported type ... ");
+        }
+
+        GADGET_CHECK_THROW(info==0);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in getri(hoNDArray<T>& A) ...");
+    }
+}
+
+template EXPORTCPUCOREMATH void getri(hoNDArray<float>& A);
+template EXPORTCPUCOREMATH void getri(hoNDArray<double>& A);
+template EXPORTCPUCOREMATH void getri(hoNDArray< std::complex<float> >& A);
+template EXPORTCPUCOREMATH void getri(hoNDArray< complext<float> >& A);
+template EXPORTCPUCOREMATH void getri(hoNDArray< std::complex<double> >& A);
+template EXPORTCPUCOREMATH void getri(hoNDArray< complext<double> >& A);
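For reference, a minimal usage sketch of the inversion path above; getri performs the LU factorization via getrf internally and inverts in place. The 3x3 diagonal values are hypothetical and only for illustration:

    // invert a small square matrix in place
    hoNDArray<float> A(3, 3);
    Gadgetron::clear(A);                       // zero-initialize
    A(0, 0) = 4.0f; A(1, 1) = 5.0f; A(2, 2) = 6.0f;

    Gadgetron::getri(A);                       // LU-factorize (getrf) and invert; A now holds inv(A)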
+
+/// ------------------------------------------------------------------------------------
+
+template<typename T>
+void SolveLinearSystem_Tikhonov(hoNDArray<T>& A, hoNDArray<T>& b, hoNDArray<T>& x, double lamda)
+{
+    GADGET_CHECK_THROW(b.get_size(0)==A.get_size(0));
+
+    hoNDArray<T> AHA(A.get_size(1), A.get_size(1));
+    Gadgetron::clear(AHA);
+
+    // hoNDArray<T> ACopy(A);
+    // GADGET_CHECK_THROW(gemm(AHA, ACopy, true, A, false));
+
+    //GDEBUG_STREAM("SolveLinearSystem_Tikhonov - A = " << Gadgetron::norm2(A));
+    //GDEBUG_STREAM("SolveLinearSystem_Tikhonov - b = " << Gadgetron::norm2(b));
+
+    char uplo = 'L';
+    bool isAHA = true;
+    herk(AHA, A, uplo, isAHA);
+    //GDEBUG_STREAM("SolveLinearSystem_Tikhonov - AHA = " << Gadgetron::norm2(AHA));
+
+    x.create(A.get_size(1), b.get_size(1));
+    gemm(x, A, true, b, false);
+    //GDEBUG_STREAM("SolveLinearSystem_Tikhonov - x = " << Gadgetron::norm2(x));
+
+    // apply the Tikhonov regularization
+    // Ideally, the regularization term would be lamda*maxEigenValue
+    // However, computing the maximal eigenvalue is computationally intensive
+    // A natural alternative is to use the trace of the AHA matrix, which is the sum of all eigenvalues
+    // Since all eigenvalues are positive, lamda*maxEigenValue is only ~10-20% different from lamda*sum(all eigenvalues)
+    // (the regularized normal equations are restated at the end of this file)
+    // for more information, refer to:
+    // Tikhonov A.N., Goncharsky A.V., Stepanov V.V., Yagola A.G., 1995,
+    // Numerical Methods for the Solution of Ill-Posed Problems, Kluwer Academic Publishers.
+
+    size_t col = AHA.get_size(0);
+    size_t c;
+
+    double trA = abs(AHA(0, 0));
+    for ( c=1; c<col; c++ )
+    {
+        //const T v = AHA(c, c);
+        //const typename realType<T>::Type rv = v.real();
+        //const typename realType<T>::Type iv = v.imag();
+        // trA += std::sqrt(rv*rv + iv*iv);
+        trA += abs( AHA(c, c) );
+    }
+    //GDEBUG_STREAM("SolveLinearSystem_Tikhonov - trA = " << trA);
+
+    double value = trA*lamda/col;
+    for ( c=0; c<col; c++ )
+    {
+        //const T v = AHA(c, c);
+        //const typename realType<T>::Type rv = v.real();
+        //const typename realType<T>::Type iv = v.imag();
+
+        //AHA(c,c) = T( (typename realType<T>::Type)( std::sqrt(rv*rv + iv*iv) + value ) );
+        AHA(c,c) = T( (typename realType<T>::Type)( abs( AHA(c, c) ) + value ) );
+    }
+
+    // if the data is properly SNR-unit scaled, the minimal eigenvalue of AHA will be around 4.0 (real and imaginary parts have noise sigma ~1.0)
+    if ( trA/col < 4.0 )
+    {
+        typename realType<T>::Type scalingFactor = (typename realType<T>::Type)(col*4.0/trA);
+        GDEBUG_STREAM("SolveLinearSystem_Tikhonov - trA is too small : " << trA << " for matrix order : " << col);
+        GDEBUG_STREAM("SolveLinearSystem_Tikhonov - scale the AHA and x by " << scalingFactor);
+        Gadgetron::scal( scalingFactor, AHA);
+        Gadgetron::scal( scalingFactor, x);
+    }
+
+    try
+    {
+        posv(AHA, x);
+        //GDEBUG_STREAM("SolveLinearSystem_Tikhonov - solution = " << Gadgetron::norm2(x));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("posv failed in SolveLinearSystem_Tikhonov(... ) ... ");
+        GDEBUG_STREAM("A = " << Gadgetron::norm2(A));
+        GDEBUG_STREAM("b = " << Gadgetron::norm2(b));
+        GDEBUG_STREAM("AHA = " << Gadgetron::norm2(AHA));
+        GDEBUG_STREAM("trA = " << trA);
+        GDEBUG_STREAM("x = " << Gadgetron::norm2(x));
+
+        gemm(x, A, true, b, false);
+        GDEBUG_STREAM("SolveLinearSystem_Tikhonov - x = " << Gadgetron::norm2(x));
+
+        try
+        {
+            hesv(AHA, x);
+        }
+        catch(...)
+        {
+            GERROR_STREAM("hesv failed in SolveLinearSystem_Tikhonov(... ) ... ");
+
+            gemm(x, A, true, b, false);
+            GDEBUG_STREAM("SolveLinearSystem_Tikhonov - x = " << Gadgetron::norm2(x));
+
+            try
+            {
+                gesv(AHA, x);
+            }
+            catch(...)
+            {
+                GERROR_STREAM("gesv failed in SolveLinearSystem_Tikhonov(... ) ... ");
+                throw;
+            }
+        }
+    }
+}
+
+template EXPORTCPUCOREMATH void SolveLinearSystem_Tikhonov(hoNDArray<float>& A, hoNDArray<float>& b, hoNDArray<float>& x, double lamda);
+template EXPORTCPUCOREMATH void SolveLinearSystem_Tikhonov(hoNDArray<double>& A, hoNDArray<double>& b, hoNDArray<double>& x, double lamda);
+template EXPORTCPUCOREMATH void SolveLinearSystem_Tikhonov(hoNDArray< std::complex<float> >& A, hoNDArray< std::complex<float> >& b, hoNDArray< std::complex<float> >& x, double lamda);
+template EXPORTCPUCOREMATH void SolveLinearSystem_Tikhonov(hoNDArray< complext<float> >& A, hoNDArray< complext<float> >& b, hoNDArray< complext<float> >& x, double lamda);
+template EXPORTCPUCOREMATH void SolveLinearSystem_Tikhonov(hoNDArray< std::complex<double> >& A, hoNDArray< std::complex<double> >& b, hoNDArray< std::complex<double> >& x, double lamda);
+template EXPORTCPUCOREMATH void SolveLinearSystem_Tikhonov(hoNDArray< complext<double> >& A, hoNDArray< complext<double> >& b, hoNDArray< complext<double> >& x, double lamda);
+
+#endif // defined(USE_MKL) || defined(USE_LAPACK)
+
+}
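In equation form, SolveLinearSystem_Tikhonov above solves the regularized normal equations, where n is the order of A^H A (the number of columns of A) and the diagonal of A^H A is taken in magnitude as in the code:

    \left( A^{H}A + \lambda\,\frac{\operatorname{tr}(A^{H}A)}{n}\, I \right) x = A^{H} b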
diff --git a/toolboxes/core/cpu/math/hoNDArray_linalg.h b/toolboxes/core/cpu/math/hoNDArray_linalg.h
new file mode 100644
index 0000000..52db92f
--- /dev/null
+++ b/toolboxes/core/cpu/math/hoNDArray_linalg.h
@@ -0,0 +1,90 @@
+
+#pragma once
+
+#include "cpucore_math_export.h"
+
+#ifdef USE_ARMADILLO
+    #include "hoArmadillo.h"
+#endif // USE_ARMADILLO
+
+#ifndef lapack_int
+    #define lapack_int int
+#endif // lapack_int
+
+/// ----------------------------------------------------------------------
+/// the Fortran interfaces of the LAPACK and BLAS functions are called
+/// ----------------------------------------------------------------------
+
+namespace Gadgetron
+{
+
+// the following matrix computations call LAPACKE functions
+
+/// C = A*B for complex float
+EXPORTCPUCOREMATH void gemm(hoNDArray< std::complex<float> >& C, const hoNDArray< std::complex<float> >& A, const hoNDArray< std::complex<float> >& B);
+/// if transA==true, C = A'*B
+/// if transB==true, C=A*B'
+/// if both are true, C=A'*B'
+template<typename T> EXPORTCPUCOREMATH
+void gemm(hoNDArray<T>& C, 
+        const hoNDArray<T>& A, bool transA, 
+        const hoNDArray<T>& B, bool transB);
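+
+// a minimal usage sketch (sizes illustrative; it assumes gemm sizes the output array as needed,
+// otherwise size C beforehand):
+//     hoNDArray< std::complex<float> > A(128, 16), b(128, 4), AHA, AHb;
+//     gemm(AHA, A, true, A, false);   // AHA = A'*A  (16 x 16)
+//     gemm(AHb, A, true, b, false);   // AHb = A'*b  (16 x 4)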
+
+/// perform a symmetric rank-k update (not conjugated).
+template<typename T> EXPORTCPUCOREMATH 
+void syrk(hoNDArray<T>& C, const hoNDArray<T>& A, char uplo, bool isATA);
+
+/// perform a Hermitian rank-k update.
+template<typename T> EXPORTCPUCOREMATH 
+void herk(hoNDArray<T>& C, const hoNDArray<T>& A, char uplo, bool isAHA);
+
+/// compute the Cholesky factorization of a real symmetric positive definite matrix A
+template<typename T> EXPORTCPUCOREMATH 
+void potrf(hoNDArray<T>& A, char uplo);
+
+/// compute all eigenvalues and eigenvectors of a Hermitian matrix A
+template<typename T> EXPORTCPUCOREMATH 
+void heev(hoNDArray<T>& A, hoNDArray<typename realType<T>::Type>& eigenValue);
+
+template<typename T> EXPORTCPUCOREMATH
+void heev(hoNDArray< std::complex<T> >& A, hoNDArray<  std::complex<T> >& eigenValue);
+
+/// compute inverse of a symmetric (Hermitian) positive-definite matrix A
+template<typename T> EXPORTCPUCOREMATH 
+void potri(hoNDArray<T>& A);
+
+/// compute the inverse of a triangular matrix A
+template<typename T> EXPORTCPUCOREMATH 
+void trtri(hoNDArray<T>& A, char uplo);
+
+/// solve Ax=b for a symmetric or Hermitian positive-definite matrix A and multiple right-hand sides b
+/// b is replaced with x
+template<typename T> EXPORTCPUCOREMATH
+void posv(hoNDArray<T>& A, hoNDArray<T>& b);
+
+/// solve Ax=b for a square symmetric / Hermitian matrix A and multiple right-hand sides b
+/// for float and double, A is a symmetric matrix
+/// for complex types, A is a Hermitian matrix
+/// b is replaced with x
+template<typename T> EXPORTCPUCOREMATH
+void hesv(hoNDArray<T>& A, hoNDArray<T>& b);
+
+/// solve Ax=b for a square matrix A and multiple right-hand sides b
+/// b is replaced with x
+template<typename T> EXPORTCPUCOREMATH
+void gesv(hoNDArray<T>& A, hoNDArray<T>& b);
+
+/// solve Ax=b with Tikhonov regularization
+template<typename T> EXPORTCPUCOREMATH
+void SolveLinearSystem_Tikhonov(hoNDArray<T>& A, hoNDArray<T>& b, hoNDArray<T>& x, double lamda);
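+
+// a hedged usage sketch (sizes illustrative; A is rows x cols, b is rows x K, x receives the cols x K solution):
+//     hoNDArray< std::complex<float> > A(128, 16), b(128, 4), x;
+//     // ... fill A and b ...
+//     SolveLinearSystem_Tikhonov(A, b, x, 0.01);
+// internally the regularized normal equations (A'*A + value*I) x = A'*b are formed, with the
+// diagonal loading value derived from lamda and the trace of A'*A, and solved with posv,
+// falling back to hesv and then gesv if the Cholesky-based solve fails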
+
+/// Computes the LU factorization of a general m-by-n matrix
+/// this function is called by general matrix inversion
+template<typename T> EXPORTCPUCOREMATH 
+void getrf(hoNDArray<T>& A, hoNDArray<lapack_int>& ipiv);
+
+/// Computes the inverse of an LU-factored general matrix
+template<typename T> EXPORTCPUCOREMATH 
+void getri(hoNDArray<T>& A);
+
+}
diff --git a/toolboxes/core/cpu/math/hoNDArray_math.h b/toolboxes/core/cpu/math/hoNDArray_math.h
new file mode 100644
index 0000000..dda0d0c
--- /dev/null
+++ b/toolboxes/core/cpu/math/hoNDArray_math.h
@@ -0,0 +1,4 @@
+#pragma once
+
+#include "hoNDArray_elemwise.h"
+#include "hoNDArray_reductions.h"
diff --git a/toolboxes/core/cpu/math/hoNDArray_math_util.cpp b/toolboxes/core/cpu/math/hoNDArray_math_util.cpp
new file mode 100644
index 0000000..a5631d5
--- /dev/null
+++ b/toolboxes/core/cpu/math/hoNDArray_math_util.cpp
@@ -0,0 +1,2178 @@
+#include "hoNDArray_math_util.h"
+
+#ifndef lapack_int
+    #define lapack_int int
+#endif // lapack_int
+
+#ifndef lapack_complex_float
+    #define lapack_complex_float  std::complex<float> 
+#endif // lapack_complex_float
+
+#ifndef lapack_complex_double
+    #define lapack_complex_double  std::complex<double> 
+#endif // #ifndef lapack_complex_double
+
+//Declaration of BLAS and LAPACK routines
+extern "C"
+{
+    /// Finds the index of the element with the maximal absolute value.
+    lapack_int isamax_(lapack_int* N, float* x, lapack_int* incx);
+    lapack_int idamax_(lapack_int* N, double* x, lapack_int* incx);
+    lapack_int icamax_(lapack_int* N, lapack_complex_float* x, lapack_int* incx);
+    lapack_int izamax_(lapack_int* N, lapack_complex_double* x, lapack_int* incx);
+}
+
+#define NumElementsUseThreading 64*1024
+
+namespace Gadgetron
+{
+    // --------------------------------------------------------------------------------
+
+    inline void add(size_t N, const float* x, const float* y, float* r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, r, x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            r[n] = x[n] + y[n];
+        }
+    }
+
+    inline void add(size_t N, const double* x, const double* y, double* r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, r, x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            r[n] = x[n] + y[n];
+        }
+    }
+
+    inline void add(size_t N, const  std::complex<float> * x, const  std::complex<float> * y,  std::complex<float> * r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, r, x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            const  std::complex<float> & vx = x[n];
+            const float re1 = vx.real();
+            const float im1 = vx.imag();
+
+            const  std::complex<float> & vy = y[n];
+            const float re2 = vy.real();
+            const float im2 = vy.imag();
+
+            reinterpret_cast<float(&)[2]>(r[n])[0] = re1 + re2;
+            reinterpret_cast<float(&)[2]>(r[n])[1] = im1 + im2;
+        }
+    }
+
+    inline void add(size_t N, const  std::complex<double> * x, const  std::complex<double> * y,  std::complex<double> * r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, r, x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            const  std::complex<double> & vx = x[n];
+            const double re1 = vx.real();
+            const double im1 = vx.imag();
+
+            const  std::complex<double> & vy = y[n];
+            const double re2 = vy.real();
+            const double im2 = vy.imag();
+
+            reinterpret_cast<double(&)[2]>(r[n])[0] = re1 + re2;
+            reinterpret_cast<double(&)[2]>(r[n])[1] = im1 + im2;
+        }
+    }
+
+    template <typename T> 
+    void add(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& r)
+    {
+        GADGET_DEBUG_CHECK_THROW(x.get_number_of_elements()==y.get_number_of_elements());
+        if ( r.get_number_of_elements()!=x.get_number_of_elements())
+        {
+            r = x;
+        }
+
+        add(x.get_number_of_elements(), x.begin(), y.begin(), r.begin());
+    }
+
+    template EXPORTCPUCOREMATH void add(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH void add(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH void add(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y, hoNDArray< std::complex<float> >& r);
+    template EXPORTCPUCOREMATH void add(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y, hoNDArray< std::complex<double> >& r);
+
+    // --------------------------------------------------------------------------------
+
+    inline void subtract(size_t N, const float* x, const float* y, float* r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, r, x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            r[n] = x[n] - y[n];
+        }
+    }
+
+    inline void subtract(size_t N, const double* x, const double* y, double* r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, r, x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            r[n] = x[n] - y[n];
+        }
+    }
+
+    inline void subtract(size_t N, const  std::complex<float> * x, const  std::complex<float> * y,  std::complex<float> * r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, r, x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            const  std::complex<float> & vx = x[n];
+            const float re1 = vx.real();
+            const float im1 = vx.imag();
+
+            const  std::complex<float> & vy = y[n];
+            const float re2 = vy.real();
+            const float im2 = vy.imag();
+
+            reinterpret_cast<float(&)[2]>(r[n])[0] = re1 - re2;
+            reinterpret_cast<float(&)[2]>(r[n])[1] = im1 - im2;
+        }
+    }
+
+    inline void subtract(size_t N, const  std::complex<double> * x, const  std::complex<double> * y,  std::complex<double> * r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, r, x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            const  std::complex<double> & vx = x[n];
+            const double re1 = vx.real();
+            const double im1 = vx.imag();
+
+            const  std::complex<double> & vy = y[n];
+            const double re2 = vy.real();
+            const double im2 = vy.imag();
+
+            reinterpret_cast<double(&)[2]>(r[n])[0] = re1 - re2;
+            reinterpret_cast<double(&)[2]>(r[n])[1] = im1 - im2;
+        }
+    }
+
+    template <typename T> 
+    void subtract(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& r)
+    {
+        GADGET_DEBUG_CHECK_THROW(x.get_number_of_elements()==y.get_number_of_elements());
+        if ( r.get_number_of_elements()!=x.get_number_of_elements())
+        {
+            r = x;
+        }
+
+        subtract(x.get_number_of_elements(), x.begin(), y.begin(), r.begin());
+    }
+
+    template EXPORTCPUCOREMATH void subtract(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH void subtract(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH void subtract(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y, hoNDArray< std::complex<float> >& r);
+    template EXPORTCPUCOREMATH void subtract(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y, hoNDArray< std::complex<double> >& r);
+
+    // --------------------------------------------------------------------------------
+
+    template <typename T> 
+    inline void multiply(size_t N, const T* x, const T* y, T* r)
+    {
+        long long n;
+        #pragma omp parallel for default(none) private(n) shared(N, x, y, r) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            const T& a = x[n];
+            const T& b = y[n];
+            r[n] = a*b;
+        }
+    }
+
+    inline void multiply(size_t N, const  std::complex<float> * x, const  std::complex<float> * y,  std::complex<float> * r)
+    {
+        long long n;
+        #pragma omp parallel for default(none) private(n) shared(N, x, y, r) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            const std::complex<float>& a1 = x[n];
+            const std::complex<float>& b1 = y[n];
+            const float a = a1.real();
+            const float b = a1.imag();
+            const float c = b1.real();
+            const float d = b1.imag();
+
+            reinterpret_cast<float(&)[2]>(r[n])[0] = a*c-b*d;
+            reinterpret_cast<float(&)[2]>(r[n])[1] = a*d+b*c;
+        }
+    }
+
+    inline void multiply(size_t N, const  std::complex<double> * x, const  std::complex<double> * y,  std::complex<double> * r)
+    {
+        long long n;
+        #pragma omp parallel for default(none) private(n) shared(N, x, y, r) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            const std::complex<double>& a1 = x[n];
+            const std::complex<double>& b1 = y[n];
+            const double a = a1.real();
+            const double b = a1.imag();
+            const double c = b1.real();
+            const double d = b1.imag();
+
+            reinterpret_cast<double(&)[2]>(r[n])[0] = a*c-b*d;
+            reinterpret_cast<double(&)[2]>(r[n])[1] = a*d+b*c;
+        }
+    }
+
+    template <typename T> 
+    void multiply(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& r)
+    {
+        GADGET_DEBUG_CHECK_THROW(x.get_number_of_elements()==y.get_number_of_elements());
+        if ( r.get_number_of_elements()!=x.get_number_of_elements())
+        {
+            r = x;
+        }
+
+        multiply(x.get_number_of_elements(), x.begin(), y.begin(), r.begin());
+    }
+
+    template EXPORTCPUCOREMATH void multiply(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH void multiply(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH void multiply(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y, hoNDArray< std::complex<float> >& r);
+    template EXPORTCPUCOREMATH void multiply(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y, hoNDArray< std::complex<double> >& r);
+
+    // --------------------------------------------------------------------------------
+
+    template <typename T> 
+    inline void divide(size_t N, const T* x, const T* y, T* r)
+    {
+        long long n;
+        #pragma omp parallel for default(none) private(n) shared(N, x, y, r) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            const T& a = x[n];
+            const T& b = y[n];
+            r[n] = a/b;
+        }
+    }
+
+    inline void divide(size_t N, const  std::complex<float> * x, const  std::complex<float> * y,  std::complex<float> * r)
+    {
+        long long n;
+        #pragma omp parallel for default(none) private(n) shared(N, x, y, r) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            const std::complex<float>& a1 = x[n];
+            const std::complex<float>& b1 = y[n];
+            const float a = a1.real();
+            const float b = a1.imag();
+            const float c = b1.real();
+            const float d = b1.imag();
+
+            const float m = 1/(c*c+d*d);
+
+            reinterpret_cast<float(&)[2]>(r[n])[0] = (a*c+b*d)*m;
+            reinterpret_cast<float(&)[2]>(r[n])[1] = (b*c-a*d)*m;
+        }
+    }
+
+    inline void divide(size_t N, const  std::complex<double> * x, const  std::complex<double> * y,  std::complex<double> * r)
+    {
+        long long n;
+        #pragma omp parallel for default(none) private(n) shared(N, x, y, r) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            const std::complex<double>& a1 = x[n];
+            const std::complex<double>& b1 = y[n];
+            const double a = a1.real();
+            const double b = a1.imag();
+            const double c = b1.real();
+            const double d = b1.imag();
+
+            const double m = 1/(c*c+d*d);
+
+            reinterpret_cast<double(&)[2]>(r[n])[0] = (a*c+b*d)*m;
+            reinterpret_cast<double(&)[2]>(r[n])[1] = (b*c-a*d)*m;
+        }
+    }
+
+    template <typename T> 
+    void divide(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& r)
+    {
+        GADGET_DEBUG_CHECK_THROW(x.get_number_of_elements()==y.get_number_of_elements());
+        if ( r.get_number_of_elements()!=x.get_number_of_elements())
+        {
+            r = x;
+        }
+
+        divide(x.get_number_of_elements(), x.begin(), y.begin(), r.begin());
+    }
+
+    template EXPORTCPUCOREMATH void divide(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH void divide(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH void divide(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y, hoNDArray< std::complex<float> >& r);
+    template EXPORTCPUCOREMATH void divide(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y, hoNDArray< std::complex<double> >& r);
+
+    // --------------------------------------------------------------------------------
+
+    template <typename T> 
+    void sqrt(const hoNDArray<T>& x, hoNDArray<T>& r)
+    {
+        if ( r.get_number_of_elements()!=x.get_number_of_elements())
+        {
+            r.create(x.get_dimensions());
+        }
+
+        size_t N = x.get_number_of_elements();
+        const T* pX = x.begin();
+        T* pR = r.begin();
+
+        long long n;
+        #pragma omp parallel for default(none) private(n) shared(N, pX, pR) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            pR[n] = std::sqrt(pX[n]);
+        }
+    }
+
+    template EXPORTCPUCOREMATH void sqrt(const hoNDArray<float>& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH void sqrt(const hoNDArray<double>& x, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH void sqrt(const hoNDArray< std::complex<float> >& x, hoNDArray< std::complex<float> >& r);
+    template EXPORTCPUCOREMATH void sqrt(const hoNDArray< std::complex<double> >& x, hoNDArray< std::complex<double> >& r);
+
+    // --------------------------------------------------------------------------------
+
+    template <typename T> 
+    void minAbsolute(const hoNDArray<T>& x, T& r, size_t& ind)
+    {
+        size_t N = x.get_number_of_elements();
+        const T* pX = x.begin();
+
+        ind = 0;
+        if ( N == 0 ) return;
+
+        long long n;
+
+        typename realType<T>::Type v = std::abs(pX[0]);
+        typename realType<T>::Type v2;
+
+        ind = 0;
+        for ( n=1; n<(long long)N; n++ )
+        {
+            v2 = std::abs(pX[n]);
+            if ( v2 < v )
+            {
+                v = v2;
+                ind = n;
+            }
+        }
+
+        r = pX[ind];
+    }
+
+    template EXPORTCPUCOREMATH void minAbsolute(const hoNDArray<float>& x, float& r, size_t& ind);
+    template EXPORTCPUCOREMATH void minAbsolute(const hoNDArray<double>& x, double& r, size_t& ind);
+    template EXPORTCPUCOREMATH void minAbsolute(const hoNDArray< std::complex<float> >& x,  std::complex<float> & r, size_t& ind);
+    template EXPORTCPUCOREMATH void minAbsolute(const hoNDArray< std::complex<double> >& x,  std::complex<double> & r, size_t& ind);
+
+    // --------------------------------------------------------------------------------
+
+    template <typename T> 
+    void maxAbsolute(const hoNDArray<T>& x, T& r, size_t& ind)
+    {
+        size_t N = x.get_number_of_elements();
+        const T* pX = x.begin();
+
+        ind = 0;
+        if ( N == 0 ) return;
+
+        long long n;
+
+        typename realType<T>::Type v = std::abs(pX[0]);
+        typename realType<T>::Type v2;
+
+        ind = 0;
+        for ( n=1; n<(long long)N; n++ )
+        {
+            v2 = std::abs(pX[n]);
+            if ( v2 > v )
+            {
+                v = v2;
+                ind = n;
+            }
+        }
+
+        r = pX[ind];
+    }
+
+    template EXPORTCPUCOREMATH void maxAbsolute(const hoNDArray<float>& x, float& r, size_t& ind);
+    template EXPORTCPUCOREMATH void maxAbsolute(const hoNDArray<double>& x, double& r, size_t& ind);
+    template EXPORTCPUCOREMATH void maxAbsolute(const hoNDArray< std::complex<float> >& x,  std::complex<float> & r, size_t& ind);
+    template EXPORTCPUCOREMATH void maxAbsolute(const hoNDArray< std::complex<double> >& x,  std::complex<double> & r, size_t& ind);
+
+    // --------------------------------------------------------------------------------
+
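+    // multiplyConj: element-wise r[n] = x[n] * conj(y[n]) (the second argument is the one conjugated)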
+    inline void multiplyConj(size_t N, const  std::complex<float> * x, const  std::complex<float> * y,  std::complex<float> * r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, y, r) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            const float a = x[n].real();
+            const float b = x[n].imag();
+            const float c = y[n].real();
+            const float d = y[n].imag();
+
+            reinterpret_cast<float(&)[2]>(r[n])[0] = (a*c + b*d);
+            reinterpret_cast<float(&)[2]>(r[n])[1] = (c*b - a*d);
+        }
+    }
+
+    inline void multiplyConj(size_t N, const  std::complex<double> * x, const  std::complex<double> * y,  std::complex<double> * r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, y, r) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            const double a = x[n].real();
+            const double b = x[n].imag();
+            const double c = y[n].real();
+            const double d = y[n].imag();
+
+            reinterpret_cast<double(&)[2]>(r[n])[0] = (a*c + b*d);
+            reinterpret_cast<double(&)[2]>(r[n])[1] = (c*b - a*d);
+        }
+    }
+
+    template <typename T> 
+    void multiplyConj(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& r)
+    {
+        GADGET_DEBUG_CHECK_THROW(x.get_number_of_elements()==y.get_number_of_elements());
+        if ( r.get_number_of_elements()!=x.get_number_of_elements())
+        {
+            r = x;
+        }
+
+        multiplyConj(x.get_number_of_elements(), x.begin(), y.begin(), r.begin());
+    }
+
+    template EXPORTCPUCOREMATH void multiplyConj(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y, hoNDArray< std::complex<float> >& r);
+    template EXPORTCPUCOREMATH void multiplyConj(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y, hoNDArray< std::complex<double> >& r);
+
+    // --------------------------------------------------------------------------------
+
+    inline void conjugate(size_t N, const  std::complex<float> * x,  std::complex<float> * r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, r) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            reinterpret_cast<float(&)[2]>(r[n])[0] = reinterpret_cast< const float(&)[2]>(x[n])[0];
+            reinterpret_cast<float(&)[2]>(r[n])[1] = -(reinterpret_cast< const float(&)[2]>(x[n])[1]);
+        }
+    }
+
+    inline void conjugate(size_t N, const  std::complex<double> * x,  std::complex<double> * r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, r) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            reinterpret_cast<double(&)[2]>(r[n])[0] = reinterpret_cast< const double(&)[2]>(x[n])[0];
+            reinterpret_cast<double(&)[2]>(r[n])[1] = -(reinterpret_cast<const double(&)[2]>(x[n])[1]);
+        }
+    }
+
+    template <typename T> 
+    void conjugate(const hoNDArray<T>& x, hoNDArray<T>& r)
+    {
+        if ( r.get_number_of_elements()!=x.get_number_of_elements())
+        {
+            r.create(x.get_dimensions());
+        }
+
+        conjugate(x.get_number_of_elements(), x.begin(), r.begin());
+    }
+
+    template EXPORTCPUCOREMATH void conjugate(const hoNDArray< std::complex<float> >& x, hoNDArray< std::complex<float> >& r);
+    template EXPORTCPUCOREMATH void conjugate(const hoNDArray< std::complex<double> >& x, hoNDArray< std::complex<double> >& r);
+
+    // --------------------------------------------------------------------------------
+
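+    // addEpsilon: every element whose magnitude is below machine epsilon gets epsilon added
+    // (for complex arrays only the real part is perturbed); a typical use is to guard a
+    // subsequent element-wise divide against exact zeros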
+    template <typename T> 
+    inline void addEpsilon(size_t N, T* x)
+    {
+        typename realType<T>::Type eps = std::numeric_limits<typename realType<T>::Type>::epsilon();
+
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, eps) if (N>NumElementsUseThreading)
+        for (n=0; n<(long long)N; n++ )
+        {
+            if ( std::abs(x[n]) < eps )
+            {
+                x[n] += eps;
+            }
+        }
+    }
+
+    inline void addEpsilon(size_t N,  std::complex<float> * x)
+    {
+        const float eps = std::numeric_limits<float>::epsilon();
+
+        long long n;
+
+        #pragma omp parallel for private(n) if (N>NumElementsUseThreading)
+        for (n=0; n<(long long)N; n++ )
+        {
+            if ( std::abs(x[n]) < eps )
+            {
+                reinterpret_cast<float(&)[2]>(x[n])[0] += eps;
+            }
+        }
+    }
+
+    inline void addEpsilon(size_t N,  std::complex<double> * x)
+    {
+        const double eps = std::numeric_limits<double>::epsilon();
+
+        long long n;
+
+        #pragma omp parallel for private(n) if (N>NumElementsUseThreading)
+        for (n=0; n<(long long)N; n++ )
+        {
+            if ( std::abs(x[n]) < eps )
+            {
+                reinterpret_cast<double(&)[2]>(x[n])[0] += eps;
+            }
+        }
+    }
+
+    template <typename T> 
+    void addEpsilon(hoNDArray<T>& x)
+    {
+        addEpsilon(x.get_number_of_elements(), x.begin());
+    }
+
+    template EXPORTCPUCOREMATH void addEpsilon(hoNDArray<float>& x);
+    template EXPORTCPUCOREMATH void addEpsilon(hoNDArray<double>& x);
+    template EXPORTCPUCOREMATH void addEpsilon(hoNDArray< std::complex<float> >& x);
+    template EXPORTCPUCOREMATH void addEpsilon(hoNDArray< std::complex<double> >& x);
+
+    // --------------------------------------------------------------------------------
+
+    inline void norm2(size_t N, const float* x, float& r)
+    {
+        long long i;
+
+        float sum(0);
+
+        #pragma omp parallel for private(i) reduction(+:sum) if (N>NumElementsUseThreading)
+        for (i = 0; i < (long long)N; i++)
+        {
+            const float& re = x[i];
+            sum += ( re*re );
+        }
+
+        r = std::sqrt(sum);
+    }
+
+    inline void norm2(size_t N, const double* x, double& r)
+    {
+        long long i;
+
+        double sum(0);
+
+        #pragma omp parallel for private(i) reduction(+:sum) if (N>NumElementsUseThreading)
+        for (i = 0; i < (long long)N; i++)
+        {
+            const double& re = x[i];
+            sum += ( re*re );
+        }
+
+        r = std::sqrt(sum);
+    }
+
+    inline void norm2(size_t N, const  std::complex<float> * x, float& r)
+    {
+        long long i;
+
+        float sum(0);
+
+        #pragma omp parallel for private(i) reduction(+:sum) if (N>NumElementsUseThreading)
+        for (i = 0; i < (long long)N; i++)
+        {
+            const std::complex<float>& c = x[i];
+            const float re = c.real();
+            const float im = c.imag();
+            sum += ( (re*re) + (im * im) );
+        }
+
+        r = std::sqrt(sum);
+    }
+
+    inline void norm2(size_t N, const  std::complex<double> * x, double& r)
+    {
+        long long i;
+
+        double sum(0);
+
+        #pragma omp parallel for private(i) reduction(+:sum) if (N>NumElementsUseThreading)
+        for (i = 0; i < (long long)N; i++)
+        {
+            const std::complex<double>& c = x[i];
+            const double re = c.real();
+            const double im = c.imag();
+            sum += ( (re*re) + (im * im) );
+        }
+
+        r = std::sqrt(sum);
+    }
+
+    template <typename T> 
+    void norm2(const hoNDArray<T>& x, typename realType<T>::Type& r)
+    {
+        norm2(x.get_number_of_elements(), x.begin(), r);
+    }
+
+    template EXPORTCPUCOREMATH void norm2(const hoNDArray<float>& x, float& r);
+    template EXPORTCPUCOREMATH void norm2(const hoNDArray<double>& x, double& r);
+    template EXPORTCPUCOREMATH void norm2(const hoNDArray< std::complex<float> >& x, float& r);
+    template EXPORTCPUCOREMATH void norm2(const hoNDArray< std::complex<double> >& x, double& r);
+
+    template <typename T> inline 
+    typename realType<T>::Type norm2(const hoNDArray<T>& x)
+    {
+        typename realType<T>::Type r;
+        norm2(x, r);
+        return r;
+    }
+
+    template EXPORTCPUCOREMATH float norm2(const hoNDArray<float>& x);
+    template EXPORTCPUCOREMATH double norm2(const hoNDArray<double>& x);
+    template EXPORTCPUCOREMATH float norm2(const hoNDArray< std::complex<float> >& x);
+    template EXPORTCPUCOREMATH double norm2(const hoNDArray< std::complex<double> >& x);
+
+    // --------------------------------------------------------------------------------
+
+    template <typename T> inline 
+    void norm1(size_t N, const T* x, typename realType<T>::Type& r)
+    {
+        long long n;
+
+        typename realType<T>::Type norm1Sum(0);
+
+        #pragma omp parallel for private(n) reduction(+:norm1Sum) if (N>NumElementsUseThreading)
+        for (n=0; n<(long long)N; n++)
+        {
+            const T& c = x[n];
+            norm1Sum += GT_ABS(c);
+        }
+
+        r = norm1Sum;
+    }
+
+    inline void norm1(size_t N, const  std::complex<float> * x, float& r)
+    {
+        long long i;
+        float sum = 0.0f;
+        #pragma omp parallel for private(i) reduction(+:sum) if (N>NumElementsUseThreading)
+        for (i = 0; i < (long long)N; i++)
+        {
+            const std::complex<float>& c = x[i];
+            const float re = c.real();
+            const float im = c.imag();
+            sum += std::sqrt( (re*re) + (im * im) );
+        }
+
+        r = sum;
+    }
+
+    inline void norm1(size_t N, const  std::complex<double> * x, double& r)
+    {
+        long long i;
+        double sum = 0.0;
+        #pragma omp parallel for private(i) reduction(+:sum) if (N>NumElementsUseThreading)
+        for (i = 0; i < (long long)N; i++)
+        {
+            const std::complex<double>& c = x[i];
+            const double re = c.real();
+            const double im = c.imag();
+            sum += std::sqrt( (re*re) + (im * im) );
+        }
+
+        r = sum;
+    }
+
+    template <typename T> 
+    void norm1(const hoNDArray<T>& x, typename realType<T>::Type& r)
+    {
+        norm1(x.get_number_of_elements(), x.begin(), r);
+    }
+
+    template EXPORTCPUCOREMATH void norm1(const hoNDArray<float>& x, float& r);
+    template EXPORTCPUCOREMATH void norm1(const hoNDArray<double>& x, double& r);
+    template EXPORTCPUCOREMATH void norm1(const hoNDArray< std::complex<float> >& x, float& r);
+    template EXPORTCPUCOREMATH void norm1(const hoNDArray< std::complex<double> >& x, double& r);
+
+    template <typename T> inline 
+    typename realType<T>::Type norm1(const hoNDArray<T>& x)
+    {
+        typename realType<T>::Type r;
+        norm1(x, r);
+        return r;
+    }
+
+    template EXPORTCPUCOREMATH float norm1(const hoNDArray<float>& x);
+    template EXPORTCPUCOREMATH double norm1(const hoNDArray<double>& x);
+    template EXPORTCPUCOREMATH float norm1(const hoNDArray< std::complex<float> >& x);
+    template EXPORTCPUCOREMATH double norm1(const hoNDArray< std::complex<double> >& x);
+
+    // --------------------------------------------------------------------------------
+
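+    // dotc: accumulates r = sum_n x[n] * conj(y[n]) (as implemented, the second argument is the one conjugated)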
+    inline void dotc(size_t N, const  std::complex<float> * x, const  std::complex<float> * y,  std::complex<float> & r)
+    {
+        long long n;
+
+        float sa(0), sb(0);
+
+        #pragma omp parallel for private(n) reduction(+:sa) reduction(+:sb) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            const float a = x[n].real();
+            const float b = x[n].imag();
+            const float c = y[n].real();
+            const float d = y[n].imag();
+
+            sa += (a*c + b*d);
+            sb += (c*b - a*d);
+        }
+
+        reinterpret_cast<float(&)[2]>(r)[0] = sa;
+        reinterpret_cast<float(&)[2]>(r)[1] = sb;
+    }
+
+    inline void dotc(size_t N, const  std::complex<double> * x, const  std::complex<double> * y,  std::complex<double> & r)
+    {
+        long long n;
+
+        double sa(0), sb(0);
+
+        #pragma omp parallel for private(n) reduction(+:sa) reduction(+:sb) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            const double a = x[n].real();
+            const double b = x[n].imag();
+            const double c = y[n].real();
+            const double d = y[n].imag();
+
+            sa += (a*c + b*d);
+            sb += (c*b - a*d);
+        }
+
+        reinterpret_cast<double(&)[2]>(r)[0] = sa;
+        reinterpret_cast<double(&)[2]>(r)[1] = sb;
+    }
+
+    template <typename T> 
+    void dotc(const hoNDArray<T>& x, const hoNDArray<T>& y, T& r)
+    {
+        GADGET_DEBUG_CHECK_THROW(x.get_number_of_elements()==y.get_number_of_elements());
+        dotc(x.get_number_of_elements(), x.begin(), y.begin(), r);
+    }
+
+    template EXPORTCPUCOREMATH void dotc(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y,  std::complex<float> & r);
+    template EXPORTCPUCOREMATH void dotc(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y,  std::complex<double> & r);
+
+    template <typename T> 
+    T dotc(const hoNDArray<T>& x, const hoNDArray<T>& y)
+    {
+        T r;
+        dotc(x, y, r);
+        return r;
+    }
+
+    template EXPORTCPUCOREMATH std::complex<float> dotc(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y);
+    template EXPORTCPUCOREMATH std::complex<double> dotc(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y);
+
+    // --------------------------------------------------------------------------------
+
+    inline void dotu(size_t N, const float* x, const float* y, float& r)
+    {
+        long long n;
+
+        float res(0);
+
+        #pragma omp parallel for private(n) reduction(+:res) if (N>NumElementsUseThreading)
+        for (n=0; n<(long long)N; n++)
+        {
+            res += x[n]*y[n];
+        }
+
+        r = res;
+    }
+
+    inline void dotu(size_t N, const double* x, const double* y, double& r)
+    {
+        long long n;
+
+        double res(0);
+
+        #pragma omp parallel for private(n) reduction(+:res) if (N>NumElementsUseThreading)
+        for (n=0; n<(long long)N; n++)
+        {
+            res += x[n]*y[n];
+        }
+
+        r = res;
+    }
+
+    inline void dotu(size_t N, const  std::complex<float> * x, const  std::complex<float> * y,  std::complex<float> & r)
+    {
+        long long n;
+
+        float sa(0), sb(0);
+        #pragma omp parallel for private(n) reduction(+:sa) reduction(+:sb) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            const float a = x[n].real();
+            const float b = x[n].imag();
+            const float c = y[n].real();
+            const float d = y[n].imag();
+
+            sa += (a*c - b*d);
+            sb += (c*b + a*d);
+        }
+
+        reinterpret_cast<float(&)[2]>(r)[0] = sa;
+        reinterpret_cast<float(&)[2]>(r)[1] = sb;
+    }
+
+    inline void dotu(size_t N, const  std::complex<double> * x, const  std::complex<double> * y,  std::complex<double> & r)
+    {
+        long long n;
+
+        double sa(0), sb(0);
+        #pragma omp parallel for private(n) reduction(+:sa) reduction(+:sb) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            const double a = x[n].real();
+            const double b = x[n].imag();
+            const double c = y[n].real();
+            const double d = y[n].imag();
+
+            sa += (a*c - b*d);
+            sb += (c*b + a*d);
+        }
+
+        reinterpret_cast<double(&)[2]>(r)[0] = sa;
+        reinterpret_cast<double(&)[2]>(r)[1] = sb;
+    }
+
+    template <typename T> 
+    void dotu(const hoNDArray<T>& x, const hoNDArray<T>& y, T& r)
+    {
+        GADGET_DEBUG_CHECK_THROW(x.get_number_of_elements()==y.get_number_of_elements());
+        dotu(x.get_number_of_elements(), x.begin(), y.begin(), r);
+    }
+
+    template EXPORTCPUCOREMATH void dotu(const hoNDArray<float>& x, const hoNDArray<float>& y, float& r);
+    template EXPORTCPUCOREMATH void dotu(const hoNDArray<double>& x, const hoNDArray<double>& y, double& r);
+    template EXPORTCPUCOREMATH void dotu(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y, std::complex<float>& r);
+    template EXPORTCPUCOREMATH void dotu(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y, std::complex<double>& r);
+
+    template <typename T> 
+    T dotu(const hoNDArray<T>& x, const hoNDArray<T>& y)
+    {
+        T r = 0;
+        dotu(x, y, r);
+        return r;
+    }
+
+    template EXPORTCPUCOREMATH float dotu(const hoNDArray<float>& x, const hoNDArray<float>& y);
+    template EXPORTCPUCOREMATH double dotu(const hoNDArray<double>& x, const hoNDArray<double>& y);
+    template EXPORTCPUCOREMATH  std::complex<float>  dotu(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y);
+    template EXPORTCPUCOREMATH  std::complex<double>  dotu(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y);
+
+    // --------------------------------------------------------------------------------
+
+    template <typename T> 
+    void absolute(size_t N, const T* x, typename realType<T>::Type* r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, r) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            r[n]= GT_ABS(x[n]);
+        }
+    }
+
+    inline void absolute(size_t N, const  std::complex<float> * x, float* r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, r) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            const  std::complex<float> & c = x[n];
+            const float re = c.real();
+            const float im = c.imag();
+            r[n]= std::sqrt( (re*re) + (im * im) );
+        }
+    }
+
+    inline void absolute(size_t N, const  std::complex<double> * x, double* r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, r) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            const  std::complex<double> & c = x[n];
+            const double re = c.real();
+            const double im = c.imag();
+            r[n]= std::sqrt( (re*re) + (im * im) );
+        }
+    }
+
+    template <typename T> 
+    void absolute(const hoNDArray<T>& x, hoNDArray<typename realType<T>::Type>& r)
+    {
+        if ( r.get_number_of_elements()!=x.get_number_of_elements())
+        {
+            r.create(x.get_dimensions());
+        }
+
+        absolute(x.get_number_of_elements(), x.begin(), r.begin());
+    }
+
+    template EXPORTCPUCOREMATH void absolute(const hoNDArray<float>& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH void absolute(const hoNDArray<double>& x, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH void absolute(const hoNDArray< std::complex<float> >& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH void absolute(const hoNDArray< std::complex<double> >& x, hoNDArray<double>& r);
+
+    // --------------------------------------------------------------------------------
+
+    inline void absolute(size_t N, const std::complex<float>* x, std::complex<float>* r)
+    {
+        try
+        {
+            long long n;
+
+            #pragma omp parallel for default(none) private(n) shared(N, x, r) if (N>NumElementsUseThreading)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                const std::complex<float>& c = x[n];
+                const float re = c.real();
+                const float im = c.imag();
+
+                reinterpret_cast<float(&)[2]>(r[n])[0] = std::sqrt( (re*re) + (im * im) );
+                reinterpret_cast<float(&)[2]>(r[n])[1] = 0;
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Error happened in absolute(size_t N, const std::complex<float>* x, std::complex<float>* r) ... ");
+        }
+    }
+
+    inline void absolute(size_t N, const std::complex<double>* x, std::complex<double>* r)
+    {
+        try
+        {
+            long long n;
+
+            #pragma omp parallel for default(none) private(n) shared(N, x, r) if (N>NumElementsUseThreading)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                const std::complex<double>& c = x[n];
+                const double re = c.real();
+                const double im = c.imag();
+
+                reinterpret_cast<double(&)[2]>(r[n])[0] = std::sqrt( (re*re) + (im * im) );
+                reinterpret_cast<double(&)[2]>(r[n])[1] = 0;
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Error happened in absolute(size_t N, const std::complex<double>* x, std::complex<double>* r) ... ");
+        }
+    }
+
+    template <typename T> 
+    void absolute(const hoNDArray< std::complex<T> >& x, hoNDArray< std::complex<T> >& r)
+    {
+        if ( r.get_number_of_elements()!=x.get_number_of_elements())
+        {
+            r.create(x.get_dimensions());
+        }
+
+        absolute(x.get_number_of_elements(), x.begin(), r.begin());
+    }
+
+    template EXPORTCPUCOREMATH void absolute(const hoNDArray< std::complex<float> >& x, hoNDArray< std::complex<float> >& r);
+    template EXPORTCPUCOREMATH void absolute(const hoNDArray< std::complex<double> >& x, hoNDArray< std::complex<double> >& r);
+
+    // --------------------------------------------------------------------------------
+
+    template <typename T> 
+    void argument(const hoNDArray<T>& x, hoNDArray<typename realType<T>::Type>& r)
+    {
+        if ( r.get_number_of_elements()!=x.get_number_of_elements())
+        {
+            r.create(x.get_dimensions());
+        }
+
+        size_t N = x.get_number_of_elements();
+        const T* pX = x.begin();
+        typename realType<T>::Type* pR = r.begin();
+
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, pX, pR) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            pR[n] = std::arg( pX[n] );
+        }
+    }
+
+    template EXPORTCPUCOREMATH void argument(const hoNDArray< std::complex<float> >& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH void argument(const hoNDArray< std::complex<double> >& x, hoNDArray<double>& r);
+
+    // --------------------------------------------------------------------------------
+
+    template <typename T> 
+    void inv(const hoNDArray<T>& x, hoNDArray<T>& r)
+    {
+        if ( !r.dimensions_equal(&x) )
+        {
+            r = x;
+        }
+
+        size_t N = x.get_number_of_elements();
+        const T* pX = x.begin();
+        T* pR = r.begin();
+
+        T v(1.0);
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, pX, pR, v) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            pR[n] = v/pX[n];
+        }
+    }
+
+    template EXPORTCPUCOREMATH void inv(const hoNDArray<float>& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH void inv(const hoNDArray<double>& x, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH void inv(const hoNDArray< std::complex<float> >& x, hoNDArray< std::complex<float> >& r);
+    template EXPORTCPUCOREMATH void inv(const hoNDArray< std::complex<double> >& x, hoNDArray< std::complex<double> >& r);
+
+    // --------------------------------------------------------------------------------
+
+    template<typename T> 
+    void conv2(size_t RO, size_t E1, size_t num, const T* x, size_t kRO, size_t kE1, const T* y, T* z)
+    {
+        try
+        {
+            long long halfKRO = (long long)(kRO/2);
+            long long halfKE1 = (long long)(kE1/2);
+
+            hoNDArray<T> flipY(2*halfKRO+1, 2*halfKE1+1);
+            T* pKer = flipY.begin();
+
+            long long n;
+            long long ro, e1;
+
+            // flip the kernel
+            for ( e1=0; e1<(long long)kE1; e1++ )
+            {
+                long long flip_e1 = 2*halfKE1 - e1;
+
+                for ( ro=0; ro<(long long)kRO; ro++ )
+                {
+                    long long flip_ro = 2*halfKRO - ro;
+
+                    flipY(flip_ro, flip_e1) = y[ro+e1*kRO];
+                }
+            }
+
+            // perform the convolution
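+            // boundary handling is circular: out-of-range indices wrap around modulo RO / E1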
+            #pragma omp parallel for default(none) private(n, ro, e1) shared(num, x, RO, E1, z, halfKRO, halfKE1, pKer)
+            for ( n=0; n<(long long)num; n++ )
+            {
+                const T* pX = x + n*RO*E1;
+                T* pZ = z + n*RO*E1;
+
+                long long kro, ke1, dro, de1;
+
+                for ( e1=0; e1<(long long)E1; e1++ )
+                {
+                    for ( ro=0; ro<(long long)RO; ro++ )
+                    {
+                        pZ[ro + e1*RO] = 0;
+                        for ( ke1=-halfKE1; ke1<=halfKE1; ke1++ )
+                        {
+                            de1 = ke1 + e1;
+                            if ( de1 < 0 )
+                            {
+                                de1 += E1;
+                            }
+                            else if ( de1 >= (long long)E1 )
+                            {
+                                de1 -= E1;
+                            }
+
+                            for ( kro=-halfKRO; kro<=halfKRO; kro++ )
+                            {
+                                dro = kro + ro;
+                                if ( dro < 0 )
+                                {
+                                    dro += RO;
+                                }
+                                else if ( dro >= (long long)RO )
+                                {
+                                    dro -= RO;
+                                }
+
+                                pZ[ro + e1*RO] += pKer[ kro+halfKRO + (ke1+halfKE1) * (2*halfKRO+1) ] * pX[dro + de1*RO];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors happened in conv2(size_t RO, size_t E1, size_t num, const T* x, size_t kRO, size_t kE1, const T* y, T* z) ... ");
+        }
+    }
+
+    template<typename T> 
+    void conv2(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& z)
+    {
+        try
+        {
+            if ( !z.dimensions_equal(&x) )
+            {
+                z = x;
+            }
+
+            long long RO = (long long) x.get_size(0);
+            long long E1 = (long long) x.get_size(1);
+            long long num = ((long long) x.get_number_of_elements()) / (RO*E1);
+
+            long long kRO = (long long) y.get_size(0);
+            long long kE1 = (long long) y.get_size(1);
+
+            conv2(RO, E1, num, x.begin(), kRO, kE1, y.begin(), z.begin());
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors happened in conv2(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& z) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void conv2(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& z);
+    template EXPORTCPUCOREMATH void conv2(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& z);
+    template EXPORTCPUCOREMATH void conv2(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y, hoNDArray< std::complex<float> >& z);
+    template EXPORTCPUCOREMATH void conv2(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y, hoNDArray< std::complex<double> >& z);
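+
+    // a minimal usage sketch for conv2 (sizes illustrative): convolve each 2D slice of x with the
+    // kernel y (the kernel is flipped internally, i.e. true convolution), wrapping circularly at
+    // the borders; z is resized to match x if needed
+    //     hoNDArray<float> x(192, 144), y(3, 3), z;
+    //     // ... fill x and y ...
+    //     conv2(x, y, z);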
+
+    // --------------------------------------------------------------------------------
+
+    template<typename T> 
+    void conv3(size_t RO, size_t E1, size_t E2, size_t num, const T* x, size_t kRO, size_t kE1, size_t kE2, const T* y, T* z)
+    {
+        try
+        {
+            long long halfKRO = (long long)(kRO/2);
+            long long halfKE1 = (long long)(kE1/2);
+            long long halfKE2 = (long long)(kE2/2);
+
+            hoNDArray<T> flipY(2*halfKRO+1, 2*halfKE1+1, 2*halfKE2+1);
+            T* pKer = flipY.begin();
+
+            long long n, e2;
+            long long ro, e1;
+
+            // flip the kernel
+            for ( e2=0; e2<(long long)kE2; e2++ )
+            {
+                long long flip_e2 = 2*halfKE2 - e2;
+
+                for ( e1=0; e1<(long long)kE1; e1++ )
+                {
+                    long long flip_e1 = 2*halfKE1 - e1;
+
+                    for ( ro=0; ro<(long long)kRO; ro++ )
+                    {
+                        long long flip_ro = 2*halfKRO - ro;
+
+                        flipY(flip_ro, flip_e1, flip_e2) = y[ro+e1*kRO+e2*kRO*kE1];
+                    }
+                }
+            }
+
+            // perform the convolution
+            #pragma omp parallel for default(none) private(n) shared(num, x, RO, E1, E2, z, halfKRO, halfKE1, halfKE2, pKer) if ( num > 8 )
+            for ( n=0; n<(long long)num; n++ )
+            {
+                const T* pX = x + n*RO*E1*E2;
+                T* pZ = z + n*RO*E1*E2;
+
+                long long kro, ke1, ke2, dro, de1, de2;
+
+                #pragma omp parallel for default(none) private(ro, e1, e2, kro, ke1, ke2, dro, de1, de2) shared(pX, RO, E1, E2, pZ, halfKRO, halfKE1, halfKE2, pKer)
+                for ( e2=0; e2<(long long)E2; e2++ )
+                {
+                    for ( e1=0; e1<(long long)E1; e1++ )
+                    {
+                        for ( ro=0; ro<(long long)RO; ro++ )
+                        {
+                            pZ[ro + e1*RO + e2*RO*E1] = 0;
+                            for ( ke2=-halfKE2; ke2<=halfKE2; ke2++ )
+                            {
+                                de2 = ke2 + e2;
+                                if ( de2 < 0 )
+                                {
+                                    de2 += E2;
+                                }
+                                else if ( de2 >= (long long)E2 )
+                                {
+                                    de2 -= E2;
+                                }
+
+                                for ( ke1=-halfKE1; ke1<=halfKE1; ke1++ )
+                                {
+                                    de1 = ke1 + e1;
+                                    if ( de1 < 0 )
+                                    {
+                                        de1 += E1;
+                                    }
+                                    else if ( de1 >= (long long)E1 )
+                                    {
+                                        de1 -= E1;
+                                    }
+
+                                    for ( kro=-halfKRO; kro<=halfKRO; kro++ )
+                                    {
+                                        dro = kro + ro;
+                                        if ( dro < 0 )
+                                        {
+                                            dro += RO;
+                                        }
+                                        else if ( dro >= (long long)RO )
+                                        {
+                                            dro -= RO;
+                                        }
+
+                                        pZ[ro + e1*RO + e2*RO*E1] += pKer[ kro+halfKRO + (ke1+halfKE1)*(2*halfKRO+1) + (ke2+halfKE2)*(2*halfKRO+1)*(2*halfKE1+1) ] * pX[dro + de1*RO + de2*RO*E1];
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors happened in conv3(size_t RO, size_t E1, size_t E2, size_t num, const T* x, size_t kRO, size_t kE1, size_t kE2, const T* y, T* z) ... ");
+        }
+    }
+
+    template<typename T> 
+    void conv3(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& z)
+    {
+        try
+        {
+            if ( !z.dimensions_equal(&x) )
+            {
+                z = x;
+            }
+
+            long long RO = (long long) x.get_size(0);
+            long long E1 = (long long) x.get_size(1);
+            long long E2 = (long long) x.get_size(2);
+            long long num = ((long long)x.get_number_of_elements()) / (RO*E1*E2);
+
+            long long kRO = (long long) y.get_size(0);
+            long long kE1 = (long long) y.get_size(1);
+            long long kE2 = (long long) y.get_size(2);
+
+            conv3(RO, E1, E2, num, x.begin(), kRO, kE1, kE2, y.begin(), z.begin());
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors happened in conv3(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& z) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void conv3(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& z);
+    template EXPORTCPUCOREMATH void conv3(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& z);
+    template EXPORTCPUCOREMATH void conv3(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y, hoNDArray< std::complex<float> >& z);
+    template EXPORTCPUCOREMATH void conv3(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y, hoNDArray< std::complex<double> >& z);
+
+    // --------------------------------------------------------------------------------
+
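+    // axpy: element-wise r[n] = a * x[n] + y[n]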
+    inline void axpy(float a, size_t N, const float* x, const float* y, float* r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, r, a , x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            r[n] = a*x[n] + y[n];
+        }
+    }
+
+    inline void axpy(double a, size_t N, const double* x, const double* y, double* r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, r, a , x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            r[n] = a*x[n] + y[n];
+        }
+    }
+
+    inline void axpy( std::complex<float>  a, size_t N, const  std::complex<float> * x, const  std::complex<float> * y,  std::complex<float> * r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, r, a, x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            const  std::complex<float> & vx = x[n];
+            const float re1 = vx.real();
+            const float im1 = vx.imag();
+
+            const  std::complex<float> & vy = y[n];
+            const float re2 = vy.real();
+            const float im2 = vy.imag();
+
+            const float ar = a.real();
+            const float ai = a.imag();
+
+            reinterpret_cast<float(&)[2]>(r[n])[0] = re2 + ar*re1 - ai*im1;
+            reinterpret_cast<float(&)[2]>(r[n])[1] = im2 + ar*im1 + ai*re1;
+        }
+    }
+
+    inline void axpy( std::complex<double>  a, size_t N, const  std::complex<double> * x, const  std::complex<double> * y,  std::complex<double> * r)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, r, a, x, y) if(N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; ++n)
+        {
+            const  std::complex<double> & vx = x[n];
+            const double re1 = vx.real();
+            const double im1 = vx.imag();
+
+            const  std::complex<double> & vy = y[n];
+            const double re2 = vy.real();
+            const double im2 = vy.imag();
+
+            const double ar = a.real();
+            const double ai = a.imag();
+
+            reinterpret_cast<double(&)[2]>(r[n])[0] = re2 + ar*re1 - ai*im1;
+            reinterpret_cast<double(&)[2]>(r[n])[1] = im2 + ar*im1 + ai*re1;
+        }
+    }
+
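+    // axpy: computes r = a*x + y element-wise; r is sized from y when its size does not match x.
+    // e.g. with a = 2, x = {1, 2} and y = {10, 20}, r becomes {12, 24}.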
+    template <typename T> 
+    void axpy(T a, const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& r)
+    {
+        GADGET_DEBUG_CHECK_THROW(x.get_number_of_elements()==y.get_number_of_elements());
+
+        if ( r.get_number_of_elements() != x.get_number_of_elements() )
+        {
+            r = y;
+        }
+        else
+        {
+            if ( &r != &y )
+            {
+                memcpy(r.begin(), y.begin(), r.get_number_of_bytes());
+            }
+        }
+
+        axpy(a, x.get_number_of_elements(), x.begin(), y.begin(), r.begin());
+    }
+
+    template EXPORTCPUCOREMATH void axpy(float a, const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH void axpy(double a, const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH void axpy( std::complex<float>  a, const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y, hoNDArray< std::complex<float> >& r);
+    template EXPORTCPUCOREMATH void axpy( std::complex<double>  a, const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y, hoNDArray< std::complex<double> >& r);
+
+    // --------------------------------------------------------------------------------
+
+    inline void scal(size_t N, float a, float* x)
+    {
+        long long n;
+        #pragma omp parallel for default(none) private(n) shared(N, x, a) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            x[n] *= a;
+        }
+    }
+
+    inline void scal(size_t N, double a, double* x)
+    {
+        long long n;
+        #pragma omp parallel for default(none) private(n) shared(N, x, a) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            x[n] *= a;
+        }
+    }
+
+    inline void scal(size_t N,  std::complex<float>  a,  std::complex<float> * x)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, a) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            const  std::complex<float> & c = x[n];
+            const float re = c.real();
+            const float im = c.imag();
+
+            const float ar = a.real();
+            const float ai = a.imag();
+
+            reinterpret_cast<float(&)[2]>(x[n])[0] = re*ar-im*ai;
+            reinterpret_cast<float(&)[2]>(x[n])[1] = re*ai+im*ar;
+        }
+    }
+
+    inline void scal(size_t N,  std::complex<double>  a,  std::complex<double> * x)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, a) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            const  std::complex<double> & c = x[n];
+            const double re = c.real();
+            const double im = c.imag();
+
+            const double ar = a.real();
+            const double ai = a.imag();
+
+            reinterpret_cast<double(&)[2]>(x[n])[0] = re*ar-im*ai;
+            reinterpret_cast<double(&)[2]>(x[n])[1] = re*ai+im*ar;
+        }
+    }
+
+    template <typename T> 
+    void scal(T a, hoNDArray<T>& x)
+    {
+        scal(x.get_number_of_elements(), a, x.begin());
+    }
+
+    template EXPORTCPUCOREMATH void scal(float a, hoNDArray<float>& x);
+    template EXPORTCPUCOREMATH void scal(double a, hoNDArray<double>& x);
+    template EXPORTCPUCOREMATH void scal( std::complex<float>  a, hoNDArray< std::complex<float> >& x);
+    template EXPORTCPUCOREMATH void scal( std::complex<double>  a, hoNDArray< std::complex<double> >& x);
+
+    // --------------------------------------------------------------------------------
+
+    inline void scal(size_t N, float a,  std::complex<float> * x)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, a) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            const  std::complex<float> & c = x[n];
+            const float re = c.real();
+            const float im = c.imag();
+
+            reinterpret_cast<float(&)[2]>(x[n])[0] = re*a;
+            reinterpret_cast<float(&)[2]>(x[n])[1] = im*a;
+        }
+    }
+
+    inline void scal(size_t N, double a,  std::complex<double> * x)
+    {
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, x, a) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            const  std::complex<double> & c = x[n];
+            const double re = c.real();
+            const double im = c.imag();
+
+            reinterpret_cast<double(&)[2]>(x[n])[0] = re*a;
+            reinterpret_cast<double(&)[2]>(x[n])[1] = im*a;
+        }
+    }
+
+    template <typename T> 
+    void scal(T a, hoNDArray< std::complex<T> >& x)
+    {
+        scal(x.get_number_of_elements(), a, x.begin());
+    }
+
+    template EXPORTCPUCOREMATH void scal(float a, hoNDArray< std::complex<float> >& x);
+    template EXPORTCPUCOREMATH void scal(double a, hoNDArray< std::complex<double> >& x);
+
+    // --------------------------------------------------------------------------------
+
+    template <typename T> 
+    struct hoCompAscending
+    {
+        bool operator() (T a, T b) { return (a<b); }
+    };
+
+    template <typename T> 
+    struct hoCompDescending
+    {
+        bool operator() (T a, T b) { return (a>b); }
+    };
+
+    template <typename T> 
+    void sort(size_t N, const T* x, T* r, bool isascending)
+    {
+        if ( r != x )
+        {
+            memcpy(r, x, sizeof(T)*N);
+        }
+
+        if ( isascending )
+        {
+            hoCompAscending<T> obj;
+            std::sort(r, r+N, obj);
+        }
+        else
+        {
+            hoCompDescending<T> obj;
+            std::sort(r, r+N, obj);
+        }
+    }
+
+    template <typename T> 
+    void sort(const hoNDArray<T>& x, hoNDArray<T>& r, bool isascending)
+    {
+        if ( &r != &x )
+        {
+            if ( r.get_number_of_elements()!=x.get_number_of_elements())
+            {
+                r = x;
+            }
+            else
+            {
+                memcpy(r.begin(), x.begin(), x.get_number_of_bytes());
+            }
+        }
+
+        sort(x.get_number_of_elements(), x.begin(), r.begin(), isascending);
+    }
+
+    template EXPORTCPUCOREMATH void sort(const hoNDArray<float>& x, hoNDArray<float>& r, bool isascending);
+    template EXPORTCPUCOREMATH void sort(const hoNDArray<double>& x, hoNDArray<double>& r, bool isascending);
+
+// --------------------------------------------------------------------------------
+
+    template<typename T> void fill( hoNDArray<T>* x, T val)
+    {
+        size_t N = x->get_number_of_elements();
+        T* pX = x->begin();
+
+        long long n;
+        #pragma omp parallel for default(none) private(n) shared(N, pX, val) if (N>NumElementsUseThreading)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            pX[n] = val;
+        }
+    }
+
+    template EXPORTCPUCOREMATH void fill( hoNDArray<float>* x, float val);
+    template EXPORTCPUCOREMATH void fill( hoNDArray<double>* x, double val);
+    template EXPORTCPUCOREMATH void fill( hoNDArray< std::complex<float> >* x,  std::complex<float>  val);
+    template EXPORTCPUCOREMATH void fill( hoNDArray< std::complex<double> >* x,  std::complex<double>  val);
+
+    // --------------------------------------------------------------------------------
+
+    template<typename T> void fill( hoNDArray<T>& x, T val )
+    {
+        Gadgetron::fill( &x, val);
+    }
+
+    template EXPORTCPUCOREMATH void fill( hoNDArray<float>& x, float val);
+    template EXPORTCPUCOREMATH void fill( hoNDArray<double>& x, double val);
+    template EXPORTCPUCOREMATH void fill( hoNDArray< std::complex<float> >& x,  std::complex<float>  val);
+    template EXPORTCPUCOREMATH void fill( hoNDArray< std::complex<double> >& x,  std::complex<double>  val);
+
+    // --------------------------------------------------------------------------------
+
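+    // asum: sum of absolute values; for complex arrays the BLAS-style |re| + |im| is accumulated.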
+    inline void asum(size_t N, const float* x, float& r)
+    {
+        long long i;
+        float sum(0);
+        #pragma omp parallel for private(i) reduction(+:sum) if (N>NumElementsUseThreading)
+        for (i = 0; i < (long long)N; i++)
+        {
+            sum += GT_ABS(x[i]);
+        }
+
+        r = sum;
+    }
+
+    inline void asum(size_t N, const double* x, double& r)
+    {
+        long long i;
+        double sum(0);
+        #pragma omp parallel for private(i) reduction(+:sum) if (N>NumElementsUseThreading)
+        for (i = 0; i < (long long)N; i++)
+        {
+            sum += GT_ABS(x[i]);
+        }
+
+        r = sum;
+    }
+
+    inline void asum(size_t N, const  std::complex<float> * x, float& r)
+    {
+        long long i;
+        float sum(0);
+        #pragma omp parallel for private(i) reduction(+:sum) if (N>NumElementsUseThreading)
+        for (i = 0; i < (long long)N; i++)
+        {
+            const  std::complex<float> & c = x[i];
+            const float re = c.real();
+            const float im = c.imag();
+            sum += ( GT_ABS(re) + GT_ABS(im) );
+        }
+
+        r = sum;
+    }
+
+    inline void asum(size_t N, const  std::complex<double> * x, double& r)
+    {
+        long long i;
+        double sum(0);
+        #pragma omp parallel for private(i) reduction(+:sum) if (N>NumElementsUseThreading)
+        for (i = 0; i < (long long)N; i++)
+        {
+            const  std::complex<double> & c = x[i];
+            const double re = c.real();
+            const double im = c.imag();
+            sum += ( GT_ABS(re) + GT_ABS(im) );
+        }
+
+        r = sum;
+    }
+
+    template<class T> void asum(const hoNDArray<T>& x, typename realType<T>::Type& r)
+    {
+        asum(x.get_number_of_elements(), x.begin(), r);
+    }
+
+    template EXPORTCPUCOREMATH void asum( const hoNDArray<float>& x, float& r);
+    template EXPORTCPUCOREMATH void asum( const hoNDArray<double>& x, double& r);
+    template EXPORTCPUCOREMATH void asum( const hoNDArray< std::complex<float> >& x, float& r);
+    template EXPORTCPUCOREMATH void asum( const hoNDArray< std::complex<double> >& x, double& r);
+
+    template<class T> typename realType<T>::Type asum(const hoNDArray<T>& x)
+    {
+        typename realType<T>::Type r;
+        asum(x, r);
+        return r;
+    }
+
+    template EXPORTCPUCOREMATH float asum( const hoNDArray<float>& x);
+    template EXPORTCPUCOREMATH double asum( const hoNDArray<double>& x);
+    template EXPORTCPUCOREMATH float asum( const hoNDArray< std::complex<float> >& x);
+    template EXPORTCPUCOREMATH double asum( const hoNDArray< std::complex<double> >& x);
+
+    // --------------------------------------------------------------------------------
+
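+    // Note: the BLAS i?amax routines used below return a 1-based index (Fortran convention).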
+    inline size_t amax(size_t N, const float* x)
+    {
+        lapack_int num = (lapack_int)(N);
+        lapack_int incx = 1;
+
+        return isamax_(&num, (float*)(x), &incx);
+    }
+
+    inline size_t amax(size_t N, const double* x)
+    {
+        lapack_int num = (lapack_int)(N);
+        lapack_int incx = 1;
+
+        return idamax_(&num, (double*)(x), &incx);
+    }
+
+    inline size_t amax(size_t N, const  std::complex<float> * x)
+    {
+        lapack_int num = (lapack_int)(N);
+        lapack_int incx = 1;
+
+        return icamax_(&num, (lapack_complex_float*)(x), &incx);
+    }
+
+    inline size_t amax(size_t N, const  std::complex<double> * x)
+    {
+        lapack_int num = (lapack_int)(N);
+        lapack_int incx = 1;
+
+        return izamax_(&num, (lapack_complex_double*)(x), &incx);
+    }
+
+    template<class T> size_t amax(const hoNDArray<T>& x)
+    {
+        return amax(x.get_number_of_elements(), x.begin());
+    }
+
+    template EXPORTCPUCOREMATH size_t amax( const hoNDArray<float>& x);
+    template EXPORTCPUCOREMATH size_t amax( const hoNDArray<double>& x);
+    template EXPORTCPUCOREMATH size_t amax( const hoNDArray< std::complex<float> >& x);
+    template EXPORTCPUCOREMATH size_t amax( const hoNDArray< std::complex<double> >& x);
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> 
+    void real_imag_to_complex(const hoNDArray<typename realType<T>::Type>& real, const hoNDArray<typename realType<T>::Type>& imag, hoNDArray<T>& cplx)
+    {
+        try
+        {
+            GADGET_CHECK_THROW(real.dimensions_equal(&imag));
+
+            if ( !cplx.dimensions_equal(&real) )
+            {
+                cplx.create(real.get_dimensions());
+            }
+
+            T* pRes = cplx.begin();
+            const typename realType<T>::Type* pReal = real.begin();
+            const typename realType<T>::Type* pImag = imag.begin();
+
+            size_t N = real.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for private(n) shared(N, pRes, pReal, pImag)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                pRes[n] = T(pReal[n], pImag[n]);
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in real_imag_to_complex(...) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void real_imag_to_complex(const hoNDArray<float>& real, const hoNDArray<float>& imag, hoNDArray< std::complex<float> >& cplx);
+    template EXPORTCPUCOREMATH void real_imag_to_complex(const hoNDArray<double>& real, const hoNDArray<double>& imag, hoNDArray< std::complex<double> >& cplx);
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> 
+    void complex_to_real_imag(const hoNDArray<T>& cplx, hoNDArray<typename realType<T>::Type>& real, hoNDArray<typename realType<T>::Type>& imag)
+    {
+        try
+        {
+            if ( !real.dimensions_equal(&cplx) )
+            {
+                real.create(cplx.get_dimensions());
+            }
+
+            if ( !imag.dimensions_equal(&cplx) )
+            {
+                imag.create(cplx.get_dimensions());
+            }
+
+            const T* pRes = cplx.begin();
+            typename realType<T>::Type* pReal = real.begin();
+            typename realType<T>::Type* pImag = imag.begin();
+
+            size_t N = real.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for default(none) private(n) shared(N, pRes, pReal, pImag)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                pReal[n] = pRes[n].real();
+                pImag[n] = pRes[n].imag();
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in complex_to_real_imag(...) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void complex_to_real_imag(const hoNDArray< std::complex<float> >& cplx, hoNDArray<float>& real, hoNDArray<float>& imag);
+    template EXPORTCPUCOREMATH void complex_to_real_imag(const hoNDArray< std::complex<double> >& cplx, hoNDArray<double>& real, hoNDArray<double>& imag);
+
+    template <> EXPORTCPUCOREMATH
+    void complex_to_real_imag(const hoNDArray<float>& cplx, hoNDArray<float>& real, hoNDArray<float>& imag)
+    {
+        try
+        {
+            if ( !real.dimensions_equal(&cplx) )
+            {
+                real.create(cplx.get_dimensions());
+            }
+
+            if ( !imag.dimensions_equal(&cplx) )
+            {
+                imag.create(cplx.get_dimensions());
+            }
+
+            const float* pRes = cplx.begin();
+            float* pReal = real.begin();
+            float* pImag = imag.begin();
+
+            size_t N = real.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for default(none) private(n) shared(N, pRes, pReal, pImag)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                pReal[n] = pRes[n];
+                pImag[n] = 0;
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in complex_to_real_imag(...) ... ");
+        }
+    }
+
+    template<> EXPORTCPUCOREMATH 
+    void complex_to_real_imag(const hoNDArray<double>& cplx, hoNDArray<double>& real, hoNDArray<double>& imag)
+    {
+        try
+        {
+            if ( !real.dimensions_equal(&cplx) )
+            {
+                real.create(cplx.get_dimensions());
+            }
+
+            if ( !imag.dimensions_equal(&cplx) )
+            {
+                imag.create(cplx.get_dimensions());
+            }
+
+            const double* pRes = cplx.begin();
+            double* pReal = real.begin();
+            double* pImag = imag.begin();
+
+            size_t N = real.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for default(none) private(n) shared(N, pRes, pReal, pImag)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                pReal[n] = pRes[n];
+                pImag[n] = 0;
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in complex_to_real_imag(...) ... ");
+        }
+    }
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> 
+    void complex_to_real(const hoNDArray<T>& cplx, hoNDArray<typename realType<T>::Type>& real)
+    {
+        try
+        {
+            if ( !real.dimensions_equal(&cplx) )
+            {
+                real.create(cplx.get_dimensions());
+            }
+
+            const T* pRes = cplx.begin();
+            typename realType<T>::Type* pReal = real.begin();
+
+            size_t N = real.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for default(none) private(n) shared(N, pRes, pReal)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                pReal[n] = pRes[n].real();
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in complex_to_real(...) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void complex_to_real(const hoNDArray< std::complex<float> >& cplx, hoNDArray<float>& real);
+    template EXPORTCPUCOREMATH void complex_to_real(const hoNDArray< std::complex<double> >& cplx, hoNDArray<double>& real);
+
+    template<class T> 
+    void complex_to_real(const hoNDArray<T>& cplx, hoNDArray<T>& real)
+    {
+        try
+        {
+            if ( !real.dimensions_equal(&cplx) )
+            {
+                real.create(cplx.get_dimensions());
+            }
+
+            const T* pRes = cplx.begin();
+            T* pReal = real.begin();
+
+            size_t N = real.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for private(n) shared(N, pRes, pReal)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                pReal[n] = T(pRes[n].real(), 0);
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in complex_to_real(...) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void complex_to_real(const hoNDArray< std::complex<float> >& cplx, hoNDArray< std::complex<float> >& real);
+    template EXPORTCPUCOREMATH void complex_to_real(const hoNDArray< std::complex<double> >& cplx, hoNDArray< std::complex<double> >& real);
+
+    template<class T> 
+    void complex_to_real(hoNDArray<T>& cplx)
+    {
+        try
+        {
+            T* pRes = cplx.begin();
+
+            size_t N = cplx.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for private(n) shared(N, pRes)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                pRes[n] = T(pRes[n].real(), 0);
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in complex_to_real(...) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void complex_to_real(hoNDArray< std::complex<float> >& cplx);
+    template EXPORTCPUCOREMATH void complex_to_real(hoNDArray< std::complex<double> >& cplx);
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> 
+    void complex_to_imag(const hoNDArray<T>& cplx, hoNDArray<typename realType<T>::Type>& imag)
+    {
+        try
+        {
+            if ( !imag.dimensions_equal(&cplx) )
+            {
+                imag.create(cplx.get_dimensions());
+            }
+
+            const T* pRes = cplx.begin();
+            typename realType<T>::Type* pImag = imag.begin();
+
+            size_t N = imag.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for default(none) private(n) shared(N, pRes, pImag)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                pImag[n] = pRes[n].imag();
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in complex_to_imag(...) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void complex_to_imag(const hoNDArray< std::complex<float> >& cplx, hoNDArray<float>& imag);
+    template EXPORTCPUCOREMATH void complex_to_imag(const hoNDArray< std::complex<double> >& cplx, hoNDArray<double>& imag);
+
+    template<class T> 
+    void complex_to_imag(const hoNDArray<T>& cplx, hoNDArray<T>& imag)
+    {
+        try
+        {
+            if ( !imag.dimensions_equal(&cplx) )
+            {
+                imag.create(cplx.get_dimensions());
+            }
+
+            const T* pRes = cplx.begin();
+            T* pImag = imag.begin();
+
+            size_t N = imag.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for private(n) shared(N, pRes, pImag)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                pImag[n] = T(0, pRes[n].imag());
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in complex_to_imag(...) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void complex_to_imag(const hoNDArray< std::complex<float> >& cplx, hoNDArray< std::complex<float> >& imag);
+    template EXPORTCPUCOREMATH void complex_to_imag(const hoNDArray< std::complex<double> >& cplx, hoNDArray< std::complex<double> >& imag);
+
+    template<class T> 
+    void complex_to_imag(hoNDArray<T>& cplx)
+    {
+        try
+        {
+            T* pRes = cplx.begin();
+
+            size_t N = cplx.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for private(n) shared(N, pRes)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                pRes[n] = T(0, pRes[n].imag());
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in complex_to_imag(...) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void complex_to_imag(hoNDArray< std::complex<float> >& cplx);
+    template EXPORTCPUCOREMATH void complex_to_imag(hoNDArray< std::complex<double> >& cplx);
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> 
+    void real_to_complex(const hoNDArray<typename realType<T>::Type>& real, hoNDArray<T>& cplx)
+    {
+        try
+        {
+            if ( !cplx.dimensions_equal(&real) )
+            {
+                cplx.create(real.get_dimensions());
+            }
+
+            const typename realType<T>::Type* pReal = real.begin();
+            T* pRes = cplx.begin();
+
+            size_t N = real.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for private(n) shared(N, pRes, pReal)
+            for ( n=0; n<(long long)N; n++ )
+            {
+                pRes[n] = T(pReal[n], 0);
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in real_to_complex(...) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void real_to_complex(const hoNDArray< float >& real, hoNDArray< std::complex<float> >& cplx);
+    template EXPORTCPUCOREMATH void real_to_complex(const hoNDArray< double >& real, hoNDArray< std::complex<double> >& cplx);
+
+    // --------------------------------------------------------------------------------
+
+    template <class T>
+    void minValue(const hoNDArray<T>& a, T& v)
+    {
+        typedef T ValueType;
+
+        try
+        {
+            const ValueType* pA = a.begin();
+            size_t n = a.get_number_of_elements();
+            v = pA[0];
+
+            size_t ii;
+            for (ii=1; ii<n; ii++)
+            {
+                if (pA[ii]<v) v = pA[ii];
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in minValue(const hoNDArray<T>& a, T& v) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void minValue(const hoNDArray<float>& a, float& v);
+    template EXPORTCPUCOREMATH void minValue(const hoNDArray<double>& a, double& v);
+
+    template <class T>
+    void maxValue(const hoNDArray<T>& a, T& v)
+    {
+        typedef T ValueType;
+
+        try
+        {
+            const ValueType* pA = a.begin();
+            size_t n = a.get_number_of_elements();
+            v = pA[0];
+
+            size_t ii;
+            for (ii=1; ii<n; ii++)
+            {
+                if (pA[ii]>v) v = pA[ii];
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in maxValue(const hoNDArray<T>& a, T& v) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void maxValue(const hoNDArray<float>& a, float& v);
+    template EXPORTCPUCOREMATH void maxValue(const hoNDArray<double>& a, double& v);
+
+    // --------------------------------------------------------------------------------
+}
diff --git a/toolboxes/core/cpu/math/hoNDArray_math_util.h b/toolboxes/core/cpu/math/hoNDArray_math_util.h
new file mode 100644
index 0000000..64d7acd
--- /dev/null
+++ b/toolboxes/core/cpu/math/hoNDArray_math_util.h
@@ -0,0 +1,27 @@
+/** \file  hoNDArray_math_util.h
+    \brief Math functions for hoNDArray and hoNDImage that do not use Armadillo
+*/
+
+#pragma once
+
+#include "hoNDArray.h"
+#include "ho2DArray.h"
+#include "ho3DArray.h"
+#include "ho4DArray.h"
+#include "ho5DArray.h"
+#include "ho6DArray.h"
+#include "ho7DArray.h"
+#include "hoNDImage.h"
+
+#include "complext.h"
+#include "cpucore_math_export.h"
+#include "GadgetronCommon.h"
+#include <complex>
+
+namespace Gadgetron
+{
+
+    
+
+    
+}
diff --git a/toolboxes/core/cpu/math/hoNDArray_reductions.cpp b/toolboxes/core/cpu/math/hoNDArray_reductions.cpp
new file mode 100644
index 0000000..8f4a4cc
--- /dev/null
+++ b/toolboxes/core/cpu/math/hoNDArray_reductions.cpp
@@ -0,0 +1,939 @@
+#include "hoNDArray_reductions.h"
+#include "hoArmadillo.h"
+
+#ifndef lapack_int
+    #define lapack_int int
+#endif // lapack_int
+
+#ifndef lapack_complex_float
+    #define lapack_complex_float  std::complex<float> 
+#endif // lapack_complex_float
+
+#ifndef lapack_complex_double
+    #define lapack_complex_double  std::complex<double> 
+#endif // #ifndef lapack_complex_double
+
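+// OpenMP threading is only engaged for arrays with more elements than this threshold (see the if() clauses below).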
+#define NumElementsUseThreading 64*1024
+
+// Declarations of the BLAS routines used below
+extern "C"
+{
+    /// Finds the index of the element with the maximal absolute value.
+    lapack_int isamax_(lapack_int* N, float* x, lapack_int* incx);
+    lapack_int idamax_(lapack_int* N, double* x, lapack_int* incx);
+    lapack_int icamax_(lapack_int* N, lapack_complex_float* x, lapack_int* incx);
+    lapack_int izamax_(lapack_int* N, lapack_complex_double* x, lapack_int* incx);
+}
+
+namespace Gadgetron{
+
+    // --------------------------------------------------------------------------------
+
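+    // max/min/mean/sum/stddev delegate to Armadillo, operating on a flattened column view of the array.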
+    template<class REAL> REAL max(hoNDArray<REAL>* data){
+        return as_arma_col(data).max();
+    }
+
+    // --------------------------------------------------------------------------------
+
+    template<class REAL> REAL min(hoNDArray<REAL>* data){
+        return as_arma_col(data).min();
+    }
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> T mean(hoNDArray<T>* data){
+        return (typename stdType<T>::Type) arma::mean(as_arma_col(data));
+    }
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> T sum(hoNDArray<T>* data){
+        return (typename stdType<T>::Type) arma::sum(as_arma_col(data));
+    }
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> T stddev(hoNDArray<T>* data){
+        return (typename stdType<T>::Type) arma::stddev(as_arma_col(data));
+    }
+
+    // --------------------------------------------------------------------------------
+
+    template<class T> T dot( hoNDArray<T> *x, hoNDArray<T> *y, bool cc )
+    {
+        if( x == 0x0 || y == 0x0 )
+            throw std::runtime_error("Gadgetron::dot(): Invalid input array");
+
+        if( x->get_number_of_elements() != y->get_number_of_elements() )
+            throw std::runtime_error("Gadgetron::dot(): Array sizes mismatch");
+
+        arma::Col<typename stdType<T>::Type> xM = as_arma_col(x);
+        arma::Col<typename stdType<T>::Type> yM = as_arma_col(y);
+        typename stdType<T>::Type res = (cc) ? arma::cdot(xM,yM) : arma::dot(xM,yM);
+        return *((T*)(&res));
+    }
+
+    // --------------------------------------------------------------------------------
+
+    inline void asum(size_t N, const float* x, float& r)
+    {
+        long long i;
+        float sum(0);
+        #pragma omp parallel for private(i) reduction(+:sum) if (N>NumElementsUseThreading)
+        for (i = 0; i < (long long)N; i++)
+        {
+            sum += std::abs(x[i]);
+        }
+
+        r = sum;
+    }
+
+    inline void asum(size_t N, const double* x, double& r)
+    {
+        long long i;
+        double sum(0);
+        #pragma omp parallel for private(i) reduction(+:sum) if (N>NumElementsUseThreading)
+        for (i = 0; i < (long long)N; i++)
+        {
+            sum += std::abs(x[i]);
+        }
+
+        r = sum;
+    }
+
+    inline void asum(size_t N, const  std::complex<float> * x, float& r)
+    {
+        long long i;
+        float sum(0);
+        #pragma omp parallel for private(i) reduction(+:sum) if (N>NumElementsUseThreading)
+        for (i = 0; i < (long long)N; i++)
+        {
+            const  std::complex<float> & c = x[i];
+            const float re = c.real();
+            const float im = c.imag();
+            sum += ( std::abs(re) + std::abs(im) );
+        }
+
+        r = sum;
+    }
+
+    inline void asum(size_t N, const  std::complex<double> * x, double& r)
+    {
+        long long i;
+        double sum(0);
+        #pragma omp parallel for private(i) reduction(+:sum) if (N>NumElementsUseThreading)
+        for (i = 0; i < (long long)N; i++)
+        {
+            const  std::complex<double> & c = x[i];
+            const double re = c.real();
+            const double im = c.imag();
+            sum += ( std::abs(re) + std::abs(im) );
+        }
+
+        r = sum;
+    }
+
+    template<class T> void asum(const hoNDArray<T>& x, typename realType<T>::Type& r)
+    {
+        asum(x.get_number_of_elements(), x.begin(), r);
+    }
+
+    template EXPORTCPUCOREMATH void asum( const hoNDArray<float>& x, float& r);
+    template EXPORTCPUCOREMATH void asum( const hoNDArray<double>& x, double& r);
+    template EXPORTCPUCOREMATH void asum( const hoNDArray< std::complex<float> >& x, float& r);
+    template EXPORTCPUCOREMATH void asum( const hoNDArray< std::complex<double> >& x, double& r);
+
+    template<class T> typename realType<T>::Type asum(const hoNDArray<T>& x)
+    {
+        typename realType<T>::Type r;
+        asum(x, r);
+        return r;
+    }
+
+    template EXPORTCPUCOREMATH float asum( const hoNDArray<float>& x);
+    template EXPORTCPUCOREMATH double asum( const hoNDArray<double>& x);
+    template EXPORTCPUCOREMATH float asum( const hoNDArray< std::complex<float> >& x);
+    template EXPORTCPUCOREMATH double asum( const hoNDArray< std::complex<double> >& x);
+
+    template<class T> typename realType<T>::Type asum( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::asum(): Invalid input array");
+
+        typedef typename realType<T>::Type realT;
+        arma::Col<typename stdType<T>::Type> xM = as_arma_col(x);
+        return realT(arma::norm(xM,1));
+    }
+
+    template<class T> T asum( hoNDArray< std::complex<T> > *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::asum(): Invalid input array");
+
+        return arma::norm(arma::abs(real(as_arma_col(x)))+arma::abs(imag(as_arma_col(x))),1);
+    }
+
+    template<class T> T asum( hoNDArray< complext<T> > *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::asum(): Invalid input array");
+
+        return arma::norm(arma::abs(real(as_arma_col(x)))+arma::abs(imag(as_arma_col(x))),1);
+    }
+
+    // --------------------------------------------------------------------------------
+
+    template <typename T> inline 
+    void norm1(size_t N, const T* x, typename realType<T>::Type& r)
+    {
+        long long n;
+
+        typename realType<T>::Type norm1Sum(0);
+
+        #pragma omp parallel for private(n) reduction(+:norm1Sum) if (N>NumElementsUseThreading)
+        for (n=0; n<(long long)N; n++)
+        {
+            const T& c = x[n];
+            norm1Sum += std::abs(c);
+        }
+
+        r = norm1Sum;
+    }
+
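+    // For complex arrays, norm1 sums the element magnitudes |x[n]| = sqrt(re^2 + im^2),
+    // whereas asum above accumulates the BLAS-style |re| + |im|.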
+    inline void norm1(size_t N, const  std::complex<float> * x, float& r)
+    {
+        long long i;
+        float sum = 0.0f;
+        #pragma omp parallel for private(i) reduction(+:sum) if (N>NumElementsUseThreading)
+        for (i = 0; i < (long long)N; i++)
+        {
+            const std::complex<float>& c = x[i];
+            const float re = c.real();
+            const float im = c.imag();
+            sum += std::sqrt( (re*re) + (im * im) );
+        }
+
+        r = sum;
+    }
+
+    inline void norm1(size_t N, const  complext<float> * x, float& r)
+    {
+        norm1(N, (std::complex<float> *)x, r);
+    }
+
+    inline void norm1(size_t N, const  std::complex<double> * x, double& r)
+    {
+        long long i;
+        double sum = 0.0;
+        #pragma omp parallel for private(i) reduction(+:sum) if (N>NumElementsUseThreading)
+        for (i = 0; i < (long long)N; i++)
+        {
+            const std::complex<double>& c = x[i];
+            const double re = c.real();
+            const double im = c.imag();
+            sum += std::sqrt( (re*re) + (im * im) );
+        }
+
+        r = sum;
+    }
+
+    inline void norm1(size_t N, const  complext<double> * x, double& r)
+    {
+        norm1(N, (std::complex<double> *)x, r);
+    }
+
+    template <typename T> 
+    void norm1(const hoNDArray<T>& x, typename realType<T>::Type& r)
+    {
+        norm1(x.get_number_of_elements(), x.begin(), r);
+    }
+
+    template EXPORTCPUCOREMATH void norm1(const hoNDArray<float>& x, float& r);
+    template EXPORTCPUCOREMATH void norm1(const hoNDArray<double>& x, double& r);
+    template EXPORTCPUCOREMATH void norm1(const hoNDArray< std::complex<float> >& x, float& r);
+    template EXPORTCPUCOREMATH void norm1(const hoNDArray< complext<float> >& x, float& r);
+    template EXPORTCPUCOREMATH void norm1(const hoNDArray< std::complex<double> >& x, double& r);
+    template EXPORTCPUCOREMATH void norm1(const hoNDArray< complext<double> >& x, double& r);
+
+    template <typename T> inline 
+    typename realType<T>::Type norm1(const hoNDArray<T>& x)
+    {
+        typename realType<T>::Type r;
+        norm1(x, r);
+        return r;
+    }
+
+    template EXPORTCPUCOREMATH float norm1(const hoNDArray<float>& x);
+    template EXPORTCPUCOREMATH double norm1(const hoNDArray<double>& x);
+    template EXPORTCPUCOREMATH float norm1(const hoNDArray< std::complex<float> >& x);
+    template EXPORTCPUCOREMATH float norm1(const hoNDArray< complext<float> >& x);
+    template EXPORTCPUCOREMATH double norm1(const hoNDArray< std::complex<double> >& x);
+    template EXPORTCPUCOREMATH double norm1(const hoNDArray< complext<double> >& x);
+
+    template<class T> typename realType<T>::Type nrm1( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::nrm1(): Invalid input array");
+
+        /*typedef typename realType<T>::Type realT;
+        arma::Col<typename stdType<T>::Type> xM = as_arma_col(x);
+        return realT(arma::norm(xM,1));*/
+
+        return norm1(*x);
+    }
+
+    // --------------------------------------------------------------------------------
+
+    inline void norm2(size_t N, const float* x, float& r)
+    {
+        long long i;
+
+        float sum(0);
+
+        #pragma omp parallel for private(i) reduction(+:sum) if (N>NumElementsUseThreading)
+        for (i = 0; i < (long long)N; i++)
+        {
+            const float& re = x[i];
+            sum += ( re*re );
+        }
+
+        r = std::sqrt(sum);
+    }
+
+    inline void norm2(size_t N, const double* x, double& r)
+    {
+        long long i;
+
+        double sum(0);
+
+        #pragma omp parallel for private(i) reduction(+:sum) if (N>NumElementsUseThreading)
+        for (i = 0; i < (long long)N; i++)
+        {
+            const double& re = x[i];
+            sum += ( re*re );
+        }
+
+        r = std::sqrt(sum);
+    }
+
+    inline void norm2(size_t N, const  std::complex<float> * x, float& r)
+    {
+        long long i;
+
+        float sum(0);
+
+        #pragma omp parallel for private(i) reduction(+:sum) if (N>NumElementsUseThreading)
+        for (i = 0; i < (long long)N; i++)
+        {
+            const std::complex<float>& c = x[i];
+            const float re = c.real();
+            const float im = c.imag();
+            sum += ( (re*re) + (im * im) );
+        }
+
+        r = std::sqrt(sum);
+    }
+
+    inline void norm2(size_t N, const  complext<float> * x, float& r)
+    {
+        norm2(N, (std::complex<float> *)x, r);
+    }
+
+    inline void norm2(size_t N, const  std::complex<double> * x, double& r)
+    {
+        long long i;
+
+        double sum(0);
+
+        #pragma omp parallel for private(i) reduction(+:sum) if (N>NumElementsUseThreading)
+        for (i = 0; i < (long long)N; i++)
+        {
+            const std::complex<double>& c = x[i];
+            const double re = c.real();
+            const double im = c.imag();
+            sum += ( (re*re) + (im * im) );
+        }
+
+        r = std::sqrt(sum);
+    }
+
+    inline void norm2(size_t N, const  complext<double> * x, double& r)
+    {
+        norm2(N, (std::complex<double> *)x, r);
+    }
+
+    template <typename T> 
+    void norm2(const hoNDArray<T>& x, typename realType<T>::Type& r)
+    {
+        norm2(x.get_number_of_elements(), x.begin(), r);
+    }
+
+    template EXPORTCPUCOREMATH void norm2(const hoNDArray<float>& x, float& r);
+    template EXPORTCPUCOREMATH void norm2(const hoNDArray<double>& x, double& r);
+    template EXPORTCPUCOREMATH void norm2(const hoNDArray< std::complex<float> >& x, float& r);
+    template EXPORTCPUCOREMATH void norm2(const hoNDArray< complext<float> >& x, float& r);
+    template EXPORTCPUCOREMATH void norm2(const hoNDArray< std::complex<double> >& x, double& r);
+    template EXPORTCPUCOREMATH void norm2(const hoNDArray< complext<double> >& x, double& r);
+
+    template <typename T> inline 
+    typename realType<T>::Type norm2(const hoNDArray<T>& x)
+    {
+        typename realType<T>::Type r;
+        norm2(x, r);
+        return r;
+    }
+
+    template EXPORTCPUCOREMATH float norm2(const hoNDArray<float>& x);
+    template EXPORTCPUCOREMATH double norm2(const hoNDArray<double>& x);
+    template EXPORTCPUCOREMATH float norm2(const hoNDArray< std::complex<float> >& x);
+    template EXPORTCPUCOREMATH float norm2(const hoNDArray< complext<float> >& x);
+    template EXPORTCPUCOREMATH double norm2(const hoNDArray< std::complex<double> >& x);
+    template EXPORTCPUCOREMATH double norm2(const hoNDArray< complext<double> >& x);
+
+    template<class T> typename realType<T>::Type nrm2( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::nrm2(): Invalid input array");
+
+        /*typedef typename realType<T>::Type realT;
+        arma::Col<typename stdType<T>::Type> xM = as_arma_col(x);
+        return realT(arma::norm(xM,2));*/
+
+        return norm2(*x);
+    }
+
+    // --------------------------------------------------------------------------------
+
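+    // minAbsolute/maxAbsolute: return in r the element of x with the smallest/largest magnitude
+    // and in ind its 0-based index.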
+    template <typename T> 
+    void minAbsolute(const hoNDArray<T>& x, T& r, size_t& ind)
+    {
+        size_t N = x.get_number_of_elements();
+        const T* pX = x.begin();
+
+        ind = 0;
+        if ( N == 0 ) return;
+
+        long long n;
+
+        typename realType<T>::Type v = std::abs(pX[0]);
+        typename realType<T>::Type v2;
+
+        ind = 0;
+        for ( n=1; n<(long long)N; n++ )
+        {
+            v2 = std::abs(pX[n]);
+            if ( v2 < v )
+            {
+                v = v2;
+                ind = n;
+            }
+        }
+
+        r = pX[ind];
+    }
+
+    template EXPORTCPUCOREMATH void minAbsolute(const hoNDArray<float>& x, float& r, size_t& ind);
+    template EXPORTCPUCOREMATH void minAbsolute(const hoNDArray<double>& x, double& r, size_t& ind);
+    template EXPORTCPUCOREMATH void minAbsolute(const hoNDArray< std::complex<float> >& x,  std::complex<float> & r, size_t& ind);
+    template EXPORTCPUCOREMATH void minAbsolute(const hoNDArray< std::complex<double> >& x,  std::complex<double> & r, size_t& ind);
+
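+    // amin (pointer overloads): 0-based index of the element minimizing |x| (or |re|+|im| for complex types);
+    // note that the BLAS-based amax overloads below return 1-based indices instead.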
+    template<class T> size_t amin( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::amin(): Invalid input array");
+
+        typedef typename realType<T>::Type realT;
+        arma::Col<realT> xM = arma::abs(as_arma_col(x));
+        arma::uword idx;
+        realT min = xM.min(idx);
+        return idx;
+    }
+
+    template<class T> size_t amin( hoNDArray< std::complex<T> > *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::amin(): Invalid input array");
+
+        arma::Col<T> xM = arma::abs(real(as_arma_col(x)))+arma::abs(imag(as_arma_col(x)));
+        arma::uword idx;
+        T min = xM.min(idx);
+        return idx;
+    }
+
+    template<class T> size_t amin( hoNDArray< complext<T> > *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::amin(): Invalid input array");
+
+        arma::Col<T> xM = arma::abs(real(as_arma_col(x)))+arma::abs(imag(as_arma_col(x)));
+        arma::uword idx;
+        T min = xM.min(idx);
+        return idx;
+    }
+
+    // --------------------------------------------------------------------------------
+
+    template <typename T> 
+    void maxAbsolute(const hoNDArray<T>& x, T& r, size_t& ind)
+    {
+        size_t N = x.get_number_of_elements();
+        const T* pX = x.begin();
+
+        ind = 0;
+        if ( N == 0 ) return;
+
+        long long n;
+
+        typename realType<T>::Type v = std::abs(pX[0]);
+        typename realType<T>::Type v2;
+
+        ind = 0;
+        for ( n=1; n<(long long)N; n++ )
+        {
+            v2 = std::abs(pX[n]);
+            if ( v2 > v )
+            {
+                v = v2;
+                ind = n;
+            }
+        }
+
+        r = pX[ind];
+    }
+
+    template EXPORTCPUCOREMATH void maxAbsolute(const hoNDArray<float>& x, float& r, size_t& ind);
+    template EXPORTCPUCOREMATH void maxAbsolute(const hoNDArray<double>& x, double& r, size_t& ind);
+    template EXPORTCPUCOREMATH void maxAbsolute(const hoNDArray< std::complex<float> >& x,  std::complex<float> & r, size_t& ind);
+    template EXPORTCPUCOREMATH void maxAbsolute(const hoNDArray< std::complex<double> >& x,  std::complex<double> & r, size_t& ind);
+
+    // --------------------------------------------------------------------------------
+
+    inline size_t amax(size_t N, const float* x)
+    {
+        lapack_int num = (lapack_int)(N);
+        lapack_int incx = 1;
+
+        return isamax_(&num, (float*)(x), &incx);
+    }
+
+    inline size_t amax(size_t N, const double* x)
+    {
+        lapack_int num = (lapack_int)(N);
+        lapack_int incx = 1;
+
+        return idamax_(&num, (double*)(x), &incx);
+    }
+
+    inline size_t amax(size_t N, const  std::complex<float> * x)
+    {
+        lapack_int num = (lapack_int)(N);
+        lapack_int incx = 1;
+
+        return icamax_(&num, (lapack_complex_float*)(x), &incx);
+    }
+
+    inline size_t amax(size_t N, const  std::complex<double> * x)
+    {
+        lapack_int num = (lapack_int)(N);
+        lapack_int incx = 1;
+
+        return izamax_(&num, (lapack_complex_double*)(x), &incx);
+    }
+
+    template<class T> size_t amax(const hoNDArray<T>& x)
+    {
+        return amax(x.get_number_of_elements(), x.begin());
+    }
+
+    template EXPORTCPUCOREMATH size_t amax( const hoNDArray<float>& x);
+    template EXPORTCPUCOREMATH size_t amax( const hoNDArray<double>& x);
+    template EXPORTCPUCOREMATH size_t amax( const hoNDArray< std::complex<float> >& x);
+    template EXPORTCPUCOREMATH size_t amax( const hoNDArray< std::complex<double> >& x);
+
+    template<class T> size_t amax( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::amax(): Invalid input array");
+
+        typedef typename realType<T>::Type realT;
+        arma::Col<realT> xM = arma::abs(as_arma_col(x));
+        arma::uword idx;
+        realT max = xM.max(idx);
+        return idx;
+    }
+
+    template<class T> size_t amax( hoNDArray< std::complex<T> > *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::amax(): Invalid input array");
+
+        arma::Col<T> xM = arma::abs(real(as_arma_col(x)))+arma::abs(imag(as_arma_col(x)));
+        arma::uword idx;
+        T max = xM.max(idx);
+        return idx;
+    }
+
+    template<class T> size_t amax( hoNDArray< complext<T> > *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::amax(): Invalid input array");
+
+        arma::Col<T> xM = arma::abs(real(as_arma_col(x)))+arma::abs(imag(as_arma_col(x)));
+        arma::uword idx;
+        T max = xM.max(idx);
+        return idx;
+    }
+
+    // --------------------------------------------------------------------------------
+
+    template EXPORTCPUCOREMATH float max(hoNDArray<float>*);
+    template EXPORTCPUCOREMATH float min(hoNDArray<float>*);
+    template EXPORTCPUCOREMATH float mean(hoNDArray<float>*);
+    template EXPORTCPUCOREMATH float sum(hoNDArray<float>*);
+    template EXPORTCPUCOREMATH float stddev(hoNDArray<float>*);
+
+    template EXPORTCPUCOREMATH double max(hoNDArray<double>*);
+    template EXPORTCPUCOREMATH double min(hoNDArray<double>*);
+    template EXPORTCPUCOREMATH double mean(hoNDArray<double>*);
+    template EXPORTCPUCOREMATH double sum(hoNDArray<double>*);
+    template EXPORTCPUCOREMATH double stddev(hoNDArray<double>*);
+
+    template EXPORTCPUCOREMATH complext<double> mean(hoNDArray<complext<double> >*);
+    template EXPORTCPUCOREMATH complext<double> sum(hoNDArray<complext<double> >*);
+
+    template EXPORTCPUCOREMATH complext<float> mean(hoNDArray<complext<float> >*);
+    template EXPORTCPUCOREMATH complext<float> sum(hoNDArray<complext<float> >*);
+
+    template EXPORTCPUCOREMATH std::complex<double> mean(hoNDArray<std::complex<double> >*);
+    template EXPORTCPUCOREMATH std::complex<double> sum(hoNDArray<std::complex<double> >*);
+
+    template EXPORTCPUCOREMATH std::complex<float> mean(hoNDArray<std::complex<float> >*);
+    template EXPORTCPUCOREMATH std::complex<float> sum(hoNDArray<std::complex<float> >*);
+
+
+    template EXPORTCPUCOREMATH float dot<float>( hoNDArray<float>*, hoNDArray<float>*, bool );
+    template EXPORTCPUCOREMATH float asum<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH float nrm2<float>( hoNDArray<float>* );
+
+    template EXPORTCPUCOREMATH size_t amin<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH size_t amax<float>( hoNDArray<float>* );
+
+    template EXPORTCPUCOREMATH double dot<double>( hoNDArray<double>*, hoNDArray<double>*, bool );
+    template EXPORTCPUCOREMATH double asum<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH double nrm2<double>( hoNDArray<double>* );
+
+    template EXPORTCPUCOREMATH size_t amin<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH size_t amax<double>( hoNDArray<double>* );
+
+    template EXPORTCPUCOREMATH std::complex<float> dot< std::complex<float> >( hoNDArray< std::complex<float> >*, hoNDArray< std::complex<float> >*, bool );
+    template EXPORTCPUCOREMATH float asum<float>( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH float nrm2< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH float nrm1< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH size_t amin<float>( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH size_t amax<float>( hoNDArray< std::complex<float> >* );
+
+    template EXPORTCPUCOREMATH std::complex<double> dot< std::complex<double> >( hoNDArray< std::complex<double> >*, hoNDArray< std::complex<double> >*, bool );
+    template EXPORTCPUCOREMATH double asum<double>( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH double nrm2< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH double nrm1< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH size_t amin<double>( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH size_t amax<double>( hoNDArray< std::complex<double> >* );
+
+    template EXPORTCPUCOREMATH complext<float> dot< complext<float> >( hoNDArray< complext<float> >*, hoNDArray< complext<float> >*, bool );
+    template EXPORTCPUCOREMATH float asum<float>( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH float nrm2< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH float nrm1< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH size_t amin<float>( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH size_t amax<float>( hoNDArray< complext<float> >* );
+
+    template EXPORTCPUCOREMATH complext<double> dot< complext<double> >( hoNDArray< complext<double> >*, hoNDArray< complext<double> >*, bool );
+    template EXPORTCPUCOREMATH double asum<double>( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH double nrm2< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH size_t amin<double>( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH size_t amax<double>( hoNDArray< complext<double> >* );
+
+    // --------------------------------------------------------------------------------
+
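+    // dotc: conjugated dot product; as implemented it accumulates sum( x[n] * conj(y[n]) ).
+    // e.g. for x = {1+2i} and y = {3+4i}, dotc yields (1*3 + 2*4) + i(3*2 - 1*4) = 11 + 2i.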
+    inline void dotc(size_t N, const  std::complex<float> * x, const  std::complex<float> * y,  std::complex<float> & r)
+    {
+        long long n;
+
+        float sum(0);
+
+        float sa(0), sb(0);
+
+        #pragma omp parallel for private(n) reduction(+:sa) reduction(+:sb) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            const float a = x[n].real();
+            const float b = x[n].imag();
+            const float c = y[n].real();
+            const float d = y[n].imag();
+
+            sa += (a*c + b*d);
+            sb += (c*b - a*d);
+        }
+
+        reinterpret_cast<float(&)[2]>(r)[0] = sa;
+        reinterpret_cast<float(&)[2]>(r)[1] = sb;
+    }
+
+    inline void dotc(size_t N, const  std::complex<double> * x, const  std::complex<double> * y,  std::complex<double> & r)
+    {
+        long long n;
+
+        double sum(0);
+
+        double sa(0), sb(0);
+
+        #pragma omp parallel for private(n) reduction(+:sa) reduction(+:sb) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            const double a = x[n].real();
+            const double b = x[n].imag();
+            const double c = y[n].real();
+            const double d = y[n].imag();
+
+            sa += (a*c + b*d);
+            sb += (c*b - a*d);
+        }
+
+        reinterpret_cast<double(&)[2]>(r)[0] = sa;
+        reinterpret_cast<double(&)[2]>(r)[1] = sb;
+    }
+
+    template <typename T> 
+    void dotc(const hoNDArray<T>& x, const hoNDArray<T>& y, T& r)
+    {
+        GADGET_DEBUG_CHECK_THROW(x.get_number_of_elements()==y.get_number_of_elements());
+        dotc(x.get_number_of_elements(), x.begin(), y.begin(), r);
+    }
+
+    template EXPORTCPUCOREMATH void dotc(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y,  std::complex<float> & r);
+    template EXPORTCPUCOREMATH void dotc(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y,  std::complex<double> & r);
+
+    template <typename T> 
+    T dotc(const hoNDArray<T>& x, const hoNDArray<T>& y)
+    {
+        T r;
+        dotc(x, y, r);
+        return r;
+    }
+
+    template EXPORTCPUCOREMATH std::complex<float> dotc(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y);
+    template EXPORTCPUCOREMATH std::complex<double> dotc(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y);
+
+    // --------------------------------------------------------------------------------
+
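+    // dotu: unconjugated dot product, sum( x[n] * y[n] ).
+    // e.g. for x = {1+2i} and y = {3+4i}, dotu yields (1*3 - 2*4) + i(2*3 + 1*4) = -5 + 10i.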
+    inline void dotu(size_t N, const float* x, const float* y, float& r)
+    {
+        long long n;
+
+        float res(0);
+
+        #pragma omp parallel for private(n) reduction(+:res) if (N>NumElementsUseThreading)
+        for (n=0; n<(long long)N; n++)
+        {
+            res += x[n]*y[n];
+        }
+
+        r = res;
+    }
+
+    inline void dotu(size_t N, const double* x, const double* y, double& r)
+    {
+        long long n;
+
+        double res(0);
+
+        #pragma omp parallel for private(n) reduction(+:res) if (N>NumElementsUseThreading)
+        for (n=0; n<(long long)N; n++)
+        {
+            res += x[n]*y[n];
+        }
+
+        r = res;
+    }
+
+    inline void dotu(size_t N, const  std::complex<float> * x, const  std::complex<float> * y,  std::complex<float> & r)
+    {
+        long long n;
+
+         std::complex<float>  sum(0);
+
+        float sa(0), sb(0);
+        #pragma omp parallel for private(n) reduction(+:sa) reduction(+:sb) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            const float a = x[n].real();
+            const float b = x[n].imag();
+            const float c = y[n].real();
+            const float d = y[n].imag();
+
+            sa += (a*c - b*d);
+            sb += (c*b + a*d);
+        }
+
+        reinterpret_cast<float(&)[2]>(r)[0] = sa;
+        reinterpret_cast<float(&)[2]>(r)[1] = sb;
+    }
+
+    inline void dotu(size_t N, const  std::complex<double> * x, const  std::complex<double> * y,  std::complex<double> & r)
+    {
+        long long n;
+
+         std::complex<double>  sum(0);
+
+        double sa(0), sb(0);
+        #pragma omp parallel for private(n) reduction(+:sa) reduction(+:sb) if (N>NumElementsUseThreading)
+        for (n = 0; n < (long long)N; n++)
+        {
+            const double a = x[n].real();
+            const double b = x[n].imag();
+            const double c = y[n].real();
+            const double d = y[n].imag();
+
+            sa += (a*c - b*d);
+            sb += (c*b + a*d);
+        }
+
+        reinterpret_cast<double(&)[2]>(r)[0] = sa;
+        reinterpret_cast<double(&)[2]>(r)[1] = sb;
+    }
+
+    template <typename T> 
+    void dotu(const hoNDArray<T>& x, const hoNDArray<T>& y, T& r)
+    {
+        GADGET_DEBUG_CHECK_THROW(x.get_number_of_elements()==y.get_number_of_elements());
+        dotu(x.get_number_of_elements(), x.begin(), y.begin(), r);
+    }
+
+    template EXPORTCPUCOREMATH void dotu(const hoNDArray<float>& x, const hoNDArray<float>& y, float& r);
+    template EXPORTCPUCOREMATH void dotu(const hoNDArray<double>& x, const hoNDArray<double>& y, double& r);
+    template EXPORTCPUCOREMATH void dotu(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y, std::complex<float>& r);
+    template EXPORTCPUCOREMATH void dotu(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y, std::complex<double>& r);
+
+    template <typename T> 
+    T dotu(const hoNDArray<T>& x, const hoNDArray<T>& y)
+    {
+        T r = 0;
+        dotu(x, y, r);
+        return r;
+    }
+
+    template EXPORTCPUCOREMATH float dotu(const hoNDArray<float>& x, const hoNDArray<float>& y);
+    template EXPORTCPUCOREMATH double dotu(const hoNDArray<double>& x, const hoNDArray<double>& y);
+    template EXPORTCPUCOREMATH  std::complex<float>  dotu(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& y);
+    template EXPORTCPUCOREMATH  std::complex<double>  dotu(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& y);
+
+    // --------------------------------------------------------------------------------
+
+    template <typename T>
+    struct hoCompAscending
+    {
+        bool operator() (T a, T b) { return (a < b); }
+    };
+
+    template <typename T>
+    struct hoCompDescending
+    {
+        bool operator() (T a, T b) { return (a > b); }
+    };
+
+    template <typename T> 
+    void sort(size_t N, const T* x, T* r, bool isascending)
+    {
+        if ( r != x )
+        {
+            memcpy(r, x, sizeof(T)*N);
+        }
+
+        if ( isascending )
+        {
+            hoCompAscending<T> obj;
+            std::sort(r, r+N, obj);
+        }
+        else
+        {
+            hoCompDescending<T> obj;
+            std::sort(r, r+N, obj);
+        }
+    }
+
+    template <typename T> 
+    void sort(const hoNDArray<T>& x, hoNDArray<T>& r, bool isascending)
+    {
+        if ( &r != &x )
+        {
+            if ( r.get_number_of_elements()!=x.get_number_of_elements())
+            {
+                r = x;
+            }
+            else
+            {
+                memcpy(r.begin(), x.begin(), x.get_number_of_bytes());
+            }
+        }
+
+        sort(x.get_number_of_elements(), x.begin(), r.begin(), isascending);
+    }
+
+    template EXPORTCPUCOREMATH void sort(const hoNDArray<float>& x, hoNDArray<float>& r, bool isascending);
+    template EXPORTCPUCOREMATH void sort(const hoNDArray<double>& x, hoNDArray<double>& r, bool isascending);
+
+    // --------------------------------------------------------------------------------
+
+    template <class T>
+    void minValue(const hoNDArray<T>& a, T& v)
+    {
+        typedef T ValueType;
+
+        try
+        {
+            const ValueType* pA = a.begin();
+            size_t n = a.get_number_of_elements();
+            v = pA[0];
+
+            size_t ii;
+            for (ii=1; ii<n; ii++)
+            {
+                if (pA[ii]<v) v = pA[ii];
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in minValue(const hoNDArray<T>& a, T& v) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void minValue(const hoNDArray<float>& a, float& v);
+    template EXPORTCPUCOREMATH void minValue(const hoNDArray<double>& a, double& v);
+
+    template <class T>
+    void maxValue(const hoNDArray<T>& a, T& v)
+    {
+        typedef T ValueType;
+
+        try
+        {
+            const ValueType* pA = a.begin();
+            size_t n = a.get_number_of_elements();
+            v = pA[0];
+
+            size_t ii;
+            for (ii=1; ii<n; ii++)
+            {
+                if (pA[ii]>v) v = pA[ii];
+            }
+        }
+        catch(...)
+        {
+            GADGET_THROW("Errors in maxValue(const hoNDArray<T>& a, T& v) ... ");
+        }
+    }
+
+    template EXPORTCPUCOREMATH void maxValue(const hoNDArray<float>& a, float& v);
+    template EXPORTCPUCOREMATH void maxValue(const hoNDArray<double>& a, double& v);
+
+    // --------------------------------------------------------------------------------
+}
diff --git a/toolboxes/core/cpu/math/hoNDArray_reductions.h b/toolboxes/core/cpu/math/hoNDArray_reductions.h
new file mode 100644
index 0000000..639623d
--- /dev/null
+++ b/toolboxes/core/cpu/math/hoNDArray_reductions.h
@@ -0,0 +1,203 @@
+#pragma once
+
+#include "hoNDArray.h"
+#include "cpucore_math_export.h"
+
+#ifdef max
+    #undef max
+#endif // max
+
+#ifdef min
+    #undef min
+#endif // min
+
+namespace Gadgetron{
+
+    /***
+    * Finds the maximum element of the array
+    */
+    template<class REAL> EXPORTCPUCOREMATH REAL max(hoNDArray<REAL>* data);
+
+    /***
+    * Finds the minimum element of the array
+    */
+    template<class REAL> EXPORTCPUCOREMATH REAL min(hoNDArray<REAL>* data);
+
+    /***
+    * Finds the mean of the array
+    */
+    template<class T> EXPORTCPUCOREMATH T mean(hoNDArray<T>* data);
+
+    /***
+    * Calculates the sum of the array
+    */
+    template<class T> EXPORTCPUCOREMATH T sum(hoNDArray<T>* data);
+
+    /***
+    * Calculates the std of the array
+    */
+    template<class T> EXPORTCPUCOREMATH T stddev(hoNDArray<T>* data);
+
+    /**
+    * @brief Calculates the dot product of two arrays (as vectors).
+    * @param[in] x Array 1. For complex arrays, the complex conjugate of x is used when cc is true.
+    * @param[in] y Array 2.
+    * @param[in] cc Specifies whether to use the complex conjugate of x (when applicable).
+    * @return The dot product of x and y
+    */
+    template<class T> EXPORTCPUCOREMATH T dot( hoNDArray<T> *x, hoNDArray<T> *y, bool cc = true );
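+    /*
+    * A minimal usage sketch (illustrative only; x and y are assumed to be equally sized complex arrays):
+    *   std::complex<float> a = dot(&x, &y);        // sum_n conj(x[n]) * y[n]
+    *   std::complex<float> b = dot(&x, &y, false); // sum_n x[n] * y[n]
+    */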
+
+    /**
+    * @brief Calculates the sum of the l1-norms of the array entries
+    * @param[in] x Input array
+    * @return The l1-norm of the array
+    */
+    template<class T> EXPORTCPUCOREMATH typename realType<T>::Type asum( hoNDArray<T> *x );
+    template<class T> EXPORTCPUCOREMATH void asum(const hoNDArray<T>& x, typename realType<T>::Type& r);
+    template<class T> EXPORTCPUCOREMATH typename realType<T>::Type asum(const hoNDArray<T>& x);
+
+    /**
+    * @brief Calculates the sum of the l1-norms of the array entries
+    * @param[in] x Input array
+    * @return The l1-norm of the array
+    */
+    template<class T> EXPORTCPUCOREMATH T asum( hoNDArray< std::complex<T> > *x );
+
+    /**
+    * @brief Calculates the sum of the l1-norms of the array entries
+    * @param[in] x Input array
+    * @return The l1-norm of the array
+    */
+    template<class T> EXPORTCPUCOREMATH T asum( hoNDArray< complext<T> > *x );
+
+    /**
+    * @brief Calculates the l2-norm of the array (as a vector)
+    * @param[in] x Input array
+    * @return The l2-norm of the array
+    */
+    template<class T> EXPORTCPUCOREMATH typename realType<T>::Type nrm2( hoNDArray<T> *x );
+
+    /**
+    * @brief Calculates the l1-norm of the array (as a vector)
+    * @param[in] x Input array
+    * @return The l1-norm of the array
+    */
+    template<class T> EXPORTCPUCOREMATH typename realType<T>::Type nrm1( hoNDArray<T> *x );
+
+    /**
+    * @brief Returns the index of the array element with the smallest absolute value (l1 norm)
+    * @param[in] x Input data
+    * @return The array index corresponding to the smallest element in the array (0-indexing)
+    */
+    template<class T> EXPORTCPUCOREMATH size_t amin( hoNDArray<T> *x );
+
+    /**
+    * @brief Returns the index of the array element with the smallest absolute value (l1 norm)
+    * @param[in] x Input data
+    * @return The array index corresponding to the smallest element in the array (0-indexing)
+    */
+    template<class T> EXPORTCPUCOREMATH size_t amin( hoNDArray< std::complex<T> > *x );
+
+    /**
+    * @brief Returns the index of the array element with the smallest absolute value (l1 norm)
+    * @param[in] x Input data
+    * @return The array index corresponding to the smallest element in the array (0-indexing)
+    */
+    template<class T> EXPORTCPUCOREMATH size_t amin( hoNDArray< complext<T> > *x );
+
+    /**
+    * @brief Returns the index of the array element with the largest absolute value (l1-norm)
+    * @param[in] x Input data
+    * @return The array index corresponding to the largest element in the array (0-indexing)
+    */
+    template<class T> EXPORTCPUCOREMATH size_t amax( hoNDArray<T> *x );
+
+    /**
+    * @brief Returns the index of the array element with the largest absolute value (l1-norm)
+    * @param[in] x Input data
+    * @return The array index corresponding to the largest element in the array (0-indexing)
+    */
+    template<class T> EXPORTCPUCOREMATH size_t amax( hoNDArray< std::complex<T> > *x );
+
+    /**
+    * @brief Returns the index of the array element with the largest absolute value (l1-norm)
+    * @param[in] x Input data
+    * @return The array index corresponding to the largest element in the array (0-indexing)
+    */
+    template<class T> EXPORTCPUCOREMATH size_t amax( hoNDArray< complext<T> > *x );
+
+    /**
+    * @brief ind = min(abs(x(:)))
+    finds the minimal absolute value of x and its position index ind
+    r = x[ind], not abs(x[ind])
+    */
+    template <typename T> EXPORTCPUCOREMATH 
+    void minAbsolute(const hoNDArray<T>& x, T& r, size_t& ind);
+
+    /**
+    * @brief ind = max(abs(x(:)))
+    finds the maximal absolute value of x and its position index ind
+    r = x[ind], not abs(x[ind])
+    */
+    template <typename T> EXPORTCPUCOREMATH 
+    void maxAbsolute(const hoNDArray<T>& x, T& r, size_t& ind);
+
+    /**
+    * @brief r = norm(x(:), 2)
+    compute L2 norm of x
+    */
+    template <typename T> EXPORTCPUCOREMATH 
+    void norm2(const hoNDArray<T>& x, typename realType<T>::Type& r);
+
+    template <typename T> EXPORTCPUCOREMATH 
+    typename realType<T>::Type norm2(const hoNDArray<T>& x);
+
+    /**
+    * @brief r = norm(x(:), 1)
+    compute L1 norm of x = sum( abs(x(:)) )
+    */
+    template <typename T> EXPORTCPUCOREMATH 
+    void norm1(const hoNDArray<T>& x, typename realType<T>::Type& r);
+
+    template <typename T> EXPORTCPUCOREMATH 
+    typename realType<T>::Type norm1(const hoNDArray<T>& x);
+
+    /**
+    * @brief dot product of conj(x) and y
+    r = conj(x) dot y
+    */
+    template <typename T> EXPORTCPUCOREMATH 
+    void dotc(const hoNDArray<T>& x, const hoNDArray<T>& y, T& r);
+
+    template <typename T> EXPORTCPUCOREMATH 
+    T dotc(const hoNDArray<T>& x, const hoNDArray<T>& y);
+
+    /**
+    * @brief dot product of x and y
+    r = x dot y
+    */
+    template <typename T> EXPORTCPUCOREMATH 
+    void dotu(const hoNDArray<T>& x, const hoNDArray<T>& y, T& r);
+
+    template <typename T> EXPORTCPUCOREMATH 
+    T dotu(const hoNDArray<T>& x, const hoNDArray<T>& y);
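+    /*
+    * A minimal sketch contrasting the two (illustrative only; x and y are assumed to be equally sized complex arrays):
+    *   std::complex<float> c = dotc(x, y); // sum_n conj(x[n]) * y[n]
+    *   std::complex<float> u = dotu(x, y); // sum_n x[n] * y[n]
+    * dotc is instantiated for complex element types only; dotu is also instantiated for float and double.
+    */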
+
+    /**
+    * @brief sort the ND array
+    */
+    template <typename T> EXPORTCPUCOREMATH void sort(const hoNDArray<T>& x, hoNDArray<T>& r, bool isascending);
+
+    /**
+    * @brief finds the index of the element with the maximal absolute value.
+    */
+    template<class T> EXPORTCPUCOREMATH size_t amax(const hoNDArray<T>& x);
+
+    /**
+    * @brief get the min and max value from an array (only for float and double type)
+    */
+    template <class T> EXPORTCPUCOREMATH 
+    void minValue(const hoNDArray<T>& a, T& v);
+
+    template <class T> EXPORTCPUCOREMATH 
+    void maxValue(const hoNDArray<T>& a, T& v);
+}
diff --git a/toolboxes/core/cpu/math/hoNDImage_util.cpp b/toolboxes/core/cpu/math/hoNDImage_util.cpp
new file mode 100644
index 0000000..c7eee25
--- /dev/null
+++ b/toolboxes/core/cpu/math/hoNDImage_util.cpp
@@ -0,0 +1,877 @@
+/** \file   hoNDImage_util.cpp
+    \brief  operations on the hoNDImage class.
+*/
+
+#include "hoNDImage_util.h"
+#include "hoNDBoundaryHandler.h"
+
+namespace Gadgetron
+{
+
+template<class T, unsigned int D> 
+bool gradient(const hoNDImage<T, D>& x, hoNDImage<T, D> gx[])
+{
+    try
+    {
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            if ( !gx[ii].dimensions_equal(x) )
+            {
+                gx[ii] = x;
+            }
+        }
+
+        if ( D == 1 )
+        {
+            long long sx = (long long)x.get_size(0);
+            const T* pX = x.begin();
+            T* pGx = gx[0].begin();
+
+            long long x;
+
+            // #pragma omp parallel for default(none) private(x) shared(sx, pX, pGx)
+            for ( x=1; x<sx-1; x++ )
+            {
+                pGx[x] = pX[x+1] - pX[x-1];
+            }
+
+            pGx[0] = pX[1] - pX[0];
+            pGx[sx-1] = pX[sx-1] - pX[sx-2];
+        }
+        else if ( D == 2 )
+        {
+            long long sx = (long long)x.get_size(0);
+            long long sy = (long long)x.get_size(1);
+
+            const T* pX = x.begin();
+            T* pGx = gx[0].begin();
+            T* pGy = gx[1].begin();
+
+            long long x, y;
+
+            // #pragma omp parallel for default(none) private(x, y) shared(sx, sy, pX, pGx, pGy)
+            for ( y=1; y<sy-1; y++ )
+            {
+                for ( x=1; x<sx-1; x++ )
+                {
+                    size_t offset = x + y*sx;
+
+                    pGx[offset] = pX[offset+1] - pX[offset-1];
+                    pGy[offset] = pX[offset+sx] - pX[offset-sx];
+                }
+            }
+
+            // #pragma omp parallel for default(none) private(x) shared(sx, sy, pX, pGx, pGy)
+            for ( x=1; x<sx-1; x++ )
+            {
+                pGx[x] = pX[x+1] - pX[x-1];
+
+                size_t offset = x + (sy-1)*sx;
+                pGx[offset] = pX[offset+1] - pX[offset-1];
+
+                pGy[x] = pX[x+sx] - pX[x];
+                pGy[x + (sy-1)*sx] = pX[x + (sy-1)*sx] - pX[x + (sy-2)*sx];
+            }
+
+            // #pragma omp parallel for default(none) private(y) shared(sx, sy, pX, pGx, pGy)
+            for ( y=1; y<sy-1; y++ )
+            {
+                size_t offset = y*sx;
+                pGy[offset] = pX[offset+sx] - pX[offset-sx];
+
+                pGx[offset] = pX[offset+1] - pX[offset];
+
+                offset = sx-1 + y*sx;
+                pGy[offset] = pX[offset+sx] - pX[offset-sx];
+
+                pGx[offset] = pX[offset] - pX[offset-1];
+            }
+
+            pGx[0] = pX[1]-pX[0];
+            pGx[sx-1] = pX[sx-1]-pX[sx-2];
+            pGx[(sy-1)*sx] = pX[(sy-1)*sx+1]-pX[(sy-1)*sx];
+            pGx[sx*sy-1] = pX[sx*sy-1]-pX[sx*sy-2];
+
+            pGy[0] = pX[sx]-pX[0];
+            pGy[sx-1] = pX[2*sx-1]-pX[sx-1];
+            pGy[(sy-1)*sx] = pX[(sy-1)*sx] - pX[(sy-2)*sx];
+            pGy[sx*sy-1] = pX[sx*sy-1] - pX[sx*sy-1-sx];
+        }
+        else if ( D == 3 )
+        {
+            long long sx = (long long)x.get_size(0);
+            long long sy = (long long)x.get_size(1);
+            long long sz = (long long)x.get_size(2);
+
+            const T* pX = x.begin();
+            T* pGx = gx[0].begin();
+            T* pGy = gx[1].begin();
+            T* pGz = gx[2].begin();
+
+            long long x, y, z;
+
+            #pragma omp parallel default(none) private(x, y, z) shared(sx, sy, sz, pX, pGx, pGy, pGz)
+            {
+                long long z_positive, z_negative, y_positive, y_negative;
+                size_t offset, offset_z_positive, offset_z_negative, offset_y_positive, offset_y_negative;
+
+                #pragma omp for 
+                for ( z=0; z<sz; z++ )
+                {
+                    z_positive = z+1;
+                    z_positive = (z_positive==sz) ? sz-1 : z_positive;
+
+                    z_negative = z-1;
+                    z_negative = (z_negative==-1) ? 0 : z_negative;
+
+                    for ( y=0; y<sy; y++ )
+                    {
+
+                        y_positive = y+1;
+                        y_positive = (y_positive==sy) ? sy-1 : y_positive;
+
+                        y_negative = y-1;
+                        y_negative = (y_negative==-1) ? 0 : y_negative;
+
+                        offset = y*sx + z*sx*sy;
+
+                        offset_z_positive = y*sx + z_positive*sx*sy;
+                        offset_z_negative = y*sx + z_negative*sx*sy;
+
+                        offset_y_positive = y_positive*sx + z*sx*sy;
+                        offset_y_negative = y_negative*sx + z*sx*sy;
+
+                        for ( x=1; x<sx-1; x++ )
+                        {
+                            pGx[offset+x] = pX[offset+x+1] - pX[offset+x-1];
+                            pGy[offset+x] = pX[offset_y_positive+x] - pX[offset_y_negative+x];
+                            pGz[offset+x] = pX[offset_z_positive+x] - pX[offset_z_negative+x];
+                        }
+
+                        // x = 0
+                        pGx[offset] = pX[offset+1] - pX[offset];
+                        pGy[offset] = pX[offset_y_positive] - pX[offset_y_negative];
+                        pGz[offset] = pX[offset_z_positive] - pX[offset_z_negative];
+
+                        // x = sx-1
+                        pGx[offset+sx-1] = pX[offset+sx-1] - pX[offset+sx-2];
+                        pGy[offset+sx-1] = pX[offset_y_positive+sx-1] - pX[offset_y_negative+sx-1];
+                        pGz[offset+sx-1] = pX[offset_z_positive+sx-1] - pX[offset_z_negative+sx-1];
+                    }
+                }
+            }
+        }
+        else
+        {
+            size_t N = x.get_number_of_elements();
+
+            long long n;
+
+            std::vector<size_t> dim(D);
+            x.get_dimensions(dim);
+
+            #pragma omp parallel default(none) private(n) shared(N, dim, x, gx)
+            {
+                size_t ind[D];
+                size_t ind_positive[D];
+                size_t ind_negative[D];
+                bool inside = true;
+                unsigned int ii;
+
+                #pragma omp for 
+                for ( n=0; n<(long long)N; n++ )
+                {
+                    x.calculate_index(n, ind);
+
+                    inside = true;
+                    for ( ii=0; ii<D; ii++ )
+                    {
+                        if ( ind[ii]==0 || ind[ii]==dim[ii]-1 )
+                        {
+                            inside = false;
+                            break;
+                        }
+                    }
+
+                    if ( inside )
+                    {
+                        for ( ii=0; ii<D; ii++ )
+                        {
+                            memcpy(ind_positive, ind, sizeof(size_t)*D);
+                            memcpy(ind_negative, ind, sizeof(size_t)*D);
+
+                            ind_positive[ii] = ind[ii] + 1;
+                            ind_negative[ii] = ind[ii] - 1;
+
+                            gx[ii](n) = x(ind_positive) - x(ind_negative);
+                        }
+                    }
+                    else
+                    {
+                        for ( ii=0; ii<D; ii++ )
+                        {
+                            memcpy(ind_positive, ind, sizeof(size_t)*D);
+                            memcpy(ind_negative, ind, sizeof(size_t)*D);
+
+                            ind_positive[ii] = ind[ii] + 1;
+                            ind_positive[ii] = (ind_positive[ii]==dim[ii]) ? dim[ii]-1 : ind_positive[ii];
+
+                            ind_negative[ii] = ind[ii] - 1;
+                            ind_negative[ii] = (ind_negative[ii]==-1) ? 0 : ind_negative[ii];
+
+                            gx[ii](n) = x(ind_positive) - x(ind_negative);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in gradient(const hoNDImage<T, D>& x, hoNDImage<T, D> gx[D]) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gaussianKernel(T sigma, double kerWidthInUnitOfSigma, double deltaKer, hoNDArray<T>& ker)
+{
+    try
+    {
+        long long N  =  (long long)(2*std::ceil(kerWidthInUnitOfSigma*sigma/deltaKer) + 1);
+
+        ker.create(N);
+
+        T kerSum = 0;
+
+        T D = (T)( (deltaKer*deltaKer)/(2*sigma*sigma) );
+
+        long long ii;
+        for ( ii=-N/2; ii<=N/2; ii++ )
+        {
+            ker(ii+N/2) = std::exp( -(ii*ii*D) );
+            kerSum += ker(ii+N/2);
+        }
+
+        T GNorm = (T)(1/std::sqrt(2*3.14159265358979*sigma*sigma));
+        GNorm /= kerSum;
+
+        Gadgetron::scal(GNorm, ker);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in gaussianKernel(T sigma, double kerWidthInUnitOfSigma, double deltaKer, hoNDArray<T>& ker) ... ");
+        return false;
+    }
+    return true;
+}
+
+// As is well known in computer vision, the Gaussian filter is implemented here as the Deriche recursive filter,
+// so the computational cost is independent of sigma.
+// [1] Deriche, R., 1992, Recursively implementing the Gaussian and its derivatives: Proceedings of the 2nd International Conference on Image Processing, Singapore, p. 263-267.
+// [2] http://en.wikipedia.org/wiki/Deriche_edge_detector gives details about this filter;
+// this implementation is based on that webpage.
+
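+// In outline, the recursion implemented below (zero boundary condition, unit stride) is:
+//   forward[n] = a1*x[n]   + a2*x[n-1] + b1*forward[n-1] + b2*forward[n-2]
+//   reverse[n] = a3*x[n+1] + a4*x[n+2] + b1*reverse[n+1] + b2*reverse[n+2]
+//   y[n]       = forward[n] + reverse[n]
+// with alpha = 1.4105/sigma and the coefficients a1..a4, b1, b2 derived from alpha as in the code below.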
+template <class T, class T2>
+inline void DericheSmoothing(T* pData, size_t N, T* mem, T2 sigma, size_t offset=0)
+{
+    typedef typename realType<T>::Type real_type;
+
+    if ( sigma < 1e-6 ) sigma = (T2)(1e-6);
+
+    // following the note of http://en.wikipedia.org/wiki/Deriche_edge_detector
+
+    real_type alpha = (real_type)(1.4105/sigma); // this value 1.4105 is from equation 37 of ref [1]
+    real_type e_alpha = (real_type)( std::exp( (double)(-alpha) ) );
+    real_type e_alpha_sqr = e_alpha*e_alpha;
+    real_type k = ( (1-e_alpha)*(1-e_alpha) ) / ( 1 + 2*alpha*e_alpha - e_alpha_sqr );
+
+    real_type a1 = k;
+    real_type a2 = k * e_alpha * (alpha-1);
+    real_type a3 = k * e_alpha * (alpha+1);
+    real_type a4 = -k * e_alpha_sqr;
+
+    real_type b1 = 2 * e_alpha;
+    real_type b2 = -e_alpha_sqr;
+
+    // compute the left-to-right and the right-to-left filtering passes
+    // for speed, just use the zero boundary condition
+    // TODO: try out other boundary conditions
+    T* forward = mem;
+    T* reverse = mem + N;
+
+    if ( offset == 0 )
+    {
+        forward[0] = a1 * pData[0];
+        reverse[N-1] = 0;
+
+        size_t ii;
+
+        if ( N > 1 )
+        {
+            forward[1] = a1 * pData[1] + a2*pData[0] + b1 * forward[0];
+            reverse[N-2] = a3 * pData[N-1] + b1 * reverse[N-1];
+
+            for ( ii=2; ii<N; ii++ )
+            {
+                forward[ii] = (a1*pData[ii] + a2*pData[ii-1]) + (b1*forward[ii-1] + b2*forward[ii-2]);
+                reverse[N-1-ii] = (a3*pData[N-ii] + a4*pData[N-ii+1]) + (b1*reverse[N-ii] + b2*reverse[N-ii+1]);
+            }
+        }
+
+        // Gadgetron::math::add(N, forward, reverse, pData);
+
+        for ( ii=0; ii<N; ii++ )
+        {
+            pData[ii] = forward[ii] + reverse[ii];
+        }
+    }
+    else
+    {
+        forward[0] = a1 * pData[0];
+        reverse[N-1] = 0;
+
+        if ( N > 1 )
+        {
+            forward[1] = a1 * pData[offset] + a2*pData[0] + b1 * forward[0];
+            reverse[N-2] = a3 * pData[(N-1)*offset] + b1 * reverse[N-1];
+
+            size_t ii;
+            for ( ii=2; ii<N; ii++ )
+            {
+                forward[ii] = (a1*pData[ii*offset] + a2*pData[(ii-1)*offset]) + (b1*forward[ii-1] + b2*forward[ii-2]);
+                reverse[N-1-ii] = (a3*pData[(N-ii)*offset] + a4*pData[(N-ii+1)*offset]) + (b1*reverse[N-ii] + b2*reverse[N-ii+1]);
+            }
+
+            for ( ii=0; ii<N; ii++ )
+            {
+                pData[ii*offset] = forward[ii] + reverse[ii];
+            }
+        }
+    }
+}
+
+template<class ArrayType, class T2> 
+bool filterGaussian(ArrayType& img, T2 sigma[], typename ArrayType::value_type* mem)
+{
+    try
+    {
+        typedef typename ArrayType::value_type T;
+
+        size_t D = img.get_number_of_dimensions();
+
+        if ( D == 1 )
+        {
+            if ( sigma[0] > 0 )
+            {
+                size_t sx = img.get_size(0);
+
+                bool allocate = false;
+                if ( mem == NULL )
+                {
+                    mem = new T[2*sx];
+                    allocate = true;
+                }
+
+                Gadgetron::DericheSmoothing(img.begin(), sx, mem, sigma[0]);
+
+                if ( allocate ) delete [] mem;
+            }
+        }
+        else if ( D == 2 )
+        {
+            long long sx = (long long)img.get_size(0);
+            long long sy = (long long)img.get_size(1);
+
+            T* pData = img.begin();
+
+            long long x, y;
+
+            if ( mem != NULL )
+            {
+                if ( sigma[0] > 0 )
+                {
+                    // filter along x
+                    {
+                        for ( y=0; y<sy; y++ )
+                        {
+                            Gadgetron::DericheSmoothing(pData+y*sx, sx, mem, sigma[0]);
+                        }
+                    }
+                }
+
+                if ( sigma[1] > 0 )
+                {
+                    // filter along y
+                    {
+                        for ( x=0; x<sx; x++ )
+                        {
+                            Gadgetron::DericheSmoothing(pData+x, sy, mem, sigma[1], sx);
+                        }
+                    }
+                }
+            }
+            else
+            {
+                if ( sigma[0] > 0 )
+                {
+                    // filter along x
+                    // #pragma omp parallel default(none) private(y) shared(sx, sy, pData, sigma)
+                    {
+                        T* mem = new T[2*sx];
+
+                        // #pragma omp for 
+                        for ( y=0; y<sy; y++ )
+                        {
+                            Gadgetron::DericheSmoothing(pData+y*sx, sx, mem, sigma[0]);
+                        }
+
+                        delete [] mem;
+                    }
+                }
+
+                if ( sigma[1] > 0 )
+                {
+                    // filter along y
+                    //#pragma omp parallel default(none) private(x) shared(sx, sy, pData, sigma)
+                    {
+                        T* mem = new T[2*sy];
+
+                        // #pragma omp for 
+                        for ( x=0; x<sx; x++ )
+                        {
+                            Gadgetron::DericheSmoothing(pData+x, sy, mem, sigma[1], sx);
+                        }
+
+                        delete [] mem;
+                    }
+                }
+            }
+        }
+        else if ( D == 3 )
+        {
+            long long sx = (long long)img.get_size(0);
+            long long sy = (long long)img.get_size(1);
+            long long sz = (long long)img.get_size(2);
+
+            T* pData = img.begin();
+
+            long long x, y, z;
+
+            if ( sigma[0] > 0 )
+            {
+                // filter along x
+                #pragma omp parallel default(none) private(y, z) shared(sx, sy, sz, pData, sigma)
+                {
+                    T* mem = new T[2*sx];
+
+                    #pragma omp for 
+                    for ( z=0; z<sz; z++ )
+                    {
+                        for ( y=0; y<sy; y++ )
+                        {
+                            Gadgetron::DericheSmoothing(pData+y*sx+z*sx*sy, sx, mem, sigma[0]);
+                        }
+                    }
+
+                    delete [] mem;
+                }
+            }
+
+            if ( sigma[1] > 0 )
+            {
+                // filter along y
+                #pragma omp parallel default(none) private(x, y, z) shared(sx, sy, sz, pData, sigma)
+                {
+                    T* buf = new T[3*sy];
+                    T* mem = buf + sy;
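+                    // buf holds one gathered y-line (sy samples); the remaining 2*sy entries serve as the
+                    // forward/reverse scratch space consumed by DericheSmoothing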
+
+                    #pragma omp for 
+                    for ( z=0; z<sz; z++ )
+                    {
+                        for ( x=0; x<sx; x++ )
+                        {
+                            size_t offset = x + z*sx*sy;
+
+                            for ( y=0; y<sy; y++ )
+                            {
+                                buf[y] = pData[offset + y*sx];
+                            }
+
+                            Gadgetron::DericheSmoothing(buf, sy, mem, sigma[1]);
+
+                            for ( y=0; y<sy; y++ )
+                            {
+                                pData[offset + y*sx] = buf[y];
+                            }
+                        }
+                    }
+
+                    delete [] buf;
+                }
+            }
+
+            if ( sigma[2] > 0 )
+            {
+                // filter along z
+                #pragma omp parallel default(none) private(x, y, z) shared(sx, sy, sz, pData, sigma)
+                {
+                    T* buf = new T[3*sz];
+                    T* mem = buf + sz;
+
+                    #pragma omp for 
+                    for ( y=0; y<sy; y++ )
+                    {
+                        for ( x=0; x<sx; x++ )
+                        {
+                            size_t offset = x + y*sx;
+
+                            for ( z=0; z<sz; z++ )
+                            {
+                                buf[z] = pData[offset + z*sx*sy];
+                            }
+
+                            Gadgetron::DericheSmoothing(buf, sz, mem, sigma[2]);
+
+                            for ( z=0; z<sz; z++ )
+                            {
+                                pData[offset + z*sx*sy] = buf[z];
+                            }
+                        }
+                    }
+
+                    delete [] buf;
+                }
+            }
+        }
+        else if ( D == 4 )
+        {
+            long long sx = (long long)img.get_size(0);
+            long long sy = (long long)img.get_size(1);
+            long long sz = (long long)img.get_size(2);
+            long long st = (long long)img.get_size(3);
+
+            T* pData = img.begin();
+
+            long long x, y, z, t;
+
+            if ( sigma[0] > 0 )
+            {
+                // filter along x
+                #pragma omp parallel default(none) private(y, z, t) shared(sx, sy, sz, st, pData, sigma)
+                {
+                    T* mem = new T[2*sx];
+
+                    #pragma omp for 
+                    for ( t=0; t<st; t++ )
+                    {
+                        for ( z=0; z<sz; z++ )
+                        {
+                            for ( y=0; y<sy; y++ )
+                            {
+                                Gadgetron::DericheSmoothing(pData+y*sx+z*sx*sy+t*sx*sy*sz, sx, mem, sigma[0]);
+                            }
+                        }
+                    }
+
+                    delete [] mem;
+                }
+            }
+
+            if ( sigma[1] > 0 )
+            {
+                // filter along y
+                #pragma omp parallel default(none) private(x, y, z, t) shared(sx, sy, sz, st, pData, sigma)
+                {
+                    T* buf = new T[3*sy];
+                    T* mem = buf + sy;
+
+                    #pragma omp for 
+                    for ( t=0; t<st; t++ )
+                    {
+                        for ( z=0; z<sz; z++ )
+                        {
+                            for ( x=0; x<sx; x++ )
+                            {
+                                size_t offset = x + z*sx*sy + t*sx*sy*sz;
+
+                                for ( y=0; y<sy; y++ )
+                                {
+                                    buf[y] = pData[offset + y*sx];
+                                }
+
+                                Gadgetron::DericheSmoothing(buf, sy, mem, sigma[1]);
+
+                                for ( y=0; y<sy; y++ )
+                                {
+                                    pData[offset + y*sx] = buf[y];
+                                }
+                            }
+                        }
+                    }
+
+                    delete [] buf;
+                }
+            }
+
+            if ( sigma[2] > 0 )
+            {
+                // filter along z
+                #pragma omp parallel default(none) private(x, y, z, t) shared(sx, sy, sz, st, pData, sigma)
+                {
+                    T* buf = new T[3*sz];
+                    T* mem = buf + sz;
+
+                    #pragma omp for 
+                    for ( t=0; t<st; t++ )
+                    {
+                        for ( y=0; y<sy; y++ )
+                        {
+                            for ( x=0; x<sx; x++ )
+                            {
+                                size_t offset = x + y*sx + t*sx*sy*sz;
+
+                                for ( z=0; z<sz; z++ )
+                                {
+                                    buf[z] = pData[offset + z*sx*sy];
+                                }
+
+                                Gadgetron::DericheSmoothing(buf, sz, mem, sigma[2]);
+
+                                for ( z=0; z<sz; z++ )
+                                {
+                                    pData[offset + z*sx*sy] = buf[z];
+                                }
+                            }
+                        }
+                    }
+
+                    delete [] buf;
+                }
+            }
+
+            if ( sigma[3] > 0 )
+            {
+                // filter along t
+                #pragma omp parallel default(none) private(x, y, z, t) shared(sx, sy, sz, st, pData, sigma)
+                {
+                    T* buf = new T[3*st];
+                    T* mem = buf + st;
+
+                    #pragma omp for 
+                    for ( z=0; z<sz; z++ )
+                    {
+                        for ( y=0; y<sy; y++ )
+                        {
+                            for ( x=0; x<sx; x++ )
+                            {
+                                size_t offset = x + y*sx + z*sx*sy;
+
+                                for ( t=0; t<st; t++ )
+                                {
+                                    buf[t] = pData[offset + t*sx*sy*sz];
+                                }
+
+                                Gadgetron::DericheSmoothing(buf, st, mem, sigma[3]);
+
+                                for ( t=0; t<st; t++ )
+                                {
+                                    pData[offset + t*sx*sy*sz] = buf[t];
+                                }
+                            }
+                        }
+                    }
+
+                    delete [] buf;
+                }
+            }
+        }
+        else
+        {
+            std::vector<long long> dim(D);
+
+            unsigned int ii;
+            for ( ii=0; ii<D; ii++ )
+            {
+                dim[ii] = (long long)img.get_size(ii);
+            }
+
+            T* pData = img.begin();
+
+            long long N = (long long)img.get_number_of_elements();
+
+            std::vector<size_t> offsetFactor(D);
+            img.get_offset_factor(offsetFactor);
+
+            // filter along every dimension
+            for ( ii=0; ii<D; ii++ )
+            {
+                if ( sigma[ii] > 0 )
+                {
+                    long long num = N/dim[ii];
+
+                    long long n;
+
+                    if ( ii == 0 )
+                    {
+                        #pragma omp parallel default(none) private(n) shared(num, dim, pData, sigma)
+                        {
+                            T* mem = new T[ 2*dim[0] ];
+
+                            #pragma omp for 
+                            for ( n=0; n<num; n++ )
+                            {
+                                Gadgetron::DericheSmoothing(pData+n*dim[0], dim[0], mem, sigma[0]);
+                            }
+
+                            delete [] mem;
+                        }
+                    }
+                    else
+                    {
+                        std::vector<size_t> dimCurr(D-1);
+
+                        unsigned int jj;
+                        for ( jj=0; jj<D; jj++ )
+                        {
+                            if ( jj < ii )
+                            {
+                                dimCurr[jj] = dim[jj];
+                            }
+
+                            if ( jj > ii )
+                            {
+                                dimCurr[jj-1] = dim[jj];
+                            }
+                        }
+
+                        std::vector<size_t> offsetFactorCurr(D-1);
+                        NDArray<T>::calculate_offset_factors(dimCurr, offsetFactorCurr);
+
+                        #pragma omp parallel default(none) private(n) shared(D, num, dim, img, pData, sigma, ii, offsetFactor, offsetFactorCurr)
+                        {
+                            T* buf = new T[ 3*dim[ii] ];
+                            T* mem = buf + dim[ii];
+
+                            std::vector<size_t> ind(D);
+                            std::vector<size_t> indCurr(D-1);
+
+                            std::vector<size_t> offset(dim[ii]);
+
+                            #pragma omp for 
+                            for ( n=0; n<num; n++ )
+                            {
+                                NDArray<T>::calculate_index(n, offsetFactorCurr, indCurr);
+
+                                unsigned int jj;
+                                for ( jj=0; jj<D; jj++ )
+                                {
+                                    if ( jj < ii )
+                                    {
+                                        ind[jj] = indCurr[jj];
+                                    }
+
+                                    if ( jj > ii )
+                                    {
+                                        ind[jj] = indCurr[jj-1];
+                                    }
+                                }
+
+                                ind[ii] = 0;
+                                offset[0] = img.calculate_offset(ind);
+                                buf[0] = pData[ offset[0] ];
+
+                                long long d;
+                                for ( d=1; d<dim[ii]; d++ )
+                                {
+                                    offset[d] = offset[d-1] + offsetFactor[ii];
+                                    buf[d] = pData[ offset[d] ];
+                                }
+
+                                Gadgetron::DericheSmoothing(buf, dim[ii], mem, sigma[ii]);
+
+                                for ( d=0; d<dim[ii]; d++ )
+                                {
+                                    pData[ offset[d] ] = buf[d];
+                                }
+                            }
+
+                            delete [] buf;
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in filterGaussian(ArrayType& img, T2 sigma[], typename ArrayType::value_type* mem) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template EXPORTCPUCOREMATH bool filterGaussian(hoNDArray<float>& img, float sigma[], float* mem);
+template EXPORTCPUCOREMATH bool filterGaussian(hoNDArray<float>& img, double sigma[], float* mem);
+template EXPORTCPUCOREMATH bool filterGaussian(hoNDArray<double>& img, double sigma[], double* mem);
+template EXPORTCPUCOREMATH bool filterGaussian(hoNDArray<double>& img, float sigma[], double* mem);
+
+template EXPORTCPUCOREMATH bool filterGaussian(hoNDArray< std::complex<float> >& img, float sigma[],  std::complex<float> * mem);
+template EXPORTCPUCOREMATH bool filterGaussian(hoNDArray< std::complex<double> >& img, double sigma[],  std::complex<double> * mem);
+template EXPORTCPUCOREMATH bool filterGaussian(hoNDArray< std::complex<float> >& img, double sigma[],  std::complex<float> * mem);
+template EXPORTCPUCOREMATH bool filterGaussian(hoNDArray< std::complex<double> >& img, float sigma[],  std::complex<double> * mem);
+
+template EXPORTCPUCOREMATH bool filterGaussian(ho2DArray<float>& img, float sigma[], float* mem);
+template EXPORTCPUCOREMATH bool filterGaussian(ho2DArray<float>& img, double sigma[], float* mem);
+template EXPORTCPUCOREMATH bool filterGaussian(ho2DArray<double>& img, float sigma[], double* mem);
+template EXPORTCPUCOREMATH bool filterGaussian(ho2DArray<double>& img, double sigma[], double* mem);
+
+template EXPORTCPUCOREMATH bool filterGaussian(hoMatrix<float>& img, float sigma[], float* mem);
+template EXPORTCPUCOREMATH bool filterGaussian(hoMatrix<float>& img, double sigma[], float* mem);
+template EXPORTCPUCOREMATH bool filterGaussian(hoMatrix<double>& img, double sigma[], double* mem);
+template EXPORTCPUCOREMATH bool filterGaussian(hoMatrix<double>& img, float sigma[], double* mem);
+
+template EXPORTCPUCOREMATH bool gaussianKernel(float sigma, double kerWidthInUnitOfSigma, double deltaKer, hoNDArray<float>& ker);
+template EXPORTCPUCOREMATH bool gaussianKernel(double sigma, double kerWidthInUnitOfSigma, double deltaKer, hoNDArray<double>& ker);
+
+#define DimImage 1
+#include "hoNDImage_util_instantiate.hxx"
+#undef DimImage
+
+#define DimImage 2
+#include "hoNDImage_util_instantiate.hxx"
+#undef DimImage
+
+#define DimImage 3
+#include "hoNDImage_util_instantiate.hxx"
+#undef DimImage
+
+#define DimImage 4
+#include "hoNDImage_util_instantiate.hxx"
+#undef DimImage
+
+#define DimImage 5
+#include "hoNDImage_util_instantiate.hxx"
+#undef DimImage
+
+#define DimImage 6
+#include "hoNDImage_util_instantiate.hxx"
+#undef DimImage
+
+#define DimImage 7
+#include "hoNDImage_util_instantiate.hxx"
+#undef DimImage
+
+#define DimImage 8
+#include "hoNDImage_util_instantiate.hxx"
+#undef DimImage
+
+#define DimImage 9
+#include "hoNDImage_util_instantiate.hxx"
+#undef DimImage
+
+}
diff --git a/toolboxes/core/cpu/math/hoNDImage_util.h b/toolboxes/core/cpu/math/hoNDImage_util.h
new file mode 100644
index 0000000..8d6927d
--- /dev/null
+++ b/toolboxes/core/cpu/math/hoNDImage_util.h
@@ -0,0 +1,75 @@
+/** \file hoNDImage_util.h
+\brief math operations on the hoNDImage class.
+*/
+
+#pragma once
+
+#include "ho2DArray.h"
+#include "ho3DArray.h"
+#include "ho4DArray.h"
+#include "ho5DArray.h"
+#include "ho6DArray.h"
+#include "ho7DArray.h"
+#include "hoNDImage.h"
+#include "cpucore_math_export.h"
+
+#include <complex>
+
+#include "hoNDArray_reductions.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDInterpolator.h"
+
+namespace Gadgetron
+{
+    /// compute the gradient for an ND image
+    /// the central difference is computed, the border-value boundary condition is used
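+    /// in the interior, gx[d](i) = x(i + e_d) - x(i - e_d) (central difference, no division by 2);
+    /// at the borders the out-of-range neighbour is replaced by the border sample itself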
+    template<class T, unsigned int D> EXPORTCPUCOREMATH bool gradient(const hoNDImage<T, D>& x, hoNDImage<T, D> gx[]);
+
+    /// compute a gaussian kernel
+    template<class T> EXPORTCPUCOREMATH bool gaussianKernel(T sigma, double kerWidthInUnitOfSigma, double deltaKer, hoNDArray<T>& ker);
+
+    /// perform the Gaussian filter along every dimension
+    /// sigma is in units of pixels
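+    /// a minimal usage sketch (illustrative only; the array contents and sigma values are hypothetical):
+    ///   hoNDArray<float> img2d;                      // a 2D array, created elsewhere
+    ///   float sigma[] = { 2.0f, 2.0f };              // one sigma per dimension, in pixels
+    ///   Gadgetron::filterGaussian(img2d, sigma);     // in-place smoothing; scratch memory is allocated internally when mem is NULL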
+    template<class ArrayType, class T2> EXPORTCPUCOREMATH bool filterGaussian(ArrayType& x, T2 sigma[], typename ArrayType::value_type* mem=NULL);
+
+    /// perform the median filter
+    /// w is the window size
+    template<class ArrayType> bool filterMedian(const ArrayType& img, size_t w[], ArrayType& img_out);
+
+    /// downsample the image by a ratio
+    /// new image size = image size / ratio
+    /// e.g., if ratio = 2, downsample by 2
+    template<typename T, typename InterpolatorType, unsigned int D> 
+    bool downsampleImage(const hoNDImage<T, D>& in, InterpolatorType& interp, hoNDImage<T, D>& out, float ratio[]);
+
+    /// upsample the image by a ratio
+    /// new image size = image size * ratio
+    /// e.g., if ratio = 2, upsample by 2
+    template<typename T, typename InterpolatorType, unsigned int D> 
+    bool upsampleImage(const hoNDImage<T, D>& in, InterpolatorType& interp, hoNDImage<T, D>& out, float ratio[]);
+
+    /// resample the image to a specific image size
+    /// input and output images occupy the same spatial region
+    /// the pixel size of the output image is adjusted accordingly
+    template<typename T, typename InterpolatorType, unsigned int D> 
+    bool resampleImage(const hoNDImage<T, D>& in, InterpolatorType& interp, const std::vector<size_t>& dim_out, hoNDImage<T, D>& out);
+
+    /// reduce image size by 2; each output sample averages the corresponding input sample and its two neighbors along every dimension
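+    /// e.g. in 2D: out(x,y) = ( in(2x,2y) + in(2x+1,2y) + in(2x-1,2y) + in(2x,2y+1) + in(2x,2y-1) ) / 5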
+    template<typename T, typename BoundaryHandlerType, unsigned int D> 
+    bool downsampleImageBy2WithAveraging(const hoNDImage<T, D>& in, BoundaryHandlerType& bh, hoNDImage<T, D>& out);
+
+    /// expand image size by 2 with linear interpolation
+    template<typename T, typename BoundaryHandlerType, unsigned int D> 
+    bool expandImageBy2(const hoNDImage<T, D>& in, BoundaryHandlerType& bh, hoNDImage<T, D>& out);
+
+    /// filter the image along the first dimension using a 1D kernel
+    template<class ArrayType> bool filter1D(const ArrayType& img, const hoNDArray<typename realType<typename ArrayType::value_type>::Type>& ker, GT_BOUNDARY_CONDITION bh, ArrayType& img_out);
+
+    /**
+    * @brief r = correlation_coefficient(a, b)
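+    * i.e., r = sum( (a - mean(a)) .* (b - mean(b)) ) / sqrt( sum( (a - mean(a)).^2 ) * sum( (b - mean(b)).^2 ) )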
+    */
+    template <typename T, unsigned int D> 
+    bool corrCoef(const hoNDImage<T, D>& a, const hoNDImage<T, D>& b, T& r);
+}
+
+#include "hoNDImage_util.hxx"
diff --git a/toolboxes/core/cpu/math/hoNDImage_util.hxx b/toolboxes/core/cpu/math/hoNDImage_util.hxx
new file mode 100644
index 0000000..a6f4b80
--- /dev/null
+++ b/toolboxes/core/cpu/math/hoNDImage_util.hxx
@@ -0,0 +1,1022 @@
+/** \file   hoNDImage_util.hxx
+    \brief  operations on the hoNDImage class.
+*/
+
+namespace Gadgetron
+{
+    template <typename T, unsigned int D> 
+    bool corrCoef(const hoNDImage<T, D>& a, const hoNDImage<T, D>& b, T& r)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(a.dimensions_equal(&b));
+
+            r = -1;
+
+            T ma, mb;
+            ma = Gadgetron::mean( const_cast< hoNDImage<T, D>* >(&a) );
+            mb = Gadgetron::mean( const_cast< hoNDImage<T, D>* >(&b) );
+
+            size_t N = a.get_number_of_elements();
+
+            const T* pA = a.begin();
+            const T* pB = b.begin();
+
+            size_t n;
+
+            double x(0), y(0), z(0);
+            for ( n=0; n<N; n++ )
+            {
+                x += (pA[n]-ma)*(pA[n]-ma);
+                y += (pB[n]-mb)*(pB[n]-mb);
+                z += (pA[n]-ma)*(pB[n]-mb);
+            }
+
+            double p = std::sqrt(x*y);
+            if ( p > 0 )
+            {
+                r = (T)(z/p);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in corrCoef(const hoNDImage<T, D>& a, const hoNDImage<T, D>& b, T& r) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename T, typename InterpolatorType, unsigned int D> 
+    bool downsampleImage(const hoNDImage<T, D>& in, InterpolatorType& interp, hoNDImage<T, D>& out, float ratio[])
+    {
+        try
+        {
+            std::vector<size_t> dim(D);
+            in.get_dimensions(dim);
+
+            std::vector<size_t> dim_out(D);
+
+            unsigned int ii;
+            for ( ii=0; ii<D; ii++ )
+            {
+                dim_out[ii] = (size_t)(dim[ii]/ratio[ii]);
+            }
+
+            return Gadgetron::resampleImage(in, interp, dim_out, out);
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in downsampleImage(const hoNDImage<T, D>& in, InterpolatorType& interp, hoNDImage<T, D>& out, float ratio[]) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename T, typename InterpolatorType, unsigned int D> 
+    bool upsampleImage(const hoNDImage<T, D>& in, InterpolatorType& interp, hoNDImage<T, D>& out, float ratio[])
+    {
+        try
+        {
+            std::vector<size_t> dim(D);
+            in.get_dimensions(dim);
+
+            std::vector<size_t> dim_out(D);
+
+            unsigned int ii;
+            for ( ii=0; ii<D; ii++ )
+            {
+                dim_out[ii] = (size_t)(dim[ii]*ratio[ii]);
+            }
+
+            return Gadgetron::resampleImage(in, interp, dim_out, out);
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in upsampleImage(const hoNDImage<T, D>& in, InterpolatorType& interp, hoNDImage<T, D>& out, float ratio[]) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename T, typename InterpolatorType, unsigned int D> 
+    bool resampleImage(const hoNDImage<T, D>& in, InterpolatorType& interp, const std::vector<size_t>& dim_out, hoNDImage<T, D>& out)
+    {
+        try
+        {
+            typedef typename hoNDImage<T, D>::coord_type coord_type;
+
+            /// get the coordinate parameters
+            std::vector<size_t> dim;
+            in.get_dimensions(dim);
+
+            std::vector<coord_type> pixelSize;
+            in.get_pixel_size(pixelSize);
+
+            std::vector<coord_type> origin;
+            in.get_origin(origin);
+
+            typename hoNDImage<T, D>::axis_type axis;
+            in.get_axis(axis);
+
+            /// compute new pixel sizes
+            std::vector<coord_type> pixelSize_out(D);
+
+            unsigned int ii;
+            for ( ii=0; ii<D; ii++ )
+            {
+                if ( dim_out[ii] > 1 )
+                {
+                    pixelSize_out[ii] = (dim[ii]-1)*pixelSize[ii] / (dim_out[ii]-1);
+                }
+                else
+                {
+                    pixelSize_out[ii] = (dim[ii]-1)*pixelSize[ii];
+                }
+            }
+
+            /// set up the out image
+            out.create(dim_out, pixelSize_out, origin, axis);
+
+            /// set up the interpolator
+            interp.setArray( const_cast< hoNDImage<T, D>& >(in) );
+
+            /// compute the out image
+
+            size_t N = out.get_number_of_elements();
+
+            if ( D == 2 )
+            {
+                long long ox = (long long)dim_out[0];
+                long long oy = (long long)dim_out[1];
+
+                long long x, y;
+
+                #pragma omp parallel default(none) private(x, y) shared(N, ox, oy, in, out, interp)
+                {
+                    coord_type px, py, ix_in, iy_in;
+
+                    #pragma omp for 
+                    for ( y=0; y<oy; y++ )
+                    {
+                        for ( x=0; x<ox; x++ )
+                        {
+                            out.image_to_world( (size_t)x, (size_t)y, px, py);
+
+                            in.world_to_image(px, py, ix_in, iy_in);
+
+                            out( (size_t)(x+y*ox) ) = interp(ix_in, iy_in);
+                        }
+                    }
+                }
+            }
+            else if ( D == 3 )
+            {
+                long long ox = (long long)dim_out[0];
+                long long oy = (long long)dim_out[1];
+                long long oz = (long long)dim_out[2];
+
+                long long x, y, z;
+
+                #pragma omp parallel default(none) private(x, y, z) shared(N, ox, oy, oz, in, out, interp)
+                {
+                    coord_type ix_in, iy_in, iz_in;
+                    coord_type px, py, pz;
+
+                    #pragma omp for 
+                    for ( z=0; z<oz; z++ )
+                    {
+                        for ( y=0; y<oy; y++ )
+                        {
+                            size_t offset = y*ox + z*ox*oy;
+
+                            for ( x=0; x<ox; x++ )
+                            {
+                                out.image_to_world( (size_t)x, (size_t)y, (size_t)z, px, py, pz);
+
+                                in.world_to_image(px, py, pz, ix_in, iy_in, iz_in);
+
+                                out( (size_t)(x+offset) ) = interp(ix_in, iy_in, iz_in);
+                            }
+                        }
+                    }
+                }
+            }
+            else if ( D == 4 )
+            {
+                long long ox = (long long)dim_out[0];
+                long long oy = (long long)dim_out[1];
+                long long oz = (long long)dim_out[2];
+                long long ot = (long long)dim_out[3];
+
+                long long x, y, z, t;
+
+                #pragma omp parallel default(none) private(x, y, z, t) shared(N, ox, oy, oz, ot, in, out, interp)
+                {
+                    coord_type ix_in, iy_in, iz_in, it_in;
+                    coord_type px, py, pz, pt;
+
+                    #pragma omp for 
+                    for ( t=0; t<ot; t++ )
+                    {
+                        for ( z=0; z<oz; z++ )
+                        {
+                            for ( y=0; y<oy; y++ )
+                            {
+                                size_t offset = y*ox + z*ox*oy + t*ox*oy*oz;
+
+                                for ( x=0; x<ox; x++ )
+                                {
+                                    out.image_to_world( (size_t)x, (size_t)y, (size_t)z, (size_t)t, px, py, pz, pt);
+
+                                    in.world_to_image(px, py, pz, pt, ix_in, iy_in, iz_in, it_in);
+
+                                    out( (size_t)(x+offset) ) = interp(ix_in, iy_in, iz_in, it_in);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            else
+            {
+                long long n;
+
+                #pragma omp parallel default(none) private(n) shared(N, in, out, interp)
+                {
+                    std::vector<size_t> ind_o(D);
+                    std::vector<coord_type> ind_i(D);
+
+                    std::vector<coord_type> pos(D);
+
+                    #pragma omp for 
+                    for ( n=0; n<N; n++ )
+                    {
+                        out.calculate_index(n, ind_o);
+                        out.image_to_world(ind_o, pos);
+
+                        in.world_to_image(pos, ind_i);
+
+                        out(n) = interp(ind_i);
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in resampleImage(const hoNDImage<T, D>& in, InterpolatorType& interp, hoNDImage<T, D>& out, size_t size_out[D]) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename T, typename BoundaryHandlerType, unsigned int D> 
+    bool downsampleImageBy2WithAveraging(const hoNDImage<T, D>& in, BoundaryHandlerType& bh, hoNDImage<T, D>& out)
+    {
+        try
+        {
+            typedef typename hoNDImage<T, D>::coord_type coord_type;
+
+            bh.setArray( const_cast< hoNDImage<T, D>& >(in) );
+
+            /// get the coordinate parameters
+            std::vector<size_t> dim;
+            in.get_dimensions(dim);
+
+            std::vector<coord_type> pixelSize;
+            in.get_pixel_size(pixelSize);
+
+            std::vector<coord_type> origin;
+            in.get_origin(origin);
+
+            typename hoNDImage<T, D>::axis_type axis;
+            in.get_axis(axis);
+
+            /// compute out image size and pixel size
+            std::vector<size_t> dim_out(D);
+            std::vector<coord_type> pixelSize_out(D);
+
+            unsigned int ii;
+            for ( ii=0; ii<D; ii++ )
+            {
+                dim_out[ii] = (dim[ii] >> 1);
+                pixelSize_out[ii] = 2*pixelSize[ii];
+            }
+
+            out.create(dim_out, pixelSize_out, origin, axis);
+
+            if ( D == 2 )
+            {
+                size_t sx = dim_out[0];
+                size_t sy = dim_out[1];
+
+                T weight = 1.0/5;
+
+                long long x, y;
+
+                #pragma omp parallel for default(none) private(x, y) shared(sx, sy, bh, out)
+                for ( y=0; y<(long long)sy; y++ )
+                {
+                    long long iy = y<<1;
+
+                    for ( x=0; x<(long long)sx; x++ )
+                    {
+                        long long ix = x<<1;
+                        out( (size_t)(x+y*sx) ) = bh(ix, iy) + ( bh(ix+1, iy) + bh(ix-1, iy) ) + ( bh(ix, iy+1) + bh(ix, iy-1) );
+                    }
+                }
+
+                Gadgetron::scal(weight, out);
+            }
+            else if ( D == 3 )
+            {
+                size_t sx = dim_out[0];
+                size_t sy = dim_out[1];
+                size_t sz = dim_out[2];
+
+                T weight = 1.0/7;
+
+                long long x, y, z;
+
+                #pragma omp parallel for default(none) private(x, y, z) shared(sx, sy, sz, bh, out)
+                for ( z=0; z<sz; z++ )
+                {
+                    long long iz = z<<1;
+
+                    for ( y=0; y<sy; y++ )
+                    {
+                        long long iy = y<<1;
+
+                        size_t offset = y*sx + z*sx*sy;
+
+                        for ( x=0; x<sx; x++ )
+                        {
+                            long long ix = x<<1;
+
+                            out( (size_t)(x+offset) ) = bh(ix, iy, iz) 
+                                        + ( bh(ix+1, iy, iz) + bh(ix-1, iy, iz) ) 
+                                        + ( bh(ix, iy+1, iz) + bh(ix, iy-1, iz) )
+                                        + ( bh(ix, iy, iz+1) + bh(ix, iy, iz-1) );
+                        }
+                    }
+                }
+
+                Gadgetron::scal(weight, out);
+            }
+            else if ( D == 4 )
+            {
+                size_t sx = dim_out[0];
+                size_t sy = dim_out[1];
+                size_t sz = dim_out[2];
+                size_t st = dim_out[3];
+
+                T weight = 1.0/9;
+
+                long long x, y, z, t;
+
+                #pragma omp parallel for default(none) private(x, y, z, t) shared(sx, sy, sz, st, bh, out)
+                for ( t=0; t<st; t++ )
+                {
+                    long long it = t<<1;
+
+                    for ( z=0; z<sz; z++ )
+                    {
+                        long long iz = z<<1;
+
+                        for ( y=0; y<sy; y++ )
+                        {
+                            long long iy = y<<1;
+
+                            size_t offset = y*sx + z*sx*sy + t*sx*sy*sz;
+
+                            for ( x=0; x<sx; x++ )
+                            {
+                                long long ix = x<<1;
+
+                                out( (size_t)(x+offset) ) = bh(ix, iy, iz, it) 
+                                            + ( bh(ix+1, iy, iz, it) + bh(ix-1, iy, iz, it) ) 
+                                            + ( bh(ix, iy+1, iz, it) + bh(ix, iy-1, iz, it) )
+                                            + ( bh(ix, iy, iz+1, it) + bh(ix, iy, iz-1, it) )
+                                            + ( bh(ix, iy, iz, it+1) + bh(ix, iy, iz, it-1) );
+                            }
+                        }
+                    }
+                }
+
+                Gadgetron::scal(weight, out);
+            }
+            else
+            {
+                T weight = 1.0/(2*D+1);
+
+                size_t N = out.get_number_of_elements();
+
+                long long n;
+
+                #pragma omp parallel default(none) private(n) shared(N, bh, out, dim_out)
+                {
+                    std::vector<size_t> ind_out(D);
+                    std::vector<long long> ind_in(D);
+
+                    #pragma omp for 
+                    for ( n=0; n<N; n++ )
+                    {
+                        out.calculate_index(n, ind_out);
+
+                        unsigned int ii;
+                        for ( ii=0; ii<D; ii++ )
+                        {
+                            ind_in[ii] = ind_out[ii]<<1;
+                        }
+
+                        T v = bh(ind_in);
+
+                        for ( ii=0; ii<D; ii++ )
+                        {
+                            ind_in[ii]++;
+                            v += bh(ind_in);
+
+                            ind_in[ii]--;
+                            ind_in[ii]--;
+                            v += bh(ind_in);
+
+                            ind_in[ii]++;
+                        }
+
+                        out(n) = v;
+                    }
+                }
+
+                Gadgetron::scal(weight, out);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in downsampleImageBy2WithAveraging(const hoNDImage<T, D>& in, BoundaryHandlerType& bh, hoNDImage<T, D>& out) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename T, typename BoundaryHandlerType, unsigned int D> 
+    bool expandImageBy2(const hoNDImage<T, D>& in, BoundaryHandlerType& bh, hoNDImage<T, D>& out)
+    {
+        try
+        {
+            typedef typename hoNDImage<T, D>::coord_type coord_type;
+
+            bh.setArray( const_cast< hoNDImage<T, D>& >(in) );
+
+            /// get the coordinate parameters
+            std::vector<size_t> dim;
+            in.get_dimensions(dim);
+
+            std::vector<coord_type> pixelSize;
+            in.get_pixel_size(pixelSize);
+
+            std::vector<coord_type> origin;
+            in.get_origin(origin);
+
+            typename hoNDImage<T, D>::axis_type axis;
+            in.get_axis(axis);
+
+            /// compute out pixel size
+            std::vector<coord_type> pixelSize_out(D);
+
+            unsigned int ii;
+            for ( ii=0; ii<D; ii++ )
+            {
+                pixelSize_out[ii] = pixelSize[ii]* (coord_type)0.5;
+            }
+
+            out.set_pixel_size(pixelSize_out);
+            out.set_origin(origin);
+            out.set_axis(axis);
+
+            if ( D == 2 )
+            {
+                size_t sx = dim[0];
+                size_t sy = dim[1];
+
+                long long x, y;
+
+                #pragma omp parallel for default(none) private(x, y) shared(sx, sy, bh, out)
+                for ( y=0; y<sy; y++ )
+                {
+                    size_t oy = y<<1;
+
+                    for ( x=0; x<sx; x++ )
+                    {
+                        size_t ox = x<<1;
+
+                        T p00 = bh(x, y);
+                        T p10 = bh(x+1, y);
+                        T p01 = bh(x, y+1);
+                        T p11 = bh(x+1, y+1);
+
+                        out( ox, oy ) = p00;
+                        out( ox+1, oy ) = 0.5*(p00 + p10);
+                        out( ox, oy+1 ) = 0.5*(p00 + p01);
+                        out( ox+1, oy+1 ) = 0.25*(p00+p10+p01+p11);
+                    }
+                }
+
+                // if out has odd sizes
+                size_t sx_out = out.get_size(0);
+                size_t sy_out = out.get_size(1);
+
+                if ( (2*sx) < sx_out )
+                {
+                    for ( y=0; y<sy_out; y++ )
+                    {
+                        size_t offset = y*sx_out + sx_out-1;
+                        out(offset) = out(offset-1);
+                    }
+                }
+
+                if ( (2*sy) < sy_out )
+                {
+                    memcpy(out.begin()+(sy_out-1)*sx_out, out.begin()+(sy_out-2)*sx_out, sizeof(T)*sx_out);
+                }
+            }
+            else if ( D == 3 )
+            {
+                size_t sx = dim[0];
+                size_t sy = dim[1];
+                size_t sz = dim[2];
+
+                long long x, y, z;
+
+                #pragma omp parallel for default(none) private(x, y, z) shared(sx, sy, sz, bh, out)
+                for ( z=0; z<sz; z++ )
+                {
+                    size_t oz = z<<1;
+
+                    for ( y=0; y<sy; y++ )
+                    {
+                        size_t oy = y<<1;
+
+                        for ( x=0; x<sx; x++ )
+                        {
+                            size_t ox = x<<1;
+
+                            T p000 = bh(x, y, z);
+                            T p100 = bh(x+1, y, z);
+                            T p010 = bh(x, y+1, z);
+                            T p110 = bh(x+1, y+1, z);
+
+                            T p001 = bh(x, y, z+1);
+                            T p101 = bh(x+1, y, z+1);
+                            T p011 = bh(x, y+1, z+1);
+                            T p111 = bh(x+1, y+1, z+1);
+
+                            out( ox, oy, oz ) = p000;
+                            out( ox+1, oy, oz ) = 0.5*(p000 + p100);
+                            out( ox, oy+1, oz ) = 0.5*(p000 + p010);
+                            out( ox+1, oy+1, oz ) = 0.25*(p000+p100+p010+p110);
+
+                            out( ox, oy, oz+1 ) = 0.5*(p000 + p001);
+                            out( ox+1, oy, oz+1 ) = 0.25*(p000 + p100 + p001 + p101);
+                            out( ox, oy+1, oz+1 ) = 0.25*(p000 + p010 + p001 + p011);
+                            out( ox+1, oy+1, oz+1 ) = 0.125*(p000+p100+p010+p110+p001+p101+p011+p111);
+                        }
+                    }
+                }
+
+                // if out has odd sizes
+                size_t sx_out = out.get_size(0);
+                size_t sy_out = out.get_size(1);
+                size_t sz_out = out.get_size(2);
+
+                if ( (2*sx) < sx_out )
+                {
+                    for ( z=0; z<sz_out; z++ )
+                    {
+                        for ( y=0; y<sy_out; y++ )
+                        {
+                            size_t offset = y*sx_out + z*sx_out*sy_out;
+
+                            out( size_t(sx_out-1+offset) ) = out( size_t(sx_out-2+offset) );
+                        }
+                    }
+                }
+
+                if ( (2*sy) < sy_out )
+                {
+                    for ( z=0; z<sz_out; z++ )
+                    {
+                        size_t offset = z*sx_out*sy_out + (sy_out-1)*sx_out;
+
+                        for ( x=0; x<sx_out; x++ )
+                        {
+                            out( (size_t)(x+offset) ) = out( (size_t)(x+offset-sx_out) );
+                        }
+                    }
+                }
+
+                if ( (2*sz) < sz_out )
+                {
+                    memcpy(out.begin()+(sz_out-1)*sx_out*sy_out, out.begin()+(sz_out-2)*sx_out*sy_out, sizeof(T)*sx_out*sy_out);
+                }
+            }
+            else
+            {
+                hoNDInterpolatorLinear<hoNDImage<T, D> > interp(const_cast< hoNDImage<T, D>& >(in), bh);
+
+                size_t N = out.get_number_of_elements();
+
+                long long n;
+
+                #pragma omp parallel default(none) private(n) shared(N, bh, in, out, interp)
+                {
+                    std::vector<size_t> ind_out(D);
+                    std::vector<coord_type> ind_in(D);
+
+                    #pragma omp for 
+                    for ( n=0; n<N; n++ )
+                    {
+                        out.calculate_index(n, ind_out);
+
+                        unsigned int ii;
+                        for ( ii=0; ii<D; ii++ )
+                        {
+                            ind_in[ii] = (coord_type)(ind_out[ii]*0.5);
+                        }
+
+                        out( (size_t)(n) ) = interp(ind_in);
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in expandImageBy2(const hoNDImage<T, D>& in, BoundaryHandlerType& bh, hoNDImage<T, D>& out) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<class ArrayType> 
+    bool filterMedian(const ArrayType& img, size_t w[], ArrayType& img_out)
+    {
+        try
+        {
+            typedef typename ArrayType::value_type T;
+
+            size_t D = img.get_number_of_dimensions();
+
+            img_out = img;
+
+            if ( D == 1 )
+            {
+                long long halfW = w[0]/2;
+                long long N = (long long)img.get_number_of_elements();
+
+                long long n, m, t;
+
+                #pragma omp parallel default(none) private(n, m, t) shared(halfW, N, img, img_out)
+                {
+                    std::vector<T> buf(2*halfW+1);
+
+                    #pragma omp for 
+                    for ( n=0; n<N; n++ )
+                    {
+                        for ( m=-halfW; m<=halfW; m++ )
+                        {
+                            t = n + m;
+                            if ( t<0 ) t = 0;
+                            if ( t > N-1 ) t = N-1;
+                            buf[m+halfW] = img( (size_t)t );
+                        }
+
+                        std::sort(buf.begin(), buf.end());
+
+                        img_out(n) = buf[halfW];
+                    }
+                }
+            }
+            else if ( D == 2 )
+            {
+                long long halfX = w[0]/2;
+                long long halfY = w[1]/2;
+                long long sx = (long long)img.get_size(0);
+                long long sy = (long long)img.get_size(1);
+
+                const T* pImg = img.begin();
+                T* pImgOut = img_out.begin();
+
+                long long WX = 2*halfX+1;
+                long long WY = 2*halfY+1;
+
+                long long medianInd = WX*WY/2;
+
+                long long x, y, tx, ty, hx, hy;
+                #pragma omp parallel default(none) private(x, y, tx, ty, hx, hy) shared(halfX, halfY, sx, sy, WX, WY, pImg, pImgOut, medianInd)
+                {
+                    std::vector<T> buf(WX*WY);
+
+                    #pragma omp for 
+                    for ( y=halfY; y<sy-halfY; y++ )
+                    {
+                        for ( x=halfX; x<sx-halfX; x++ )
+                        {
+                            size_t ind(0);
+                            for ( hy=-halfY; hy<=halfY; hy++ )
+                            {
+                                ty = hy + y;
+
+                                for ( hx=-halfX; hx<=halfX; hx++ )
+                                {
+                                    tx = hx + x;
+
+                                    buf[ind++] = pImg[tx + ty*sx];
+                                }
+                            }
+
+                            std::sort(buf.begin(), buf.end());
+
+                            pImgOut[x + y*sx] = buf[medianInd];
+                        }
+                    }
+                }
+
+                std::vector<T> buf(WX*WY);
+
+                for ( y=0; y<halfY; y++ )
+                {
+                    for ( x=0; x<sx; x++ )
+                    {
+                        size_t ind(0);
+                        for ( hy=-halfY; hy<=halfY; hy++ )
+                        {
+                            ty = hy + y;
+                            if ( ty < 0 ) ty = 0;
+
+                            for ( hx=-halfX; hx<=halfX; hx++ )
+                            {
+                                tx = hx + x;
+                                if ( tx < 0 ) tx = 0;
+                                if ( tx > sx-1 ) tx = sx-1;
+
+                                buf[ind++] = pImg[tx + ty*sx];
+                            }
+                        }
+
+                        std::sort(buf.begin(), buf.end());
+
+                        pImgOut[x + y*sx] = buf[medianInd];
+                    }
+                }
+
+                for ( y=sy-halfY; y<sy; y++ )
+                {
+                    for ( x=0; x<sx; x++ )
+                    {
+                        size_t ind(0);
+                        for ( hy=-halfY; hy<=halfY; hy++ )
+                        {
+                            ty = hy + y;
+                            if ( ty > sy-1 ) ty = sy-1;
+
+                            for ( hx=-halfX; hx<=halfX; hx++ )
+                            {
+                                tx = hx + x;
+                                if ( tx < 0 ) tx = 0;
+                                if ( tx > sx-1 ) tx = sx-1;
+
+                                buf[ind++] = pImg[tx + ty*sx];
+                            }
+                        }
+
+                        std::sort(buf.begin(), buf.end());
+
+                        pImgOut[x + y*sx] = buf[medianInd];
+                    }
+                }
+            }
+            else if ( D == 3 )
+            {
+                long long halfX = w[0]/2;
+                long long halfY = w[1]/2;
+                long long halfZ = w[2]/2;
+                long long sx = (long long)img.get_size(0);
+                long long sy = (long long)img.get_size(1);
+                long long sz = (long long)img.get_size(2);
+
+                const T* pImg = img.begin();
+                T* pImgOut = img_out.begin();
+
+                long long WX = 2*halfX+1;
+                long long WY = 2*halfY+1;
+                long long WZ = 2*halfZ+1;
+
+                long long medianInd = WX*WY*WZ/2;
+
+                long long x, y, z, tx, ty, tz, hx, hy, hz;
+                #pragma omp parallel default(none) private(x, y, z, tx, ty, tz, hx, hy, hz) shared(halfX, halfY, halfZ, sx, sy, sz, WX, WY, WZ, pImg, pImgOut, medianInd)
+                {
+                    std::vector<T> buf(WX*WY*WZ);
+
+                    #pragma omp for 
+                    for ( z=halfZ; z<sz-halfZ; z++ )
+                    {
+                        for ( y=halfY; y<sy-halfY; y++ )
+                        {
+                            for ( x=halfX; x<sx-halfX; x++ )
+                            {
+                                size_t ind(0);
+                                for ( hz=-halfZ; hz<=halfZ; hz++ )
+                                {
+                                    tz = hz + z;
+
+                                    for ( hy=-halfY; hy<=halfY; hy++ )
+                                    {
+                                        ty = hy + y;
+
+                                        for ( hx=-halfX; hx<=halfX; hx++ )
+                                        {
+                                            tx = hx + x;
+
+                                            buf[ind++] = pImg[tx + ty*sx + tz*sx*sy];
+                                        }
+                                    }
+                                }
+
+                                std::sort(buf.begin(), buf.end());
+
+                                pImgOut[x + y*sx + z*sx*sy] = buf[medianInd];
+                            }
+                        }
+                    }
+                }
+
+                std::vector<T> buf(WX*WY*WZ);
+
+                for ( z=0; z<halfZ; z++ )
+                {
+                    for ( y=0; y<sy; y++ )
+                    {
+                        for ( x=0; x<sx; x++ )
+                        {
+                            size_t ind(0);
+                            for ( hz=-halfZ; hz<=halfZ; hz++ )
+                            {
+                                tz = hz + z;
+                                if ( tz < 0 ) tz = 0;
+
+                                for ( hy=-halfY; hy<=halfY; hy++ )
+                                {
+                                    ty = hy + y;
+                                    if ( ty < 0 ) ty = 0;
+                                    if ( ty > sy-1 ) ty = sy-1;
+
+                                    for ( hx=-halfX; hx<=halfX; hx++ )
+                                    {
+                                        tx = hx + x;
+                                        if ( tx < 0 ) tx = 0;
+                                        if ( tx > sx-1 ) tx = sx-1;
+
+                                        buf[ind++] = pImg[tx + ty*sx + tz*sx*sy];
+                                    }
+                                }
+                            }
+
+                            std::sort(buf.begin(), buf.end());
+
+                            pImgOut[x + y*sx + z*sx*sy] = buf[medianInd];
+                        }
+                    }
+                }
+
+                for ( z=sz-halfZ; z<sz; z++ )
+                {
+                    for ( y=0; y<sy; y++ )
+                    {
+                        for ( x=0; x<sx; x++ )
+                        {
+                            size_t ind(0);
+                            for ( hz=-halfZ; hz<=halfZ; hz++ )
+                            {
+                                tz = hz + z;
+                                if ( tz > sz-1 ) tz = sz-1;
+
+                                for ( hy=-halfY; hy<=halfY; hy++ )
+                                {
+                                    ty = hy + y;
+                                    if ( ty < 0 ) ty = 0;
+                                    if ( ty > sy-1 ) ty = sy-1;
+
+                                    for ( hx=-halfX; hx<=halfX; hx++ )
+                                    {
+                                        tx = hx + x;
+                                        if ( tx < 0 ) tx = 0;
+                                        if ( tx > sx-1 ) tx = sx-1;
+
+                                        buf[ind++] = pImg[tx + ty*sx + tz*sx*sy];
+                                    }
+                                }
+                            }
+
+                            std::sort(buf.begin(), buf.end());
+
+                            pImgOut[x + y*sx + z*sx*sy] = buf[medianInd];
+                        }
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in filterMedian(const ArrayType& img, size_t w[], ArrayType& img_out) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<class ArrayType> 
+    bool filter1D(const ArrayType& img, const hoNDArray<typename realType<typename ArrayType::value_type>::Type>& ker, GT_BOUNDARY_CONDITION bh, ArrayType& img_out)
+    {
+        try
+        {
+            typedef typename ArrayType::value_type T;
+            typedef typename realType<T>::Type real_value_type;
+
+            typedef hoNDImage<T, 1> Array1DType;
+
+            long long RO = (long long)img.get_size(0);
+            long long num = (long long)(img.get_number_of_elements()/RO);
+
+            img_out = img;
+
+            long long kerLen = (long long)ker.get_size(0);
+            long long kerHalfLen = kerLen/2;
+
+            const real_value_type* pKer = ker.begin();
+
+            long long ii;
+
+            #pragma omp parallel default(none) private(ii) shared(bh, num, RO, img, img_out, kerLen, kerHalfLen, pKer)
+            {
+                hoNDBoundaryHandler<Array1DType>* pBH = NULL;
+
+                hoNDBoundaryHandlerFixedValue<Array1DType> bhFixedValue;
+                hoNDBoundaryHandlerBorderValue<Array1DType> bhBorderValue;
+                hoNDBoundaryHandlerPeriodic<Array1DType> bhPeriodic;
+                hoNDBoundaryHandlerMirror<Array1DType> bhMirror;
+
+                pBH = &bhBorderValue;
+
+                if ( bh == GT_BOUNDARY_CONDITION_FIXEDVALUE )
+                {
+                    pBH = &bhFixedValue;
+                }
+                else if ( bh == GT_BOUNDARY_CONDITION_BORDERVALUE )
+                {
+                    pBH = &bhBorderValue;
+                }
+                else if ( bh == GT_BOUNDARY_CONDITION_PERIODIC )
+                {
+                    pBH = &bhPeriodic;
+                }
+                else if ( bh == GT_BOUNDARY_CONDITION_MIRROR )
+                {
+                    pBH = &bhMirror;
+                }
+
+                #pragma omp for 
+                for ( ii=0; ii<num; ii++ )
+                {
+                    Array1DType img1D(RO, const_cast<T*>(img.begin()+ii*RO));
+                    pBH->setArray(img1D);
+
+                    Array1DType img_out1D(RO, img_out.begin()+ii*RO);
+
+                    long long k, j;
+                    for ( k=0; k<RO; k++ )
+                    {
+                        T v = 0;
+                        for ( j=0; j<kerLen; j++ )
+                        {
+                            v += (*pBH)(k+j-kerHalfLen) * pKer[kerLen-j-1];
+                        }
+
+                        img_out1D(k) = v;
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in filter1D(const ArrayType& img, const hoNDArray<T>& ker, GT_BOUNDARY_CONDITION bh, ArrayType& img_out) ... ");
+            return false;
+        }
+
+        return true;
+    }
+}
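
The helpers above are meant to be called with a boundary handler; the functions bind it to the input image themselves via bh.setArray(). A minimal usage sketch for the 2D float case follows (illustrative only, assuming these helpers live in the Gadgetron namespace and that the input image has been created and filled elsewhere):

    typedef Gadgetron::hoNDImage<float, 2> Image2D;

    Image2D img;                 // input image, created and filled elsewhere
    Image2D half, median;

    // Average-and-decimate by a factor of 2; the boundary handler supplies
    // samples that fall outside the image border.
    Gadgetron::hoNDBoundaryHandlerBorderValue<Image2D> bh;
    Gadgetron::downsampleImageBy2WithAveraging(img, bh, half);

    // Median filter with a 3x3 window.
    size_t w[2] = {3, 3};
    Gadgetron::filterMedian(img, w, median);

Note that expandImageBy2() expects the caller to have created the output image with the target dimensions already; it only updates the pixel size, origin and axes before filling in the interpolated samples.
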
diff --git a/toolboxes/core/cpu/math/hoNDImage_util_instantiate.hxx b/toolboxes/core/cpu/math/hoNDImage_util_instantiate.hxx
new file mode 100644
index 0000000..fb05753
--- /dev/null
+++ b/toolboxes/core/cpu/math/hoNDImage_util_instantiate.hxx
@@ -0,0 +1,15 @@
+
+template EXPORTCPUCOREMATH bool filterGaussian(hoNDImage<float, DimImage>& img, float sigma[], float* mem);
+template EXPORTCPUCOREMATH bool filterGaussian(hoNDImage<double, DimImage>& img, double sigma[], double* mem);
+template EXPORTCPUCOREMATH bool filterGaussian(hoNDImage<float, DimImage>& img, double sigma[], float* mem);
+template EXPORTCPUCOREMATH bool filterGaussian(hoNDImage<double, DimImage>& img, float sigma[], double* mem);
+
+template EXPORTCPUCOREMATH bool filterGaussian(hoNDImage< std::complex<float> , DimImage>& img, float sigma[],  std::complex<float> * mem);
+template EXPORTCPUCOREMATH bool filterGaussian(hoNDImage< std::complex<double> , DimImage>& img, double sigma[],  std::complex<double> * mem);
+template EXPORTCPUCOREMATH bool filterGaussian(hoNDImage< std::complex<float> , DimImage>& img, double sigma[],  std::complex<float> * mem);
+template EXPORTCPUCOREMATH bool filterGaussian(hoNDImage< std::complex<double> , DimImage>& img, float sigma[],  std::complex<double> * mem);
+
+template EXPORTCPUCOREMATH bool gradient(const hoNDImage<float, DimImage>& x, hoNDImage<float, DimImage> gx[]);
+template EXPORTCPUCOREMATH bool gradient(const hoNDImage<double, DimImage>& x, hoNDImage<double, DimImage> gx[]);
+template EXPORTCPUCOREMATH bool gradient(const hoNDImage< std::complex<float> , DimImage>& x, hoNDImage< std::complex<float> , DimImage> gx[]);
+template EXPORTCPUCOREMATH bool gradient(const hoNDImage< std::complex<double> , DimImage>& x, hoNDImage< std::complex<double> , DimImage> gx[]);
diff --git a/toolboxes/core/gpu/CMakeLists.txt b/toolboxes/core/gpu/CMakeLists.txt
new file mode 100644
index 0000000..62277cc
--- /dev/null
+++ b/toolboxes/core/gpu/CMakeLists.txt
@@ -0,0 +1,87 @@
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_GPUCORE__)
+endif (WIN32)
+
+if(WIN32)
+  link_directories(${Boost_LIBRARY_DIRS})
+endif(WIN32)
+
+include_directories( 
+  ${CUDA_INCLUDE_DIRS}
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+)
+
+cuda_add_library(gadgetron_toolbox_gpucore SHARED 
+    check_CUDA.h
+    CUBLASContextProvider.h
+    cudaDeviceManager.h
+    cuNDArray.h
+    cuNDArray_blas.h
+    cuNDArray_elemwise.h
+    cuNDArray_operators.h
+    cuNDArray_utils.h
+    cuNDArray_fileio.h
+    cuNDArray_reductions.h
+    GadgetronCuException.h
+    gpucore_export.h
+    GPUTimer.h
+    hoCuNDArray.h
+    hoCuNDArray_blas.h
+    hoCuNDArray_elemwise.h
+    hoCuNDArray_utils.h
+    radial_utilities.h
+    real_utilities_device.h
+    setup_grid.h
+    cuNDArray_operators.cu
+    cuNDArray_elemwise.cu
+    cuNDArray_blas.cu
+    cuNDArray_utils.cu
+    cuNDArray_reductions.cu
+    radial_utilities.cu
+    hoCuNDArray_blas.cpp
+    CUBLASContextProvider.cpp
+    cudaDeviceManager.cpp
+    cuSparseMatrix.cu
+  )
+
+set_target_properties(gadgetron_toolbox_gpucore PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(gadgetron_toolbox_gpucore 
+  gadgetron_toolbox_cpucore
+  gadgetron_toolbox_log
+  ${Boost_LIBRARIES}
+  ${CUDA_LIBRARIES} 
+  ${CUDA_CUFFT_LIBRARIES} 
+  ${CUDA_CUBLAS_LIBRARIES} 
+  ${CUDA_CUSPARSE_LIBRARIES}
+  ${MKL_LIBRARIES}
+  )
+
+install(TARGETS gadgetron_toolbox_gpucore DESTINATION lib COMPONENT main)
+
+install(FILES
+  gpucore_export.h
+  cuNDArray.h
+  cuNDArray_operators.h
+  cuNDArray_elemwise.h
+  cuNDArray_blas.h
+  cuNDArray_utils.h
+  cuNDArray_math.h
+  cuNDArray_fileio.h
+  cuNDArray_reductions.h
+  hoCuNDArray.h
+  hoCuNDArray_blas.h
+  hoCuNDArray_elemwise.h
+  hoCuNDArray_utils.h
+  hoCuNDArray_math.h
+  GPUTimer.h				
+  GadgetronCuException.h
+  radial_utilities.h
+  real_utilities_device.h
+  check_CUDA.h
+  cudaDeviceManager.h
+  CUBLASContextProvider.h
+  setup_grid.h
+  cuSparseMatrix.h
+  DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
diff --git a/toolboxes/core/gpu/CUBLASContextProvider.cpp b/toolboxes/core/gpu/CUBLASContextProvider.cpp
new file mode 100644
index 0000000..dc540bc
--- /dev/null
+++ b/toolboxes/core/gpu/CUBLASContextProvider.cpp
@@ -0,0 +1,115 @@
+#include "CUBLASContextProvider.h"
+
+#include <cuda_runtime_api.h>
+
+#ifdef _WITH_CULA_SUPPORT
+#include <cula_lapack_device.h>
+#endif
+
+
+CUBLASContextProvider* CUBLASContextProvider::instance()
+{
+	if (!instance_) instance_ = new CUBLASContextProvider();
+	return instance_;
+}
+
+CUBLASContextProvider::~CUBLASContextProvider()
+{
+	std::map<int, cublasHandle_t>::iterator it = handles_.begin();
+
+	while (it != handles_.end()) {
+		if (cudaSetDevice(it->first)!= cudaSuccess) {
+		    std::cerr << "Error: unable to set CUDA device." << std::endl;
+		}
+
+#ifdef _WITH_CULA_SUPPORT
+		culaShutdown();
+#endif
+
+		cublasDestroy_v2(it->second);
+		it++;
+	}
+
+}
+
+cublasHandle_t* CUBLASContextProvider::getCublasHandle(int device_no)
+{
+	std::map<int, cublasHandle_t>::iterator it;
+
+
+	//Let's see if we have the handle already:
+	it = handles_.find(device_no);
+
+	if (it != handles_.end()) {
+		return &handles_[device_no];
+	}
+
+
+	//We don't have the handle yet, let's check if it makes sense to create one
+
+	int number_of_devices = 0;
+	if (cudaGetDeviceCount(&number_of_devices) != cudaSuccess) {
+		std::cerr << "Error: unable to query number of CUDA devices." << std::endl;
+		return 0;
+	}
+
+	if (number_of_devices == 0) {
+		std::cerr << "Error: no available CUDA devices." << std::endl;
+		return 0;
+	}
+
+	if (device_no >= number_of_devices) {
+		std::cerr << "Error: requested device number exceeds the number of available devices." << std::endl;
+		return 0;
+	}
+
+	//OK, we can create the handle. Before doing so, capture the current CUDA device.
+
+	int current_device_no;
+	if (cudaGetDevice(&current_device_no) != cudaSuccess) {
+		std::cerr << "Error: unable to get current CUDA device." << std::endl;
+		return 0;
+	}
+
+	if (current_device_no != device_no) {
+		//We must switch context
+		if (cudaSetDevice(device_no)!= cudaSuccess) {
+		    std::cerr << "Error: unable to set CUDA device." << std::endl;
+		      return 0;
+		}
+	}
+
+	cublasHandle_t handle; // this is a struct pointer
+
+	//GDEBUG_STREAM("*********   CREATING NEW CONTEXT ************" << std::endl);
+
+	if (cublasCreate_v2(&handle) != CUBLAS_STATUS_SUCCESS) {
+		std::cerr << "CUBLASContextProvider: unable to create cublas handle" << std::endl;
+		return 0;
+	}
+
+	handles_[device_no] = handle;
+
+#ifdef _WITH_CULA_SUPPORT
+	culaStatus s;
+	s = culaInitialize();
+	if(s != culaNoError) {
+		std::cerr << "CUBLASContextProvider: failed to initialize CULA" << std::endl;
+		return 0;
+	}
+#endif
+
+	if (current_device_no != device_no) {
+		//We must switch context back
+		if (cudaSetDevice(current_device_no)!= cudaSuccess) {
+			std::cerr << "Error: unable to set CUDA device." << std::endl;
+			return 0;
+		}
+	}
+
+	return &handles_[device_no];
+}
+
+
+CUBLASContextProvider* CUBLASContextProvider::instance_ = 0;
+
diff --git a/toolboxes/core/gpu/CUBLASContextProvider.h b/toolboxes/core/gpu/CUBLASContextProvider.h
new file mode 100644
index 0000000..27c62cf
--- /dev/null
+++ b/toolboxes/core/gpu/CUBLASContextProvider.h
@@ -0,0 +1,35 @@
+/*
+ * CUBLASContextProvider.h
+ *
+ *  Created on: Mar 22, 2012
+ *      Author: Michael S. Hansen
+ */
+
+#ifndef CUBLASCONTEXTPROVIDER_H_
+#define CUBLASCONTEXTPROVIDER_H_
+#pragma once
+
+#include "gpucore_export.h"
+
+#include <cublas_v2.h>
+#include <map>
+#include <iostream>
+
+class EXPORTGPUCORE CUBLASContextProvider
+{
+
+public:
+	static CUBLASContextProvider* instance();
+
+	cublasHandle_t* getCublasHandle(int device_no = 0);
+
+private:
+	CUBLASContextProvider() {}
+	virtual ~CUBLASContextProvider();
+
+	static CUBLASContextProvider* instance_;
+
+	std::map<int, cublasHandle_t> handles_;
+};
+
+#endif /* CUBLASCONTEXTPROVIDER_H_ */
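
The provider is a process-wide singleton that creates one cuBLAS handle per device, lazily, and owns it until shutdown. A hypothetical call site (illustrative only, not taken from the upstream sources) would look like:

    #include "CUBLASContextProvider.h"

    void use_cublas_on_device_zero()
    {
        cublasHandle_t* handle = CUBLASContextProvider::instance()->getCublasHandle(0);
        if (!handle) {
            // no CUDA devices, an invalid device number, or cublasCreate_v2 failed
            return;
        }
        // *handle can now be passed to cublas_v2 API calls; the provider
        // destroys all handles in its destructor.
    }
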
diff --git a/toolboxes/core/gpu/GPUTimer.h b/toolboxes/core/gpu/GPUTimer.h
new file mode 100644
index 0000000..89a6348
--- /dev/null
+++ b/toolboxes/core/gpu/GPUTimer.h
@@ -0,0 +1,73 @@
+/** \file GPUTimer.h
+    \brief Utility to measure CUDA performance.
+*/
+
+#ifndef __GPUTIMER_H
+#define __GPUTIMER_H
+
+#pragma once
+
+#include <iostream>
+#include <string>
+#include <cuda_runtime_api.h>
+
+namespace Gadgetron{
+
+    class GPUTimer
+    {
+    public:
+        GPUTimer() : name_("GPUTimer"), timing_in_destruction_(true)
+        {
+            start();
+        }
+
+        GPUTimer(bool timing) : name_("GPUTimer"), timing_in_destruction_(timing)
+        {
+            if ( timing_in_destruction_ )
+            {
+                start();
+            }
+        }
+
+        GPUTimer(const char* name) : name_(name), timing_in_destruction_(true)
+        {
+            start();
+        }
+
+        virtual ~GPUTimer() 
+        {
+            if ( timing_in_destruction_ )
+            {
+                stop();
+            }
+        }
+
+        virtual void start()
+        {
+            cudaEventCreate(&start_event_);
+            cudaEventCreate(&stop_event_);
+            cudaEventRecord( start_event_, 0 );
+        }
+
+        virtual void stop()
+        {
+            float time;
+            cudaEventRecord( stop_event_, 0 );
+            cudaEventSynchronize( stop_event_ );
+            cudaEventElapsedTime( &time, start_event_, stop_event_ );
+            cudaEventDestroy( start_event_ );
+            cudaEventDestroy( stop_event_ );
+
+            GDEBUG_STREAM(name_ << ": " << time << " ms" << std::endl; std::cout.flush());
+        }
+
+        void set_timing_in_destruction(bool timing) { timing_in_destruction_ = timing; }
+
+        cudaEvent_t start_event_;
+        cudaEvent_t stop_event_;
+
+        std::string name_;
+        bool timing_in_destruction_;
+    };
+}
+#endif //__GPUTIMER_H
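
GPUTimer is intended as a scoped (RAII) timer: the constructor records a CUDA start event and, unless timing was disabled, the destructor records and synchronizes the stop event and logs the elapsed time through GDEBUG_STREAM. An illustrative sketch (not part of the upstream code; some_kernel stands in for any kernel launch):

    {
        Gadgetron::GPUTimer timer("some_kernel");
        // some_kernel<<<grid, block>>>(...);   // hypothetical work being timed
    }   // leaving the scope stops the timer and logs "some_kernel: <elapsed> ms"
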
diff --git a/toolboxes/core/gpu/GadgetronCuException.h b/toolboxes/core/gpu/GadgetronCuException.h
new file mode 100644
index 0000000..dfa4cfa
--- /dev/null
+++ b/toolboxes/core/gpu/GadgetronCuException.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <cuda_runtime_api.h>
+#include <stdexcept>
+
+namespace Gadgetron{
+  
+  class cuda_error : public std::runtime_error
+  {
+  public:
+    cuda_error(std::string msg) : std::runtime_error(msg) {}
+    cuda_error(cudaError_t errN) : std::runtime_error(cudaGetErrorString(errN)) {
+    }
+  };
+}
diff --git a/toolboxes/core/gpu/check_CUDA.h b/toolboxes/core/gpu/check_CUDA.h
new file mode 100644
index 0000000..f59b1f1
--- /dev/null
+++ b/toolboxes/core/gpu/check_CUDA.h
@@ -0,0 +1,40 @@
+/** \file check_CUDA.h
+    \brief Macros to check whether GPU-based code has caused any errors and, if so, throw a runtime exception accordingly.
+*/
+
+#pragma once
+
+#include "GadgetronCuException.h"
+
+namespace Gadgetron {
+
+  /**
+   *  Should never be called directly; use the CHECK_FOR_CUDA_ERROR() macro instead.
+   *  inspired by cutil.h: CUT_CHECK_ERROR
+   */
+  inline void CHECK_FOR_CUDA_ERROR(char const * cur_fun, const char* file, const int line) {
+    cudaError_t errorCode = cudaGetLastError();
+    if (errorCode != cudaSuccess) {
+      throw cuda_error(errorCode);
+    }
+#ifdef DEBUG
+    cudaThreadSynchronize();
+    errorCode = cudaGetLastError();
+    if (errorCode != cudaSuccess) {
+      throw cuda_error(errorCode);
+    }
+#endif
+  }
+}
+
+/**
+ *  Checks for CUDA errors and throws an exception if an error was detected.
+ */
+#define CHECK_FOR_CUDA_ERROR(); CHECK_FOR_CUDA_ERROR(BOOST_CURRENT_FUNCTION,__FILE__,__LINE__);
+
+/**
+ *  Calls "res", checks for CUDA errors, and throws an exception if an error was detected.
+ */
+#define CUDA_CALL(res) {cudaError_t errorCode = res; if (errorCode != cudaSuccess) { throw cuda_error(errorCode); }}
+
+#define CUSPARSE_CALL(res) {cusparseStatus_t errorCode = res; if (errorCode != CUSPARSE_STATUS_SUCCESS){ std::stringstream ss; ss << "CUSPARSE failed with error: " <<  gadgetron_getCusparseErrorString(errorCode); throw cuda_error(ss.str());}}
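
Typical usage of these macros (an illustrative sketch, not taken from the upstream sources) is to wrap individual CUDA runtime calls in CUDA_CALL and to place CHECK_FOR_CUDA_ERROR(); after kernel launches, which report failures only through cudaGetLastError(). The sketch assumes it sits in code with a using namespace Gadgetron, since both macros expand to unqualified references to the helpers above:

    float* d_buf = 0;
    CUDA_CALL(cudaMalloc((void**)&d_buf, 1024 * sizeof(float)));

    // my_kernel<<<grid, block>>>(d_buf);   // hypothetical kernel launch
    CHECK_FOR_CUDA_ERROR();                 // throws Gadgetron::cuda_error on failure

    CUDA_CALL(cudaFree(d_buf));
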
diff --git a/toolboxes/core/gpu/cuNDArray.h b/toolboxes/core/gpu/cuNDArray.h
new file mode 100644
index 0000000..6f4ff28
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray.h
@@ -0,0 +1,878 @@
+/** \file cuNDArray.h
+\brief GPU-based N-dimensional array (data container)
+*/
+
+#ifndef CUNDARRAY_H
+#define CUNDARRAY_H
+#pragma once
+
+#include "NDArray.h"
+#include "hoNDArray.h"
+#include "complext.h"
+#include "GadgetronCuException.h"
+#include "check_CUDA.h"
+#include "hoCuNDArray.h"
+#include <boost/shared_ptr.hpp>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <thrust/device_vector.h>
+
+namespace Gadgetron{
+
+    template <typename T> class cuNDArray : public NDArray<T>
+    {
+
+    public:
+
+        // Constructors
+        //
+
+        cuNDArray();
+        cuNDArray(const cuNDArray<T> &a);
+        cuNDArray(const cuNDArray<T> *a);
+        explicit cuNDArray(const hoNDArray<T> &a);
+        explicit cuNDArray(hoNDArray<T> *a);
+
+#if __cplusplus > 199711L
+        // Move constructor
+        cuNDArray(cuNDArray<T>&& a);
+#endif
+        explicit cuNDArray(std::vector<size_t> *dimensions);
+        cuNDArray(std::vector<size_t> *dimensions, int device_no);
+        cuNDArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+
+        explicit cuNDArray(std::vector<size_t> &dimensions);
+        cuNDArray(std::vector<size_t> &dimensions, int device_no);
+        cuNDArray(std::vector<size_t> &dimensions, T* data, bool delete_data_on_destruct = false);
+
+        explicit cuNDArray(boost::shared_ptr<std::vector<size_t> > dimensions);
+        cuNDArray(boost::shared_ptr<std::vector<size_t> > dimensions, int device_no);
+        cuNDArray(boost::shared_ptr<std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct = false);
+
+        explicit cuNDArray(size_t len);
+        cuNDArray(size_t sx, size_t sy);
+        cuNDArray(size_t sx, size_t sy, size_t sz);
+        cuNDArray(size_t sx, size_t sy, size_t sz, size_t st);
+        cuNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp);
+        cuNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq);
+        cuNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr);
+        cuNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss);
+
+        // Destructor
+        virtual ~cuNDArray();
+
+        // Assignment operator
+        cuNDArray<T>& operator=(const cuNDArray<T>& rhs);
+
+#if __cplusplus > 199711L
+        cuNDArray<T>& operator=(cuNDArray<T>&& rhs);
+#endif
+        cuNDArray<T>& operator=(const hoNDArray<T>& rhs);
+
+        virtual void create(std::vector<size_t> *dimensions);
+        virtual void create(std::vector<size_t> *dimensions, int device_no);
+        virtual void create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+
+        virtual void create(std::vector<size_t> &dimensions);
+        virtual void create(std::vector<size_t> &dimensions, int device_no);
+        virtual void create(std::vector<size_t> &dimensions, T* data, bool delete_data_on_destruct = false);
+
+        virtual void create(boost::shared_ptr<std::vector<size_t> > dimensions);
+        virtual void create(boost::shared_ptr<std::vector<size_t> > dimensions, int device_no);
+        virtual void create(boost::shared_ptr<std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct = false);
+
+        virtual void create(size_t len);
+        virtual void create(size_t sx, size_t sy);
+        virtual void create(size_t sx, size_t sy, size_t sz);
+        virtual void create(size_t sx, size_t sy, size_t sz, size_t st);
+        virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp);
+        virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq);
+        virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr);
+        virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss);
+
+        virtual boost::shared_ptr< hoNDArray<T> > to_host() const;
+        virtual void to_host( hoNDArray<T> *out ) const;
+
+        virtual void set_device(int device);
+        int get_device();
+
+        thrust::device_ptr<T> get_device_ptr();
+        thrust::device_ptr<T> begin();
+        thrust::device_ptr<T> end();
+        const thrust::device_ptr<T> begin() const;
+        const thrust::device_ptr<T> end() const;
+
+        T at( size_t idx );
+        T operator[]( size_t idx );
+
+
+    protected:
+
+        int device_; 
+
+        virtual void allocate_memory();
+        virtual void deallocate_memory();
+    };
+
+    template <typename T> 
+    cuNDArray<T>::cuNDArray() : NDArray<T>::NDArray() 
+    { 
+        cudaGetDevice(&this->device_); 
+    }
+
+    template <typename T> 
+    cuNDArray<T>::cuNDArray(const cuNDArray<T> &a) : NDArray<T>::NDArray() 
+    {
+        cudaGetDevice(&this->device_);
+        this->data_ = 0;
+        this->dimensions_ = a.get_dimensions();
+        allocate_memory();
+        if (a.device_ == this->device_) {
+            CUDA_CALL(cudaMemcpy(this->data_, a.data_, this->elements_*sizeof(T), cudaMemcpyDeviceToDevice));
+        } else {
+            //This memory is on a different device, we must move it.
+            cudaSetDevice(a.device_);
+            boost::shared_ptr< hoNDArray<T> > tmp = a.to_host();
+            cudaSetDevice(this->device_);
+            cudaError_t err = cudaMemcpy(this->data_, tmp->get_data_ptr(), this->elements_*sizeof(T), cudaMemcpyHostToDevice);
+            if (err !=cudaSuccess) {
+                deallocate_memory();
+                this->data_ = 0;
+                this->dimensions_->clear();
+                throw cuda_error(err);
+            }
+        }
+    }
+
+    template <typename T> 
+    cuNDArray<T>::cuNDArray(const cuNDArray<T> *a) : NDArray<T>::NDArray() 
+    {
+        cudaGetDevice(&this->device_);
+        this->data_ = 0;
+        this->dimensions_ = a->get_dimensions();
+        allocate_memory();
+        if (a->device_ == this->device_) {
+            CUDA_CALL(cudaMemcpy(this->data_, a->data_, this->elements_*sizeof(T), cudaMemcpyDeviceToDevice));
+        } else {
+            //This memory is on a different device, we must move it.
+            cudaSetDevice(a->device_);
+            boost::shared_ptr< hoNDArray<T> > tmp = a->to_host();
+            cudaSetDevice(this->device_);
+            cudaError_t err = cudaMemcpy(this->data_, tmp->get_data_ptr(), this->elements_*sizeof(T), cudaMemcpyHostToDevice);
+            if (err !=cudaSuccess) {
+                deallocate_memory();
+                this->data_ = 0;
+                this->dimensions_->clear();
+                throw cuda_error(err);
+            }
+        }
+    }
+
+
+#if __cplusplus > 199711L
+    template <typename T>
+    cuNDArray<T>::cuNDArray(cuNDArray<T>&& a) : NDArray<T>::NDArray()
+    {
+    	device_ = a.device_;
+    	this->data_ = a.data_;
+    	*this->dimensions_ = *a.dimensions_;
+    	this->elements_ = a.elements_;
+    	a.dimensions_.reset();
+    	a.data_=nullptr;
+    }
+#endif
+    template <typename T> 
+    cuNDArray<T>::cuNDArray(const hoNDArray<T> &a) : NDArray<T>::NDArray() 
+    {
+        cudaGetDevice(&this->device_);
+        this->dimensions_ = a.get_dimensions();
+        allocate_memory();
+        if (cudaMemcpy(this->data_, a.get_data_ptr(), this->elements_*sizeof(T), cudaMemcpyHostToDevice) != cudaSuccess) {
+            deallocate_memory();
+            this->data_ = 0;
+            this->dimensions_->clear();
+        }
+    }
+
+    template <typename T> 
+    cuNDArray<T>::cuNDArray(hoNDArray<T> *a) : NDArray<T>::NDArray() 
+    {
+        cudaGetDevice(&this->device_);
+        this->dimensions_ = a->get_dimensions();
+        allocate_memory();
+        if (cudaMemcpy(this->data_, a->get_data_ptr(), this->elements_*sizeof(T), cudaMemcpyHostToDevice) != cudaSuccess) {
+            deallocate_memory();
+            this->data_ = 0;
+            this->dimensions_->clear();
+        }
+    }
+
+    template <typename T> 
+    cuNDArray<T>::cuNDArray(std::vector<size_t> *dimensions) : NDArray<T>::NDArray() 
+    {
+        cudaGetDevice(&this->device_);
+        create(dimensions);
+    }
+
+    template <typename T> 
+    cuNDArray<T>::cuNDArray(std::vector<size_t> *dimensions, int device_no) : NDArray<T>::NDArray() 
+    {
+        cudaGetDevice(&this->device_);
+        create(dimensions,device_no);
+    }
+
+    template <typename T> 
+    cuNDArray<T>::cuNDArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct) : NDArray<T>::NDArray()
+    {
+        cudaGetDevice(&this->device_);
+        create(dimensions,data,delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    cuNDArray<T>::cuNDArray(std::vector<size_t> &dimensions) : NDArray<T>::NDArray() 
+    {
+        cudaGetDevice(&this->device_);
+        create(dimensions);
+    }
+
+    template <typename T> 
+    cuNDArray<T>::cuNDArray(std::vector<size_t> &dimensions, int device_no) : NDArray<T>::NDArray() 
+    {
+        cudaGetDevice(&this->device_);
+        create(dimensions,device_no);
+    }
+
+    template <typename T> 
+    cuNDArray<T>::cuNDArray(std::vector<size_t> &dimensions, T* data, bool delete_data_on_destruct) : NDArray<T>::NDArray()
+    {
+        cudaGetDevice(&this->device_);
+        create(dimensions,data,delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    cuNDArray<T>::cuNDArray(boost::shared_ptr<std::vector<size_t> > dimensions) : NDArray<T>::NDArray()
+    {
+        cudaGetDevice(&this->device_);
+        create(dimensions.get());
+    }
+
+    template <typename T> 
+    cuNDArray<T>::cuNDArray(boost::shared_ptr<std::vector<size_t> > dimensions, int device_no) : NDArray<T>::NDArray()
+    {
+        cudaGetDevice(&this->device_);
+        create(dimensions.get(),device_no);
+    }
+
+    template <typename T> 
+    cuNDArray<T>::cuNDArray(boost::shared_ptr<std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct) : NDArray<T>::NDArray()
+    {
+        cudaGetDevice(&this->device_);
+        create(dimensions.get(),data,delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    cuNDArray<T>::cuNDArray(size_t len)
+    {
+        std::vector<size_t> dim(1);
+        dim[0] = len;
+        cudaGetDevice(&this->device_);
+        create(dim);
+    }
+
+    template <typename T> 
+    cuNDArray<T>::cuNDArray(size_t sx, size_t sy)
+    {
+        std::vector<size_t> dim(2);
+        dim[0] = sx;
+        dim[1] = sy;
+        cudaGetDevice(&this->device_);
+        create(dim);
+    }
+
+    template <typename T> 
+    cuNDArray<T>::cuNDArray(size_t sx, size_t sy, size_t sz)
+    {
+        std::vector<size_t> dim(3);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        cudaGetDevice(&this->device_);
+        create(dim);
+    }
+
+    template <typename T> 
+    cuNDArray<T>::cuNDArray(size_t sx, size_t sy, size_t sz, size_t st)
+    {
+        std::vector<size_t> dim(4);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        cudaGetDevice(&this->device_);
+        create(dim);
+    }
+
+    template <typename T> 
+    cuNDArray<T>::cuNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp)
+    {
+        std::vector<size_t> dim(5);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        cudaGetDevice(&this->device_);
+        create(dim);
+    }
+
+    template <typename T> 
+    cuNDArray<T>::cuNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq)
+    {
+        std::vector<size_t> dim(6);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        cudaGetDevice(&this->device_);
+        create(dim);
+    }
+
+    template <typename T> 
+    cuNDArray<T>::cuNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr)
+    {
+        std::vector<size_t> dim(7);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        dim[6] = sr;
+        cudaGetDevice(&this->device_);
+        create(dim);
+    }
+
+    template <typename T> 
+    cuNDArray<T>::cuNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss)
+    {
+        std::vector<size_t> dim(8);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        dim[6] = sr;
+        dim[7] = ss;
+        cudaGetDevice(&this->device_);
+        create(dim);
+    }
+
+    template <typename T> 
+    cuNDArray<T>::~cuNDArray()
+    { 
+        if (this->delete_data_on_destruct_) 
+            deallocate_memory();  
+    }
+
+#if __cplusplus > 199711L
+    template <typename T>
+    cuNDArray<T>& cuNDArray<T>::operator=(cuNDArray<T>&& rhs){
+
+    	if (&rhs == this) return *this;
+    	this->clear();
+    	*this->dimensions_ = *rhs.dimensions_;
+    	this->elements_ = rhs.elements_;
+    	rhs.dimensions_.reset();
+    	device_ = rhs.device_;
+    	this->data_ = rhs.data_;
+    	rhs.data_ = nullptr;
+    	return *this;
+    }
+#endif
+
+    template <typename T> 
+    cuNDArray<T>& cuNDArray<T>::operator=(const cuNDArray<T>& rhs)
+    {
+        int cur_device; 
+        CUDA_CALL(cudaGetDevice(&cur_device));
+        bool dimensions_match = this->dimensions_equal(&rhs);
+        if (dimensions_match && (rhs.device_ == cur_device) && (cur_device == this->device_)) {
+            CUDA_CALL(cudaMemcpy(this->data_, rhs.data_, this->elements_*sizeof(T), cudaMemcpyDeviceToDevice));
+        }
+        else {
+            CUDA_CALL(cudaSetDevice(this->device_));
+            if( !dimensions_match ){
+                deallocate_memory();
+                this->elements_ = rhs.elements_;
+                this->dimensions_ = rhs.get_dimensions();
+                allocate_memory();
+            }
+            if (this->device_ == rhs.device_) {
+                if (cudaMemcpy(this->data_, rhs.data_, this->elements_*sizeof(T), cudaMemcpyDeviceToDevice) !=cudaSuccess) {	    
+                    cudaSetDevice(cur_device);
+                    throw cuda_error("cuNDArray::operator=: failed to copy data (2)");
+                }
+            } else {
+                if( cudaSetDevice(rhs.device_) != cudaSuccess) {
+                    cudaSetDevice(cur_device);
+                    throw cuda_error("cuNDArray::operator=: unable to set device no (2)");
+                }
+                boost::shared_ptr< hoNDArray<T> > tmp = rhs.to_host();
+                if( cudaSetDevice(this->device_) != cudaSuccess) {
+                    cudaSetDevice(cur_device);
+                    throw cuda_error("cuNDArray::operator=: unable to set device no (3)");
+                }
+                if (cudaMemcpy(this->data_, tmp->get_data_ptr(), this->elements_*sizeof(T), cudaMemcpyHostToDevice) != cudaSuccess) {
+                    cudaSetDevice(cur_device);
+                    throw cuda_error("cuNDArray::operator=: failed to copy data (3)");
+                }
+            }
+            if( cudaSetDevice(cur_device) != cudaSuccess) {
+                throw cuda_error("cuNDArray::operator=: unable to restore to current device");
+            }
+        }
+        return *this;
+    }
+
+    template <typename T> 
+    cuNDArray<T>& cuNDArray<T>::operator=(const hoNDArray<T>& rhs)
+    {
+        int cur_device; 
+        CUDA_CALL(cudaGetDevice(&cur_device));
+        bool dimensions_match = this->dimensions_equal(&rhs);
+        if (dimensions_match && (cur_device == this->device_)) {
+            CUDA_CALL(cudaMemcpy(this->get_data_ptr(), rhs.get_data_ptr(), this->get_number_of_elements()*sizeof(T), cudaMemcpyHostToDevice));
+        }
+        else {
+            CUDA_CALL(cudaSetDevice(this->device_));
+            if( !dimensions_match ){
+                deallocate_memory();
+                this->elements_ = rhs.get_number_of_elements();
+                this->dimensions_ = rhs.get_dimensions();
+                allocate_memory();
+            }
+            if (cudaMemcpy(this->get_data_ptr(), rhs.get_data_ptr(), this->get_number_of_elements()*sizeof(T),
+                cudaMemcpyHostToDevice) !=cudaSuccess) {
+                    cudaSetDevice(cur_device);
+                    throw cuda_error("cuNDArray::operator=: failed to copy data (1)");
+            }
+            if( cudaSetDevice(cur_device) != cudaSuccess) {
+                throw cuda_error("cuNDArray::operator=: unable to restore to current device");
+            }
+        }
+        return *this;
+    }
+
+    template <typename T> 
+    inline void cuNDArray<T>::create(std::vector<size_t> *dimensions)
+    {
+        if ( this->dimensions_equal(dimensions) )
+        {
+            return;
+        }
+
+        return NDArray<T>::create(dimensions);
+    }
+
+    template <typename T> 
+    inline void cuNDArray<T>::create(std::vector<size_t> *dimensions, int device_no)
+    {
+        if (device_no < 0){
+            throw cuda_error("cuNDArray::create: illegal device no");
+        }
+
+        if ( this->dimensions_equal(dimensions) && this->device_==device_no )
+        {
+            return;
+        }
+
+        this->device_ = device_no; 
+        NDArray<T>::create(dimensions);
+    }
+
+    template <typename T> 
+    void cuNDArray<T>::create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+    {
+        if (!data) {
+            throw std::runtime_error("cuNDArray::create: 0x0 pointer provided");
+        }
+
+        int tmp_device; 
+        if( cudaGetDevice(&tmp_device) != cudaSuccess) {
+            throw cuda_error("cuNDArray::create: Unable to query for device");
+        }
+
+        cudaDeviceProp deviceProp; 
+        if( cudaGetDeviceProperties( &deviceProp, tmp_device) != cudaSuccess) {
+            throw cuda_error("cuNDArray::create: Unable to query device properties");
+        }
+
+        if (deviceProp.unifiedAddressing) {
+            cudaPointerAttributes attrib;
+            if (cudaPointerGetAttributes(&attrib, data) != cudaSuccess) {
+                CHECK_FOR_CUDA_ERROR();
+                throw cuda_error("cuNDArray::create: Unable to determine attributes of pointer");
+            }
+            this->device_ = attrib.device;
+        } else {
+            this->device_ = tmp_device;
+        }
+
+        NDArray<T>::create(dimensions, data, delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    inline void cuNDArray<T>::create(std::vector<size_t> &dimensions)
+    {
+        if ( this->dimensions_equal(&dimensions) )
+        {
+            return;
+        }
+
+        return NDArray<T>::create(dimensions);
+    }
+
+    template <typename T> 
+    inline void cuNDArray<T>::create(std::vector<size_t> &dimensions, int device_no)
+    {
+        if (device_no < 0){
+            throw cuda_error("cuNDArray::create: illegal device no");
+        }
+
+        if ( this->dimensions_equal(&dimensions) && this->device_==device_no )
+        {
+            return;
+        }
+
+        this->device_ = device_no; 
+        NDArray<T>::create(dimensions);
+    }
+
+    template <typename T> 
+    inline void cuNDArray<T>::create(std::vector<size_t> &dimensions, T* data, bool delete_data_on_destruct)
+    {
+        if (!data) {
+            throw std::runtime_error("cuNDArray::create: 0x0 pointer provided");
+        }
+
+        int tmp_device; 
+        if( cudaGetDevice(&tmp_device) != cudaSuccess) {
+            throw cuda_error("cuNDArray::create: Unable to query for device");
+        }
+
+        cudaDeviceProp deviceProp;
+        if( cudaGetDeviceProperties( &deviceProp, tmp_device) != cudaSuccess) {
+            throw cuda_error("cuNDArray::create: Unable to query device properties");
+        }
+
+        if (deviceProp.unifiedAddressing) {
+            cudaPointerAttributes attrib;
+            if (cudaPointerGetAttributes(&attrib, data) != cudaSuccess) {
+                CHECK_FOR_CUDA_ERROR();
+                throw cuda_error("cuNDArray::create: Unable to determine attributes of pointer");
+            }
+            this->device_ = attrib.device;
+        } else {
+            this->device_ = tmp_device;
+        }
+
+        NDArray<T>::create(dimensions, data, delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    inline void cuNDArray<T>::create(boost::shared_ptr<std::vector<size_t> > dimensions){
+        this->create(dimensions.get());
+    }
+
+    template <typename T> 
+    inline void cuNDArray<T>::create(boost::shared_ptr<std::vector<size_t> > dimensions, int device_no){
+        this->create(dimensions.get(),device_no);
+    }
+
+    template <typename T> 
+    inline void cuNDArray<T>::create(boost::shared_ptr<std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct){
+        this->create(dimensions.get(), data, delete_data_on_destruct);
+    }
+
+    template <typename T> 
+    inline void cuNDArray<T>::create(size_t len)
+    {
+        std::vector<size_t> dim(1);
+        dim[0] = len;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    inline void cuNDArray<T>::create(size_t sx, size_t sy)
+    {
+        std::vector<size_t> dim(2);
+        dim[0] = sx;
+        dim[1] = sy;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    inline void cuNDArray<T>::create(size_t sx, size_t sy, size_t sz)
+    {
+        std::vector<size_t> dim(3);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    inline void cuNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st)
+    {
+        std::vector<size_t> dim(4);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    inline void cuNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp)
+    {
+        std::vector<size_t> dim(5);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    inline void cuNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq)
+    {
+        std::vector<size_t> dim(6);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    inline void cuNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr)
+    {
+        std::vector<size_t> dim(7);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        dim[6] = sr;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    inline void cuNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss)
+    {
+        std::vector<size_t> dim(8);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = st;
+        dim[4] = sp;
+        dim[5] = sq;
+        dim[6] = sr;
+        dim[7] = ss;
+        this->create(dim);
+    }
+
+    template <typename T> 
+    inline boost::shared_ptr< hoNDArray<T> > cuNDArray<T>::to_host() const
+    {
+        boost::shared_ptr< hoNDArray<T> > ret(new hoNDArray<T>(this->dimensions_.get()));
+        if (cudaMemcpy(ret->get_data_ptr(), this->data_, this->elements_*sizeof(T), cudaMemcpyDeviceToHost) != cudaSuccess) {
+            throw cuda_error("cuNDArray::to_host(): failed to copy memory from device");
+        }
+
+        return ret;
+    }
+
+    template <typename T> 
+    inline void cuNDArray<T>::to_host( hoNDArray<T> *out ) const 
+    {
+        if( !out ){
+            throw std::runtime_error("cuNDArray::to_host(): illegal array passed.");
+        }
+
+        if( out->get_number_of_elements() != this->get_number_of_elements() ){	
+            out->create( this->get_dimensions().get());
+        }
+
+        if( cudaMemcpy( out->get_data_ptr(), this->data_, this->elements_*sizeof(T), cudaMemcpyDeviceToHost) != cudaSuccess) {
+            throw cuda_error("cuNDArray::to_host(): failed to copy memory from device");
+        }
+    }
+
+    template <typename T> 
+    inline void cuNDArray<T>::set_device(int device)
+    {
+        if( device_ == device )
+            return;
+
+        int cur_device;
+        if( cudaGetDevice(&cur_device) != cudaSuccess) {
+            throw cuda_error("cuNDArray::set_device: unable to get device no");
+        }
+
+        if( cur_device != device_ && cudaSetDevice(device_) != cudaSuccess) {
+            throw cuda_error("cuNDArray::set_device: unable to set device no");
+        }
+
+        boost::shared_ptr< hoNDArray<T> > tmp = to_host();
+        deallocate_memory();
+        if( cudaSetDevice(device) != cudaSuccess) {
+            cudaSetDevice(cur_device);
+            throw cuda_error("cuNDArray::set_device: unable to set device no (2)");
+        }
+
+        device_ = device;
+        allocate_memory();
+        if (cudaMemcpy(this->data_, tmp->get_data_ptr(), this->elements_*sizeof(T), cudaMemcpyHostToDevice) != cudaSuccess) {
+            cudaSetDevice(cur_device);
+            throw cuda_error("cuNDArray::set_device: failed to copy data");
+        }
+
+        if( cudaSetDevice(cur_device) != cudaSuccess) {
+            throw cuda_error("cuNDArray::set_device: unable to restore device to current device");
+        }
+    }
+
+    template <typename T> 
+    inline int cuNDArray<T>::get_device() { return device_; }
+
+    template <typename T> 
+    inline thrust::device_ptr<T> cuNDArray<T>::get_device_ptr()
+    {
+        return thrust::device_ptr<T>(this->data_);
+    }
+
+    template <typename T> 
+    inline thrust::device_ptr<T> cuNDArray<T>::begin()
+    {
+        return thrust::device_ptr<T>(this->data_);
+    }
+
+    template <typename T> 
+    inline thrust::device_ptr<T> cuNDArray<T>::end()
+    {
+        return thrust::device_ptr<T>(this->data_)+this->get_number_of_elements();
+    }
+
+    template <typename T> 
+    inline const thrust::device_ptr<T> cuNDArray<T>::begin() const
+    {
+        return thrust::device_ptr<T>(this->data_);
+    }
+
+    template <typename T>
+    inline const thrust::device_ptr<T> cuNDArray<T>::end() const
+    {
+        return thrust::device_ptr<T>(this->data_)+this->get_number_of_elements();
+    }
+
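+    // Note: at() and operator[] each copy a single element from device to host
+    // via cudaMemcpy, so per-element access from host code is slow; prefer
+    // to_host() when many elements are needed.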
+    template <typename T>
+    inline T cuNDArray<T>::at( size_t idx )
+    {
+        if( idx >= this->get_number_of_elements() ){
+            throw std::runtime_error("cuNDArray::at(): index out of range.");
+        }
+        T res;
+        CUDA_CALL(cudaMemcpy(&res, &this->get_data_ptr()[idx], sizeof(T), cudaMemcpyDeviceToHost));
+        return res;
+    }
+
+    template <typename T> 
+    inline T cuNDArray<T>::operator[]( size_t idx )
+    {
+        if( idx >= this->get_number_of_elements() ){
+            throw std::runtime_error("cuNDArray::operator[]: index out of range.");
+        }
+        T res;
+        CUDA_CALL(cudaMemcpy(&res, &this->get_data_ptr()[idx], sizeof(T), cudaMemcpyDeviceToHost));
+        return res;
+    }
+
+    template <typename T> 
+    void cuNDArray<T>::allocate_memory()
+    {
+        deallocate_memory();
+
+        this->elements_ = 1;
+        if (this->dimensions_->empty())
+            throw std::runtime_error("cuNDArray::allocate_memory() : dimensions is empty.");
+        for (size_t i = 0; i < this->dimensions_->size(); i++) {
+            this->elements_ *= (*this->dimensions_)[i];
+        } 
+
+        size_t size = this->elements_ * sizeof(T);
+
+        int device_no_old;
+        if (cudaGetDevice(&device_no_old) != cudaSuccess) {
+            throw cuda_error("cuNDArray::allocate_memory: unable to get device no");
+        }
+
+        if (device_ != device_no_old) {
+            if (cudaSetDevice(device_) != cudaSuccess) {
+                throw cuda_error("cuNDArray::allocate_memory: unable to set device no");
+            }
+        }
+
+        if (cudaMalloc((void**) &this->data_,size) != cudaSuccess) {
+            size_t free = 0, total = 0;
+            cudaMemGetInfo(&free, &total);
+            std::stringstream err;
+            err << "cuNDArray::allocate_memory() : Error allocating CUDA memory. ";
+            err << "CUDA Memory: " << free << " (" << total << ")";
+
+            err << "   memory requested: " << size << " ( ";
+            for (size_t i = 0; i < this->dimensions_->size(); i++) {
+                err << (*this->dimensions_)[i] << " ";
+            }
+            err << ")";
+            this->data_ = 0;
+            throw std::runtime_error(err.str());
+        }
+
+        if (device_ != device_no_old) {
+            if (cudaSetDevice(device_no_old) != cudaSuccess) {
+                throw cuda_error("cuNDArray::allocate_memory: unable to restore device no");
+            }
+        }
+    }
+
+    template <typename T> 
+    void cuNDArray<T>::deallocate_memory()
+    {
+        if (this->data_) {
+
+            int device_no_old;
+            CUDA_CALL(cudaGetDevice(&device_no_old));
+            if (device_ != device_no_old) {
+                CUDA_CALL(cudaSetDevice(device_));
+            }
+
+            CUDA_CALL(cudaFree(this->data_));
+            if (device_ != device_no_old) {
+                CUDA_CALL(cudaSetDevice(device_no_old));
+            }
+            this->data_ = 0;
+        }
+    }
+
+
+
+}
+
+#endif //CUNDARRAY_H
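
A minimal usage sketch of the cuNDArray class defined above (an illustration, not taken from the patch; it assumes the GPU core toolbox headers are on the include path and a usable CUDA device is present):

    #include "cuNDArray.h"
    #include "hoNDArray.h"
    #include <iostream>

    using namespace Gadgetron;

    int main()
    {
        // Allocate a 128x128 single-precision array on the current CUDA device.
        cuNDArray<float> dev_array;
        dev_array.create(128, 128);

        // Copy the device data back into a host-side hoNDArray.
        boost::shared_ptr< hoNDArray<float> > host_array = dev_array.to_host();

        std::cout << "elements: " << host_array->get_number_of_elements()
                  << " on device " << dev_array.get_device() << std::endl;
        return 0;
    }
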
diff --git a/toolboxes/core/gpu/cuNDArray_blas.cu b/toolboxes/core/gpu/cuNDArray_blas.cu
new file mode 100644
index 0000000..8bf19b1
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_blas.cu
@@ -0,0 +1,312 @@
+#include "cuNDArray_blas.h"
+#include "complext.h"
+#include "GadgetronCuException.h"
+#include "cudaDeviceManager.h"
+
+#include <cublas_v2.h>
+
+namespace Gadgetron{
+
+#define CUBLAS_CALL(fun) {cublasStatus_t err = fun; if (err != CUBLAS_STATUS_SUCCESS) {throw cuda_error(gadgetron_getCublasErrorString(err));}}
+
+  //NRM2
+  //
+
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_axpy(cublasHandle_t hndl, int n, const T* a , const T* x , int incx,  T* y, int incy);
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_dot(cublasHandle_t, int, const T*, int, const  T*, int, T*, bool cc = true);
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_nrm2(cublasHandle_t, int, const T*, int, typename realType<T>::Type *result);
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_amax(cublasHandle_t handle, int n,const T *x, int incx, int *result);
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_amin(cublasHandle_t handle, int n,const T *x, int incx, int *result);
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_asum(cublasHandle_t handle, int n,const T *x, int incx, typename realType<T>::Type *result);
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_nrm2<float>(cublasHandle_t hndl, int n, const float*  x, int inc, float* res){
+    return cublasSnrm2(hndl,n,x,inc,res);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_nrm2<double>(cublasHandle_t hndl, int n, const double*  x, int inc, double* res){
+    return cublasDnrm2(hndl,n,x,inc,res);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_nrm2<float_complext>(cublasHandle_t hndl, int n, const float_complext*  x, int inc, float* res){
+    return cublasScnrm2(hndl,n,(const cuComplex*)x,inc,res);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_nrm2<double_complext>(cublasHandle_t hndl, int n, const double_complext*  x, int inc, double* res){
+    return cublasDznrm2(hndl,n,(const cuDoubleComplex*) x,inc,res);
+  }
+
+  //DOT
+  //
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_dot<float>(cublasHandle_t hndl, int n , const float* x , int incx, const  float* y , int incy, float* res, bool cc){
+    return cublasSdot( hndl, n, x, incx, y, incy, res);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_dot<double>(cublasHandle_t hndl, int n , const double* x , int incx, const  double* y , int incy, double* res, bool cc){
+    return cublasDdot( hndl, n, x, incx, y, incy, res);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_dot<float_complext>(cublasHandle_t hndl, int n , const float_complext* x ,
+										int incx, const  float_complext* y , int incy, float_complext* res, bool cc){
+    if(cc)
+      return cublasCdotc( hndl, n, (const cuComplex*) x, incx, (const cuComplex*) y, incy, (cuComplex*) res);
+    else
+      return cublasCdotu( hndl, n, (const cuComplex*) x, incx, (const cuComplex*) y, incy, (cuComplex*) res);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_dot<double_complext>(cublasHandle_t hndl, int n , const double_complext* x ,
+										 int incx, const  double_complext* y , int incy, double_complext* res, bool cc){
+    if(cc)
+      return cublasZdotc( hndl, n, (const cuDoubleComplex*) x, incx, (const cuDoubleComplex*) y, incy, (cuDoubleComplex*) res);
+    else
+      return cublasZdotu( hndl, n, (const cuDoubleComplex*) x, incx, (const cuDoubleComplex*) y, incy, (cuDoubleComplex*) res);
+  }
+
+  // AXPY
+  //
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_axpy<float>(cublasHandle_t hndl , int n , const float* a , const float* x , int incx ,  float* y , int incy){
+    return cublasSaxpy(hndl,n,a,x,incx,y,incy);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_axpy<double>(cublasHandle_t hndl , int n , const double* a , const double* x , int incx ,  double* y , int incy){
+    return cublasDaxpy(hndl,n,a,x,incx,y,incy);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_axpy<float_complext>(cublasHandle_t hndl , int n , const float_complext* a , const float_complext* x , int incx ,  float_complext* y , int incy){
+    return cublasCaxpy(hndl,n,(const cuComplex*) a, (const cuComplex*) x,incx, (cuComplex*)y,incy);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_axpy<double_complext>(cublasHandle_t hndl , int n , const double_complext* a , const double_complext* x , int incx ,  double_complext* y , int incy){
+    return cublasZaxpy(hndl,n,(const cuDoubleComplex*) a, (const cuDoubleComplex*) x,incx, (cuDoubleComplex*)y,incy);
+  }
+
+  //SUM
+  //
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_asum<float>(cublasHandle_t hndl, int n,const float *x, int incx, float *result){
+    return cublasSasum(hndl,n,x,incx,result);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_asum<double>(cublasHandle_t hndl, int n,const double *x, int incx, double *result){
+    return cublasDasum(hndl,n,x,incx,result);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_asum<float_complext>(cublasHandle_t hndl, int n,const float_complext *x, int incx, float *result){
+    return cublasScasum(hndl,n,(const cuComplex*) x,incx,result);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_asum<double_complext>(cublasHandle_t hndl, int n,const double_complext *x, int incx, double *result){
+    return cublasDzasum(hndl,n,(const cuDoubleComplex*) x,incx,result);
+  }
+
+  //AMIN
+  //
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_amin<float>(cublasHandle_t hndl, int n,const float *x, int incx, int *result){
+    return cublasIsamin(hndl,n,x,incx,result);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_amin<double>(cublasHandle_t hndl, int n,const double *x, int incx, int *result){
+    return cublasIdamin(hndl,n,x,incx,result);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_amin<float_complext>(cublasHandle_t hndl, int n,const float_complext *x, int incx, int *result){
+    return cublasIcamin(hndl,n, (const cuComplex* ) x,incx,result);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_amin<double_complext>(cublasHandle_t hndl, int n,const double_complext *x, int incx, int *result){
+    return cublasIzamin(hndl,n, (const cuDoubleComplex* ) x,incx,result);
+  }
+
+  //AMAX
+  //
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_amax<float>(cublasHandle_t hndl, int n,const float *x, int incx, int *result){
+    return cublasIsamax(hndl,n,x,incx,result);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_amax<double>(cublasHandle_t hndl, int n,const double *x, int incx, int *result){
+    return cublasIdamax(hndl,n,x,incx,result);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_amax<float_complext>(cublasHandle_t hndl, int n,const float_complext *x, int incx, int *result){
+    return cublasIcamax(hndl,n, (const cuComplex* ) x,incx,result);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_amax<double_complext>(cublasHandle_t hndl, int n,const double_complext *x, int incx, int *result){
+    return cublasIzamax(hndl,n, (const cuDoubleComplex* ) x,incx,result);
+  }
+
+  template<class T> typename realType<T>::Type nrm2( cuNDArray<T> *arr )
+  {
+    if( arr == 0x0 )
+      throw std::runtime_error("Gadgetron::nrm2(): Invalid input array");
+
+    int device = cudaDeviceManager::Instance()->getCurrentDevice();
+    typedef typename realType<T>::Type REAL;
+    REAL ret;
+
+    CUBLAS_CALL(cublas_nrm2<T>( cudaDeviceManager::Instance()->lockHandle(device), 
+                                (int)arr->get_number_of_elements(), arr->get_data_ptr(), 1, &ret));
+
+    cudaDeviceManager::Instance()->unlockHandle(device);
+
+    return ret;
+  }
+
+  template<class T> T dot( cuNDArray<T> *arr1, cuNDArray<T> *arr2, bool cc )
+  {
+    if( arr1 == 0x0 || arr2 == 0x0 )
+      throw std::runtime_error("Gadgetron::dot(): Invalid input array");
+    
+    if( arr1->get_number_of_elements() != arr2->get_number_of_elements() )
+      throw std::runtime_error("Gadgetron::dot(): Array sizes mismatch");
+
+    int device = cudaDeviceManager::Instance()->getCurrentDevice();
+    T ret;
+
+    CUBLAS_CALL(cublas_dot( cudaDeviceManager::Instance()->lockHandle(device), 
+                            (int)arr1->get_number_of_elements(), arr1->get_data_ptr(), 1, arr2->get_data_ptr(), 1, &ret, cc ));
+
+    cudaDeviceManager::Instance()->unlockHandle(device);
+
+    return ret;
+  }
+
+  template<class T> void axpy( T a, cuNDArray<T> *x, cuNDArray<T> *y )
+  {
+    if( x == 0x0 || y == 0x0 )
+      throw std::runtime_error("Gadgetron::axpy(): Invalid input array");
+    
+    if( x->get_number_of_elements() != y->get_number_of_elements() )
+      throw std::runtime_error("Gadgetron::axpy(): Array sizes mismatch");
+
+    int device = cudaDeviceManager::Instance()->getCurrentDevice();
+
+    CUBLAS_CALL(cublas_axpy(cudaDeviceManager::Instance()->lockHandle(device), 
+                            (int)x->get_number_of_elements(), &a, x->get_data_ptr(), 1, y->get_data_ptr(), 1));
+
+    cudaDeviceManager::Instance()->unlockHandle(device);
+  }
+
+  template<class T> void axpy( T a,  cuNDArray< complext<T> > *x, cuNDArray< complext<T> > *y )
+  {
+    axpy( complext<T>(a), x, y );
+  }
+
+  template<class T> typename realType<T>::Type asum(cuNDArray<T>* x)
+  {
+    if( x == 0x0 )
+      throw std::runtime_error("Gadgetron::asum(): Invalid input array");
+    
+    int device = cudaDeviceManager::Instance()->getCurrentDevice();
+    typename realType<T>::Type result;
+
+    CUBLAS_CALL(cublas_asum(cudaDeviceManager::Instance()->lockHandle(device),
+                            (int)x->get_number_of_elements(), x->get_data_ptr(), 1, &result));
+
+    cudaDeviceManager::Instance()->unlockHandle(device);
+
+    return result;
+  }
+  
+  template<class T> size_t amin( cuNDArray<T>* x )
+  {
+    if( x == 0x0 )
+      throw std::runtime_error("Gadgetron::amin(): Invalid input array");
+
+    int device = cudaDeviceManager::Instance()->getCurrentDevice();
+    int result;
+
+    CUBLAS_CALL(cublas_amin(cudaDeviceManager::Instance()->lockHandle(device),
+                            (int)x->get_number_of_elements(), x->get_data_ptr(), 1, &result));
+
+    cudaDeviceManager::Instance()->unlockHandle(device);
+    
+    if( result > x->get_number_of_elements() ){
+      throw std::runtime_error("Gadgetron::amin(): computed index is out of bounds");
+    }
+    
+    return (size_t)result-1;
+  }
+  
+  template<class T> size_t amax(cuNDArray<T> *x )
+  {
+    if( x == 0x0 )
+      throw std::runtime_error("Gadgetron::amax(): Invalid input array");
+
+    int device = cudaDeviceManager::Instance()->getCurrentDevice();
+    int result;
+
+    CUBLAS_CALL(cublas_amax(cudaDeviceManager::Instance()->lockHandle(device),
+                            (int)x->get_number_of_elements(), x->get_data_ptr(), 1, &result));
+
+    cudaDeviceManager::Instance()->unlockHandle(device);
+    
+    if( result > x->get_number_of_elements() ){
+      throw std::runtime_error("Gadgetron::amax(): computed index is out of bounds");
+    }
+    
+    return (size_t)result-1;
+  }
+  
+  std::string gadgetron_getCublasErrorString(cublasStatus_t err)
+  {
+    switch (err){
+    case CUBLAS_STATUS_NOT_INITIALIZED:
+      return "NOT INITIALIZED";
+    case CUBLAS_STATUS_ALLOC_FAILED:
+      return "ALLOC FAILED";
+    case CUBLAS_STATUS_INVALID_VALUE:
+      return "INVALID VALUE";
+    case CUBLAS_STATUS_ARCH_MISMATCH:
+      return "ARCH MISMATCH";
+    case CUBLAS_STATUS_MAPPING_ERROR:
+      return "MAPPING ERROR";
+    case CUBLAS_STATUS_EXECUTION_FAILED:
+      return "EXECUTION FAILED";
+    case CUBLAS_STATUS_INTERNAL_ERROR:
+      return "INTERNAL ERROR";      
+    case CUBLAS_STATUS_SUCCESS:
+      return "SUCCES";
+    default:
+      return "UNKNOWN CUBLAS ERROR";
+    }
+  }
+
+  
+  //
+  // Instantiation
+  //
+  
+  template EXPORTGPUCORE float dot(cuNDArray<float>*,cuNDArray<float>*,bool);
+  template EXPORTGPUCORE float nrm2(cuNDArray<float>*);
+  template EXPORTGPUCORE void axpy(float,cuNDArray<float>*,cuNDArray<float>*);
+  template EXPORTGPUCORE size_t amin(cuNDArray<float>*);
+  template EXPORTGPUCORE size_t amax(cuNDArray<float>*);
+  template EXPORTGPUCORE float asum(cuNDArray<float>*);
+
+  template EXPORTGPUCORE double dot(cuNDArray<double>*,cuNDArray<double>*,bool);
+  template EXPORTGPUCORE double nrm2(cuNDArray<double>*);
+  template EXPORTGPUCORE void axpy(double,cuNDArray<double>*,cuNDArray<double>*);
+  template EXPORTGPUCORE size_t amin(cuNDArray<double>*);
+  template EXPORTGPUCORE size_t amax(cuNDArray<double>*);
+  template EXPORTGPUCORE double asum(cuNDArray<double>*);
+
+  template EXPORTGPUCORE float_complext dot(cuNDArray<float_complext>*,cuNDArray<float_complext>*,bool);
+  template EXPORTGPUCORE float nrm2(cuNDArray<float_complext>*);
+  template EXPORTGPUCORE void axpy(float_complext,cuNDArray<float_complext>*,cuNDArray<float_complext>*);
+  template EXPORTGPUCORE void axpy(float,cuNDArray<float_complext>*,cuNDArray<float_complext>*);
+  template EXPORTGPUCORE size_t amin(cuNDArray<float_complext>*);
+  template EXPORTGPUCORE size_t amax(cuNDArray<float_complext>*);
+  template EXPORTGPUCORE float asum(cuNDArray<float_complext>*);
+
+  template EXPORTGPUCORE double_complext dot(cuNDArray<double_complext>*,cuNDArray<double_complext>*,bool);
+  template EXPORTGPUCORE double nrm2(cuNDArray<double_complext>*);
+  template EXPORTGPUCORE void axpy(double_complext,cuNDArray<double_complext>*,cuNDArray<double_complext>*);
+  template EXPORTGPUCORE void axpy(double,cuNDArray<double_complext>*,cuNDArray<double_complext>*);
+  template EXPORTGPUCORE size_t amin(cuNDArray<double_complext>*);
+  template EXPORTGPUCORE size_t amax(cuNDArray<double_complext>*);
+  template EXPORTGPUCORE double asum(cuNDArray<double_complext>*);
+}
diff --git a/toolboxes/core/gpu/cuNDArray_blas.h b/toolboxes/core/gpu/cuNDArray_blas.h
new file mode 100644
index 0000000..f6d8cce
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_blas.h
@@ -0,0 +1,51 @@
+/** \file cuNDArray_blas.h
+    \brief BLAS level-1 functions on the cuNDArray class.
+    
+    cuNDArray_blas.h provides BLAS level-1 functions on the cuNDArray class.
+    The cuNDArray is temporarily reshaped to a column vector for the respective operations.
+    The implementation is based on CUBLAS.
+    This code is purposely split into a header and an underlying implementation (.cu),
+    as this allows explicit instantiation of the supported template types.
+    The supported types are float, double, std::complex<float>, std::complex<double>, 
+    Gadgetron::complext<float>, and Gadgetron::complext<double>.
+*/
+
+#pragma once
+
+#include "cuNDArray.h"
+#include "complext.h"
+#include "gpucore_export.h"
+
+#include <cublas_v2.h>
+
+namespace Gadgetron{
+
+  template<class T> EXPORTGPUCORE T dot( cuNDArray<T> *x, cuNDArray<T> *y, bool cc = true );
+
+  template<class T> EXPORTGPUCORE typename realType<T>::Type nrm2( cuNDArray<T> *x );
+
+  template<class T> EXPORTGPUCORE void axpy( T a, cuNDArray<T>* x, cuNDArray<T>* y );
+
+  template<class T> EXPORTGPUCORE void axpy( T a, cuNDArray<complext<T> > *x, cuNDArray<complext<T> > *y );
+  
+  /**
+   * @brief Gets the index of the element with the minimum absolute value
+   * @param x Input data
+   * @return Index of the element with the minimum absolute value
+   * @details Note that this returns the C-style index and NOT the Fortran index.
+   */
+  template<class T> EXPORTGPUCORE size_t amin( cuNDArray<T> *x );
+  
+  /**
+   * @brief Gets the index of the element with the maximum absolute value
+   * @param x Input data
+   * @return Index of the element with the maximum absolute value
+   * @details Note that this returns the C-style index and NOT the Fortran index.
+   */
+  template<class T> EXPORTGPUCORE size_t amax( cuNDArray<T> *x);
+  
+  template<class T> EXPORTGPUCORE typename realType<T>::Type asum( cuNDArray<T> *x );
+  
+  EXPORTGPUCORE std::string gadgetron_getCublasErrorString(cublasStatus_t err);
+
+}
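
A minimal sketch of how the level-1 routines declared above are typically called (the blas_demo wrapper is only for illustration; it assumes two equally sized single-precision device arrays and the GPU core toolbox on the include path):

    #include "cuNDArray.h"
    #include "cuNDArray_blas.h"

    using namespace Gadgetron;

    void blas_demo(cuNDArray<float> &x, cuNDArray<float> &y)
    {
        float nx = nrm2(&x);      // Euclidean norm of x (cublasSnrm2 under the hood)
        float d  = dot(&x, &y);   // inner product; the cc flag only matters for complex types
        axpy(2.0f, &x, &y);       // y <- 2*x + y (cublasSaxpy)
        size_t i = amax(&x);      // C-style (0-based) index of the largest absolute value
        (void)nx; (void)d; (void)i;
    }
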
diff --git a/toolboxes/core/gpu/cuNDArray_elemwise.cu b/toolboxes/core/gpu/cuNDArray_elemwise.cu
new file mode 100644
index 0000000..b4279a7
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_elemwise.cu
@@ -0,0 +1,703 @@
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_blas.h"
+#include "complext.h"
+
+#include <complex>
+#include <thrust/functional.h>
+
+using namespace Gadgetron;
+using namespace std;
+
+template<typename T> struct cuNDA_abs : public thrust::unary_function<T,typename realType<T>::Type>
+{
+  __device__ typename Gadgetron::realType<T>::Type operator()(const T &x) const {return abs(x);}
+};
+
+template<class T> boost::shared_ptr< cuNDArray<typename realType<T>::Type> > 
+Gadgetron::abs( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::abs(): Invalid input array");
+   
+  boost::shared_ptr< cuNDArray<typename realType<T>::Type> > result(new cuNDArray<typename realType<T>::Type>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<typename realType<T>::Type> resPtr = result->get_device_ptr();
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_abs<T>());
+  return result;
+}
+
+template<class T> void 
+Gadgetron::abs_inplace( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::abs_inplace(): Invalid input array");
+   
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),xPtr,cuNDA_abs<T>());
+}  
+  
+template<typename T> struct cuNDA_abs_square : public thrust::unary_function<T,typename realType<T>::Type>
+{
+  __device__ typename Gadgetron::realType<T>::Type operator()(const T &x) const 
+  { 
+    typename realType<T>::Type tmp = abs(x);
+    return tmp*tmp;
+  }
+};
+
+template<class T> boost::shared_ptr< cuNDArray<typename realType<T>::Type> > 
+Gadgetron::abs_square( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::abs_square(): Invalid input array");
+   
+  boost::shared_ptr< cuNDArray<typename realType<T>::Type> > result(new cuNDArray<typename realType<T>::Type>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<typename realType<T>::Type> resPtr = result->get_device_ptr();
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_abs_square<T>());
+  return result;
+}
+
+template<typename T> struct cuNDA_sqrt : public thrust::unary_function<T,T>
+{
+  __device__ T operator()(const T &x) const {return sqrt(x);}
+};
+
+template<class T> boost::shared_ptr< cuNDArray<T> > 
+Gadgetron::sqrt( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::sqrt(): Invalid input array");
+   
+  boost::shared_ptr< cuNDArray<T> > result(new cuNDArray<T>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<T> resPtr = result->get_device_ptr();
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_sqrt<T>());
+  return result;
+}
+
+template<class T> void 
+Gadgetron::sqrt_inplace( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::sqrt_inplace(): Invalid input array");
+   
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),xPtr,cuNDA_sqrt<T>());
+}
+ 
+template<typename T> struct cuNDA_square : public thrust::unary_function<T,T>
+{
+  __device__ T operator()(const T &x) const {return x*x;}
+};
+
+template<class T> boost::shared_ptr< cuNDArray<T> > Gadgetron::square( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::square(): Invalid input array");
+   
+  boost::shared_ptr< cuNDArray<T> > result(new cuNDArray<T>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<T> resPtr = result->get_device_ptr();
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_square<T>());
+  return result;
+}
+
+template<class T> void 
+Gadgetron::square_inplace( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::square_inplace(): Invalid input array");
+   
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),xPtr,cuNDA_square<T>());
+}  
+
+template<typename T> struct cuNDA_reciprocal : public thrust::unary_function<T,T>
+{
+  __device__ T operator()(const T &x) const {return T(1)/x;}
+};
+
+template<class T> boost::shared_ptr< cuNDArray<T> > Gadgetron::reciprocal( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::reciprocal(): Invalid input array");
+   
+  boost::shared_ptr< cuNDArray<T> > result(new cuNDArray<T>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<T> resPtr = result->get_device_ptr();
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_reciprocal<T>());
+  return result;
+}
+
+template<class T> void 
+Gadgetron::reciprocal_inplace( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::reciprocal_inplace(): Invalid input array");
+   
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),xPtr,cuNDA_reciprocal<T>());
+}  
+ 
+template<typename T> struct cuNDA_reciprocal_sqrt : public thrust::unary_function<T,T>
+{
+  __device__ T operator()(const T &x) const {return T(1)/sqrt(x);}
+};
+
+template<class T> boost::shared_ptr< cuNDArray<T> > Gadgetron::reciprocal_sqrt( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::reciprocal_sqrt(): Invalid input array");
+   
+  boost::shared_ptr< cuNDArray<T> > result(new cuNDArray<T>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<T> resPtr = result->get_device_ptr();
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_reciprocal_sqrt<T>());
+  return result;
+}
+
+template<class T> void 
+Gadgetron::reciprocal_sqrt_inplace( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::reciprocal_sqrt_inplace(): Invalid input array");
+   
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),xPtr,cuNDA_reciprocal_sqrt<T>());
+}  
+
+template<typename T> struct cuNDA_sgn : public thrust::unary_function<T,T>
+{
+  __device__ T operator()(const T &x) const {return sgn(x);}
+};
+
+template<class T> boost::shared_ptr< cuNDArray<T> > Gadgetron::sgn( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::sgn(): Invalid input array");
+   
+  boost::shared_ptr< cuNDArray<T> > result(new cuNDArray<T>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<T> resPtr = result->get_device_ptr();
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_sgn<T>());
+  return result;
+}
+
+template<class T> void 
+Gadgetron::sgn_inplace( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::sgn_inplace(): Invalid input array");
+   
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),xPtr,cuNDA_sgn<T>());
+}  
+ 
+template<typename T> struct cuNDA_real : public thrust::unary_function<T,typename realType<T>::Type>
+{
+  __device__ typename realType<T>::Type operator()(const T &x) const {return real(x);}
+};
+
+template<class T> boost::shared_ptr< cuNDArray<typename realType<T>::Type> > 
+Gadgetron::real( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::real(): Invalid input array");
+   
+  boost::shared_ptr< cuNDArray<typename realType<T>::Type> > result(new cuNDArray<typename realType<T>::Type>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<typename realType<T>::Type> resPtr = result->get_device_ptr();
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_real<T>());
+  return result;
+}
+
+template <typename T> struct cuNDA_imag : public thrust::unary_function<T,typename realType<T>::Type>
+{
+  __device__ typename realType<T>::Type operator()(const T &x) const {return imag(x);}
+};
+
+template<class T> boost::shared_ptr< cuNDArray<typename realType<T>::Type> > 
+Gadgetron::imag( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::imag(): Invalid input array");
+   
+  boost::shared_ptr< cuNDArray<typename realType<T>::Type> > result(new cuNDArray<typename realType<T>::Type>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<typename realType<T>::Type> resPtr = result->get_device_ptr();
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_imag<T>());
+  return result;
+}
+
+template <typename T> struct cuNDA_conj : public thrust::unary_function<T,T>
+{
+  __device__ T operator()(const T &x) const {return conj(x);}
+};
+
+template<class T> boost::shared_ptr< cuNDArray<T> > 
+Gadgetron::conj( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::conj(): Invalid input array");
+   
+  boost::shared_ptr< cuNDArray<T> > result(new cuNDArray<T>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<T> resPtr = result->get_device_ptr();
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_conj<T>());
+  return result;
+}
+
+template <typename T> struct cuNDA_real_to_complex : public thrust::unary_function<typename realType<T>::Type,T>
+{
+  __device__ T operator()(const typename realType<T>::Type &x) const {return T(x);}
+};
+
+template<class T> boost::shared_ptr< cuNDArray<T> > 
+Gadgetron::real_to_complex( cuNDArray<typename realType<T>::Type> *x )
+{
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::real_to_complex(): Invalid input array");
+   
+  boost::shared_ptr< cuNDArray<T> > result(new cuNDArray<T>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<T> resPtr = result->get_device_ptr();
+  thrust::device_ptr<typename realType<T>::Type> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_real_to_complex<T>());
+  return result;
+}
+
+template <typename T,typename T2> struct cuNDA_convert_to : public thrust::unary_function<T,T2>
+{
+  __device__ T2 operator()(T &x) const {return T2(x);}
+};
+
+template <typename T,typename T2> struct cuNDA_convert_to<complext<T>,complext<T2> > : public thrust::unary_function<complext<T>,complext<T2> >
+{
+  __device__ complext<T2> operator()(complext<T> &x) const {return complext<T2>(x.vec[0],x.vec[1]);}
+};
+
+template<class T, class T2> boost::shared_ptr< cuNDArray<T2> >
+Gadgetron::convert_to( cuNDArray<T> *x )
+{
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::convert_to(): Invalid input array");
+
+  boost::shared_ptr< cuNDArray<T2> > result(new cuNDArray<T2>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<T2> resPtr = result->get_device_ptr();
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_convert_to<T,T2>());
+  return result;
+}
+
+template<class T, class T2> void
+Gadgetron::convert_to( cuNDArray<T> *x ,cuNDArray<T2> * y)
+{
+  if( x == 0x0 || !x->dimensions_equal(y))
+    throw std::runtime_error("Gadgetron::convert_to(): Invalid input array");
+  thrust::device_ptr<T2> resPtr = y->get_device_ptr();
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_convert_to<T,T2>());
+}
+
+template<class T> void Gadgetron::clear( cuNDArray<T> *x )
+{
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::clear(): Invalid input array");
+
+  if ( x->get_number_of_elements() > 0 )
+  {
+    cudaMemset(x->get_data_ptr(),0,sizeof(T)*x->get_number_of_elements());
+  }
+}
+
+template<class T> void 
+Gadgetron::fill( cuNDArray<T> *x, T val )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::fill(): Invalid input array");
+  
+  thrust::device_ptr<T> devPtr = x->get_device_ptr();
+  thrust::fill(devPtr,devPtr+x->get_number_of_elements(),val);
+}  
+
+template<typename T> struct cuNDA_clamp : public thrust::unary_function<T,T>
+{
+  cuNDA_clamp( T _min, T _max, T _min_val, T _max_val ) : min(_min), max(_max),min_val(_min_val), max_val(_max_val) {}
+  __device__ T operator()(const T &x) const 
+  {
+    if( x < min ) return min_val;
+    else if ( x >= max) return max_val;
+    else return x;
+  }
+  T min, max;
+  T min_val, max_val;
+};
+
+template<typename T> struct cuNDA_clamp< complext<T> > : public thrust::unary_function< complext<T>, complext<T> >
+{
+	cuNDA_clamp( T _min, T _max, complext<T> _min_val, complext<T> _max_val ) : min(_min), max(_max),min_val(_min_val), max_val(_max_val) {}
+  __device__ complext<T> operator()(const complext<T> &x) const 
+  {
+    if( real(x) < min ) return min_val;
+    else if ( real(x) >= max) return max_val;
+    else return complext<T>(real(x));
+  }
+  T min, max;
+  complext<T> min_val, max_val;
+};
+
+template<class T> void 
+Gadgetron::clamp( cuNDArray<T> *x, typename realType<T>::Type min, typename realType<T>::Type max, T min_val, T max_val)
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::clamp(): Invalid input array");
+   
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),xPtr,cuNDA_clamp<T>(min, max,min_val, max_val));
+}  
+
+template<class T> void 
+Gadgetron::clamp( cuNDArray<T> *x, typename realType<T>::Type min, typename realType<T>::Type max)
+{
+    clamp(x,min,max,T(min),T(max));
+}
+
+template<typename T> struct cuNDA_clamp_min : public thrust::unary_function<T,T>
+{
+  cuNDA_clamp_min( T _min ) : min(_min) {}
+  __device__ T operator()(const T &x) const 
+  {
+    if( x < min ) return min;
+    else return x;
+  }
+  T min;
+};
+
+template<typename T> struct cuNDA_clamp_min< complext<T> > : public thrust::unary_function< complext<T>, complext<T> >
+{
+  cuNDA_clamp_min( T _min ) : min(_min) {}
+  __device__ complext<T> operator()(const complext<T> &x) const 
+  {
+    if( real(x) < min ) return complext<T>(min);
+    else return complext<T>(real(x));
+  }
+  T min;
+};
+
+template<class T> void 
+Gadgetron::clamp_min( cuNDArray<T> *x, typename realType<T>::Type min )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::clamp_min(): Invalid input array");
+   
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),xPtr,cuNDA_clamp_min<T>(min));
+}  
+
+template<typename T> struct cuNDA_clamp_max : public thrust::unary_function<T,T>
+{
+  cuNDA_clamp_max( T _max ) : max(_max) {}
+  __device__ T operator()(const T &x) const 
+  {
+    if( x > max ) return max;
+    else return x;
+  }
+  T max;
+};
+
+template<typename T> struct cuNDA_clamp_max< complext<T> > : public thrust::unary_function< complext<T>, complext<T> >
+{
+  cuNDA_clamp_max( T _max ) : max(_max) {}
+  __device__ complext<T> operator()(const complext<T> &x) const 
+  {
+    if( real(x) > max ) return complext<T>(max);
+    else return complext<T>(real(x));
+  }
+  T max;
+};
+
+template<class T> void 
+Gadgetron::clamp_max( cuNDArray<T> *x, typename realType<T>::Type max )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::clamp_max(): Invalid input array");
+   
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),xPtr,cuNDA_clamp_max<T>(max));
+}  
+
+template<class T> void 
+Gadgetron::normalize( cuNDArray<T> *x, typename realType<T>::Type val )
+{
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::normalize(): Invalid input array");
+  
+  size_t max_idx = amax(x);
+  T max_val_before;
+  CUDA_CALL(cudaMemcpy(&max_val_before, &x->get_data_ptr()[max_idx], sizeof(T), cudaMemcpyDeviceToHost));
+  typename realType<T>::Type scale = val/abs(max_val_before);
+  *x *= scale;
+}
+
+
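+// Soft-thresholding (shrinkage) operator: shrink1(x) = (x/|x|) * max(|x| - gamma, 0),
+// with the convention x/|x| = 0 when |x| = 0.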
+template<typename T> struct cuNDA_shrink1 : public thrust::unary_function<T,T>
+{
+  cuNDA_shrink1( typename realType<T>::Type _gamma ) : gamma(_gamma) {}
+  __device__ T operator()(const T &x) const {
+    typename realType<T>::Type absX = abs(x);
+    T sgnX = (absX <= typename realType<T>::Type(0)) ? T(0) : x/absX;
+    return sgnX*max(absX-gamma, typename realType<T>::Type(0));
+  }
+  typename realType<T>::Type gamma;
+};
+
+template<class T> void 
+Gadgetron::shrink1( cuNDArray<T> *x, typename realType<T>::Type gamma, cuNDArray<T> *out )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::shrink1(): Invalid input array");
+  
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::device_ptr<T> outPtr = (out == 0x0) ? x->get_device_ptr() : out->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),outPtr,cuNDA_shrink1<T>(gamma));
+}
+
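+// p-shrinkage operator, a generalization of soft-thresholding:
+// pshrink(x) = (x/|x|) * max(|x| - gamma*|x|^(p-1), 0); p = 1 reduces to cuNDA_shrink1.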
+template<typename T> struct cuNDA_pshrink : public thrust::unary_function<T,T>
+{
+  cuNDA_pshrink( typename realType<T>::Type _gamma, typename realType<T>::Type _p ) : gamma(_gamma),p(_p) {}
+  __device__ T operator()(const T &x) const {
+    typename realType<T>::Type absX = abs(x);
+    T sgnX = (absX <= typename realType<T>::Type(0)) ? T(0) : x/absX;
+    return sgnX*max(absX-gamma*pow(absX,p-1), typename realType<T>::Type(0));
+  }
+  typename realType<T>::Type gamma;
+  typename realType<T>::Type p;
+};
+
+template<class T> void
+Gadgetron::pshrink( cuNDArray<T> *x, typename realType<T>::Type gamma,typename realType<T>::Type p, cuNDArray<T> *out )
+{
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::shrink1(): Invalid input array");
+
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::device_ptr<T> outPtr = (out == 0x0) ? x->get_device_ptr() : out->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),outPtr,cuNDA_pshrink<T>(gamma,p));
+}  
+
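+// Joint (group) soft-thresholding: x is scaled by max(s - gamma, 0)/s, where s is an
+// externally supplied magnitude (e.g. a group norm); s <= 0 maps to 0.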
+template<typename T> struct cuNDA_shrinkd : public thrust::binary_function<T,typename realType<T>::Type,T>
+{
+  cuNDA_shrinkd( typename realType<T>::Type _gamma ) : gamma(_gamma) {}
+  __device__ T operator()(const T &x, const typename realType<T>::Type &s) const {
+  	T xs = (s <= typename realType<T>::Type(0)) ? T(0) : x/s;
+    return xs*max(s-gamma,typename realType<T>::Type(0));
+  }
+  typename realType<T>::Type gamma;
+};
+
+template<class T> void 
+Gadgetron::shrinkd( cuNDArray<T> *x, cuNDArray<typename realType<T>::Type> *s, typename realType<T>::Type gamma, cuNDArray<T> *out )
+{ 
+  if( x == 0x0 || s == 0x0 )
+    throw std::runtime_error("Gadgetron::shrinkd(): Invalid input array");
+  
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::device_ptr<T> outPtr = (out == 0x0) ? x->get_device_ptr() : out->get_device_ptr();
+  thrust::device_ptr<typename realType<T>::Type> sPtr = s->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),sPtr,outPtr,cuNDA_shrinkd<T>(gamma));
+}  
+
+
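+// Joint p-shrinkage: x is scaled by max(s - gamma*s^(p-1), 0)/s for an externally
+// supplied magnitude s; note that, unlike cuNDA_shrinkd, s == 0 is not guarded here.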
+template<typename T> struct cuNDA_pshrinkd : public thrust::binary_function<T,typename realType<T>::Type,T>
+{
+  cuNDA_pshrinkd( typename realType<T>::Type _gamma,typename realType<T>::Type _p ) : gamma(_gamma), p(_p) {}
+  __device__ T operator()(const T &x, const typename realType<T>::Type &s) const {
+    return x/s*max(s-gamma*pow(s,p-1),typename realType<T>::Type(0));
+  }
+  typename realType<T>::Type gamma;
+  typename realType<T>::Type p;
+};
+
+template<class T> void
+Gadgetron::pshrinkd( cuNDArray<T> *x, cuNDArray<typename realType<T>::Type> *s, typename realType<T>::Type gamma,typename realType<T>::Type p, cuNDArray<T> *out )
+{
+  if( x == 0x0 || s == 0x0 )
+    throw std::runtime_error("Gadgetron::pshrinkd(): Invalid input array");
+
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::device_ptr<T> outPtr = (out == 0x0) ? x->get_device_ptr() : out->get_device_ptr();
+  thrust::device_ptr<typename realType<T>::Type> sPtr = s->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),sPtr,outPtr,cuNDA_pshrinkd<T>(gamma,p));
+}
+
+//
+// Instantiation
+//
+
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::abs<float>( cuNDArray<float>* );
+template EXPORTGPUCORE void Gadgetron::abs_inplace<float>( cuNDArray<float>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::abs_square<float>( cuNDArray<float>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::sqrt<float>( cuNDArray<float>* );
+template EXPORTGPUCORE void Gadgetron::sqrt_inplace<float>( cuNDArray<float>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::square<float>( cuNDArray<float>* );
+template EXPORTGPUCORE void Gadgetron::square_inplace<float>( cuNDArray<float>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::reciprocal<float>( cuNDArray<float>* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_inplace<float>( cuNDArray<float>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::reciprocal_sqrt<float>( cuNDArray<float>* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_sqrt_inplace<float>( cuNDArray<float>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::sgn<float>( cuNDArray<float>* );
+template EXPORTGPUCORE void Gadgetron::sgn_inplace<float>( cuNDArray<float>* );
+template EXPORTGPUCORE void Gadgetron::clear<float>( cuNDArray<float>* );
+template EXPORTGPUCORE void Gadgetron::fill<float>( cuNDArray<float>*, float );
+template EXPORTGPUCORE void Gadgetron::clamp<float>( cuNDArray<float>*, float, float );
+template EXPORTGPUCORE void Gadgetron::clamp<float>( cuNDArray<float>*, float, float, float,float );
+template EXPORTGPUCORE void Gadgetron::clamp_min<float>( cuNDArray<float>*, float );
+template EXPORTGPUCORE void Gadgetron::clamp_max<float>( cuNDArray<float>*, float );
+template EXPORTGPUCORE void Gadgetron::normalize<float>( cuNDArray<float>*, float );
+template EXPORTGPUCORE void Gadgetron::shrink1<float>( cuNDArray<float>*, float, cuNDArray<float>* );
+template EXPORTGPUCORE void Gadgetron::pshrink<float>( cuNDArray<float>*, float,float, cuNDArray<float>* );
+template EXPORTGPUCORE void Gadgetron::shrinkd<float> ( cuNDArray<float>*, cuNDArray<float>*, float, cuNDArray<float>* );
+template EXPORTGPUCORE void Gadgetron::pshrinkd<float> ( cuNDArray<float>*, cuNDArray<float>*, float,float, cuNDArray<float>* );
+
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::abs<double>( cuNDArray<double>* );
+template EXPORTGPUCORE void Gadgetron::abs_inplace<double>( cuNDArray<double>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::abs_square<double>( cuNDArray<double>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::sqrt<double>( cuNDArray<double>* );
+template EXPORTGPUCORE void Gadgetron::sqrt_inplace<double>( cuNDArray<double>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::square<double>( cuNDArray<double>* );
+template EXPORTGPUCORE void Gadgetron::square_inplace<double>( cuNDArray<double>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::reciprocal<double>( cuNDArray<double>* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_inplace<double>( cuNDArray<double>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::reciprocal_sqrt<double>( cuNDArray<double>* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_sqrt_inplace<double>( cuNDArray<double>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::sgn<double>( cuNDArray<double>* );
+template EXPORTGPUCORE void Gadgetron::sgn_inplace<double>( cuNDArray<double>* );
+template EXPORTGPUCORE void Gadgetron::clear<double>( cuNDArray<double>* );
+template EXPORTGPUCORE void Gadgetron::fill<double>( cuNDArray<double>*, double );
+template EXPORTGPUCORE void Gadgetron::clamp<double>( cuNDArray<double>*, double, double );
+template EXPORTGPUCORE void Gadgetron::clamp<double>( cuNDArray<double>*, double, double, double, double );
+template EXPORTGPUCORE void Gadgetron::clamp_min<double>( cuNDArray<double>*, double );
+template EXPORTGPUCORE void Gadgetron::clamp_max<double>( cuNDArray<double>*, double );
+template EXPORTGPUCORE void Gadgetron::normalize<double>( cuNDArray<double>*, double );
+template EXPORTGPUCORE void Gadgetron::shrink1<double>( cuNDArray<double>*, double, cuNDArray<double>* );
+template EXPORTGPUCORE void Gadgetron::pshrink<double>( cuNDArray<double>*, double,double, cuNDArray<double>* );
+template EXPORTGPUCORE void Gadgetron::shrinkd<double> ( cuNDArray<double>*, cuNDArray<double>*, double, cuNDArray<double>* );
+template EXPORTGPUCORE void Gadgetron::pshrinkd<double> ( cuNDArray<double>*, cuNDArray<double>*, double,double, cuNDArray<double>* );
+
+/*template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::abs< std::complex<float> >( cuNDArray< std::complex<float> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< std::complex<float> > > Gadgetron::sqrt< std::complex<float> >( cuNDArray< std::complex<float> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::abs_square< std::complex<float> >( cuNDArray< std::complex<float> >* );
+template EXPORTGPUCORE void Gadgetron::sqrt_inplace< std::complex<float> >( cuNDArray< std::complex<float> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< std::complex<float> > > Gadgetron::square< std::complex<float> >( cuNDArray< std::complex<float> >* );
+template EXPORTGPUCORE void Gadgetron::square_inplace< std::complex<float> >( cuNDArray< std::complex<float> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< std::complex<float> > > Gadgetron::reciprocal< std::complex<float> >( cuNDArray< std::complex<float> >* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_inplace< std::complex<float> >( cuNDArray< std::complex<float> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< std::complex<float> > > Gadgetron::reciprocal_sqrt< std::complex<float> >( cuNDArray< std::complex<float> >* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_sqrt_inplace< std::complex<float> >( cuNDArray< std::complex<float> >* );
+template EXPORTGPUCORE void Gadgetron::clear< std::complex<float> >( cuNDArray< std::complex<float> >* );
+template EXPORTGPUCORE void Gadgetron::fill< std::complex<float> >( cuNDArray< std::complex<float> >*, std::complex<float> );
+template EXPORTGPUCORE void Gadgetron::normalize< std::complex<float> >( cuNDArray< std::complex<float> >*, float );
+template EXPORTGPUCORE void Gadgetron::shrink1< std::complex<float> >( cuNDArray< std::complex<float> >*, float );
+template EXPORTGPUCORE void Gadgetron::shrinkd< std::complex<float> > ( cuNDArray< std::complex<float> >*, cuNDArray<float>*, float );
+
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::abs< std::complex<double> >( cuNDArray< std::complex<double> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< std::complex<double> > > Gadgetron::sqrt< std::complex<double> >( cuNDArray< std::complex<double> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::abs_square< std::complex<double> >( cuNDArray< std::complex<double> >* );
+template EXPORTGPUCORE void Gadgetron::sqrt_inplace< std::complex<double> >( cuNDArray< std::complex<double> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< std::complex<double> > > Gadgetron::square< std::complex<double> >( cuNDArray< std::complex<double> >* );
+template EXPORTGPUCORE void Gadgetron::square_inplace< std::complex<double> >( cuNDArray< std::complex<double> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< std::complex<double> > > Gadgetron::reciprocal< std::complex<double> >( cuNDArray< std::complex<double> >* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_inplace< std::complex<double> >( cuNDArray< std::complex<double> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< std::complex<double> > > Gadgetron::reciprocal_sqrt< std::complex<double> >( cuNDArray< std::complex<double> >* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_sqrt_inplace< std::complex<double> >( cuNDArray< std::complex<double> >* );
+template EXPORTGPUCORE void Gadgetron::clear< std::complex<double> >( cuNDArray< std::complex<double> >* );
+template EXPORTGPUCORE void Gadgetron::fill< std::complex<double> >( cuNDArray< std::complex<double> >*, std::complex<double> );
+template EXPORTGPUCORE void Gadgetron::normalize< std::complex<double> >( cuNDArray< std::complex<double> >*, double );
+template EXPORTGPUCORE void Gadgetron::shrink1< std::complex<double> >( cuNDArray< std::complex<double> >*, double );
+template EXPORTGPUCORE void Gadgetron::shrinkd< std::complex<double> > ( cuNDArray< std::complex<double> >*, cuNDArray<double>*, double );
+*/
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::abs< complext<float> >( cuNDArray< complext<float> >* );
+template EXPORTGPUCORE void Gadgetron::abs_inplace<complext<float> >(cuNDArray<complext<float> >*);
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< complext<float> > > Gadgetron::sqrt< complext<float> >( cuNDArray< complext<float> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::abs_square< complext<float> >( cuNDArray< complext<float> >* );
+template EXPORTGPUCORE void Gadgetron::sqrt_inplace< complext<float> >( cuNDArray< complext<float> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< complext<float> > > Gadgetron::square< complext<float> >( cuNDArray< complext<float> >* );
+template EXPORTGPUCORE void Gadgetron::square_inplace< complext<float> >( cuNDArray< complext<float> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< complext<float> > > Gadgetron::reciprocal< complext<float> >( cuNDArray< complext<float> >* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_inplace< complext<float> >( cuNDArray< complext<float> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< complext<float> > > Gadgetron::reciprocal_sqrt< complext<float> >( cuNDArray< complext<float> >* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_sqrt_inplace< complext<float> >( cuNDArray< complext<float> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<complext<float> > > Gadgetron::sgn<complext<float> >( cuNDArray<complext<float> >* );
+template EXPORTGPUCORE void Gadgetron::sgn_inplace<complext<float> >( cuNDArray<complext<float> >* );
+template EXPORTGPUCORE void Gadgetron::clear< complext<float> >( cuNDArray< complext<float> >* );
+template EXPORTGPUCORE void Gadgetron::fill< complext<float> >( cuNDArray< complext<float> >*, complext<float> );
+template EXPORTGPUCORE void Gadgetron::clamp< complext<float> >( cuNDArray< complext<float> >*, float, float );
+template EXPORTGPUCORE void Gadgetron::clamp_min< complext<float> >( cuNDArray< complext<float> >*, float );
+template EXPORTGPUCORE void Gadgetron::clamp_max< complext< float> >( cuNDArray<complext<float> >*, float );
+template EXPORTGPUCORE void Gadgetron::normalize< complext<float> >( cuNDArray< complext<float> >*, float );
+template EXPORTGPUCORE void Gadgetron::shrink1< complext<float> >( cuNDArray< complext<float> >*, float, cuNDArray< complext<float> >* );
+template EXPORTGPUCORE void Gadgetron::pshrink< complext<float> >( cuNDArray< complext<float> >*, float,float, cuNDArray< complext<float> >* );
+template EXPORTGPUCORE void Gadgetron::shrinkd< complext<float> > ( cuNDArray< complext<float> >*, cuNDArray<float>*, float, cuNDArray< complext<float> >* );
+template EXPORTGPUCORE void Gadgetron::pshrinkd< complext<float> > ( cuNDArray< complext<float> >*, cuNDArray<float>*, float,float, cuNDArray< complext<float> >* );
+
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::abs< complext<double> >( cuNDArray< complext<double> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< complext<double> > > Gadgetron::sqrt< complext<double> >( cuNDArray< complext<double> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::abs_square< complext<double> >( cuNDArray< complext<double> >* );
+template EXPORTGPUCORE void Gadgetron::sqrt_inplace< complext<double> >( cuNDArray< complext<double> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< complext<double> > > Gadgetron::square< complext<double> >( cuNDArray< complext<double> >* );
+template EXPORTGPUCORE void Gadgetron::square_inplace< complext<double> >( cuNDArray< complext<double> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< complext<double> > > Gadgetron::reciprocal< complext<double> >( cuNDArray< complext<double> >* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_inplace< complext<double> >( cuNDArray< complext<double> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< complext<double> > > Gadgetron::reciprocal_sqrt< complext<double> >( cuNDArray< complext<double> >* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_sqrt_inplace< complext<double> >( cuNDArray< complext<double> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<complext<double> > > Gadgetron::sgn<complext<double> >( cuNDArray<complext<double> >* );
+template EXPORTGPUCORE void Gadgetron::sgn_inplace<complext<double> >( cuNDArray<complext<double> >* );
+template EXPORTGPUCORE void Gadgetron::clear< complext<double> >( cuNDArray< complext<double> >* );
+template EXPORTGPUCORE void Gadgetron::fill< complext<double> >( cuNDArray< complext<double> >*, complext<double> );
+template EXPORTGPUCORE void Gadgetron::clamp< complext<double> >( cuNDArray< complext<double> >*, double, double );
+template EXPORTGPUCORE void Gadgetron::clamp_min< complext<double> >( cuNDArray< complext<double> >*, double );
+template EXPORTGPUCORE void Gadgetron::clamp_max< complext<double> >( cuNDArray<complext<double> >*, double );
+template EXPORTGPUCORE void Gadgetron::normalize< complext<double> >( cuNDArray< complext<double> >*, double );
+template EXPORTGPUCORE void Gadgetron::shrink1< complext<double> >( cuNDArray< complext<double> >*, double, cuNDArray< complext<double> >* );
+template EXPORTGPUCORE void Gadgetron::pshrink< complext<double> >( cuNDArray< complext<double> >*, double, double, cuNDArray< complext<double> >* );
+template EXPORTGPUCORE void Gadgetron::shrinkd< complext<double> > ( cuNDArray< complext<double> >*, cuNDArray<double>*, double, cuNDArray< complext<double> >* );
+template EXPORTGPUCORE void Gadgetron::pshrinkd< complext<double> > ( cuNDArray< complext<double> >*, cuNDArray<double>*, double,double, cuNDArray< complext<double> >* );
+
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::real<float>( cuNDArray<float>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::imag<float>( cuNDArray<float>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::conj<float>( cuNDArray<float>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::real<float_complext>( cuNDArray<float_complext>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::imag<float_complext>( cuNDArray<float_complext>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > Gadgetron::conj<float_complext>( cuNDArray<float_complext>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > Gadgetron::real_to_complex<float_complext>( cuNDArray<float>* );
+
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::real<double>( cuNDArray<double>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::imag<double>( cuNDArray<double>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::conj<double>( cuNDArray<double>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::real<double_complext>( cuNDArray<double_complext>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::imag<double_complext>( cuNDArray<double_complext>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > Gadgetron::conj<double_complext>( cuNDArray<double_complext>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > Gadgetron::real_to_complex<double_complext>( cuNDArray<double>* );
+
+template boost::shared_ptr< cuNDArray<double> > Gadgetron::convert_to<float,double>( cuNDArray<float>* );
+template boost::shared_ptr< cuNDArray<float> > Gadgetron::convert_to<double,float>( cuNDArray<double>* );
+template boost::shared_ptr< cuNDArray<double_complext> > Gadgetron::convert_to<float_complext,double_complext>( cuNDArray<float_complext>* );
+template boost::shared_ptr< cuNDArray<float_complext> > Gadgetron::convert_to<double_complext,float_complext>( cuNDArray<double_complext>* );
+
+template void Gadgetron::convert_to<float,double>( cuNDArray<float>*,cuNDArray<double>* );
+template void Gadgetron::convert_to<double,float>( cuNDArray<double>*, cuNDArray<float>* );
+template void Gadgetron::convert_to<float_complext,double_complext>( cuNDArray<float_complext>*,cuNDArray<double_complext>*  );
+template void Gadgetron::convert_to<double_complext,float_complext>( cuNDArray<double_complext>*, cuNDArray<float_complext>*);
diff --git a/toolboxes/core/gpu/cuNDArray_elemwise.h b/toolboxes/core/gpu/cuNDArray_elemwise.h
new file mode 100644
index 0000000..6be2c9e
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_elemwise.h
@@ -0,0 +1,255 @@
+/** \file cuNDArray_elemwise.h
+    \brief Element-wise math operations on the cuNDArray class.
+    
+    cuNDArray_elemwise.h defines element-wise array operations on the cuNDArray class.
+    Many of the provided functions come in two flavours:
+    1) A function that returns a smart pointer to a new array holding the result of the element-wise operation, and
+    2) A function that performs the element-wise computation in place, replacing the input array.
+    When both versions are available the in-place version is suffixed _inplace.
+    Some functions (clear, fill, clamp, clamp_min, clamp_max, normalize, shrink1, shrinkd) are only provided as in-place operations,
+    and they do not carry the _inplace suffix in order to keep user code compact.
+    A few functions return a different type than their input array
+    (abs on complex data, real, imag, real_to_std_complex, real_to_complext) and are consequently not offered as in-place operations.
+    The functions provided in cuNDArray_elemwise are deliberately placed outside the NDArray derived classes
+    - to allow the NDArray classes to be lightweight header only data containers for both the cpu and gpu instances
+    - to allow for external library optimized implementations of the element-wise functions without adding such dependencies to the core data container
+    The present implementation is based on Thrust.
+    The implementation is purposely split into a header and an underlying implementation file (.cu)
+    as this allows specific instantiation of the supported template types.
+    The supported types are float, double, Gadgetron::complext<float> and Gadgetron::complext<double> -- with some deliberate omissions.
+    Arrays of type std::complex<float> and std::complex<double> are currently not supported since the thrust device functors cannot 
+    link to std:: functions (as they are not declared as __device__). 
+    However, arrays of type std::complex are binary compatible with arrays of type Gadgetron::complext (for which we have support)
+    and can safely be cast to such.
+*/
+
+#pragma once
+
+#include "cuNDArray.h"
+#include "gpucore_export.h"
+
+namespace Gadgetron{
+
+  /**
+   * @brief Calculates the element-wise absolute values (l2 norm) of the array entries
+   * @param[in] x Input array.
+   * @return A new array containing the element-wise absolute values of the input.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<typename realType<T>::Type> > abs( cuNDArray<T> *x );
+
+  /**
+   * @brief Calculates the element-wise absolute values (l2 norm) of the array entries (in place).
+   * @param[in,out] x Input and output array.
+   */
+  template<class T> EXPORTGPUCORE void abs_inplace( cuNDArray<T> *x );
+    
+  /**
+   * @brief Calculates the element-wise squared absolute values of the array entries
+   * @param[in] x Input array.
+   * @return A new array containing the element-wise absolute values of the input.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<typename realType<T>::Type> > abs_square( cuNDArray<T> *x );
+
+  /**
+   * @brief Calculates the element-wise sqrt of the array entries.
+   * @param[in] x Input array.
+   * @return A new array containing the element-wise sqrt of the input.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T> > sqrt( cuNDArray<T> *x );
+
+  /**
+   * @brief Calculates the element-wise sqrt of the array entries (in place).
+   * @param[in,out] x Input and output array.
+   */
+  template<class T> EXPORTGPUCORE void sqrt_inplace( cuNDArray<T> *x );
+  
+  /**
+   * @brief Calculates the element-wise square of the array entries.
+   * @param[in] x Input array.
+   * @return A new array containing the element-wise square of the input.
+   *
+   * For real numbers this function is equivalent to abs_square().
+   * For complex arrays abs_square() and square() differ, however.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T> > square( cuNDArray<T> *x );
+    
+  /**
+   * @brief Calculates the element-wise square of the array entries (in place).
+   * @param[in,out] x Input and output array.
+   */
+  template<class T> EXPORTGPUCORE void square_inplace( cuNDArray<T> *x );
+    
+  /**
+   * @brief Calculates the element-wise reciprocal of the array entries.
+   * @param[in] x Input array.
+   * @return A new array containing the element-wise reciprocal of the input.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T> > reciprocal( cuNDArray<T> *x );
+  
+  /**
+   * @brief Calculates the element-wise reciprocal of the array entries (in place).
+   * @param[in,out] x Input and output array.
+   */
+  template<class T> EXPORTGPUCORE void reciprocal_inplace( cuNDArray<T> *x );
+  
+  /**
+   * @brief Calculates the element-wise reciprocal sqrt of the array entries.
+   * @param[in] x Input array.
+   * @return A new array containing the element-wise reciprocal sqrt of the input.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T> > reciprocal_sqrt( cuNDArray<T> *x );
+  
+  /**
+   * @brief Calculates the element-wise reciprocal sqrt of the array entries (in place).
+   * @param[in,out] x Input and output array.
+   */
+  template<class T> EXPORTGPUCORE void reciprocal_sqrt_inplace( cuNDArray<T> *x );
+  
+  /**
+   * @brief Calculates the elementwise signum function on the array.
+   * @param[in] x Input array.
+   * @return A new array containing the element-wise sgn of the input.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T> > sgn( cuNDArray<T> *x );
+  
+  /**
+   * @brief Calculates the elementwise signum function on the array (in place).
+   * @param[in,out] x Input and output array.
+   */
+  template<class T> EXPORTGPUCORE void sgn_inplace( cuNDArray<T> *x );
+
+  /**
+   * @brief Extract the real component from a complex array.
+   * @param[in] x Input array.
+   * @return A new array of the real component of the complex array.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<typename realType<T>::Type> > real( cuNDArray<T> *x );
+
+  /**
+   * @brief Extract the imaginary component from a complex array.
+   * @param[in] x Input array.
+   * @return A new array of the imaginary component of the complex array.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<typename realType<T>::Type> > imag( cuNDArray<T> *x );
+
+  /**
+   * @brief Create a new array of the complex conjugate of the input array. For real arrays a copy of the input array is returned.
+   * @param[in] x Input array.
+   * @return A new array of the complex conjugate of the input array.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T> > conj( cuNDArray<T> *x );
+
+  /**
+   * @brief Construct a complex array from a real array.
+   * @param[in] x Input array.
+   * @return A new complex array containing the input array in the real component and zeros in the imaginary component.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T> > real_to_complex( cuNDArray<typename realType<T>::Type> *x );
+  
+  /**
+   * Converts an array from type T to type T2.
+   * @param[in] x Input array
+   * @return A copy of x with the type T2
+   */
+  template<class T,class T2> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T2> > convert_to( cuNDArray<T> *x );
+
+  /**
+   * Converts an array from type T to type T2. Input and output arrays must be the same size.
+   * @param[in] x Input array
+   * @param[out] y Output array, will contain a copy of x with type T2
+   */
+  template<class T,class T2> EXPORTGPUCORE void convert_to( cuNDArray<T> *x, cuNDArray<T2> *y );
+
+  //
+  // From hereon the functions are all in-place although without the _inplace suffix...
+  //
+
+  /**
+   * @brief Clears the array to all zeros (in place). Faster than fill.
+   * @param[in,out] x Input and output array.
+   */
+  template<class T> EXPORTGPUCORE void clear( cuNDArray<T> *x );
+
+  /**
+   * @brief Fills the array with a user provided constant value (in place).
+   * @param[in,out] x Input and output array.
+   * @param[in] val Fill value.
+   */
+  template<class T> EXPORTGPUCORE void fill( cuNDArray<T> *x, T val );
+
+  /**
+   * @brief Clamps all values in the array to the minimum and maximum values specified (in place).
+   * @param[in,out] x Input and output array.
+   * @param[in] min minimum value.
+   * @param[in] max maximum value.
+   * @param[in] min_val value to which everything below the minimum will be set
+   * @param[in] max_val value to which everything above the maximum will be set
+   */
+  template<class T> EXPORTGPUCORE void clamp( cuNDArray<T> *x, typename realType<T>::Type min, typename realType<T>::Type max, T min_val, T max_val );
+
+  /**
+   * @brief Clamps all values in the array to the minimum and maximum values specified (in place).
+   * @param[in,out] x Input and output array.
+   * @param[in] min minimum value.
+   * @param[in] max maximum value.
+   */
+  template<class T> EXPORTGPUCORE void clamp( cuNDArray<T> *x, typename realType<T>::Type min, typename realType<T>::Type max);
+
+  /**
+   * @brief Clamps all values in the array to a minimum value allowed (in place).
+   * @param[in,out] x Input and output array.
+   * @param[in] min Minimum value.
+   */
+  template<class T> EXPORTGPUCORE void clamp_min( cuNDArray<T> *x, typename realType<T>::Type min );
+
+  /**
+   * @brief Clamps all values in the array to a maximum value allowed (in place).
+   * @param[in,out] x Input and output array.
+   * @param[in] max Maximum value.
+   */
+  template<class T> EXPORTGPUCORE void clamp_max( cuNDArray<T> *x, typename realType<T>::Type max );
+
+  /**
+   * @brief In place normalization (scaling) to a new maximum absolute array value val.
+   * @param[in,out] x Input and output array.
+   * @param[in] val New maximum absolute array value (according to the l2-norm)
+   */  
+  template<class T> EXPORTGPUCORE void normalize( cuNDArray<T> *x, typename realType<T>::Type val = typename realType<T>::Type(1) );
+
+  /**
+   * @brief In place shrinkage (soft thresholding), i.e. shrink(x,gamma) = x/abs(x)*max(abs(x)-gamma,0).
+   * @param[out] out Output array. Can be 0x0 in which case an in place transform is performed.
+   * @param[in,out] x Input array (and output array if out == 0x0).
+   * @param[in] gamma Shrinkage control parameter
+   */  
+  template<class T> EXPORTGPUCORE void shrink1( cuNDArray<T> *x, typename realType<T>::Type gamma, cuNDArray<T> *out = 0x0 );
+
+
+
+  /**
+   * @brief In place p-shrinkage (soft thresholding), i.e. pshrink(x,gamma,p) = x/abs(x)*max(abs(x)-gamma*abs(x)^(p-1),0).
+   * @param[out] out Output array. Can be 0x0 in which case an in place transform is performed.
+   * @param[in,out] x Input array (and output array if out == 0x0).
+   * @param[in] gamma Shrinkage control parameter
+   * @param[in] p p value of the shrinkage. Should be less than 1 and more than 0.
+   */
+  template<class T> EXPORTGPUCORE void pshrink( cuNDArray<T> *x, typename realType<T>::Type gamma,typename realType<T>::Type p, cuNDArray<T> *out = 0x0 );
+
+  /**
+   * @brief In place shrinkage (soft thresholding, multi-dimensional), i.e. shrink(x,gamma,s) = x/s*max(s-gamma,0).
+   * @param[out] out Output array. Can be 0x0 in which case an in place transform is performed.
+   * @param[in,out] x Input array (and output array if out == 0x0).
+   * @param[in] s Input array, normalization.
+   * @param[in] gamma Shrinkage control parameter
+   */  
+  template<class T> EXPORTGPUCORE void shrinkd ( cuNDArray<T> *x, cuNDArray<typename realType<T>::Type> *s, typename realType<T>::Type gamma, cuNDArray<T> *out = 0x0 );
+
+  /**
+   * @brief In place p-shrinkage (soft thresholding, multi-dimensional), i.e. pshrink(x,s,gamma,p) = x/s*max(s-gamma*s^(p-1),0).
+   * @param[out] out Output array. Can be 0x0 in which case an in place transform is performed.
+   * @param[in,out] x Input array (and output array if out == 0x0).
+   * @param[in] gamma Shrinkage control parameter
+   * @param[in] p p value of the shrinkage. Should be less than 1 and more than 0.
+   */
+  template<class T> EXPORTGPUCORE void pshrinkd ( cuNDArray<T> *x, cuNDArray<typename realType<T>::Type> *s, typename realType<T>::Type gamma, typename realType<T>::Type p, cuNDArray<T> *out = 0x0 );
+}
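
For reference, the following is a minimal usage sketch of the element-wise API declared in cuNDArray_elemwise.h above; it is not part of the upstream sources. The array shape, fill value and threshold are illustrative, and the cuNDArray constructor taking a std::vector<size_t> as well as the complext(real, imag) constructor are assumed from their use elsewhere in this patch.

    #include "cuNDArray_elemwise.h"
    using namespace Gadgetron;

    void elemwise_sketch()
    {
      std::vector<size_t> dims;
      dims.push_back(256); dims.push_back(256);
      cuNDArray<float_complext> x(dims);

      fill(&x, float_complext(3.0f, 4.0f));                // set every element to 3+4i (in place)
      boost::shared_ptr< cuNDArray<float> > mag = abs(&x); // new real-valued array of magnitudes (5.0f)
      normalize(mag.get(), 1.0f);                          // rescale so the largest magnitude becomes 1
      clamp(mag.get(), 0.1f, 0.9f);                        // in place, despite the missing _inplace suffix
      shrink1(&x, 0.5f);                                   // soft thresholding: x/|x| * max(|x|-0.5, 0)
      clear(&x);                                           // reset to zeros; faster than fill
    }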
diff --git a/toolboxes/core/gpu/cuNDArray_fileio.h b/toolboxes/core/gpu/cuNDArray_fileio.h
new file mode 100644
index 0000000..79f6fb2
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_fileio.h
@@ -0,0 +1,10 @@
+#pragma once
+#include "hoNDArray_fileio.h"
+
+namespace Gadgetron{
+template<class T> void write_nd_array(cuNDArray<T>* array, std::string s){
+	write_nd_array(array->to_host().get(),s.c_str());
+}
+
+
+}
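
A hypothetical call of the write_nd_array() wrapper above, assuming a device array that has already been populated; the output file name is illustrative and not taken from the upstream sources.

    #include "cuNDArray_fileio.h"

    void save_sketch(Gadgetron::cuNDArray<float>* device_array)
    {
      // Copies the data to the host via to_host() and writes it with the hoNDArray writer.
      Gadgetron::write_nd_array(device_array, "result.real");
    }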
diff --git a/toolboxes/core/gpu/cuNDArray_kernels.cu b/toolboxes/core/gpu/cuNDArray_kernels.cu
new file mode 100644
index 0000000..30330ad
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_kernels.cu
@@ -0,0 +1,179 @@
+#include "cuNDArray.h"
+#include "vector_td.h"
+#include <sstream>
+
+namespace Gadgetron{
+
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<int>* in,
+				 cuNDArray<int>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<int2>* in,
+				 cuNDArray<int2>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<int3>* in,
+				 cuNDArray<int3>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<int4>* in,
+				 cuNDArray<int4>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<unsigned int>* in,
+				 cuNDArray<unsigned int>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<uint2>* in,
+				 cuNDArray<uint2>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<uint3>* in,
+				 cuNDArray<uint3>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<uint4>* in,
+				 cuNDArray<uint4>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<float>* in,
+				 cuNDArray<float>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<float2>* in,
+				 cuNDArray<float2>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<float3>* in,
+				 cuNDArray<float3>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<float4>* in,
+				 cuNDArray<float4>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<double>* in,
+				 cuNDArray<double>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<double2>* in,
+				 cuNDArray<double2>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<double3>* in,
+				 cuNDArray<double3>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<double4>* in,
+				 cuNDArray<double4>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<intd1>* in,
+				 cuNDArray<intd1>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<intd2>* in,
+				 cuNDArray<intd2>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<intd3>* in,
+				 cuNDArray<intd3>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<intd4>* in,
+				 cuNDArray<intd4>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<uint64d1>* in,
+				 cuNDArray<uint64d1>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<uint64d2>* in,
+				 cuNDArray<uint64d2>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<uint64d3>* in,
+				 cuNDArray<uint64d3>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<uint64d4>* in,
+				 cuNDArray<uint64d4>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<floatd1>* in,
+				 cuNDArray<floatd1>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<floatd2>* in,
+				 cuNDArray<floatd2>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<floatd3>* in,
+				 cuNDArray<floatd3>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<floatd4>* in,
+				 cuNDArray<floatd4>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<doubled1>* in,
+				 cuNDArray<doubled1>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<doubled2>* in,
+				 cuNDArray<doubled2>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<doubled3>* in,
+				 cuNDArray<doubled3>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<doubled4>* in,
+				 cuNDArray<doubled4>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+				   
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<float_complext>* in,
+				 cuNDArray<float_complext>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<double_complext>* in,
+				 cuNDArray<double_complext>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+}
diff --git a/toolboxes/core/gpu/cuNDArray_math.h b/toolboxes/core/gpu/cuNDArray_math.h
new file mode 100644
index 0000000..bb8e90d
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_math.h
@@ -0,0 +1,7 @@
+#pragma once
+
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_reductions.h"
+#include "cuNDArray_blas.h"
+#include "cuNDArray_utils.h"
diff --git a/toolboxes/core/gpu/cuNDArray_operators.cu b/toolboxes/core/gpu/cuNDArray_operators.cu
new file mode 100644
index 0000000..243ef3f
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_operators.cu
@@ -0,0 +1,238 @@
+#include "cuNDArray_operators.h"
+#include "complext.h"
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/permutation_iterator.h>
+#include <complex>
+
+namespace Gadgetron{
+
+  // Private utility to verify array dimensions. 
+  // It "replaces" NDArray::dimensions_equal() to support batch mode.
+  // There is an identical function for all array instances (currently hoNDArray, cuNDArray, hoCuNDArray)
+  // !!! Remember to fix any bugs in all versions !!!
+  //
+  template<class T,class S> static bool compatible_dimensions( const cuNDArray<T> &x, const cuNDArray<S> &y )
+  {
+    return ((x.get_number_of_elements()%y.get_number_of_elements())==0);
+  }
+
+  template<typename T>
+  class cuNDA_modulus : public thrust::unary_function<T,T>
+  {
+  public:
+    cuNDA_modulus(int x):mod(x) {};
+    __host__ __device__ T operator()(const T &y) const {return y%mod;}
+  private:
+    const int mod;
+  };
+
+  //
+  // This transform supports batch mode when the number of elements in x is a multiple of the number of elements in y
+  //
+  template<class T,class S,class F>  
+  static void equals_transform(cuNDArray<T> &x, cuNDArray<S> &y){
+    if (x.dimensions_equal(&y)){
+      thrust::transform(x.begin(), x.end(), y.begin(), x.begin(), F());
+    } else if (compatible_dimensions(x,y))
+      {
+        typedef thrust::transform_iterator<cuNDA_modulus<int>,thrust::counting_iterator<int>, int> transform_it;
+        transform_it indices = thrust::make_transform_iterator(thrust::make_counting_iterator(0),cuNDA_modulus<int>(y.get_number_of_elements()));
+        thrust::permutation_iterator<thrust::device_ptr<S>,transform_it> p = thrust::make_permutation_iterator(y.begin(),indices);
+        thrust::transform(x.begin(),x.end(),p,x.begin(),F());
+      } else {
+      throw std::runtime_error("The provided cuNDArrays have incompatible dimensions for operator {+=,-=,*=,/=}");
+    }
+  }
+
+  template<typename T>
+  struct cuNDA_plus : public thrust::binary_function<complext<T>, T, complext<T> >
+  {
+    __device__ complext<T> operator()(const complext<T> &x, const T &y) const {return x+y;}
+  };
+
+  template<typename T>
+  struct cuNDA_minus : public thrust::binary_function<complext<T>, T, complext<T> >
+  {
+    __device__ complext<T> operator()(const complext<T> &x, const T &y) const {return x-y;}
+  };
+
+  template<typename T>
+  struct cuNDA_multiply : public thrust::binary_function<complext<T>, T, complext<T> >
+  {
+    __device__ complext<T> operator()(const complext<T> &x, const T &y) const {return x*y;}
+  };
+
+  template<typename T>
+  struct cuNDA_divide : public thrust::binary_function<complext<T>, T, complext<T> >
+  {
+    __device__ complext<T> operator()(const complext<T> &x, const T &y) const {return x/y;}
+  };
+
+  template<class T> cuNDArray<typename boost::enable_if<enable_operators<T>, T >::type > & operator+= (cuNDArray<T> &x, cuNDArray<T> &y){
+    equals_transform< T,T,thrust::plus<T> >(x,y);
+    return x;
+  }
+
+  template<class T> cuNDArray<typename boost::enable_if<enable_operators<T>, T >::type > & operator+= (cuNDArray<T> &x , T y){
+    thrust::constant_iterator<T> iter(y);
+    thrust::transform(x.begin(), x.end(), iter, x.begin(), thrust::plus<T>());
+    return x;
+  }
+
+  template<class T> cuNDArray<complext<typename boost::enable_if<enable_operators<T>, T >::type > >& operator+= (cuNDArray< complext<T> > &x , cuNDArray<T> &y){
+    equals_transform< complext<T>,T,cuNDA_plus<T> >(x,y);
+    return x;
+  }
+
+  template<class T> cuNDArray<complext<typename boost::enable_if<enable_operators<T>, T >::type > >& operator+= (cuNDArray<complext<T> > &x , T y){
+    thrust::constant_iterator<T> iter(y);
+    thrust::transform(x.begin(), x.end(), iter, x.begin(), cuNDA_plus<T>());
+    return x;
+  }
+
+  template<class T> cuNDArray<typename boost::enable_if<enable_operators<T>, T >::type >& operator-= (cuNDArray<T> & x , cuNDArray<T> & y){
+    equals_transform< T,T,thrust::minus<T> >(x,y);
+    return x;
+  }
+
+  template<class T> cuNDArray<typename boost::enable_if<enable_operators<T>, T >::type >& operator-= (cuNDArray<T> &x , T y){
+    thrust::constant_iterator<T> iter(y);
+    thrust::transform(x.begin(), x.end(), iter, x.begin(), thrust::minus<T>());
+    return x;
+  }
+
+  template<class T> cuNDArray<complext<typename boost::enable_if<enable_operators<T>, T >::type > >& operator-= (cuNDArray< complext<T> > &x , cuNDArray<T> &y){
+    equals_transform< complext<T>,T,cuNDA_minus<T> >(x,y);
+    return x;
+  }
+
+  template<class T> cuNDArray<complext<typename boost::enable_if<enable_operators<T>, T >::type > >& operator-= (cuNDArray<complext<T> > &x , T y){
+    thrust::constant_iterator<T> iter(y);
+    thrust::transform(x.begin(), x.end(), iter, x.begin(), cuNDA_minus<T>());
+    return x;
+  }
+
+  template<class T> cuNDArray<typename boost::enable_if<enable_operators<T>, T >::type >& operator*= (cuNDArray<T> &x , cuNDArray<T> &y){
+    equals_transform< T,T,thrust::multiplies<T> >(x,y);
+    return x;
+  }
+
+  template<class T> cuNDArray<typename boost::enable_if<enable_operators<T>, T >::type >& operator*= (cuNDArray<T> &x , T y){
+    thrust::constant_iterator<T> iter(y);
+    thrust::transform(x.begin(), x.end(), iter, x.begin(), thrust::multiplies<T>());
+    return x;
+  }
+
+  template<class T> cuNDArray<complext<typename boost::enable_if<enable_operators<T>, T >::type > >& operator*= (cuNDArray< complext<T> > &x , cuNDArray<T> &y){
+    equals_transform< complext<T>,T,cuNDA_multiply<T> >(x,y);
+    return x;
+  }
+
+  template<class T> cuNDArray<complext<typename boost::enable_if<enable_operators<T>, T >::type > >& operator*= (cuNDArray<complext<T> > &x , T y){
+    thrust::constant_iterator<T> iter(y);
+    thrust::transform(x.begin(), x.end(), iter, x.begin(), cuNDA_multiply<T>());
+    return x;
+  }
+
+  template<class T> cuNDArray<typename boost::enable_if<enable_operators<T>, T >::type >& operator/= (cuNDArray<T> &x , cuNDArray<T> &y){
+    equals_transform< T,T,thrust::divides<T> >(x,y);
+    return x;
+  }
+
+  template<class T> cuNDArray<typename boost::enable_if<enable_operators<T>, T >::type >& operator/= (cuNDArray<T> &x , T y){
+    thrust::constant_iterator<T> iter(y);
+    thrust::transform(x.begin(), x.end(), iter, x.begin(), thrust::divides<T>());
+    return x;
+  }
+
+  template<class T> cuNDArray<complext<typename boost::enable_if<enable_operators<T>, T >::type > >& operator/= (cuNDArray< complext<T> > &x , cuNDArray<T> &y){
+    equals_transform< complext<T>,T,cuNDA_divide<T> >(x,y);
+    return x;
+  }
+
+  template<class T> cuNDArray<complext<typename boost::enable_if<enable_operators<T>, T >::type > >& operator/= (cuNDArray<complext<T> > &x , T y){
+    thrust::constant_iterator<T> iter(y);
+    thrust::transform(x.begin(), x.end(), iter, x.begin(), cuNDA_divide<T>());
+    return x;
+  }
+
+  //
+  // Instantiation
+  //
+
+  template EXPORTGPUCORE cuNDArray<float>& operator+=<float>(cuNDArray<float>&, cuNDArray<float>&);
+  template EXPORTGPUCORE cuNDArray<float>& operator+=<float>(cuNDArray<float>&, float);
+  template EXPORTGPUCORE cuNDArray<float>& operator-=<float>(cuNDArray<float>&, cuNDArray<float>&);
+  template EXPORTGPUCORE cuNDArray<float>& operator-=<float>(cuNDArray<float>&, float);
+  template EXPORTGPUCORE cuNDArray<float>& operator*=<float>(cuNDArray<float>&, cuNDArray<float>&);
+  template EXPORTGPUCORE cuNDArray<float>& operator*=<float>(cuNDArray<float>&, float);
+  template EXPORTGPUCORE cuNDArray<float>& operator/=<float>(cuNDArray<float>&, cuNDArray<float>&);
+  template EXPORTGPUCORE cuNDArray<float>& operator/=<float>(cuNDArray<float>&, float);
+
+  template EXPORTGPUCORE cuNDArray<double>& operator+=<double>(cuNDArray<double>&, cuNDArray<double>&);
+  template EXPORTGPUCORE cuNDArray<double>& operator+=<double>(cuNDArray<double>&, double);
+  template EXPORTGPUCORE cuNDArray<double>& operator-=<double>(cuNDArray<double>&, cuNDArray<double>&);
+  template EXPORTGPUCORE cuNDArray<double>& operator-=<double>(cuNDArray<double>&, double);
+  template EXPORTGPUCORE cuNDArray<double>& operator*=<double>(cuNDArray<double>&, cuNDArray<double>&);
+  template EXPORTGPUCORE cuNDArray<double>& operator*=<double>(cuNDArray<double>&, double);
+  template EXPORTGPUCORE cuNDArray<double>& operator/=<double>(cuNDArray<double>&, cuNDArray<double>&);
+  template EXPORTGPUCORE cuNDArray<double>& operator/=<double>(cuNDArray<double>&, double);
+
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator+=< complext<float> > 
+  (cuNDArray< complext<float> >&, cuNDArray< complext<float> >&);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator+=< complext<float> > 
+  (cuNDArray< complext<float> >&, complext<float>);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator-=< complext<float> > 
+  (cuNDArray< complext<float> >&, cuNDArray< complext<float> >&);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator-=< complext<float> > 
+  (cuNDArray< complext<float> >&, complext<float>);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator*=< complext<float> >
+  (cuNDArray< complext<float> >&, cuNDArray< complext<float> >&);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator*=< complext<float> >
+  (cuNDArray< complext<float> >&, complext<float>);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator/=< complext<float> > 
+  (cuNDArray< complext<float> >&, cuNDArray< complext<float> >&);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator/=< complext<float> > 
+  (cuNDArray< complext<float> >&, complext<float>);
+
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator+=<float>(cuNDArray< complext<float> >&, cuNDArray<float>&);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator-=<float>(cuNDArray< complext<float> >&, cuNDArray<float>&);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator*=<float>(cuNDArray< complext<float> >&, cuNDArray<float>&);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator/=<float>(cuNDArray< complext<float> >&, cuNDArray<float>&);
+
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator+=<float>(cuNDArray< complext<float> >&, float);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator-=<float>(cuNDArray< complext<float> >&, float);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator*=<float>(cuNDArray< complext<float> >&, float);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator/=<float>(cuNDArray< complext<float> >&, float);
+
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator+=< complext<double> > 
+  (cuNDArray< complext<double> >&, cuNDArray< complext<double> >&);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator+=< complext<double> > 
+  (cuNDArray< complext<double> >&, complext<double>);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator-=< complext<double> > 
+  (cuNDArray< complext<double> >&, cuNDArray< complext<double> >&);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator-=< complext<double> > 
+  (cuNDArray< complext<double> >&, complext<double>);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator*=< complext<double> >
+  (cuNDArray< complext<double> >&, cuNDArray< complext<double> >&);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator*=< complext<double> >
+  (cuNDArray< complext<double> >&, complext<double>);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator/=< complext<double> > 
+  (cuNDArray< complext<double> >&, cuNDArray< complext<double> >&);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator/=< complext<double> > 
+  (cuNDArray< complext<double> >&, complext<double>);
+
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator+=<double>(cuNDArray< complext<double> >&, cuNDArray<double>&);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator-=<double>(cuNDArray< complext<double> >&, cuNDArray<double>&);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator*=<double>(cuNDArray< complext<double> >&, cuNDArray<double>&);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator/=<double>(cuNDArray< complext<double> >&, cuNDArray<double>&);
+
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator+=<double>(cuNDArray< complext<double> >&, double);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator-=<double>(cuNDArray< complext<double> >&, double);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator*=<double>(cuNDArray< complext<double> >&, double);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator/=<double>(cuNDArray< complext<double> >&, double);
+}
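
The batch-mode pairing that equals_transform() builds with its counting/modulus/permutation iterators can be summarized as: element i of x is combined with element i % N of y, where N is the number of elements in y. The sketch below is a plain host-side illustration of that index mapping, not Gadgetron code.

    #include <cstddef>
    #include <vector>

    // Host-side illustration of the device-side mapping: x[i] op= y[i % y.size()].
    template<class T>
    void batched_plus_equals(std::vector<T>& x, const std::vector<T>& y)
    {
      // Precondition enforced by compatible_dimensions(): x.size() % y.size() == 0.
      for (std::size_t i = 0; i < x.size(); ++i)
        x[i] += y[i % y.size()];
    }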
diff --git a/toolboxes/core/gpu/cuNDArray_operators.h b/toolboxes/core/gpu/cuNDArray_operators.h
new file mode 100644
index 0000000..fb486a9
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_operators.h
@@ -0,0 +1,168 @@
+/** \file cuNDArray_operators.h
+    \brief Common element-wise arithmetic operators on the cuNDArray class.
+    
+    cuNDArray_operators.h defines element-wise arithmetic array operations on the cuNDArray class.
+    We define the common operators +=, -=, *= and /= for both array-array and array-constant operations.
+    We have deliberately omitted defining operator+, operator-, etc. since these would require returning a cuNDArray,
+    in turn invoking an explicit memcpy by the assignment operator.
+    Batch mode functionality is provided.
+    The implementation is based on Thrust.
+    This code is purposely split into a header and underlying implementation (.cu) 
+    as this allows specific instantiation of the supported template types. 
+    Furthermore thrust code can only be compiled by nvcc.
+    The supported types are float, double, Gadgetron::complext<float> and Gadgetron::complext<double>. 
+    Scalars can be applied to complex numbers of corresponding precision.
+*/
+
+#pragma once
+
+#include "cuNDArray.h"
+#include "gpucore_export.h"
+#include "Gadgetron_enable_types.h"
+
+namespace Gadgetron {
+
+  /**
+   * @brief Implementation of element-wise operator+= on two cuNDArrays.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+   
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray<typename boost::enable_if<enable_operators<T>, T >::type > & operator+= (cuNDArray<T> &x, cuNDArray<T> &y);
+  
+  /**
+   * @brief Implementation of element-wise operator+= on a cuNDArray with a scalar value.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray<typename boost::enable_if<enable_operators<T>, T >::type > & operator+= (cuNDArray<T> &x, T y );
+    
+  /**
+   * @brief Implementation of element-wise operator+= on two cuNDArrays.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+   
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray<complext<typename boost::enable_if<enable_operators<T>, T >::type > > & operator+= (cuNDArray<complext<T> > &x, cuNDArray<T> &y);
+
+  /**
+   * @brief Implementation of element-wise operator+= on a cuNDArray with a scalar value.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray<complext< typename boost::enable_if<enable_operators<T>, T >::type > > & operator+= (cuNDArray<complext<T> > &x, T y );
+
+  /**
+   * @brief Implementation of element-wise operator-= on two cuNDArrays.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+   
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray<typename boost::enable_if<enable_operators<T>, T >::type > & operator-= (cuNDArray<T> &x, cuNDArray<T> &y);
+  
+  /**
+   * @brief Implementation of element-wise operator-= on a cuNDArray with a scalar value.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray<typename boost::enable_if<enable_operators<T>, T >::type > & operator-= (cuNDArray<T> &x, T y );
+    
+  /**
+   * @brief Implementation of element-wise operator-= on two cuNDArrays.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+   
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray<complext< typename boost::enable_if<enable_operators<T>, T >::type > > & operator-= (cuNDArray<complext<T> > &x, cuNDArray<T> &y);
+
+  /**
+   * @brief Implementation of element-wise operator-= on a cuNDArray with a scalar value.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray<complext< typename boost::enable_if<enable_operators<T>, T >::type > > & operator-= (cuNDArray<complext<T> > &x, T y );
+
+  /**
+   * @brief Implementation of element-wise operator*= on two cuNDArrays.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+   
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray< typename boost::enable_if<enable_operators<T>, T >::type >  & operator*= (cuNDArray<T> &x, cuNDArray<T> &y);
+  
+  /**
+   * @brief Implementation of element-wise operator*= on a cuNDArray with a scalar value.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray< typename boost::enable_if<enable_operators<T>, T >::type >  & operator*= (cuNDArray<T> &x, T y );
+    
+  /**
+   * @brief Implementation of element-wise operator*= on two cuNDArrays.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+   
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray<complext< typename boost::enable_if<enable_operators<T>, T >::type > > & operator*= (cuNDArray<complext<T> > &x, cuNDArray<T> &y);
+
+  /**
+   * @brief Implementation of element-wise operator*= on a cuNDArray with a scalar value.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray<complext< typename boost::enable_if<enable_operators<T>, T >::type > > & operator*= (cuNDArray<complext<T> > &x, T y );
+
+  /**
+   * @brief Implementation of element-wise operator/= on two cuNDArrays.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+   
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray< typename boost::enable_if<enable_operators<T>, T >::type >  & operator/= (cuNDArray<T> &x, cuNDArray<T> &y);
+  
+  /**
+   * @brief Implementation of element-wise operator/= on a cuNDArray with a scalar value.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray< typename boost::enable_if<enable_operators<T>, T >::type > & operator/= (cuNDArray<T> &x, T y );
+    
+  /**
+   * @brief Implementation of element-wise operator/= on two cuNDArrays.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+   
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray<complext< typename boost::enable_if<enable_operators<T>, T >::type > >  & operator/= (cuNDArray<complext<T> > &x, cuNDArray<T> &y);
+
+  /**
+   * @brief Implementation of element-wise operator/= on a cuNDArray with a scalar value.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray<complext< typename boost::enable_if<enable_operators<T>, T >::type > >  & operator/= (cuNDArray<complext<T> > &x, T y );
+}
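
A short usage sketch of the operators declared above; the names and sizes are illustrative, and the cuNDArray constructor from std::vector<size_t> is assumed from its use elsewhere in this patch. Because y matches the first two dimensions of x, the array-array operation runs in batch mode and y is applied to each slice of x.

    #include "cuNDArray_operators.h"
    using namespace Gadgetron;

    void operators_sketch()
    {
      std::vector<size_t> img_dims;
      img_dims.push_back(128); img_dims.push_back(128);
      std::vector<size_t> stack_dims(img_dims);
      stack_dims.push_back(8);

      cuNDArray<float> x(stack_dims); // 8 images of 128x128
      cuNDArray<float> y(img_dims);   // a single 128x128 image

      x *= 2.0f; // array-scalar: every element of x is doubled
      x += y;    // array-array batch mode: y is added to each of the 8 slices of x
    }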
diff --git a/toolboxes/core/gpu/cuNDArray_reductions.cu b/toolboxes/core/gpu/cuNDArray_reductions.cu
new file mode 100644
index 0000000..2e69141
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_reductions.cu
@@ -0,0 +1,103 @@
+#include "cuNDArray_reductions.h"
+#include "setup_grid.h"
+#include <thrust/extrema.h>
+
+namespace Gadgetron {
+
+  template<class T> static void 
+  find_stride( cuNDArray<T> *in, size_t dim, size_t *stride, std::vector<size_t> *dims )
+  {
+    *stride = 1;
+    for( unsigned int i=0; i<in->get_number_of_dimensions(); i++ ){
+      if( i != dim )
+        dims->push_back(in->get_size(i));
+      if( i < dim )
+        *stride *= in->get_size(i);
+    }
+  }
+  
+  // Sum
+  //
+  template<class T> 
+  __global__ void sum_kernel( T *in, T *out, 
+                              unsigned int stride, unsigned int number_of_batches, unsigned int number_of_elements )
+  {
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+
+    if( idx < number_of_elements ){
+
+      unsigned int in_idx = (idx/stride)*stride*number_of_batches+(idx%stride);
+
+      T val = in[in_idx];
+
+      for( unsigned int i=1; i<number_of_batches; i++ ) 
+        val += in[i*stride+in_idx];
+
+      out[idx] = val; 
+    }
+  }
+
+  // Sum
+  //
+  template<class T>  boost::shared_ptr< cuNDArray<T> > sum( cuNDArray<T> *in, unsigned int dim )
+  {
+    // Some validity checks
+    if( !(in->get_number_of_dimensions()>1) ){
+      throw std::runtime_error("sum: underdimensioned.");;
+    }
+
+    if( dim > in->get_number_of_dimensions()-1 ){
+      throw std::runtime_error( "sum: dimension out of range.");;
+    }
+
+    unsigned int number_of_batches = in->get_size(dim);
+    unsigned int number_of_elements = in->get_number_of_elements()/number_of_batches;
+
+    // Setup block/grid dimensions
+    dim3 blockDim; dim3 gridDim;
+    setup_grid( number_of_elements, &blockDim, &gridDim );
+
+    // Find element stride
+    size_t stride; std::vector<size_t> dims;
+    find_stride<T>( in, dim, &stride, &dims );
+
+    // Invoke kernel
+    boost::shared_ptr< cuNDArray<T> > out(new cuNDArray<T>());
+    out->create(&dims);
+
+    sum_kernel<T><<< gridDim, blockDim >>>( in->get_data_ptr(), out->get_data_ptr(), stride, number_of_batches, number_of_elements );
+
+    CHECK_FOR_CUDA_ERROR();
+    return out;
+  }
+
+  template<class T> T mean(cuNDArray<T>* in)
+  {
+    return thrust::reduce(in->begin(),in->end(),T(0),thrust::plus<T>())/T(in->get_number_of_elements());
+  }
+
+  template<class T> T min(cuNDArray<T>* in)
+  {
+    return *thrust::min_element(in->begin(),in->end());
+  }
+
+  template<class T> T max(cuNDArray<T>* in)
+  {
+    return *thrust::max_element(in->begin(),in->end());
+  }
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > sum<float>( cuNDArray<float>*, unsigned int);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > sum<double>( cuNDArray<double>*, unsigned int);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > sum<float_complext>( cuNDArray<float_complext>*, unsigned int);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > sum<double_complext>( cuNDArray<double_complext>*, unsigned int);  
+
+  template EXPORTGPUCORE float mean<float>(cuNDArray<float>*);
+  template EXPORTGPUCORE float_complext mean<float_complext>(cuNDArray<float_complext>*);
+  template EXPORTGPUCORE double mean<double>(cuNDArray<double>*);
+  template EXPORTGPUCORE double_complext mean<double_complext>(cuNDArray<double_complext>*);
+
+  template EXPORTGPUCORE float min<float>(cuNDArray<float>*);
+  template EXPORTGPUCORE float max<float>(cuNDArray<float>*);
+  template EXPORTGPUCORE double min<double>(cuNDArray<double>*);
+	template EXPORTGPUCORE double max<double>(cuNDArray<double>*);
+}
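
For clarity, the stride arithmetic of sum_kernel() above can be replayed on the host. The sketch below (plain C++, not Gadgetron code) reproduces the same index mapping for a column-major array, where stride is the product of the sizes of the dimensions preceding the summed dimension (as computed by find_stride()) and batches is the size of the summed dimension.

    #include <cstddef>
    #include <vector>

    // For output element idx, accumulate input elements in_idx + i*stride, i = 0..batches-1,
    // where in_idx = (idx/stride)*stride*batches + (idx%stride), mirroring sum_kernel().
    template<class T>
    std::vector<T> sum_over_dim(const std::vector<T>& in, std::size_t stride, std::size_t batches)
    {
      std::vector<T> out(in.size() / batches);
      for (std::size_t idx = 0; idx < out.size(); ++idx) {
        std::size_t in_idx = (idx / stride) * stride * batches + (idx % stride);
        T val = in[in_idx];
        for (std::size_t i = 1; i < batches; ++i)
          val += in[in_idx + i * stride];
        out[idx] = val;
      }
      return out;
    }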
diff --git a/toolboxes/core/gpu/cuNDArray_reductions.h b/toolboxes/core/gpu/cuNDArray_reductions.h
new file mode 100644
index 0000000..6d9867b
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_reductions.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include "cuNDArray.h"
+#include "gpucore_export.h"
+
+namespace Gadgetron{
+
+  template<class T> EXPORTGPUCORE boost::shared_ptr<cuNDArray<T> > sum(cuNDArray<T> *data, unsigned int dim );
+  
+  template<class T> EXPORTGPUCORE T mean(cuNDArray<T>* data);
+  
+  template<class T> EXPORTGPUCORE T min(cuNDArray<T>* data);
+
+  template<class T> EXPORTGPUCORE T max(cuNDArray<T>* data);
+}
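
A usage sketch for the reductions declared above, with illustrative dimensions. sum() collapses the chosen dimension, so summing a {128,128,8} array over dimension 2 yields a {128,128} array; mean(), min() and max() reduce over all elements.

    #include "cuNDArray_reductions.h"

    void reductions_sketch(Gadgetron::cuNDArray<float>* stack) // e.g. dimensions {128,128,8}
    {
      boost::shared_ptr< Gadgetron::cuNDArray<float> > per_pixel_sum = Gadgetron::sum(stack, 2); // {128,128}
      float avg = Gadgetron::mean(stack); // average over all elements
      float lo  = Gadgetron::min(stack);  // smallest element
      float hi  = Gadgetron::max(stack);  // largest element
      (void)per_pixel_sum; (void)avg; (void)lo; (void)hi;
    }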
diff --git a/toolboxes/core/gpu/cuNDArray_utils.cu b/toolboxes/core/gpu/cuNDArray_utils.cu
new file mode 100644
index 0000000..391eae6
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_utils.cu
@@ -0,0 +1,963 @@
+#include "cuNDArray_utils.h"
+#include "vector_td_utilities.h"
+#include "cudaDeviceManager.h"
+#include "setup_grid.h"
+#include "cuNDArray_math.h"
+
+#include <math_functions.h>
+#include <cmath>
+
+namespace Gadgetron {
+
+  template <class T> 
+  __global__ void cuNDArray_permute_kernel(const  T*  __restrict__ in, T* __restrict__ out,
+                                            unsigned int ndim,
+                                            const unsigned int* __restrict__ dims,
+                                            const unsigned int* __restrict__ strides_out,
+                                            unsigned int elements,
+                                            int shift_mode)
+  {
+    unsigned int idx_in = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+    unsigned int idx_out = 0;
+    unsigned int idx_in_tmp = idx_in;
+
+    if (idx_in < elements) {
+
+      unsigned int cur_index;
+      for (unsigned int i = 0; i < ndim; i++) {
+        unsigned int idx_in_remainder = idx_in_tmp / dims[i];
+        cur_index = idx_in_tmp-(idx_in_remainder*dims[i]); //cur_index = idx_in_tmp%dims[i];
+        if (shift_mode < 0) { //IFFTSHIFT
+          idx_out += ((cur_index+(dims[i]>>1))%dims[i])*strides_out[i];
+        } else if (shift_mode > 0) { //FFTSHIFT
+          idx_out += ((cur_index+((dims[i]+1)>>1))%dims[i])*strides_out[i];
+        } else {
+          idx_out += cur_index*strides_out[i];
+        }
+        idx_in_tmp = idx_in_remainder;
+      }
+      out[idx_in] = in[idx_out];
+    }
+  }
+
+  template <class T> void cuNDArray_permute( cuNDArray<T>* in,
+                                             cuNDArray<T>* out,
+                                             std::vector<size_t> *order,
+                                             int shift_mode)
+  {    
+    if( out == 0x0 ){
+      throw cuda_error("cuNDArray_permute(internal): 0x0 output");
+    }
+
+
+    cudaError_t err;
+
+    T* in_ptr = in->get_data_ptr();
+    T* out_ptr = 0;
+
+    if (out) {
+      out_ptr = out->get_data_ptr();
+    } else {
+      if (cudaMalloc((void**) &out_ptr, in->get_number_of_elements()*sizeof(T)) != cudaSuccess) {
+        throw cuda_error("cuNDArray_permute : Error allocating CUDA memory");
+      }
+    }
+
+    unsigned int* dims        = new unsigned int[in->get_number_of_dimensions()];
+    unsigned int* strides_out = new unsigned int[in->get_number_of_dimensions()];
+
+    if (!dims || !strides_out) {
+      throw cuda_error("cuNDArray_permute: failed to allocate temporary storage for arrays");
+    }
+
+    for (unsigned int i = 0; i < in->get_number_of_dimensions(); i++) {
+      dims[i] = (*in->get_dimensions())[(*order)[i]];
+      strides_out[i] = 1;    
+      for (unsigned int j = 0; j < (*order)[i]; j++) {
+        strides_out[i] *= (*in->get_dimensions())[j];
+      }
+    }
+
+    unsigned int* dims_dev        = 0;
+    unsigned int* strides_out_dev = 0;
+
+    if (cudaMalloc((void**) &dims_dev, in->get_number_of_dimensions()*sizeof(unsigned int)) != cudaSuccess) {
+      throw cuda_error("cuNDArray_permute : Error allocating CUDA dims memory");
+    }
+
+    if (cudaMalloc((void**) &strides_out_dev, in->get_number_of_dimensions()*sizeof(unsigned int)) != cudaSuccess) {
+      throw cuda_error("cuNDArray_permute : Error allocating CUDA strides_out memory");
+    }
+
+    if (cudaMemcpy(dims_dev, dims, in->get_number_of_dimensions()*sizeof(unsigned int), cudaMemcpyHostToDevice) != cudaSuccess) {
+      err = cudaGetLastError();
+      std::stringstream ss;
+      ss << "cuNDArray_permute : Error uploading dimensions to device, " << cudaGetErrorString(err);
+      throw cuda_error(ss.str());
+    }
+
+    if (cudaMemcpy(strides_out_dev, strides_out, in->get_number_of_dimensions()*sizeof(unsigned int), cudaMemcpyHostToDevice) != cudaSuccess) {
+      throw cuda_error("cuNDArray_permute : Error uploading strides to device");
+    }
+
+    dim3 blockDim(512,1,1);
+    dim3 gridDim;
+    if( in->get_number_of_dimensions() > 2 ){
+      gridDim = dim3((unsigned int) std::ceil((double)in->get_size(0)*in->get_size(1)/blockDim.x), 1, 1 );
+      for( unsigned int d=2; d<in->get_number_of_dimensions(); d++ )
+        gridDim.y *= in->get_size(d);
+    }
+    else
+      gridDim = dim3((unsigned int) std::ceil((double)in->get_number_of_elements()/blockDim.x), 1, 1 );
+
+    cuNDArray_permute_kernel<<< gridDim, blockDim >>>( in_ptr, out_ptr, in->get_number_of_dimensions(), 
+                                                       dims_dev, strides_out_dev, in->get_number_of_elements(), shift_mode);
+
+    err = cudaGetLastError();
+    if( err != cudaSuccess ){
+      std::stringstream ss;
+      ss <<"cuNDArray_permute : Error during kernel call: " << cudaGetErrorString(err);
+      throw cuda_error(ss.str());
+    }
+
+    if (cudaFree(dims_dev) != cudaSuccess) {
+      err = cudaGetLastError();
+      std::stringstream ss;
+      ss << "cuNDArray_permute: failed to delete device memory (dims_dev) " << cudaGetErrorString(err);
+      throw cuda_error(ss.str());
+    }
+
+    if (cudaFree(strides_out_dev) != cudaSuccess) {
+      err = cudaGetLastError();
+      std::stringstream ss;
+      ss << "cuNDArray_permute: failed to delete device memory (strides_out_dev) "<< cudaGetErrorString(err);
+      throw cuda_error(ss.str());
+    }    
+    delete [] dims;
+    delete [] strides_out;    
+  }  
+
+  template <class T> boost::shared_ptr< cuNDArray<T> >
+  permute( cuNDArray<T> *in, std::vector<size_t> *dim_order, int shift_mode )
+  {
+    if( in == 0x0 || dim_order == 0x0 ) {
+      throw std::runtime_error("permute(): invalid pointer provided");
+    }    
+
+    std::vector<size_t> dims;
+    for (size_t i = 0; i < dim_order->size(); i++)
+      dims.push_back(in->get_size(dim_order->at(i)));
+
+    boost::shared_ptr< cuNDArray<T> > out( new cuNDArray<T>(dims) );
+    permute( in, out.get(), dim_order, shift_mode );
+    return out;
+  }
+
+  template <class T> void
+  permute( cuNDArray<T> *in, cuNDArray<T> *out, std::vector<size_t> *dim_order, int shift_mode )
+  {
+    if( in == 0x0 || out == 0x0 || dim_order == 0x0 ) {
+      throw std::runtime_error("permute(): invalid pointer provided");
+    }    
+    if (out->get_number_of_dimensions() != in->get_number_of_dimensions() || out->get_number_of_elements() != in->get_number_of_elements()){
+    	throw std::runtime_error("permute(): Input and output have differing dimensions and/or differing number of elements");
+    }
+
+    if( in == out ){
+      throw std::runtime_error("permute(): in-place permutation not supported");
+    }   
+
+    //Check ordering array
+    if (dim_order->size() > in->get_number_of_dimensions()) {
+      throw std::runtime_error("permute(): invalid length of dimension ordering array");
+    }
+
+    std::vector<size_t> dim_count(in->get_number_of_dimensions(),0);
+    for (unsigned int i = 0; i < dim_order->size(); i++) {
+      if ((*dim_order)[i] >= in->get_number_of_dimensions()) {
+        throw std::runtime_error("permute(): invalid dimension order array");
+      }
+      dim_count[(*dim_order)[i]]++;
+    }
+
+    //Create an internal array to store the dimensions
+    std::vector<size_t> dim_order_int;
+
+    //Check that there are no duplicate dimensions
+    for (unsigned int i = 0; i < dim_order->size(); i++) {
+      if (dim_count[(*dim_order)[i]] != 1) {
+        throw std::runtime_error("permute(): invalid dimension order array (duplicates)");
+      }
+      dim_order_int.push_back((*dim_order)[i]);
+    }
+
+    for (unsigned int i = 0; i < dim_order_int.size(); i++) {
+      if ((*in->get_dimensions())[dim_order_int[i]] != out->get_size(i)) {
+        throw std::runtime_error("permute(): dimensions of output array do not match the input array");
+      }
+    }
+
+    //Pad dimension order array with dimension not mentioned in order array
+    if (dim_order_int.size() < in->get_number_of_dimensions()) {
+      for (unsigned int i = 0; i < dim_count.size(); i++) {
+        if (dim_count[i] == 0) {
+          dim_order_int.push_back(i);
+        }
+      }
+    }
+
+
+    // Check if permute is needed
+    {
+      bool skip_permute = true;
+      for (size_t i = 0; i < dim_order_int.size(); i++)
+        skip_permute &= (i == dim_order_int[i]);
+
+      if (skip_permute){
+        *out = *in;
+        return;
+      }
+    }
+    cuNDArray_permute(in, out, &dim_order_int, shift_mode);
+  }
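+
+  // Usage sketch (illustration only; the array name "device_image" is hypothetical):
+  // swap the first two dimensions of a 3D device array.
+  //
+  //   std::vector<size_t> order;
+  //   order.push_back(1); order.push_back(0); order.push_back(2);
+  //   // device_image has dimensions [X,Y,Z]; "transposed" gets dimensions [Y,X,Z]
+  //   boost::shared_ptr< cuNDArray<float> > transposed = permute( &device_image, &order );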
+
+  template<class T> boost::shared_ptr< cuNDArray<T> >
+  shift_dim( cuNDArray<T> *in, int shift )
+  {
+    if( in == 0x0 ) {
+      throw std::runtime_error("shift_dim(): invalid input pointer provided");
+    }    
+
+    const int ndim = (int)in->get_number_of_dimensions();
+    std::vector<size_t> order;
+    for (int i = 0; i < ndim; i++) {
+      // Wrap into [0,ndim) so that negative shifts are also handled correctly
+      order.push_back(static_cast<size_t>(((i+shift)%ndim+ndim)%ndim));
+    }
+    return permute(in, &order);
+  }
+
+  template<class T> 
+  void shift_dim( cuNDArray<T> *in, cuNDArray<T> *out, int shift )
+  {
+    if( in == 0x0 || out == 0x0 ) {
+      throw std::runtime_error("shift_dim(): invalid pointer provided");
+    }    
+
+    const int ndim = (int)in->get_number_of_dimensions();
+    std::vector<size_t> order;
+    for (int i = 0; i < ndim; i++) {
+      // Wrap into [0,ndim) so that negative shifts are also handled correctly
+      order.push_back(static_cast<size_t>(((i+shift)%ndim+ndim)%ndim));
+    }
+    permute(in,out,&order);
+  }
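+
+  // Usage sketch (illustration only; "kspace" is a hypothetical array): with the
+  // cyclic order built above, shift_dim(&kspace, 1) turns an [X,Y,Z] array into a
+  // [Y,Z,X] array, i.e. the first dimension is rotated to the end.
+  //
+  //   boost::shared_ptr< cuNDArray<float_complext> > rolled = shift_dim( &kspace, 1 );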
+
+  // Expand
+  //
+  template<class T> 
+  __global__ void expand_kernel( 
+                                const T * __restrict__ in, T * __restrict__ out,
+                                unsigned int number_of_elements_in, unsigned int number_of_elements_out, unsigned int new_dim_size )
+  {
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;    
+    if( idx < number_of_elements_out ){
+      out[idx] = in[idx%number_of_elements_in];
+    }
+  }
+
+  // Expand
+  //
+  template<class T> boost::shared_ptr< cuNDArray<T> > 
+  expand( cuNDArray<T> *in, size_t new_dim_size )
+  {
+    unsigned int number_of_elements_out = in->get_number_of_elements()*new_dim_size;
+
+    // Setup block/grid dimensions
+    dim3 blockDim; dim3 gridDim;
+    setup_grid( number_of_elements_out, &blockDim, &gridDim );
+
+    // Find element stride
+    std::vector<size_t> dims = *in->get_dimensions();
+    dims.push_back(new_dim_size);
+
+    // Invoke kernel
+    boost::shared_ptr< cuNDArray<T> > out( new cuNDArray<T>());
+    out->create(&dims);
+
+    expand_kernel<T><<< gridDim, blockDim >>>( in->get_data_ptr(), out->get_data_ptr(), 
+                                               in->get_number_of_elements(), number_of_elements_out, new_dim_size );
+
+    CHECK_FOR_CUDA_ERROR();    
+    return out;
+  }
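+
+  // Usage sketch (illustration only; "csm" is a hypothetical 2D coil map): expand
+  // replicates the input along a new trailing dimension, so an [X,Y] array becomes
+  // [X,Y,frames] with identical data for every value of the new index.
+  //
+  //   size_t frames = 30;
+  //   boost::shared_ptr< cuNDArray<float_complext> > csm_rep = expand( &csm, frames );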
+
+  // Crop
+  template<class T, unsigned int D> __global__ void crop_kernel
+  ( vector_td<unsigned int,D> offset, vector_td<unsigned int,D> matrix_size_in, vector_td<unsigned int,D> matrix_size_out,
+    const T * __restrict__ in, T * __restrict__ out, unsigned int num_batches, unsigned int num_elements )
+  {
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+    const unsigned int frame_offset = idx/num_elements;
+    
+    if( idx < num_elements*num_batches ){
+      const typename uintd<D>::Type co = idx_to_co<D>( idx-frame_offset*num_elements, matrix_size_out );
+      const typename uintd<D>::Type co_os = offset + co;
+      const unsigned int in_idx = co_to_idx<D>(co_os, matrix_size_in)+frame_offset*prod(matrix_size_in);
+      out[idx] = in[in_idx];
+    }
+  }
+
+  // Crop
+  template<class T, unsigned int D>
+  void crop( typename uint64d<D>::Type offset, cuNDArray<T> *in, cuNDArray<T> *out )
+  {
+    if( in == 0x0 || out == 0x0 ){
+      throw std::runtime_error("crop: 0x0 ndarray provided");
+    }
+
+    if( in->get_number_of_dimensions() != out->get_number_of_dimensions() ){
+      throw std::runtime_error("crop: image dimensions mismatch");
+    }
+
+    if( in->get_number_of_dimensions() < D ){
+      std::stringstream ss;
+      ss << "crop: number of image dimensions should be at least " << D;
+      throw std::runtime_error(ss.str());
+    }
+
+    typename uint64d<D>::Type matrix_size_in = from_std_vector<size_t,D>( *in->get_dimensions() );
+    typename uint64d<D>::Type matrix_size_out = from_std_vector<size_t,D>( *out->get_dimensions() );
+
+    unsigned int number_of_batches = 1;
+    for( unsigned int d=D; d<in->get_number_of_dimensions(); d++ ){
+      number_of_batches *= in->get_size(d);
+    }
+
+    if( weak_greater(offset+matrix_size_out, matrix_size_in) ){
+      throw std::runtime_error( "crop: cropping size mismatch");
+    }
+
+    // Setup block/grid dimensions
+    dim3 blockDim; dim3 gridDim;
+    setup_grid( prod(matrix_size_out), &blockDim, &gridDim, number_of_batches );
+
+    // Invoke kernel
+    crop_kernel<T,D><<< gridDim, blockDim >>>
+      ( vector_td<unsigned int,D>(offset), vector_td<unsigned int,D>(matrix_size_in), vector_td<unsigned int,D>(matrix_size_out),
+        in->get_data_ptr(), out->get_data_ptr(), number_of_batches, prod(matrix_size_out) );
+
+    CHECK_FOR_CUDA_ERROR();
+  }
+
+  template<class T, unsigned int D> boost::shared_ptr< cuNDArray<T> > 
+  crop( typename uint64d<D>::Type offset, typename uint64d<D>::Type size, cuNDArray<T> *in )
+  {
+    if( in == 0x0 ){
+      throw std::runtime_error("crop: 0x0 array provided");
+    }
+    std::vector<size_t> dims = to_std_vector(size);
+    for( unsigned int d=D; d<in->get_number_of_dimensions(); d++ ){
+      dims.push_back(in->get_size(d));
+    }
+    boost::shared_ptr< cuNDArray<T> > result( new cuNDArray<T>(&dims) );
+    crop<T,D>(offset, in, result.get());
+    return result;
+  }  
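+
+  // Usage sketch (illustration only; "image" is a hypothetical 128x128 array):
+  // extract the central 64x64 region. Batch dimensions beyond D are carried over
+  // unchanged, and offset+size must fit inside the input.
+  //
+  //   uint64d2 offset; offset[0] = 32; offset[1] = 32;
+  //   uint64d2 size;   size[0]   = 64; size[1]   = 64;
+  //   boost::shared_ptr< cuNDArray<float> > center = crop<float,2>( offset, size, &image );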
+
+  // Expand and zero fill
+  template<class T, unsigned int D> 
+  __global__ void pad_kernel( vector_td<unsigned int,D> matrix_size_in, vector_td<unsigned int,D> matrix_size_out,
+                              const T * __restrict__ in, T * __restrict__ out, unsigned int number_of_batches, unsigned int num_elements, T val )
+  {
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+    const unsigned int frame_offset = idx/num_elements;
+
+    if( idx < num_elements*number_of_batches ){
+
+      const typename uintd<D>::Type co_out = idx_to_co<D>( idx-frame_offset*num_elements, matrix_size_out );
+      typename uintd<D>::Type offset;
+      for(unsigned int d=0; d<D; d++)
+      {
+          offset[d] = matrix_size_out[d]/2 - matrix_size_in[d]/2;
+      }
+
+      T _out;
+      bool inside = (co_out>=offset) && (co_out<(matrix_size_in+offset));
+
+      if( inside )
+        _out = in[co_to_idx<D>(co_out-offset, matrix_size_in)+frame_offset*prod(matrix_size_in)];
+      else{      
+        _out = val;
+      }
+
+      out[idx] = _out;
+    }
+  }
+
+  template<class T, unsigned int D> 
+  void pad( cuNDArray<T> *in, cuNDArray<T> *out, T val )
+  { 
+    if( in == 0x0 || out == 0x0 ){
+      throw std::runtime_error("pad: 0x0 ndarray provided");
+    }
+
+    if( in->get_number_of_dimensions() != out->get_number_of_dimensions() ){
+      throw std::runtime_error("pad: image dimensions mismatch");
+    }
+
+    if( in->get_number_of_dimensions() < D ){
+      std::stringstream ss;
+      ss << "pad: number of image dimensions should be at least " << D;
+      throw std::runtime_error(ss.str());
+    }
+
+    typename uint64d<D>::Type matrix_size_in = from_std_vector<size_t,D>( *in->get_dimensions() );
+    typename uint64d<D>::Type matrix_size_out = from_std_vector<size_t,D>( *out->get_dimensions() );
+
+    unsigned int number_of_batches = 1;
+    for( unsigned int d=D; d<in->get_number_of_dimensions(); d++ ){
+      number_of_batches *= in->get_size(d);
+    }
+
+    if( weak_greater(matrix_size_in,matrix_size_out) ){
+      throw std::runtime_error("pad: size mismatch, cannot expand");
+    }
+
+    // Setup block/grid dimensions
+    dim3 blockDim; dim3 gridDim;
+    setup_grid( prod(matrix_size_out), &blockDim, &gridDim, number_of_batches );
+
+    // Invoke kernel
+    pad_kernel<T,D><<< gridDim, blockDim >>> 
+      ( vector_td<unsigned int,D>(matrix_size_in), vector_td<unsigned int,D>(matrix_size_out),
+        in->get_data_ptr(), out->get_data_ptr(), number_of_batches, prod(matrix_size_out), val );
+
+    CHECK_FOR_CUDA_ERROR();
+  }
+
+  template<class T, unsigned int D> boost::shared_ptr< cuNDArray<T> >
+  pad( typename uint64d<D>::Type size, cuNDArray<T> *in, T val )
+  {
+    if( in == 0x0 ){
+      throw std::runtime_error("pad: 0x0 array provided");
+    }
+    std::vector<size_t> dims = to_std_vector(size);
+    for( unsigned int d=D; d<in->get_number_of_dimensions(); d++ ){
+      dims.push_back(in->get_size(d));
+    }
+    boost::shared_ptr< cuNDArray<T> > result( new cuNDArray<T>(&dims) );
+    pad<T,D>(in, result.get(), val);
+    return result;
+  }
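+
+  // Usage sketch (illustration only; "image" is a hypothetical 192x192 array):
+  // zero-pad to 256x256. The input is centered in the output (see the offset
+  // computation in pad_kernel above) and the padding value defaults to T(0)
+  // via the declaration in cuNDArray_utils.h.
+  //
+  //   uint64d2 padded_size; padded_size[0] = 256; padded_size[1] = 256;
+  //   boost::shared_ptr< cuNDArray<float_complext> > padded = pad<float_complext,2>( padded_size, &image );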
+
+  template<class T, unsigned int D> 
+  __global__ void fill_border_kernel( vector_td<unsigned int,D> matrix_size_in, vector_td<unsigned int,D> matrix_size_out,
+                                      T *image, unsigned int number_of_batches, unsigned int number_of_elements, T val )
+  {
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+
+    if( idx < number_of_elements ){
+      const vector_td<unsigned int,D> co_out = idx_to_co<D>( idx, matrix_size_out );
+      const vector_td<unsigned int,D> offset = (matrix_size_out-matrix_size_in)>>1;
+      if( weak_less( co_out, offset ) || weak_greater_equal( co_out, matrix_size_in+offset ) ){
+        for( unsigned int batch=0; batch<number_of_batches; batch++ ){
+          image[idx+batch*number_of_elements] = val;
+        }
+      }
+    }
+  }
+
+  // Zero fill border (rectangular)
+  template<class T, unsigned int D> 
+  void fill_border( typename uint64d<D>::Type matrix_size_in, cuNDArray<T> *in_out, T val )
+  { 
+    typename uint64d<D>::Type matrix_size_out = from_std_vector<size_t,D>( *in_out->get_dimensions() );
+
+    if( weak_greater(matrix_size_in, matrix_size_out) ){
+      throw std::runtime_error("fill_border: size mismatch, cannot zero fill");
+    }
+
+    unsigned int number_of_batches = 1;
+    for( unsigned int d=D; d<in_out->get_number_of_dimensions(); d++ ){
+      number_of_batches *= in_out->get_size(d);
+    }
+
+    // Setup block/grid dimensions
+    dim3 blockDim; dim3 gridDim;
+    setup_grid( prod(matrix_size_out), &blockDim, &gridDim );
+
+    // Invoke kernel
+    fill_border_kernel<T,D><<< gridDim, blockDim >>>
+      ( vector_td<unsigned int,D>(matrix_size_in), vector_td<unsigned int,D>(matrix_size_out),
+        in_out->get_data_ptr(), number_of_batches, prod(matrix_size_out), val );
+
+    CHECK_FOR_CUDA_ERROR();
+  }
+
+
+  template<class T, unsigned int D>
+  __global__ void fill_border_kernel( typename realType<T>::Type radius, vector_td<int,D> matrix_size,
+                                      T *image, unsigned int number_of_batches, unsigned int number_of_elements, T val )
+  {
+    const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+
+    if( idx < number_of_elements ){
+      const vector_td<typename realType<T>::Type,D> co_out( (matrix_size>>1) - idx_to_co<D>( idx, matrix_size ));
+      if(  norm(co_out) > radius ){
+        for( unsigned int batch=0; batch<number_of_batches; batch++ ){
+          image[idx+batch*number_of_elements] = val;
+        }
+      }
+    }
+  }
+
+  // Zero fill border (radial)
+  template<class T, unsigned int D>
+  void fill_border( typename realType<T>::Type radius, cuNDArray<T> *in_out, T val )
+  {
+    typename uint64d<D>::Type matrix_size_out = from_std_vector<size_t,D>( *in_out->get_dimensions() );
+
+
+    unsigned int number_of_batches = 1;
+    for( unsigned int d=D; d<in_out->get_number_of_dimensions(); d++ ){
+      number_of_batches *= in_out->get_size(d);
+    }
+
+    // Setup block/grid dimensions
+    dim3 blockDim; dim3 gridDim;
+    setup_grid( prod(matrix_size_out), &blockDim, &gridDim );
+
+    // Invoke kernel
+    fill_border_kernel<T,D><<< gridDim, blockDim >>>
+      (radius, vector_td<int,D>(matrix_size_out),
+        in_out->get_data_ptr(), number_of_batches, prod(matrix_size_out), val );
+
+    CHECK_FOR_CUDA_ERROR();
+  }
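+
+  // Usage sketch (illustration only; "image" is a hypothetical 2D array) of the two
+  // fill_border overloads: the first keeps a centered box of the given size and
+  // overwrites everything outside it, the second keeps a centered disc of the given
+  // radius. Both work in-place.
+  //
+  //   uint64d2 inner; inner[0] = 96; inner[1] = 96;
+  //   fill_border<float,2>( inner, &image, 0.0f ); // rectangular border
+  //   fill_border<float,2>( 48.0f, &image, 0.0f ); // radial border
+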
+  template<class T, unsigned int D> __global__ void 
+  upsample_kernel( typename uintd<D>::Type matrix_size_in,
+                   typename uintd<D>::Type matrix_size_out,
+                   unsigned int num_batches,
+                   const T * __restrict__ image_in,
+                   T * __restrict__ image_out )
+  {
+    typedef typename realType<T>::Type REAL;
+    
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+    const unsigned int num_elements_out = prod(matrix_size_out);
+    
+    if( idx < num_elements_out*num_batches ){
+      
+      const unsigned int batch = idx/num_elements_out;
+      const unsigned int batch_offset_in = batch*prod(matrix_size_in);
+      
+      const typename uintd<D>::Type co_out = idx_to_co<D>( idx-batch*num_elements_out, matrix_size_out );
+      const typename uintd<D>::Type co_in = co_out >> 1;
+      const typename uintd<D>::Type ones(1);
+      const typename uintd<D>::Type twos(2);
+      const typename uintd<D>::Type offset = co_out%twos;
+      
+      const unsigned int num_cells = 1 << D;
+      
+      T cellsum(0);
+      unsigned int count = 0;
+      
+      for( unsigned int i=0; i<num_cells; i++ ){
+        
+        const typename uintd<D>::Type stride = idx_to_co<D>( i, twos );
+        
+        if( offset >= stride ){
+          cellsum += image_in[batch_offset_in+co_to_idx(amin(co_in+stride, matrix_size_in-ones), matrix_size_in)];
+          count++;
+        }
+      }
+
+      image_out[idx] = cellsum / REAL(count);
+    }
+  }
+
+  //
+  // Linear upsampling by a factor of two (on a D-dimensional grid) 
+  // Note that this operator is the transpose of the downsampling operator below by design
+  // - based on Briggs et al, A Multigrid Tutorial 2nd edition, pp. 34-35
+  // 
+  
+  template<class T, unsigned int D> boost::shared_ptr< cuNDArray<T> > upsample( cuNDArray<T>* in )
+	{
+    if( in == 0x0 )
+      throw std::runtime_error("upsample: illegal input pointer");
+
+    std::vector<size_t> dims_out = *in->get_dimensions();
+    for( unsigned int i=0; i<D; i++ ) dims_out[i] <<= 1;
+    boost::shared_ptr< cuNDArray<T> > out(new cuNDArray<T>(&dims_out));
+    upsample<T,D>( in, out.get() );
+    return out;
+	}
+
+  template<class T, unsigned int D> void upsample( cuNDArray<T> *in, cuNDArray<T> *out )
+  {
+    if( in == 0x0 || out == 0x0 )
+      throw std::runtime_error("upsample: illegal input pointer");
+
+    typename uint64d<D>::Type matrix_size_in  = from_std_vector<size_t,D>( *in->get_dimensions() );
+    typename uint64d<D>::Type matrix_size_out = from_std_vector<size_t,D>( *out->get_dimensions() );
+
+    if( (matrix_size_in<<1) != matrix_size_out ){
+      throw std::runtime_error("upsample: arrays do not correspond to upsampling by a factor of two");
+    }
+
+    unsigned int number_of_batches = 1;
+    for( unsigned int d=D; d<out->get_number_of_dimensions(); d++ ){
+      number_of_batches *= out->get_size(d);
+    }
+
+    // Setup block/grid dimensions
+    dim3 blockDim; dim3 gridDim;
+    setup_grid( prod(matrix_size_out), &blockDim, &gridDim, number_of_batches );
+
+    // Invoke kernel
+    upsample_kernel<T,D><<< gridDim, blockDim >>>
+      ( vector_td<unsigned int,D>(matrix_size_in), vector_td<unsigned int,D>(matrix_size_out),
+        number_of_batches, in->get_data_ptr(), out->get_data_ptr() );
+
+    CHECK_FOR_CUDA_ERROR();    
+  }
+  //
+  // Linear downsampling by a factor of two (on a D-dimensional grid)
+  // Note that this operator is the transpose of the upsampling operator above by design
+  // - based on Briggs et al, A Multigrid Tutorial 2nd edition, p. 36.
+  // 
+
+  template<class T, unsigned int D> __global__ void 
+  downsample_kernel( typename intd<D>::Type matrix_size_in,
+                     typename intd<D>::Type matrix_size_out,
+                     int num_batches,
+                     const T * __restrict__ image_in,
+                     T * __restrict__ image_out )
+  {
+    typedef typename realType<T>::Type REAL;
+    
+    const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+    const int num_elements_out = prod(matrix_size_out);
+    
+    if( idx < num_elements_out*num_batches ){
+      
+      const int batch = idx/num_elements_out;
+      const int batch_offset_in = batch*prod(matrix_size_in);
+      
+      const typename intd<D>::Type co_out = idx_to_co<D>( idx-batch*num_elements_out, matrix_size_out );
+      const typename intd<D>::Type co_in = co_out << 1;
+      
+      T cellsum[D+1];
+      for( unsigned int d=0; d<D+1; d++ ){
+        cellsum[d] = T(0);
+      }
+      
+      //const int num_cells = pow(3,D); // no pow for integers on device
+      int num_cells = 1; 
+      for( int i=0; i<D; i++ ) num_cells *=3;
+
+      const REAL denominator = pow(REAL(4),REAL(D));
+      
+      for( int i=0; i<num_cells; i++ ){
+        
+        const typename intd<D>::Type zeros(0);
+        const typename intd<D>::Type ones(1);
+        const typename intd<D>::Type threes(3);
+        const typename intd<D>::Type stride = idx_to_co<D>(i,threes)-ones; // in the range [-1;1]^D
+        
+        int distance = 0;
+        for( int d=0; d<D; d++ ){
+          if( abs(stride[d])>0 )
+            distance++;
+        }
+        
+        cellsum[distance] += image_in[batch_offset_in+co_to_idx(amax(zeros, amin(matrix_size_in-ones,co_in+stride)), matrix_size_in)];
+      }
+      
+      T res = T(0);
+      
+      for( unsigned int d=0; d<D+1; d++ ){
+        res += (REAL(1<<(D-d))*cellsum[d]);
+      }
+      
+      image_out[idx] = res / denominator;
+    }
+  }
+
+  template<class T, unsigned int D> boost::shared_ptr< cuNDArray<T> > downsample( cuNDArray<T>* in )
+  {
+    if( in == 0x0 )
+      throw std::runtime_error("downsample: illegal input pointer");
+    
+    std::vector<size_t> dims_out = *in->get_dimensions();
+    for( unsigned int i=0; i<D; i++ ) dims_out[i] >>= 1;
+    boost::shared_ptr< cuNDArray<T> > out(new cuNDArray<T>(&dims_out));
+    downsample<T,D>( in, out.get() );
+    return out;
+  }
+
+  template<class T, unsigned int D> void downsample( cuNDArray<T> *in, cuNDArray<T> *out )
+  {
+    if( in == 0x0 || out == 0x0 )
+      throw std::runtime_error("downsample: illegal input pointer");
+
+    typename uint64d<D>::Type matrix_size_in  = from_std_vector<size_t,D>( *in->get_dimensions() );
+    typename uint64d<D>::Type matrix_size_out = from_std_vector<size_t,D>( *out->get_dimensions() );
+
+    if( (matrix_size_in>>1) != matrix_size_out ){
+      throw std::runtime_error("downsample: arrays do not correspond to downsampling by a factor of two");
+    }
+
+    unsigned int number_of_batches = 1;
+    for( unsigned int d=D; d<out->get_number_of_dimensions(); d++ ){
+      number_of_batches *= out->get_size(d);
+    }
+
+    // Setup block/grid dimensions
+    dim3 blockDim; dim3 gridDim;
+    setup_grid( prod(matrix_size_out), &blockDim, &gridDim, number_of_batches );
+
+    // Invoke kernel
+    downsample_kernel<T,D><<< gridDim, blockDim >>>
+      ( vector_td<int,D>(matrix_size_in), vector_td<int,D>(matrix_size_out),
+        (int)number_of_batches, in->get_data_ptr(), out->get_data_ptr() );
+
+    CHECK_FOR_CUDA_ERROR();    
+  }
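+
+  // Usage sketch (illustration only; "image" is a hypothetical 2D array with even
+  // dimensions): halve and then restore the grid resolution. The two operators are
+  // transposes of each other rather than inverses, so the round trip is not the
+  // identity; information is lost in the downsampling step.
+  //
+  //   boost::shared_ptr< cuNDArray<float> > coarse = downsample<float,2>( &image );      // [X/2, Y/2]
+  //   boost::shared_ptr< cuNDArray<float> > fine   = upsample<float,2>( coarse.get() );  // [X, Y]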
+
+  //
+  // Instantiation
+  //
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > permute( cuNDArray<float>*, std::vector<size_t>*, int );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > permute( cuNDArray<double>*, std::vector<size_t>*, int );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > permute( cuNDArray<float_complext>*, std::vector<size_t>*, int );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > permute( cuNDArray<double_complext>*, std::vector<size_t>*, int );  
+
+  template EXPORTGPUCORE void permute( cuNDArray<float>*, cuNDArray<float>*, std::vector<size_t>*, int);
+  template EXPORTGPUCORE void permute( cuNDArray<double>*, cuNDArray<double>*, std::vector<size_t>*, int);
+  template EXPORTGPUCORE void permute( cuNDArray<float_complext>*, cuNDArray<float_complext>*, std::vector<size_t>*, int);
+  template EXPORTGPUCORE void permute( cuNDArray<double_complext>*, cuNDArray<double_complext>*, std::vector<size_t>*, int);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > shift_dim( cuNDArray<float>*, int );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > shift_dim( cuNDArray<double>*, int );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > shift_dim( cuNDArray<float_complext>*, int );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > shift_dim( cuNDArray<double_complext>*, int );
+
+  template EXPORTGPUCORE void shift_dim( cuNDArray<float>*, cuNDArray<float>*, int shift );
+  template EXPORTGPUCORE void shift_dim( cuNDArray<double>*, cuNDArray<double>*, int shift );
+  template EXPORTGPUCORE void shift_dim( cuNDArray<float_complext>*, cuNDArray<float_complext>*, int shift );
+  template EXPORTGPUCORE void shift_dim( cuNDArray<double_complext>*, cuNDArray<double_complext>*, int shift );
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > expand<float>( cuNDArray<float>*, size_t);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > expand<double>( cuNDArray<double>*, size_t);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > expand<float_complext>( cuNDArray<float_complext>*, size_t);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > expand<double_complext>( cuNDArray<double_complext>*, size_t);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > crop<float,1>( typename uint64d<1>::Type, typename uint64d<1>::Type, cuNDArray<float>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > crop<float,2>( typename uint64d<2>::Type, typename uint64d<2>::Type, cuNDArray<float>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > crop<float,3>( typename uint64d<3>::Type, typename uint64d<3>::Type, cuNDArray<float>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > crop<float,4>( typename uint64d<4>::Type, typename uint64d<4>::Type, cuNDArray<float>*);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > crop<float_complext,1>( typename uint64d<1>::Type, typename uint64d<1>::Type, cuNDArray<float_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > crop<float_complext,2>( typename uint64d<2>::Type, typename uint64d<2>::Type, cuNDArray<float_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > crop<float_complext,3>( typename uint64d<3>::Type, typename uint64d<3>::Type, cuNDArray<float_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > crop<float_complext,4>( typename uint64d<4>::Type, typename uint64d<4>::Type, cuNDArray<float_complext>*);
+
+  template EXPORTGPUCORE void crop<float,1>( uint64d1, cuNDArray<float>*, cuNDArray<float>*);
+  template EXPORTGPUCORE void crop<float,2>( uint64d2, cuNDArray<float>*, cuNDArray<float>*);
+  template EXPORTGPUCORE void crop<float,3>( uint64d3, cuNDArray<float>*, cuNDArray<float>*);
+  template EXPORTGPUCORE void crop<float,4>( uint64d4, cuNDArray<float>*, cuNDArray<float>*);
+
+  template EXPORTGPUCORE void crop<complext<float>,1>( uint64d1, cuNDArray<complext<float> >*, cuNDArray< complext<float> >*);
+  template EXPORTGPUCORE void crop<complext<float>,2>( uint64d2, cuNDArray<complext<float> >*, cuNDArray< complext<float> >*);
+  template EXPORTGPUCORE void crop<complext<float>,3>( uint64d3, cuNDArray<complext<float> >*, cuNDArray< complext<float> >*);
+  template EXPORTGPUCORE void crop<complext<float>,4>( uint64d4, cuNDArray<complext<float> >*, cuNDArray< complext<float> >*);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > pad<float,1>( typename uint64d<1>::Type, cuNDArray<float>*, float );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > pad<float,2>( typename uint64d<2>::Type, cuNDArray<float>*, float );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > pad<float,3>( typename uint64d<3>::Type, cuNDArray<float>*, float );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > pad<float,4>( typename uint64d<4>::Type, cuNDArray<float>*, float );
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > pad<float_complext,1>( typename uint64d<1>::Type, cuNDArray<float_complext>*, float_complext );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > pad<float_complext,2>( typename uint64d<2>::Type, cuNDArray<float_complext>*, float_complext );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > pad<float_complext,3>( typename uint64d<3>::Type, cuNDArray<float_complext>*, float_complext );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > pad<float_complext,4>( typename uint64d<4>::Type, cuNDArray<float_complext>*, float_complext );
+
+  template EXPORTGPUCORE void pad<float,1>( cuNDArray<float>*, cuNDArray<float>*, float);
+  template EXPORTGPUCORE void pad<float,2>( cuNDArray<float>*, cuNDArray<float>*, float);
+  template EXPORTGPUCORE void pad<float,3>( cuNDArray<float>*, cuNDArray<float>*, float);
+  template EXPORTGPUCORE void pad<float,4>( cuNDArray<float>*, cuNDArray<float>*, float);
+
+  template EXPORTGPUCORE void pad<float_complext,1>( cuNDArray<float_complext>*, cuNDArray<float_complext>*, float_complext);
+  template EXPORTGPUCORE void pad<float_complext,2>( cuNDArray<float_complext>*, cuNDArray<float_complext>*, float_complext);  
+  template EXPORTGPUCORE void pad<float_complext,3>( cuNDArray<float_complext>*, cuNDArray<float_complext>*, float_complext);
+  template EXPORTGPUCORE void pad<float_complext,4>( cuNDArray<float_complext>*, cuNDArray<float_complext>*, float_complext);
+
+  template EXPORTGPUCORE void fill_border<float,1>(uint64d1, cuNDArray<float>*,float);
+  template EXPORTGPUCORE void fill_border<float,2>(uint64d2, cuNDArray<float>*,float);
+  template EXPORTGPUCORE void fill_border<float,3>(uint64d3, cuNDArray<float>*,float);
+  template EXPORTGPUCORE void fill_border<float,4>(uint64d4, cuNDArray<float>*,float);
+  template EXPORTGPUCORE void fill_border<float,1>(float, cuNDArray<float>*,float);
+  template EXPORTGPUCORE void fill_border<float,2>(float, cuNDArray<float>*,float);
+  template EXPORTGPUCORE void fill_border<float,3>(float, cuNDArray<float>*,float);
+  template EXPORTGPUCORE void fill_border<float,4>(float, cuNDArray<float>*,float);
+
+  template EXPORTGPUCORE void fill_border<float_complext,1>(uint64d1, cuNDArray<float_complext>*,float_complext);
+  template EXPORTGPUCORE void fill_border<float_complext,2>(uint64d2, cuNDArray<float_complext>*,float_complext);
+  template EXPORTGPUCORE void fill_border<float_complext,3>(uint64d3, cuNDArray<float_complext>*,float_complext);
+  template EXPORTGPUCORE void fill_border<float_complext,4>(uint64d4, cuNDArray<float_complext>*,float_complext);
+  template EXPORTGPUCORE void fill_border<float_complext,1>(float, cuNDArray<float_complext>*,float_complext);
+  template EXPORTGPUCORE void fill_border<float_complext,2>(float, cuNDArray<float_complext>*,float_complext);
+  template EXPORTGPUCORE void fill_border<float_complext,3>(float, cuNDArray<float_complext>*,float_complext);
+  template EXPORTGPUCORE void fill_border<float_complext,4>(float, cuNDArray<float_complext>*,float_complext);
+
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > crop<double,1>( typename uint64d<1>::Type, typename uint64d<1>::Type, cuNDArray<double>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > crop<double,2>( typename uint64d<2>::Type, typename uint64d<2>::Type, cuNDArray<double>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > crop<double,3>( typename uint64d<3>::Type, typename uint64d<3>::Type, cuNDArray<double>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > crop<double,4>( typename uint64d<4>::Type, typename uint64d<4>::Type, cuNDArray<double>*);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > crop<double_complext,1>( typename uint64d<1>::Type, typename uint64d<1>::Type, cuNDArray<double_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > crop<double_complext,2>( typename uint64d<2>::Type, typename uint64d<2>::Type, cuNDArray<double_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > crop<double_complext,3>( typename uint64d<3>::Type, typename uint64d<3>::Type, cuNDArray<double_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > crop<double_complext,4>( typename uint64d<4>::Type, typename uint64d<4>::Type, cuNDArray<double_complext>*);
+
+  template EXPORTGPUCORE void crop<double,1>( uint64d1, cuNDArray<double>*, cuNDArray<double>*);
+  template EXPORTGPUCORE void crop<double,2>( uint64d2, cuNDArray<double>*, cuNDArray<double>*);
+  template EXPORTGPUCORE void crop<double,3>( uint64d3, cuNDArray<double>*, cuNDArray<double>*);
+  template EXPORTGPUCORE void crop<double,4>( uint64d4, cuNDArray<double>*, cuNDArray<double>*);
+
+  template EXPORTGPUCORE void crop<complext<double>,1>( uint64d1, cuNDArray<complext<double> >*, cuNDArray< complext<double> >*);
+  template EXPORTGPUCORE void crop<complext<double>,2>( uint64d2, cuNDArray<complext<double> >*, cuNDArray< complext<double> >*);
+  template EXPORTGPUCORE void crop<complext<double>,3>( uint64d3, cuNDArray<complext<double> >*, cuNDArray< complext<double> >*);
+  template EXPORTGPUCORE void crop<complext<double>,4>( uint64d4, cuNDArray<complext<double> >*, cuNDArray< complext<double> >*);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > pad<double,1>( typename uint64d<1>::Type, cuNDArray<double>*, double );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > pad<double,2>( typename uint64d<2>::Type, cuNDArray<double>*, double );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > pad<double,3>( typename uint64d<3>::Type, cuNDArray<double>*, double );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > pad<double,4>( typename uint64d<4>::Type, cuNDArray<double>*, double );
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > pad<double_complext,1>( typename uint64d<1>::Type, cuNDArray<double_complext>*, double_complext );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > pad<double_complext,2>( typename uint64d<2>::Type, cuNDArray<double_complext>*, double_complext );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > pad<double_complext,3>( typename uint64d<3>::Type, cuNDArray<double_complext>*, double_complext );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > pad<double_complext,4>( typename uint64d<4>::Type, cuNDArray<double_complext>*, double_complext );
+
+  template EXPORTGPUCORE void pad<double,1>( cuNDArray<double>*, cuNDArray<double>*, double);
+  template EXPORTGPUCORE void pad<double,2>( cuNDArray<double>*, cuNDArray<double>*, double);
+  template EXPORTGPUCORE void pad<double,3>( cuNDArray<double>*, cuNDArray<double>*, double);
+  template EXPORTGPUCORE void pad<double,4>( cuNDArray<double>*, cuNDArray<double>*, double);
+
+  template EXPORTGPUCORE void pad<double_complext,1>( cuNDArray<double_complext>*, cuNDArray<double_complext>*, double_complext);
+  template EXPORTGPUCORE void pad<double_complext,2>( cuNDArray<double_complext>*, cuNDArray<double_complext>*, double_complext);  
+  template EXPORTGPUCORE void pad<double_complext,3>( cuNDArray<double_complext>*, cuNDArray<double_complext>*, double_complext);
+  template EXPORTGPUCORE void pad<double_complext,4>( cuNDArray<double_complext>*, cuNDArray<double_complext>*, double_complext);
+
+  template EXPORTGPUCORE void fill_border<double,1>(uint64d1, cuNDArray<double>*,double);
+  template EXPORTGPUCORE void fill_border<double,2>(uint64d2, cuNDArray<double>*,double);
+  template EXPORTGPUCORE void fill_border<double,3>(uint64d3, cuNDArray<double>*,double);
+  template EXPORTGPUCORE void fill_border<double,4>(uint64d4, cuNDArray<double>*,double);
+  template EXPORTGPUCORE void fill_border<double,1>(double, cuNDArray<double>*,double);
+  template EXPORTGPUCORE void fill_border<double,2>(double, cuNDArray<double>*,double);
+  template EXPORTGPUCORE void fill_border<double,3>(double, cuNDArray<double>*,double);
+  template EXPORTGPUCORE void fill_border<double,4>(double, cuNDArray<double>*,double);
+
+  template EXPORTGPUCORE void fill_border<double_complext,1>(uint64d1, cuNDArray<double_complext>*,double_complext);
+  template EXPORTGPUCORE void fill_border<double_complext,2>(uint64d2, cuNDArray<double_complext>*,double_complext);
+  template EXPORTGPUCORE void fill_border<double_complext,3>(uint64d3, cuNDArray<double_complext>*,double_complext);
+  template EXPORTGPUCORE void fill_border<double_complext,4>(uint64d4, cuNDArray<double_complext>*,double_complext);
+  template EXPORTGPUCORE void fill_border<double_complext,1>(double, cuNDArray<double_complext>*,double_complext);
+  template EXPORTGPUCORE void fill_border<double_complext,2>(double, cuNDArray<double_complext>*,double_complext);
+  template EXPORTGPUCORE void fill_border<double_complext,3>(double, cuNDArray<double_complext>*,double_complext);
+  template EXPORTGPUCORE void fill_border<double_complext,4>(double, cuNDArray<double_complext>*,double_complext);
+
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > upsample<float,1>(cuNDArray<float>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > upsample<float,2>(cuNDArray<float>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > upsample<float,3>(cuNDArray<float>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > upsample<float,4>(cuNDArray<float>*);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > upsample<float_complext,1>(cuNDArray<float_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > upsample<float_complext,2>(cuNDArray<float_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > upsample<float_complext,3>(cuNDArray<float_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > upsample<float_complext,4>(cuNDArray<float_complext>*);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > upsample<double,1>(cuNDArray<double>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > upsample<double,2>(cuNDArray<double>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > upsample<double,3>(cuNDArray<double>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > upsample<double,4>(cuNDArray<double>*);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > upsample<double_complext,1>(cuNDArray<double_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > upsample<double_complext,2>(cuNDArray<double_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > upsample<double_complext,3>(cuNDArray<double_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > upsample<double_complext,4>(cuNDArray<double_complext>*);
+
+  template EXPORTGPUCORE void upsample<float,1>(cuNDArray<float>*, cuNDArray<float>*);
+  template EXPORTGPUCORE void upsample<float,2>(cuNDArray<float>*, cuNDArray<float>*);
+  template EXPORTGPUCORE void upsample<float,3>(cuNDArray<float>*, cuNDArray<float>*);
+  template EXPORTGPUCORE void upsample<float,4>(cuNDArray<float>*, cuNDArray<float>*);
+
+  template EXPORTGPUCORE void upsample<float_complext,1>(cuNDArray<float_complext>*, cuNDArray<float_complext>*);
+  template EXPORTGPUCORE void upsample<float_complext,2>(cuNDArray<float_complext>*, cuNDArray<float_complext>*);
+  template EXPORTGPUCORE void upsample<float_complext,3>(cuNDArray<float_complext>*, cuNDArray<float_complext>*);
+  template EXPORTGPUCORE void upsample<float_complext,4>(cuNDArray<float_complext>*, cuNDArray<float_complext>*);
+
+  template EXPORTGPUCORE void upsample<double,1>(cuNDArray<double>*, cuNDArray<double>*);
+  template EXPORTGPUCORE void upsample<double,2>(cuNDArray<double>*, cuNDArray<double>*);
+  template EXPORTGPUCORE void upsample<double,3>(cuNDArray<double>*, cuNDArray<double>*);
+  template EXPORTGPUCORE void upsample<double,4>(cuNDArray<double>*, cuNDArray<double>*);
+
+  template EXPORTGPUCORE void upsample<double_complext,1>(cuNDArray<double_complext>*, cuNDArray<double_complext>*);
+  template EXPORTGPUCORE void upsample<double_complext,2>(cuNDArray<double_complext>*, cuNDArray<double_complext>*);
+  template EXPORTGPUCORE void upsample<double_complext,3>(cuNDArray<double_complext>*, cuNDArray<double_complext>*);
+  template EXPORTGPUCORE void upsample<double_complext,4>(cuNDArray<double_complext>*, cuNDArray<double_complext>*);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > downsample<float,1>(cuNDArray<float>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > downsample<float,2>(cuNDArray<float>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > downsample<float,3>(cuNDArray<float>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > downsample<float,4>(cuNDArray<float>*);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > downsample<float_complext,1>(cuNDArray<float_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > downsample<float_complext,2>(cuNDArray<float_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > downsample<float_complext,3>(cuNDArray<float_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > downsample<float_complext,4>(cuNDArray<float_complext>*);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > downsample<double,1>(cuNDArray<double>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > downsample<double,2>(cuNDArray<double>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > downsample<double,3>(cuNDArray<double>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > downsample<double,4>(cuNDArray<double>*);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > downsample<double_complext,1>(cuNDArray<double_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > downsample<double_complext,2>(cuNDArray<double_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > downsample<double_complext,3>(cuNDArray<double_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > downsample<double_complext,4>(cuNDArray<double_complext>*);
+
+  template EXPORTGPUCORE void downsample<float,1>(cuNDArray<float>*, cuNDArray<float>*);
+  template EXPORTGPUCORE void downsample<float,2>(cuNDArray<float>*, cuNDArray<float>*);
+  template EXPORTGPUCORE void downsample<float,3>(cuNDArray<float>*, cuNDArray<float>*);
+  template EXPORTGPUCORE void downsample<float,4>(cuNDArray<float>*, cuNDArray<float>*);
+
+  template EXPORTGPUCORE void downsample<float_complext,1>(cuNDArray<float_complext>*, cuNDArray<float_complext>*);
+  template EXPORTGPUCORE void downsample<float_complext,2>(cuNDArray<float_complext>*, cuNDArray<float_complext>*);
+  template EXPORTGPUCORE void downsample<float_complext,3>(cuNDArray<float_complext>*, cuNDArray<float_complext>*);
+  template EXPORTGPUCORE void downsample<float_complext,4>(cuNDArray<float_complext>*, cuNDArray<float_complext>*);
+
+  template EXPORTGPUCORE void downsample<double,1>(cuNDArray<double>*, cuNDArray<double>*);
+  template EXPORTGPUCORE void downsample<double,2>(cuNDArray<double>*, cuNDArray<double>*);
+  template EXPORTGPUCORE void downsample<double,3>(cuNDArray<double>*, cuNDArray<double>*);
+  template EXPORTGPUCORE void downsample<double,4>(cuNDArray<double>*, cuNDArray<double>*);
+
+  template EXPORTGPUCORE void downsample<double_complext,1>(cuNDArray<double_complext>*, cuNDArray<double_complext>*);
+  template EXPORTGPUCORE void downsample<double_complext,2>(cuNDArray<double_complext>*, cuNDArray<double_complext>*);
+  template EXPORTGPUCORE void downsample<double_complext,3>(cuNDArray<double_complext>*, cuNDArray<double_complext>*);
+  template EXPORTGPUCORE void downsample<double_complext,4>(cuNDArray<double_complext>*, cuNDArray<double_complext>*);
+
+
+  // We could probably instantiate the functions below for many more types, e.g. arrays of floatd2.
+  // For now we just instantiate what we have needed.
+  //
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<floatd2> > expand<floatd2>( cuNDArray<floatd2>*, size_t);  
+}
diff --git a/toolboxes/core/gpu/cuNDArray_utils.h b/toolboxes/core/gpu/cuNDArray_utils.h
new file mode 100644
index 0000000..84c001f
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_utils.h
@@ -0,0 +1,128 @@
+/**
+ * @file cuNDArray_utils.h
+ */
+#pragma once
+
+#include "cuNDArray.h"
+#include "vector_td.h"
+#include "gpucore_export.h"
+
+namespace Gadgetron{
+
+/**
+ * @brief Cyclically shifts the order of the array dimensions
+ */
+template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T> >
+shift_dim( cuNDArray<T> *in, int shift );
+/**
+ * @brief Cyclically shifts the order of the array dimensions
+ */
+template<class T> EXPORTGPUCORE void
+shift_dim( cuNDArray<T> *in, cuNDArray<T> *out, int shift );
+
+/**
+ * @brief Permutes the array dimensions following the specified dimension order
+ */
+template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T> >
+permute( cuNDArray<T> *in, std::vector<size_t> *dim_order, int shift_mode = 0 );
+
+/**
+ * @brief Permutes the array dimensions following the specified dimension order
+ */
+template<class T> EXPORTGPUCORE void
+permute( cuNDArray<T> *in, cuNDArray<T> *out, std::vector<size_t> *dim_order, int shift_mode = 0 );
+
+/**
+ * @brief Creates a cropped version of the array
+ * @param[in] crop_offset Offset of the corner of the cropped region within the input array
+ * @param[in] crop_size Size of the output array
+ * @param[in] in Array to crop
+ */
+template<class T, unsigned int D> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T> >
+crop( typename uint64d<D>::Type crop_offset, typename uint64d<D>::Type crop_size, cuNDArray<T> *in );
+
+/**
+ * @brief Creates a cropped version of the array
+ * @param[in] crop_offset Offset of the corner of the cropped region within the input array
+ * @param[in] in Array to crop
+ * @param[out] out Array into which the cropped array is placed
+ */
+template<class T, unsigned int D> EXPORTGPUCORE
+void crop( typename uint64d<D>::Type crop_offset, cuNDArray<T> *in, cuNDArray<T> *out );
+
+/**
+ * @brief Creates a padded version of the array
+ * @param[in] size Size of the output array
+ * @param[in] in Array to pad
+ * @param[in] val Numerical value of the padding
+ */
+template<class T, unsigned int D> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T> >
+pad( typename uint64d<D>::Type size, cuNDArray<T> *in, T val = T(0) );
+
+
+/**
+ * @brief Creates a padded version of the array
+ * @param[in] in Array to pad
+ * @param[out] out Output array (the input is centered within it)
+ * @param[in] val Numerical value of the padding
+ */
+template<class T, unsigned int D> EXPORTGPUCORE
+void pad( cuNDArray<T> *in, cuNDArray<T> *out, T val = T(0) );
+
+/**
+ * @brief Fills the image with a given value outside a box
+ * @param[in] matrix_size Box size
+ * @param[in,out] image Array to fill
+ * @param[in] val Fill value
+ */
+template<class T, unsigned int D> EXPORTGPUCORE
+void fill_border( typename uint64d<D>::Type matrix_size, cuNDArray<T> *image, T val = T(0) );
+
+/**
+ * @brief Fills the image with a given value outside a radius from the center
+ * @param[in] radius Radius of the central region that is left untouched
+ * @param[in,out] in_out Array to fill
+ * @param[in] val Fill value
+ */
+template<class T, unsigned int D> EXPORTGPUCORE
+void fill_border( typename realType<T>::Type radius, cuNDArray<T> *in_out, T val = T(0) );
+
+// Expand array to new dimension
+/**
+ * @brief Creates a new array, expanded into an additional dimension
+ * @param[in] data Input data
+ * @param[in] added_dim_size Size of the new dimension
+ */
+template<class T> EXPORTGPUCORE boost::shared_ptr<cuNDArray<T> >
+expand(cuNDArray<T> *data, size_t added_dim_size );
+
+/**
+ * @brief Creates an array upsampled by a factor of two along each of the first D dimensions, using linear interpolation
+ * @param[in] in Array to upsample
+ */
+template<class T, unsigned int D> EXPORTGPUCORE
+boost::shared_ptr< cuNDArray<T> > upsample( cuNDArray<T>* in );
+
+/**
+ * @brief Creates an array upsampled by a factor of two along each of the first D dimensions, using linear interpolation
+ * @param[in] in Array to upsample
+ * @param[out] out Output array
+ */
+template<class T, unsigned int D> EXPORTGPUCORE
+void upsample( cuNDArray<T> *in, cuNDArray<T> *out );
+
+/**
+ * @brief Creates an array downsampled by a factor of two along each of the first D dimensions (the transpose of the linear upsampling)
+ * @param[in] in Array to downsample
+ */
+template<class T, unsigned int D> EXPORTGPUCORE
+boost::shared_ptr< cuNDArray<T> > downsample( cuNDArray<T>* in );
+
+/**
+ * @brief Creates an array downsampled by a factor of two along each of the first D dimensions (the transpose of the linear upsampling)
+ * @param[in] in Array to downsample
+ * @param[out] out Output array
+ */
+template<class T, unsigned int D> EXPORTGPUCORE
+void downsample( cuNDArray<T> *in, cuNDArray<T> *out );
+}
diff --git a/toolboxes/core/gpu/cuSparseMatrix.cu b/toolboxes/core/gpu/cuSparseMatrix.cu
new file mode 100644
index 0000000..ca3d0ca
--- /dev/null
+++ b/toolboxes/core/gpu/cuSparseMatrix.cu
@@ -0,0 +1,113 @@
+#include "cuSparseMatrix.h"
+#include <thrust/extrema.h>
+#include <thrust/device_ptr.h>
+#include "cuNDArray_math.h"
+using namespace Gadgetron;
+
+
+static cusparseStatus_t  sparseCSRMV(cusparseHandle_t handle, cusparseOperation_t transA,int m, int n, int nnz,
+		const float * alpha, const cusparseMatDescr_t descrA, const float * csrValA,
+		const int * csrRowPtrA, const int * csrColndA, const float *x, const float* beta, float* y){
+
+	return cusparseScsrmv( handle, transA,m, n, nnz,  alpha, descrA,  csrValA,  csrRowPtrA,  csrColndA, x,  beta, y);
+}
+
+
+static cusparseStatus_t  sparseCSRMV(cusparseHandle_t handle, cusparseOperation_t transA,int m, int n, int nnz,
+		const double * alpha, const cusparseMatDescr_t descrA, const double * csrValA,
+		const int * csrRowPtrA, const int * csrColndA, const double *x, const double* beta, double* y){
+
+	return cusparseDcsrmv( handle, transA,m, n, nnz,  alpha, descrA,  csrValA,  csrRowPtrA,  csrColndA, x,  beta, y);
+}
+
+
+static cusparseStatus_t  sparseCSRMV(cusparseHandle_t handle, cusparseOperation_t transA,int m, int n, int nnz,
+		const complext<float> * alpha, const cusparseMatDescr_t descrA, const complext<float> * csrValA,
+		const int * csrRowPtrA, const int * csrColndA, const complext<float> *x, const complext<float>* beta, complext<float>* y){
+
+#ifndef NDEBUG
+	// Diagnostic output, compiled in debug builds only: print the sums of the
+	// input and output vectors before calling into cuSPARSE.
+	{
+		thrust::device_ptr<const complext<float> > xptr(x);
+		thrust::device_ptr<complext<float> > yptr(y);
+		if (transA == CUSPARSE_OPERATION_NON_TRANSPOSE){
+			std::cout << "In sum " << thrust::reduce(xptr,xptr+n) << " out sum " << thrust::reduce(yptr,yptr+m) << std::endl;
+		} else {
+			std::cout << "T In sum " << thrust::reduce(xptr,xptr+m) << " out sum " << thrust::reduce(yptr,yptr+n) << std::endl;
+		}
+	}
+#endif
+
+	return cusparseCcsrmv( handle, transA,m, n, nnz,  (cuComplex*) alpha, descrA,  (cuComplex*) csrValA,  csrRowPtrA,  csrColndA, (cuComplex*) x,  (cuComplex*) beta,  (cuComplex*)y);
+}
+
+
+static cusparseStatus_t  sparseCSRMV(cusparseHandle_t handle, cusparseOperation_t transA,int m, int n, int nnz,
+		const complext<double> * alpha, const cusparseMatDescr_t descrA, const complext<double> * csrValA,
+		const int * csrRowPtrA, const int * csrColndA, const complext<double> *x, const complext<double>* beta, complext<double>* y){
+	return cusparseZcsrmv( handle, transA,m, n, nnz,  (cuDoubleComplex*) alpha, descrA,  (cuDoubleComplex*) csrValA,  csrRowPtrA,  csrColndA, (cuDoubleComplex*) x,  (cuDoubleComplex*) beta,  (cuDoubleComplex*)y);
+}
+
+// Note: the default value for "adjoint" is supplied by the declaration in cuSparseMatrix.h
+template<class T> EXPORTGPUCORE void Gadgetron::sparseMV(T alpha,T beta, const cuCsrMatrix<T> & mat, const cuNDArray<T> & vec_in, cuNDArray<T>& vec_out, bool adjoint){
+
+	if (vec_in.get_number_of_elements() != (adjoint ? mat.m : mat.n))
+		throw std::runtime_error("Matrix and input vector have mismatching dimensions");
+	if (vec_out.get_number_of_elements() != (adjoint ? mat.n : mat.m))
+		throw std::runtime_error("Matrix and output vector have mismatching dimensions");
+
+	cusparseOperation_t trans = adjoint ?  CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
+	cusparseStatus_t status = sparseCSRMV(cudaDeviceManager::Instance()->lockSparseHandle(),trans,mat.m,mat.n,mat.nnz,&alpha, mat.descr,
+			thrust::raw_pointer_cast(&mat.data[0]),thrust::raw_pointer_cast(&mat.csrRow[0]),thrust::raw_pointer_cast(&mat.csrColdnd[0]),vec_in.get_data_ptr(),&beta,vec_out.get_data_ptr());
+
+	cudaDeviceManager::Instance()->unlockSparseHandle();
+	if (status != CUSPARSE_STATUS_SUCCESS){
+		std::stringstream ss;
+		ss << "Sparse Matrix Vector multiplication failed. Error: ";
+		ss << gadgetron_getCusparseErrorString(status);
+		throw cuda_error(ss.str());
+	}
+}
+
+
+
+
+EXPORTGPUCORE std::string Gadgetron::gadgetron_getCusparseErrorString(cusparseStatus_t err)
+{
+  switch (err){
+  case CUSPARSE_STATUS_NOT_INITIALIZED:
+    return "NOT INITIALIZED";
+  case CUSPARSE_STATUS_ALLOC_FAILED:
+    return "ALLOC FAILED";
+  case CUSPARSE_STATUS_INVALID_VALUE:
+    return "INVALID VALUE";
+  case CUSPARSE_STATUS_ARCH_MISMATCH:
+    return "ARCH MISMATCH";
+  case CUSPARSE_STATUS_MAPPING_ERROR:
+    return "MAPPING ERROR";
+  case CUSPARSE_STATUS_EXECUTION_FAILED:
+    return "EXECUTION FAILED";
+  case CUSPARSE_STATUS_INTERNAL_ERROR:
+    return "INTERNAL ERROR";
+  case CUSPARSE_STATUS_SUCCESS:
+    return "SUCCESS";
+  case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+  	return "MATRIX TYPE NOT SUPPORTED";
+  default:
+    return "UNKNOWN CUSPARSE ERROR";
+  }
+}
+
+
+
+template EXPORTGPUCORE void Gadgetron::sparseMV<float>(float alpha,float beta, const cuCsrMatrix<float> & mat, const cuNDArray<float> & vec_in, cuNDArray<float>& vec_out, bool adjoint);
+template EXPORTGPUCORE void Gadgetron::sparseMV<double>(double alpha,double beta, const cuCsrMatrix<double> & mat, const cuNDArray<double> & vec_in, cuNDArray<double>& vec_out, bool adjoint);
+template EXPORTGPUCORE void Gadgetron::sparseMV<complext<float> >(complext<float> alpha,complext<float> beta, const cuCsrMatrix<complext<float> > & mat, const cuNDArray<complext<float> > & vec_in, cuNDArray<complext<float> >& vec_out, bool adjoint);
+template EXPORTGPUCORE void Gadgetron::sparseMV<complext<double> >(complext<double> alpha,complext<double> beta, const cuCsrMatrix<complext<double> > & mat, const cuNDArray<complext<double> > & vec_in, cuNDArray<complext<double> >& vec_out, bool adjoint);
diff --git a/toolboxes/core/gpu/cuSparseMatrix.h b/toolboxes/core/gpu/cuSparseMatrix.h
new file mode 100644
index 0000000..ca713ff
--- /dev/null
+++ b/toolboxes/core/gpu/cuSparseMatrix.h
@@ -0,0 +1,49 @@
+/*
+ * cuSparseMatrix.h
+ *
+ *  Created on: Jan 28, 2015
+ *      Author: u051747
+ */
+
+#pragma once
+#include "cusparse_v2.h"
+#include "gpucore_export.h"
+#include <thrust/device_vector.h>
+#include "cuNDArray.h"
+#include "cudaDeviceManager.h"
+
+namespace Gadgetron{
+
+EXPORTGPUCORE std::string gadgetron_getCusparseErrorString(cusparseStatus_t err);
+
+template<class T> struct cuCsrMatrix {
+
+	cuCsrMatrix(){
+	CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
+	}
+
+	~cuCsrMatrix(){
+		CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
+	}
+
+
+	int m,n, nnz;
+	thrust::device_vector<int> csrRow, csrColdnd;
+	thrust::device_vector<T> data;
+	cusparseMatDescr_t descr;
+};
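+
+/**
+ * Usage sketch (illustration only; the host-side vectors are hypothetical):
+ * populating a cuCsrMatrix from CSR data assembled on the host. csrRow holds the
+ * m+1 row offsets, csrColdnd the column index of each stored entry, data the values.
+ *
+ *   cuCsrMatrix<float> A;
+ *   A.m = rows; A.n = cols; A.nnz = (int)host_values.size();
+ *   A.csrRow.assign( host_row_ptr.begin(), host_row_ptr.end() );
+ *   A.csrColdnd.assign( host_col_ind.begin(), host_col_ind.end() );
+ *   A.data.assign( host_values.begin(), host_values.end() );
+ */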
+
+/**
+ * Performs a sparse matrix-vector multiplication: vec_out = alpha*mat*vec_in + beta*vec_out
+ * @param alpha Scalar weight applied to mat*vec_in
+ * @param beta Scalar weight applied to the existing content of vec_out
+ * @param mat Sparse matrix in CSR format
+ * @param vec_in Input vector (length mat.n, or mat.m when adjoint is true)
+ * @param vec_out Output vector, updated in place (length mat.m, or mat.n when adjoint is true)
+ * @param adjoint If true, multiply by the conjugate transpose of mat
+ */
+template<class T> EXPORTGPUCORE void sparseMV(T alpha,T beta, const cuCsrMatrix<T> & mat, const cuNDArray<T> & vec_in, cuNDArray<T>& vec_out, bool adjoint=false);
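+
+/**
+ * Usage sketch (illustration only; A, x and y are hypothetical): A is a populated
+ * cuCsrMatrix<float>, x a cuNDArray<float> with A.n elements and y a cuNDArray<float>
+ * with A.m elements. With beta = 0 the previous content of the output is discarded.
+ *
+ *   sparseMV( 1.0f, 0.0f, A, x, y );          // y = A * x
+ *   sparseMV( 1.0f, 0.0f, A, y, x, true );    // x = A^H * y (adjoint)
+ */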
+
+
+
+}
diff --git a/toolboxes/core/gpu/cudaDeviceManager.cpp b/toolboxes/core/gpu/cudaDeviceManager.cpp
new file mode 100644
index 0000000..7660e37
--- /dev/null
+++ b/toolboxes/core/gpu/cudaDeviceManager.cpp
@@ -0,0 +1,264 @@
+#include "cudaDeviceManager.h"
+#include "check_CUDA.h"
+#include "cuNDArray_blas.h"
+
+#include <boost/thread/mutex.hpp>
+#include <boost/shared_array.hpp>
+#include <cuda_runtime_api.h>
+#include <stdlib.h>
+#include <sstream>
+
+namespace Gadgetron{
+
+  static boost::shared_array<boost::mutex> _mutex;
+  static boost::shared_array<boost::mutex> _sparseMutex;
+
+  cudaDeviceManager* cudaDeviceManager::_instance = 0;
+
+  cudaDeviceManager::cudaDeviceManager() {
+
+    // This constructor is executed only once for a singleton
+    //
+
+    atexit(&CleanUp);
+
+    if( cudaGetDeviceCount( &_num_devices ) != cudaSuccess) {
+      _num_devices = 0;
+      throw cuda_error( "Error: no Cuda devices present.");
+    }
+
+    _mutex = boost::shared_array<boost::mutex>(new boost::mutex[_num_devices]);
+    _sparseMutex = boost::shared_array<boost::mutex>(new boost::mutex[_num_devices]);
+
+    int old_device;
+    if( cudaGetDevice(&old_device) != cudaSuccess ) {
+      throw std::runtime_error( "Error: unable to get device no");
+    }
+
+    _total_global_mem = std::vector<size_t>(_num_devices,0);
+    _shared_mem_per_block = std::vector<size_t>(_num_devices,0);
+    _warp_size = std::vector<int>(_num_devices,0);
+    _max_blockdim = std::vector<int>(_num_devices,0);
+    _max_griddim = std::vector<int>(_num_devices,0);
+    _major = std::vector<int>(_num_devices,0);
+    _minor = std::vector<int>(_num_devices,0);
+    _handle = std::vector<cublasHandle_t>(_num_devices, (cublasContext*)0x0);
+    _sparse_handle = std::vector<cusparseHandle_t>(_num_devices, (cusparseHandle_t)0x0);
+
+    for( int device=0; device<_num_devices; device++ ){
+
+      if( cudaSetDevice(device) != cudaSuccess ) {
+        throw cuda_error( "Error: unable to set device no");
+      }
+
+      cudaDeviceProp deviceProp;
+
+      if( cudaGetDeviceProperties( &deviceProp, device ) != cudaSuccess) {
+        throw cuda_error("Error: unable to determine device properties.");
+      }
+
+      _total_global_mem[device] = deviceProp.totalGlobalMem;
+      _shared_mem_per_block[device] = deviceProp.sharedMemPerBlock;
+      _warp_size[device] = deviceProp.warpSize;
+      _max_blockdim[device] = deviceProp.maxThreadsDim[0];
+      _max_griddim[device] = deviceProp.maxGridSize[0];
+      _major[device] = deviceProp.major;
+      _minor[device] = deviceProp.minor;
+    }
+
+    if( cudaSetDevice(old_device) != cudaSuccess ) {
+      throw cuda_error( "Error: unable to restore device no");
+    }
+  }
+
+  cudaDeviceManager::~cudaDeviceManager() 
+  {
+
+    for (int device = 0; device < _num_devices; device++){
+      if (_handle[device] != NULL)
+        cublasDestroy(_handle[device]);
+      if (_sparse_handle[device] != NULL)
+      	cusparseDestroy(_sparse_handle[device]);
+    }
+  }
+
+  size_t cudaDeviceManager::total_global_mem()
+  {
+    int device;
+    CUDA_CALL(cudaGetDevice(&device));
+    return _total_global_mem[device];
+  }
+
+  size_t cudaDeviceManager::shared_mem_per_block()
+  {
+    int device;
+    CUDA_CALL(cudaGetDevice(&device));
+    return _shared_mem_per_block[device];
+  }
+
+  int cudaDeviceManager::max_blockdim()
+  {
+    int device;
+    CUDA_CALL(cudaGetDevice(&device));
+    return _max_blockdim[device];
+  }
+
+  int cudaDeviceManager::max_griddim()
+  {
+    int device;
+    CUDA_CALL(cudaGetDevice(&device));
+    return _max_griddim[device];
+  }
+
+  int cudaDeviceManager::warp_size()
+  {
+    int device;
+    CUDA_CALL(cudaGetDevice(&device));
+    return _warp_size[device];
+  }
+
+  int cudaDeviceManager::major_version()
+  {
+    int device;
+    CUDA_CALL(cudaGetDevice(&device));
+    return _major[device];
+  }
+
+  int cudaDeviceManager::minor_version()
+  {
+    int device;
+    CUDA_CALL(cudaGetDevice(&device));
+    return _minor[device];
+  }
+
+  size_t cudaDeviceManager::getFreeMemory()
+  {
+    size_t free,total;
+    CUDA_CALL(cudaMemGetInfo(&free,&total));
+    return free;
+  }
+
+  size_t cudaDeviceManager::getTotalMemory()
+  {
+    size_t free,total;
+    CUDA_CALL(cudaMemGetInfo(&free,&total));
+    return total;
+  }
+
+  size_t cudaDeviceManager::getFreeMemory(int device)
+  {
+    int oldDevice;
+    CUDA_CALL(cudaGetDevice(&oldDevice));
+    CUDA_CALL(cudaSetDevice(device));
+    size_t ret = getFreeMemory();
+    CUDA_CALL(cudaSetDevice(oldDevice));
+    return ret;
+  }
+
+  size_t cudaDeviceManager::getTotalMemory(int device)
+  {
+    int oldDevice;
+    CUDA_CALL(cudaGetDevice(&oldDevice));
+    CUDA_CALL(cudaSetDevice(device));
+    size_t ret = getTotalMemory();
+    CUDA_CALL(cudaSetDevice(oldDevice));
+    return ret;
+  }
+
+  cudaDeviceManager* cudaDeviceManager::Instance()
+  {
+    if (_instance == 0 ) _instance = new cudaDeviceManager;
+    return _instance;
+  }
+
+  cublasHandle_t cudaDeviceManager::lockHandle()
+  {
+    int device;
+    CUDA_CALL(cudaGetDevice(&device));
+    return lockHandle(device);
+  }
+
+  cublasHandle_t cudaDeviceManager::lockHandle(int device)
+  {
+    _mutex[device].lock();
+    if (_handle[device] == NULL){
+      cublasStatus_t ret = cublasCreate(&_handle[device]);
+      if (ret != CUBLAS_STATUS_SUCCESS) {
+      	std::stringstream ss;
+      	ss << "Error: unable to create cublas handle for device " << device << " : ";
+        ss << gadgetron_getCublasErrorString(ret) << std::endl;
+      	throw cuda_error(ss.str());
+      }
+      cublasSetPointerMode( _handle[device], CUBLAS_POINTER_MODE_HOST );
+    }
+    return _handle[device];
+  }
+
+  void cudaDeviceManager::unlockHandle()
+  {
+    int device;
+    CUDA_CALL(cudaGetDevice(&device));
+    return unlockHandle(device);
+  }
+
+  void cudaDeviceManager::unlockHandle(int device)
+  {
+    _mutex[device].unlock();
+  }
+
+  cusparseHandle_t cudaDeviceManager::lockSparseHandle()
+  {
+    int device;
+    CUDA_CALL(cudaGetDevice(&device));
+    return lockSparseHandle(device);
+  }
+
+  cusparseHandle_t cudaDeviceManager::lockSparseHandle(int device)
+  {
+    _sparseMutex[device].lock();
+    if (_sparse_handle[device] == NULL){
+      cusparseStatus_t ret = cusparseCreate(&_sparse_handle[device]);
+      if (ret != CUSPARSE_STATUS_SUCCESS) {
+      	std::stringstream ss;
+      	ss << "Error: unable to create cusparse handle for device " << device << " : ";
+        ss << gadgetron_getCusparseErrorString(ret) << std::endl;
+      	throw cuda_error(ss.str());
+      }
+      cusparseSetPointerMode(_sparse_handle[device],CUSPARSE_POINTER_MODE_HOST);
+      //cublasSetPointerMode( _handle[device], CUBLAS_POINTER_MODE_HOST );
+    }
+    return _sparse_handle[device];
+  }
+
+  void cudaDeviceManager::unlockSparseHandle()
+  {
+    int device;
+    CUDA_CALL(cudaGetDevice(&device));
+    return unlockSparseHandle(device);
+  }
+
+  void cudaDeviceManager::unlockSparseHandle(int device)
+  {
+    _sparseMutex[device].unlock();
+  }
+
+
+  int cudaDeviceManager::getCurrentDevice()
+  {
+    int device;
+    CUDA_CALL(cudaGetDevice(&device));
+    return device;
+  }
+
+  int cudaDeviceManager::getTotalNumberOfDevice()
+  {
+    int number_of_devices;
+    CUDA_CALL(cudaGetDeviceCount(&number_of_devices));
+    return number_of_devices;
+  }
+
+  void cudaDeviceManager::CleanUp()
+  {
+    delete _instance; _instance = 0;
+  }
+}
diff --git a/toolboxes/core/gpu/cudaDeviceManager.h b/toolboxes/core/gpu/cudaDeviceManager.h
new file mode 100644
index 0000000..d8d73ed
--- /dev/null
+++ b/toolboxes/core/gpu/cudaDeviceManager.h
@@ -0,0 +1,89 @@
+#pragma once
+
+#include "gpucore_export.h"
+
+#include <vector>
+#include <cublas_v2.h>
+#include "cuSparseMatrix.h"
+
+namespace Gadgetron{
+
+  class EXPORTGPUCORE cudaDeviceManager 
+  {
+  public:
+
+    // This class is used as a singleton.
+    // Use Instance() to access the public member functions.
+    //
+
+    static cudaDeviceManager* Instance();
+    
+    // Public member functions.
+    // If the function does not take a device id, it will use the current device.
+    //
+
+    inline size_t total_global_mem(int device){ return _total_global_mem[device]; }
+    inline size_t shared_mem_per_block(int device){ return _shared_mem_per_block[device]; }
+    inline int warp_size(int device){ return _warp_size[device]; }
+    inline int max_blockdim(int device){ return _max_blockdim[device]; }
+    inline int max_griddim(int device){ return _max_griddim[device]; }
+    inline int major_version(int device){ return _major[device]; }
+    inline int minor_version(int device){ return _minor[device]; }
+
+    size_t total_global_mem();
+    size_t shared_mem_per_block();
+    int major_version();
+    int minor_version();
+    int warp_size();
+    int max_blockdim();
+    int max_griddim();
+
+    int getCurrentDevice();
+
+    int getTotalNumberOfDevice();
+
+    size_t getFreeMemory();
+    size_t getFreeMemory(int device);
+
+    size_t getTotalMemory();
+    size_t getTotalMemory(int device);
+
+    // Access to Cublas is protected by a mutex
+    // Despite what the Cublas manual claims, we have not found it thread safe.
+
+    cublasHandle_t lockHandle();
+    cublasHandle_t lockHandle(int device);
+
+    void unlockHandle();
+    void unlockHandle(int device);
+
+    cusparseHandle_t lockSparseHandle();
+    cusparseHandle_t lockSparseHandle(int device);
+
+    void unlockSparseHandle();
+    void unlockSparseHandle(int device);
+
+
+  private:
+
+    // Use the Instance() method to access the singleton
+    //
+
+    cudaDeviceManager();
+    ~cudaDeviceManager();
+
+    static void CleanUp();
+    
+    int _num_devices;
+    std::vector<size_t> _total_global_mem; // in bytes
+    std::vector<size_t> _shared_mem_per_block; // in bytes
+    std::vector<int> _warp_size;
+    std::vector<int> _max_blockdim;
+    std::vector<int> _max_griddim;
+    std::vector<int> _major;
+    std::vector<int> _minor;
+    std::vector<cublasHandle_t> _handle;
+    std::vector<cusparseHandle_t> _sparse_handle;
+    static cudaDeviceManager * _instance;
+  };
+}
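
A short sketch of how the singleton above is meant to be used; device_manager_example is a hypothetical helper and the cublasSscal call is just an arbitrary stand-in for work done while the shared handle is held under its mutex.

    #include "cudaDeviceManager.h"

    void device_manager_example(float* device_vector, int n)
    {
      using namespace Gadgetron;

      cudaDeviceManager* mgr = cudaDeviceManager::Instance();

      size_t free_bytes = mgr->getFreeMemory(); // free memory on the current device
      int warp          = mgr->warp_size();     // warp size of the current device

      // The cuBLAS handle is shared per device and protected by a mutex:
      // lock it, use it, unlock it.
      cublasHandle_t handle = mgr->lockHandle();
      float two = 2.0f;
      cublasSscal(handle, n, &two, device_vector, 1); // scale the device vector in place
      mgr->unlockHandle();

      (void)free_bytes; (void)warp;
    }
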
diff --git a/toolboxes/core/gpu/gpucore_export.h b/toolboxes/core/gpu/gpucore_export.h
new file mode 100644
index 0000000..c6d72f3
--- /dev/null
+++ b/toolboxes/core/gpu/gpucore_export.h
@@ -0,0 +1,18 @@
+/** \file gpucore_export.h
+    \brief Required definitions for Windows, importing/exporting dll symbols 
+*/
+
+#ifndef GPUCORE_EXPORT_H_
+#define GPUCORE_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_GPUCORE__) || defined (gpucore_EXPORTS)
+#define EXPORTGPUCORE __declspec(dllexport)
+#else
+#define EXPORTGPUCORE __declspec(dllimport)
+#endif
+#else
+#define EXPORTGPUCORE
+#endif
+
+#endif /* GPUCORE_EXPORT_H_ */
diff --git a/toolboxes/core/gpu/hoCuNDArray.h b/toolboxes/core/gpu/hoCuNDArray.h
new file mode 100644
index 0000000..b8f001e
--- /dev/null
+++ b/toolboxes/core/gpu/hoCuNDArray.h
@@ -0,0 +1,171 @@
+/** \file hoCuNDArray.h
+    \brief CPU-based N-dimensional array (data container) for cpu->gpu->cpu (hoCu) solvers.
+
+    This class exists mainly to provide a unique array type for the hoCu-based math in
+    hoCuNDArray_operators.h, hoCuNDArray_elemwise.h, and hoCuNDArray_blas.h.
+    Unfortunately C++ (prior to C++11) does not let a derived class inherit its base class's constructors, so they are redefined here.
+*/
+
+#pragma once
+
+#include "hoNDArray.h"
+#include "check_CUDA.h"
+
+namespace Gadgetron{
+
+  template<class T> class hoCuNDArray: public hoNDArray<T>
+  {
+  public:
+
+    hoCuNDArray() : hoNDArray<T>::hoNDArray() {}
+
+
+#if __cplusplus > 199711L
+    hoCuNDArray(hoCuNDArray<T>&& other) : hoNDArray<T>::hoNDArray(){
+    	this->data_ = other.data_;
+    	this->dimensions_ = other.dimensions_;
+    	this->elements_ = other.elements_;
+    	other.dimensions_.reset();
+    	other.data_ = nullptr;
+    }
+#endif
+
+    hoCuNDArray(std::vector<size_t> *dimensions) : hoNDArray<T>::hoNDArray() {
+      this->create(dimensions);
+    }
+
+    hoCuNDArray(std::vector<size_t> &dimensions) : hoNDArray<T>::hoNDArray() {
+      this->create(dimensions);
+    }
+  
+    hoCuNDArray(boost::shared_ptr< std::vector<size_t> > dimensions) : hoNDArray<T>::hoNDArray() {
+      this->create(dimensions.get());
+    }
+  
+    hoCuNDArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false) : hoNDArray<T>::hoNDArray() {
+      this->create(dimensions, data, delete_data_on_destruct);
+    }
+
+    hoCuNDArray(std::vector<size_t> &dimensions, T* data, bool delete_data_on_destruct = false) : hoNDArray<T>::hoNDArray() {
+      this->create(dimensions, data, delete_data_on_destruct);
+    }
+  
+    hoCuNDArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct = false) : hoNDArray<T>::hoNDArray() {
+      this->create(dimensions.get(), data, delete_data_on_destruct);
+    }
+
+    // Copy constructors
+    hoCuNDArray(const hoNDArray<T> &a): hoNDArray<T>(){
+      this->data_ = 0;
+      this->dimensions_ = boost::shared_ptr< std::vector<size_t> >(new std::vector<size_t>(*a.get_dimensions()));
+      this->allocate_memory();
+      memcpy(this->data_, a.get_data_ptr(), this->elements_*sizeof(T));
+    }
+
+    hoCuNDArray(const hoNDArray<T> *a): hoNDArray<T>(){
+      if(!a) throw std::runtime_error("hoCuNDArray::hoCuNDArray(): 0x0 pointer provided.");
+      this->data_ = 0;
+      this->dimensions_ = boost::shared_ptr< std::vector<size_t> >(new std::vector<size_t>(*a->get_dimensions()));
+      this->allocate_memory();
+      memcpy(this->data_, a->get_data_ptr(), this->elements_*sizeof(T));
+    }
+
+    hoCuNDArray(const hoCuNDArray<T> &a): hoNDArray<T>(){
+      this->data_ = 0;
+      this->dimensions_ = boost::shared_ptr< std::vector<size_t> >(new std::vector<size_t>(*a.get_dimensions()));
+      this->allocate_memory();
+      memcpy(this->data_, a.get_data_ptr(), this->elements_*sizeof(T));
+    }
+
+    hoCuNDArray(const hoCuNDArray<T> *a): hoNDArray<T>(){
+      if(!a) throw std::runtime_error("hoCuNDArray::hoCuNDArray(): 0x0 pointer provided.");
+      this->data_ = 0;
+      this->dimensions_ = boost::shared_ptr< std::vector<size_t> >(new std::vector<size_t>(*a->get_dimensions()));
+      this->allocate_memory();
+      memcpy(this->data_, a->get_data_ptr(), this->elements_*sizeof(T));
+    }
+
+    virtual ~hoCuNDArray() {
+      if (this->delete_data_on_destruct_) {
+        this->deallocate_memory();
+      }
+    }
+
+    T& at( size_t idx ){
+      if( idx >= this->get_number_of_elements() ){
+        throw std::runtime_error("hoCuNDArray::at(): index out of range.");
+      }
+      return this->data_[idx];
+    }
+  
+    T& operator[]( size_t idx ){
+      if( idx >= this->get_number_of_elements() ){
+        throw std::runtime_error("hoCuNDArray::operator[]: index out of range.");
+      }
+      return this->data_[idx];
+    }
+
+    hoCuNDArray<T>& operator=(const hoCuNDArray<T>& rhs)
+    {
+        if ( &rhs == this ) return *this;
+
+        if ( rhs.get_number_of_elements() == 0 ){
+            this->clear();
+            return *this;
+        }
+
+        // Are the dimensions the same? Then we can just memcpy
+        if (this->dimensions_equal(&rhs)){
+            memcpy(this->data_, rhs.data_, this->elements_*sizeof(T));
+        }
+        else{
+            deallocate_memory();
+            this->data_ = 0;
+            *(this->dimensions_) = *(rhs.dimensions_);
+            *(this->offsetFactors_) = *(rhs.offsetFactors_);
+            this->allocate_memory();
+            memcpy( this->data_, rhs.data_, this->elements_*sizeof(T) );
+        }
+        return *this;
+    }
+
+#if __cplusplus > 199711L
+    hoCuNDArray<T>& operator=(hoCuNDArray<T>&& rhs)
+    {
+        if ( &rhs == this ) return *this;
+
+        this->clear();
+        this->dimensions_ = rhs.dimensions_;
+        this->offsetFactors_ = rhs.offsetFactors_;
+        rhs.dimensions_.reset();
+        rhs.offsetFactors_.reset();
+        this->data_ = rhs.data_;
+        rhs.data_ = nullptr;
+        return *this;
+    }
+#endif
+  protected:
+
+    virtual void allocate_memory()
+    {
+      this->deallocate_memory();
+      this->elements_ = 1;
+      if (this->dimensions_->empty())
+        throw std::runtime_error("hoCuNDArray::allocate_memory() : dimensions is empty.");
+      for (size_t i = 0; i < this->dimensions_->size(); i++) {
+        this->elements_ *= (*this->dimensions_)[i];
+      }
+
+      size_t size = this->elements_ * sizeof(T);
+      CUDA_CALL(cudaMallocHost((void**)&this->data_,size));
+    }
+
+    virtual void deallocate_memory()
+    {
+      if (this->data_) {
+        CUDA_CALL(cudaFreeHost(this->data_));
+        this->data_ = 0;
+      }
+    }
+  };
+}
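
A minimal sketch of what hoCuNDArray adds over hoNDArray: its buffer is allocated with cudaMallocHost, so host-to-device copies go over the pinned-memory path. The dimensions, fill values and the helper name pinned_array_example are arbitrary illustration choices.

    #include "hoCuNDArray.h"
    #include "cuNDArray.h"
    #include <cuda_runtime_api.h>
    #include <vector>

    void pinned_array_example()
    {
      using namespace Gadgetron;

      std::vector<size_t> dims(2, 128);       // 128 x 128

      hoCuNDArray<float> host_array(&dims);   // pinned (page-locked) host memory
      cuNDArray<float>   device_array(&dims); // device memory

      for (size_t i = 0; i < host_array.get_number_of_elements(); i++)
        host_array[i] = float(i);

      cudaMemcpy(device_array.get_data_ptr(), host_array.get_data_ptr(),
                 host_array.get_number_of_elements()*sizeof(float),
                 cudaMemcpyHostToDevice);
    }
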
diff --git a/toolboxes/core/gpu/hoCuNDArray_blas.cpp b/toolboxes/core/gpu/hoCuNDArray_blas.cpp
new file mode 100644
index 0000000..51947cf
--- /dev/null
+++ b/toolboxes/core/gpu/hoCuNDArray_blas.cpp
@@ -0,0 +1,260 @@
+#include "hoCuNDArray_blas.h"
+#include "cuNDArray_blas.h"
+#include "complext.h"
+#include "check_CUDA.h"
+
+namespace Gadgetron{
+
+#define CUBLAS_CALL(fun) {cublasStatus_t err = fun; if (err != CUBLAS_STATUS_SUCCESS) {throw cuda_error(gadgetron_getCublasErrorString(err));}}
+
+  // These are defined in cuNDArray_blas.cu
+  //
+
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_axpy(cublasHandle_t hndl, int n, const T* a , const T* x , int incx,  T* y, int incy);
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_dot(cublasHandle_t, int, const T*, int, const  T*, int, T*, bool cc = true);
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_nrm2(cublasHandle_t, int, const T*, int, typename realType<T>::Type *result);
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_amax(cublasHandle_t handle, int n,const T *x, int incx, int *result);
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_amin(cublasHandle_t handle, int n,const T *x, int incx, int *result);
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_asum(cublasHandle_t handle, int n,const T *x, int incx, typename realType<T>::Type *result);
+
+  template<class T> EXPORTGPUCORE void axpy( T a, hoCuNDArray<T>* x, hoCuNDArray<T>* y )
+  {
+    int device = cudaDeviceManager::Instance()->getCurrentDevice();
+    size_t free = cudaDeviceManager::Instance()->getFreeMemory(device);
+    size_t batchSize = 1024*1024*(free/(sizeof(T)*2*1024*1024)); // Copy in multiples of 1M elements; the two staging buffers must fit in free device memory
+    size_t remaining = x->get_number_of_elements();
+    batchSize = std::min(batchSize,remaining);
+    T* x_ptr = x->get_data_ptr();
+    T* y_ptr = y->get_data_ptr();
+    std::vector<size_t> dims;
+    dims.push_back(batchSize);
+    cuNDArray<T> cuX(&dims);
+    cuNDArray<T> cuY(&dims);
+
+    for (size_t i = 0; i < (x->get_number_of_elements()-1)/batchSize+1; i++){
+
+      size_t curSize = std::min(batchSize,remaining);
+
+      CUDA_CALL(cudaMemcpy(cuX.get_data_ptr(),x_ptr+i*batchSize,curSize*sizeof(T),cudaMemcpyHostToDevice));
+      CUDA_CALL(cudaMemcpy(cuY.get_data_ptr(),y_ptr+i*batchSize,curSize*sizeof(T),cudaMemcpyHostToDevice));
+
+      CUBLAS_CALL(cublas_axpy(cudaDeviceManager::Instance()->lockHandle(device), curSize,
+			      &a, cuX.get_data_ptr(), 1, cuY.get_data_ptr(), 1));
+
+      cudaDeviceManager::Instance()->unlockHandle(device);
+    
+      CUDA_CALL(cudaMemcpy(y_ptr+i*batchSize,cuY.get_data_ptr(),curSize*sizeof(T),cudaMemcpyDeviceToHost));
+      remaining -= batchSize;
+    }
+  }
+
+  template<class T> EXPORTGPUCORE void axpy( T a, hoCuNDArray< complext<T> >*x, hoCuNDArray< complext<T> > *y )
+  {
+    axpy( complext<T>(a), x, y );
+  }
+
+  template<class T> EXPORTGPUCORE T dot( hoCuNDArray<T> *x, hoCuNDArray<T> *y, bool cc )
+  {
+    int device = cudaDeviceManager::Instance()->getCurrentDevice();
+    size_t free = cudaDeviceManager::Instance()->getFreeMemory(device);
+    size_t batchSize = 1024*1024*(free/(sizeof(T)*2*1024*1024)); // Copy in multiples of 1M elements; the two staging buffers must fit in free device memory
+    size_t remaining = x->get_number_of_elements();
+    batchSize = std::min(batchSize,remaining);
+    T* x_ptr = x->get_data_ptr();
+    T* y_ptr = y->get_data_ptr();
+    std::vector<size_t> dims;
+    dims.push_back(batchSize);
+    cuNDArray<T> cuX(&dims);
+    cuNDArray<T> cuY(&dims);
+    T ret = T(0);
+
+    for (size_t i = 0; i < (x->get_number_of_elements()-1)/batchSize+1; i++){
+    
+      size_t curSize = std::min(batchSize,remaining);
+
+      CUDA_CALL(cudaMemcpy(cuX.get_data_ptr(),x_ptr+i*batchSize,curSize*sizeof(T),cudaMemcpyHostToDevice));
+      CUDA_CALL(cudaMemcpy(cuY.get_data_ptr(),y_ptr+i*batchSize,curSize*sizeof(T),cudaMemcpyHostToDevice));
+
+      T cur_ret;
+      CUBLAS_CALL(cublas_dot( cudaDeviceManager::Instance()->lockHandle(device), curSize,
+			      cuX.get_data_ptr(), 1,
+			      cuY.get_data_ptr(), 1,
+			      &cur_ret, cc ));
+
+      cudaDeviceManager::Instance()->unlockHandle(device);
+
+      remaining -= batchSize;
+      ret += cur_ret;
+    }
+    return ret;
+  }
+
+  template<class T> EXPORTGPUCORE typename realType<T>::Type nrm2( hoCuNDArray<T>* x )
+  {
+    typedef typename realType<T>::Type REAL;
+    int device = cudaDeviceManager::Instance()->getCurrentDevice();
+    size_t free = cudaDeviceManager::Instance()->getFreeMemory(device);
+    size_t batchSize = 1024*1024*(free/(sizeof(T)*1024*1024)); // Copy in multiples of 1M elements; the staging buffer must fit in free device memory
+    size_t remaining = x->get_number_of_elements();
+    batchSize = std::min(batchSize,remaining);
+    T* x_ptr = x->get_data_ptr();
+    std::vector<size_t> dims;
+    dims.push_back(batchSize);
+    cuNDArray<T> cuX(&dims);
+    REAL ret = 0;
+
+    for (size_t i = 0; i < (x->get_number_of_elements()-1)/batchSize+1; i++){
+
+      size_t curSize = std::min(batchSize,remaining);
+      CUDA_CALL(cudaMemcpy(cuX.get_data_ptr(),x_ptr+i*batchSize,curSize*sizeof(T),cudaMemcpyHostToDevice));
+
+      REAL cur_ret;
+      CUBLAS_CALL(cublas_nrm2<T>( cudaDeviceManager::Instance()->lockHandle(device), curSize,
+				  cuX.get_data_ptr(), 1, &cur_ret));
+
+      cudaDeviceManager::Instance()->unlockHandle(device);
+
+      remaining -= batchSize;
+      ret += cur_ret*cur_ret;
+    }
+    return std::sqrt(ret);
+  }
+
+  template<class T> EXPORTGPUCORE typename realType<T>::Type asum( hoCuNDArray<T>* x )
+  {
+    typedef typename realType<T>::Type REAL;
+    int device = cudaDeviceManager::Instance()->getCurrentDevice();
+    size_t free = cudaDeviceManager::Instance()->getFreeMemory(device);
+    size_t batchSize = 1024*1024*(free/(sizeof(T)*1024*1024)); // Copy in multiples of 1M elements; the staging buffer must fit in free device memory
+    size_t remaining = x->get_number_of_elements();
+    batchSize = std::min(batchSize,remaining);
+    T* x_ptr = x->get_data_ptr();
+    std::vector<size_t> dims;
+    dims.push_back(batchSize);
+    cuNDArray<T> cuX(&dims);
+    REAL ret = 0;
+
+    for (size_t i = 0; i < (x->get_number_of_elements()-1)/batchSize+1; i++){
+
+      size_t curSize = std::min(batchSize,remaining);
+      CUDA_CALL(cudaMemcpy(cuX.get_data_ptr(),x_ptr+i*batchSize,curSize*sizeof(T),cudaMemcpyHostToDevice));
+
+      REAL cur_ret;
+      CUBLAS_CALL(cublas_asum( cudaDeviceManager::Instance()->lockHandle(device), curSize,
+			       cuX.get_data_ptr(), 1,
+			       &cur_ret));
+
+      cudaDeviceManager::Instance()->unlockHandle(device);
+
+      remaining -= batchSize;
+      ret += cur_ret;
+    }
+    return ret;
+  }
+
+  template<class T> EXPORTGPUCORE size_t amin( hoCuNDArray<T>* x )
+  {
+    int device = cudaDeviceManager::Instance()->getCurrentDevice();
+    size_t free = cudaDeviceManager::Instance()->getFreeMemory(device);
+    size_t batchSize = 1024*1024*(free/(sizeof(T)*1024*1024)); // Copy in multiples of 1M elements; the staging buffer must fit in free device memory
+    size_t remaining = x->get_number_of_elements();
+    batchSize = std::min(batchSize,remaining);
+    T* x_ptr = x->get_data_ptr();
+    std::vector<size_t> dims;
+    dims.push_back(batchSize);
+    cuNDArray<T> cuX(&dims);
+    std::vector<size_t> results;
+ 
+    for (size_t i = 0; i < (x->get_number_of_elements()-1)/batchSize+1; i++){
+
+      size_t curSize = std::min(batchSize,remaining);
+      CUDA_CALL(cudaMemcpy(cuX.get_data_ptr(),x_ptr+i*batchSize,curSize*sizeof(T),cudaMemcpyHostToDevice));
+
+      int cur_ret;
+      CUBLAS_CALL(cublas_amin( cudaDeviceManager::Instance()->lockHandle(device), curSize,
+			       cuX.get_data_ptr(), 1,
+			       &cur_ret));
+
+      cudaDeviceManager::Instance()->unlockHandle(device);
+
+      remaining -= batchSize;
+      results.push_back(cur_ret+i*batchSize-1);
+    }
+
+    size_t res =0;
+    for (size_t i =0; i < results.size(); i++){
+      if (abs(x_ptr[results[i]]) < abs(x_ptr[res])) res = results[i];
+    }
+    return res;
+  }
+
+  template<class T> EXPORTGPUCORE size_t amax( hoCuNDArray<T>* x )
+  {
+    int device = cudaDeviceManager::Instance()->getCurrentDevice();
+    size_t free = cudaDeviceManager::Instance()->getFreeMemory(device);
+    size_t batchSize = 1024*1024*(free/(sizeof(T)*1024*1024)); // Copy in multiples of 1M elements; the staging buffer must fit in free device memory
+    size_t remaining = x->get_number_of_elements();
+    batchSize = std::min(batchSize,remaining);
+    T* x_ptr = x->get_data_ptr();
+    std::vector<size_t> dims;
+    dims.push_back(batchSize);
+    cuNDArray<T> cuX(&dims);
+    std::vector<size_t> results;
+
+    for (size_t i = 0; i < (x->get_number_of_elements()-1)/batchSize+1; i++){
+
+      size_t curSize = std::min(batchSize,remaining);
+      CUDA_CALL(cudaMemcpy(cuX.get_data_ptr(),x_ptr+i*batchSize,curSize*sizeof(T),cudaMemcpyHostToDevice));
+
+      int cur_ret;
+      CUBLAS_CALL(cublas_amax( cudaDeviceManager::Instance()->lockHandle(device), curSize,
+			       cuX.get_data_ptr(), 1,
+			       &cur_ret));
+
+      cudaDeviceManager::Instance()->unlockHandle(device);
+
+      remaining -= batchSize;
+      results.push_back(cur_ret+i*batchSize-1);
+    }
+
+    size_t res =0;
+    for (size_t i =0; i < results.size(); i++){
+      if (abs(x_ptr[results[i]]) > abs(x_ptr[res])) res = results[i];
+    }
+    return res;
+  }
+
+  //
+  // Instantiation
+  //
+
+  template EXPORTGPUCORE float dot(hoCuNDArray<float>*,hoCuNDArray<float>*,bool);
+  template EXPORTGPUCORE float nrm2(hoCuNDArray<float>*);
+  template EXPORTGPUCORE void axpy(float,hoCuNDArray<float>*,hoCuNDArray<float>*);
+  template EXPORTGPUCORE size_t amin(hoCuNDArray<float>*);
+  template EXPORTGPUCORE size_t amax(hoCuNDArray<float>*);
+  template EXPORTGPUCORE float asum(hoCuNDArray<float>*);
+
+  template EXPORTGPUCORE double dot(hoCuNDArray<double>*,hoCuNDArray<double>*,bool);
+  template EXPORTGPUCORE double nrm2(hoCuNDArray<double>*);
+  template EXPORTGPUCORE void axpy(double,hoCuNDArray<double>*,hoCuNDArray<double>*);
+  template EXPORTGPUCORE size_t amin(hoCuNDArray<double>*);
+  template EXPORTGPUCORE size_t amax(hoCuNDArray<double>*);
+  template EXPORTGPUCORE double asum(hoCuNDArray<double>*);
+
+  template EXPORTGPUCORE float_complext dot(hoCuNDArray<float_complext>*,hoCuNDArray<float_complext>*,bool);
+  template EXPORTGPUCORE float nrm2(hoCuNDArray<float_complext>*);
+  template EXPORTGPUCORE void axpy(float_complext,hoCuNDArray<float_complext>*,hoCuNDArray<float_complext>*);
+  template EXPORTGPUCORE void axpy(float,hoCuNDArray<float_complext>*,hoCuNDArray<float_complext>*);
+  template EXPORTGPUCORE size_t amin(hoCuNDArray<float_complext>*);
+  template EXPORTGPUCORE size_t amax(hoCuNDArray<float_complext>*);
+  template EXPORTGPUCORE float asum(hoCuNDArray<float_complext>*);
+
+  template EXPORTGPUCORE double_complext dot(hoCuNDArray<double_complext>*,hoCuNDArray<double_complext>*,bool);
+  template EXPORTGPUCORE double nrm2(hoCuNDArray<double_complext>*);
+  template EXPORTGPUCORE void axpy(double_complext,hoCuNDArray<double_complext>*,hoCuNDArray<double_complext>*);
+  template EXPORTGPUCORE void axpy(double,hoCuNDArray<double_complext>*,hoCuNDArray<double_complext>*);
+  template EXPORTGPUCORE size_t amin(hoCuNDArray<double_complext>*);
+  template EXPORTGPUCORE size_t amax(hoCuNDArray<double_complext>*);
+  template EXPORTGPUCORE double asum(hoCuNDArray<double_complext>*);
+}
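
To make the batching arithmetic above concrete (the free-memory figure is only an example): with 2 GiB free and T = float, axpy computes batchSize = 1024*1024*(2147483648/(4*2*1048576)) = 1024*1024*256 = 268435456 elements, i.e. each of the two staging arrays cuX and cuY occupies 1 GiB, and the host data is streamed through the device in chunks of that size, with the final chunk clipped to the remaining element count via curSize.
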
diff --git a/toolboxes/core/gpu/hoCuNDArray_blas.h b/toolboxes/core/gpu/hoCuNDArray_blas.h
new file mode 100644
index 0000000..e8e44b0
--- /dev/null
+++ b/toolboxes/core/gpu/hoCuNDArray_blas.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#include "hoCuNDArray.h"
+#include "cudaDeviceManager.h"
+#include "gpucore_export.h"
+
+namespace Gadgetron{
+
+  template<class T> EXPORTGPUCORE T dot( hoCuNDArray<T> *x, hoCuNDArray<T> *y, bool cc = true );
+  
+  template<class T> EXPORTGPUCORE typename realType<T>::Type nrm2( hoCuNDArray<T> *x );
+  
+  template<class T> EXPORTGPUCORE void axpy( T a, hoCuNDArray<T> *x, hoCuNDArray<T> *y );
+  template<class T> EXPORTGPUCORE void axpy( T a, hoCuNDArray< complext<T> > *x, hoCuNDArray< complext<T> > *y );
+  
+  /**
+   * @brief Gets the index of the element with the minimum absolute value
+   * @param x Input data
+   * @return Index of the element with the smallest absolute value
+   */
+  template<class T> EXPORTGPUCORE size_t amin( hoCuNDArray<T> *x);
+  
+  /**
+   * @brief Gets the index of the element with the maximum absolute value
+   * @param x Input data
+   * @return Index of the element with the largest absolute value
+   * @details Note that this returns the C-style index and NOT the Fortran index.
+   */
+  template<class T> EXPORTGPUCORE size_t amax( hoCuNDArray<T> *x );
+  
+  template<class T> EXPORTGPUCORE typename realType<T>::Type asum( hoCuNDArray<T> *x );
+}
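
A usage sketch of the host-array BLAS interface above; the array size, fill values and the helper name host_blas_example are arbitrary. The point is that these calls accept hoCuNDArray arguments directly and perform the device staging internally.

    #include "hoCuNDArray_blas.h"
    #include <vector>

    void host_blas_example()
    {
      using namespace Gadgetron;

      std::vector<size_t> dims(1, 1024);
      hoCuNDArray<float> x(&dims), y(&dims);

      for (size_t i = 0; i < x.get_number_of_elements(); i++){
        x[i] = 1.0f;
        y[i] = 2.0f;
      }

      float  xy   = dot(&x, &y);  // inner product
      float  nrm  = nrm2(&x);     // Euclidean norm
      axpy(0.5f, &x, &y);         // y <- 0.5*x + y
      size_t imax = amax(&y);     // C-style index of the largest-magnitude element

      (void)xy; (void)nrm; (void)imax;
    }
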
diff --git a/toolboxes/core/gpu/hoCuNDArray_elemwise.h b/toolboxes/core/gpu/hoCuNDArray_elemwise.h
new file mode 100644
index 0000000..ba3aaa8
--- /dev/null
+++ b/toolboxes/core/gpu/hoCuNDArray_elemwise.h
@@ -0,0 +1,8 @@
+/**
+ * \file hoCuNDArray_elemwise.h
+ * \brief Element-wise math operations on the hoCuNDArray class. For now it simply delegates everything to the hoNDArray operators.
+ */
+
+#pragma once
+
+#include "hoNDArray_elemwise.h"
diff --git a/toolboxes/core/gpu/hoCuNDArray_math.h b/toolboxes/core/gpu/hoCuNDArray_math.h
new file mode 100644
index 0000000..671eb07
--- /dev/null
+++ b/toolboxes/core/gpu/hoCuNDArray_math.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "hoCuNDArray_blas.h"
+#include "hoCuNDArray_elemwise.h"
+#include "hoCuNDArray_utils.h"
diff --git a/toolboxes/core/gpu/hoCuNDArray_utils.h b/toolboxes/core/gpu/hoCuNDArray_utils.h
new file mode 100644
index 0000000..310a805
--- /dev/null
+++ b/toolboxes/core/gpu/hoCuNDArray_utils.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "hoCuNDArray.h"
+#include "hoNDArray_utils.h"
+#include "complext.h"
+
+namespace Gadgetron{
+
+  /**
+   * @brief Calculates the elementwise absolute value of the array
+   * @param[in] data Input data
+   * @return A new array containing the elementwise absolute value of data
+   */
+  template<class T>
+  boost::shared_ptr<hoCuNDArray<typename realType<T>::Type> > abs(hoCuNDArray<T> *data){
+    return boost::static_pointer_cast<hoCuNDArray<typename realType<T>::Type> >(abs(static_cast<hoNDArray<T>* >(data)));
+  }
+}
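
A one-line sketch of the forwarding overload above; magnitude_of is a hypothetical helper name.

    #include "hoCuNDArray_utils.h"

    // element-wise |x|; forwards to the hoNDArray implementation and re-casts the result
    boost::shared_ptr< Gadgetron::hoCuNDArray<float> > magnitude_of(Gadgetron::hoCuNDArray<float>* data)
    {
      return Gadgetron::abs(data);
    }
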
diff --git a/toolboxes/core/gpu/radial_utilities.cu b/toolboxes/core/gpu/radial_utilities.cu
new file mode 100644
index 0000000..b7f0bd7
--- /dev/null
+++ b/toolboxes/core/gpu/radial_utilities.cu
@@ -0,0 +1,481 @@
+#include "radial_utilities.h"
+#include "vector_td_operators.h"
+#include "vector_td_utilities.h"
+#include "real_utilities.h"
+#include "real_utilities_device.h"
+#include "check_CUDA.h"
+
+#include <math_constants.h>
+#include <vector>
+#include <iostream>
+
+using namespace std;
+
+namespace Gadgetron{
+
+  template<class REAL, unsigned int GOLDEN_RATIO_ANGULAR_STEP_SIZE> __inline__ __device__ REAL get_angle_step_GR();
+
+  template<> __inline__ __device__ float get_angle_step_GR<float,0>(){ return CUDART_PI_F*(3.0f-::sqrtf(5.0f))*0.5f; }   // GR_SMALLEST
+  template<> __inline__ __device__ float get_angle_step_GR<float,1>(){ return CUDART_PI_F/((::sqrtf(5.0f)+1.0f)*0.5f); } // GR_ORIGINAL
+  template<> __inline__ __device__ double get_angle_step_GR<double,0>(){ return CUDART_PI*(3.0-::sqrt(5.0))*0.5; }       // GR_SMALLEST
+  template<> __inline__ __device__ double get_angle_step_GR<double,1>(){ return CUDART_PI/((::sqrt(5.0)+1.0)*0.5); }     // GR_ORIGINAL
+
+  template<class REAL, unsigned int GOLDEN_RATIO_ANGULAR_STEP_SIZE> __global__ void
+  compute_radial_trajectory_golden_ratio_2d_kernel( typename reald<REAL,2>::Type *co, REAL angular_offset )
+  {
+    const unsigned int index = blockIdx.x*blockDim.x + threadIdx.x;              
+
+    const REAL samples_per_profile = (REAL) blockDim.x;
+    const REAL bias = samples_per_profile * REAL(0.5);
+    const REAL sample_idx_on_profile = (REAL)threadIdx.x;
+    const REAL profile = (REAL)blockIdx.x;
+    const REAL angle_step = get_angle_step_GR<REAL,GOLDEN_RATIO_ANGULAR_STEP_SIZE>();
+
+    REAL cos_angle, sin_angle;
+    gad_sincos<REAL>( (profile+angular_offset)*angle_step+get_pi<REAL>(), &sin_angle, &cos_angle );
+
+    typename reald<REAL,2>::Type sample_pos; 
+    sample_pos.vec[0] = (sample_idx_on_profile-bias)*cos_angle/samples_per_profile;
+    sample_pos.vec[1] = (sample_idx_on_profile-bias)*sin_angle/samples_per_profile;
+  
+    co[index] = sample_pos;
+  }
+
+  template<class REAL> boost::shared_ptr< cuNDArray< typename reald<REAL,2>::Type > >
+  compute_radial_trajectory_golden_ratio_2d( unsigned int num_samples_per_profile, unsigned int num_profiles_per_frame, 
+                                             unsigned int num_frames, unsigned int profile_offset, GOLDEN_RATIO_ANGULAR_STEP_SIZE mode )
+  {
+    typedef typename reald<REAL,2>::Type T;
+  
+    // Get device properties
+    int device; cudaGetDevice( &device );
+    cudaDeviceProp deviceProp; cudaGetDeviceProperties( &deviceProp, device );
+    const unsigned int warp_size = deviceProp.warpSize;
+  
+    if( num_samples_per_profile%warp_size ){
+      cout << endl << "compute_radial_trajectory_golden_ratio_2d: #samples/profile is not a multiple of the device's warp size." << endl;
+      return boost::shared_ptr< cuNDArray<T> >();
+    }
+
+    unsigned int number_of_samples_per_frame = num_samples_per_profile * num_profiles_per_frame;
+
+    // Allocate space for result
+    vector<size_t> dims; dims.push_back( number_of_samples_per_frame ); dims.push_back( num_frames );
+    boost::shared_ptr< cuNDArray<T> > co( new cuNDArray<T>(&dims) );
+  
+    if(!co.get()){
+      cout << endl << "Error:: compute_radial_trajectory_golden_ratio_2d: memory allocation failed." << endl;
+      return boost::shared_ptr< cuNDArray<T> >();
+    }
+  
+    // Set dimensions of grid/blocks.
+    dim3 dimBlock( num_samples_per_profile );
+    dim3 dimGrid( num_profiles_per_frame*num_frames );
+  
+    // Invoke kernel (nvcc has been protesting heavily on various other ways to do this...)
+    if( mode == GR_SMALLEST )
+      compute_radial_trajectory_golden_ratio_2d_kernel<REAL,0><<< dimGrid, dimBlock >>> 
+        ( co->get_data_ptr(), (REAL)profile_offset );
+    else
+      compute_radial_trajectory_golden_ratio_2d_kernel<REAL,1><<< dimGrid, dimBlock >>> 
+        ( co->get_data_ptr(), (REAL)profile_offset );
+    
+    CHECK_FOR_CUDA_ERROR();
+  
+    return co;
+  }
+  template<class REAL> __global__ static void
+  compute_radial_trajectory_variable_angle_2d_kernel( typename reald<REAL,2>::Type *co,REAL* angles, REAL one_over_num_profiles_per_frame, REAL one_over_num_frames )
+  {
+    const unsigned int index = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+
+    const REAL samples_per_profile = (REAL) blockDim.x;
+    const REAL bias = samples_per_profile * REAL(0.5);
+    const REAL sample_idx_on_profile = (REAL)threadIdx.x;
+    const int frame = blockIdx.y;
+
+
+    typename reald<REAL,2>::Type sample_pos;
+    sample_pos.vec[0] = (sample_idx_on_profile-bias)*cos(angles[frame])/samples_per_profile;
+    sample_pos.vec[1] = (sample_idx_on_profile-bias)*sin(angles[frame])/samples_per_profile;
+
+    co[index] = sample_pos;
+  }
+
+  template<class REAL> boost::shared_ptr< cuNDArray< typename reald<REAL,2>::Type > >
+  compute_radial_trajectory_variable_angle_2d(cuNDArray<REAL>* angles, unsigned int num_samples_per_profile, unsigned int num_profiles_per_frame, unsigned int num_frames, REAL angular_offset )
+  {
+    typedef typename reald<REAL,2>::Type T;
+
+    // Get device properties
+    int device; cudaGetDevice( &device );
+    cudaDeviceProp deviceProp; cudaGetDeviceProperties( &deviceProp, device );
+    const unsigned int warp_size = deviceProp.warpSize;
+
+    if( num_samples_per_profile%warp_size ){
+      cout << endl << "Error:: compute_radial_trajectory_fixed_angle_2d: #samples/profile is not a multiple of the device's warp size." << endl;
+      return boost::shared_ptr< cuNDArray<T> >();
+    }
+
+    unsigned int number_of_samples_per_frame = num_samples_per_profile * num_profiles_per_frame;
+
+    // Allocate space for result
+    vector<size_t> dims;
+    dims.push_back( number_of_samples_per_frame );
+    dims.push_back( num_frames );
+
+    boost::shared_ptr< cuNDArray<T> > co( new cuNDArray<T>(&dims) );
+
+    // Set dimensions of grid/blocks.
+    dim3 dimBlock( num_samples_per_profile );
+    dim3 dimGrid( num_profiles_per_frame, num_frames );
+
+    // Invoke kernel
+    compute_radial_trajectory_variable_angle_2d_kernel<REAL><<< dimGrid, dimBlock >>> ( co->get_data_ptr(), angles->get_data_ptr(),REAL(1)/(REAL)num_profiles_per_frame, REAL(1)/(REAL)num_frames);
+
+    CHECK_FOR_CUDA_ERROR();
+
+    return co;
+  }
+
+  template<class REAL> __global__ void
+  compute_radial_trajectory_fixed_angle_2d_kernel( typename reald<REAL,2>::Type *co, REAL one_over_num_profiles_per_frame, REAL one_over_num_frames, REAL angular_offset )
+  {
+    const unsigned int index = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+
+    const REAL samples_per_profile = (REAL) blockDim.x;
+    const REAL bias = samples_per_profile * REAL(0.5);
+    const REAL sample_idx_on_profile = (REAL)threadIdx.x;
+    const REAL lprofile = (REAL)blockIdx.x;
+    const REAL frame = (REAL)blockIdx.y;
+
+    REAL cos_angle, sin_angle;
+    gad_sincos<REAL>( (lprofile+frame*one_over_num_frames)*one_over_num_profiles_per_frame*get_pi<REAL>()+angular_offset+get_pi<REAL>(), &sin_angle, &cos_angle );
+
+    typename reald<REAL,2>::Type sample_pos;
+    sample_pos.vec[0] = (sample_idx_on_profile-bias)*cos_angle/samples_per_profile;
+    sample_pos.vec[1] = (sample_idx_on_profile-bias)*sin_angle/samples_per_profile;
+
+    co[index] = sample_pos;
+  }
+
+
+  template<class REAL> boost::shared_ptr< cuNDArray< typename reald<REAL,2>::Type > > 
+  compute_radial_trajectory_fixed_angle_2d( unsigned int num_samples_per_profile, unsigned int num_profiles_per_frame, unsigned int num_frames, REAL angular_offset )
+  {
+    typedef typename reald<REAL,2>::Type T;
+  
+    // Get device properties
+    int device; cudaGetDevice( &device );
+    cudaDeviceProp deviceProp; cudaGetDeviceProperties( &deviceProp, device );
+    const unsigned int warp_size = deviceProp.warpSize;
+  
+    if( num_samples_per_profile%warp_size ){
+      cout << endl << "Error:: compute_radial_trajectory_fixed_angle_2d: #samples/profile is not a multiple of the device's warp size." << endl;
+      return boost::shared_ptr< cuNDArray<T> >();
+    }
+
+    unsigned int number_of_samples_per_frame = num_samples_per_profile * num_profiles_per_frame;
+
+    // Allocate space for result
+    vector<size_t> dims; 
+    dims.push_back( number_of_samples_per_frame ); 
+    dims.push_back( num_frames );
+  
+    boost::shared_ptr< cuNDArray<T> > co( new cuNDArray<T>(&dims) );
+  
+    // Set dimensions of grid/blocks.
+    dim3 dimBlock( num_samples_per_profile );
+    dim3 dimGrid( num_profiles_per_frame, num_frames );
+  
+    // Invoke kernel
+    compute_radial_trajectory_fixed_angle_2d_kernel<REAL><<< dimGrid, dimBlock >>> ( co->get_data_ptr(), REAL(1)/(REAL)num_profiles_per_frame, REAL(1)/(REAL)num_frames, angular_offset );
+  
+    CHECK_FOR_CUDA_ERROR();
+  
+    return co;
+  }
+
+  // Find the (eight) neighbors to a given radial sample index
+
+  template<class REAL, unsigned int GOLDEN_RATIO_ANGULAR_STEP_SIZE, bool GR> 
+  __inline__ __device__ typename reald<REAL,2>::Type
+  compute_radial_neighbors( REAL sample_idx_on_profile, REAL angular_offset, REAL alpha, 
+                            REAL one_over_radial_oversampling_factor, REAL one_over_num_profiles,
+                            REAL bias, REAL samples_per_profile, REAL profile, REAL num_profiles,
+                            typename reald<REAL,2>::Type * __restrict__ p1, typename reald<REAL,2>::Type * __restrict__ p2,
+                            typename reald<REAL,2>::Type * __restrict__ p3, typename reald<REAL,2>::Type * __restrict__ p4,
+                            typename reald<REAL,2>::Type * __restrict__ p5, typename reald<REAL,2>::Type * __restrict__ p6,
+                            typename reald<REAL,2>::Type * __restrict__ p7, typename reald<REAL,2>::Type * __restrict__ p8  )
+  {
+    // The sample positions (scales) can be either of the _local_ indices 'sample_idx_on_profile' or 'samples_per_projection'-'sample_idx_on_profile'
+    // Beware of "skewness" around the origin, i.e. +1 sample one one side
+    const REAL ctr_scale       = alpha*((sample_idx_on_profile-bias)*one_over_radial_oversampling_factor);
+    const REAL ctr_scale_inv   = alpha*((samples_per_profile-sample_idx_on_profile-bias)*one_over_radial_oversampling_factor);
+    const REAL prev_scale      = alpha*((sample_idx_on_profile-bias-1)*one_over_radial_oversampling_factor);
+    const REAL prev_scale_inv  = alpha*((samples_per_profile-(sample_idx_on_profile-1)-bias)*one_over_radial_oversampling_factor);
+    const REAL next_scale      = alpha*((sample_idx_on_profile-bias+1)*one_over_radial_oversampling_factor);
+    const REAL next_scale_inv  = alpha*((samples_per_profile-(sample_idx_on_profile+1)-bias)*one_over_radial_oversampling_factor);
+  
+    // Unit circle position for current projection
+    REAL cos_angle, sin_angle;
+  
+    switch(GR){
+    
+    case true: // golden ratio
+      {
+        const REAL angle_step = get_angle_step_GR<REAL,GOLDEN_RATIO_ANGULAR_STEP_SIZE>();
+        gad_sincos<REAL>( (profile+angular_offset)*angle_step, &sin_angle, &cos_angle );
+      }
+      break;	  
+    case false: // fixed angle
+      {
+        gad_sincos<REAL>( profile*one_over_num_profiles*get_pi<REAL>(), &sin_angle, &cos_angle );	}
+      break;
+    }
+  
+    // Find the normal to the current projection direction
+    typename reald<REAL,2>::Type normal; normal.vec[0] = -sin_angle; normal.vec[1] = cos_angle;
+  
+    // The position of the idx itself
+    typename reald<REAL,2>::Type sample_pos; sample_pos.vec[0] = ctr_scale*cos_angle; sample_pos.vec[1] = ctr_scale*sin_angle;
+  
+    // The positions of the previous and next sample
+    (*p1).vec[0] = prev_scale*cos_angle; (*p1).vec[1] = prev_scale*sin_angle;
+    (*p2).vec[0] = next_scale*cos_angle; (*p2).vec[1] = next_scale*sin_angle;
+  
+    // Initialize remaining points;
+    (*p3).vec[0] = (*p4).vec[0] = (*p5).vec[0] = (*p6).vec[0] = (*p7).vec[0] = (*p8).vec[0] = 
+      (*p3).vec[1] = (*p4).vec[1] = (*p5).vec[1] = (*p6).vec[1] = (*p7).vec[1] = (*p8).vec[1] = get_max<REAL>(); // far away...
+  
+    // Run through all projections to find the closest neighbors
+  
+    for( unsigned int i=0; i<num_profiles; i++ ){
+    
+      if( i == profile )
+        continue;
+    
+      // Unit circle position projection 'i'
+      switch(GR)
+        {
+        case true:
+          {
+            const REAL angle_step = get_angle_step_GR<REAL,GOLDEN_RATIO_ANGULAR_STEP_SIZE>();
+            gad_sincos<REAL>( ((REAL)i+angular_offset)*angle_step, &sin_angle, &cos_angle );
+          }
+          break;
+
+        case false:
+          {
+            gad_sincos<REAL>( (REAL)i*one_over_num_profiles*get_pi<REAL>(), &sin_angle, &cos_angle );
+          }
+          break;	
+        }
+
+      // Determine sample positions on projection
+      typename reald<REAL,2>::Type prev_pos_1;  prev_pos_1.vec[0] = prev_scale*cos_angle;      prev_pos_1.vec[1] = prev_scale*sin_angle;
+      typename reald<REAL,2>::Type prev_pos_2;  prev_pos_2.vec[0] = prev_scale_inv*cos_angle;  prev_pos_2.vec[1] = prev_scale_inv*sin_angle;
+      typename reald<REAL,2>::Type ctr_pos_1;   ctr_pos_1.vec[0]  = ctr_scale*cos_angle;       ctr_pos_1.vec[1]  = ctr_scale*sin_angle;
+      typename reald<REAL,2>::Type ctr_pos_2;   ctr_pos_2.vec[0]  = ctr_scale_inv*cos_angle;   ctr_pos_2.vec[1]  = ctr_scale_inv*sin_angle;
+      typename reald<REAL,2>::Type next_pos_1;  next_pos_1.vec[0] = next_scale*cos_angle;      next_pos_1.vec[1] = next_scale*sin_angle;
+      typename reald<REAL,2>::Type next_pos_2;  next_pos_2.vec[0] = next_scale_inv*cos_angle;  next_pos_2.vec[1] = next_scale_inv*sin_angle;
+    
+      // The dot product is used to ensure we find a neighbor on each side
+      if( dot<REAL,2>(ctr_pos_1-sample_pos, normal) > REAL(0) ){
+    
+        if( norm_squared<REAL>(ctr_pos_1-sample_pos) < norm_squared<REAL>(*p4-sample_pos) ){
+          *p3 = prev_pos_1;
+          *p4 = ctr_pos_1;
+          *p5 = next_pos_1;
+        }
+      }
+      else{
+     
+        if( norm_squared<REAL>(ctr_pos_1-sample_pos) < norm_squared<REAL>(*p7-sample_pos) ){
+          *p6 = prev_pos_1;
+          *p7 = ctr_pos_1;
+          *p8 = next_pos_1;
+        }
+      }
+  
+      // The dot product is used to ensure we find a neighbor on each side
+      if( dot<REAL,2>(ctr_pos_2-sample_pos, normal) >  REAL(0) ){
+  
+        if( norm_squared<REAL>(ctr_pos_2-sample_pos) < norm_squared<REAL>(*p4-sample_pos) ){
+          *p3 = prev_pos_2;
+          *p4 = ctr_pos_2;
+          *p5 = next_pos_2;
+        }
+      }
+      else{
+      
+        if( norm_squared<REAL>(ctr_pos_2-sample_pos) < norm_squared<REAL>(*p7-sample_pos) ){
+          *p6 = prev_pos_2;
+          *p7 = ctr_pos_2;
+          *p8 = next_pos_2;
+        }
+      }
+    }
+  
+    return sample_pos;
+  }
+
+  template<class REAL, unsigned int GOLDEN_RATIO_ANGULAR_STEP_SIZE, bool GR> __global__ void
+  compute_radial_dcw_2d_kernel( REAL alpha, REAL one_over_radial_oversampling_factor, REAL one_over_num_profiles, REAL angular_offset, REAL *dcw )
+  {
+    const REAL samples_per_profile = (REAL) (blockDim.x<<1);
+    const REAL sample_idx_on_profile = (REAL)(blockIdx.x*blockDim.x+threadIdx.x);
+    const REAL num_profiles = (REAL)gridDim.y;
+    const REAL profile = (REAL)blockIdx.y;
+    const REAL bias = samples_per_profile*REAL(0.5);
+
+    const unsigned int index = blockIdx.y*samples_per_profile + sample_idx_on_profile;
+  
+    REAL weight;
+  
+    if( sample_idx_on_profile == blockDim.x ){
+
+      // Special case - center of profile/k-space
+      const REAL radius = (alpha*one_over_radial_oversampling_factor)*REAL(0.5);
+      const REAL area = radius*radius*get_pi<REAL>();
+      weight = area/num_profiles;
+    }
+    else{
+    
+      // General case - all neighbors exist
+    
+      // Compute sample positions for the current sample and all neighbors
+      // The ordering of p1..p8 in the call below follows the edge of the "Voronoi polygon"
+    
+      typename reald<REAL,2>::Type sample_pos;
+      typename reald<REAL,2>::Type p1, p2, p3, p4, p5, p6, p7, p8;
+    
+      sample_pos = compute_radial_neighbors<REAL,GOLDEN_RATIO_ANGULAR_STEP_SIZE,GR>
+        ( sample_idx_on_profile, angular_offset, alpha, 
+          one_over_radial_oversampling_factor, one_over_num_profiles, bias, samples_per_profile, profile, num_profiles,
+          &p1, &p5, &p2, &p3, &p4, &p8, &p7, &p6 );
+    
+      // Find midpoints of lines from sample_pos to all other points.
+      p1 = REAL(0.5)*(sample_pos+p1); // computing "sample_pos+(p1-sample_pos)/2"
+      p2 = REAL(0.5)*(sample_pos+p2);
+      p3 = REAL(0.5)*(sample_pos+p3);
+      p4 = REAL(0.5)*(sample_pos+p4);
+      p5 = REAL(0.5)*(sample_pos+p5);
+      p6 = REAL(0.5)*(sample_pos+p6);
+      p7 = REAL(0.5)*(sample_pos+p7);
+      p8 = REAL(0.5)*(sample_pos+p8);
+    
+      // The weight is determined by the area of the polygon (http://local.wasp.uwa.edu.au/~pbourke/geometry/polyarea/)
+      weight = REAL(0.5)*
+        ((p1.vec[0]*p2.vec[1]-p2.vec[0]*p1.vec[1])+
+         (p2.vec[0]*p3.vec[1]-p3.vec[0]*p2.vec[1])+
+         (p3.vec[0]*p4.vec[1]-p4.vec[0]*p3.vec[1])+
+         (p4.vec[0]*p5.vec[1]-p5.vec[0]*p4.vec[1])+
+         (p5.vec[0]*p6.vec[1]-p6.vec[0]*p5.vec[1])+
+         (p6.vec[0]*p7.vec[1]-p7.vec[0]*p6.vec[1])+
+         (p7.vec[0]*p8.vec[1]-p8.vec[0]*p7.vec[1])+
+         (p8.vec[0]*p1.vec[1]-p1.vec[0]*p8.vec[1]));                        
+    
+      if( weight<REAL(0) ) weight *= -REAL(1);
+    }
+  
+    dcw[index] = weight;
+  }
+
+  template<class REAL, unsigned int GOLDEN_RATIO_ANGULAR_STEP_SIZE, bool GR> boost::shared_ptr< cuNDArray<REAL> >
+  compute_radial_dcw_2d( unsigned int samples_per_profile, unsigned int num_profiles, 
+                         REAL alpha, REAL one_over_radial_oversampling_factor, unsigned int profile_offset = 0 )
+  {
+    if( num_profiles < 4 ){
+      cout << endl << "Error:: compute_radial_dcw_<*>_2d: use at least four profiles" << endl;
+      return boost::shared_ptr< cuNDArray<REAL> >();
+    }
+  
+    // Get device properties
+    int device; cudaGetDevice( &device );
+    cudaDeviceProp deviceProp; cudaGetDeviceProperties( &deviceProp, device );
+    const unsigned int warp_size = deviceProp.warpSize;
+  
+    if( samples_per_profile%2 ){
+      cout << endl << "Error:: compute_radial_dcw_<*>_2d: samples/profile must be even." << endl;
+      return boost::shared_ptr< cuNDArray<REAL> >();
+    }
+
+    if( samples_per_profile%warp_size ){
+      cout << endl << "Error:: compute_radial_dcw_<*>_2d: samples/profile number a multiple of the device's warp size." << endl;
+      return boost::shared_ptr< cuNDArray<REAL> >();
+    }
+
+    unsigned int number_of_samples = samples_per_profile * num_profiles;
+  
+    // Allocate space for result
+    vector<size_t> dims; dims.push_back( number_of_samples );
+    boost::shared_ptr< cuNDArray<REAL> > dcw( new cuNDArray<REAL>(&dims) );
+  
+    if(!dcw.get()){
+      cout << endl << "Error:: compute_radial_dcw_<*>_2d: memory allocation failed." << endl;
+      return boost::shared_ptr< cuNDArray<REAL> >();
+    }
+  
+    // Set dimensions of grid/blocks. (division by two due to resource limitations)
+    dim3 dimBlock( samples_per_profile>>1 );
+    dim3 dimGrid( 2, num_profiles );
+  
+    // Invoke kernel
+    compute_radial_dcw_2d_kernel<REAL,GOLDEN_RATIO_ANGULAR_STEP_SIZE,GR><<< dimGrid, dimBlock >>> 
+      ( alpha, one_over_radial_oversampling_factor, REAL(1)/(REAL)num_profiles, (REAL)profile_offset, dcw->get_data_ptr() );
+  
+    CHECK_FOR_CUDA_ERROR();
+  
+    return dcw;
+  }
+
+  template<class REAL> boost::shared_ptr< cuNDArray<REAL> >
+  compute_radial_dcw_golden_ratio_2d( unsigned int samples_per_profile, unsigned int num_profiles, 
+                                      REAL alpha, REAL one_over_radial_oversampling_factor, unsigned int profile_offset,
+                                      GOLDEN_RATIO_ANGULAR_STEP_SIZE mode)
+  {
+    if( mode == GR_SMALLEST )
+      return compute_radial_dcw_2d<REAL,0,true>
+        ( samples_per_profile, num_profiles, alpha, one_over_radial_oversampling_factor, profile_offset );
+    else if( mode == GR_ORIGINAL )
+      return compute_radial_dcw_2d<REAL,1,true>
+        ( samples_per_profile, num_profiles, alpha, one_over_radial_oversampling_factor, profile_offset );
+    else
+      throw std::runtime_error("\ncompute_radial_dcw_golden_ratio_2d() :: unexpected mode\n");
+  }
+
+  template<class REAL> boost::shared_ptr< cuNDArray<REAL> >
+  compute_radial_dcw_fixed_angle_2d( unsigned int samples_per_profile, unsigned int num_profiles, 
+                                     REAL alpha, REAL one_over_radial_oversampling_factor )
+  {
+    // The golden ratio template type is ignored when the trailing template argument is false
+    return compute_radial_dcw_2d<REAL,GR_ORIGINAL,false>
+      ( samples_per_profile, num_profiles, alpha, one_over_radial_oversampling_factor );
+  }
+
+  //
+  // Instantiation
+  //
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray< typename reald<float,2>::Type > > 
+  compute_radial_trajectory_fixed_angle_2d<float>( unsigned int, unsigned int, unsigned int, float );
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray< typename reald<double,2>::Type > > 
+  compute_radial_trajectory_fixed_angle_2d<double>( unsigned int, unsigned int, unsigned int, double );
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray< typename reald<float,2>::Type > > 
+  compute_radial_trajectory_golden_ratio_2d<float>( unsigned int, unsigned int, unsigned int, unsigned int, GOLDEN_RATIO_ANGULAR_STEP_SIZE );
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray< typename reald<double,2>::Type > > 
+  compute_radial_trajectory_golden_ratio_2d<double>( unsigned int, unsigned int, unsigned int, unsigned int, GOLDEN_RATIO_ANGULAR_STEP_SIZE );
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> >compute_radial_dcw_fixed_angle_2d<float>( unsigned int, unsigned int, float, float);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> >compute_radial_dcw_fixed_angle_2d<double>( unsigned int, unsigned int, double, double );
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> >
+  compute_radial_dcw_golden_ratio_2d<float>( unsigned int, unsigned int, float, float, unsigned int, GOLDEN_RATIO_ANGULAR_STEP_SIZE );
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> >
+  compute_radial_dcw_golden_ratio_2d<double>( unsigned int, unsigned int, double, double, unsigned int, GOLDEN_RATIO_ANGULAR_STEP_SIZE );
+}
diff --git a/toolboxes/core/gpu/radial_utilities.h b/toolboxes/core/gpu/radial_utilities.h
new file mode 100644
index 0000000..cf4c254
--- /dev/null
+++ b/toolboxes/core/gpu/radial_utilities.h
@@ -0,0 +1,41 @@
+#pragma once
+#include "gpucore_export.h"
+
+#include "cuNDArray.h"
+#include "vector_td.h"
+
+#include <boost/shared_ptr.hpp>
+
+namespace Gadgetron{
+
+  enum GOLDEN_RATIO_ANGULAR_STEP_SIZE {
+    GR_SMALLEST = 0, // 180*(3-sqrt(5.0))/2.0     = 68.7539 degrees
+    GR_ORIGINAL = 1  // 180/((sqrt(5.0)+1.0)/2.0) = 111.2461 degrees
+  };
+
+   // Compute variable angle radial trajectory in the normalized range [-1/2;1/2]
+  template<class REAL> EXPORTGPUCORE boost::shared_ptr< cuNDArray< typename reald<REAL,2>::Type > >
+  compute_radial_trajectory_variable_angle_2d(cuNDArray<REAL> * angles, unsigned int num_samples_per_profile, unsigned int num_profiles_per_frame,
+                                            unsigned int num_frames, REAL angular_offset = REAL(0) );
+ // Compute fixed angle radial trajectory in the normalized range [-1/2;1/2]
+  template<class REAL> EXPORTGPUCORE boost::shared_ptr< cuNDArray< typename reald<REAL,2>::Type > >
+  compute_radial_trajectory_fixed_angle_2d( unsigned int num_samples_per_profile, unsigned int num_profiles_per_frame, 
+                                            unsigned int num_frames, REAL angular_offset = REAL(0) );
+
+  // Compute golden ratio radial trajectory in the normalized range [-1/2;1/2]
+  template<class REAL> EXPORTGPUCORE boost::shared_ptr< cuNDArray< typename reald<REAL,2>::Type > >
+  compute_radial_trajectory_golden_ratio_2d( unsigned int num_samples_per_profile, unsigned int num_profiles_per_frame, 
+                                             unsigned int num_frames, 
+                                             unsigned int profile_offset = 0, GOLDEN_RATIO_ANGULAR_STEP_SIZE = GR_ORIGINAL );
+
+  // Compute fixed angle radial density compensation weights (a function of the chosen reconstruction settings: matrix_size and oversampling factor)
+  template<class REAL> EXPORTGPUCORE boost::shared_ptr< cuNDArray<REAL> >
+  compute_radial_dcw_fixed_angle_2d( unsigned int num_samples_per_profile, unsigned int num_profiles, 
+                                     REAL alpha, REAL one_over_radial_oversampling_factor);
+
+  // Compute golden ratio radial density compensation weights (a function of the chosen reconstruction settings: matrix_size and oversampling factor)
+  template<class REAL> EXPORTGPUCORE boost::shared_ptr< cuNDArray<REAL> >
+  compute_radial_dcw_golden_ratio_2d( unsigned int num_samples_per_profile, unsigned int num_profiles, 
+                                      REAL alpha, REAL one_over_radial_oversampling_factor, 
+                                      unsigned int profile_offset = 0, GOLDEN_RATIO_ANGULAR_STEP_SIZE = GR_ORIGINAL );
+}
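
A sketch of the intended call pattern for the trajectory/DCW helpers above; the sample counts, number of frames, the oversampling-related values and the helper name radial_example are arbitrary example choices, not values prescribed by the code.

    #include "radial_utilities.h"

    void radial_example()
    {
      using namespace Gadgetron;

      unsigned int samples_per_profile = 256; // must be a multiple of the device warp size
      unsigned int profiles_per_frame  = 32;
      unsigned int frames              = 10;

      // k-space trajectory in [-1/2;1/2], one set of profiles per frame
      boost::shared_ptr< cuNDArray< reald<float,2>::Type > > traj =
        compute_radial_trajectory_golden_ratio_2d<float>(
          samples_per_profile, profiles_per_frame, frames, 0, GR_ORIGINAL );

      // matching density compensation weights (example oversampling settings)
      float alpha = 2.0f;               // grid oversampling factor
      float one_over_radial_os = 0.5f;  // reciprocal of the radial oversampling factor
      boost::shared_ptr< cuNDArray<float> > dcw =
        compute_radial_dcw_golden_ratio_2d<float>(
          samples_per_profile, profiles_per_frame, alpha, one_over_radial_os, 0, GR_ORIGINAL );

      (void)traj; (void)dcw;
    }
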
diff --git a/toolboxes/core/gpu/real_utilities_device.h b/toolboxes/core/gpu/real_utilities_device.h
new file mode 100644
index 0000000..57059e1
--- /dev/null
+++ b/toolboxes/core/gpu/real_utilities_device.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <math_constants.h>
+#include <math_functions.h>
+
+//
+// Math prototypes
+//
+
+template<class REAL> __inline__ __host__ __device__ void gad_sincos( REAL angle, REAL *a, REAL *b );
+template<class REAL> __inline__ __host__ __device__ REAL gad_rsqrt( REAL val );
+
+
+//
+// Implementation
+//
+
+template<> __inline__ __host__ __device__ void gad_sincos<float>( float angle, float *a, float *b ){ sincosf(angle, a,b); }
+template<> __inline__ __host__ __device__ void gad_sincos<double>( double angle, double *a, double *b ){ sincos(angle, a,b); }
+
+template<> __inline__ __host__ __device__ float gad_rsqrt<float>( float val ){ return rsqrtf(val); }
+template<> __inline__ __host__ __device__ double gad_rsqrt<double>( double val ){ return rsqrt(val); }
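+
+// Illustrative sketch (editor's addition): in templated device code the wrappers
+// above dispatch to the precision-matched CUDA intrinsics, e.g.
+//
+//   template<class REAL> __device__ REAL example_unit_sine( REAL angle ){
+//     REAL s, c;
+//     gad_sincos<REAL>( angle, &s, &c );
+//     return s * gad_rsqrt<REAL>( s*s + c*c ); // equals sin(angle), since s*s+c*c == 1
+//   }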
diff --git a/toolboxes/core/gpu/setup_grid.h b/toolboxes/core/gpu/setup_grid.h
new file mode 100644
index 0000000..34b402c
--- /dev/null
+++ b/toolboxes/core/gpu/setup_grid.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include "cudaDeviceManager.h"
+#include "GadgetronCuException.h"
+
+namespace Gadgetron {
+
+  inline 
+  void setup_grid( unsigned int number_of_elements, dim3 *blockDim, dim3* gridDim, unsigned int num_batches = 1 )
+  {    
+    int cur_device = cudaDeviceManager::Instance()->getCurrentDevice();
+    int maxGridDim = cudaDeviceManager::Instance()->max_griddim(cur_device);
+    int maxBlockDim = cudaDeviceManager::Instance()->max_blockdim(cur_device);
+
+    // The default configuration is a one-dimensional block of 256 threads
+    *blockDim = dim3(256);
+    *gridDim = dim3((number_of_elements+blockDim->x-1)/blockDim->x, num_batches);
+
+    // Extend block/grid dimensions if we exceeded the maximum grid dimension
+    if( gridDim->x > maxGridDim){
+      blockDim->x = maxBlockDim;
+      gridDim->x = (number_of_elements+blockDim->x-1)/blockDim->x;
+    }
+
+    if( gridDim->x > maxGridDim ){
+      gridDim->x = (unsigned int)std::floor(std::sqrt(float(number_of_elements)/float(blockDim->x)));
+      unsigned int num_elements_1d = blockDim->x*gridDim->x;
+      gridDim->y *= ((number_of_elements+num_elements_1d-1)/num_elements_1d);
+    }
+
+    if( gridDim->x > maxGridDim || gridDim->y > maxGridDim){
+      // If this ever becomes an issue, there is an additional grid dimension to explore for compute models >= 2.0.
+      throw cuda_error("setup_grid(): too many elements requested.");
+    }
+  }
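+
+  // Illustrative usage sketch (editor's addition): a typical kernel launch would be
+  // configured as follows; "my_kernel" is a hypothetical __global__ function.
+  //
+  //   dim3 blockDim, gridDim;
+  //   setup_grid( number_of_elements, &blockDim, &gridDim );
+  //   my_kernel<<< gridDim, blockDim >>>( data_ptr, number_of_elements );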
+}
diff --git a/toolboxes/core/real_utilities.h b/toolboxes/core/real_utilities.h
new file mode 100644
index 0000000..66aaf7d
--- /dev/null
+++ b/toolboxes/core/real_utilities.h
@@ -0,0 +1,72 @@
+/** \file real_utilities.h
+    \brief A simple template based interface to some common C float/double constants to ease writing of templated code.
+*/
+
+#pragma once
+
+#include "core_defines.h"
+
+#ifdef _USE_MATH_DEFINES
+#include <math.h>
+#else
+#define _USE_MATH_DEFINES
+#include <math.h>
+#undef _USE_MATH_DEFINES
+#endif
+
+#include <float.h>
+
+//
+// Get scalar limits of operation
+//
+
+template<class T> __inline__ __host__ __device__ T get_min();
+template<class T> __inline__ __host__ __device__ T get_max();
+template<class T> __inline__ __host__ __device__ T get_epsilon();
+
+//
+// Math prototypes
+//
+
+template<class REAL> __inline__ __host__ __device__ REAL get_pi();
+
+//
+// Implementation
+//
+
+template<> __inline__ __host__ __device__ float get_min<float>()
+{
+  return FLT_MIN;
+}
+
+template<> __inline__ __host__ __device__ double get_min<double>()
+{
+  return DBL_MIN;
+}
+
+template<> __inline__ __host__ __device__ float get_max<float>()
+{
+  return FLT_MAX;
+}
+
+template<> __inline__ __host__ __device__ double get_max<double>()
+{
+  return DBL_MAX;
+}
+
+template<> __inline__ __host__ __device__ float get_epsilon<float>()
+{
+  return FLT_EPSILON;
+}
+
+template<> __inline__ __host__ __device__ double get_epsilon<double>()
+{
+  return DBL_EPSILON;
+}
+
+template<> __inline__ __host__ __device__ float get_pi<float>(){ return (float)M_PI; }
+template<> __inline__ __host__ __device__ double get_pi<double>(){ return M_PI; }
+
+template <typename T> __inline__ __host__ __device__ int sgn(T val) {
+    return (T(0) < val) - (val < T(0));
+}
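+
+// Illustrative sketch (editor's addition): the accessors above keep templated code
+// precision-agnostic, e.g. a relative floating point comparison:
+//
+//   template<class REAL> __inline__ __host__ __device__
+//   bool almost_equal( REAL a, REAL b ){
+//     return fabs(a-b) <= get_epsilon<REAL>() * (fabs(a) + fabs(b));
+//   }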
diff --git a/toolboxes/core/vector_td.h b/toolboxes/core/vector_td.h
new file mode 100644
index 0000000..ebbe3b0
--- /dev/null
+++ b/toolboxes/core/vector_td.h
@@ -0,0 +1,317 @@
+/** \file vector_td.h
+    \brief The class vector_td defines a D-dimensional vector of type T.
+
+    The class vector_td defines a D-dimensional vector of type T.
+    It is used in the Gadgetron to represent short vectors.
+    It is purposely templated with dimensionality D of type unsigned int instead of size_t.
+    For larger vectors consider using the NDArray class instead (or a std::vector).
+    The vector_td class can be used on both the cpu and gpu.
+    The accompanying headers vector_td_operators.h and vector_td_utilities.h define most of the functionality.
+    Note that vector_td should not be used to represent complex numbers. For that we provide the custom class complext instead.
+*/
+
+#pragma once
+
+#include "core_defines.h"
+
+#include <stdlib.h> // for size_t
+
+#ifdef max
+#undef max
+#endif // max
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> class vector_td
+  {
+  public:
+
+    T vec[D];
+    __inline__ __host__ __device__ vector_td(){};
+
+#if __cplusplus > 199711L
+    template <typename... X>
+    constexpr __inline__ __host__ __device__ vector_td(X... xs) : vec{xs...} { }
+#endif
+    __inline__ __host__ __device__ vector_td(const vector_td & other){
+      for (unsigned int i = 0; i < D; i++)
+        vec[i] = other[i];
+    }
+
+    template<class T2> __inline__ __host__ __device__ explicit vector_td(const vector_td<T2,D> & other){
+      for (unsigned int i = 0; i < D; i++)
+        vec[i] = (T) other[i];
+    }
+
+    __inline__ __host__ __device__ explicit vector_td(T x){
+      for (unsigned int i = 0; i < D; i++)
+        vec[i] = x;
+    }
+    __inline__ __host__ __device__ T& operator[](const unsigned int i)
+    {
+      return vec[i];
+    }
+
+    __inline__ __host__ __device__ const T& operator[](const unsigned int i) const
+    {
+      return vec[i];
+    }
+
+    __inline__ __host__ __device__ T& operator()(const unsigned int i)
+    {
+      return vec[i];
+    }
+
+    __inline__ __host__ __device__ const T& operator()(const unsigned int i) const
+    {
+      return vec[i];
+    }
+  };
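+
+  // Illustrative usage sketch (editor's addition, not part of the upstream header):
+  //
+  //   vector_td<float,3> p( 1.0f );   // all components set to 1
+  //   p[1] = 2.0f;                    // element access via operator[]
+  //   vector_td<int,3> q( p );        // explicit component-wise conversion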
+
+  //
+  // Some typedefs for convenience (C++11 alias templates are not assumed to be available)
+  //
+
+  template< class REAL, unsigned int D > struct reald{
+    typedef vector_td< REAL, D > Type;
+  };
+
+  template< unsigned int D > struct uintd{
+    typedef vector_td< unsigned int, D > Type;
+  };
+
+  template< unsigned int D > struct uint64d{
+    typedef vector_td< size_t, D > Type;
+  };
+
+  template< unsigned int D > struct intd{
+    typedef vector_td< int, D > Type;
+  };
+
+  template< unsigned int D > struct int64d{
+    typedef vector_td< long long, D > Type;
+  };
+
+  template< unsigned int D > struct floatd{
+    typedef typename reald< float, D >::Type Type;
+  };
+
+  template< unsigned int D > struct doubled{
+    typedef typename reald< double, D >::Type Type;
+  };
+
+  template<class T> class vector_td<T,1>
+  {
+  public:
+
+    T vec[1];
+
+    __inline__ __host__ __device__ vector_td(const vector_td & other){
+      vec[0] = other[0];
+    }
+    template<class T2> __inline__ __host__ __device__ explicit vector_td(const vector_td<T2,1> & other){
+    	vec[0] = (T) other[0];
+    }
+
+    __inline__ __host__ __device__ vector_td(){}
+
+    __inline__ __host__ __device__ vector_td(T x){ // Not explicit because we actually want to be able to do implicit conversions here.
+      vec[0]=x;
+    }
+
+    __inline__ __host__ __device__ T& operator[](const unsigned int i){
+      return vec[i];
+    }
+
+    __inline__ __host__ __device__ const T& operator[](const unsigned int i) const {
+      return vec[i];
+    }
+
+    __inline__ __host__ __device__ T& operator()(const unsigned int i){
+      return vec[i];
+    }
+
+    __inline__ __host__ __device__ const T& operator()(const unsigned int i) const {
+      return vec[i];
+    }
+  };
+
+  template<class T> class vector_td<T,2>
+  {
+  public:
+
+    T vec[2];
+
+    __inline__ __host__ __device__ vector_td(const vector_td & other){
+      for (unsigned int i = 0; i < 2; i++)
+        vec[i] = other[i];
+    }
+
+    //__inline__ __host__ __device__ explicit vector_td(T (&other)[2]) : vec(other){};
+
+    template<class T2> __inline__ __host__ __device__ explicit vector_td(const vector_td<T2,2> & other){
+    	for (unsigned int i = 0; i < 2; i++)
+        	vec[i] = (T) other[i];
+     }
+#if __cplusplus > 199711L
+
+    constexpr __inline__ __host__ __device__ vector_td( T x, T y) : vec{x,y} { }
+//    template <typename... X>
+//    constexpr __inline__ __host__ __device__ vector_td(X... xs) : vec{xs...} { }
+#else
+    __inline__ __host__ __device__ vector_td(T x, T y){
+    	vec[0] = x;
+    	vec[1] = y;
+    }
+#endif
+    __inline__ __host__ __device__ vector_td(){}
+
+    __inline__ __host__ __device__ explicit vector_td(T x){
+      vec[0]=x;
+      vec[1]=x;
+    }
+    __inline__ __host__ __device__ T& operator[](const unsigned int i){
+      return vec[i];
+    }
+
+    __inline__ __host__ __device__ const T& operator[](const unsigned int i) const {
+      return vec[i];
+    }
+
+    __inline__ __host__ __device__ T& operator()(const unsigned int i){
+      return vec[i];
+    }
+
+    __inline__ __host__ __device__ const T& operator()(const unsigned int i) const {
+      return vec[i];
+    }
+  };
+
+  template<class T> class vector_td<T,3>
+  {
+  public:
+
+    T vec[3];
+
+    __inline__ __host__ __device__ vector_td(const vector_td & other){
+      for (unsigned int i = 0; i < 3; i++)
+        vec[i] = other[i];
+    }
+    template<class T2> __inline__ __host__ __device__ explicit vector_td(const vector_td<T2,3> & other){
+    	for (unsigned int i = 0; i < 3; i++)
+        	vec[i] = (T) other[i];
+     }
+    __inline__ __host__ __device__ vector_td(){}
+
+    __inline__ __host__ __device__ vector_td(T x, T y,T z){
+      vec[0]=x;
+      vec[1]=y;
+      vec[2]=z;
+    }
+
+    __inline__ __host__ __device__ explicit vector_td(T x){
+      vec[0]=x;
+      vec[1]=x;
+      vec[2]=x;
+    }
+
+    __inline__ __host__ __device__ T& operator[](const unsigned int i){
+      return vec[i];
+    }
+
+    __inline__ __host__ __device__ const T& operator[](const unsigned int i) const {
+      return vec[i];
+    }
+
+    __inline__ __host__ __device__ T& operator()(const unsigned int i){
+      return vec[i];
+    }
+
+    __inline__ __host__ __device__ const T& operator()(const unsigned int i) const {
+      return vec[i];
+    }
+  };
+
+  template<class T> class vector_td<T,4>
+  {
+  public:
+
+    T vec[4];
+
+    __inline__ __host__ __device__ vector_td(const vector_td & other){
+    	for (unsigned int i = 0; i < 4; i++)
+        	vec[i] = other[i];
+     }
+    //__inline__ __host__ __device__ explicit vector_td(T (&other)[4]) : vec(other){};
+    template<class T2> __inline__ __host__ __device__ explicit vector_td(const vector_td<T2,4> & other){
+    	for (unsigned int i = 0; i < 4; i++)
+        	vec[i] = (T) other[i];
+     }
+
+#if __cplusplus > 199711L
+    constexpr __inline__ __host__ __device__ vector_td( T x, T y, T z, T t) : vec{x,y,z,t} { }
+#else
+    __inline__ __host__ __device__ vector_td(T x, T y, T z, T t){
+    	vec[0] = x;
+    	vec[1] = y;
+    	vec[2] = z;
+    	vec[3] = t;
+    }
+#endif
+
+    __inline__ __host__ __device__ vector_td(){}
+
+    __inline__ __host__ __device__ explicit vector_td(T x){
+      vec[0]=x;
+      vec[1]=x;
+      vec[2]=x;
+      vec[3]=x;
+    }
+
+    __inline__ __host__ __device__ T& operator[](const unsigned int i){
+      return vec[i];
+    }
+
+    __inline__ __host__ __device__ const T& operator[](const unsigned int i) const {
+      return vec[i];
+    }
+
+    __inline__ __host__ __device__ T& operator()(const unsigned int i){
+      return vec[i];
+    }
+
+    __inline__ __host__ __device__ const T& operator()(const unsigned int i) const {
+      return vec[i];
+    }
+  };
+
+  typedef vector_td<unsigned int,1> uintd1;
+  typedef vector_td<unsigned int,2> uintd2;
+  typedef vector_td<unsigned int,3> uintd3;
+  typedef vector_td<unsigned int,4> uintd4;
+
+  typedef vector_td<size_t,1> uint64d1;
+  typedef vector_td<size_t,2> uint64d2;
+  typedef vector_td<size_t,3> uint64d3;
+  typedef vector_td<size_t,4> uint64d4;
+
+  typedef vector_td<int,1> intd1;
+  typedef vector_td<int,2> intd2;
+  typedef vector_td<int,3> intd3;
+  typedef vector_td<int,4> intd4;
+
+  typedef vector_td<long long,1> int64d1;
+  typedef vector_td<long long,2> int64d2;
+  typedef vector_td<long long,3> int64d3;
+  typedef vector_td<long long,4> int64d4;
+
+  typedef vector_td<float,1> floatd1;
+  typedef vector_td<float,2> floatd2;
+  typedef vector_td<float,3> floatd3;
+  typedef vector_td<float,4> floatd4;
+
+  typedef vector_td<double,1> doubled1;
+  typedef vector_td<double,2> doubled2;
+  typedef vector_td<double,3> doubled3;
+  typedef vector_td<double,4> doubled4;
+}
diff --git a/toolboxes/core/vector_td_io.h b/toolboxes/core/vector_td_io.h
new file mode 100644
index 0000000..a70cc94
--- /dev/null
+++ b/toolboxes/core/vector_td_io.h
@@ -0,0 +1,49 @@
+/** \file vector_td_io.h
+    \brief Basic iostream "communication" using the vector_td class
+*/
+
+#pragma once
+
+#include "vector_td.h"
+
+#include <cmath>
+#include <iostream>
+#include <algorithm>
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> ::std::ostream& operator<<(std::ostream& os, const vector_td<T,D>& vec) {
+    os <<'[' ;
+    for (int i = 0; i < D-1; i++) os << vec[i] << ", ";
+    return os << vec[D-1] <<']';
+  }
+
+  template<class T, unsigned int D> std::istream& operator>>(std::istream& is, vector_td<T,D>& vec) {
+    char tmp;
+    is.get(tmp);
+    if (tmp != '['){
+      is.setstate(std::ios::failbit);
+      return is;
+    }
+
+    for (int i = 0; i < D-1; i++){
+      T val;
+      tmp = ' ';
+      is >> val;
+      vec[i]=val;
+      while (tmp == ' ') is.get(tmp);
+      if (tmp != ','){
+	is.setstate(std::ios::failbit);
+	return is;
+      }
+    }
+    tmp = ' ';
+    is >> vec[D-1];
+    while (tmp == ' ') is.get(tmp);
+    if (tmp != ']'){
+      is.setstate(std::ios::failbit);
+      return is;
+    }
+    return is;
+  }
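+
+  // Illustrative usage sketch (editor's addition): values round-trip through the
+  // bracketed text form produced by operator<< above, e.g.
+  //
+  //   std::istringstream ss("[256.0, 256.0]");
+  //   Gadgetron::floatd2 fov;
+  //   ss >> fov;                      // fov becomes (256, 256)
+  //   std::cout << fov << std::endl;  // prints "[256, 256]"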
+}
diff --git a/toolboxes/core/vector_td_operators.h b/toolboxes/core/vector_td_operators.h
new file mode 100644
index 0000000..5048050
--- /dev/null
+++ b/toolboxes/core/vector_td_operators.h
@@ -0,0 +1,435 @@
+/** \file vector_td_operators.h
+    \brief Common operators for the vector_td class
+*/
+
+#pragma once
+
+#include "vector_td.h"
+#include "core_defines.h"
+
+namespace Gadgetron{
+
+  //
+  // Return types
+  //
+
+  template <class T, class I> struct vectorTDReturnType {};
+  template <class T> struct vectorTDReturnType<T,T> {typedef T type;};
+  template<> struct vectorTDReturnType<unsigned int, int> {typedef int type;};
+  template<> struct vectorTDReturnType<int, unsigned int> {typedef int type;};
+  template<> struct vectorTDReturnType<int, bool> {typedef int type;};
+  template<> struct vectorTDReturnType<bool,int> {typedef int type;};
+  template<> struct vectorTDReturnType<unsigned int, bool> {typedef int type;};
+  template<> struct vectorTDReturnType<bool,unsigned int> {typedef int type;};
+  template<> struct vectorTDReturnType<float, unsigned int> {typedef float type;};
+  template<> struct vectorTDReturnType<unsigned int, float> {typedef float type;};
+  template<> struct vectorTDReturnType<float, int> {typedef float type;};
+  template<> struct vectorTDReturnType<int, float> {typedef float type;};
+  template<> struct vectorTDReturnType<float, bool> {typedef float type;};
+  template<> struct vectorTDReturnType<bool, float> {typedef float type;};
+  template<> struct vectorTDReturnType<double, unsigned int> {typedef double type;};
+  template<> struct vectorTDReturnType<unsigned int, double> {typedef double type;};
+  template<> struct vectorTDReturnType<double, int> {typedef double type;};
+  template<> struct vectorTDReturnType<int, double> {typedef double type;};
+  template<> struct vectorTDReturnType<double, bool> {typedef double type;};
+  template<> struct vectorTDReturnType<bool, double> {typedef double type;};
+  template<> struct vectorTDReturnType<double, float> {typedef double type;};
+  template<> struct vectorTDReturnType<float,double> {typedef double type;};
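+
+  // Illustrative sketch (editor's addition): the trait above fixes the component
+  // type of mixed-type expressions, e.g.
+  //
+  //   vector_td<float,2> a(1.0f, 2.0f);
+  //   vector_td<int,2>   b(3, 4);
+  //   vector_td<float,2> c = a + b;  // vectorTDReturnType<float,int>::type is float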
+
+  //
+  // Operators are defined as component wise operations.
+  //
+
+  //
+  // Arithmetic operators
+  //
+
+  template< class T,class R,  unsigned int D > __inline__ __host__ __device__
+  void operator+= ( vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+    for(unsigned int i=0; i<D; i++ ) v1.vec[i] += v2.vec[i];
+  }
+
+  template< class T,class R,  unsigned int D > __inline__ __host__ __device__
+  void operator+= ( vector_td<T,D> &v1, const R &v2 )
+  {
+    for(unsigned int i=0; i<D; i++ ) v1.vec[i] += v2;
+  }
+
+  template< class T,class R,  unsigned int D > __inline__ __host__ __device__
+  void operator-= ( vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+    for(unsigned int i=0; i<D; i++ ) v1.vec[i] -= v2.vec[i];
+  }
+
+
+  template< class T, class R, unsigned int D > __inline__ __host__ __device__ 
+  void operator*= ( vector_td<T,D> &v1, const R &v2 )
+  { 
+    for(unsigned int i=0; i<D; i++ ) v1.vec[i] *= v2;
+  }
+
+  template< class T,class R,  unsigned int D > __inline__ __host__ __device__
+  void operator*= ( vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  {
+    for(unsigned int i=0; i<D; i++ ) v1.vec[i] *= v2.vec[i];
+  }
+
+  template< class T,class R,  unsigned int D > __inline__ __host__ __device__
+  void operator /= ( vector_td<T,D> &v1, const R &v2 )
+  {
+    for(unsigned int i=0; i<D; i++ ) v1.vec[i] /= v2;
+  }
+
+  template< class T,class R,  unsigned int D > __inline__ __host__ __device__
+  void operator /=  ( vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  {
+    for(unsigned int i=0; i<D; i++ ) v1.vec[i] /= v2.vec[i];
+  }
+
+  template< class T,class R,  unsigned int D > __inline__ __host__ __device__
+  void component_wise_div_eq ( vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+    for(unsigned int i=0; i<D; i++ ) v1.vec[i] /= v2.vec[i];
+  }
+
+  template< class T, class R, unsigned int D > __inline__ __host__ __device__ 
+  vector_td<typename vectorTDReturnType<T,R>::type,D> operator+ ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+    vector_td<typename vectorTDReturnType<T,R>::type,D> res;
+    for(unsigned int i=0; i<D; i++ ) res.vec[i] = v1.vec[i]+v2.vec[i];
+    return res;
+  }
+
+  template< class T,class R, unsigned int D > __inline__ __host__ __device__ 
+  vector_td<typename vectorTDReturnType<T,R>::type,D> operator+ ( const vector_td<T,D> &v1, const R &v2 )
+  {
+    vector_td<typename vectorTDReturnType<T,R>::type,D> res;
+    for(unsigned int i=0; i<D; i++ ) res.vec[i] = v1.vec[i]+v2;
+    return res;
+  }
+
+  template< class T,class R, unsigned int D > __inline__ __host__ __device__ 
+  vector_td<typename vectorTDReturnType<T,R>::type,D> operator- ( const vector_td<T,D> &v1, const R &v2 )
+  {
+    vector_td<typename vectorTDReturnType<T,R>::type,D> res;
+    for(unsigned int i=0; i<D; i++ ) res.vec[i] = v1.vec[i]-v2;
+    return res;
+  }
+
+  template< class T, class R, unsigned int D > __inline__ __host__ __device__ 
+  vector_td<typename vectorTDReturnType<T,R>::type,D> operator+ (const R &v2, const vector_td<T,D> &v1 )
+  {
+    return v1+v2;
+  }
+
+  template< class T, class R, unsigned int D > __inline__ __host__ __device__ 
+  vector_td<typename vectorTDReturnType<T,R>::type,D> operator- ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+    vector_td<typename vectorTDReturnType<T,R>::type,D> res;
+    for(unsigned int i=0; i<D; i++ ) res.vec[i] = v1.vec[i]-v2.vec[i];
+    return res;
+  }
+
+  template< class T, unsigned int D > __inline__ __host__ __device__ 
+  vector_td<T,D> operator- ( const vector_td<T,D> &v1)
+  {
+    vector_td<T,D> res;
+    for(unsigned int i=0; i<D; i++ ) res.vec[i] = -v1.vec[i];
+    return res;
+  }
+
+  template< class T, class R, unsigned int D > __inline__ __host__ __device__ 
+  vector_td<typename vectorTDReturnType<T,R>::type,D> component_wise_mul ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+    vector_td<typename vectorTDReturnType<T,R>::type,D> res;
+    for(unsigned int i=0; i<D; i++ ) res.vec[i] = v1.vec[i]*v2.vec[i];
+    return res;
+  }
+
+  template< class T, unsigned int D > __inline__ __host__ __device__
+  vector_td<T,D> component_wise_mul ( const vector_td<T,D> &v1, const vector_td<T,D> &v2 )
+  {
+    vector_td<T,D> res;
+    for(unsigned int i=0; i<D; i++ ) res.vec[i] = v1.vec[i]*v2.vec[i];
+    return res;
+  }
+
+  template< class T, class R, unsigned int D > __inline__ __host__ __device__ 
+  vector_td<typename vectorTDReturnType<T,R>::type,D> operator* ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  {
+    vector_td<typename vectorTDReturnType<T,R>::type,D> res;
+    for(unsigned int i=0; i<D; i++ )  res.vec[i] = v1.vec[i]*v2.vec[i];
+    return res;
+  }
+
+  template< class T, class R, unsigned int D > __inline__ __host__ __device__ 
+  vector_td<typename vectorTDReturnType<T,R>::type,D> operator* ( const vector_td<T,D> &v1, const R &v2 )
+  { 
+    vector_td<typename vectorTDReturnType<T,R>::type,D> res;
+    for(unsigned int i=0; i<D; i++ ) res.vec[i] = v1.vec[i]*v2;
+    return res;
+  }
+
+  template< class T, class R, unsigned int D > __inline__ __host__ __device__ 
+  vector_td<typename vectorTDReturnType<T,R>::type,D> operator* ( const R &v1, const vector_td<T,D> &v2 )
+  { 
+    return v2*v1;
+  }
+
+  template< class T, class R, unsigned int D > __inline__ __host__ __device__ 
+  vector_td<typename vectorTDReturnType<T,R>::type,D> operator/ ( const vector_td<T,D> &v1, const R &v2 )
+  {
+    vector_td<typename vectorTDReturnType<T,R>::type,D> res;
+    for(unsigned int i=0; i<D; i++ ) res.vec[i] = v1.vec[i]/v2;
+    return res;
+  }
+
+  template< class T, class R, unsigned int D > __inline__ __host__ __device__ 
+  vector_td<typename vectorTDReturnType<T,R>::type,D> operator/ ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  {
+    vector_td<typename vectorTDReturnType<T,R>::type,D> res = v1;
+    for(unsigned int i=0; i<D; i++ ) res[i] /= v2[i];
+    return res;
+  }
+
+  template< class T, class R, unsigned int D > __inline__ __host__ __device__ 
+  vector_td<typename vectorTDReturnType<T,R>::type,D> component_wise_div ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+    return v1/v2;
+  }
+
+  // 
+  // "Strong" comparison operators
+  //
+
+  template< class T, unsigned int D > __inline__ __host__ __device__ 
+  bool operator== ( const vector_td<T,D> &v1, const vector_td<T,D> &v2 ) 
+  { 
+    for(unsigned int i=0; i<D; i++ ) if(!(v1.vec[i] == v2.vec[i])) return false;
+    return true;
+  }
+
+  template< class T, unsigned int D > __inline__ __host__ __device__ 
+  bool operator!= ( const vector_td<T,D> &v1, const vector_td<T,D> &v2 ) 
+  { 
+    for(unsigned int i=0; i<D; i++ ) if((v1.vec[i] != v2.vec[i])) return true;
+    return false;
+  }
+
+  template< class T, unsigned int D > __inline__ __host__ __device__ 
+  bool operator&& ( const vector_td<T,D> &v1, const vector_td<T,D> &v2 ) 
+  { 
+    for(unsigned int i=0; i<D; i++ ) if(!(v1.vec[i] && v2.vec[i])) return false;
+    return true;
+  }
+
+  template< class T, unsigned int D > __inline__ __host__ __device__ 
+  bool operator|| ( const vector_td<T,D> &v1, const vector_td<T,D> &v2 ) 
+  { 
+    for(unsigned int i=0; i<D; i++ ) if(!(v1.vec[i] || v2.vec[i])) return false;
+    return true;
+  }
+
+  template< class T,class R, unsigned int D > __inline__ __host__ __device__ 
+  bool operator< ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+    for(unsigned int i=0; i<D; i++ ) if(!(v1.vec[i] < v2.vec[i])) return false;
+    return true;
+  }
+
+  template< class T,class R, unsigned int D > __inline__ __host__ __device__
+  bool operator<= ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+    for(unsigned int i=0; i<D; i++ ) if(!(v1.vec[i] <= v2.vec[i])) return false;
+    return true;
+  }
+
+  template< class T, class R, unsigned int D > __inline__ __host__ __device__ 
+  bool operator> ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+    for(unsigned int i=0; i<D; i++ ) if(!(v1.vec[i] > v2.vec[i])) return false;
+    return true;
+  }
+
+  template< class T, class R, unsigned int D > __inline__ __host__ __device__ 
+  bool operator>= ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+    for(unsigned int i=0; i<D; i++ ) if(!(v1.vec[i] >= v2.vec[i])) return false;
+    return true;
+  }
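+
+  // Editor's note (illustrative): the "strong" operators above require the relation
+  // to hold for every component, whereas the "weak" variants below require it for
+  // at least one component, e.g.
+  //
+  //   uintd2 a(1,5), b(3,4);
+  //   bool all_less = (a < b);          // false: 5 < 4 fails
+  //   bool any_less = weak_less(a, b);  // true:  1 < 3 holds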
+
+  //
+  // "Weak" comparison "operators"
+  //
+
+  template< class T, class R, unsigned int D > __inline__ __host__ __device__ 
+  bool weak_equal ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+    for(unsigned int i=0; i<D; i++ ) if(v1.vec[i] == v2.vec[i]) return true;
+    return false;
+  }
+
+  template< class T, class R, unsigned int D > __inline__ __host__ __device__
+  bool weak_not_equal ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+    for(unsigned int i=0; i<D; i++ ) if(v1.vec[i] != v2.vec[i]) return true;
+    return false;
+  }
+
+  template< class T, class R, unsigned int D > __inline__ __host__ __device__
+  bool weak_and ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+    for(unsigned int i=0; i<D; i++ ) if(v1.vec[i] && v2.vec[i]) return true;
+    return false;
+  }
+
+  template< class T, class R, unsigned int D > __inline__ __host__ __device__ 
+  bool weak_or ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+    for(unsigned int i=0; i<D; i++ ) if(v1.vec[i] || v2.vec[i]) return true;
+    return false;
+  }
+
+  template< class T, class R, unsigned int D > __inline__ __host__ __device__ 
+  bool weak_less ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+    for(unsigned int i=0; i<D; i++ ) if(v1.vec[i] < v2.vec[i]) return true;
+    return false;
+  }
+
+  template< class T, class R, unsigned int D > __inline__ __host__ __device__ 
+  bool weak_less_equal ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+    for(unsigned int i=0; i<D; i++ ) if(v1.vec[i] <= v2.vec[i]) return true;
+    return false;
+  }
+
+  template< class T, class R, unsigned int D > __inline__ __host__ __device__
+  bool weak_greater ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+    for(unsigned int i=0; i<D; i++ ) if(v1.vec[i] > v2.vec[i]) return true;
+    return false;
+  }
+
+  template< class T, class R, unsigned int D > __inline__ __host__ __device__ 
+  bool weak_greater_equal ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+    for(unsigned int i=0; i<D; i++ ) if(v1.vec[i] >= v2.vec[i]) return true;
+    return false;
+  }
+
+  //
+  // Vector comparison "operators"
+  //
+
+  template< class T,class R, unsigned int D > __inline__ __host__ __device__
+  vector_td<bool,D> vector_equal ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  {
+    vector_td<bool,D> res;
+    for(unsigned int i=0; i<D; i++ ) res.vec[i] = (v1.vec[i] == v2.vec[i]);
+    return res;
+  }
+
+  template< class T,class R, unsigned int D > __inline__ __host__ __device__
+  vector_td<bool,D> vector_not_equal ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+    vector_td<bool,D> res;
+    for(unsigned int i=0; i<D; i++ ) res.vec[i] = (v1.vec[i] != v2.vec[i]);
+    return res;
+  }
+
+  template< class T,class R, unsigned int D > __inline__ __host__ __device__
+  vector_td<T,D> vector_and ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+    vector_td<T,D> res;
+    for(unsigned int i=0; i<D; i++ ) res.vec[i] = (v1.vec[i] && v2.vec[i]);
+    return res;
+  }
+
+  template< class T,class R, unsigned int D > __inline__ __host__ __device__
+  vector_td<bool,D> vector_or ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+  	vector_td<bool,D> res;
+    for(unsigned int i=0; i<D; i++ ) res.vec[i] = (v1.vec[i] || v2.vec[i]);
+    return res;
+  }
+
+  template< class T,class R, unsigned int D > __inline__ __host__ __device__
+  vector_td<bool,D> vector_less ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+  	vector_td<bool,D> res;
+    for(unsigned int i=0; i<D; i++ ) res.vec[i] = (v1.vec[i] < v2.vec[i]);
+    return res;
+  }
+
+  template< class T,class R, unsigned int D > __inline__ __host__ __device__
+  vector_td<bool,D> vector_less_equal ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  { 
+  	vector_td<bool,D> res;
+    for(unsigned int i=0; i<D; i++ ) res.vec[i] = (v1.vec[i] <= v2.vec[i]);
+    return res;
+  }
+
+  template< class T,class R, unsigned int D > __inline__ __host__ __device__
+  vector_td<bool,D> vector_greater ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  {
+    vector_td<bool,D> res;
+    for(unsigned int i=0; i<D; i++ ) res.vec[i] = (v1.vec[i] > v2.vec[i]);
+    return res;
+  }
+
+  template< class T,class R, unsigned int D > __inline__ __host__ __device__
+  vector_td<bool,D> vector_greater_equal ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 )
+  {  
+  	vector_td<bool,D> res;
+    for(unsigned int i=0; i<D; i++ ) res.vec[i] = (v1.vec[i] >= v2.vec[i]);
+    return res;
+  }
+
+  //
+  // Integer only operators
+  //
+
+  template< class T, unsigned int D > __inline__ __host__ __device__ 
+  void operator<<= ( vector_td<T,D> &v1, size_t shifts ) 
+  { 
+    for(unsigned int i=0; i<D; i++ ) v1.vec[i] <<= shifts;
+  }
+
+  template< class T, unsigned int D > __inline__ __host__ __device__ 
+  void operator>>= ( vector_td<T,D> &v1, size_t shifts ) 
+  { 
+    for(unsigned int i=0; i<D; i++ ) v1.vec[i] >>= shifts;
+  }
+
+  template< class T, unsigned int D > __inline__ __host__ __device__ 
+  vector_td<T,D> operator<< ( const vector_td<T,D> &v1, size_t shifts ) 
+  { 
+    vector_td<T,D> res = v1;
+    res <<= shifts;
+    return res;
+  }
+
+  template< class T, unsigned int D > __inline__ __host__ __device__ 
+  vector_td<T,D> operator>> ( const vector_td<T,D> &v1, size_t shifts ) 
+  { 
+    vector_td<T,D> res = v1;
+    res >>= shifts;
+    return res;
+  }
+
+  template< class T, unsigned int D > __inline__ __host__ __device__ 
+  void operator%= ( vector_td<T,D> &v1, const vector_td<T,D> &v2 ) 
+  { 
+    for(unsigned int i=0; i<D; i++ ) v1.vec[i] %= v2.vec[i];
+  }
+
+  template< class T, unsigned int D > __inline__ __host__ __device__ 
+  vector_td<T,D> operator% ( const vector_td<T,D> &v1, const vector_td<T,D> &v2 ) 
+  { 
+    vector_td<T,D> res = v1;
+    res %= v2;
+    return res;
+  }
+}
diff --git a/toolboxes/core/vector_td_utilities.h b/toolboxes/core/vector_td_utilities.h
new file mode 100644
index 0000000..25a30d0
--- /dev/null
+++ b/toolboxes/core/vector_td_utilities.h
@@ -0,0 +1,495 @@
+/** \file vector_td_utilities.h
+    \brief Utility functions for the vector_td class.
+
+    This header provides component-wise math, grid/index transformations,
+    conversions to and from std::vector, and reductions for the vector_td class.
+    vector_td represents small (one- to four-dimensional) vectors only;
+    for larger vectors consider using the NDArray class instead.
+    It can be used on both the cpu and gpu.
+    The companion header vector_td_operators.h defines the arithmetic and comparison operators.
+*/
+
+#pragma once
+
+#include "vector_td.h"
+#include "vector_td_operators.h"
+#include "real_utilities.h"
+#include "core_defines.h"
+#include "complext.h"
+
+#include <float.h>
+#include <vector>
+#include <iostream>
+#include <algorithm>
+#include <cmath>
+
+#ifdef max
+#undef max
+#endif
+
+#ifdef min
+#undef min
+#endif
+
+#ifndef __CUDA_ARCH__ // workaround for nvcc
+using std::ceil;  
+using std::floor; 
+using std::abs;   
+using std::sqrt;
+#endif
+
+namespace Gadgetron{
+
+  // Windows/Cuda has some issues when using min and max.
+  // For now we define our own implementation
+
+  template <class T> __inline__ __host__ __device__ const T& _vector_td_min (const T& a, const T& b) {
+    return (a>b)?b:a;
+  }
+  template <class T> __inline__ __host__ __device__ const T& _vector_td_max (const T& a, const T& b) {
+    return (a<b)?b:a;
+  }
+
+  //
+  // In-place operations
+  //
+
+  template<class T, unsigned int D> __inline__ __host__ __device__ 
+  void clear( vector_td<T,D> &vec, const T &val = T(0) )
+  {
+    for (unsigned int i=0; i<D; i++) {
+      vec[i] = val;
+    }
+  }
+  
+  //
+  // Component-wise math operations
+  //
+
+  template<class T, unsigned int D> __inline__ __host__ __device__ 
+  vector_td<T,D> abs( const vector_td<T,D>& vec )
+  {
+    vector_td<T,D> res;
+    for (unsigned int i=0; i<D; i++) {
+      res[i] = ::abs(vec[i]);
+    }
+    return res;
+  }
+
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  vector_td<int,D> sgn( const vector_td<T,D>& vec )
+  {
+    vector_td<int,D> res;
+    for (unsigned int i=0; i<D; i++) {
+      res[i] = sgn(vec[i]);
+    }
+    return res;
+  }
+
+  template<class REAL, unsigned int D> __inline__ __host__ __device__ 
+  vector_td<REAL,D> ceil( const vector_td<REAL,D> vec )
+  {
+    vector_td<REAL,D> res;
+    for (unsigned int i=0; i<D; i++) {
+      res[i] = ::ceil(vec[i]);
+    }
+    return res;
+  }
+
+  template<class REAL, unsigned int D> __inline__ __host__ __device__ 
+  vector_td<REAL,D> floor( const vector_td<REAL,D> vec )
+  {
+    vector_td<REAL,D> res;
+    for (unsigned int i=0; i<D; i++) {
+      res[i] = ::floor(vec[i]);
+    }
+    return res;
+  }
+
+
+  //
+  // Grid <-> index transformations
+  //
+
+  template<unsigned int D> __inline__ __host__ __device__ 
+  typename uintd<D>::Type idx_to_co( unsigned int idx, const vector_td<unsigned,D> dims )
+  {
+    typename uintd<D>::Type co;
+    unsigned int idx_tmp = idx;
+    for (unsigned int i=0; i<D; i++) {
+      co[i] = idx_tmp%dims[i];
+      idx_tmp -= co[i];
+      idx_tmp /= dims[i];
+    }
+    return co;
+  } 
+
+  template<unsigned int D> __inline__ __host__ __device__ 
+  typename uint64d<D>::Type idx_to_co( size_t idx, const vector_td<size_t,D> dims )
+  {
+    typename uint64d<D>::Type co;
+    size_t idx_tmp = idx;
+    for (unsigned int i=0; i<D; i++) {
+      co[i] = idx_tmp%dims[i];
+      idx_tmp -= co[i];
+      idx_tmp /= dims[i];
+    }
+    return co;
+  } 
+
+  template<unsigned int D> __inline__ __host__ __device__ 
+  typename intd<D>::Type idx_to_co( int idx, const vector_td<int,D> dims )
+  {
+    typename intd<D>::Type co;
+    int idx_tmp = idx;
+    for (unsigned int i=0; i<D; i++) {
+      co[i] = idx_tmp%dims[i];
+      idx_tmp -= co[i];
+      idx_tmp /= dims[i];
+    }
+    return co;
+  } 
+
+  template<unsigned int D> __inline__ __host__ __device__ 
+  typename int64d<D>::Type idx_to_co( long long idx, const vector_td<long long,D> dims )
+  {
+    typename int64d<D>::Type co;
+    long long idx_tmp = idx;
+    for (unsigned int i=0; i<D; i++) {
+      co[i] = idx_tmp%dims[i];
+      idx_tmp -= co[i];
+      idx_tmp /= dims[i];
+    }
+    return co;
+  } 
+
+  template<unsigned int D> __inline__ __host__ __device__
+  unsigned int co_to_idx( const vector_td<unsigned int,D> co, const vector_td<unsigned int,D> dims )
+  {
+    unsigned int idx = 0;
+    unsigned int block_size = 1;
+    for (unsigned int i=0; i<D; i++) {
+      idx += (block_size*co[i]);
+      block_size *= dims[i];
+    }
+    return idx;
+  }
+
+  template<unsigned int D> __inline__ __host__ __device__
+  size_t co_to_idx( const vector_td< size_t,D> co, const vector_td<size_t,D> dims )
+  {
+    size_t idx = 0;
+    size_t block_size = 1;
+    for (unsigned int i=0; i<D; i++) {
+      idx += (block_size*co[i]);
+      block_size *= dims[i];
+    }
+    return idx;
+  }
+
+  template<unsigned int D> __inline__ __host__ __device__
+  int co_to_idx( const vector_td<int,D> co, const vector_td<int,D> dims )
+  {
+    int idx = 0;
+    int block_size = 1;
+    for (unsigned int i=0; i<D; i++) {
+      idx += (block_size*co[i]);
+      block_size *= dims[i];
+    }
+    return idx;
+  }
+
+  template<unsigned int D> __inline__ __host__ __device__
+  long long co_to_idx( const vector_td<long long,D> co, const vector_td<long long,D> dims )
+  {
+    long long idx = 0;
+    long long block_size = 1;
+    for (unsigned int i=0; i<D; i++) {
+      idx += (block_size*co[i]);
+      block_size *= dims[i];
+    }
+    return idx;
+  }
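+
+  // Illustrative sketch (editor's addition): idx_to_co and co_to_idx are inverse
+  // mappings between a linear index and a grid coordinate, with the first
+  // dimension varying fastest. For a 4x3 grid:
+  //
+  //   uintd2 dims(4,3);
+  //   uintd2 co = idx_to_co( 6u, dims );         // co  == (2,1)
+  //   unsigned int idx = co_to_idx( co, dims );  // idx == 6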
+  
+  template<unsigned int D> __inline__ __host__ __device__ 
+  unsigned int co_to_idx( const vector_td<unsigned int,D> co, 
+                          const vector_td<unsigned int,D> dims, 
+                          const vector_td<unsigned int,D> order )
+  {
+    unsigned int idx = 0;
+    unsigned int block_size = 1;
+    for (unsigned int i=0; i<D; i++){
+      idx += (block_size*co[order[i]]);
+      block_size *= dims[order[i]];
+    }
+    return idx;
+  } 
+
+  template<unsigned int D> __inline__ __host__ __device__ 
+  size_t co_to_idx( const vector_td<size_t,D> co, 
+                    const vector_td<size_t,D> dims, 
+                    const vector_td<unsigned int,D> order )
+  {
+    size_t idx = 0;
+    size_t block_size = 1;
+    for (unsigned int i=0; i<D; i++){
+      idx += (block_size*co[order[i]]);
+      block_size *= dims[order[i]];
+    }
+    return idx;
+  } 
+
+  template<unsigned int D> __inline__ __host__ __device__ 
+  int co_to_idx( const vector_td<int,D> co, 
+                 const vector_td<int,D> dims, 
+                 const vector_td<unsigned int,D> order )
+  {
+    int idx = 0;
+    int block_size = 1;
+    for (unsigned int i=0; i<D; i++){
+      idx += (block_size*co[order[i]]);
+      block_size *= dims[order[i]];
+    }
+    return idx;
+  } 
+
+  template<unsigned int D> __inline__ __host__ __device__ 
+  long long co_to_idx( const vector_td<long long,D> co, 
+                       const vector_td<long long,D> dims, 
+                       const vector_td<unsigned int,D> order )
+  {
+    long long idx = 0;
+    long long block_size = 1;
+    for (unsigned int i=0; i<D; i++){
+      idx += (block_size*co[order[i]]);
+      block_size *= dims[order[i]];
+    }
+    return idx;
+  } 
+
+  template<class T, unsigned int D> __inline__ __host__ __device__ 
+  vector_td<T,D> counting_vec()
+  {
+    vector_td<T,D> res;
+    for(unsigned int i=0; i<D; i++) {
+      res[i]=T(i);
+    }
+    return res;
+  }
+
+  //
+  // Conversion between vector_td and std::vector
+  //
+
+  template<class T, unsigned int D> inline
+  std::vector<T> to_std_vector( vector_td<T,D> vec )
+  {
+    std::vector<T> out(D);
+    for(unsigned int i=0; i<D; i++ )
+      out[i] = vec[i];
+    return out;
+  }
+
+  template<class T, unsigned int D> inline
+  vector_td<T,D> from_std_vector( std::vector<T> &_vector )
+  {
+    vector_td<T,D> out;
+    for( unsigned int i=0; i<D; i++ ){
+      if( i<_vector.size() )
+        out[i] = _vector[i];
+      else
+        out[i] = T(1);
+    }
+    return out;
+  }
+
+  //
+  // Reductions on vector_td<T,D>
+  //
+
+  template<class T, unsigned int D> __inline__ __host__ __device__ 
+  T prod( const vector_td<T,D>& vec )
+  {
+    T res = vec[0];
+    for (unsigned int i=1; i<D; i++){
+      res *= vec[i];
+    }
+    return res;
+  }
+
+  template<class T, unsigned int D> __inline__ __host__ __device__ 
+  T sum( const vector_td<T,D>& vec )
+  {
+    T res = vec[0];
+    for (unsigned int i=1; i<D; i++){
+      res += vec[i];
+    }
+    return res;
+  }
+
+  template<class T, unsigned int D> __inline__ __host__ __device__ 
+  T dot( const vector_td<T,D>& vec1, const vector_td<T,D>& vec2 )
+  {
+    T res = (vec1[0]*vec2[0]);
+    for (unsigned int i=1; i<D; i++){
+      res += (vec1[i]*vec2[i]);
+    }
+    return res;
+  }
+
+  template<class REAL, unsigned int D> __inline__ __host__ __device__
+  complext<REAL> dot(const vector_td<complext<REAL>, D>& vec1, const vector_td<REAL, D>& vec2)
+  {
+    complext<REAL> res = (vec1[0] * vec2[0]);
+    for (unsigned int i = 1; i<D; i++){
+      res += (vec1[i] * vec2[i]);
+    }
+    return res;
+  }
+
+  template<class REAL, unsigned int D> __inline__ __host__ __device__
+  complext<REAL> dot(const vector_td<REAL, D>& vec1, const vector_td<complext<REAL>, D>& vec2)
+  {
+    complext<REAL> res = (vec1[0] * vec2[0]);
+    for (unsigned int i = 1; i<D; i++){
+      res += (vec1[i] * vec2[i]);
+    }
+    return res;
+  }
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  T max( const vector_td<T,D>& vec )
+  {
+    T res = vec[0];
+    for (unsigned int i=1; i<D; i++){
+      res = _vector_td_max(res,vec[i]);
+    }
+    return res;
+  }
+  
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  T min( const vector_td<T,D>& vec )
+  {
+    T res = vec[0];
+    for (unsigned int i=1; i<D; i++){
+      res = _vector_td_min(res,vec[i]);
+    }
+    return res;
+  }
+
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  vector_td<T,D> amin( const vector_td<T,D>& vec1, const vector_td<T,D>& vec2)
+  {
+    vector_td<T,D> res;
+    for (unsigned int i=0; i<D; i++){
+      res[i] = _vector_td_min(vec1[i],vec2[i]);
+    }
+    return res;
+  }
+
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  vector_td<T,D> amax( const vector_td<T,D>& vec1, const vector_td<T,D>& vec2)
+  {
+    vector_td<T,D> res;
+    for (unsigned int i=0; i<D; i++){
+      res[i] = _vector_td_max(vec1[i],vec2[i]);
+    }
+    return res;
+  }
+
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  vector_td<T,D> amin( const vector_td<T,D>& vec1, T val)
+  {
+    vector_td<T,D> res;
+    for (unsigned int i=0; i<D; i++){
+      res[i] = _vector_td_min(vec1[i],val);
+    }
+    return res;
+  }
+
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  vector_td<T,D> amax( const vector_td<T,D>& vec1, T val )
+  {
+    vector_td<T,D> res;
+    for (unsigned int i=0; i<D; i++){
+      res[i] = _vector_td_max(vec1[i],val);
+    }
+    return res;
+  }
+
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  T max_not_nan( const vector_td<T,D>& vec )
+  {
+    unsigned int i=0;
+    while (i < D && isnan(vec[i])) i++;
+    if (i >= D) return T(0);
+    T res = vec[i];
+    for (++i; i<D; i++){
+      if (!isnan(vec[i])) res = _vector_td_max(res,vec[i]);
+    }
+    return res;
+  }
+
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  T min_not_nan( const vector_td<T,D>& vec )
+  {
+    unsigned int i=0;
+    while (i < D && isnan(vec[i])) i++;
+    if (i >= D) return T(0);
+    T res = vec[i];
+    for (++i; i<D; i++){
+      if (!isnan(vec[i])) res = _vector_td_min(res,vec[i]);
+    }
+    return res;
+  }
+
+  template<class T, unsigned int D> __inline__ __host__ __device__ 
+  unsigned int argmin( const vector_td<T,D>& vec )
+  {
+    unsigned int res= 0;
+    for (unsigned int i=1; i<D; i++){
+      if (vec[i] < vec[res] ) res = i;
+    }
+    return res;
+  }
+
+  template<class T, unsigned int D> __inline__ __host__ __device__ 
+  unsigned int argmin_not_nan( const vector_td<T,D>& vec )
+  {
+    unsigned int res= 0;
+    for (unsigned int i=1; i<D; i++){
+      if (vec[i] < vec[res] && !isnan(vec[i])) res = i;
+    }
+    return res;
+  }
+
+  template<class T, unsigned int D> __inline__ __host__ __device__ 
+  unsigned int argmax( const vector_td<T,D>& vec )
+  {
+    unsigned int res= 0;
+    for (unsigned int i=1; i<D; i++){
+      if (vec[i] > vec[res] ) res = i;
+    }
+    return res;
+  }
+
+  //
+  // Reductions on reald<REAL,D>
+  //
+
+  template<class REAL, unsigned int D> __inline__ __host__ __device__ 
+  REAL norm_squared( const vector_td<REAL,D> vec )
+  {
+    REAL res = REAL(0);
+    for (unsigned int i=0; i<D; i++){
+      res += (vec[i]*vec[i]);
+    }
+    return res;
+  }
+
+  template<class REAL, unsigned int D> __inline__ __host__ __device__ 
+  REAL norm( const vector_td<REAL,D> vec )
+  {
+    return ::sqrt(norm_squared<REAL,D>(vec));
+  }
+
+}
diff --git a/toolboxes/ct/CMakeLists.txt b/toolboxes/ct/CMakeLists.txt
new file mode 100644
index 0000000..fdc9262
--- /dev/null
+++ b/toolboxes/ct/CMakeLists.txt
@@ -0,0 +1,3 @@
+IF(CUDA_FOUND)
+  add_subdirectory(xray)
+ENDIF(CUDA_FOUND)
diff --git a/toolboxes/ct/xray/CMakeLists.txt b/toolboxes/ct/xray/CMakeLists.txt
new file mode 100644
index 0000000..a4a2f34
--- /dev/null
+++ b/toolboxes/ct/xray/CMakeLists.txt
@@ -0,0 +1,3 @@
+if(CUDA_FOUND AND HDF5_FOUND)
+  add_subdirectory(gpu)
+endif()
diff --git a/toolboxes/ct/xray/gpu/CBCT_acquisition.h b/toolboxes/ct/xray/gpu/CBCT_acquisition.h
new file mode 100644
index 0000000..88d0278
--- /dev/null
+++ b/toolboxes/ct/xray/gpu/CBCT_acquisition.h
@@ -0,0 +1,298 @@
+/**
+ * Data format for CBCT acquisition (data and geometry)
+ **/
+
+#pragma once
+
+#include "vector_td_io.h"
+#include "hoNDArray.h"
+#include "hoNDArray_utils.h"
+
+#include <hdf5.h>
+#include <hdf5_hl.h>
+#include <vector>
+#include <sstream>
+#include <stdexcept>
+#include <boost/shared_ptr.hpp>
+
+namespace Gadgetron{
+
+class CBCT_geometry
+{
+public:
+
+	CBCT_geometry() {
+		SAD_ = 0.0f;
+		SDD_ = 0.0f;
+		FOV_ = floatd2(0.0f);
+	}
+
+	~CBCT_geometry() {}
+
+	inline void set_SAD( float SAD ) { SAD_ = SAD; }
+	inline float get_SAD() { return SAD_; }
+
+	inline void set_SDD( float SDD ) { SDD_ = SDD; }
+	inline float get_SDD() { return SDD_; }
+
+	inline void set_FOV( floatd2 v ) { FOV_ = v; }
+	inline floatd2 get_FOV() { return FOV_; }
+
+	inline void set_angles( std::vector<float> &angles ) { angles_ = angles; }
+	inline std::vector<float>& get_angles() { return angles_; }
+
+	inline void set_offsets( std::vector<floatd2> &offsets ) { offsets_ = offsets; }
+	inline std::vector<floatd2>& get_offsets() { return offsets_; }
+
+	// Basic output support
+	//
+
+	void print( std::ostream& os )
+	{
+		os << "------------ GEOMETRY ------------" << std::endl;
+		if (angles_.size() == 0)
+			os << "Angles: " << "EMPTY" << std::endl;
+		else {
+			os << "Angles: " << angles_.front() << " ... " << angles_.back()
+			   << ", number of angles: " << angles_.size() << std::endl;
+		}
+
+		if (offsets_.size() == 0)
+			os << "Offsets: " << "EMPTY" << std::endl;
+		else {
+			os << "Offsets: contains " << offsets_.size() << " elements" << std::endl;
+		}
+
+		os << "SDD: " << SDD_ << "mm" << std::endl;
+		os << "SAD: " << SAD_ << "mm" << std::endl;
+		os << "FOV: " << FOV_ << "mm" << std::endl;
+		os << "----------------------------------" << std::endl;
+	}
+
+	void save( hid_t file_id )
+	{
+		{
+			unsigned int dataformat_version=2;
+			hsize_t dims[1] = {1};
+			H5LTmake_dataset(file_id, "/geometry_dataformat_version", 1, dims, H5T_NATIVE_UINT, &dataformat_version);
+		}
+		{
+			hsize_t dims[1] = {1};
+			H5LTmake_dataset(file_id, "/SAD", 1, dims, H5T_NATIVE_FLOAT, &SAD_);
+		}
+		{
+			hsize_t dims[1] = {1};
+			H5LTmake_dataset(file_id, "/SDD", 1, dims, H5T_NATIVE_FLOAT, &SDD_);
+		}
+		{
+			hsize_t dims[1] = {2};
+			H5LTmake_dataset(file_id, "/FOV", 1, dims, H5T_NATIVE_FLOAT, &FOV_);
+		}
+		{
+			hsize_t dims[1] = {angles_.size()};
+			H5LTmake_dataset(file_id, "/angles", 1, dims, H5T_NATIVE_FLOAT, &angles_[0]);
+		}
+		{
+			std::vector<float> offsetx, offsety;
+			for( unsigned int i=0; i<offsets_.size(); i++ ){
+				floatd2 offset = offsets_[i];
+				offsetx.push_back(offset[0]);
+				offsety.push_back(offset[1]);
+			}
+			hsize_t dims[1] = {offsets_.size()};
+			H5LTmake_dataset(file_id, "/offsetx", 1, dims, H5T_NATIVE_FLOAT, &offsetx[0]);
+			H5LTmake_dataset(file_id, "/offsety", 1, dims, H5T_NATIVE_FLOAT, &offsety[0]);
+		}
+	}
+
+protected:
+
+	float SAD_;
+	float SDD_;
+	floatd2 FOV_;
+	std::vector<float> angles_;
+	std::vector<floatd2> offsets_;
+};
+
+class CBCT_acquisition {
+
+public:
+
+	CBCT_acquisition() {}
+
+	CBCT_acquisition( boost::shared_ptr< hoNDArray<float> > projections,
+			boost::shared_ptr<CBCT_geometry> geometry )
+	{
+		geometry_ = geometry;
+		projections_ = projections;
+	}
+
+	virtual ~CBCT_acquisition() {}
+
+	inline void set_geometry( boost::shared_ptr<CBCT_geometry> geometry ) {
+		geometry_ = geometry;
+	}
+
+	inline boost::shared_ptr<CBCT_geometry> get_geometry() {
+		return geometry_; }
+
+	inline void set_projections( boost::shared_ptr< hoNDArray<float> > projections ) {
+		projections_ = projections;
+	}
+
+	inline boost::shared_ptr< hoNDArray<float> > get_projections() {
+		return projections_;
+	}
+
+	void downsample( unsigned int num_downsamples )
+	{
+		for (int k = 0; k < num_downsamples; k++)
+			projections_ = Gadgetron::downsample<float,2>(projections_.get());
+	}
+
+	void load( std::string filename )
+	{
+		// Open hdf5 file
+		//
+
+		hid_t file_id = H5Fopen (filename.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT);
+
+		// Load geometry.
+		// This loader is for version 2 of the format
+		//
+
+		unsigned int geom_dataformat_version;
+		herr_t errCode;
+		errCode = H5LTread_dataset (file_id, "/geometry_dataformat_version", H5T_NATIVE_UINT, &geom_dataformat_version);
+
+		if (errCode < 0){
+			throw std::runtime_error("Error reading /geometry_dataformat_version from file.");
+		}
+		unsigned int needed = 2;
+		if (!(geom_dataformat_version == needed)) {
+			std::stringstream ss;
+			ss << "wrong geometry data version inside hdf5 file, found: "
+					<< geom_dataformat_version << ", needed: " << needed << std::endl;
+			throw std::runtime_error(ss.str());
+		}
+
+		// Allocate new geometry
+		//
+
+		geometry_ = boost::shared_ptr<CBCT_geometry>(new CBCT_geometry());
+
+		// Get angles array
+		//
+
+		hsize_t dim;
+		errCode = H5LTget_dataset_info(file_id,"/angles",&dim,NULL,NULL);
+		if (errCode < 0) 	throw std::runtime_error("Error getting /angles dataset info from file.");
+
+		std::vector<float> angles (dim,0.0f);
+		geometry_->set_angles(angles);
+		errCode=H5LTread_dataset (file_id, "/angles", H5T_NATIVE_FLOAT, &geometry_->get_angles()[0]);
+		if (errCode < 0) 	throw std::runtime_error("Error reading /angles from file.");
+
+		// Get offsets array
+		//
+
+		errCode=H5LTget_dataset_info(file_id,"/offsetx",&dim,NULL,NULL);
+		if (errCode < 0) 	throw std::runtime_error("Error getting /offsetx dataset info from file.");
+		std::vector<float> offsets_x = std::vector<float>(dim,0.0f);
+		errCode=H5LTread_dataset (file_id, "/offsetx", H5T_NATIVE_FLOAT, &offsets_x[0]);
+		if (errCode < 0) 	throw std::runtime_error("Error reading /offsetx from file.");
+
+		errCode=H5LTget_dataset_info(file_id,"/offsety",&dim,NULL,NULL);
+		if (errCode < 0) 	throw std::runtime_error("Error getting /offsety dataset info from file.");
+		std::vector<float> offsets_y = std::vector<float>(dim,0.0f);
+		errCode=H5LTread_dataset (file_id, "/offsety", H5T_NATIVE_FLOAT, &offsets_y[0]);
+		if (errCode < 0) 	throw std::runtime_error("Error reading /offsety from file.");
+
+		if( offsets_x.size() != offsets_y.size() ){
+			throw std::runtime_error("CBCT_acquisition::load : x/y offset arrays have different lengths");
+		}
+
+		geometry_->get_offsets().clear();
+		for( unsigned int i=0; i<offsets_x.size(); i++ ){
+			geometry_->get_offsets().push_back(floatd2(offsets_x[i], offsets_y[i]));
+		}
+
+		// Test data format of the projections
+		//
+
+		unsigned int proj_dataformat_version;
+		errCode=H5LTread_dataset (file_id, "/projection_dataformat_version", H5T_NATIVE_UINT, &proj_dataformat_version);
+		if (errCode < 0) 	throw std::runtime_error("Error reading /projection_dataformat_version from file.");
+
+		needed = 1;
+		if (!(proj_dataformat_version == needed)) {
+			std::stringstream ss;
+			ss << "wrong projection data format version inside hdf5 file, found: "
+					<< proj_dataformat_version << ", needed: " << needed;
+			throw std::runtime_error(ss.str());
+		}
+
+		hsize_t vec_dim[3];
+		errCode=H5LTget_dataset_info(file_id,"/projections",vec_dim,NULL,NULL);
+		if (errCode < 0) 	throw std::runtime_error("Error getting /projections dataset info from file.");
+		std::vector<size_t> dims;
+		dims.push_back(vec_dim[2]);
+		dims.push_back(vec_dim[1]);
+		dims.push_back(vec_dim[0]);
+
+		projections_ = boost::shared_ptr<hoNDArray<float> >(new hoNDArray<float>(&dims));
+		errCode=H5LTread_dataset (file_id,"/projections", H5T_NATIVE_FLOAT, projections_->get_data_ptr());
+		if (errCode < 0) 	throw std::runtime_error("Error reading /projections from file.");
+
+		// Get SAD / SDD / FOV
+		//
+
+		float SAD, SDD;
+		floatd2 FOV;
+
+		errCode=H5LTread_dataset (file_id, "/SAD", H5T_NATIVE_FLOAT, &SAD);
+		if (errCode < 0) 	throw std::runtime_error("Error reading /SAD from file.");
+		errCode=H5LTread_dataset (file_id, "/SDD", H5T_NATIVE_FLOAT, &SDD);
+		if (errCode < 0) 	throw std::runtime_error("Error reading /SDD from file.");
+		errCode=H5LTread_dataset (file_id, "/FOV", H5T_NATIVE_FLOAT, &FOV);
+		if (errCode < 0){
+			// Fall back to the /spacing dataset if /FOV is absent
+			floatd2 spacing;
+			errCode=H5LTread_dataset (file_id, "/spacing", H5T_NATIVE_FLOAT, &spacing);
+			if (errCode < 0) throw std::runtime_error("Error reading /FOV (and fallback /spacing) from file.");
+			FOV[0] = spacing[0]*dims[0];
+			FOV[1] = spacing[1]*dims[1];
+		}
+
+		geometry_->set_SAD(SAD);
+		geometry_->set_SDD(SDD);
+		geometry_->set_FOV(FOV);
+		H5Fclose (file_id);
+	}
+
+	void save( std::string filename )
+	{
+		hid_t file_id = H5Fcreate (filename.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
+
+		unsigned int dataformat_version=1;
+		hsize_t dims[1] = {1};
+		H5LTmake_dataset(file_id,"/projection_dataformat_version", 1, dims, H5T_NATIVE_UINT, &dataformat_version);
+
+		boost::shared_ptr<std::vector<size_t > > pdims = projections_->get_dimensions();
+		hsize_t *dims2 = new hsize_t[pdims->size()];
+		for (int i = 0; i < pdims->size(); i++)
+			dims2[i] = pdims->at(pdims->size()-i-1);
+		H5LTmake_dataset(file_id,"/projections", pdims->size(), dims2, H5T_NATIVE_FLOAT, projections_->get_data_ptr());
+		delete[] dims2;
+
+		geometry_->save(file_id);
+
+		H5Fclose (file_id);
+	}
+
+protected:
+
+	boost::shared_ptr<CBCT_geometry> geometry_;
+	boost::shared_ptr< hoNDArray<float> > projections_;
+};
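+
+// Illustrative usage sketch (editor's addition, not part of the upstream header):
+// loading an acquisition from HDF5 and inspecting its geometry. The filename is
+// hypothetical.
+//
+//   Gadgetron::CBCT_acquisition acq;
+//   acq.load( "acquisition.hdf5" );
+//   acq.get_geometry()->print( std::cout );
+//   boost::shared_ptr< Gadgetron::hoNDArray<float> > projections = acq.get_projections();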
+}
diff --git a/toolboxes/ct/xray/gpu/CBCT_binning.h b/toolboxes/ct/xray/gpu/CBCT_binning.h
new file mode 100644
index 0000000..42617e2
--- /dev/null
+++ b/toolboxes/ct/xray/gpu/CBCT_binning.h
@@ -0,0 +1,166 @@
+/**
+ * Temporal binning for CBCT
+ **/
+
+#pragma once
+
+#include "CBCT_acquisition.h"
+
+#include <hdf5.h>
+#include <hdf5_hl.h>
+#include <vector>
+#include <set>
+#include <stdexcept>
+#include <boost/iterator/counting_iterator.hpp>
+
+namespace Gadgetron {
+
+  class CBCT_binning
+  {
+
+  public:
+    
+    CBCT_binning() {}
+    CBCT_binning( std::vector< std::vector<unsigned int> > binning ) : binning_(binning) {}
+
+    ~CBCT_binning() {}
+
+    inline unsigned int get_number_of_bins()
+    {
+      return binning_.size();
+    }
+
+    inline unsigned int get_number_of_projections()
+    {
+      unsigned int acc = 0;
+      for( unsigned int i=0; i<binning_.size(); i++ )
+        acc += binning_[i].size();
+      return acc;
+    }
+
+    inline unsigned int get_number_of_projections( unsigned int bin )
+    {
+      if( bin >= binning_.size() )
+        throw std::runtime_error("CBCT_binning::get_number_of_projections(int) : bin is out of range");
+      else
+        return binning_[bin].size();
+    }
+
+    inline int get_maximum_projection_index()
+    {
+      int max_proj = -1;
+      for( unsigned int i=0; i<binning_.size(); i++ )
+        for( unsigned int j=0; j<binning_[i].size(); j++ )
+          if( int(binning_[i][j]) > max_proj ) 
+            max_proj = binning_[i][j];
+      return max_proj;
+    }
+    
+    inline void set_bins( std::vector< std::vector<unsigned int> > &bins ) {
+      binning_ = bins;
+    }
+
+    inline std::vector< std::vector<unsigned int> > get_bins() {
+      return binning_;
+    }
+
+    inline void set_bin( std::vector<unsigned int> &bin, unsigned int bin_number )
+    {
+      if( bin_number > binning_.size() )
+        throw std::runtime_error("CBCT_binning::set_bin() : bin is out of range");
+      else if( bin_number == binning_.size() )
+        binning_.push_back(bin);
+      else
+        binning_[bin_number] = bin;
+    }
+
+    inline std::vector<unsigned int> get_bin( unsigned int bin )
+    {
+      if( bin >= binning_.size() )
+        throw std::runtime_error("CBCT_binning::get_bin() : bin is out of range");
+      else
+        return binning_[bin];
+    }
+
+    inline void set_as_default_3d_bin( unsigned int num_projections )
+    {
+      binning_.push_back( std::vector<unsigned int>( boost::counting_iterator<unsigned int>(0),
+                                                     boost::counting_iterator<unsigned int>(num_projections) ));
+    }
+
+    CBCT_binning get_3d_binning(){
+    	std::vector<std::vector<unsigned int> > bin_3d;
+    	std::set<unsigned int> uniques;
+    	for (int i = 0; i < binning_.size(); i++){
+    		uniques.insert(binning_[i].begin(),binning_[i].end());
+    	}
+
+    	bin_3d.push_back(std::vector<unsigned int>(uniques.begin(),uniques.end()));
+    	return CBCT_binning(bin_3d);
+    }
+
+    void load( std::string filename )
+    {
+      // Open file and make sure it is the expected version
+      //
+
+      hid_t file_id = H5Fopen (filename.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT);
+
+      unsigned int dataformat_version;
+      herr_t errCode;
+      errCode=H5LTread_dataset (file_id, "/binning_dataformat_version", H5T_NATIVE_UINT, &dataformat_version);
+
+      if(errCode < 0)
+        throw std::runtime_error("Error reading /binning_dataformat_version");
+
+      unsigned int needed = 1;
+      if (!(dataformat_version == needed)) {
+        std::cerr << "wrong format of binning file, found: "
+                  << dataformat_version << ", needed: " << needed << std::endl;
+        exit(EXIT_FAILURE);
+      }
+
+      // And get the bins
+      //
+
+      binning_.clear();
+
+      unsigned int numBins;
+      errCode=H5LTread_dataset (file_id, "/numBins", H5T_NATIVE_UINT, &numBins);
+      if(errCode < 0)
+        throw std::runtime_error("Error reading /numBins_dataformat_version");
+      //GDEBUG_STREAM("Found " << numBins << " bins in file" << filename << std::endl);
+
+      // Ok, so this really isn't very elegant.
+      // A folder in the hdf5 file containing the data would be better...
+      //
+
+      for (unsigned int i=1; i<=numBins; i++) {
+        std::stringstream path;
+        path << "/bin_" << i;
+        hsize_t dim;
+        errCode=H5LTget_dataset_info(file_id,path.str().c_str(),&dim,NULL,NULL);
+        if(errCode < 0)
+          throw std::runtime_error("Error reading bin info");
+        binning_.push_back(std::vector<unsigned int>(dim,0.0f));
+        errCode=H5LTread_dataset (file_id, path.str().c_str(), H5T_NATIVE_UINT, &binning_.back()[0]);
+        if(errCode < 0)
+          throw std::runtime_error("Error reading bin data");
+      }
+    }
+
+    void print( std::ostream &os = std::cout )
+    {
+      os << "---------- BINNING DATA ----------" << std::endl;
+      os << "Number of bins: " << binning_.size() << std::endl;
+      for (unsigned int b=0; b<binning_.size(); b++)
+        os << "Number of projections in bin[" << b
+           << "]: " << binning_[b].size() << std::endl;
+      os << "----------------------------------" << std::endl;
+    }
+
+  protected:
+    std::vector< std::vector<unsigned int> > binning_;
+  };
+}
+
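+// Usage sketch (illustrative only; the filename below is an assumption, not part of this toolbox):
+//
+//   CBCT_binning binning;
+//   binning.load("binning.h5");                    // expects /binning_dataformat_version, /numBins, /bin_1../bin_N
+//   binning.print();                               // report the number of bins and projections per bin
+//   CBCT_binning all = binning.get_3d_binning();   // one bin holding the union of all projection indices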
diff --git a/toolboxes/ct/xray/gpu/CMakeLists.txt b/toolboxes/ct/xray/gpu/CMakeLists.txt
new file mode 100644
index 0000000..edfb35b
--- /dev/null
+++ b/toolboxes/ct/xray/gpu/CMakeLists.txt
@@ -0,0 +1,49 @@
+find_package(HDF5 REQUIRED HL)
+
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_GPUXRAY__)
+endif (WIN32)
+
+
+include_directories(
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  ${CUDA_INCLUDE_DIRS}
+  ${Boost_INCLUDE_DIR}
+  ${HDF5_INCLUDE_DIR}
+  ${HDF5_INCLUDE_DIR}/cpp
+  ${ISMRMRD_INCLUDE_DIR}
+  ${ARMADILLO_INCLUDE_DIRS}
+)
+
+cuda_add_library(gadgetron_toolbox_gpuxray SHARED
+  conebeam_projection.cu 
+  hoCuConebeamProjectionOperator.cpp 
+  )
+
+set_target_properties(gadgetron_toolbox_gpuxray PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(
+  gadgetron_toolbox_gpuxray gadgetron_toolbox_gpucore gadgetron_toolbox_gpunfft
+  gadgetron_toolbox_cpucore_math
+  ${CUDA_LIBRARIES}
+)
+
+install(TARGETS gadgetron_toolbox_gpuxray DESTINATION lib COMPONENT main)
+
+install(FILES 
+  CBCT_acquisition.h
+  CBCT_binning.h
+  conebeam_projection.h
+  hoCuConebeamProjectionOperator.h
+  gpuxray_export.h 
+  DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
diff --git a/toolboxes/ct/xray/gpu/conebeam_projection.cu b/toolboxes/ct/xray/gpu/conebeam_projection.cu
new file mode 100644
index 0000000..0774dbd
--- /dev/null
+++ b/toolboxes/ct/xray/gpu/conebeam_projection.cu
@@ -0,0 +1,1151 @@
+//
+// This code performs 3D cone beam CT forwards and backwards projection
+//
+
+#include "conebeam_projection.h"
+#include "float3x3.h"
+#include "hoCuNDArray_math.h"
+#include "vector_td.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_utils.h"
+#include "cuNFFT.h"
+#include "check_CUDA.h"
+#include "GPUTimer.h"
+#include "cudaDeviceManager.h"
+#include "hoNDArray_fileio.h"
+#include "setup_grid.h"
+
+#include <cuda_runtime_api.h>
+#include <math_constants.h>
+#include <cufft.h>
+#include <iostream>
+#include <cmath>
+#include <algorithm>
+#include <vector>
+
+#define PS_ORIGIN_CENTERING
+#define IS_ORIGIN_CENTERING
+//#define FLIP_Z_AXIS
+
+// Read the projection/image data respectively as a texture (for input)
+// - taking advantage of the cache and hardware interpolation
+//
+
+#define NORMALIZED_TC 1
+
+static texture<float, 3, cudaReadModeElementType> 
+image_tex( NORMALIZED_TC, cudaFilterModeLinear, cudaAddressModeBorder );
+
+static texture<float, cudaTextureType2DLayered, cudaReadModeElementType> 
+projections_tex( NORMALIZED_TC, cudaFilterModeLinear, cudaAddressModeBorder );
+
+namespace Gadgetron 
+{
+
+// Utility to convert from degrees to radians
+//
+
+static inline __host__ __device__
+float degrees2radians(float degree) {
+	return degree * (CUDART_PI_F/180.0f);
+}
+
+// Utilities for filtering in frequency space
+//
+
+static boost::shared_ptr< cuNDArray<float_complext> > cb_fft( cuNDArray<float> *data )
+{
+	if( data == 0x0 )
+		throw std::runtime_error("CB FFT : illegal input pointer provided");
+
+	std::vector<size_t> in_dims = *data->get_dimensions();
+	std::vector<size_t> out_dims;
+	out_dims.push_back((in_dims[0]>>1)+1);
+	out_dims.push_back(in_dims[1]);
+	out_dims.push_back(in_dims[2]);
+
+	boost::shared_ptr< cuNDArray<float_complext> > result( new cuNDArray<float_complext>(&out_dims) );
+	cufftHandle plan;
+
+	if( cufftPlanMany( &plan, 1, (int*)(&in_dims[0]), 0x0, 1, in_dims[0], 0x0, 1, out_dims[0], CUFFT_R2C, in_dims[1]*in_dims[2] ) != CUFFT_SUCCESS) {
+		throw std::runtime_error("CB FFT plan failed");
+	}
+
+	if( cufftExecR2C( plan, data->get_data_ptr(), (cuFloatComplex*) result->get_data_ptr() ) != CUFFT_SUCCESS ) {
+		throw std::runtime_error("CB FFT execute failed");;
+	}
+
+	if( cufftDestroy(plan) != CUFFT_SUCCESS) {
+		throw std::runtime_error("CB FFT failed to destroy plan");
+	}
+
+	return result;
+}
+
+static void cb_ifft( cuNDArray<float_complext> *in_data, cuNDArray<float> *out_data )
+{
+	if( in_data == 0x0 || out_data == 0x0 )
+		throw std::runtime_error("CB FFT : illegal input or output pointer provided");
+
+	std::vector<size_t> in_dims = *in_data->get_dimensions();
+	std::vector<size_t> out_dims = *out_data->get_dimensions();
+
+	cufftHandle plan;
+
+	if( cufftPlanMany( &plan, 1, (int*)(&out_dims[0]), 0x0, 1, in_dims[0], 0x0, 1, out_dims[0], CUFFT_C2R, in_dims[1]*in_dims[2] ) != CUFFT_SUCCESS) {
+		throw std::runtime_error("CB iFFT plan failed");
+	}
+
+	if( cufftExecC2R( plan, (cuFloatComplex*) in_data->get_data_ptr(), out_data->get_data_ptr() ) != CUFFT_SUCCESS ) {
+		throw std::runtime_error("CB iFFT execute failed");;
+	}
+
+	if( cufftDestroy(plan) != CUFFT_SUCCESS) {
+		throw std::runtime_error("CB iFFT failed to destroy plan");
+	}
+
+	*out_data /= float(out_dims[0]);
+}
+
+//
+// Redundancy correction for short scan mode
+// - i.e. for less than a full rotation of data
+//
+// See "Optimal short scan convolution reconstruction for fanbeam CT", Dennis Parker, Med. Phys. 9(2) 1982
+// and (for the implementation) "Parker weights revisited", Wesarg et al, Med. Phys. 29(3) 2002.
+//
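+// In brief (a summary of the kernel below, following the Wesarg et al. formulation): S() is a smooth step
+// rising from 0 to 1 over [-1/2, 1/2], B(alpha, delta) = 2*(delta - alpha) + epsilon is the angular width of
+// the redundantly measured region for a ray at fan angle alpha, and b(alpha, delta) = q*B(alpha, delta) sets
+// the width of the smooth transition (q = 1 gives the classic Parker weights). Each projection sample is
+// multiplied by the resulting weight omega(beta, alpha) so that redundant rays sum to unit weight.
+//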
+
+static __device__ const float epsilon = 0.001f;
+
+static __inline__ __device__ float S( float beta )
+{
+	if( beta <= -0.5f ) return 0.0f;
+	else if( beta > -0.5f && beta < 0.5f ) return 0.5f*(1.0f+sinf(CUDART_PI_F*beta));
+	else /*if( beta >= 0.5f )*/ return 1.0f;
+}
+
+static __inline__ __device__ float B( float alpha, float delta )
+{
+	return 2.0f*(delta-alpha)+epsilon;
+}
+
+static __inline__ __device__ float b( float alpha, float delta )
+{
+	const float q = 0.1f; // with q=1 this formula reduces to the conventional Parker weights
+	return q*B(alpha, delta);
+}
+
+__global__ void
+redundancy_correct_kernel( float *projections,
+		const float * __restrict__ angles,
+		uintd3 dims, // Dimensions of the projections array
+		float delta  // The half-fan angle
+)
+{
+	const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+	const unsigned int num_elements = prod(dims);
+
+	if( idx < num_elements ){
+
+		const float in = projections[idx];
+		const uintd3 co = idx_to_co<3>( idx, dims );
+		const float tan_delta = tanf(delta);
+		const float alpha = -atanf((float(co[0])/float(dims[0])-0.5f)*2.0f*tan_delta);
+		const float beta = degrees2radians(angles[co[2]]);
+
+		float omega = 0.5f*(S(beta/b(alpha, delta)-0.5f)+
+				S((beta+2.0f*(alpha-delta)-epsilon)/b(alpha, delta)+0.5f)-
+				S((beta-CUDART_PI_F+2.0f*alpha)/b(-alpha, delta)-0.5f)-
+				S((beta-CUDART_PI_F-2.0f*delta-epsilon)/b(-alpha, delta)+0.5f));
+
+		projections[idx] = in*omega;
+	}
+}
+
+void
+redundancy_correct( cuNDArray<float> *projections,
+		float *angles_DevPtr,
+		float delta // The half-fan angle in radians
+)
+{
+	//
+	// Validate the input
+	//
+
+	if( projections == 0x0 ){
+		throw std::runtime_error("Error: redundancy_correct: illegal array pointer provided");
+	}
+
+	if( projections->get_number_of_dimensions() != 3 ){
+		throw std::runtime_error("Error: redundancy_correct: projections array must be three-dimensional");
+	}
+
+	const size_t projection_res_x = projections->get_size(0);
+	const size_t projection_res_y = projections->get_size(1);
+	const size_t num_projections = projections->get_size(2);
+	uintd3 dims(projection_res_x, projection_res_y, num_projections);
+
+	// Launch kernel
+	//
+
+	dim3 dimBlock, dimGrid;
+	setup_grid( prod(dims), &dimBlock, &dimGrid );
+
+	redundancy_correct_kernel<<< dimGrid, dimBlock >>>( projections->get_data_ptr(), angles_DevPtr, dims, delta );
+	CHECK_FOR_CUDA_ERROR();
+}
+
+
+/***
+ * Redundancy (or offset) correction from Wang. Med. Phys 2002, doi: 10.1118/1.1489043
+ */
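+// Summary of the kernel below: within the doubly sampled strip of an offset (half-fan) detector,
+// i.e. where |t| <= omega with omega = FOV_x/2 - |offset_x|, each projection value is feathered by
+// w(t) = sin^2( pi*(t + omega) / (4*omega) ); pixels outside that strip are left unchanged.
+//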
+__global__ static void
+offset_correct_kernel( float *projections,
+		const floatd2 * __restrict__ offsets,
+		uintd3 dims, // Dimensions of the projections array
+		floatd2 phys_dims, // Physical dimensions in mm
+		float SAD, // Source origin distance
+		float SDD // Source detector distance
+)
+{
+	const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+	const unsigned int num_elements = prod(dims);
+
+	if( idx < num_elements ){
+
+		const uintd3 co = idx_to_co<3>( idx, dims );
+		const floatd2 offset = offsets[co[2]];
+		const float t = phys_dims[0]*(float(co[0])/(float(dims[0]))-0.5f)+offset[0];
+		const float omega = phys_dims[0]/2.0f-fabs(offset[0]);
+		//const float omega = phys_dims[0]*float(co[0])/(2.0f*float(dims[0]));
+
+		if( fabs(t) <= fabs(omega) ){
+			//float w = 0.5*sinf(CUDART_PI_F*atanf(t/SDD)/(2.0f*atanf(omega/SDD)))+0.5;
+			float sqrt_w = sinf(CUDART_PI_F*(t+omega)/(4.0f*omega));
+			float w = sqrt_w*sqrt_w;
+			projections[idx] *= w;
+		}
+	}
+}
+
+static void
+offset_correct( cuNDArray<float> *projections,
+		floatd2* offsets, // Ptr to cuda array
+		floatd2 phys_dims,
+		float SAD, // Source origin distance
+		float SDD // Source detector distance
+)
+{
+	//
+	// Validate the input
+	//
+
+	if( projections == 0x0 ){
+		throw std::runtime_error("Error: offset_correct: illegal array pointer provided");
+	}
+
+	if( projections->get_number_of_dimensions() != 3 ){
+		throw std::runtime_error("Error: offset_correct: projections array must be three-dimensional");
+	}
+
+	const size_t projection_res_x = projections->get_size(0);
+	const size_t projection_res_y = projections->get_size(1);
+	const size_t num_projections = projections->get_size(2);
+	uintd3 dims(projection_res_x, projection_res_y, num_projections);
+
+	// Launch kernel
+	//
+
+	dim3 dimBlock, dimGrid;
+	setup_grid( prod(dims), &dimBlock, &dimGrid );
+
+	offset_correct_kernel<<< dimGrid, dimBlock >>>( projections->get_data_ptr(), offsets, dims, phys_dims, SAD, SDD );
+	CHECK_FOR_CUDA_ERROR();
+}
+
+
+/***
+ * Redundancy (or offset) correction from Wang. Med. Phys 2002, doi: 10.1118/1.1489043
+ */
+__global__ static void
+offset_correct_kernel_sqrt( float *projections,
+		const floatd2 * __restrict__ offsets,
+		uintd3 dims, // Dimensions of the projections array
+		floatd2 phys_dims, // Physical dimensions in mm
+		float SAD, // Source origin distance
+		float SDD // Source detector distance
+)
+{
+	const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+	const unsigned int num_elements = prod(dims);
+
+	if( idx < num_elements ){
+
+		const uintd3 co = idx_to_co<3>( idx, dims );
+		const floatd2 offset = offsets[co[2]];
+		const float t = phys_dims[0]*(float(co[0])/(float(dims[0]))-0.5f)+offset[0];
+		const float omega = phys_dims[0]/2.0f-fabs(offset[0]);
+		//const float omega = phys_dims[0]*float(co[0])/(2.0f*float(dims[0]));
+
+		if( fabs(t) <= fabs(omega) ){
+			//float w = 0.5*sinf(CUDART_PI_F*atanf(t/SDD)/(2.0f*atanf(omega/SDD)))+0.5;
+			float sqrt_w = sinf(CUDART_PI_F*(t+omega)/(4.0f*omega));
+			projections[idx] *= sqrt_w;
+		}
+	}
+}
+
+static void
+offset_correct_sqrt( cuNDArray<float> *projections,
+		floatd2* offsets, // Ptr to cuda array
+		floatd2 phys_dims,
+		float SAD, // Source origin distance
+		float SDD // Source detector distance
+)
+{
+	//
+	// Validate the input
+	//
+
+	if( projections == 0x0 ){
+		throw std::runtime_error("Error: offset_correct: illegal array pointer provided");
+	}
+
+	if( projections->get_number_of_dimensions() != 3 ){
+		throw std::runtime_error("Error: offset_correct: projections array must be three-dimensional");
+	}
+
+	const size_t projection_res_x = projections->get_size(0);
+	const size_t projection_res_y = projections->get_size(1);
+	const size_t num_projections = projections->get_size(2);
+	uintd3 dims(projection_res_x, projection_res_y, num_projections);
+
+	// Launch kernel
+	//
+
+	dim3 dimBlock, dimGrid;
+	setup_grid( prod(dims), &dimBlock, &dimGrid );
+
+	offset_correct_kernel_sqrt<<< dimGrid, dimBlock >>>( projections->get_data_ptr(), offsets, dims, phys_dims, SAD, SDD );
+	CHECK_FOR_CUDA_ERROR();
+}
+
+
+void apply_offset_correct(hoCuNDArray<float>* projections,std::vector<floatd2>& offsets,		floatd2 ps_dims_in_mm, float SDD,	float SAD){
+
+	std::vector<size_t> dims = *projections->get_dimensions();
+	size_t projection_size = dims[0]*dims[1];
+
+
+	thrust::device_vector<floatd2> offsets_devVec(offsets);
+	//Calculate the number of projections that fit in free device memory (rounded down to a multiple of 1024)
+	size_t batch_size = (1024)*(cudaDeviceManager::Instance()->getFreeMemory()/(1024*projection_size*sizeof(float)));
+	size_t remaining = dims[2];
+	size_t num_batches = (dims[2]+batch_size-1)/batch_size;
+
+	for (unsigned int i = 0; i < num_batches; i++){
+		std::vector<size_t> projection_dims = dims;
+		projection_dims[2] = std::min(remaining,batch_size);
+		//Make a view of the current batch of projections
+		hoCuNDArray<float> projections_view(projection_dims, projections->get_data_ptr()+i*batch_size*projection_size);
+		cuNDArray<float> cu_projections(projections_view); //Copy to device
+		floatd2* cu_offsets = thrust::raw_pointer_cast(&offsets_devVec[i*batch_size]);
+		offset_correct_sqrt(&cu_projections,cu_offsets,ps_dims_in_mm,SAD,SDD);
+
+		cudaMemcpy(projections_view.get_data_ptr(),cu_projections.get_data_ptr(),cu_projections.get_number_of_bytes(),cudaMemcpyDeviceToHost);
+		remaining -= batch_size;
+	}
+}
+
+//
+// Forwards projection
+//
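+// For each detector pixel a ray is traced from the (rotated) source position through the pixel,
+// restricted to a bounding region around the image volume, and sampled num_samples_per_ray times
+// from the 3D image texture; the accumulated samples are scaled by the sampling distance to
+// approximate the line integral.
+//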
+
+__global__ void
+conebeam_forwards_projection_kernel( float * __restrict__ projections,
+		float * __restrict__ angles,
+		floatd2 *offsets,
+		floatd3 is_dims_in_pixels,
+		floatd3 is_dims_in_mm,
+		intd2 ps_dims_in_pixels_int,
+		floatd2 ps_dims_in_mm,
+		int num_projections,
+		float SDD,
+		float SAD,
+		int num_samples_per_ray )
+{
+	const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+	const int num_elements = prod(ps_dims_in_pixels_int)*num_projections;
+
+	if( idx < num_elements){
+
+		const intd3 co = idx_to_co<3>( idx, intd3(ps_dims_in_pixels_int[0], ps_dims_in_pixels_int[1], num_projections) );
+
+		// Projection space dimensions and spacing
+		//
+
+		const floatd2 ps_dims_in_pixels = floatd2(ps_dims_in_pixels_int[0], ps_dims_in_pixels_int[1]);
+		const floatd2 ps_spacing = ps_dims_in_mm / ps_dims_in_pixels;
+
+		// Determine projection angle and rotation matrix
+		//
+
+		const float angle = angles[co[2]];
+		const float3x3 rotation = calcRotationMatrixAroundZ(degrees2radians(angle));
+
+		// Find start and end point for the line integral (image space)
+		//
+
+		floatd3 startPoint = floatd3(0.0f, -SAD, 0.0f);
+		startPoint = mul(rotation, startPoint);
+
+		// Projection plate indices
+		//
+
+#ifdef PS_ORIGIN_CENTERING
+		const floatd2 ps_pc = floatd2(co[0], co[1]) + floatd2(0.5);
+#else
+		const floatd2 ps_pc = floatd2(co[0], co[1]);
+#endif
+
+		// Convert the projection plate coordinates into image space,
+		// - local to the plate in metric units
+		// - including half-fan and sag correction
+		//
+
+		const floatd2 proj_coords = (ps_pc / ps_dims_in_pixels - 0.5f) * ps_dims_in_mm + offsets[co[2]];
+
+		// Define the end point for the line integrals
+		//
+
+		const float ADD = SDD - SAD; // in mm.
+		floatd3 endPoint = floatd3(proj_coords[0], ADD, proj_coords[1]);
+		endPoint = mul(rotation, endPoint);
+
+		// Find direction vector of the line integral
+		//
+
+		floatd3 dir = endPoint-startPoint;
+
+		// Perform integration only inside the bounding cylinder of the image volume
+		//
+
+		const floatd3 vec_over_dir = (is_dims_in_mm-startPoint)/dir;
+		const floatd3 vecdiff_over_dir = (-is_dims_in_mm-startPoint)/dir;
+		const floatd3 start = amin(vecdiff_over_dir, vec_over_dir);
+		const floatd3 end   = amax(vecdiff_over_dir, vec_over_dir);
+
+		float a1 = fmax(max(start),0.0f);
+		float aend = fmin(min(end),1.0f);
+		startPoint += a1*dir;
+
+		const float sampling_distance = norm((aend-a1)*dir)/num_samples_per_ray;
+
+		// Now perform conversion of the line integral start/end into voxel coordinates
+		//
+
+		startPoint /= is_dims_in_mm;
+#ifdef FLIP_Z_AXIS
+		startPoint[2] *= -1.0f;
+#endif
+		startPoint += 0.5f;
+		dir /= is_dims_in_mm;
+#ifdef FLIP_Z_AXIS
+		dir[2] *= -1.0f;
+#endif
+		dir /= float(num_samples_per_ray); // now in step size units
+
+		//
+		// Perform line integration
+		//
+
+		float result = 0.0f;
+
+		for ( int sampleIndex = 0; sampleIndex<num_samples_per_ray; sampleIndex++) {
+
+#ifndef IS_ORIGIN_CENTERING
+			floatd3 samplePoint = startPoint+dir*float(sampleIndex) + floatd3(0.5f)/is_dims_in_pixels;
+#else
+			floatd3 samplePoint = startPoint+dir*float(sampleIndex);
+#endif
+
+			// Accumulate result
+			//
+
+			result += tex3D( image_tex, samplePoint[0], samplePoint[1], samplePoint[2] );
+		}
+
+		// Output (normalized to the length of the ray)
+		//
+
+		projections[idx] = result*sampling_distance;
+	}
+}
+
+//
+// Forwards projection of a 3D volume onto a set of (binned) projections
+//
+
+void
+conebeam_forwards_projection( hoCuNDArray<float> *projections,
+		hoCuNDArray<float> *image,
+		std::vector<float> angles,
+		std::vector<floatd2> offsets,
+		std::vector<unsigned int> indices,
+		int projections_per_batch,
+		float samples_per_pixel,
+		floatd3 is_dims_in_mm,
+		floatd2 ps_dims_in_mm,
+		float SDD,
+		float SAD)
+{
+	//
+	// Validate the input
+	//
+
+	if( projections == 0x0 || image == 0x0 ){
+		throw std::runtime_error("Error: conebeam_forwards_projection: illegal array pointer provided");
+	}
+
+	if( projections->get_number_of_dimensions() != 3 ){
+		throw std::runtime_error("Error: conebeam_forwards_projection: projections array must be three-dimensional");
+	}
+
+	if( image->get_number_of_dimensions() != 3 ){
+		throw std::runtime_error("Error: conebeam_forwards_projection: image array must be three-dimensional");
+	}
+
+	if( projections->get_size(2) != angles.size() || projections->get_size(2) != offsets.size() ) {
+		throw std::runtime_error("Error: conebeam_forwards_projection: inconsistent sizes of input arrays/vectors");
+	}
+
+	int projection_res_x = projections->get_size(0);
+	int projection_res_y = projections->get_size(1);
+
+	int num_projections_in_bin = indices.size();
+	int num_projections_in_all_bins = projections->get_size(2);
+
+	int matrix_size_x = image->get_size(0);
+	int matrix_size_y = image->get_size(1);
+	int matrix_size_z = image->get_size(2);
+
+	hoCuNDArray<float> *int_projections = projections;
+
+	if( projections_per_batch > num_projections_in_bin )
+		projections_per_batch = num_projections_in_bin;
+
+	int num_batches = (num_projections_in_bin+projections_per_batch-1) / projections_per_batch;
+
+	// Build texture from input image
+	//
+
+	cudaFuncSetCacheConfig(conebeam_forwards_projection_kernel, cudaFuncCachePreferL1);
+	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+	cudaExtent extent;
+	extent.width = matrix_size_x;
+	extent.height = matrix_size_y;
+	extent.depth = matrix_size_z;
+
+	cudaMemcpy3DParms cpy_params = {0};
+	cpy_params.kind = cudaMemcpyHostToDevice;
+	cpy_params.extent = extent;
+
+	cudaArray *image_array;
+	cudaMalloc3DArray(&image_array, &channelDesc, extent);
+	CHECK_FOR_CUDA_ERROR();
+
+	cpy_params.dstArray = image_array;
+	cpy_params.srcPtr = make_cudaPitchedPtr
+			((void*)image->get_data_ptr(), extent.width*sizeof(float), extent.width, extent.height);
+	cudaMemcpy3D(&cpy_params);
+	CHECK_FOR_CUDA_ERROR();
+
+	cudaBindTextureToArray(image_tex, image_array, channelDesc);
+	CHECK_FOR_CUDA_ERROR();
+
+	// Allocate the angles, offsets and projections in device memory
+	//
+
+	float *projections_DevPtr, *projections_DevPtr2;
+	cudaMalloc( (void**) &projections_DevPtr, projection_res_x*projection_res_y*projections_per_batch*sizeof(float));
+	cudaMalloc( (void**) &projections_DevPtr2, projection_res_x*projection_res_y*projections_per_batch*sizeof(float));
+
+	cudaStream_t mainStream, indyStream;
+	cudaStreamCreate(&mainStream);
+	cudaStreamCreate(&indyStream);
+
+	std::vector<float> angles_vec;
+	std::vector<floatd2> offsets_vec;
+
+	for( int p=0; p<indices.size(); p++ ){
+
+		int from_id = indices[p];
+
+		if( from_id >= num_projections_in_all_bins ) {
+			throw std::runtime_error("Error: conebeam_forwards_projection: illegal index in bin");
+		}
+
+		angles_vec.push_back(angles[from_id]);
+		offsets_vec.push_back(offsets[from_id]);
+	}
+
+	thrust::device_vector<float> angles_devVec(angles_vec);
+	thrust::device_vector<floatd2> offsets_devVec(offsets_vec);
+
+	//
+	// Iterate over the batches
+	//
+
+	for (unsigned int batch=0; batch<num_batches; batch++ ){
+
+		int from_projection = batch * projections_per_batch;
+		int to_projection = (batch+1) * projections_per_batch;
+
+		if (to_projection > num_projections_in_bin)
+			to_projection = num_projections_in_bin;
+
+		int projections_in_batch = to_projection-from_projection;
+
+		// Block/grid configuration
+		//
+
+		dim3 dimBlock, dimGrid;
+		setup_grid( projection_res_x*projection_res_y*projections_in_batch, &dimBlock, &dimGrid );
+
+		// Launch kernel
+		//
+
+		floatd3 is_dims_in_pixels(matrix_size_x, matrix_size_y, matrix_size_z);
+		intd2 ps_dims_in_pixels(projection_res_x, projection_res_y);
+
+		float* raw_angles = thrust::raw_pointer_cast(&angles_devVec[from_projection]);
+		floatd2* raw_offsets = thrust::raw_pointer_cast(&offsets_devVec[from_projection]);
+
+		conebeam_forwards_projection_kernel<<< dimGrid, dimBlock, 0, mainStream >>>
+				( projections_DevPtr, raw_angles, raw_offsets,
+						is_dims_in_pixels, is_dims_in_mm, ps_dims_in_pixels, ps_dims_in_mm,
+						projections_in_batch, SDD, SAD, samples_per_pixel*float(matrix_size_x) );
+
+		// Copy this batch's result back into the host projections array.
+		// The copy is queued on the same stream as the kernel, so it only runs once the kernel has finished.
+		//
+
+		int p = from_projection;
+		while( p<to_projection) {
+
+			int num_sequential_projections = 1;
+			while( p+num_sequential_projections < to_projection &&
+					indices[p+num_sequential_projections]==(indices[p+num_sequential_projections-1]+1) ){
+				num_sequential_projections++;
+			}
+
+			int to_id = indices[p];
+			int size = projection_res_x*projection_res_y;
+
+			cudaMemcpyAsync( int_projections->get_data_ptr()+to_id*size,
+					projections_DevPtr+(p-from_projection)*size,
+					size*num_sequential_projections*sizeof(float),
+					cudaMemcpyDeviceToHost, mainStream);
+
+			p += num_sequential_projections;
+		}
+
+		std::swap(projections_DevPtr, projections_DevPtr2);
+		std::swap(mainStream, indyStream);
+	}
+
+	cudaFree(projections_DevPtr2);
+	cudaFree(projections_DevPtr);
+	cudaFreeArray(image_array);
+
+	CUDA_CALL(cudaStreamDestroy(indyStream));
+	CUDA_CALL(cudaStreamDestroy(mainStream));
+	CHECK_FOR_CUDA_ERROR();
+
+}
+
+template <bool FBP> __global__ void
+conebeam_backwards_projection_kernel( float * __restrict__ image,
+		const float * __restrict__ angles,
+		floatd2 *offsets,
+		intd3 is_dims_in_pixels_int,
+		floatd3 is_dims_in_mm,
+		floatd2 ps_dims_in_pixels,
+		floatd2 ps_dims_in_mm,
+		int num_projections_in_batch,
+		float num_projections_in_bin,
+		float SDD,
+		float SAD,
+		bool accumulate )
+{
+	// Image voxel to backproject into (pixel coordinate and index)
+	//
+
+	const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+	const int num_elements = prod(is_dims_in_pixels_int);
+
+	if( idx < num_elements ){
+
+		const intd3 co = idx_to_co<3>(idx, is_dims_in_pixels_int);
+
+#ifdef IS_ORIGIN_CENTERING
+		const floatd3 is_pc = floatd3(co[0], co[1], co[2]) + floatd3(0.5);
+#else
+		const floatd3 is_pc = floatd3(co[0], co[1], co[2]);
+#endif
+
+		// Normalized image space coordinate [-0.5, 0.5[
+		//
+
+		const floatd3 is_dims_in_pixels(is_dims_in_pixels_int[0],is_dims_in_pixels_int[1],is_dims_in_pixels_int[2]);
+
+#ifdef FLIP_Z_AXIS
+		floatd3 is_nc = is_pc / is_dims_in_pixels - floatd3(0.5f);
+		is_nc[2] *= -1.0f;
+#else
+		const floatd3 is_nc = is_pc / is_dims_in_pixels - floatd3(0.5f);
+#endif
+
+		// Image space coordinate in metric units
+		//
+
+		const floatd3 pos = is_nc * is_dims_in_mm;
+
+		// Read the existing output value for accumulation at this point.
+		// The cost of this fetch is hidden by the loop
+
+		const float incoming = (accumulate) ? image[idx] : 0.0f;
+
+		// Backprojection loop
+		//
+
+		float result = 0.0f;
+
+		for( int projection = 0; projection < num_projections_in_batch; projection++ ) {
+
+			// Projection angle
+			//
+
+			const float angle = degrees2radians(angles[projection]);
+
+			// Projection rotation matrix
+			//
+
+			const float3x3 inverseRotation = calcRotationMatrixAroundZ(-angle);
+
+			// Rotated image coordinate (local to the projection's coordinate system)
+			//
+
+			const floatd3 pos_proj = mul(inverseRotation, pos);
+
+			// Project the image position onto the projection plate.
+			// Account for half-fan and sag offsets.
+			//
+
+			const floatd3 startPoint = floatd3(0.0f, -SAD, 0.0f);
+			floatd3 dir = pos_proj - startPoint;
+			dir = dir / dir[1];
+			const floatd3 endPoint = startPoint + dir * SDD;
+			const floatd2 endPoint2d = floatd2(endPoint[0], endPoint[2]) - offsets[projection];
+
+			// Convert metric projection coordinates into pixel coordinates
+			//
+
+#ifndef PS_ORIGIN_CENTERING
+			floatd2 ps_pc = ((endPoint2d / ps_dims_in_mm) + floatd2(0.5f)) + floatd2(0.5f)/ps_dims_in_pixels;
+			//floatd2 ps_pc = ((endPoint2d / ps_dims_in_mm) + floatd2(0.5f)) * ps_dims_in_pixels + floatd2(0.5f);
+#else
+			floatd2 ps_pc = ((endPoint2d / ps_dims_in_mm) + floatd2(0.5f));
+#endif
+
+			// Apply filter (filtered backprojection mode only)
+			//
+
+			float weight = 1.0;
+
+			if( FBP ){
+
+				// Equation 3.59, page 96 and equation 10.2, page 386
+				// in Computed Tomography 2nd edition, Jiang Hsieh
+				//
+
+				const float xx = pos[0];
+				const float yy = pos[1];
+				const float beta = angle;
+				const float r = hypotf(xx,yy);
+				const float phi = atan2f(yy,xx);
+				const float D = SAD;
+				const float ym = r*sinf(beta-phi);
+				const float U = (D+ym)/D;
+				weight = 1.0f/(U*U);
+			}
+
+			// Read the projection data (bilinear interpolation enabled) and accumulate
+			//
+
+			result +=  weight * tex2DLayered( projections_tex, ps_pc[0], ps_pc[1], projection );
+		}
+
+		// Output normalized image
+		//
+
+		image[idx] = incoming + result / num_projections_in_bin;
+	}
+}
+
+//
+// Backprojection
+//
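+// Each image voxel is projected through every projection's geometry onto the detector plane; the
+// projection data are read with bilinear interpolation from a layered 2D texture and accumulated,
+// optionally applying the fan-beam distance weight 1/U^2 in filtered backprojection (FBP) mode.
+// The host function below streams the projection data to the device in batches (see the note on
+// double buffering further down).
+//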
+
+template <bool FBP>
+void conebeam_backwards_projection( hoCuNDArray<float> *projections,
+		hoCuNDArray<float> *image,
+		std::vector<float> angles,
+		std::vector<floatd2> offsets,
+		std::vector<unsigned int> indices,
+		int projections_per_batch,
+		intd3 is_dims_in_pixels,
+		floatd3 is_dims_in_mm,
+		floatd2 ps_dims_in_mm,
+		float SDD,
+		float SAD,
+		bool short_scan,
+		bool use_offset_correction,
+		bool accumulate,
+		cuNDArray<float> *cosine_weights,
+		cuNDArray<float> *frequency_filter
+)
+{
+	//
+	// Validate the input
+	//
+
+	if( projections == 0x0 || image == 0x0 ){
+		throw std::runtime_error("Error: conebeam_backwards_projection: illegal array pointer provided");
+	}
+
+	if( projections->get_number_of_dimensions() != 3 ){
+		throw std::runtime_error("Error: conebeam_backwards_projection: projections array must be three-dimensional");
+	}
+
+	if( image->get_number_of_dimensions() != 3 ){
+		throw std::runtime_error("Error: conebeam_backwards_projection: image array must be three-dimensional");
+	}
+
+	if( projections->get_size(2) != angles.size() || projections->get_size(2) != offsets.size() ) {
+		throw std::runtime_error("Error: conebeam_backwards_projection: inconsistent sizes of input arrays/vectors");
+	}
+
+	if( FBP && !(cosine_weights && frequency_filter) ){
+		throw std::runtime_error("Error: conebeam_backwards_projection: for _filtered_ backprojection both cosine weights and a filter must be provided");
+	}
+
+	// Some utility variables
+	//
+
+	int matrix_size_x = image->get_size(0);
+	int matrix_size_y = image->get_size(1);
+	int matrix_size_z = image->get_size(2);
+
+	floatd3 is_dims(matrix_size_x, matrix_size_y, matrix_size_z);
+	int num_image_elements = matrix_size_x*matrix_size_y*matrix_size_z;
+
+	int projection_res_x = projections->get_size(0);
+	int projection_res_y = projections->get_size(1);
+
+	floatd2 ps_dims_in_pixels(projection_res_x, projection_res_y);
+
+	int num_projections_in_all_bins = projections->get_size(2);
+	int num_projections_in_bin = indices.size();
+
+	if( projections_per_batch > num_projections_in_bin )
+		projections_per_batch = num_projections_in_bin;
+
+	int num_batches = (num_projections_in_bin+projections_per_batch-1) / projections_per_batch;
+
+	// Allocate device memory for the backprojection result
+	//
+
+	boost::shared_ptr< cuNDArray<float> > image_device;
+
+	if( accumulate ){
+		image_device = boost::shared_ptr< cuNDArray<float> >(new cuNDArray<float>(image));
+	}
+	else{
+		image_device = boost::shared_ptr< cuNDArray<float> >(new cuNDArray<float>(image->get_dimensions().get()));
+	}
+
+	// Allocate the angles, offsets and projections in device memory
+	//
+
+	float *projections_DevPtr, *projections_DevPtr2;
+	cudaMalloc( (void**) &projections_DevPtr, projection_res_x*projection_res_y*projections_per_batch*sizeof(float));
+	cudaMalloc( (void**) &projections_DevPtr2, projection_res_x*projection_res_y*projections_per_batch*sizeof(float));
+
+	cudaStream_t mainStream, indyStream;
+	cudaStreamCreate(&mainStream);
+	cudaStreamCreate(&indyStream);
+
+	std::vector<float> angles_vec;
+	std::vector<floatd2> offsets_vec;
+
+	for( int p=0; p<indices.size(); p++ ){
+
+		int from_id = indices[p];
+
+		if( from_id >= num_projections_in_all_bins ) {
+			throw std::runtime_error("Error: conebeam_backwards_projection: illegal index in bin");
+		}
+
+		angles_vec.push_back(angles[from_id]);
+		offsets_vec.push_back(offsets[from_id]);
+	}
+
+	thrust::device_vector<float> angles_devVec(angles_vec);
+	thrust::device_vector<floatd2> offsets_devVec(offsets_vec);
+
+	// From/to for the first batch
+	// - to enable working streams...
+	//
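+	// Double buffering: two device buffers and two streams are used so that, while the kernel for
+	// batch N runs on mainStream, the projections of batch N+1 are uploaded on indyStream into the
+	// other buffer; buffers and streams are swapped at the end of every iteration.
+	//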
+
+	int from_projection = 0;
+	int to_projection = projections_per_batch;
+
+	if (to_projection > num_projections_in_bin )
+		to_projection = num_projections_in_bin;
+
+	int projections_in_batch = to_projection-from_projection;
+
+	std::vector<size_t> dims;
+	dims.push_back(projection_res_x);
+	dims.push_back(projection_res_y);
+	dims.push_back(projections_in_batch);
+
+	std::vector<size_t> dims_next;
+
+	cuNDArray<float> *projections_batch = new cuNDArray<float>(&dims, projections_DevPtr);
+
+	// Upload first projections batch adhering to the binning.
+	// Be sure to copy sequentially numbered projections in one copy operation.
+	//
+
+	{
+		int p = from_projection;
+
+		while( p<to_projection ) {
+
+			int num_sequential_projections = 1;
+			while( p+num_sequential_projections < to_projection &&
+					indices[p+num_sequential_projections]==(indices[p+num_sequential_projections-1]+1) ){
+				num_sequential_projections++;
+			}
+
+			int from_id = indices[p];
+			int size = projection_res_x*projection_res_y;
+
+			cudaMemcpyAsync( projections_batch->get_data_ptr()+(p-from_projection)*size,
+					projections->get_data_ptr()+from_id*size,
+					size*num_sequential_projections*sizeof(float), cudaMemcpyHostToDevice, mainStream );
+
+			CHECK_FOR_CUDA_ERROR();
+
+			p += num_sequential_projections;
+		}
+	}
+
+	//
+	// Iterate over batches
+	//
+
+	for( int batch = 0; batch < num_batches; batch++ ) {
+
+		from_projection = batch * projections_per_batch;
+		to_projection = (batch+1) * projections_per_batch;
+
+		if (to_projection > num_projections_in_bin )
+			to_projection = num_projections_in_bin;
+
+		projections_in_batch = to_projection-from_projection;
+
+		float* raw_angles = thrust::raw_pointer_cast(&angles_devVec[from_projection]);
+		floatd2* raw_offsets = thrust::raw_pointer_cast(&offsets_devVec[from_projection]);
+
+
+		if( FBP ){
+
+			// Apply cosine weighting : "SDD / sqrt(SDD*SDD + u*u + v*v)"
+			// - with (u,v) positions given in metric units on a virtual detector at the origin
+			//
+
+			*projections_batch *= *cosine_weights;
+
+			// Redundancy correct
+			// - for short scan mode
+			//
+
+			if( short_scan ){
+				float delta = std::atan(ps_dims_in_mm[0]/(2.0f*SDD));
+				redundancy_correct( projections_batch, raw_angles, delta );
+			}
+
+			// Apply frequency filter
+			// - use zero padding to avoid the cyclic boundary conditions induced by the fft
+			//
+
+			std::vector<size_t> batch_dims = *projections_batch->get_dimensions();
+			uint64d3 pad_dims(batch_dims[0]<<1, batch_dims[1], batch_dims[2]);
+			boost::shared_ptr< cuNDArray<float> > padded_projections = pad<float,3>( pad_dims, projections_batch );
+			boost::shared_ptr< cuNDArray<complext<float> > > complex_projections = cb_fft( padded_projections.get() );
+			*complex_projections *= *frequency_filter;
+			cb_ifft( complex_projections.get(), padded_projections.get() );
+			uint64d3 crop_offsets(batch_dims[0]>>1, 0, 0);
+			crop<float,3>( crop_offsets, padded_projections.get(), projections_batch );
+
+			// Apply offset correction
+					// - for half fan mode, sag correction etc.
+					//
+			if (use_offset_correction)
+				offset_correct( projections_batch, raw_offsets, ps_dims_in_mm, SAD, SDD );
+
+
+		} else if (use_offset_correction)
+			offset_correct_sqrt( projections_batch, raw_offsets, ps_dims_in_mm, SAD, SDD );
+
+		// Build array for input texture
+		//
+
+		cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+		cudaExtent extent;
+		extent.width = projection_res_x;
+		extent.height = projection_res_y;
+		extent.depth = projections_in_batch;
+
+		cudaArray *projections_array;
+		cudaMalloc3DArray( &projections_array, &channelDesc, extent, cudaArrayLayered );
+		CHECK_FOR_CUDA_ERROR();
+
+		cudaMemcpy3DParms cpy_params = {0};
+		cpy_params.extent = extent;
+		cpy_params.dstArray = projections_array;
+		cpy_params.kind = cudaMemcpyDeviceToDevice;
+		cpy_params.srcPtr =
+				make_cudaPitchedPtr( (void*)projections_batch->get_data_ptr(), projection_res_x*sizeof(float),
+						projection_res_x, projection_res_y );
+		cudaMemcpy3DAsync( &cpy_params, mainStream );
+		CHECK_FOR_CUDA_ERROR();
+
+		cudaBindTextureToArray( projections_tex, projections_array, channelDesc );
+		CHECK_FOR_CUDA_ERROR();
+
+		// Upload projections for the next batch
+		// - to enable streaming
+		//
+
+		if( batch < num_batches-1 ){ // for using multiple streams to hide the cost of the uploads
+
+			int from_projection_next = (batch+1) * projections_per_batch;
+			int to_projection_next = (batch+2) * projections_per_batch;
+
+			if (to_projection_next > num_projections_in_bin )
+				to_projection_next = num_projections_in_bin;
+
+			int projections_in_batch_next = to_projection_next-from_projection_next;
+
+			// printf("batch: %03i, handling projections: %03i - %03i, angles: %.2f - %.2f\n",
+			//	 batch+1, from_projection_next, to_projection_next-1, angles[from_projection_next], angles[to_projection_next-1]);
+
+			// Allocate device memory for projections and upload
+			//
+
+			dims_next.clear();
+			dims_next.push_back(projection_res_x);
+			dims_next.push_back(projection_res_y);
+			dims_next.push_back(projections_in_batch_next);
+
+			cuNDArray<float> projections_batch_next(&dims, projections_DevPtr2);
+
+			// Upload projections adhering to the binning.
+			// Be sure to copy sequentially numbered projections in one copy operation.
+			//
+
+			int p = from_projection_next;
+
+			while( p<to_projection_next ) {
+
+				int num_sequential_projections = 1;
+				while( p+num_sequential_projections < to_projection_next &&
+						indices[p+num_sequential_projections]==(indices[p+num_sequential_projections-1]+1) ){
+					num_sequential_projections++;
+				}
+
+				int from_id = indices[p];
+				int size = projection_res_x*projection_res_y;
+
+				cudaMemcpyAsync( projections_batch_next.get_data_ptr()+(p-from_projection_next)*size,
+						projections->get_data_ptr()+from_id*size,
+						size*num_sequential_projections*sizeof(float), cudaMemcpyHostToDevice, indyStream );
+
+				CHECK_FOR_CUDA_ERROR();
+
+				p += num_sequential_projections;
+			}
+		}
+
+		// Define dimensions of grid/blocks.
+		//
+
+		dim3 dimBlock, dimGrid;
+		setup_grid( matrix_size_x*matrix_size_y*matrix_size_z, &dimBlock, &dimGrid );
+
+		// Invoke kernel
+		//
+
+		cudaFuncSetCacheConfig(conebeam_backwards_projection_kernel<FBP>, cudaFuncCachePreferL1);
+
+		conebeam_backwards_projection_kernel<FBP><<< dimGrid, dimBlock, 0, mainStream >>>
+				( image_device->get_data_ptr(), raw_angles, raw_offsets,
+						is_dims_in_pixels, is_dims_in_mm, ps_dims_in_pixels, ps_dims_in_mm,
+						projections_in_batch, num_projections_in_bin, SDD, SAD, (batch==0) ? accumulate : true );
+
+		CHECK_FOR_CUDA_ERROR();
+
+		// Cleanup
+		//
+
+		cudaUnbindTexture(projections_tex);
+		cudaFreeArray(projections_array);
+		CHECK_FOR_CUDA_ERROR();
+
+		std::swap(projections_DevPtr, projections_DevPtr2);
+		std::swap(mainStream, indyStream);
+
+		delete projections_batch;
+		if( batch < num_batches-1 )
+			projections_batch = new cuNDArray<float>(&dims_next, projections_DevPtr);
+	}
+
+	// Copy result from device to host
+	//
+
+	cudaMemcpy( image->get_data_ptr(), image_device->get_data_ptr(),
+			num_image_elements*sizeof(float), cudaMemcpyDeviceToHost );
+
+	CHECK_FOR_CUDA_ERROR();
+
+	cudaFree(projections_DevPtr2);
+	cudaFree(projections_DevPtr);
+	CUDA_CALL(cudaStreamDestroy(indyStream));
+	CUDA_CALL(cudaStreamDestroy(mainStream));
+	CHECK_FOR_CUDA_ERROR();
+}
+
+// Template instantiations
+//
+
+template void conebeam_backwards_projection<false>
+( hoCuNDArray<float>*, hoCuNDArray<float>*, std::vector<float>, std::vector<floatd2>, std::vector<unsigned int>,
+		int, intd3, floatd3, floatd2, float, float, bool, bool, bool, cuNDArray<float>*, cuNDArray<float>* );
+
+template void conebeam_backwards_projection<true>
+( hoCuNDArray<float>*, hoCuNDArray<float>*, std::vector<float>, std::vector<floatd2>, std::vector<unsigned int>,
+		int, intd3, floatd3, floatd2, float, float, bool, bool, bool, cuNDArray<float>*, cuNDArray<float>* );
+}
diff --git a/toolboxes/ct/xray/gpu/conebeam_projection.h b/toolboxes/ct/xray/gpu/conebeam_projection.h
new file mode 100644
index 0000000..ffea9c4
--- /dev/null
+++ b/toolboxes/ct/xray/gpu/conebeam_projection.h
@@ -0,0 +1,76 @@
+#pragma once
+
+#include "hoCuNDArray.h"
+#include "cuNDArray.h"
+#include "vector_td.h"
+#include "gpuxray_export.h"
+
+namespace Gadgetron {
+
+
+
+/**
+ * Applies the square root of the FBP offset correction. Should be used on projection data acquired with an offset detector for iterative reconstruction.
+ * @param projections
+ * @param offsets
+ * @param ps_dims_in_mm
+ * @param SDD
+ * @param SAD
+ */
+void apply_offset_correct(hoCuNDArray<float>* projections,std::vector<floatd2>& offsets,		floatd2 ps_dims_in_mm, float SDD,	float SAD);
+/**
+ *
+ * @param projections
+ * @param image
+ * @param angles
+ * @param offsets
+ * @param indices
+ * @param projections_per_batch
+ * @param samples_per_pixel
+ * @param is_dims_in_mm
+ * @param ps_dims_in_mm
+ * @param SDD
+ * @param SAD
+ */
+  // Forwards projection of a 3D volume onto a set of projections.
+  // - depending on the provided binning indices, just a subset of the projections can be targeted.
+  //
+  
+  EXPORTGPUXRAY void conebeam_forwards_projection
+    ( hoCuNDArray<float> *projections,
+				hoCuNDArray<float> *image,
+				std::vector<float> angles, 
+				std::vector<floatd2> offsets, 
+				std::vector<unsigned int> indices,
+				int projections_per_batch, 
+				float samples_per_pixel,
+				floatd3 is_dims_in_mm, 
+				floatd2 ps_dims_in_mm,
+				float SDD, 
+				float SAD
+  );
+  
+  // Backprojection of a set of projections onto a 3D volume.
+  // - depending on the provided binning indices, just a subset of the projections can be included
+  //
+
+  template <bool FBP> EXPORTGPUXRAY void conebeam_backwards_projection( 
+        hoCuNDArray<float> *projections,
+        hoCuNDArray<float> *image,
+        std::vector<float> angles, 
+        std::vector<floatd2> offsets, 
+        std::vector<unsigned int> indices,
+        int projections_per_batch,
+        intd3 is_dims_in_pixels, 
+        floatd3 is_dims_in_mm, 
+        floatd2 ps_dims_in_mm,
+        float SDD, 
+        float SAD,
+        bool short_scan,
+        bool use_offset_correction,
+        bool accumulate, 
+        cuNDArray<float> *cosine_weights = 0x0,
+        cuNDArray<float> *frequency_filter = 0x0
+  );
+}
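+// Usage sketch (illustrative only; the array setup, geometry values and batch settings below are
+// assumptions, not part of this header):
+//
+//   hoCuNDArray<float> projections(&ps_dims), image(&is_dims);
+//   std::vector<float> angles;           // one projection angle in degrees per projection
+//   std::vector<floatd2> offsets;        // detector offset in mm per projection
+//   std::vector<unsigned int> indices;   // projections belonging to the current bin
+//   (fill angles/offsets/indices from the acquisition geometry and binning)
+//
+//   conebeam_forwards_projection( &projections, &image, angles, offsets, indices,
+//                                 50, 1.5f, is_dims_in_mm, ps_dims_in_mm, SDD, SAD );
+//   conebeam_backwards_projection<false>( &projections, &image, angles, offsets, indices,
+//                                 50, is_dims_in_pixels, is_dims_in_mm, ps_dims_in_mm,
+//                                 SDD, SAD, false, false, false );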
diff --git a/toolboxes/ct/xray/gpu/float3x3.h b/toolboxes/ct/xray/gpu/float3x3.h
new file mode 100644
index 0000000..db06acc
--- /dev/null
+++ b/toolboxes/ct/xray/gpu/float3x3.h
@@ -0,0 +1,66 @@
+#pragma once
+
+//#define SINCOSF __sincosf // fast math
+#define SINCOSF sincosf
+#include "vector_td_utilities.h"
+
+namespace Gadgetron{
+
+struct float3x3 {
+    floatd3 row0;
+    floatd3 row1;
+    floatd3 row2;
+};
+
+__inline__ __host__ __device__ 
+float3x3 make_float3x3(float v0, float v1, float v2,
+                       float v3, float v4, float v5,
+                       float v6, float v7, float v8) {
+    float3x3 m;
+    m.row0 = floatd3(v0, v1, v2);
+    m.row1 = floatd3(v3, v4, v5);
+    m.row2 = floatd3(v6, v7, v8);
+    return m;
+}
+
+__inline__ __device__ 
+floatd3 mul(float3x3 m, floatd3 v) {
+    return floatd3( dot(m.row0,v), dot(m.row1,v), dot(m.row2,v) );
+}
+
+
+__inline__ __device__ float3x3 calcRotationMatrixAroundX(float angle) {
+    float cos_angle, sin_angle;
+    SINCOSF(angle, &sin_angle, &cos_angle);
+  
+    // Build projection rotation matrix
+    float3x3 rotation = make_float3x3(1,         0,          0,
+                                      0, cos_angle, -sin_angle,
+                                      0, sin_angle,  cos_angle);
+    return rotation;
+}
+
+__inline__ __device__ float3x3 calcRotationMatrixAroundY(float angle) {
+    float cos_angle, sin_angle;
+    SINCOSF(angle, &sin_angle, &cos_angle);
+  
+    // Build projection rotation matrix
+    float3x3 rotation = make_float3x3( cos_angle, 0, sin_angle,
+                                               0, 1,         0,
+                                      -sin_angle, 0, cos_angle);
+    return rotation;
+}
+
+__inline__ __host__  __device__ float3x3 calcRotationMatrixAroundZ(float angle) {
+    float cos_angle, sin_angle;
+    sincosf(angle, &sin_angle, &cos_angle);
+  
+    // Build projection rotation matrix
+    float3x3 rotation = make_float3x3(cos_angle, -sin_angle, 0,
+                                      sin_angle,  cos_angle, 0,
+                                              0,          0, 1);
+    return rotation;
+}
+
+
+}
diff --git a/toolboxes/ct/xray/gpu/gpuxray_export.h b/toolboxes/ct/xray/gpu/gpuxray_export.h
new file mode 100644
index 0000000..3e420fd
--- /dev/null
+++ b/toolboxes/ct/xray/gpu/gpuxray_export.h
@@ -0,0 +1,19 @@
+/** \file gpuxray_export.h
+    \brief Required definitions for Windows, importing/exporting dll symbols 
+*/
+
+#ifndef GPUXRAY_EXPORT_H_
+#define GPUXRAY_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_GPUXRAY__) || defined (gpuxray_EXPORTS)
+#define EXPORTGPUXRAY __declspec(dllexport)
+#else
+#define EXPORTGPUXRAY __declspec(dllimport)
+#endif
+#else
+#define EXPORTGPUXRAY
+#endif
+
+
+#endif /* GPUXRAY_EXPORT_H_ */
diff --git a/toolboxes/ct/xray/gpu/hoCuConebeamProjectionOperator.cpp b/toolboxes/ct/xray/gpu/hoCuConebeamProjectionOperator.cpp
new file mode 100644
index 0000000..7b065c7
--- /dev/null
+++ b/toolboxes/ct/xray/gpu/hoCuConebeamProjectionOperator.cpp
@@ -0,0 +1,261 @@
+#include "hoCuConebeamProjectionOperator.h"
+#include "conebeam_projection.h"
+#include "vector_td_operators.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_blas.h"
+#include "GPUTimer.h"
+
+#include <vector>
+#include <stdio.h>
+
+namespace Gadgetron
+{
+
+void hoCuConebeamProjectionOperator
+::offset_correct(hoCuNDArray<float>* projections){
+
+	if( !preprocessed_ ){
+		throw std::runtime_error( "Error: hoCuConebeamProjectionOperator::offset_correct: setup not performed");
+	}
+	float SDD = acquisition_->get_geometry()->get_SDD();
+	float SAD = acquisition_->get_geometry()->get_SAD();
+	floatd2 ps_dims_in_mm = acquisition_->get_geometry()->get_FOV();
+	apply_offset_correct( projections,acquisition_->get_geometry()->get_offsets(),ps_dims_in_mm, SDD, SAD);
+}
+
+void hoCuConebeamProjectionOperator
+::compute_default_frequency_filter()
+{
+	// This code computes the default frequency filter used in filtered backprojection
+	// _Important_ aspects:
+	// - the filter is defined as single precision weights (non-complex)
+	// - the filter defines the scalar weights for the positive frequencies only (i.e. "one side")
+	//   - however, the size of the filter still equals the full size of the 1D dimension to filter +1 ...
+	//   - ... due to zero padding and cufft expecting an additional element.
+	//
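+	// A sketch of the weights computed in the loop below (see the citation on the assignment line):
+	// with A = number of filter taps, the weight for frequency index k is
+	//   H(k) = k * A^2/(A^2 - k^2) * exp( -A^2/(A^2 - k^2) ),
+	// after which the filter is scaled so that its mean weight equals one.
+	//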
+
+	if( !preprocessed_ )
+		throw std::runtime_error("Error: hoCuConebeamProjectionOperator::compute_default_frequency_filter() : setup not performed");
+
+	std::vector<size_t> dims;
+	dims.push_back(acquisition_->get_projections()->get_size(0)+1);
+
+	hoCuNDArray<float> host_weights(&dims);
+	float* data = host_weights.get_data_ptr();
+
+	const float A2 = dims[0]*dims[0];
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif    
+	for( int i=0; i<dims[0]; i++ ) {
+		float k = float(i);
+		data[i] = k*A2/(A2-k*k)*std::exp(-A2/(A2-k*k)); // From Guo et al, Journal of X-Ray Science and Technology 2011, doi: 10.3233/XST-2011-0294
+	}
+
+	frequency_filter_ = boost::shared_ptr< cuNDArray<float> >(new cuNDArray<float>(&host_weights));
+	float sum = asum(frequency_filter_.get());
+	*frequency_filter_ *= (dims[0]/sum);
+}
+
+void hoCuConebeamProjectionOperator
+::compute_cosine_weights()
+{
+	if( !preprocessed_ )
+		throw std::runtime_error("Error: hoCuConebeamProjectionOperator::compute_cosine_weights() : setup not performed");
+
+	uintd2 ps_dims_in_pixels( acquisition_->get_projections()->get_size(0), acquisition_->get_projections()->get_size(1) );
+	floatd2 ps_dims_in_mm = acquisition_->get_geometry()->get_FOV();
+
+	double SAD = double(acquisition_->get_geometry()->get_SAD());
+	double SDD = double(acquisition_->get_geometry()->get_SDD());
+
+	std::vector<size_t> dims;
+	dims.push_back(ps_dims_in_pixels[0]);
+	dims.push_back(ps_dims_in_pixels[1]);
+
+	hoCuNDArray<float> weights(&dims);
+	float* data = weights.get_data_ptr();
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+	for(  int y=0; y<ps_dims_in_pixels[1]; y++ ) {
+		for( int x=0; x<ps_dims_in_pixels[0]; x++ ) {
+
+			double xx = (( double(x) / double(ps_dims_in_pixels[0])) - 0.5) * ps_dims_in_mm[0];
+			double yy = (( double(y) / double(ps_dims_in_pixels[1])) - 0.5) * ps_dims_in_mm[1];
+			double s = SAD * xx/SDD;
+			double v = SAD * yy/SDD;
+
+			// Equation 10.1, page 386 in Computed Tomography 2nd edition, Jiang Hsieh
+			//
+
+			double value = SAD / std::sqrt( SAD*SAD + s*s + v*v );
+			data[x+y*ps_dims_in_pixels[0]] = float(value);
+		}
+	}
+	cosine_weights_ = boost::shared_ptr< cuNDArray<float> >(new cuNDArray<float>(&weights));
+}
+
+void hoCuConebeamProjectionOperator
+::mult_M( hoCuNDArray<float> *image, hoCuNDArray<float> *projections, bool accumulate )
+{
+
+	// Validate the input
+	//
+
+	if( image == 0x0 || projections == 0x0 ){
+		throw std::runtime_error("Error: hoCuConebeamProjectionOperator::mult_M: illegal array pointer provided");
+	}
+
+	if( (image->get_number_of_dimensions() != 4) &&  (image->get_number_of_dimensions() != 3) ){
+		throw std::runtime_error("Error: hoCuConebeamProjectionOperator::mult_M: image array must be four or three -dimensional");
+	}
+
+	if( projections->get_number_of_dimensions() != 3 ){
+		throw std::runtime_error("Error: hoCuConebeamProjectionOperator::mult_M: projections array must be three-dimensional");
+	}
+
+	if( !preprocessed_ ){
+		throw std::runtime_error( "Error: hoCuConebeamProjectionOperator::mult_M: setup not performed");
+	}
+
+	if( !binning_.get() ){
+		throw std::runtime_error( "Error: hoCuConebeamProjectionOperator::mult_M: binning not provided");
+	}
+
+	if( projections->get_size(2) != acquisition_->get_geometry()->get_angles().size() ||
+			projections->get_size(2) != acquisition_->get_geometry()->get_offsets().size() ){
+		throw std::runtime_error("Error: hoCuConebeamProjectionOperator::mult_M: inconsistent sizes of input arrays/vectors");
+	}
+
+	hoCuNDArray<float> *projections2 = projections;
+	if (accumulate)
+	  projections2 = new hoCuNDArray<float>(projections->get_dimensions());
+	// Iterate over the temporal dimension.
+	// I.e. reconstruct one 3D volume at a time.
+	//
+
+	for( int b=0; b<binning_->get_number_of_bins(); b++ ) {
+
+		floatd2 ps_dims_in_pixels_float(projections->get_size(0), projections->get_size(1));
+		floatd2 ps_dims_in_mm = acquisition_->get_geometry()->get_FOV();
+		floatd2 ps_spacing_in_mm = ps_dims_in_mm / ps_dims_in_pixels_float;
+
+		float SDD = acquisition_->get_geometry()->get_SDD();
+		float SAD = acquisition_->get_geometry()->get_SAD();
+
+		std::vector<size_t> dims_3d = *image->get_dimensions();
+		if (dims_3d.size()==4)
+			dims_3d.pop_back();
+
+		int num_3d_elements = dims_3d[0]*dims_3d[1]*dims_3d[2];
+
+		//Make a 3d view into the 4d image
+		hoCuNDArray<float> image_3d(&dims_3d, image->get_data_ptr()+b*num_3d_elements);
+
+		conebeam_forwards_projection( projections2, &image_3d,
+				acquisition_->get_geometry()->get_angles(),
+				acquisition_->get_geometry()->get_offsets(),
+				binning_->get_bin(b),
+				projections_per_batch_, samples_per_pixel_,
+				is_dims_in_mm_, ps_dims_in_mm,
+					      SDD, SAD);
+	}
+
+	if (use_offset_correction_ && !use_fbp_)
+	  this->offset_correct(projections2);
+	if (accumulate){
+	  *projections += *projections2;
+	  delete projections2;
+	}
+
+
+}
+
+void hoCuConebeamProjectionOperator
+::mult_MH( hoCuNDArray<float> *projections, hoCuNDArray<float> *image, bool accumulate )
+{
+
+	// Validate the input
+	//
+
+	if( image == 0x0 || projections == 0x0 ){
+		throw std::runtime_error("Error: hoCuConebeamProjectionOperator::mult_MH:: illegal array pointer provided");
+	}
+
+	if( (image->get_number_of_dimensions() != 4) &&  (image->get_number_of_dimensions() != 3) ){
+		throw std::runtime_error("Error: hoCuConebeamProjectionOperator::mult_MH: image array must be four or three -dimensional");
+	}
+
+	if( projections->get_number_of_dimensions() != 3 ){
+		throw std::runtime_error("Error: hoCuConebeamProjectionOperator::mult_MH: projections array must be three-dimensional");
+	}
+
+	if( !preprocessed_ ){
+		throw std::runtime_error( "Error: hoCuConebeamProjectionOperator::mult_MH: setup not performed");
+	}
+
+	if( !binning_.get() ){
+		throw std::runtime_error( "Error: hoCuConebeamProjectionOperator::mult_MH: binning not provided");
+	}
+
+	if( projections->get_size(2) != acquisition_->get_geometry()->get_angles().size() ||
+			projections->get_size(2) != acquisition_->get_geometry()->get_offsets().size() ){
+		throw std::runtime_error("Error: hoCuConebeamProjectionOperator::mult_MH: inconsistent sizes of input arrays/vectors");
+	}
+
+	// Iterate over the temporal dimension.
+	// I.e. reconstruct one 3D volume at a time.
+	//
+
+	for( int b=0; b<binning_->get_number_of_bins(); b++ ) {
+
+		floatd2 ps_dims_in_pixels_float(projections->get_size(0), projections->get_size(1));
+		floatd2 ps_dims_in_mm = acquisition_->get_geometry()->get_FOV();
+		floatd2 ps_spacing_in_mm = ps_dims_in_mm / ps_dims_in_pixels_float;
+
+		intd3 is_dims_in_pixels( image->get_size(0), image->get_size(1), image->get_size(2) );
+
+		float SDD = acquisition_->get_geometry()->get_SDD();
+		float SAD = acquisition_->get_geometry()->get_SAD();
+
+		std::vector<size_t> dims_3d = *image->get_dimensions();
+		if (dims_3d.size() ==4)
+			dims_3d.pop_back();
+
+		int num_3d_elements = dims_3d[0]*dims_3d[1]*dims_3d[2];
+
+		hoCuNDArray<float> image_3d(&dims_3d, image->get_data_ptr()+b*num_3d_elements);
+
+		if( use_fbp_ ){
+
+			if( !cosine_weights_.get() )
+				compute_cosine_weights();
+
+			if( !frequency_filter_.get() )
+				compute_default_frequency_filter();
+
+			conebeam_backwards_projection<true>
+			( projections, &image_3d,
+					acquisition_->get_geometry()->get_angles(),
+					acquisition_->get_geometry()->get_offsets(),
+					binning_->get_bin(b),
+					projections_per_batch_,
+					is_dims_in_pixels, is_dims_in_mm_, ps_dims_in_mm,
+					SDD, SAD, short_scan_, use_offset_correction_, accumulate,
+					cosine_weights_.get(), frequency_filter_.get() );
+		}
+		else {
+			conebeam_backwards_projection<false>
+			( projections, &image_3d,
+					acquisition_->get_geometry()->get_angles(),
+					acquisition_->get_geometry()->get_offsets(),
+					binning_->get_bin(b),
+					projections_per_batch_,
+					is_dims_in_pixels, is_dims_in_mm_, ps_dims_in_mm,
+					SDD, SAD, short_scan_, use_offset_correction_, accumulate );
+		}
+	}
+}
+}
diff --git a/toolboxes/ct/xray/gpu/hoCuConebeamProjectionOperator.h b/toolboxes/ct/xray/gpu/hoCuConebeamProjectionOperator.h
new file mode 100644
index 0000000..d003bfa
--- /dev/null
+++ b/toolboxes/ct/xray/gpu/hoCuConebeamProjectionOperator.h
@@ -0,0 +1,147 @@
+#pragma once
+
+#include "cuNDArray.h"
+#include "linearOperator.h"
+#include "CBCT_acquisition.h"
+#include "CBCT_binning.h"
+#include "hoCuNDArray_math.h"
+#include "gpuxray_export.h"
+
+#include <numeric>
+#include <math_constants.h>
+#include <vector>
+
+namespace Gadgetron{
+  
+  class EXPORTGPUXRAY hoCuConebeamProjectionOperator : public linearOperator< hoCuNDArray<float> >
+  {
+  public:
+    hoCuConebeamProjectionOperator() : linearOperator< hoCuNDArray<float> >()
+    {
+      samples_per_pixel_ = 1.5f;
+      projections_per_batch_ = 20;
+      use_fbp_ = false;
+      short_scan_ = false;
+      preprocessed_ = false;
+      use_offset_correction_ = false;
+      allow_offset_correction_override_ = true;
+    }
+
+    virtual ~hoCuConebeamProjectionOperator() {}
+
+    virtual void mult_M( hoCuNDArray<float> *in, hoCuNDArray<float> *out, bool accumulate = false );
+    virtual void mult_MH( hoCuNDArray<float> *in, hoCuNDArray<float> *out, bool accumulate = false );
+
+    virtual void offset_correct(hoCuNDArray<float>* proj);
+
+    virtual void setup( boost::shared_ptr<CBCT_acquisition> acquisition,
+                        floatd3 is_dims_in_mm )
+    {      
+      acquisition_ = acquisition;
+      is_dims_in_mm_ = is_dims_in_mm;
+      
+      // Determine the minimum angle scanned and shift all angles so that they lie in the range [0; max_angle].
+      //
+      
+      std::vector<float> &angles = acquisition->get_geometry()->get_angles();      
+      float min_value = *std::min_element(angles.begin(), angles.end() );
+      transform(angles.begin(), angles.end(), angles.begin(), bind2nd(std::minus<float>(), min_value));
+ 
+      // Are we in a short scan setup?
+      // - we say yes if we have covered less than PI+3*delta radians
+      //
+
+      float angle_span = *std::max_element(angles.begin(), angles.end() );
+      floatd2 ps_dims_in_mm = acquisition_->get_geometry()->get_FOV();
+      float SDD = acquisition_->get_geometry()->get_SDD();
+      float delta = std::atan(ps_dims_in_mm[0]/(2.0f*SDD)); // Fan angle
+      
+      if( angle_span*CUDART_PI_F/180.0f > CUDART_PI_F+3.0f*delta )
+        short_scan_ = false;
+      else
+        short_scan_ = true;
+      
+      /*
+      std::cout << std::endl <<  *std::min_element(angles.begin(), angles.end() ) << " " 
+      << *std::max_element(angles.begin(), angles.end() ) << std::endl;
+      */
+
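+      // If offset correction has not been set explicitly, enable it automatically when the
+      // mean detector offset exceeds 10% of the detector panel width (typical of a
+      // displaced-detector acquisition).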
+      std::vector<floatd2> offsets = acquisition_->get_geometry()->get_offsets();
+      floatd2 mean_offset = std::accumulate(offsets.begin(),offsets.end(),floatd2(0,0))/float(offsets.size());
+
+      if( allow_offset_correction_override_ && mean_offset[0] > ps_dims_in_mm[0]*0.1f )
+      	use_offset_correction_ = true;
+      
+      preprocessed_ = true;
+    }
+
+    virtual void setup( boost::shared_ptr<CBCT_acquisition> acquisition,
+                        boost::shared_ptr<CBCT_binning> binning,
+                        floatd3 is_dims_in_mm )
+    {
+      binning_ = binning;
+      setup( acquisition, is_dims_in_mm );
+    }
+
+
+    inline void set_use_filtered_backprojection( bool use_fbp ){
+      use_fbp_ = use_fbp;      
+    }
+
+    inline void set_use_offset_correction( bool use_correction ){
+      use_offset_correction_ = use_correction;
+      allow_offset_correction_override_ = false;
+    }
+
+    inline bool get_use_offset_correction(){
+      return use_offset_correction_;
+    }
+
+    inline void set_num_projections_per_batch( unsigned int projections_per_batch ){
+      projections_per_batch_ = projections_per_batch;
+    }
+
+    inline void set_num_samples_per_pixel( float samples_per_pixel ){
+      samples_per_pixel_ = samples_per_pixel;
+    }
+
+    inline void set_frequency_filter( boost::shared_ptr< cuNDArray<float> > weights ){
+      frequency_filter_ = weights;
+    }
+
+    void set_acquisition( boost::shared_ptr<CBCT_acquisition> acquisition ){
+      acquisition_ = acquisition;
+    }
+
+    boost::shared_ptr<CBCT_acquisition> get_acquisition(){
+      return acquisition_;
+    }
+
+    void set_binning( boost::shared_ptr<CBCT_binning> binning ){
+      binning_ = binning;
+    }
+
+    boost::shared_ptr<CBCT_binning> get_binning(){
+      return binning_;
+    }
+    
+
+  protected:
+    virtual void compute_default_frequency_filter();
+    virtual void compute_cosine_weights();
+
+  protected:
+    boost::shared_ptr<CBCT_acquisition> acquisition_;
+    boost::shared_ptr<CBCT_binning> binning_;
+    floatd3 is_dims_in_mm_;
+    float samples_per_pixel_;
+    bool use_fbp_;
+    unsigned int projections_per_batch_;
+    bool preprocessed_;
+    bool short_scan_;
+    bool use_offset_correction_;
+    bool allow_offset_correction_override_;
+    boost::shared_ptr< cuNDArray<float> > cosine_weights_;
+    boost::shared_ptr< cuNDArray<float> > frequency_filter_;
+  };
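+
+  // Typical usage sketch (hypothetical names: 'acq', 'bins', 'projections' and 'image' are
+  // assumed to be set up elsewhere; the image dimensions in mm are illustrative):
+  //   boost::shared_ptr<hoCuConebeamProjectionOperator> E( new hoCuConebeamProjectionOperator );
+  //   E->setup( acq, bins, floatd3(256.0f, 256.0f, 192.0f) );
+  //   E->set_use_filtered_backprojection( true );
+  //   E->mult_MH( &projections, &image );  // filtered backprojection into 'image'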
+}
diff --git a/toolboxes/dwt/CMakeLists.txt b/toolboxes/dwt/CMakeLists.txt
new file mode 100644
index 0000000..c46aebc
--- /dev/null
+++ b/toolboxes/dwt/CMakeLists.txt
@@ -0,0 +1,3 @@
+if (CUDA_FOUND)
+  add_subdirectory(gpu)
+endif(CUDA_FOUND)
diff --git a/toolboxes/dwt/gpu/CMakeLists.txt b/toolboxes/dwt/gpu/CMakeLists.txt
new file mode 100644
index 0000000..a6cb3e6
--- /dev/null
+++ b/toolboxes/dwt/gpu/CMakeLists.txt
@@ -0,0 +1,37 @@
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_GPUDWT__)
+endif (WIN32)
+
+if(WIN32)
+  link_directories(${Boost_LIBRARY_DIRS})
+endif(WIN32)
+
+
+include_directories( 
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CUDA_INCLUDE_DIRS}
+  ${Boost_INCLUDE_DIR}
+)
+
+cuda_add_library(gadgetron_toolbox_gpudwt SHARED 
+    cuNDDWT.h
+    cuNDDWT.cu
+  )
+
+set_target_properties(gadgetron_toolbox_gpudwt PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(gadgetron_toolbox_gpudwt 
+  gadgetron_toolbox_cpucore
+  gadgetron_toolbox_gpucore 
+  ${Boost_LIBRARIES}
+  ${CUDA_LIBRARIES} 
+  ${CUDA_CUFFT_LIBRARIES} 
+  )
+
+install(TARGETS gadgetron_toolbox_gpudwt DESTINATION lib COMPONENT main)
+
+install(FILES
+  cuNDDWT.h
+  DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
diff --git a/toolboxes/dwt/gpu/cuDWTOperator.h b/toolboxes/dwt/gpu/cuDWTOperator.h
new file mode 100644
index 0000000..45d2b42
--- /dev/null
+++ b/toolboxes/dwt/gpu/cuDWTOperator.h
@@ -0,0 +1,135 @@
+#pragma once
+
+#include "linearOperator.h"
+#include "cuNDDWT.h"
+#include "vector_td.h"
+#include <random>
+#include <numeric>
+namespace Gadgetron {
+
+template<class T, unsigned int D> class cuDWTOperator : public linearOperator<cuNDArray<T> > {
+public:
+	cuDWTOperator() : linearOperator<cuNDArray<T> >::linearOperator() {
+		run_dimensions = std::vector<size_t>(D,0);
+		std::iota(run_dimensions.begin(),run_dimensions.end(),0);
+		daubechies4 = vector_td<typename realType<T>::Type, 4>(0.6830127f, 1.1830127f, 0.3169873f, -0.1830127f);
+	}
+
+	void set_levels(unsigned int levels_) { levels=levels_;}
+	void use_random(bool random){use_random_ = random; }
+
+	virtual ~cuDWTOperator(){};
+
+	virtual void mult_MH_M(cuNDArray<T> * in, cuNDArray<T> * out, bool accumulate = false){
+		if (accumulate) *out += *in;
+		else *out = *in;
+	}
+	virtual void mult_M(cuNDArray<T> * in, cuNDArray<T> * out, bool accumulate = false){
+		unsigned int loc_levels = levels;
+		auto img_dim = *in->get_dimensions();
+		if (levels == 0 ) loc_levels = calc_levels(img_dim);
+
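+		// When enabled, the wavelet shift is drawn from a random distribution ("cycle spinning"),
+		// so the dyadic sampling grid need not be aligned the same way on every application.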
+		if (use_random_){
+			std::default_random_engine generator;
+			std::uniform_int_distribution<int> dist(0,3);
+			shift_ = dist(generator);
+		}
+		cuNDArray<T> * tmp_in = in;
+		cuNDArray<T> * tmp_out = out;
+		if (accumulate ) tmp_out = new cuNDArray<T>(out->get_dimensions());
+		if (run_dimensions.size() > 1) tmp_in = new cuNDArray<T>(in);
+
+
+		for (auto i = 0; i < loc_levels; i++){
+			for (auto dim : run_dimensions){
+				cuNDArray<T> small_in(img_dim, tmp_in->get_data_ptr());
+				cuNDArray<T> small_out(img_dim, tmp_out->get_data_ptr());
+				//std::cout << "Dimension " << dim << std::endl;
+				DWT1<T,D,4>(&small_in,&small_out,daubechies4,dim,shift_);
+				std::swap(tmp_in,tmp_out);
+			}
+			//Resize for next level
+			for (int n = 0; n < D; n++)
+				img_dim[n] /= 2;
+		}
+
+		if (out != tmp_in && !accumulate)*out = *tmp_in;
+		if (accumulate)	*out += *tmp_in;
+		if (tmp_in != in && tmp_in != out) delete tmp_in;
+		if (tmp_out != in && tmp_out != out) delete tmp_out;
+
+
+
+	}
+
+	virtual void mult_MH(cuNDArray<T> * in, cuNDArray<T> * out, bool accumulate = false){
+		//int shift = 0;
+
+
+		cuNDArray<T> * tmp_in = in;
+		cuNDArray<T> * tmp_out = out;
+		if (accumulate ) tmp_out = new cuNDArray<T>(out->get_dimensions());
+		if (run_dimensions.size() > 1) tmp_in = new cuNDArray<T>(in);
+
+		auto img_dim = *in->get_dimensions();
+		auto loc_levels = levels;
+		if (levels == 0 ) loc_levels = calc_levels(img_dim);
+		//Scale each dimension down to the coarsest decomposition level
+		for (auto n = 0; n < D; n++)
+			img_dim[n] /= std::pow(2,loc_levels-1);
+		for (auto i = loc_levels; i > 0; i--){
+			for (auto dim = run_dimensions.rbegin(); dim != run_dimensions.rend(); ++dim){
+				cuNDArray<T> small_in(img_dim, tmp_in->get_data_ptr());
+				cuNDArray<T> small_out(img_dim, tmp_out->get_data_ptr());
+				IDWT1<T,D,4>(&small_in,&small_out,daubechies4,*dim,shift_);
+				std::swap(tmp_in,tmp_out);
+			}
+			for (auto n = 0; n < D; n++)
+				img_dim[n] *= 2;
+		}
+		if (out != tmp_in && !accumulate)*out = *tmp_in;
+		if (accumulate)	*out += *tmp_in;
+		if (tmp_in != in && tmp_in != out) delete tmp_in;
+		if (tmp_out != in && tmp_out != out) delete tmp_out;
+
+	}
+
+	void set_shift(int shift){ shift_ = shift;}
+
+	vector_td<typename realType<T>::Type, 4> daubechies4;
+	/*
+	static auto daubechies4 = vector_td<typename realType<T>::Type ,4>{0.6830127f,1.1830127f,0.3169873f,-0.1830127f};
+	static auto haar = vector_td<typename realType<T>::Type,2>{1.0f,1.0f};
+	static auto daubechies6= vector_td<typename realType<T>::Type,6>{0.47046721f,1.14111692f,0.650365f,-0.19093442f, -0.12083221f,0.0498175f};
+	*/
+
+
+private:
+
+	unsigned int calc_levels(std::vector<size_t>& dims){
+		unsigned int min_dim = std::numeric_limits<unsigned int>::max();
+		for (auto dim : run_dimensions){
+			min_dim = std::min(min_dim,max_divisions(dims[dim]));
+		}
+		return min_dim;
+
+	}
+
+	static const unsigned int max_divisions(unsigned int num){
+		unsigned int count = 0;
+		while (num%2==0 && num > 4) {
+			count++;
+			num /= 2;
+		}
+		return count;
+
+	}
+
+	std::vector<size_t> run_dimensions;
+	unsigned int levels=0;
+	bool use_random_ = false;
+	int shift_=0;
+
+
+};
+}
diff --git a/toolboxes/dwt/gpu/cuHaarWaveletOperator.cu b/toolboxes/dwt/gpu/cuHaarWaveletOperator.cu
new file mode 100644
index 0000000..7f706de
--- /dev/null
+++ b/toolboxes/dwt/gpu/cuHaarWaveletOperator.cu
@@ -0,0 +1,365 @@
+#include "cuHaarWaveletOperator.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "cuNDArray_utils.h"
+#include "vector_td_utilities.h"
+#include "vector_td_io.h"
+#include "complext.h"
+#include <iostream>
+#include "check_CUDA.h"
+#include "cudaDeviceManager.h"
+#include <stdio.h>
+#include <iostream>
+using namespace Gadgetron;
+
+
+// Template Power function
+template<unsigned int i, unsigned int j>
+struct Pow
+{
+	enum { Value = i*Pow<i,j-1>::Value};
+};
+
+template <unsigned int i>
+struct Pow<i,1>
+{
+	enum { Value = i};
+};
+
+
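+// Haar lifting steps: lift() maps a pair (x1,x2) to its average (scaling coefficient) and
+// difference (detail coefficient); sink() is the corresponding inverse.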
+template<class T> struct Haar{
+	static inline __device__ void lift(T& x1,T& x2){
+		T s0 = x1;
+		T d0 = x2;
+		x1 = (s0+d0)/T(2); //s_l
+		x2 = s0-d0; //d_l
+	}
+
+	static inline __device__ void sink(T& x1,T& x2){
+		T s0 = x1;
+		T d0 = x2;
+		x1 = s0+d0/T(2);
+		x2 = s0-d0/T(2);
+	}
+};
+
+
+template<class T, unsigned int D,class wave, unsigned int N>  struct recWave{
+
+	static inline __device__ void loadData(T* elements, T* data, const vector_td<int,D>& dims){
+		int offset = 1;
+		for (int i = 0; i < N; i++)
+			offset *= dims[N-i-1];
+
+		recWave<T,D,wave,N-1>::loadData(elements,data,dims);
+		recWave<T,D,wave,N-1>::loadData(elements+Pow<2,N>::Value,data+offset,dims);
+	}
+
+	static inline __device__ void predict(T* elements){
+		recWave<T,D,wave,N-1>::predict(elements);
+		recWave<T,D,wave,N-1>::predict(elements+Pow<2,N>::Value);
+		for (int i = 0; i < Pow<2,N>::Value; i++){
+			wave::lift(elements[i],elements[Pow<2,N>::Value+i]);
+		}
+	}
+
+
+
+	static inline __device__ void ipredict(T* elements){
+		for (int i = 0; i < Pow<2,N>::Value; i++){
+			wave::sink(elements[i],elements[Pow<2,N>::Value+i]);
+		}
+		recWave<T,D,wave,N-1>::ipredict(elements);
+		recWave<T,D,wave,N-1>::ipredict(elements+Pow<2,N>::Value);
+	}
+
+	static inline __device__ void saveData(T* elements, T* data, const vector_td<int,D>& dims){
+		int offset = 1;
+		for (int i = 0; i < N; i++)
+			offset *= dims[N-i-1];
+
+		recWave<T,D,wave,N-1>::saveData(elements,data,dims);
+		recWave<T,D,wave,N-1>::saveData(elements+Pow<2,N>::Value,data+offset,dims);
+	}
+
+
+};
+
+template<class T, unsigned int D, class wave> struct recWave<T,D,wave,0>{
+
+	static inline __device__ void loadData(T* elements, T* data, const vector_td<int,D>& dims){
+		elements[0] = data[0];
+		elements[1] = data[1];
+	}
+
+	static inline __device__ void predict(T* elements){
+		wave::lift(elements[0],elements[1]);
+	}
+
+	static inline __device__ void ipredict(T* elements){
+		wave::sink(elements[0],elements[1]);
+	}
+	static inline __device__ void saveData(T* elements, T* data, const vector_td<int,D>& dims){
+		data[0] = elements[0];
+		data[1] = elements[1];
+	}
+};
+
+
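+// One forward Haar level: each thread loads a 2^D block of neighbouring voxels, applies the
+// recursive lifting steps and scatters the 2^D resulting subband coefficients into separate
+// contiguous regions of the output array.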
+template<class T, unsigned int D, class wave> __global__ void haarKernel(T* in, T* out, const vector_td<int,D> dims){
+
+	T elements[Pow<2,D>::Value];
+	int newsize = prod(dims)/Pow<2,D>::Value;
+	const vector_td<int,D> dims2 = dims/2;
+
+	const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+	if( idx < newsize ){
+		vector_td<int,D> co = idx_to_co<D>(idx,dims2);
+		co *= 2;
+		recWave<T,D,wave,D-1>::loadData(elements,in+co_to_idx<D>(co,dims),dims);
+		recWave<T,D,wave,D-1>::predict(elements);
+
+		for (int i = 0; i < Pow<2,D>::Value; i++){
+			out[i*newsize+idx] = elements[i];
+		}
+
+	}
+}
+
+
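+// One inverse Haar level: each thread gathers the 2^D subband coefficients belonging to a voxel
+// block, applies the inverse lifting steps and writes the reconstructed 2^D voxel block to the
+// output array.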
+template<class T, unsigned int D, class wave> __global__ void inv_haarKernel(T* in, T* out, const vector_td<int,D> dims){
+
+	T elements[Pow<2,D>::Value];
+
+	const vector_td<int,D> dims2 = dims/2;
+	int oldsize = prod(dims2);
+	const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+	if( idx < oldsize ){
+
+		for (int i = 0; i < Pow<2,D>::Value; i++){
+			elements[i] = in[i*oldsize+idx] ;
+		}
+		recWave<T,D,wave,D-1>::ipredict(elements);
+		vector_td<int,D> co = idx_to_co<D>(idx,dims2);
+		co *= 2;
+
+		recWave<T,D,wave,D-1>::saveData(elements,out+co_to_idx<D>(co,dims),dims);
+
+
+
+	}
+}
+
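+// x & (~x + 1) isolates the lowest set bit; it equals x exactly when x has a single bit set.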
+static inline bool isPowerOfTwo (unsigned int x)
+{
+	return ((x != 0) && ((x & (~x + 1)) == x));
+}
+
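+// Rounds value up to the next power of two by smearing the highest set bit of (value-1) into
+// all lower bit positions and then adding one.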
+template<typename T> inline T next_power2(T value)
+{
+	--value;
+	for(size_t i = 1; i < sizeof(T) * CHAR_BIT; i*=2)
+		value |= value >> i;
+	return value+1;
+}
+
+
+template<class T, unsigned int D> void cuHaarWaveletOperator<T,D>::set_domain_dimensions(std::vector<size_t>* dims){
+
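+	// The codomain is the domain with every dimension rounded up to the next power of two,
+	// since the Haar kernels require power-of-two array sizes; mult_M pads the input accordingly.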
+	linearOperator<cuNDArray<T> >::set_domain_dimensions(dims);
+	std::vector<size_t> newdims;
+	for (int i = 0; i < dims->size(); i++){
+		if (isPowerOfTwo(dims->at(i)))
+			newdims.push_back(dims->at(i));
+		else
+			newdims.push_back(next_power2(dims->at(i)));
+	}
+
+	//The commented-out block below was a dirty hack that padded every dimension up to the size
+	//of the largest (power-of-two) dimension; it is kept for reference only.
+	/*
+	unsigned int m=0;
+	for (int i =0; i < dims->size(); i++)
+		if (dims->at(i) > m) m = dims->at(i);
+
+	if (!isPowerOfTwo(m))
+		m = next_power2(m);
+
+	std::vector<unsigned int> newdims;
+
+	for (int i =0; i < dims->size(); i++)
+		newdims.push_back(m);
+*/
+	linearOperator<cuNDArray<T> >::set_codomain_dimensions(&newdims);
+
+}
+
+template<class T, unsigned int D> void cuHaarWaveletOperator<T,D>::mult_M(cuNDArray<T>* in, cuNDArray<T>* out, bool accumulate ){
+	if (! in->dimensions_equal(this->get_domain_dimensions().get()))
+		throw std::runtime_error("cuHaarWaveletOperator::mult_M: size of input array does not match operator domain size.");
+	if (! out->dimensions_equal(this->get_codomain_dimensions().get()))
+		throw std::runtime_error("cuHaarWaveletOperator::mult_M: size of output array does not match operator codomain size.");
+
+
+	cuNDArray<T>* tmp_in = new cuNDArray<T>(this->get_codomain_dimensions());
+
+	if (in->dimensions_equal(tmp_in))
+		*tmp_in = *in;
+	else
+		pad<T,D>(in,tmp_in);
+
+
+	cuNDArray<T>* tmp_out;
+	if (accumulate)
+		tmp_out =new cuNDArray<T>(tmp_in->get_dimensions());
+	else
+		tmp_out = out;
+
+	typename intd<D>::Type dims = vector_td<int,D>( from_std_vector<size_t,D>(*(tmp_in->get_dimensions())));
+
+	typename intd<D>::Type dims2;
+	for (int i = 0; i < D; i++) dims2[i] = 2;
+
+	while(dims >= dims2){
+		int elements=  prod(dims)/Pow<2,D>::Value;
+		int threadsPerBlock =std::min(elements,256);
+		dim3 dimBlock( threadsPerBlock);
+		int totalBlocksPerGrid = std::max(1,elements/threadsPerBlock);
+		dim3 dimGrid(totalBlocksPerGrid);
+		haarKernel<T,D,Haar<T>  ><<<dimGrid,dimBlock>>>(tmp_in->get_data_ptr(),tmp_out->get_data_ptr(),dims);
+		CHECK_FOR_CUDA_ERROR();
+		dims /= 2;
+		*tmp_in = *tmp_out;
+	}
+	delete tmp_in;
+
+	dims2 /= 2;
+	if (dims != dims2){
+		std::vector<size_t> sdim = to_std_vector(vector_td<size_t,D>(dims));
+		cuNDArray<T> smallArray(&sdim,tmp_out->get_data_ptr());
+		smallArray.squeeze();
+		cuNDArray<T> smallTmp(smallArray);
+		linearOperator<cuNDArray<T> >* smallWave;
+		switch(smallArray.get_number_of_dimensions()){
+		case 1:
+			smallWave = new cuHaarWaveletOperator<T,1>;
+			break;
+		case 2:
+			smallWave = new cuHaarWaveletOperator<T,2>;
+			break;
+		case 3:
+			smallWave = new cuHaarWaveletOperator<T,3>;
+			break;
+		default:
+			throw std::logic_error("cuHaarWaveletOperator::mult_M: Illegal number of input dimensions given");
+		}
+		smallWave->set_domain_dimensions(smallArray.get_dimensions().get());
+		smallWave->mult_M(&smallTmp,&smallArray,false);
+		delete smallWave;
+	}
+
+	if (accumulate){
+		*out += *tmp_out;
+		delete tmp_out;
+	}
+
+
+}
+
+
+template<class T, unsigned int D> void cuHaarWaveletOperator<T,D>::mult_MH(cuNDArray<T>* in, cuNDArray<T>* out, bool accumulate ){
+
+
+	if (! out->dimensions_equal(this->get_domain_dimensions().get()))
+		throw std::runtime_error("cuHaarWaveletOperator::mult_MH: size of output array does not match operator domain size.");
+	if (! in->dimensions_equal(this->get_codomain_dimensions().get()))
+		throw std::runtime_error("cuHaarWaveletOperator::mult_MH: size of input array does not match operator codomain size.");
+	cuNDArray<T>* tmp_out = new cuNDArray<T>(*in);
+
+	cuNDArray<T>* tmp_in = new cuNDArray<T>(*in);
+
+
+	const typename intd<D>::Type dims = vector_td<int,D>( from_std_vector<size_t,D>(*(in->get_dimensions())));
+
+	typename intd<D>::Type cur_dims = dims/min(dims);
+
+
+	if (prod(cur_dims) > 1){
+		std::vector<size_t> sdim = to_std_vector(vector_td<size_t,D>(cur_dims));
+		cuNDArray<T> smallIn(&sdim,tmp_in->get_data_ptr());
+		smallIn.squeeze();
+		cuNDArray<T> smallOut(&sdim,tmp_out->get_data_ptr());
+		smallOut.squeeze();
+		linearOperator<cuNDArray<T> >* smallWave;
+
+		switch(smallIn.get_number_of_dimensions()){
+		case 1:
+			smallWave = new cuHaarWaveletOperator<T,1>;
+			break;
+		case 2:
+			smallWave = new cuHaarWaveletOperator<T,2>;
+			break;
+		case 3:
+			smallWave = new cuHaarWaveletOperator<T,3>;
+			break;
+		default:
+			throw std::logic_error("cuHaarWaveletOperator::mult_M: 5D wavelets are currently considered overly ambitious.");
+		}
+		smallWave->set_domain_dimensions(smallIn.get_dimensions().get());
+		smallWave->mult_MH(&smallIn,&smallOut,false);
+		smallIn = smallOut;
+		delete smallWave;
+	}
+
+	while(cur_dims <= dims){
+		int elements = prod(cur_dims*2)/Pow<2,D>::Value;
+		int threadsPerBlock =std::min(elements,cudaDeviceManager::Instance()->max_blockdim());
+		dim3 dimBlock( threadsPerBlock);
+		int totalBlocksPerGrid = std::max(1,elements/cudaDeviceManager::Instance()->max_blockdim());
+		dim3 dimGrid(totalBlocksPerGrid);
+
+		inv_haarKernel<T,D,Haar<T> ><<<dimGrid,dimBlock>>>(tmp_in->get_data_ptr(),tmp_out->get_data_ptr(),cur_dims);
+		CHECK_FOR_CUDA_ERROR();
+		cur_dims *= 2;
+		*tmp_in = *tmp_out;
+	}
+
+	if (!in->dimensions_equal(&this->domain_dims_)){
+		delete tmp_in;
+		tmp_in = new cuNDArray<T>(&this->domain_dims_);
+		vector_td<size_t,D> offset;
+		for (int i = 0; i < D; i++ ) offset[i] = (this->codomain_dims_[i]-this->domain_dims_[i])/2;
+		crop<T,D>(offset,tmp_out,tmp_in);
+	}
+
+	if (accumulate){
+		*out += *tmp_in;
+	} else {
+		*out = *tmp_in;
+	}
+	delete tmp_in;
+	delete tmp_out;
+
+}
+
+template class  cuHaarWaveletOperator<float,1>;
+template class  cuHaarWaveletOperator<float,2>;
+template class  cuHaarWaveletOperator<float,3>;
+template class  cuHaarWaveletOperator<float,4>;
+
+template class  cuHaarWaveletOperator<double,1>;
+template class  cuHaarWaveletOperator<double,2>;
+template class  cuHaarWaveletOperator<double,3>;
+template class  cuHaarWaveletOperator<double,4>;
+
+template class  cuHaarWaveletOperator<float_complext,1>;
+template class  cuHaarWaveletOperator<float_complext,2>;
+template class  cuHaarWaveletOperator<float_complext,3>;
+template class  cuHaarWaveletOperator<float_complext,4>;
+
+template class  cuHaarWaveletOperator<double_complext,1>;
+template class  cuHaarWaveletOperator<double_complext,2>;
+template class  cuHaarWaveletOperator<double_complext,3>;
+template class  cuHaarWaveletOperator<double_complext,4>;
+
+
diff --git a/toolboxes/dwt/gpu/cuHaarWaveletOperator.h b/toolboxes/dwt/gpu/cuHaarWaveletOperator.h
new file mode 100644
index 0000000..79aee36
--- /dev/null
+++ b/toolboxes/dwt/gpu/cuHaarWaveletOperator.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include "linearOperator.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray.h"
+
+namespace Gadgetron {
+
+template<class T, unsigned int D> class cuHaarWaveletOperator : public linearOperator<cuNDArray<T> >{
+
+public:
+	cuHaarWaveletOperator() : linearOperator<cuNDArray<T> >(){};
+
+	virtual ~cuHaarWaveletOperator(){};
+	virtual void mult_M(cuNDArray<T>*,cuNDArray<T>*,bool );
+	virtual void mult_MH(cuNDArray<T>*,cuNDArray<T>*,bool );
+	virtual void mult_MH_M(cuNDArray<T>* in ,cuNDArray<T>* out,bool accumulate ){
+		if (accumulate){
+			*out += *in;
+		} else {
+			*out = *in;
+		}
+	}
+	virtual boost::shared_ptr< linearOperator< cuNDArray<T>  > >  clone(){
+				return linearOperator< cuNDArray<T> >::clone(this);
+			}
+	virtual void set_domain_dimensions(std::vector<size_t>* dims);
+
+};
+
+}
diff --git a/toolboxes/dwt/gpu/cuNDDWT.cu b/toolboxes/dwt/gpu/cuNDDWT.cu
new file mode 100644
index 0000000..563fde4
--- /dev/null
+++ b/toolboxes/dwt/gpu/cuNDDWT.cu
@@ -0,0 +1,205 @@
+#include "cuNDDWT.h"
+#include "vector_td.h"
+#include "vector_td_utilities.h"
+#include "complext.h"
+#include "cuNDArray_math.h"
+using namespace Gadgetron;
+
+
+template<class T, unsigned int D> struct Daubechies {
+
+};
+/*
+template <class T> struct Daubechies<T,4>{
+	static vector_td<T,4> coefficients = vector_td<T,4>({0.6830127,1.1830127,0.3169873,-0.1830127});
+	//vector_td<T,4> coefficients;
+};
+ */
+
+
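+// One level of a decimated 1D DWT along dimension 'dir': each thread computes one scaling
+// coefficient (dot product with the wavelet) and one detail coefficient (dot product with the
+// reversed, sign-alternated wavelet) and stores them in the first and second half of the output.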
+template<class T, unsigned int D, unsigned int WD> __global__ static void
+dwt_kernel( vector_td<int,D> dims,  const T  * __restrict__ in, T * __restrict__ out, int dir, vector_td<typename realType<T>::Type,WD> wavelet, int shift )
+{
+	const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+	if( idx < prod(dims)/2){
+		vector_td<int,D> dims2 = dims;
+		dims2[dir] /= 2;
+		vector_td<T,WD> data;
+		typename intd<D>::Type co = idx_to_co<D>(idx, dims2);
+		//co[dir] *= 2; //We're doing the decimated wavelet
+		co[dir] = (co[dir]+shift+dims[dir])%dims[dir]; //Wrap around
+		for (int i = 0; i < WD; i++){
+			data[i] = in[co_to_idx<D>(co, dims)];
+			co[dir] = (co[dir]+1+dims[dir])%dims[dir]; //Wrap around
+		}
+		T s = dot(data,wavelet); //Getting the scaling element is easy
+
+		T d = 0;
+		float sign = 1;
+
+		//Reverse wavelet and shift sign of every second element
+		for (int i = 0; i< WD; i++){
+			d+= wavelet[WD-i-1]*sign*data[i];
+			sign *= -1;
+		}
+
+		//co = idx_to_co<D>(idx,dims2);
+		//size_t out_index = co_to_idx<D>(co,dims2);
+		out[idx] = s;
+		out[idx+prod(dims)/2] =d;
+	}
+}
+
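+// One level of the inverse decimated 1D DWT along dimension 'dim': each thread combines the
+// relevant scaling and detail coefficients to reconstruct two neighbouring output samples.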
+template<class T, unsigned int D, unsigned int WD> __global__ static void
+idwt_kernel( vector_td<int,D> dims,  const T  * __restrict__ in, T * __restrict__ out, int dim, vector_td<typename realType<T>::Type,WD> wavelet, int shift )
+{
+	const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+	if( idx < prod(dims)/2){
+		vector_td<int,D> dims2 = dims;
+		dims2[dim] /= 2;
+		typename intd<D>::Type co = idx_to_co<D>(idx, dims2);
+
+
+		T res1 = 0;
+		T res2 = 0;
+		co[dim] = (co[dim]+dims2[dim]+WD-1)%dims2[dim];
+		for (int i = 0; i < WD/2; i++){
+			T s = in[co_to_idx<D>(co,dims2)];
+			res1 += wavelet[2*i]*s;
+			res2 += wavelet[2*i+1]*s;
+			co[dim] = (co[dim]-1+dims2[dim])%dims2[dim];
+		}
+
+		//Create the diff coefficients. Yes we could compute them on the fly.
+		vector_td<T,WD> diff;
+		{
+			float sign = 1;
+			for (int i = 0; i < WD; i++){
+				diff[i] = wavelet[WD-i-1]*sign;
+				sign *= -1;
+			}
+		}
+
+		co = idx_to_co<D>(idx, dims2);
+		//co[dim] += dims2[dim];
+		//co[dim] = (co[dim]+dims[dim]+WD-1)%dims[dim];
+
+		co[dim] = (co[dim]+dims2[dim]+WD-1)%dims2[dim];
+		//co[dim] += dims2[dim];
+		for (int i = 0; i < WD/2; i++){
+			T d = in[co_to_idx<D>(co,dims2)+prod(dims)/2];
+			res1 += diff[2*i]*d;
+			res2 += diff[2*i+1]*d;
+			co[dim] = (co[dim]-1+dims2[dim])%dims2[dim];
+		}
+
+		co = idx_to_co<D>(idx, dims2);
+		co[dim] *= 2;
+		co[dim] = (co[dim]+dims[dim]+shift+2*WD-2)%dims[dim];
+		out[co_to_idx<D>(co,dims)] = res1;
+		co[dim] = (co[dim]+dims[dim]+1)%dims[dim];
+		out[co_to_idx<D>(co,dims)] = res2;
+	}
+}
+static inline bool isPowerOfTwo (size_t x)
+{
+	return ((x != 0) && ((x & (~x + 1)) == x));
+}
+
+
+/*
+const
+
+template < struct Daubechies {
+	({0.6830127,1.1830127,0.3169873,-0.1830127});
+};
+/*
+template<class T, unsigned int D, class wave> __device__ static void lift(T& data[D]){
+
+}
+ */
+/**
+ *
+ * @param in Input array
+ * @param out Output array
+ * @param wavelet vector of the scaling function coefficients for the wavelet
+ */
+template<class T, unsigned int D, unsigned int WD> void Gadgetron::DWT1( cuNDArray<T>* in, cuNDArray<T>* out, vector_td<typename realType<T>::Type,WD> wavelet, int dim, int shift){
+
+	if (!(isPowerOfTwo(in->get_size(dim)) && in->get_size(dim) >= WD)){
+		std::cout << "Dimension is: " << in->get_size(dim) << std::endl;
+		throw std::runtime_error("DWT: Illegal input dimensions for DWT. Power of two reconstructions only");
+	}
+
+	size_t tot_threads = in->get_number_of_elements()/2; //1 thread per 2 elements
+	int threadsPerBlock =std::min(tot_threads,size_t(256));
+	dim3 dimBlock( threadsPerBlock);
+	int totalBlocksPerGrid = std::max(size_t(1),tot_threads/threadsPerBlock);
+	dim3 dimGrid(totalBlocksPerGrid);
+
+	const typename intd<D>::Type dims = vector_td<int,D>( from_std_vector<size_t,D>(*(in->get_dimensions())));
+	dwt_kernel<T,D,WD><<<dimGrid,dimBlock>>>(dims, in->get_data_ptr(),out->get_data_ptr(),dim,wavelet,shift);
+	cudaThreadSynchronize();
+	CHECK_FOR_CUDA_ERROR()
+	*out *= T(1.0/std::sqrt(sum(wavelet)));
+
+
+}
+
+/**
+ *
+ * @param in Input array
+ * @param out Output array
+ * @param wavelet vector of the scaling function coefficients for the wavelet
+ */
+template<class T, unsigned int D, unsigned int WD> void Gadgetron::IDWT1( cuNDArray<T>* in, cuNDArray<T>* out, vector_td<typename realType<T>::Type,WD> wavelet, int dim, int shift){
+
+	if (!(isPowerOfTwo(in->get_size(dim)) && in->get_size(dim) >= WD)){
+
+		std::cout << "Dimension " << dim <<" is: " << in->get_size(dim) << " " << in->get_number_of_dimensions() << std::endl;
+		throw std::runtime_error("IDWT: Illegal input dimensions for DWT. Power of two reconstructions only");
+	}
+
+	size_t tot_threads = in->get_number_of_elements()/2; //1 thread per 2 elements
+	int threadsPerBlock =std::min(tot_threads,size_t(256));
+	dim3 dimBlock( threadsPerBlock);
+	int totalBlocksPerGrid = std::max(size_t(1),tot_threads/threadsPerBlock);
+	dim3 dimGrid(totalBlocksPerGrid);
+
+	const typename intd<D>::Type dims = vector_td<int,D>( from_std_vector<size_t,D>(*(in->get_dimensions())));
+	idwt_kernel<T,D,WD><<<dimGrid,dimBlock>>>(dims, in->get_data_ptr(),out->get_data_ptr(),dim,wavelet,shift);
+
+	*out *= T(1.0/std::sqrt(sum(wavelet)));
+	CHECK_FOR_CUDA_ERROR();
+
+
+}
+
+template EXPORTGPUDWT void Gadgetron::DWT1<float,2,6>(cuNDArray<float>*, cuNDArray<float>*, vector_td<float,6> ,int,int);
+template EXPORTGPUDWT void Gadgetron::IDWT1<float,2,6>(cuNDArray<float>*, cuNDArray<float>*, vector_td<float,6> ,int,int);
+template EXPORTGPUDWT void Gadgetron::DWT1<float,2,4>(cuNDArray<float>*, cuNDArray<float>*, vector_td<float,4> ,int,int);
+template EXPORTGPUDWT void Gadgetron::IDWT1<float,2,4>(cuNDArray<float>*, cuNDArray<float>*, vector_td<float,4> ,int,int);
+template EXPORTGPUDWT void Gadgetron::DWT1<float,2,2>(cuNDArray<float>*, cuNDArray<float>*, vector_td<float,2> ,int,int);
+template EXPORTGPUDWT void Gadgetron::IDWT1<float,2,2>(cuNDArray<float>*, cuNDArray<float>*, vector_td<float,2> ,int,int);
+
+template EXPORTGPUDWT void Gadgetron::DWT1<float_complext,2,6>(cuNDArray<float_complext>*, cuNDArray<float_complext>*, vector_td<float,6> ,int,int);
+template EXPORTGPUDWT void Gadgetron::IDWT1<float_complext,2,6>(cuNDArray<float_complext>*, cuNDArray<float_complext>*, vector_td<float,6> ,int,int);
+template EXPORTGPUDWT void Gadgetron::DWT1<float_complext,2,4>(cuNDArray<float_complext>*, cuNDArray<float_complext>*, vector_td<float,4> ,int,int);
+template EXPORTGPUDWT void Gadgetron::IDWT1<float_complext,2,4>(cuNDArray<float_complext>*, cuNDArray<float_complext>*, vector_td<float,4> ,int,int);
+template EXPORTGPUDWT void Gadgetron::DWT1<float_complext,2,2>(cuNDArray<float_complext>*, cuNDArray<float_complext>*, vector_td<float,2> ,int,int);
+template EXPORTGPUDWT void Gadgetron::IDWT1<float_complext,2,2>(cuNDArray<float_complext>*, cuNDArray<float_complext>*, vector_td<float,2> ,int,int);
+
+
+template EXPORTGPUDWT void Gadgetron::DWT1<float,3,6>(cuNDArray<float>*, cuNDArray<float>*, vector_td<float,6> ,int,int);
+template EXPORTGPUDWT void Gadgetron::IDWT1<float,3,6>(cuNDArray<float>*, cuNDArray<float>*, vector_td<float,6> ,int,int);
+template EXPORTGPUDWT void Gadgetron::DWT1<float,3,4>(cuNDArray<float>*, cuNDArray<float>*, vector_td<float,4> ,int,int);
+template EXPORTGPUDWT void Gadgetron::IDWT1<float,3,4>(cuNDArray<float>*, cuNDArray<float>*, vector_td<float,4> ,int,int);
+template EXPORTGPUDWT void Gadgetron::DWT1<float,3,2>(cuNDArray<float>*, cuNDArray<float>*, vector_td<float,2> ,int,int);
+template EXPORTGPUDWT void Gadgetron::IDWT1<float,3,2>(cuNDArray<float>*, cuNDArray<float>*, vector_td<float,2> ,int,int);
+
+template EXPORTGPUDWT void Gadgetron::DWT1<float_complext,3,6>(cuNDArray<float_complext>*, cuNDArray<float_complext>*, vector_td<float,6> ,int,int);
+template EXPORTGPUDWT void Gadgetron::IDWT1<float_complext,3,6>(cuNDArray<float_complext>*, cuNDArray<float_complext>*, vector_td<float,6> ,int,int);
+template EXPORTGPUDWT void Gadgetron::DWT1<float_complext,3,4>(cuNDArray<float_complext>*, cuNDArray<float_complext>*, vector_td<float,4> ,int,int);
+template EXPORTGPUDWT void Gadgetron::IDWT1<float_complext,3,4>(cuNDArray<float_complext>*, cuNDArray<float_complext>*, vector_td<float,4> ,int,int);
+template EXPORTGPUDWT void Gadgetron::DWT1<float_complext,3,2>(cuNDArray<float_complext>*, cuNDArray<float_complext>*, vector_td<float,2> ,int,int);
+template EXPORTGPUDWT void Gadgetron::IDWT1<float_complext,3,2>(cuNDArray<float_complext>*, cuNDArray<float_complext>*, vector_td<float,2> ,int,int);
diff --git a/toolboxes/dwt/gpu/cuNDDWT.h b/toolboxes/dwt/gpu/cuNDDWT.h
new file mode 100644
index 0000000..31d6dfb
--- /dev/null
+++ b/toolboxes/dwt/gpu/cuNDDWT.h
@@ -0,0 +1,27 @@
+#pragma once
+#include "cuNDArray.h"
+#include "complext.h"
+#include "vector_td.h"
+
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_GPUDWT__)
+#define EXPORTGPUDWT __declspec(dllexport)
+#else
+#define EXPORTGPUDWT __declspec(dllimport)
+#endif
+#else
+#define EXPORTGPUDWT
+#endif
+
+namespace Gadgetron{
+
+//template<class T> void dwt(cuNDArray<T> * in_out);
+
+template<class T, unsigned int D, unsigned int WD> EXPORTGPUDWT void DWT1( cuNDArray<T>* in,
+		cuNDArray<T>* out, vector_td<typename realType<T>::Type,WD> wavelet, int dim, int shift = 0);
+
+template<class T, unsigned int D, unsigned int WD> EXPORTGPUDWT void IDWT1( cuNDArray<T>* in,
+		cuNDArray<T>* out, vector_td<typename realType<T>::Type,WD> wavelet, int dim, int shift = 0);
+
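+// Minimal usage sketch (the array dimensions and the Daubechies-4 coefficients below are
+// illustrative assumptions, not part of this header):
+//   std::vector<size_t> dims = {256, 256};
+//   cuNDArray<float> in(&dims), out(&dims);
+//   vector_td<float,4> db4(0.6830127f, 1.1830127f, 0.3169873f, -0.1830127f);
+//   DWT1<float,2,4>(&in, &out, db4, 0);   // forward transform along dimension 0
+//   IDWT1<float,2,4>(&out, &in, db4, 0);  // inverse transform recovers the input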
+}
diff --git a/toolboxes/fft/CMakeLists.txt b/toolboxes/fft/CMakeLists.txt
new file mode 100644
index 0000000..2ce2ea8
--- /dev/null
+++ b/toolboxes/fft/CMakeLists.txt
@@ -0,0 +1,11 @@
+if(FFTW3_FOUND)
+  add_subdirectory(cpu)
+else(FFTW3_FOUND)
+  message("FFTW not found, not compiling cpu fft toolbox")
+endif(FFTW3_FOUND)
+
+if(CUDA_FOUND)
+  add_subdirectory(gpu)
+else(CUDA_FOUND)
+  message("CUDA not found, not compiling gpu fft toolbox")
+endif(CUDA_FOUND)
\ No newline at end of file
diff --git a/toolboxes/fft/cpu/CMakeLists.txt b/toolboxes/fft/cpu/CMakeLists.txt
new file mode 100644
index 0000000..8188ad0
--- /dev/null
+++ b/toolboxes/fft/cpu/CMakeLists.txt
@@ -0,0 +1,47 @@
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_CPUFFT__)
+endif (WIN32)
+
+include_directories(
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+  ${FFTW3_INCLUDE_DIR}
+  ${Boost_INCLUDE_DIR}
+  ${ISMRMRD_INCLUDE_DIR}
+  ${ARMADILLO_INCLUDE_DIRS}
+  )
+
+add_library(gadgetron_toolbox_cpufft SHARED 
+  cpufft_export.h 
+  hoNDFFT.h
+  hoNDFFT.cpp
+  )
+
+set_target_properties(gadgetron_toolbox_cpufft PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+if (MKL_FOUND)
+    target_link_libraries(gadgetron_toolbox_cpufft
+      gadgetron_toolbox_cpucore
+      gadgetron_toolbox_log
+      gadgetron_toolbox_cpucore_math
+      ${MKL_LIBRARIES} 
+      ${Boost_LIBRARIES} 
+      )
+else (MKL_FOUND)
+target_link_libraries(gadgetron_toolbox_cpufft
+  gadgetron_toolbox_cpucore
+  gadgetron_toolbox_log
+  gadgetron_toolbox_cpucore_math
+  ${FFTW3_LIBRARIES} 
+  ${Boost_LIBRARIES} 
+  )
+endif (MKL_FOUND)
+
+install(TARGETS gadgetron_toolbox_cpufft DESTINATION lib COMPONENT main)
+
+install(FILES
+  cpufft_export.h 
+  hoNDFFT.h
+  DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
diff --git a/toolboxes/fft/cpu/cpufft_export.h b/toolboxes/fft/cpu/cpufft_export.h
new file mode 100644
index 0000000..d7ce52f
--- /dev/null
+++ b/toolboxes/fft/cpu/cpufft_export.h
@@ -0,0 +1,18 @@
+/** \file cpufft_export.h
+    \brief Required definitions for Windows, importing/exporting dll symbols 
+*/
+
+#ifndef CPUFFT_EXPORT_H_
+#define CPUFFT_EXPORT_H_
+
+#if defined (WIN32)
+    #if defined (__BUILD_GADGETRON_CPUFFT__) || defined (cpufft_EXPORTS)
+        #define EXPORTCPUFFT __declspec(dllexport)
+    #else
+        #define EXPORTCPUFFT __declspec(dllimport)
+    #endif
+#else
+#define EXPORTCPUFFT
+#endif
+
+#endif /* CPUCORE_EXPORT_H_ */
diff --git a/toolboxes/fft/cpu/hoNDFFT.cpp b/toolboxes/fft/cpu/hoNDFFT.cpp
new file mode 100644
index 0000000..859ab5f
--- /dev/null
+++ b/toolboxes/fft/cpu/hoNDFFT.cpp
@@ -0,0 +1,1447 @@
+
+//_USE_MATH_DEFINES is required on Visual Studio so that <cmath> defines M_PI and related constants.
+#define _USE_MATH_DEFINES
+#include <cmath>
+
+#include "hoNDFFT.h"
+#include "hoMatrix.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDArray_math.h"
+
+namespace Gadgetron{
+
+template<typename T> hoNDFFT<T>* hoNDFFT<T>::instance()
+{
+	if (!instance_) instance_ = new hoNDFFT<T>();
+	return instance_;
+}
+
+template<class T> hoNDFFT<T>* hoNDFFT<T>::instance_ = NULL;
+
+
+template<class T> void hoNDFFT<T>::fft_int_uneven(hoNDArray< ComplexType >* input, size_t dim_to_transform, int sign)
+   {
+       if (sign != -1 && sign != 1) return;
+       if (dim_to_transform >= input->get_number_of_dimensions()) return;
+
+       int stride     = 1;           //Distance between points in transform
+       int dist       = 1;           //Distance between vectors
+       int trafos     = 1;           //Transformations per chunk
+       int chunks     = 1;           //Number of chunks
+       int chunk_size = 1;           //Points per chunk
+       int length     = 1;           //Length of each transform
+       int total_dist = 1;
+
+
+
+       typename fftw_types<T>::plan * fft_plan        = 0;
+       ComplexType*    fft_storage     = 0;
+
+       ComplexType* fft_buffer = 0;
+       ComplexType* data_ptr = 0;
+
+       //Set sizes
+       length = (int)input->get_size(dim_to_transform);
+
+
+       T scale = std::sqrt((T)length);
+
+
+       if (dim_to_transform != 0)
+       {
+           for (size_t i = 0; i < dim_to_transform; i++)
+           {
+               chunk_size *= (int)input->get_size(i);
+           }
+           stride = chunk_size;
+           trafos = chunk_size;
+           chunk_size *= length;
+
+           for (size_t i = dim_to_transform+1; i < input->get_number_of_dimensions(); i++)
+           {
+               chunks *= (int)input->get_size(i);
+           }
+       }
+       else
+       {
+           for (size_t i = 1; i < input->get_number_of_dimensions(); i++)
+           {
+               trafos *= (int)input->get_size(i);
+           }
+           chunk_size = trafos*length;
+
+           dist = length;
+       }
+
+       total_dist = trafos*dist;
+
+
+       //Allocate storage and make plan
+       {
+           mutex_.lock();
+           fft_storage = (ComplexType*)fftw_malloc_(sizeof(T)*length*2);
+           if (fft_storage == 0)
+           {
+               GDEBUG_STREAM("Failed to allocate buffer for FFT" << std::endl);
+               return;
+           }
+           fft_buffer = fft_storage;
+
+           unsigned planner_flags = FFTW_MEASURE | FFTW_DESTROY_INPUT;
+
+           fft_plan = fftw_plan_dft_1d_(length, fft_storage, fft_storage, sign, planner_flags);
+
+           if (fft_plan == 0)
+           {
+               fftw_free_(fft_storage);
+               GDEBUG_STREAM("Failed to create plan for FFT" << std::endl);
+               return;
+           }
+           mutex_.unlock();
+       }
+
+       //Grab address of data
+       data_ptr = input->get_data_ptr();
+
+       register int idx1_max = chunks*chunk_size;
+       register int idx1, idx2;       //Index variables
+       register int idx2_limit;
+       register int middle_point = ((length+1)/2);
+
+       for (idx1 = 0; idx1 < idx1_max; idx1+=chunk_size) //Loop over all chunks
+       {
+           idx2_limit = idx1+total_dist;
+           for (idx2 = idx1; idx2 < idx2_limit; idx2+=dist) //Loop over all transformations
+           {
+               ///Copy data to buffer.
+               {
+                   register int j, idx3 = idx2;
+                   for (j = middle_point; j < length; idx3+=stride)
+                   {
+                       fft_buffer[j++] = data_ptr[idx3  ];
+                   }
+                   for (j = 0; j < middle_point; idx3+=stride)
+                   {
+                       fft_buffer[j++] = data_ptr[idx3  ];
+                   }
+               }
+
+               fftw_execute_(fft_plan);
+
+               {
+                   register int j, idx3 = idx2;
+
+                   for (j = middle_point; j < length; idx3+=stride)
+                   {
+                       data_ptr[idx3  ] = fft_buffer[j++]*scale;
+                   }
+                   for (j = 0; j < middle_point; idx3+=stride)
+                   {
+                       data_ptr[idx3  ] = fft_buffer[j++]*scale;
+                   }
+               }
+
+           } //Loop over transformations
+       } //Loop over chunks
+
+       //clean up
+       {
+           mutex_.lock();
+           if (fft_plan != 0)
+           {
+               fftw_destroy_plan_(fft_plan);
+           }
+
+           if (fft_storage != 0)
+           {
+               fftw_free_(fft_storage);
+           }
+           mutex_.unlock();
+       }
+   }
+
+template<class T> void hoNDFFT<T>::fft_int(hoNDArray< ComplexType >* input, size_t dim_to_transform, int sign)	{
+	if (sign != -1 && sign != 1) throw std::runtime_error("hoNDFFT::fft_int: illegal sign provided");
+	if (dim_to_transform >= input->get_number_of_dimensions()) throw std::runtime_error("hoNDFFT::fft_int: transform dimension exceeds the number of dimensions of the input array");
+
+	//Only works for even dimensions. Fall back to slow version
+	if (input->get_size(dim_to_transform)%2 == 1){
+		fft_int_uneven(input,dim_to_transform,sign);
+		return;
+	}
+	int stride     = 1;           //Distance between points in transform
+	int dist       = 1;           //Distance between vectors
+	int trafos     = 1;           //Transformations per chunk
+	int chunks     = 1;           //Number of chunks
+	int chunk_size = 1;           //Points per chunk
+	int length     = 1;           //Length of each transform
+	int total_dist = 1;
+
+
+	typename fftw_types<T>::plan * fft_plan        = 0;
+
+
+	//Set sizes
+	length = (int)input->get_size(dim_to_transform);
+
+	T scale = 1/std::sqrt((T)length);
+	if (dim_to_transform != 0)
+	{
+		for (size_t i = 0; i < dim_to_transform; i++)
+		{
+			chunk_size *= (int)input->get_size(i);
+		}
+		stride = chunk_size;
+		trafos = chunk_size;
+		chunk_size *= length;
+
+		for (size_t i = dim_to_transform+1; i < input->get_number_of_dimensions(); i++)
+		{
+			chunks *= (int)input->get_size(i);
+		}
+	}
+	else
+	{
+		for (size_t i = 1; i < input->get_number_of_dimensions(); i++)
+		{
+			trafos *= (int)input->get_size(i);
+		}
+		chunk_size = trafos*length;
+
+		dist = length;
+	}
+
+
+	total_dist = trafos*dist;
+
+	//Flip frequencies to center image
+	if (sign == FFTW_BACKWARD)
+		timeswitch(input,dim_to_transform);
+//Grab address of data
+	ComplexType* data_ptr = input->get_data_ptr();
+
+	//Allocate storage and make plan
+	{
+		mutex_.lock();
+
+		unsigned planner_flags = FFTW_ESTIMATE;
+		fft_plan = fftw_plan_many_dft_(1,&length,trafos,data_ptr,&length,stride,dist,data_ptr,&length,stride,dist,sign,planner_flags);
+		//fftw_print_plan_(fft_plan);
+		if (fft_plan == NULL)
+		{
+			throw std::runtime_error("hoNDFFT: failed to create fft plan");
+		}
+		mutex_.unlock();
+
+	}
+
+#pragma omp parallel for
+	for (int k = 0; k < chunks; k++)
+		fftw_execute_dft_(fft_plan,data_ptr+k*chunk_size,data_ptr+k*chunk_size);
+
+//Flip frequencies to center DC freq
+	if (sign == FFTW_FORWARD)
+		timeswitch(input,dim_to_transform);
+
+
+	//clean up
+	{
+		mutex_.lock();
+		if (fft_plan != 0)
+		{
+			fftw_destroy_plan_(fft_plan);
+		}
+
+		mutex_.unlock();
+	}
+
+
+	*input *= scale;
+}
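+
+// Pivot (split point) for the circular fftshift: fftshift uses ceil(x/2) and ifftshift floor(x/2),
+// so that applying one after the other is the identity even for odd-length dimensions.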
+template<typename T>
+inline size_t hoNDFFT<T>::fftshiftPivot(size_t x)
+{
+	return (size_t)(ceil(x*0.5));
+}
+
+template<typename T>
+inline size_t hoNDFFT<T>::ifftshiftPivot(size_t x)
+{
+	return (size_t)(floor(x*0.5));
+}
+
+template<typename T>
+inline void hoNDFFT<T>::fftshift1D(const ComplexType* a, ComplexType* r, size_t x, size_t pivot)
+{
+	memcpy(r, a+pivot, sizeof(ComplexType)*(x-pivot));
+	memcpy(r+x-pivot, a, sizeof(ComplexType)*pivot);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifftshift1D(const ComplexType* a, ComplexType* r, size_t x, size_t pivot)
+{
+	return fftshift1D(a, r, x, pivot);
+}
+
+template<typename T>
+void hoNDFFT<T>::fftshiftPivot1D(ComplexType* a, size_t x, size_t n, size_t pivot)
+{
+	long long counter;
+
+#pragma omp parallel private(counter) shared(n, x, pivot, a) if ( n > 256 )
+	{
+		hoNDArray< ComplexType > aTmp(x);
+
+#pragma omp for
+		for ( counter=0; counter<(long long)n; counter++ )
+		{
+			fftshift1D(a+counter*x, aTmp.begin(), x, pivot);
+			memcpy(a+counter*x, aTmp.begin(), sizeof(ComplexType)*x);
+		}
+	}
+}
+
+template<typename T>
+void hoNDFFT<T>::fftshiftPivot1D(const ComplexType* a, ComplexType* r, size_t x, size_t n, size_t pivot)
+{
+	long long counter;
+
+#pragma omp parallel for private(counter) shared(n, x, pivot, a, r) if ( n > 256 )
+	for ( counter=0; counter<(long long)n; counter++ )
+	{
+		fftshift1D(a+counter*x, r+counter*x, x, pivot);
+	}
+}
+
+
+template<typename T>
+void hoNDFFT<T>::fftshift1D(hoNDArray< ComplexType >& a)
+{
+	size_t x = a.get_size(0);
+	size_t pivot = fftshiftPivot(x);
+	size_t numOfShifts = a.get_number_of_elements()/x;
+	fftshiftPivot1D(a.begin(), x, numOfShifts, pivot);
+}
+
+template<typename T>
+void hoNDFFT<T>::fftshift1D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+{
+	if ( !r.dimensions_equal(&a) )
+	{
+		r = a;
+	}
+
+	size_t x = a.get_size(0);
+	size_t pivot = fftshiftPivot(x);
+	size_t numOfShifts = a.get_number_of_elements()/x;
+	fftshiftPivot1D(a.begin(), r.begin(), x, numOfShifts, pivot);
+}
+
+template<typename T>
+void hoNDFFT<T>::ifftshift1D(hoNDArray< ComplexType >& a)
+{
+	size_t x = a.get_size(0);
+	size_t pivot = ifftshiftPivot(x);
+	size_t numOfShifts = a.get_number_of_elements()/x;
+
+	fftshiftPivot1D(a.begin(), x, numOfShifts, pivot);
+}
+
+template<typename T>
+void hoNDFFT<T>::ifftshift1D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+{
+	if ( !r.dimensions_equal(&a) )
+	{
+		r = a;
+	}
+
+	size_t x = a.get_size(0);
+	size_t pivot = ifftshiftPivot(x);
+	size_t numOfShifts = a.get_number_of_elements()/x;
+
+	fftshiftPivot1D(a.begin(), r.begin(), x, numOfShifts, pivot);
+}
+
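+// Circularly shifts each x-y slice by (pivotx, pivoty) using two row-wise memcpy calls per row;
+// the n slices are processed in parallel with OpenMP.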
+template<typename T>
+void hoNDFFT<T>::fftshiftPivot2D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t n, size_t pivotx, size_t pivoty)
+{
+	if (a==NULL || r == NULL) throw std::runtime_error("hoNDFFT::fftshiftPivot2D: void ptr provided");
+
+	long long tt;
+
+#pragma omp parallel for private(tt) shared(a, r, x, y, n, pivotx, pivoty) if (n>16)
+	for ( tt=0; tt<(long long)n; tt++ )
+	{
+		const ComplexType* ac = a + tt*x*y;
+		ComplexType* rc = r + tt*x*y;
+
+		size_t ay, ry;
+
+		for ( ay=pivoty; ay<y; ay++ )
+		{
+			ry = ay - pivoty;
+			memcpy(rc+ry*x, ac+ay*x+pivotx, sizeof(ComplexType)*(x-pivotx));
+			memcpy(rc+ry*x+x-pivotx, ac+ay*x, sizeof(ComplexType)*pivotx);
+		}
+
+		for ( ay=0; ay<pivoty; ay++ )
+		{
+			ry = ay + y - pivoty;
+			memcpy(rc+ry*x, ac+ay*x+pivotx, sizeof(ComplexType)*(x-pivotx));
+			memcpy(rc+ry*x+x-pivotx, ac+ay*x, sizeof(ComplexType)*pivotx);
+		}
+	}
+}
+
+template<typename T>
+void hoNDFFT<T>::fftshiftPivot2D(ComplexType* a, size_t x, size_t y, size_t n, size_t pivotx, size_t pivoty)
+{
+
+	if (a==NULL ) throw std::runtime_error("hoNDFFT::fftshiftPivot2D: void ptr provided");
+
+	long long tt;
+
+#pragma omp parallel private(tt) shared(a, x, y, n, pivotx, pivoty) if (n>16)
+	{
+		hoNDArray< ComplexType > aTmp(x*y);
+		ComplexType* rc = aTmp.begin();
+
+#pragma omp for
+		for ( tt=0; tt<(long long)n; tt++ )
+		{
+			ComplexType* ac = a + tt*x*y;
+
+			size_t ay, ry;
+
+			for ( ay=pivoty; ay<y; ay++ )
+			{
+				ry = ay - pivoty;
+				memcpy(rc+ry*x, ac+ay*x+pivotx, sizeof(ComplexType)*(x-pivotx));
+				memcpy(rc+ry*x+x-pivotx, ac+ay*x, sizeof(ComplexType)*pivotx);
+			}
+
+			for ( ay=0; ay<pivoty; ay++ )
+			{
+				ry = ay + y - pivoty;
+				memcpy(rc+ry*x, ac+ay*x+pivotx, sizeof(ComplexType)*(x-pivotx));
+				memcpy(rc+ry*x+x-pivotx, ac+ay*x, sizeof(ComplexType)*pivotx);
+			}
+
+			memcpy(ac, rc, sizeof(ComplexType)*x*y);
+		}
+	}
+}
+
+template<typename T>
+inline void hoNDFFT<T>::fftshift2D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t n)
+{
+
+	if (a==NULL || r == NULL ) throw std::runtime_error("hoNDFFT::fftshift2D: void ptr provided");
+
+	size_t pivotx = fftshiftPivot(x);
+	size_t pivoty = fftshiftPivot(y);
+
+	fftshiftPivot2D(a, r, x, y, n, pivotx, pivoty);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifftshift2D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t n)
+{
+
+	if (a==NULL || r == NULL ) throw std::runtime_error("hoNDFFT::ifftshift2D: void ptr provided");
+
+	size_t pivotx = ifftshiftPivot(x);
+	size_t pivoty = ifftshiftPivot(y);
+
+	fftshiftPivot2D(a, r, x, y, n, pivotx, pivoty);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::fftshift2D(ComplexType* a, size_t x, size_t y, size_t n)
+{
+
+	if (a==NULL ) throw std::runtime_error("hoNDFFT::fftshift2D: void ptr provided");
+
+	size_t pivotx = fftshiftPivot(x);
+	size_t pivoty = fftshiftPivot(y);
+
+	fftshiftPivot2D(a, x, y, n, pivotx, pivoty);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifftshift2D(ComplexType* a, size_t x, size_t y, size_t n)
+{
+
+	if (a==NULL ) throw std::runtime_error("hoNDFFT::ifftshift2D: void ptr provided");
+
+	size_t pivotx = ifftshiftPivot(x);
+	size_t pivoty = ifftshiftPivot(y);
+
+	fftshiftPivot2D(a, x, y, n, pivotx, pivoty);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::fftshift2D(hoNDArray< ComplexType >& a)
+{
+	size_t n = a.get_number_of_elements()/(a.get_size(0)*a.get_size(1));
+	return fftshift2D(a.begin(), a.get_size(0), a.get_size(1), n);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::fftshift2D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+{
+	if ( !r.dimensions_equal(&a) )
+	{
+		r = a;
+	}
+
+	size_t n = a.get_number_of_elements()/(a.get_size(0)*a.get_size(1));
+	return fftshift2D(a.begin(), r.begin(), a.get_size(0), a.get_size(1), n);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifftshift2D(hoNDArray< ComplexType >& a)
+{
+	size_t n = a.get_number_of_elements()/(a.get_size(0)*a.get_size(1));
+	return ifftshift2D(a.begin(), a.get_size(0), a.get_size(1), n);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifftshift2D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+{
+	if ( !r.dimensions_equal(&a) )
+	{
+		r = a;
+	}
+
+	size_t n = a.get_number_of_elements()/(a.get_size(0)*a.get_size(1));
+	return ifftshift2D(a.begin(), r.begin(), a.get_size(0), a.get_size(1), n);
+}
+
+template<typename T>
+void hoNDFFT<T>::fftshiftPivot3D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t z, size_t n, size_t pivotx, size_t pivoty,  size_t pivotz)
+{
+
+	if (a==NULL  || r == NULL) throw std::runtime_error("hoNDFFT::fftshiftPivot3D: void ptr provided");
+
+	long long tt;
+
+#pragma omp parallel for private(tt) shared(a, r, x, y, z, n, pivotx, pivoty, pivotz) if (n>16)
+	for ( tt=0; tt<(long long)n; tt++ )
+	{
+		size_t ay, ry, az, rz;
+
+		for ( az=pivotz; az<z; az++ )
+		{
+			rz = az - pivotz;
+
+			const ComplexType* ac = a + tt*x*y*z + az*x*y;
+			ComplexType* rc = r + tt*x*y*z + rz*x*y;
+
+			for ( ay=pivoty; ay<y; ay++ )
+			{
+				ry = ay - pivoty;
+				memcpy(rc+ry*x, ac+ay*x+pivotx, sizeof(ComplexType)*(x-pivotx));
+				memcpy(rc+ry*x+x-pivotx, ac+ay*x, sizeof(ComplexType)*pivotx);
+			}
+
+			for ( ay=0; ay<pivoty; ay++ )
+			{
+				ry = ay + y - pivoty;
+				memcpy(rc+ry*x, ac+ay*x+pivotx, sizeof(ComplexType)*(x-pivotx));
+				memcpy(rc+ry*x+x-pivotx, ac+ay*x, sizeof(ComplexType)*pivotx);
+			}
+		}
+
+		for ( az=0; az<pivotz; az++ )
+		{
+			rz = az + z - pivotz;
+
+			const ComplexType* ac = a + tt*x*y*z + az*x*y;
+			ComplexType* rc = r + tt*x*y*z + rz*x*y;
+
+			for ( ay=pivoty; ay<y; ay++ )
+			{
+				ry = ay - pivoty;
+				memcpy(rc+ry*x, ac+ay*x+pivotx, sizeof(ComplexType)*(x-pivotx));
+				memcpy(rc+ry*x+x-pivotx, ac+ay*x, sizeof(ComplexType)*pivotx);
+			}
+
+			for ( ay=0; ay<pivoty; ay++ )
+			{
+				ry = ay + y - pivoty;
+				memcpy(rc+ry*x, ac+ay*x+pivotx, sizeof(ComplexType)*(x-pivotx));
+				memcpy(rc+ry*x+x-pivotx, ac+ay*x, sizeof(ComplexType)*pivotx);
+			}
+		}
+	}
+}
+
+template<typename T>
+void hoNDFFT<T>::fftshiftPivot3D(ComplexType* a, size_t x, size_t y, size_t z, size_t n, size_t pivotx, size_t pivoty,  size_t pivotz)
+{
+
+	if (a==NULL  ) throw std::runtime_error("hoNDFFT::fftshiftPivot3D: void ptr provided");
+
+	long long tt;
+
+#pragma omp parallel private(tt) shared(a, x, y, z, n, pivotx, pivoty, pivotz) if (n>16)
+	{
+		hoNDArray< ComplexType > aTmp(x*y*z);
+
+#pragma omp for
+		for ( tt=0; tt<(long long)n; tt++ )
+		{
+			size_t ay, ry, az, rz;
+
+			for ( az=pivotz; az<z; az++ )
+			{
+				rz = az - pivotz;
+
+				const ComplexType* ac = a + tt*x*y*z + az*x*y;
+				ComplexType* rc = aTmp.begin() + rz*x*y;
+
+				for ( ay=pivoty; ay<y; ay++ )
+				{
+					ry = ay - pivoty;
+					memcpy(rc+ry*x, ac+ay*x+pivotx, sizeof(ComplexType)*(x-pivotx));
+					memcpy(rc+ry*x+x-pivotx, ac+ay*x, sizeof(ComplexType)*pivotx);
+				}
+
+				for ( ay=0; ay<pivoty; ay++ )
+				{
+					ry = ay + y - pivoty;
+					memcpy(rc+ry*x, ac+ay*x+pivotx, sizeof(ComplexType)*(x-pivotx));
+					memcpy(rc+ry*x+x-pivotx, ac+ay*x, sizeof(ComplexType)*pivotx);
+				}
+			}
+
+			for ( az=0; az<pivotz; az++ )
+			{
+				rz = az + z - pivotz;
+
+				const ComplexType* ac = a + tt*x*y*z + az*x*y;
+				ComplexType* rc = aTmp.begin() + rz*x*y;
+
+				for ( ay=pivoty; ay<y; ay++ )
+				{
+					ry = ay - pivoty;
+					memcpy(rc+ry*x, ac+ay*x+pivotx, sizeof(ComplexType)*(x-pivotx));
+					memcpy(rc+ry*x+x-pivotx, ac+ay*x, sizeof(ComplexType)*pivotx);
+				}
+
+				for ( ay=0; ay<pivoty; ay++ )
+				{
+					ry = ay + y - pivoty;
+					memcpy(rc+ry*x, ac+ay*x+pivotx, sizeof(ComplexType)*(x-pivotx));
+					memcpy(rc+ry*x+x-pivotx, ac+ay*x, sizeof(ComplexType)*pivotx);
+				}
+			}
+
+			memcpy(a+tt*x*y*z, aTmp.begin(), sizeof(ComplexType)*x*y*z);
+		}
+	}
+}
+
+template<typename T>
+inline void hoNDFFT<T>::fftshift3D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t z, size_t n)
+{
+
+	if (a==NULL  || r==NULL ) throw std::runtime_error("hoNDFFT::fftshift3D: void ptr provided");
+
+	size_t pivotx = fftshiftPivot(x);
+	size_t pivoty = fftshiftPivot(y);
+	size_t pivotz = fftshiftPivot(z);
+
+	fftshiftPivot3D(a, r, x, y, z, n, pivotx, pivoty, pivotz);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifftshift3D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t z, size_t n)
+{
+
+	if (a==NULL  || r==NULL ) throw std::runtime_error("hoNDFFT::ifftshift3D: void ptr provided");
+
+	size_t pivotx = ifftshiftPivot(x);
+	size_t pivoty = ifftshiftPivot(y);
+	size_t pivotz = ifftshiftPivot(z);
+
+	fftshiftPivot3D(a, r, x, y, z, n, pivotx, pivoty, pivotz);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::fftshift3D(ComplexType* a, size_t x, size_t y, size_t z, size_t n)
+{
+	if (a==NULL   ) throw std::runtime_error("hoNDFFT::fftshift3D: void ptr provided");
+	size_t pivotx = fftshiftPivot(x);
+	size_t pivoty = fftshiftPivot(y);
+	size_t pivotz = fftshiftPivot(z);
+	fftshiftPivot3D(a, x, y, z, n, pivotx, pivoty, pivotz);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifftshift3D(ComplexType* a, size_t x, size_t y, size_t z, size_t n)
+{
+	if (a==NULL   ) throw std::runtime_error("hoNDFFT::ifftshift3D: void ptr provided");
+
+	size_t pivotx = ifftshiftPivot(x);
+	size_t pivoty = ifftshiftPivot(y);
+	size_t pivotz = ifftshiftPivot(z);
+
+	fftshiftPivot3D(a, x, y, z, n, pivotx, pivoty, pivotz);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::fftshift3D(hoNDArray< ComplexType >& a)
+{
+	size_t n = a.get_number_of_elements()/(a.get_size(0)*a.get_size(1)*a.get_size(2));
+	return fftshift3D(a.begin(), a.get_size(0), a.get_size(1), a.get_size(2), n);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::fftshift3D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+{
+	if ( !r.dimensions_equal(&a) )
+	{
+		r = a;
+	}
+
+	size_t n = a.get_number_of_elements()/(a.get_size(0)*a.get_size(1)*a.get_size(2));
+	return fftshift3D(a.begin(), r.begin(), a.get_size(0), a.get_size(1), a.get_size(2), n);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifftshift3D(hoNDArray< ComplexType >& a)
+{
+	size_t n = a.get_number_of_elements()/(a.get_size(0)*a.get_size(1)*a.get_size(2));
+	return ifftshift3D(a.begin(), a.get_size(0), a.get_size(1), a.get_size(2), n);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifftshift3D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+{
+	if ( !r.dimensions_equal(&a) )
+	{
+		r = a;
+	}
+
+	size_t n = a.get_number_of_elements()/(a.get_size(0)*a.get_size(1)*a.get_size(2));
+	return ifftshift3D(a.begin(), r.begin(), a.get_size(0), a.get_size(1), a.get_size(2), n);
+}
+
+// -----------------------------------------------------------------------------------------
+
+template<typename T>
+inline void hoNDFFT<T>::fft1(hoNDArray< ComplexType >& a)
+{
+	return fft1(a, true);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifft1(hoNDArray< ComplexType >& a)
+{
+	return fft1(a, false);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::fft1(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+{
+	if ( !r.dimensions_equal(&a) )
+	{
+		r.create(a.get_dimensions());
+	}
+
+	return fft1(const_cast<hoNDArray< ComplexType >&>(a), r, true);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifft1(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+{
+	if ( !r.dimensions_equal(&a) )
+	{
+		r.create(a.get_dimensions());
+	}
+
+	return fft1(const_cast<hoNDArray< ComplexType >&>(a), r, false);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::fft1c(hoNDArray< ComplexType >& a)
+{
+	ifftshift1D(a);
+	fft1(a);
+	fftshift1D(a);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifft1c(hoNDArray< ComplexType >& a)
+{
+	ifftshift1D(a);
+	ifft1(a);
+	fftshift1D(a);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::fft1c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+{
+	ifftshift1D(a, r);
+	fft1(r);
+	fftshift1D(r);
+
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifft1c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+{
+	ifftshift1D(a, r);
+	ifft1(r);
+	fftshift1D(r);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::fft1c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf)
+{
+	ifftshift1D(a, r);
+	fft1(r, buf);
+	fftshift1D(buf, r);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifft1c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf)
+{
+	ifftshift1D(a, r);
+	ifft1(r, buf);
+	fftshift1D(buf, r);
+}
+
+// -----------------------------------------------------------------------------------------
+
+template<typename T>
+inline void hoNDFFT<T>::fft2(hoNDArray< ComplexType >& a)
+{
+	fft2(a, true);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifft2(hoNDArray< ComplexType >& a)
+{
+	fft2(a, false);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::fft2(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+{
+	//r = a;
+	//return fft2(r);
+	if ( !r.dimensions_equal(&a) )
+	{
+		r.create(a.get_dimensions());
+	}
+
+	fft2(const_cast<hoNDArray< ComplexType >&>(a), r, true);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifft2(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+{
+	/*r = a;
+        return ifft2(r);*/
+
+	if ( !r.dimensions_equal(&a) )
+	{
+		r.create(a.get_dimensions());
+	}
+
+	fft2(const_cast<hoNDArray< ComplexType >&>(a), r, false);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::fft2c(hoNDArray< ComplexType >& a)
+{
+	ifftshift2D(a);
+	fft2(a);
+	fftshift2D(a);
+
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifft2c(hoNDArray< ComplexType >& a)
+{
+	ifftshift2D(a);
+	ifft2(a);
+	fftshift2D(a);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::fft2c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+{
+	ifftshift2D(a, r);
+	fft2(r);
+	fftshift2D(r);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifft2c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+{
+	ifftshift2D(a, r);
+	ifft2(r);
+	fftshift2D(r);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::fft2c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf)
+{
+	ifftshift2D(a, r);
+	fft2(r, buf);
+	fftshift2D(buf, r);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifft2c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf)
+{
+	ifftshift2D(a, r);
+	ifft2(r, buf);
+	fftshift2D(buf, r);
+}
+
+// -----------------------------------------------------------------------------------------
+
+template<typename T>
+inline void hoNDFFT<T>::fft3(hoNDArray< ComplexType >& a)
+{
+	fft3(a, true);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifft3(hoNDArray< ComplexType >& a)
+{
+	fft3(a, false);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::fft3(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+{
+	/*r = a;
+        return fft3(r);*/
+	if ( !r.dimensions_equal(&a) )
+	{
+		r.create(a.get_dimensions());
+	}
+
+	fft3(const_cast<hoNDArray< ComplexType >&>(a), r, true);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifft3(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+{
+	/*r = a;
+        return ifft3(r);*/
+	if ( !r.dimensions_equal(&a) )
+	{
+		r.create(a.get_dimensions());
+	}
+
+	fft3(const_cast<hoNDArray< ComplexType >&>(a), r, false);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::fft3c(hoNDArray< ComplexType >& a)
+{
+	ifftshift3D(a);
+	fft3(a);
+	fftshift3D(a);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifft3c(hoNDArray< ComplexType >& a)
+{
+	ifftshift3D(a);
+	ifft3(a);
+	fftshift3D(a);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::fft3c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+{
+	ifftshift3D(a, r);
+	fft3(r);
+	fftshift3D(r);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifft3c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+{
+	ifftshift3D(a, r);
+	ifft3(r);
+	fftshift3D(r);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::fft3c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf)
+{
+	ifftshift3D(a, r);
+	fft3(r, buf);
+	fftshift3D(buf, r);
+}
+
+template<typename T>
+inline void hoNDFFT<T>::ifft3c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf)
+{
+	ifftshift3D(a, r);
+	ifft3(r, buf);
+	fftshift3D(buf, r);
+}
+
+template<typename T>
+void hoNDFFT<T>::fft1(hoNDArray< ComplexType >& a, bool forward)
+{
+	hoNDArray< ComplexType > res(a);
+	fft1(res, a, forward);
+}
+
+template<typename T>
+void hoNDFFT<T>::fft2(hoNDArray< ComplexType >& a, bool forward)
+{
+	hoNDArray< ComplexType > res(a);
+	fft2(res, a, forward);
+}
+
+template<typename T>
+void hoNDFFT<T>::fft3(hoNDArray< ComplexType >& a, bool forward)
+{
+	hoNDArray< ComplexType > res(a);
+	fft3(res, a, forward);
+}
+
+template<typename T>
+void hoNDFFT<T>::fft1(hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, bool forward)
+{
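+	// Outline of what follows (descriptive comment): when several threads are worthwhile, a
+	// single plan is created under the mutex (FFTW planning is not thread-safe) and re-executed
+	// across the batch in an OpenMP loop via the new-array execute interface; otherwise the
+	// batched plan_many interface is planned and executed once. The result is scaled by 1/sqrt(n0).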
+	r = a;
+
+	int n0 = (int)a.get_size(0);
+	T fftRatio = T(1.0/std::sqrt( T(n0) ));
+
+
+	int num = (int)(a.get_number_of_elements()/n0);
+	int num_thr = get_num_threads_fft1(n0, num);
+
+	int n;
+
+
+	typename fftw_types<T>::plan * p;
+
+	if( num_thr > 1 )
+	{
+		{
+			mutex_.lock();
+			if ( forward )
+			{
+				p = fftw_plan_dft_1d_(n0,a.get_data_ptr(),r.get_data_ptr(),FFTW_FORWARD,FFTW_ESTIMATE);
+			}
+			else
+			{
+				p = fftw_plan_dft_1d_(n0,a.get_data_ptr(),r.get_data_ptr(),FFTW_BACKWARD,FFTW_ESTIMATE);
+			}
+			mutex_.unlock();
+		}
+
+#pragma omp parallel for private(n) shared(num, p, a, n0, r) num_threads(num_thr)
+		for ( n=0; n<num; n++ )
+		{
+			fftw_execute_dft_(p, a.get_data_ptr()+n*n0,
+					r.get_data_ptr()+n*n0);
+		}
+
+		{
+			mutex_.lock();
+			fftw_destroy_plan_(p);
+			mutex_.unlock();
+		}
+	}
+	else
+	{
+		// multiple fft interface
+		{
+			mutex_.lock();
+			if ( forward )
+			{
+				p = fftw_plan_many_dft_(1, &n0, num,
+						a.get_data_ptr(), NULL,
+						1, n0,
+						r.get_data_ptr(), NULL,
+						1, n0,
+						FFTW_FORWARD, FFTW_ESTIMATE);
+			}
+			else
+			{
+				p = fftw_plan_many_dft_(1, &n0, num,
+						a.get_data_ptr(),NULL,
+						1, n0,
+						r.get_data_ptr(), NULL,
+						1, n0,
+						FFTW_BACKWARD, FFTW_ESTIMATE);
+			}
+			mutex_.unlock();
+		}
+
+		fftw_execute_(p);
+
+		{
+			mutex_.lock();
+			fftw_destroy_plan_(p);
+			mutex_.unlock();
+		}
+	}
+
+	r *= fftRatio;
+}
+
+template<typename T>
+void hoNDFFT<T>::fft2(hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, bool forward)
+{
+	r = a;
+
+	int n0 = (int)a.get_size(1);
+	int n1 = (int)a.get_size(0);
+
+	T fftRatio = T(1.0/std::sqrt( T(n0*n1) ));
+
+	int num = (int)(a.get_number_of_elements()/(n0*n1));
+	int num_thr = get_num_threads_fft2(n0, n1, num);
+
+	int n;
+
+
+	typename fftw_types<T>::plan * p;
+
+	if ( num_thr > 1 )
+	{
+		{
+			mutex_.lock();
+			p = fftw_plan_dft_2d_(n0, n1,
+					a.begin(),
+					r.begin(),
+					forward ? FFTW_FORWARD : FFTW_BACKWARD, FFTW_ESTIMATE);
+			mutex_.unlock();
+		}
+
+#pragma omp parallel for private(n) shared(num, p, a, n0, n1, r) num_threads(num_thr)
+		for ( n=0; n<num; n++ )
+		{
+			fftw_execute_dft_(p, a.begin()+n*n0*n1,
+					r.begin()+n*n0*n1);
+		}
+
+		{
+			mutex_.lock();
+			fftw_destroy_plan_(p);
+			mutex_.unlock();
+		}
+	}
+	else
+	{
+		// multiple fft interface
+
+		int n[] = {n0, n1};
+		int idist = n0*n1;
+		int odist = n0*n1;
+
+		{
+			mutex_.lock();
+
+			p = fftw_plan_many_dft_(2, n, num,
+					a.begin(), NULL,
+					1, idist,
+					r.begin(), NULL,
+					1, odist,
+					forward ? FFTW_FORWARD : FFTW_BACKWARD, FFTW_ESTIMATE);
+
+			mutex_.unlock();
+		}
+
+		fftw_execute_(p);
+
+		{
+			mutex_.lock();
+			fftw_destroy_plan_(p);
+			mutex_.unlock();
+		}
+	}
+
+	r *= fftRatio;
+
+}
+
+template<typename T>
+void hoNDFFT<T>::fft3(hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, bool forward)
+{
+	r = a;
+
+	int n2 = (int)a.get_size(0);
+	int n1 = (int)a.get_size(1);
+	int n0 = (int)a.get_size(2);
+
+	T fftRatio = T(1.0/std::sqrt( T(n0*n1*n2) ));
+
+	int num = (int)(a.get_number_of_elements()/(n0*n1*n2));
+	int num_thr = get_num_threads_fft3(n0, n1, n2, num);
+
+	long long n;
+
+	typename fftw_types<T>::plan * p;
+
+	{
+		mutex_.lock();
+
+		p = fftw_plan_dft_3d_(n0, n1, n2,
+				a.get_data_ptr(),
+				r.get_data_ptr(),
+				forward ? FFTW_FORWARD : FFTW_BACKWARD, FFTW_ESTIMATE);
+
+		mutex_.unlock();
+	}
+
+#pragma omp parallel for private(n) shared(num, p, a, n0, n1, n2, r) if (num_thr > 1) num_threads(num_thr)
+	for ( n=0; n<num; n++ )
+	{
+		fftw_execute_dft_(p, a.begin()+n*n0*n1*n2,
+				r.begin()+n*n0*n1*n2);
+	}
+
+	{
+		mutex_.lock();
+		fftw_destroy_plan_(p);
+		mutex_.unlock();
+	}
+
+	r *= fftRatio;
+
+}
+// TODO: implement more optimized threading strategy
+template<typename T>
+inline int hoNDFFT<T>::get_num_threads_fft1(size_t n0, size_t num)
+{
+	if ( num_of_max_threads_ == 1 ) return 1;
+
+	if ( n0*num>1024*128 )
+	{
+		return num_of_max_threads_;
+	}
+	else if ( n0*num>512*128 )
+	{
+		return ( (num_of_max_threads_>8) ? 8 : num_of_max_threads_);
+	}
+	else if ( n0*num>256*128 )
+	{
+		return ( (num_of_max_threads_>4) ? 4 : num_of_max_threads_);
+	}
+	else if ( n0*num>128*128 )
+	{
+		return 2;
+	}
+
+	return 1;
+}
+
+template<typename T>
+inline int hoNDFFT<T>::get_num_threads_fft2(size_t n0, size_t n1, size_t num)
+{
+	if ( num_of_max_threads_ == 1 ) return 1;
+
+	if ( n0*n1*num>128*128*64 )
+	{
+		return num_of_max_threads_;
+	}
+	else if ( n0*n1*num>128*128*32 )
+	{
+		return ( (num_of_max_threads_>8) ? 8 : num_of_max_threads_);
+	}
+	else if ( n0*n1*num>128*128*16 )
+	{
+		return ( (num_of_max_threads_>4) ? 4 : num_of_max_threads_);
+	}
+	else if ( n0*n1*num>128*128*8 )
+	{
+		return 2;
+	}
+
+	return 1;
+}
+
+template<typename T>
+inline int hoNDFFT<T>::get_num_threads_fft3(size_t n0, size_t n1, size_t n2, size_t num)
+{
+	if ( num_of_max_threads_ == 1 ) return 1;
+
+	if ( num >= num_of_max_threads_ )
+	{
+		return num_of_max_threads_;
+	}
+
+	return 1;
+}
+
+
+template<typename T>
+void hoNDFFT<T>::timeswitch(hoNDArray<ComplexType>* inout, int dim_to_transform){
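+	// Descriptive note: negate every other sample along dim_to_transform (multiplication by (-1)^k);
+	// by the DFT shift theorem this circularly shifts the spectrum along that dimension by half its length.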
+	size_t batchsize = 1;
+	for (int i = 0; i < dim_to_transform; i++)
+		batchsize *= inout->get_size(i);
+
+	size_t dimsize = inout->get_size(dim_to_transform);
+
+
+	ComplexType* data = inout->get_data_ptr();
+	size_t num_elements = inout->get_number_of_elements();
+#pragma omp parallel for
+	for (long int k = 0; k < num_elements; k++){
+		size_t index = (k/batchsize)%dimsize;
+		if (index%2 == 1)
+			data[k] *= -1;
+	}
+}
+
+template<typename T>
+void hoNDFFT<T>::phaseshift(hoNDArray<ComplexType>* inout, T phase, int dim_to_transform){
+	size_t batchsize = 1;
+	for (int i = 0; i < dim_to_transform; i++)
+		batchsize *= inout->get_size(i);
+
+	size_t dimsize = inout->get_size(dim_to_transform);
+
+
+	ComplexType* data = inout->get_data_ptr();
+	size_t num_elements = inout->get_number_of_elements();
+#pragma omp parallel for
+	for (long int k = 0; k < num_elements; k++){
+		T index = (k/batchsize)%dimsize - dimsize/2.0;
+		// multiply by a unit-magnitude phasor so this is a true phase shift (the original code
+		// multiplied by a purely imaginary scalar, which also changes the magnitude)
+		data[k] *= std::exp(ComplexType(0, T(2.0*M_PI)*index*phase));
+
+	}
+}
+template<> int hoNDFFT<float>::fftw_import_wisdom_from_file_(FILE* file){
+	return fftwf_import_wisdom_from_file(file);
+}
+
+template<> int hoNDFFT<double>::fftw_import_wisdom_from_file_(FILE* file){
+	return fftw_import_wisdom_from_file(file);
+}
+
+template<> void hoNDFFT<float>::fftw_export_wisdom_to_file_(FILE* file){
+	return fftwf_export_wisdom_to_file(file);
+}
+
+
+template<> void hoNDFFT<double>::fftw_export_wisdom_to_file_(FILE* file){
+	return fftw_export_wisdom_to_file(file);
+}
+
+template<> void hoNDFFT<float>::fftw_cleanup_(){
+	fftwf_cleanup();
+}
+
+template<> void hoNDFFT<double>::fftw_cleanup_(){
+	fftw_cleanup();
+}
+
+template<> void* hoNDFFT<float>::fftw_malloc_(size_t n){
+	return fftwf_malloc(n);
+}
+
+template<> void* hoNDFFT<double>::fftw_malloc_(size_t n){
+	return fftw_malloc(n);
+}
+
+template<> void hoNDFFT<float>::fftw_free_(void* ptr){
+	fftwf_free(ptr);
+}
+
+template<> void hoNDFFT<double>::fftw_free_(void* ptr){
+	fftw_free(ptr);
+}
+
+template<> void hoNDFFT<float>::fftw_execute_dft_( fftwf_plan_s * ptr, ComplexType* in, ComplexType* out){
+	fftwf_execute_dft(ptr,(fftwf_complex*)in, (fftwf_complex*) out);
+}
+
+
+template<> void hoNDFFT<double>::fftw_execute_dft_(fftw_plan_s * ptr, ComplexType* in, ComplexType* out){
+	fftw_execute_dft(ptr, (fftw_complex*)in, (fftw_complex*)out);
+}
+
+template<> void hoNDFFT<float>::fftw_execute_( fftwf_plan_s * ptr){
+	fftwf_execute(ptr);
+}
+
+
+template<> void hoNDFFT<double>::fftw_execute_(fftw_plan_s * ptr){
+	fftw_execute(ptr);
+}
+
+template<> typename fftw_types<float>::plan * hoNDFFT<float>::fftw_plan_dft_1d_(int rank,  ComplexType* in, ComplexType * out,int sign, unsigned int flags){
+	return fftwf_plan_dft_1d(rank,(fftwf_complex*)in,(fftwf_complex*)out,sign,flags);
+}
+template<> typename fftw_types<double>::plan * hoNDFFT<double>::fftw_plan_dft_1d_(int rank, ComplexType* in, ComplexType * out,int sign, unsigned int flags){
+	return fftw_plan_dft_1d(rank,(fftw_complex*)in,(fftw_complex*)out,sign,flags);
+}
+
+template<> typename fftw_types<float>::plan * hoNDFFT<float>::fftw_plan_dft_2d_(int n0, int n1,  ComplexType* in, ComplexType * out,int sign, unsigned int flags){
+	return fftwf_plan_dft_2d(n0,n1,(fftwf_complex*)in,(fftwf_complex*)out,sign,flags);
+}
+template<> typename fftw_types<double>::plan * hoNDFFT<double>::fftw_plan_dft_2d_(int n0, int n1, ComplexType* in, ComplexType * out,int sign, unsigned int flags){
+	return fftw_plan_dft_2d(n0,n1,(fftw_complex*)in,(fftw_complex*)out,sign,flags);
+}
+
+
+template<> typename fftw_types<float>::plan * hoNDFFT<float>::fftw_plan_dft_3d_(int n0, int n1, int n2, ComplexType* in, ComplexType * out,int sign, unsigned int flags){
+	return fftwf_plan_dft_3d(n0,n1,n2,(fftwf_complex*)in,(fftwf_complex*)out,sign,flags);
+}
+template<> typename fftw_types<double>::plan * hoNDFFT<double>::fftw_plan_dft_3d_(int n0, int n1, int n2, ComplexType* in, ComplexType * out,int sign, unsigned int flags){
+	return fftw_plan_dft_3d(n0,n1,n2,(fftw_complex*)in,(fftw_complex*)out,sign,flags);
+}
+
+template<> typename fftw_types<float>::plan * hoNDFFT<float>::fftw_plan_many_dft_(int rank, const int *n, int howmany,
+		ComplexType *in, const int *inembed,
+		int istride, int idist,
+		ComplexType *out, const int *onembed,
+		int ostride, int odist,
+		int sign, unsigned flags){
+	return fftwf_plan_many_dft(rank,n,howmany,(fftwf_complex*)in,inembed,istride,idist,(fftwf_complex*)out,onembed,ostride,odist,sign,flags);
+}
+
+template<> typename fftw_types<double>::plan * hoNDFFT<double>::fftw_plan_many_dft_(int rank, const int *n, int howmany,
+		ComplexType *in, const int *inembed,
+		int istride, int idist,
+		ComplexType *out, const int *onembed,
+		int ostride, int odist,
+		int sign, unsigned flags){
+	return fftw_plan_many_dft(rank,n,howmany,(fftw_complex*)in,inembed,istride,idist,(fftw_complex*)out,onembed,ostride,odist,sign,flags);
+}
+
+template<> void hoNDFFT<float>::fftw_destroy_plan_( typename fftw_types<float>::plan * p ){
+	fftwf_destroy_plan(p);
+}
+
+template<> void hoNDFFT<double>::fftw_destroy_plan_( typename fftw_types<double>::plan * p ){
+	fftw_destroy_plan(p);
+}
+
+template<> void hoNDFFT<double>::fftw_print_plan_( typename fftw_types<double>::plan * p ){
+	fftw_print_plan(p);
+}
+
+
+template<> void hoNDFFT<float>::fftw_print_plan_( typename fftw_types<float>::plan * p ){
+	fftwf_print_plan(p);
+}
+
+// -----------------------------------------------------------------------------------------
+
+
+//
+// Instantiation
+//
+
+template class EXPORTCPUFFT hoNDFFT<float>;
+template class EXPORTCPUFFT hoNDFFT<double>;
+}
diff --git a/toolboxes/fft/cpu/hoNDFFT.h b/toolboxes/fft/cpu/hoNDFFT.h
new file mode 100644
index 0000000..1d06fa5
--- /dev/null
+++ b/toolboxes/fft/cpu/hoNDFFT.h
@@ -0,0 +1,282 @@
+/** \file hoNDFFT.h
+    \brief Wrappers for FFTW for ndarrays of type std::complex.
+*/
+
+#ifndef hoNDFFT_H
+#define hoNDFFT_H
+
+#include "hoNDArray.h"
+#include "cpufft_export.h"
+
+#include <boost/thread/mutex.hpp>
+#include <iostream>
+#include <fftw3.h>
+#include <complex>
+
+#ifdef USE_OMP
+    #include "omp.h"
+#endif // USE_OMP
+
+namespace Gadgetron{
+
+template<class T> struct fftw_types{};
+
+template<> struct fftw_types<float>{
+	typedef fftwf_complex complex;
+	typedef fftwf_plan_s plan;
+};
+
+template<> struct fftw_types<double>{
+	typedef fftw_complex complex;
+	typedef fftw_plan_s plan;
+};
+
+
+    /** 
+    Generic class for Fast Fourier Transforms using FFTW on the hoNDArray class.
+    This class is a singleton because the planning and memory allocation routines of FFTW are NOT thread-safe.
+    The class' template type is a REAL, i.e. float or double.
+
+    Note that the scaling is 1/sqrt(N) for both FFT and IFFT, where N is the number of elements along the FFT dimensions.
+    Access using e.g.
+    hoNDFFT<float>::instance()
+    */
+    template <typename T> class EXPORTCPUFFT hoNDFFT
+    {
+    public:
+
+        typedef std::complex<T> ComplexType;
+
+        static hoNDFFT<T>* instance(); 
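+        // A minimal usage sketch (illustrative only; `ksp` is a hypothetical
+        // hoNDArray< std::complex<float> > holding 2D k-space data):
+        //
+        //     hoNDArray< std::complex<float> > img;
+        //     hoNDFFT<float>::instance()->ifft2c(ksp, img);  // centered inverse 2D FFT, scaled by 1/sqrt(N)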
+
+        void fft(hoNDArray< ComplexType >* input, unsigned int dim_to_transform)
+        {
+            //-1 refers to the sign of the transform, -1 for FFTW_FORWARD
+            fft_int(input,dim_to_transform,-1);
+        }
+
+        void ifft(hoNDArray< ComplexType >* input, unsigned int dim_to_transform)
+        {
+            //1 refers to the sign of the transform, +1 for FFTW_BACKWARD
+            fft_int(input,dim_to_transform,1);
+        }
+
+        void fft(hoNDArray< ComplexType >* input)
+        {
+            for (size_t i = 0; i < input->get_number_of_dimensions(); i++) {
+                //-1 refers to the sign of the transform, -1 for FFTW_FORWARD
+                fft_int(input,i,-1);
+            }
+        }
+
+        void ifft(hoNDArray< ComplexType >* input)
+        {
+            for (size_t i = 0; i < input->get_number_of_dimensions(); i++) {
+                //1 refers to the sign of the transform, +1 for FFTW_BACKWARD
+                fft_int(input,i,1);
+            }
+        }
+
+
+        void fft(hoNDArray< complext<T> >* input, unsigned int dim_to_transform)
+        {
+            fft((hoNDArray<ComplexType>*) input, dim_to_transform);
+        }
+
+        void ifft(hoNDArray< complext<T> >* input, unsigned int dim_to_transform)
+        {
+            ifft((hoNDArray<ComplexType>*) input, dim_to_transform);
+        }
+
+        void fft(hoNDArray< complext<T> >* input)
+        {
+            fft((hoNDArray<ComplexType>*) input);
+        }
+
+        void ifft(hoNDArray< complext<T> >* input)
+        {
+        	ifft((hoNDArray<ComplexType>*) input);
+        }
+
+
+        // 1D
+        void fftshift1D(hoNDArray< ComplexType >& a);
+        void fftshift1D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        void ifftshift1D(hoNDArray< ComplexType >& a);
+        void ifftshift1D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        // 2D
+        void fftshift2D(hoNDArray< ComplexType >& a);
+        void fftshift2D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        void ifftshift2D(hoNDArray< ComplexType >& a);
+        void ifftshift2D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        // 3D
+        void fftshift3D(hoNDArray< ComplexType >& a);
+        void fftshift3D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        void ifftshift3D(hoNDArray< ComplexType >& a);
+        void ifftshift3D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        // 1D fft, in-place and out-of-place
+        // the first dimension will be transformed
+        void fft1(hoNDArray< ComplexType >& a);
+        void ifft1(hoNDArray< ComplexType >& a);
+
+        void fft1(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+        void ifft1(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        // centered 1D fft
+        void fft1c(hoNDArray< ComplexType >& a);
+        void ifft1c(hoNDArray< ComplexType >& a);
+
+        void fft1c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+        void ifft1c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        void fft1c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf);
+        void ifft1c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf);
+
+        // 2D fft, in-place and out-of-place
+        // the first and second dimensions will be transformed
+        void fft2(hoNDArray< ComplexType >& a);
+        void ifft2(hoNDArray< ComplexType >& a);
+
+        void fft2(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+        void ifft2(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        // centered 2D fft
+        void fft2c(hoNDArray< ComplexType >& a);
+        void ifft2c(hoNDArray< ComplexType >& a);
+
+        void fft2c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+        void ifft2c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        void fft2c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf);
+        void ifft2c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf);
+
+        // 3D fft, in-place and out-of-place
+        // the first, second and third dimensions will be transformed
+        void fft3(hoNDArray< ComplexType >& a);
+        void ifft3(hoNDArray< ComplexType >& a);
+
+        void fft3(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+        void ifft3(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        // centered 3D fft
+        void fft3c(hoNDArray< ComplexType >& a);
+        void ifft3c(hoNDArray< ComplexType >& a);
+
+        void fft3c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+        void ifft3c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        void fft3c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf);
+        void ifft3c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf);
+
+    protected:
+
+        //We are making these protected since this class is a singleton
+
+        hoNDFFT() {
+
+
+#ifdef USE_OMP
+            num_of_max_threads_ = omp_get_num_procs();
+#else
+            num_of_max_threads_ = 1;
+#endif // USE_OMP
+        }
+
+        virtual ~hoNDFFT() { fftw_cleanup_(); }
+
+        void fft_int(hoNDArray< ComplexType >* input, size_t dim_to_transform, int sign);
+
+        void fft_int_uneven(hoNDArray< ComplexType >* input, size_t dim_to_transform, int sign);
+
+        int   fftw_import_wisdom_from_file_(FILE*);
+        void  fftw_export_wisdom_to_file_(FILE*);
+        void  fftw_cleanup_();
+        void* fftw_malloc_(size_t);
+        void  fftw_free_(void* p);
+        void  fftw_execute_dft_(typename fftw_types<T>::plan * p, ComplexType*, ComplexType*);
+        void  fftw_execute_(typename fftw_types<T>::plan * p);
+        void fftw_print_plan_(typename fftw_types<T>::plan *p);
+
+        typename fftw_types<T>::plan * fftw_plan_dft_1d_(int rank, ComplexType*, ComplexType*, int, unsigned);
+        typename fftw_types<T>::plan * fftw_plan_dft_2d_(int dim0,int dim1, ComplexType*, ComplexType*, int, unsigned);
+        typename fftw_types<T>::plan * fftw_plan_dft_3d_(int dim0,int dim1,int dim2, ComplexType*, ComplexType*, int, unsigned);
+
+        typename fftw_types<T>::plan * fftw_plan_many_dft_(int rank, const int *n, int howmany,
+                                          ComplexType *in, const int *inembed,
+                                          int istride, int idist,
+                                          ComplexType *out, const int *onembed,
+                                          int ostride, int odist,
+                                          int sign, unsigned flags);
+        typename fftw_types<T>::plan * fftw_plan_dft_(int rank, ComplexType*, ComplexType*, int, unsigned);
+
+        void  fftw_destroy_plan_(typename fftw_types<T>::plan *);
+
+
+
+        static hoNDFFT<T>* instance_;
+        boost::mutex mutex_;
+
+        int num_of_max_threads_;
+
+        // the fft and ifft shift pivot for a certain length
+        // [0 .. pivot-1] will be shifted to the right end
+        size_t fftshiftPivot(size_t len);
+        size_t ifftshiftPivot(size_t len);
+
+        // 1D
+        void fftshift1D(const ComplexType* a, ComplexType* r, size_t x, size_t pivot);
+        void ifftshift1D(const ComplexType* a, ComplexType* r, size_t x, size_t pivot);
+
+        void fftshiftPivot1D(ComplexType* a, size_t x, size_t n, size_t pivot);
+        void fftshiftPivot1D(const ComplexType* a, ComplexType* r, size_t x, size_t n, size_t pivot);
+
+        // 2D
+        void fftshiftPivot2D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t n, size_t pivotx, size_t pivoty);
+        void fftshiftPivot2D(ComplexType* a, size_t x, size_t y, size_t n, size_t pivotx, size_t pivoty);
+
+        void fftshift2D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t n);
+        void ifftshift2D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t n);
+
+        void fftshift2D(ComplexType* a, size_t x, size_t y, size_t n);
+        void ifftshift2D(ComplexType* a, size_t x, size_t y, size_t n);
+
+        // 3D
+        void fftshiftPivot3D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t z, size_t n, size_t pivotx, size_t pivoty, size_t pivotz);
+        void fftshiftPivot3D(ComplexType* a, size_t x, size_t y, size_t z, size_t n, size_t pivotx, size_t pivoty, size_t pivotz);
+
+        void fftshift3D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t z, size_t n);
+        void ifftshift3D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t z, size_t n);
+
+        void fftshift3D(ComplexType* a, size_t x, size_t y, size_t z, size_t n);
+        void ifftshift3D(ComplexType* a, size_t x, size_t y, size_t z, size_t n);
+
+        // forward: true, fft; false, inverse fft
+        void fft1(hoNDArray< ComplexType >& a, bool forward);
+        void fft2(hoNDArray< ComplexType >& a, bool forward);
+        void fft3(hoNDArray< ComplexType >& a, bool forward);
+
+        void fft1(hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, bool forward);
+        void fft2(hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, bool forward);
+        void fft3(hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, bool forward);
+
+        // get the number of threads used for fft
+        int get_num_threads_fft1(size_t n0, size_t num);
+        int get_num_threads_fft2(size_t n0, size_t n1, size_t num);
+        int get_num_threads_fft3(size_t n0, size_t n1, size_t n2, size_t num);
+
+        /**
+         * Multiplies the array's k'th element along the transform dimension by (-1)^k; by the DFT shift
+         * theorem this leaves the result of an FFT with its zero frequency centered
+         */
+        void timeswitch(hoNDArray<ComplexType>* a, int transform_dim);
+
+        void phaseshift(hoNDArray<ComplexType>* a, T phase, int transform_dim);
+    };
+}
+
+#endif //hoNDFFT_H
diff --git a/toolboxes/fft/gpu/CMakeLists.txt b/toolboxes/fft/gpu/CMakeLists.txt
new file mode 100644
index 0000000..3e8794f
--- /dev/null
+++ b/toolboxes/fft/gpu/CMakeLists.txt
@@ -0,0 +1,35 @@
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_GPUFFT__)
+endif (WIN32)
+
+include_directories( 
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CUDA_INCLUDE_DIRS}
+  ${Boost_INCLUDE_DIR}
+)
+
+cuda_add_library(gadgetron_toolbox_gpufft SHARED 
+    cuNDFFT.h
+    cuNDFFT.cpp
+    cuNDFFT.cu
+  )
+
+set_target_properties(gadgetron_toolbox_gpufft PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(gadgetron_toolbox_gpufft 
+  gadgetron_toolbox_cpucore
+  gadgetron_toolbox_log
+  gadgetron_toolbox_gpucore 
+  ${Boost_LIBRARIES}
+  ${CUDA_LIBRARIES} 
+  ${CUDA_CUFFT_LIBRARIES} 
+  )
+
+install(TARGETS gadgetron_toolbox_gpufft DESTINATION lib COMPONENT main)
+
+install(FILES
+  gpufft_export.h
+  cuNDFFT.h
+  DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
diff --git a/toolboxes/fft/gpu/cuNDFFT.cpp b/toolboxes/fft/gpu/cuNDFFT.cpp
new file mode 100644
index 0000000..013835f
--- /dev/null
+++ b/toolboxes/fft/gpu/cuNDFFT.cpp
@@ -0,0 +1,171 @@
+#include "cuNDFFT.h"
+#include "vector_td.h"
+#include "cuNDArray.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_operators.h"
+
+#include <cufft.h>
+#include <cuComplex.h>
+#include <sstream>
+
+namespace Gadgetron{
+
+  template<class T> cuNDFFT<T>* cuNDFFT<T>::instance()
+  {
+    if (!__instance) {
+      __instance = new cuNDFFT<T>;
+    }
+    return __instance;
+  }
+
+  template<class T> cuNDFFT<T>* cuNDFFT<T>::__instance = NULL;
+
+  template<class T> cufftType_t get_transform_type();
+  template<> cufftType_t get_transform_type<float>() { return CUFFT_C2C; }
+  template<> cufftType_t get_transform_type<double>() { return CUFFT_Z2Z; }
+  
+  template<class T> cufftResult_t cuNDA_FFT_execute( cufftHandle plan, cuNDArray< complext<T> > *in_out, int direction );
+  
+  template<> cufftResult_t cuNDA_FFT_execute<float>( cufftHandle plan, cuNDArray<float_complext> *in_out, int direction ){
+    return cufftExecC2C(plan, (cuFloatComplex*)in_out->get_data_ptr(), (cuFloatComplex*)in_out->get_data_ptr(), direction); }
+
+  template<> cufftResult_t cuNDA_FFT_execute<double>( cufftHandle plan, cuNDArray<double_complext> *in_out, int direction ){
+    return cufftExecZ2Z(plan, (cuDoubleComplex*)in_out->get_data_ptr(), (cuDoubleComplex*)in_out->get_data_ptr(), direction); }
+  
+  template<class T> void
+  cuNDFFT<T>::fft_int( cuNDArray< complext<T> > *input, std::vector<size_t> *dims_to_transform, int direction, bool do_scale )
+  {
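+    // Rough outline (descriptive comment): permute the requested transform dimensions to the front,
+    // plan a batched CUFFT over them, apply timeswitch (multiplication by (-1)^k along each transform
+    // dimension, before an inverse transform and after a forward one), execute in-place, optionally
+    // scale by 1/sqrt(number of transformed elements), and permute back to the original dimension order.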
+    std::vector<size_t> new_dim_order;
+    std::vector<size_t> reverse_dim_order;
+    std::vector<size_t> dims;
+    std::vector<size_t> dim_count(input->get_number_of_dimensions(),0);
+    
+    size_t array_ndim = input->get_number_of_dimensions();
+    boost::shared_ptr< std::vector<size_t> > array_dims = input->get_dimensions();
+    
+    dims = std::vector<size_t>(dims_to_transform->size(),0);
+    for (size_t i = 0; i < dims_to_transform->size(); i++) {
+      if ((*dims_to_transform)[i] >= array_ndim) {
+    	std::stringstream ss;
+    	ss << "cuNDFFT::fft Invalid dimension specified for transform " << (*dims_to_transform)[i] << ", max " << array_ndim;
+	throw std::runtime_error(ss.str());
+      }
+      if (dim_count[(*dims_to_transform)[i]] > 0) {
+	throw std::runtime_error("cuNDFFT::fft Invalid dimensions (duplicates) specified for transform");
+      }
+      dim_count[(*dims_to_transform)[i]]++;
+      dims[dims_to_transform->size()-1-i] = (*array_dims)[(*dims_to_transform)[i]];
+    }
+    
+    new_dim_order = *dims_to_transform;
+    for (size_t i = 0; i < array_ndim; i++) {
+      if (!dim_count[i]) new_dim_order.push_back(i);
+    }
+    
+    reverse_dim_order = std::vector<size_t>(array_ndim,0);
+    for (size_t i = 0; i < array_ndim; i++) {
+      reverse_dim_order[new_dim_order[i]] = i;
+    }
+    
+    size_t ndim = dims.size();
+    size_t batches = 0;
+    size_t elements_in_ft = 1;
+    for (size_t i = 0; i < dims.size(); i++)
+      elements_in_ft *= dims[i];
+    batches = input->get_number_of_elements() / elements_in_ft;
+    
+    cufftHandle plan;
+    cufftResult ftres;
+    
+    std::vector<int> int_dims;
+    for( unsigned int i=0; i<dims.size(); i++ )
+      int_dims.push_back((int)dims[i]);
+
+    ftres = cufftPlanMany(&plan,ndim,&int_dims[0], &int_dims[0], 1, elements_in_ft, &int_dims[0], 1, elements_in_ft, get_transform_type<T>(), batches);
+    if (ftres != CUFFT_SUCCESS) {
+      std::stringstream ss;
+      ss << "cuNDFFT FFT plan failed: " << ftres;
+      throw std::runtime_error(ss.str());
+    }
+    
+    bool must_permute = false;
+
+    {
+    	for (size_t i = 0; i < new_dim_order.size(); i++)
+    		must_permute |= (i != new_dim_order[i]);
+    }
+
+    if (must_permute)
+    	*input = *permute(input,&new_dim_order);
+
+    if (direction == CUFFT_INVERSE) {
+    	for (size_t i = 0; i < dims_to_transform->size(); i++)
+    		timeswitch(input, dims_to_transform->at(i));
+    }
+    
+    if( cuNDA_FFT_execute<T>( plan, input, direction ) != CUFFT_SUCCESS ) {
+      throw std::runtime_error("cuNDFFT FFT execute failed");
+    }
+    
+    ftres = cufftDestroy( plan );
+    if (ftres != CUFFT_SUCCESS) {
+      std::stringstream ss;
+      ss << "cuNDFFT FFT plan destroy failed: " << ftres;
+      throw std::runtime_error(ss.str());
+    }
+    if (direction == CUFFT_FORWARD)
+    	for (size_t i =0; i < dims_to_transform->size(); i++)
+    		timeswitch(input,dims_to_transform->at(i));
+    
+    if (do_scale) {
+      *input *= 1/std::sqrt(T(elements_in_ft));
+    }
+    
+    if (must_permute)
+    	*input = *permute(input,&reverse_dim_order);
+  }
+  
+  template<class T> void
+  cuNDFFT<T>::fft( cuNDArray< complext<T> > *input, std::vector<size_t> *dims_to_transform, bool do_scale )
+  {
+    fft_int(input, dims_to_transform, CUFFT_FORWARD, do_scale);
+  }
+  
+  template<class T> void
+  cuNDFFT<T>::ifft( cuNDArray< complext<T> > *input, std::vector<size_t> *dims_to_transform, bool do_scale )
+  {
+    fft_int(input, dims_to_transform, CUFFT_INVERSE, do_scale);
+  }
+  
+  template<class T> void
+  cuNDFFT<T>::fft( cuNDArray< complext<T> > *input, unsigned int dim_to_transform, bool do_scale )
+  {
+    std::vector<size_t> dims(1,dim_to_transform);
+    fft_int(input, &dims, CUFFT_FORWARD, do_scale);
+  }
+  
+  template<class T> void
+  cuNDFFT<T>::ifft( cuNDArray< complext<T> > *input, unsigned int dim_to_transform, bool do_scale )
+  {
+    std::vector<size_t> dims(1,dim_to_transform);
+    fft_int(input, &dims, CUFFT_INVERSE, do_scale);
+  }
+  
+  template<class T> void
+  cuNDFFT<T>::fft( cuNDArray< complext<T> > *input, bool do_scale )
+  {
+    std::vector<size_t> dims(input->get_number_of_dimensions(),0);
+    for (size_t i = 0; i < dims.size(); i++) dims[i] = i;
+    fft_int(input, &dims, CUFFT_FORWARD, do_scale);
+  }
+  
+  template<class T> void
+  cuNDFFT<T>::ifft( cuNDArray<complext<T> > *input, bool do_scale )
+  {
+    std::vector<size_t> dims(input->get_number_of_dimensions(),0);
+    for (size_t i = 0; i < dims.size(); i++) dims[i] = i;
+    fft_int(input, &dims, CUFFT_INVERSE, do_scale);
+  }
+  
+  // Instantiation
+  template class EXPORTGPUFFT cuNDFFT<float>;
+  template class EXPORTGPUFFT cuNDFFT<double>;
+}
diff --git a/toolboxes/fft/gpu/cuNDFFT.cu b/toolboxes/fft/gpu/cuNDFFT.cu
new file mode 100644
index 0000000..253a2e5
--- /dev/null
+++ b/toolboxes/fft/gpu/cuNDFFT.cu
@@ -0,0 +1,46 @@
+#include "cuNDFFT.h"
+#include "cudaDeviceManager.h"
+
+using namespace Gadgetron;
+
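+// One thread per array element: recover the element's index along the transformed dimension from
+// the flat offset and the batch size, and negate every odd-indexed sample ((-1)^k modulation).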
+template<class T> __global__ void timeswitch_kernel(T* data, unsigned int dimsize, unsigned int batchsize, size_t nelements){
+	unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+	if (idx < nelements){
+		size_t index = (idx/batchsize)%dimsize;
+		if (index%2 == 1)
+			data[idx] *= -1;
+	}
+}
+
+
+template<class T> void Gadgetron::timeswitch(cuNDArray<complext<T> >* inout, int dim_to_transform){
+
+
+	size_t batchsize = 1;
+	for (int i = 0; i < dim_to_transform; i++)
+		batchsize *= inout->get_size(i);
+
+	size_t dimsize = inout->get_size(dim_to_transform);
+
+	size_t nelements = inout->get_number_of_elements();
+
+	size_t max_block = cudaDeviceManager::Instance()->max_blockdim();
+	dim3 dimBlock(std::min(max_block,nelements));
+
+	size_t max_grid = cudaDeviceManager::Instance()->max_griddim();
+	// round up so that gridX*gridY*dimBlock.x covers every element (the kernel guards against overrun)
+	size_t gridX = std::max(std::min((nelements+dimBlock.x-1)/dimBlock.x,max_grid),size_t(1));
+	size_t gridY = std::max(size_t(1),(nelements+gridX*dimBlock.x-1)/(gridX*dimBlock.x));
+
+	dim3 dimGrid(gridX,gridY);
+
+	timeswitch_kernel<<<dimGrid,dimBlock>>>(inout->get_data_ptr(),dimsize,batchsize,nelements);
+
+
+
+}
+
+
+
+template EXPORTGPUFFT void Gadgetron::timeswitch<float>(cuNDArray<float_complext>*, int);
+
+template EXPORTGPUFFT void Gadgetron::timeswitch<double>(cuNDArray<double_complext>*, int);
diff --git a/toolboxes/fft/gpu/cuNDFFT.h b/toolboxes/fft/gpu/cuNDFFT.h
new file mode 100644
index 0000000..0b08216
--- /dev/null
+++ b/toolboxes/fft/gpu/cuNDFFT.h
@@ -0,0 +1,48 @@
+/** \file cuNDFFT.h
+    \brief Wrapper of the CUFFT library for ndarrays of type Gadgetron::complext.
+*/
+
+#ifndef CUNDFFT_H
+#define CUNDFFT_H
+#pragma once
+
+#include "cuNDArray.h"
+#include "gpufft_export.h"
+
+namespace Gadgetron{
+
+  /** \class cuNDFFT
+      \brief Wrapper of the CUFFT library for ndarrays of type complext.
+
+      Wrapper of the CUFFT library for ndarrays of type complext<REAL>.
+      The class' template type is a REAL, i.e. float or double.
+      The FFTs are performed in-place.
+  */
+  template<class T> class EXPORTGPUFFT cuNDFFT
+  {
+  public:
+
+    static cuNDFFT<T>* instance();
+
+    void fft ( cuNDArray<complext<T> > *image, std::vector<size_t> *dims_to_transform, bool do_scale = true );
+    void ifft( cuNDArray<complext<T> > *image, std::vector<size_t> *dims_to_transform, bool do_scale = true );
+
+    void fft ( cuNDArray<complext<T> > *image, unsigned int dim_to_transform, bool do_scale = true);
+    void ifft( cuNDArray<complext<T> > *image, unsigned int dim_to_transform, bool do_scale = true );
+
+    void fft ( cuNDArray<complext<T> > *image, bool do_scale = true );
+    void ifft( cuNDArray<complext<T> > *image, bool do_scale = true );
+
+  protected:   
+    cuNDFFT() {}
+    virtual ~cuNDFFT() {}
+    void fft_int( cuNDArray<complext<T> > *image, std::vector<size_t> *dims_to_transform, int direction, bool do_scale = true );
+    static cuNDFFT<T>* __instance;
+
+
+  };
+
+  template<class T> void timeswitch(cuNDArray<complext<T> >*, int);
+}
+
+#endif
diff --git a/toolboxes/fft/gpu/gpufft_export.h b/toolboxes/fft/gpu/gpufft_export.h
new file mode 100644
index 0000000..ab0252a
--- /dev/null
+++ b/toolboxes/fft/gpu/gpufft_export.h
@@ -0,0 +1,18 @@
+/** \file gpufft_export.h
+    \brief Required definitions for Windows, importing/exporting dll symbols 
+*/
+
+#ifndef GPUFFT_EXPORT_H_
+#define GPUFFT_EXPORT_H_
+
+#if defined (WIN32)
+    #if defined (__BUILD_GADGETRON_GPUFFT__) || defined (gpufft_EXPORTS)
+        #define EXPORTGPUFFT __declspec(dllexport)
+    #else
+        #define EXPORTGPUFFT __declspec(dllimport)
+    #endif
+#else
+    #define EXPORTGPUFFT
+#endif
+
+#endif
diff --git a/toolboxes/gadgettools/CMakeLists.txt b/toolboxes/gadgettools/CMakeLists.txt
new file mode 100644
index 0000000..2fe43e5
--- /dev/null
+++ b/toolboxes/gadgettools/CMakeLists.txt
@@ -0,0 +1,56 @@
+if (WIN32)
+    ADD_DEFINITIONS(-D__BUILD_GADGETRON_GADGETTOOLS__)
+endif (WIN32)
+
+include_directories(
+                    ${CMAKE_BINARY_DIR}/apps/gadgetron
+                    ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools
+                    ${CMAKE_SOURCE_DIR}/toolboxes/core
+                    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+                    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+                    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+                    ${CMAKE_SOURCE_DIR}/apps/gadgetron
+                    ${CMAKE_SOURCE_DIR}/gadgets/mri_core 
+                    ${ACE_INCLUDE_DIR} 
+                    ${Boost_INCLUDE_DIR} 
+ )
+
+
+INCLUDE_DIRECTORIES( ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image )
+
+if(WIN32)
+    link_directories(${Boost_LIBRARY_DIRS})
+endif(WIN32)
+
+add_library(gadgetron_toolbox_gadgettools SHARED
+  gadgettools_export.h 
+  GadgetronSlotContainer.h 
+  GadgetronConnector.h 
+  GadgetronConnector.cpp 
+  GadgetCloudController.h 
+  GadgetronCloudConnector.h )
+
+set_target_properties(gadgetron_toolbox_gadgettools PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(gadgetron_toolbox_gadgettools
+	              gadgetron_toolbox_log
+                      optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY}
+                      ${Boost_LIBRARIES})
+
+install(TARGETS gadgetron_toolbox_gadgettools DESTINATION lib COMPONENT main)
+
+install (FILES 
+            GadgetCloudController.h 
+            GadgetronCloudConnector.h 
+            GadgetronConnector.h
+            gadgettools_export.h
+            GadgetronSlotContainer.h
+            DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
+if(ISMRMRD_FOUND)
+  add_subdirectory(ismrmrd)
+endif(ISMRMRD_FOUND)
diff --git a/toolboxes/gadgettools/GadgetCloudController.h b/toolboxes/gadgettools/GadgetCloudController.h
new file mode 100644
index 0000000..e2ecf76
--- /dev/null
+++ b/toolboxes/gadgettools/GadgetCloudController.h
@@ -0,0 +1,712 @@
+
+#pragma once
+
+#include "ace/Log_Msg.h"
+#include "ace/Synch.h"
+#include "ace/Reactor.h"
+#include "ace/WFMO_Reactor.h"
+#include "ace/TP_Reactor.h"
+#include "ace/SOCK_Stream.h"
+#include "ace/Stream.h"
+#include "ace/Message_Queue.h"
+#include "ace/Svc_Handler.h"
+#include "ace/Reactor_Notification_Strategy.h"
+
+#include <complex>
+#include <vector>
+#include "boost/tuple/tuple.hpp"
+#include "boost/tuple/tuple_comparison.hpp"
+#include "boost/tuple/tuple_io.hpp"
+
+#include "gadgettools_export.h"
+#include "Gadget.h"
+#include "GadgetMessageInterface.h"
+#include "GadgetronCloudConnector.h"
+#include "GadgetImageMessageReader.h"
+#include "GadgetImageMessageWriter.h"
+
+namespace Gadgetron
+{
+
+template<typename JobType> 
+class GadgetCloudJobProcessHandler
+{
+public:
+
+    GadgetCloudJobProcessHandler() {}
+    virtual ~GadgetCloudJobProcessHandler() {}
+
+    virtual bool processJob(int jobID, JobType& ajob) { return true; }
+};
+
+template<typename JobType> 
+class GadgetCloudController : public ACE_Task<ACE_MT_SYNCH>
+{
+public:
+
+    typedef boost::tuple<std::string, std::string, std::string, unsigned int> CloudNodeType;
+    typedef std::vector<CloudNodeType> CloudType;
+
+    GadgetCloudController();
+    virtual ~GadgetCloudController();
+
+    // this GadgetCloudController runs in the passive mode
+    virtual int open(void* = 0);
+
+    virtual int close(unsigned long flags);
+
+    // create connector and register the reader and writer for every connector
+    int createConnector(const CloudType& cloud, 
+        size_t msgID_reader, std::vector<GadgetMessageReader*>& readers, 
+        size_t msgID_writer, std::vector<GadgetMessageWriter*>& writers);
+
+    // connect to the cloud nodes; createConnector must be called first
+    // each CloudNodeType entry carries the host name or IP address, the port number,
+    // and the xml configuration file name to be sent to that node
+    int connectToCloud(const CloudType& cloud);
+
+    // send jobs to the nodes; returned jobs are collected via waitForJobToComplete
+    // for every job, the corresponding node id identifies which node the job is sent to
+    // this call can be made repeatedly, and the wait function will wait for all jobs ever sent
+    int runJobsOnCloud(std::vector<JobType*>& job_list, std::vector<JobType*>& completed_job_list, const std::vector<int>& node_ids);
+    // function to ease the calling
+    int runJobsOnCloud(std::vector<JobType>& job_list, std::vector<JobType>& completed_job_list, const std::vector<int>& node_ids);
+
+    // should be called after calling runJobsOnCloud
+    int waitForJobToComplete();
+
+    // send close message to all nodes
+    int closeCloudNode();
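+
+    // A minimal call-sequence sketch (illustrative only; `cloud`, `readers`, `writers`, `jobs`,
+    // `done` and `node_ids` are hypothetical caller-side variables, and MyJobType a hypothetical job type):
+    //
+    //     GadgetCloudController<MyJobType> controller;
+    //     controller.open();
+    //     controller.createConnector(cloud, GADGET_MESSAGE_CLOUD_JOB, readers, GADGET_MESSAGE_CLOUD_JOB, writers);
+    //     controller.connectToCloud(cloud);
+    //     controller.runJobsOnCloud(jobs, done, node_ids);
+    //     controller.waitForJobToComplete();
+    //     controller.closeCloudNode();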
+
+    virtual int handle_close (ACE_HANDLE handle, ACE_Reactor_Mask close_mask);
+
+    // set jobs on a node to be completed
+    // if jobID == -1, all jobs for this node are set to be completed
+    int setJobsTobeCompleted(unsigned int nodeID, int jobID=-1);
+
+    // get/set the node status, 0/-1 : available/unavailable
+    int get_node_status(int nodeID, int& status)
+    {
+        ACE_GUARD_RETURN(ACE_Thread_Mutex, guard, cloud_controller_mutex_, -1);
+        if ( (nodeID>=0) && (nodeID<node_status_.size()) )
+        {
+            status = node_status_[nodeID];
+        }
+        else
+        {
+            status = -1;
+        }
+        return 0;
+    }
+
+    int set_node_status(int nodeID, int status)
+    {
+        ACE_GUARD_RETURN(ACE_Thread_Mutex, guard, cloud_controller_mutex_, -1);
+        if ( (nodeID>=0) && (nodeID<node_status_.size()) ) node_status_[nodeID] = status;
+        return 0;
+    }
+
+    // append the job list
+    int appendJobList(std::vector<JobType*>& job_list, 
+        std::vector<JobType*>& completed_job_list, 
+        std::vector<int>& node_id_used, std::vector<int>& job_status);
+
+    // list to store jobs sent to nodes
+    std::vector<JobType*> job_list_;
+    // list to store completed jobs from the nodes
+    std::vector<JobType*> completed_job_list_;
+    // for every job, indicate which node a job is sent to
+    std::vector<int> node_id_used_;
+    // job status, 0/-1 : completed/not completed
+    std::vector<int> job_status_;
+
+    // a function handler to process job after receive
+    // this is a hook to give user a chance to do some processing after receiving every job
+    GadgetCloudJobProcessHandler<JobType>* job_handler_;
+
+private:
+
+    // connector to every node
+    // one connector for a node
+    // node id starts from 0, and increase by 1
+    std::vector<GadgetronCloudConnector<JobType>* > cloud_connectors_;
+
+    size_t cloud_msg_id_reader_;
+    size_t cloud_msg_id_writer_;
+
+    // number of available nodes in the cloud
+    unsigned int number_of_nodes_;
+
+    // node status, 0/-1 : available/unavailable
+    std::vector<int> node_status_;
+
+    // number of job actually sent to nodes
+    // if 0, then controller does not need to wait
+    unsigned int number_of_jobs_sent_out_;
+
+    // to protect the access to job_status_ and node_id_used_
+    ACE_Thread_Mutex cloud_controller_mutex_;
+};
+
+template <typename JobType> 
+GadgetCloudController<JobType>::GadgetCloudController() : cloud_msg_id_reader_(GADGET_MESSAGE_CLOUD_JOB), cloud_msg_id_writer_(GADGET_MESSAGE_CLOUD_JOB), job_handler_(NULL), number_of_jobs_sent_out_(0)
+{
+
+}
+
+template <typename JobType> 
+GadgetCloudController<JobType>::~GadgetCloudController()
+{
+    GDEBUG("Into ~GadgetCloudController() ... \n");
+    this->msg_queue()->deactivate();
+
+    for ( unsigned int ii=0; ii<cloud_connectors_.size(); ii++ )
+    {
+        if ( cloud_connectors_[ii] != NULL )
+        {
+            cloud_connectors_[ii]->close();
+            delete cloud_connectors_[ii];
+            cloud_connectors_[ii] = NULL;
+            GDEBUG("~GadgetCloudController() : clean connectors done \n");
+        }
+    }
+}
+
+template <typename JobType> 
+int GadgetCloudController<JobType>::open(void* p)
+{
+    GDEBUG("GadgetCloudController::open\n");
+
+    // set the high water mark of message queue to be 24GB
+    this->msg_queue()->high_water_mark( (size_t)(24.0*1024*1024*1024) );
+
+    return 0;
+}
+
+template <typename JobType> 
+int GadgetCloudController<JobType>::close(unsigned long flags)
+{
+    GDEBUG("GadgetCloudController::close\n");
+    int rval = 0;
+    if (flags == 1)
+    {
+        ACE_Message_Block *hangup = new ACE_Message_Block();
+        hangup->msg_type( ACE_Message_Block::MB_HANGUP );
+        if (this->putq(hangup) == -1)
+        {
+            hangup->release();
+	    GERROR("GadgetCloudController::close, putq\n");
+	    return -1;
+        }
+        rval = this->wait();
+    }
+    return rval;
+}
+
+template <typename JobType> 
+int GadgetCloudController<JobType>::createConnector(const CloudType& cloud, 
+    size_t msgID_reader, std::vector<GadgetMessageReader*>& readers, 
+    size_t msgID_writer, std::vector<GadgetMessageWriter*>& writers)
+{
+    number_of_nodes_ = (unsigned int)cloud.size();
+
+    if ( readers.size() != number_of_nodes_ ) return -1;
+    if ( writers.size() != number_of_nodes_ ) return -1;
+
+    cloud_connectors_.resize(number_of_nodes_, NULL);
+    node_status_.resize(number_of_nodes_, -1);
+
+    cloud_msg_id_reader_ = msgID_reader;
+    cloud_msg_id_writer_ = msgID_writer;
+
+    unsigned int ii;
+    for( ii=0; ii<number_of_nodes_; ii++ )
+    {
+        GadgetronCloudConnector<JobType>* con;
+        ACE_NEW_RETURN (con, GadgetronCloudConnector<JobType>, -1);
+
+        cloud_connectors_[ii] = con;
+        cloud_connectors_[ii]->nodeID_ = ii;
+
+        cloud_connectors_[ii]->register_reader(cloud_msg_id_reader_, readers[ii] );
+        cloud_connectors_[ii]->register_writer(cloud_msg_id_writer_, writers[ii] );
+
+        cloud_connectors_[ii]->set_cloud_controller(this);
+    }
+
+    return 0;
+}
+
+template <typename JobType> 
+int GadgetCloudController<JobType>::
+connectToCloud(const CloudType& cloud)
+{
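+    // Outline (descriptive comment): for each node, open the connection and send the xml
+    // configuration file; a node that fails either step stays marked unavailable, and the call
+    // only fails when no node at all is usable.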
+    number_of_nodes_ = (unsigned int)cloud.size();
+    if ( cloud_connectors_.size() != number_of_nodes_ ) return -1;
+
+    node_status_.resize(number_of_nodes_, -1);
+
+    unsigned int ii;
+    for( ii=0; ii<number_of_nodes_; ii++ )
+    {
+        if ( cloud_connectors_[ii] == NULL ) return -1;
+
+        std::string host = cloud[ii].get<0>();
+        std::string port = cloud[ii].get<1>();
+
+        if ( cloud_connectors_[ii]->open(cloud[ii].get<0>(), cloud[ii].get<1>())!=0 )
+        {
+            cloud_connectors_[ii]->set_status(false);
+
+            ACE_Time_Value tv( (time_t)GADGETRON_TIMEOUT_PERIOD );
+            ACE_OS::sleep(tv);
+
+            GDEBUG("Open connection to (%s):%s failed ... \n", host.c_str(), port.c_str());
+        }
+        else
+        {
+            ACE_Time_Value tv( 0, 500000 ); // 0.5 s; casting 0.5 to time_t would truncate the pause to zero
+            ACE_OS::sleep(tv);
+
+            // send the xml file
+            if (cloud_connectors_[ii]->send_gadgetron_configuration_file(cloud[ii].get<2>()) != 0)
+            {
+                ACE_Time_Value tv( (time_t)GADGETRON_TIMEOUT_PERIOD );
+                ACE_OS::sleep(tv);
+
+                GDEBUG("Unable to send XML configuration to the Gadgetron cloud host (%s):%s ... \n", host.c_str(), port.c_str());
+            }
+            else
+            {
+                // indicate this node can be used
+                node_status_[ii] = 0;
+                cloud_connectors_[ii]->set_status(true);
+            }
+        }
+
+        if ( node_status_[ii] == 0 )
+        {
+            GDEBUG("--> Node (%s):%s is ready ... \n", host.c_str(), port.c_str());
+        }
+        else
+        {
+            GDEBUG("--> Node (%s):%s is NOT ready ... \n", host.c_str(), port.c_str());
+        }
+    }
+
+    bool hasGoodNode = false;
+    for( ii=0; ii<number_of_nodes_; ii++ )
+    {
+        if ( node_status_[ii] == 0 )
+        {
+            hasGoodNode = true;
+            break;
+        }
+    }
+
+    if ( !hasGoodNode )
+    {
+      GERROR("Unable to find even one good node ... \n");
+      return -1;
+    }
+
+    return 0;
+}
+
+template <typename JobType> 
+int GadgetCloudController<JobType>::
+runJobsOnCloud(std::vector<JobType*>& job_list, std::vector<JobType*>& completed_job_list, const std::vector<int>& node_ids)
+{
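+    // Outline (descriptive comment): validate the job/completed/node-id lists, remap jobs whose
+    // target node is unavailable (falling back to the local node, id -1, when no alternative is
+    // ready), append everything to the controller's bookkeeping lists, then put one package
+    // (message id + job id + job payload) on each target node's connector queue.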
+  GDEBUG("GadgetCloudController : into runJobsOnCloud(...) ... \n");
+  if ( job_list.empty() )
+    {
+      GERROR("GadgetCloudController : job list is empty ... \n");
+      return -1;
+    }
+
+    if ( completed_job_list.empty() )
+    {
+      GERROR("GadgetCloudController : completed job list is empty ... \n");
+      return -1;
+    }
+
+    if ( job_list.size() != completed_job_list.size() )
+    {
+      GERROR("GadgetCloudController : job list size does not match ... \n");
+      return -1;
+    }
+
+    if ( job_list.size() != node_ids.size() )
+    {
+      GERROR("GadgetCloudController : job list size does not match the node id size ... \n");
+      return -1;
+    }
+
+    std::vector<int> node_ids_used(node_ids);
+
+    size_t numOfJobs = job_list.size();
+    std::vector<int> job_status(numOfJobs, -1);
+
+    size_t ii;
+    for( ii=0; ii<numOfJobs; ii++ )
+    {
+        int nodeID = node_ids_used[ii];
+        if ( nodeID == -1 )
+        {
+            job_status[ii] = 0;
+            continue;
+        }
+
+        if ( nodeID >= (int)number_of_nodes_ )
+        {
+            nodeID %= (int)number_of_nodes_;
+        }
+
+        /*while ( node_status_[nodeID] < 0 )
+        {
+            nodeID--;
+            if ( nodeID < 0 ) nodeID = number_of_nodes_-1;
+        }
+
+        if ( nodeID != node_ids_used[ii] ) node_ids_used[ii] = nodeID;*/
+
+        int status = -1;
+        this->get_node_status(nodeID, status);
+        if ( status < 0 )
+        {
+            // try again
+            if ( number_of_nodes_ > 1 )
+            {
+                nodeID += number_of_nodes_/2;
+                if ( nodeID >= (int)number_of_nodes_ )
+                {
+                    nodeID %= (int)number_of_nodes_;
+                }
+
+                this->get_node_status(nodeID, status);
+            }
+
+            if ( status < 0 )
+            {
+                node_ids_used[ii] = -1; // local node to perform this job
+                job_status[ii] = 0;
+            }
+            else
+            {
+                node_ids_used[ii] = nodeID;
+            }
+        }
+
+        GDEBUG("--> node for job %d is %d ... \n", ii, node_ids_used[ii]);
+    }
+
+    // append incoming jobs into the list
+    size_t startJobID = job_list_.size();
+
+    if ( this->appendJobList(job_list, completed_job_list, node_ids_used, job_status) == -1 )
+    {
+      GERROR("Unable to append job list ... \n");
+      return -1;
+    }
+
+    for( ii=0; ii<numOfJobs; ii++ )
+    {
+        int nodeID = node_ids_used[ii];
+        if ( nodeID == -1 )
+        {
+            GDEBUG("--> node for job %d is NOT ready ... \n", ii+startJobID);
+            continue;
+        }
+
+        // send job to a node
+        GadgetContainerMessage<GadgetMessageIdentifier>* m1 =
+                new GadgetContainerMessage<GadgetMessageIdentifier>();
+
+        m1->getObjectPtr()->id = (ACE_INT16)cloud_msg_id_writer_;
+
+        GadgetContainerMessage<int>* m2 =
+                new GadgetContainerMessage<int>();
+
+        *(m2->getObjectPtr()) = (int)(ii+startJobID);
+
+        GadgetContainerMessage<JobType>* m3 =
+                new GadgetContainerMessage<JobType>();
+
+        *(m3->getObjectPtr()) = *(job_list[ii]);
+        m1->cont(m2);
+        m2->cont(m3);
+
+        if ( node_status_[nodeID] == 0 )
+        {
+            if (cloud_connectors_[nodeID]->putq(m1) == -1)
+            {
+	      GERROR("Unable to send job package %d on queue for node %d \n", ii+startJobID, nodeID);
+	      m1->release();
+	      return -1;
+            }
+            else
+            {
+	      GDEBUG("Send job %d to node %d ... \n", ii+startJobID, nodeID);
+	      number_of_jobs_sent_out_++;
+            }
+        }
+        else
+        {
+            m1->release();
+        }
+    }
+
+    GDEBUG("GadgetCloudController - all jobs sent ... \n");
+
+    return 0;
+}
+
+template <typename JobType> 
+int GadgetCloudController<JobType>::
+runJobsOnCloud(std::vector<JobType>& job_list, std::vector<JobType>& completed_job_list, const std::vector<int>& node_ids)
+{
+    if ( job_list.size() != completed_job_list.size() )
+    {
+      GERROR("GadgetCloudController : job list size does not match ... \n");
+      return -1;
+    }
+
+    if ( job_list.size() != node_ids.size() )
+    {
+      GERROR("GadgetCloudController : job list size does not match the node id size ... \n");
+      return -1;
+    }
+
+    std::vector<JobType*> jobPtr(job_list.size(), NULL);
+    std::vector<JobType*> completedJobPtr(completed_job_list.size(), NULL);
+
+    unsigned int N = job_list.size();
+
+    unsigned int ii;
+    for ( ii=0; ii<N; ii++ )
+    {
+        jobPtr[ii] = &job_list[ii];
+        completedJobPtr[ii] = &completed_job_list[ii];
+    }
+
+    return runJobsOnCloud(jobPtr, completedJobPtr, node_ids);
+}
+
+template <typename JobType> 
+int GadgetCloudController<JobType>::
+closeCloudNode()
+{
+    GDEBUG("GadgetCloudController : into closeCloudNode(...) ... \n");
+
+    unsigned int ii;
+
+    std::vector<bool> closeMsgSent(number_of_nodes_, false);
+    for( ii=0; ii<number_of_nodes_; ii++ )
+    {
+        int nodeID = ii;
+
+        if ( !closeMsgSent[nodeID] )
+        {
+            closeMsgSent[nodeID] = true;
+
+            // send the close message for this node
+            GadgetContainerMessage<GadgetMessageIdentifier>* m = new GadgetContainerMessage<GadgetMessageIdentifier>();
+            m->getObjectPtr()->id = GADGET_MESSAGE_CLOSE;
+
+            if (cloud_connectors_[nodeID]->putq(m) == -1)
+            {
+	      GERROR("Unable to send CLOSE package on queue for node %d \n", nodeID);
+	      m->release();
+	      return -1;
+            }
+        }
+    }
+
+    GDEBUG("GadgetCloudController - close message sent to all nodes ... \n");
+
+    return 0;
+}
+
+template <typename JobType> 
+int GadgetCloudController<JobType>::waitForJobToComplete()
+{
+    // block the caller thread
+    GDEBUG("GadgetCloudController waitForJobToComplete ... \n");
+
+    ACE_Message_Block *mb = 0;
+    ACE_Time_Value nowait (ACE_OS::gettimeofday ());
+
+    // collect an incoming package if we have one
+    while ( number_of_jobs_sent_out_>0 && (this->getq (mb) != -1) )
+    {
+        GadgetContainerMessage<int>* m_jobID =
+            AsContainerMessage<int>(mb);
+
+        if ( !m_jobID )
+        {
+	  GDEBUG("Invalid message id in the GadgetCloudController queue\n");
+	  break;
+        }
+
+        int jobID = *(m_jobID->getObjectPtr());
+
+        if ( jobID != -1 )
+        {
+            GadgetContainerMessage<JobType>* job =
+                AsContainerMessage<JobType>(mb->cont());
+
+            if ( !job )
+            {
+	      GDEBUG("Invalid message obj in the GadgetCloudController queue\n");
+	      break;
+            }
+
+            *(completed_job_list_[jobID]) = *(job->getObjectPtr());
+            job_status_[jobID] = 0;
+
+	    GDEBUG("--> receive completed job : %d ... \n", jobID);
+
+            if ( job_handler_ != NULL )
+            {
+                if ( !job_handler_->processJob( jobID, *(completed_job_list_[jobID]) ) )
+                {
+		  GDEBUG("job_handler_->processJob after receiving failed\n");
+                }
+            }
+        }
+        else
+        {
+	  GDEBUG("--> receive jobID == -1 ... \n");
+        }
+
+        mb->release();
+
+        // if all jobs are received, notify the caller thread
+        bool allJobProcessed = true;
+        {
+            ACE_GUARD_RETURN(ACE_Thread_Mutex, guard, cloud_controller_mutex_, -1);
+            for ( unsigned int ii=0; ii<job_status_.size(); ii++ )
+            {
+                if ( job_status_[ii] != 0 )
+                {
+                    allJobProcessed = false;
+                    break;
+                }
+            }
+        }
+
+        if ( allJobProcessed )
+        {
+	  GDEBUG("All jobs are completed and returned on GadgetCloudController queue\n");
+	  break;
+        }
+    }
+
+    // need to wait for all reader tasks to complete
+    for( unsigned int ii=0; ii<number_of_nodes_; ii++ )
+    {
+        if ( cloud_connectors_[ii]->status() )
+        {
+            cloud_connectors_[ii]->wait();
+        }
+    }
+
+    GDEBUG("GadgetCloudController waitForJobToComplete done ... \n");
+    return 0;
+}
+
+template <typename JobType> 
+int GadgetCloudController<JobType>::handle_close(ACE_HANDLE handle, ACE_Reactor_Mask close_mask)
+{
+    GDEBUG("GadgetCloudController handling close...\n");
+    return this->wait();
+}
+
+template<typename JobType> 
+int GadgetCloudController<JobType>::setJobsTobeCompleted(unsigned int nodeID, int jobID)
+{
+    ACE_GUARD_RETURN(ACE_Thread_Mutex, guard, cloud_controller_mutex_, -1);
+    try
+    {
+        if ( (nodeID>=0) && (nodeID<this->node_status_.size()) )
+        {
+            node_status_[nodeID] = -1;
+        }
+
+        size_t N = this->node_id_used_.size();
+        size_t ii;
+        for ( ii=0; ii<N; ii++ )
+        {
+            if ( this->node_id_used_[ii] == nodeID )
+            {
+                //if ( jobID>=0 && jobID<this->job_status_.size() )
+                //{
+                //    this->job_status_[jobID] = 0;
+                //}
+                //else
+                //{
+                //    if ( this->job_status_[ii]!= 0 ) this->job_status_[ii] = 0;
+                //}
+
+                // make sure all jobs on this node are marked as completed
+                if ( this->job_status_[ii]!= 0 ) this->job_status_[ii] = 0;
+            }
+        }
+    }
+    catch(...)
+    {
+      GERROR("GadgetCloudController, setJobsTobeCompleted() failed ... \n");
+      return -1;
+    }
+
+    return 0;
+}
+
+template<typename JobType> 
+int GadgetCloudController<JobType>::appendJobList(std::vector<JobType*>& job_list, 
+        std::vector<JobType*>& completed_job_list, 
+        std::vector<int>& node_id_used, std::vector<int>& job_status)
+{
+    ACE_GUARD_RETURN(ACE_Thread_Mutex, guard, cloud_controller_mutex_, -1);
+    try
+    {
+        size_t N = job_list.size();
+
+        if ( completed_job_list.size() != N )
+        {
+	  GERROR("GadgetCloudController appendJobList: job list size does not match ... \n");
+	  return -1;
+        }
+
+        if ( node_id_used.size() != N )
+        {
+	  GERROR("GadgetCloudController appendJobList: node_id_used size does not match ... \n");
+	  return -1;
+        }
+
+        if ( job_status.size() != N )
+        {
+	  GERROR("GadgetCloudController appendJobList: job_status size does not match ... \n");
+	  return -1;
+        }
+
+        size_t ii;
+        for ( ii=0; ii<N; ii++ )
+        {
+            job_list_.push_back(job_list[ii]);
+            completed_job_list_.push_back(completed_job_list[ii]);
+            node_id_used_.push_back(node_id_used[ii]);
+            job_status_.push_back(job_status[ii]);
+        }
+    }
+    catch(...)
+    {
+      GERROR("GadgetCloudController, appendJobList() failed ... \n");
+      return -1;
+    }
+
+    return 0;
+}
+
+}
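
    For orientation, a minimal usage sketch of the controller interface above -- not
    part of the patch itself. The job type MyJob and the pre-configured "controller"
    instance are assumptions; only runJobsOnCloud(), closeCloudNode() and
    waitForJobToComplete() come from this header, and the real call sites live
    elsewhere in the tree.

        // Sketch only: assumes a GadgetCloudController<MyJob> named "controller" whose
        // connectors have already been created and opened (setup not shown here).
        std::vector<MyJob> jobs(4);           // work items to distribute
        std::vector<MyJob> completed(4);      // filled in as results arrive
        std::vector<int>   node_ids(4);
        for (size_t i = 0; i < node_ids.size(); i++) node_ids[i] = (int)i;  // preferred node per job

        if (controller.runJobsOnCloud(jobs, completed, node_ids) == -1)
        {
            // dispatch failed; jobs assigned node id -1 are expected to run locally
        }

        controller.closeCloudNode();          // queue GADGET_MESSAGE_CLOSE for every node
        controller.waitForJobToComplete();    // block until all results are back and the reader tasks finish
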
diff --git a/toolboxes/gadgettools/GadgetronCloudConnector.h b/toolboxes/gadgettools/GadgetronCloudConnector.h
new file mode 100644
index 0000000..daa2159
--- /dev/null
+++ b/toolboxes/gadgettools/GadgetronCloudConnector.h
@@ -0,0 +1,580 @@
+
+#pragma once
+
+#include <ace/Svc_Handler.h>
+#include <ace/Reactor.h>
+#include <ace/SOCK_Stream.h>
+#include <ace/SOCK_Connector.h>
+#include <ace/Reactor_Notification_Strategy.h>
+#include <string>
+#include "GadgetronSlotContainer.h"
+#include "GadgetMessageInterface.h"
+#include "GadgetronConnector.h"
+#include "gadgettools_export.h"
+#include "GadgetMRIHeaders.h"
+#include "log.h"
+
+#define GADGETRON_TIMEOUT_PERIOD 1.5
+
+namespace Gadgetron
+{
+
+template<typename JobType> class GadgetCloudController;
+template<typename JobType> class GadgetronCloudConnector;
+
+template<typename JobType> 
+class CloudWriterTask : public WriterTask
+{
+
+public:
+    typedef WriterTask inherited;
+
+    CloudWriterTask(ACE_SOCK_Stream* socket)
+    : inherited(socket), cloud_connector_(NULL)
+    {
+    }
+
+    virtual ~CloudWriterTask()
+    {
+    }
+
+    void set_cloud_connector(GadgetronCloudConnector<JobType>* connector)
+    {
+        cloud_connector_ = connector;
+    }
+
+    virtual int svc(void)
+    {
+        ACE_Message_Block* mb = 0;
+        while (this->getq (mb) != -1)
+        {
+            int retval = this->svcImpl(mb);
+
+            if ( retval == 2 )
+            {
+                GDEBUG("CloudWriterTask quit\n");
+                return 0;
+            }
+
+            if ( retval == -1 )
+            {
+                GDEBUG("CloudWriterTask svcImpl failed ... \n");
+                ACE_OS::sleep(ACE_Time_Value( (time_t)GADGETRON_TIMEOUT_PERIOD ));
+                return -1;
+            }
+        }
+
+        return 0;
+    }
+
+    virtual int svcImpl(ACE_Message_Block* mb)
+    {
+        ACE_Time_Value nowait (ACE_OS::gettimeofday ());
+
+        //Send a package if we have one
+        GadgetContainerMessage<GadgetMessageIdentifier>* mid =
+                AsContainerMessage<GadgetMessageIdentifier>(mb);
+
+        if (!mid)
+        {
+	  GERROR("Invalid message on output queue\n");
+	  mb->release();
+	  return -1;
+        }
+
+        //Is this a shutdown message?
+        if (mid->getObjectPtr()->id == GADGET_MESSAGE_CLOSE)
+        {
+            socket_->send_n(mid->getObjectPtr(),sizeof(GadgetMessageIdentifier));
+            GDEBUG("CloudWriterTask done\n");
+            return 2;
+        }
+
+        GadgetMessageWriter* w = writers_.find(mid->getObjectPtr()->id);
+
+        if (!w)
+        {
+	  GERROR("Unrecognized Message ID received: %d\n" ,mid->getObjectPtr()->id);
+	  mb->release();
+	  return -1;
+        }
+
+        if (w->write(socket_,mb->cont()) < 0)
+        {
+	  GDEBUG("Failed to write message to Gadgetron\n");
+
+            // notify the controller
+            GadgetContainerMessage<int>* m1 = 
+                dynamic_cast< GadgetContainerMessage<int>* >(mb->cont());
+
+            if ( m1 )
+            {
+                int jobID = *(m1->getObjectPtr());
+                cloud_connector_->setJobTobeCompletedAndNoticeController(jobID);
+            }
+            else
+            {
+                cloud_connector_->setJobTobeCompletedAndNoticeController();
+            }
+
+            mb->release ();
+            return -1;
+        }
+
+        mb->release();
+
+        GDEBUG("--> CloudWriterTask, write msg through socket done ... \n");
+
+        return 0;
+    }
+
+protected:
+    GadgetronCloudConnector<JobType>* cloud_connector_;
+};
+
+template<typename JobType> 
+class CloudReaderTask : public ACE_Task<ACE_MT_SYNCH>
+{
+
+public:
+    typedef ACE_Task<ACE_MT_SYNCH> inherited;
+
+    CloudReaderTask(ACE_SOCK_Stream* socket) : inherited(), socket_(socket), cloud_connector_(NULL)
+    {
+    }
+
+    virtual ~CloudReaderTask()
+    {
+        readers_.clear();
+    }
+
+    virtual int init(void)
+    {
+        return 0;
+    }
+
+    virtual int open(void* = 0)
+    {
+        GDEBUG("CloudReaderTask::open\n");
+        return this->activate( THR_NEW_LWP | THR_JOINABLE, 1 );
+    }
+
+    void set_cloud_connector(GadgetronCloudConnector<JobType>* connector)
+    {
+        cloud_connector_ = connector;
+    }
+
+    int register_reader(size_t slot, GadgetMessageReader* reader)
+    {
+        return readers_.insert( (unsigned short)slot,reader);
+    }
+
+    virtual int close(unsigned long flags)
+    {
+        GDEBUG("CloudReaderTask::close\n");
+        int rval = 0;
+        if (flags == 1) {
+            rval = this->wait();
+        }
+        return rval;
+    }
+
+    virtual int svc(void)
+    {
+        ssize_t recv_count = 0;
+        GadgetMessageIdentifier mid;
+
+        while (1)
+        {
+            if ((recv_count = cloud_connector_->peer().recv_n(&mid, sizeof(GadgetMessageIdentifier))) <= 0)
+            {
+	        GERROR("CloudReaderTask, failed to read message identifier\n");
+                ACE_OS::sleep(ACE_Time_Value( (time_t)GADGETRON_TIMEOUT_PERIOD ));
+                cloud_connector_->set_status(false);
+                cloud_connector_->setJobTobeCompletedAndNoticeController();
+                return -1;
+            }
+
+            //Is this a shutdown message?
+            if (mid.id == GADGET_MESSAGE_CLOSE)
+            {
+	      GDEBUG("CloudReaderTask, Close Message received\n");
+	      return 0;
+            }
+
+            GadgetMessageReader* r = readers_.find(mid.id);
+            if (r == 0)
+            {
+	      GERROR("CloudReaderTask, Unknown message id %d received\n", mid.id);
+	      cloud_connector_->set_status(false);
+	      cloud_connector_->setJobTobeCompletedAndNoticeController();
+	      return -1;
+            }
+
+            ACE_Message_Block* mb = r->read(&cloud_connector_->peer());
+
+            if (!mb)
+            {
+	      GERROR("CloudReaderTask, Failed to read message\n");
+	      ACE_OS::sleep(ACE_Time_Value( (time_t)GADGETRON_TIMEOUT_PERIOD ));
+	      cloud_connector_->set_status(false);
+	      cloud_connector_->setJobTobeCompletedAndNoticeController();
+	      return -1;
+            }
+            else
+            {
+                ACE_OS::sleep(ACE_Time_Value( (time_t)(0.5) ));
+
+                if (cloud_connector_->process(mid.id, mb) < 0)
+                {
+		  GERROR("ReaderTask, Failed to process message\n");
+		  cloud_connector_->set_status(false);
+		  cloud_connector_->setJobTobeCompletedAndNoticeController();
+		  return -1;
+                }
+            }
+        }
+
+        GDEBUG("CloudReaderTask, stop with return value 0 ... \n");
+        return 0;
+    }
+
+protected:
+
+    ACE_SOCK_Stream* socket_;
+    GadgetronSlotContainer<GadgetMessageReader> readers_;
+    GadgetronCloudConnector<JobType>* cloud_connector_;
+};
+
+template<typename JobType> 
+class GadgetronCloudConnector
+{
+public:
+
+    GadgetronCloudConnector();
+    virtual ~GadgetronCloudConnector();
+
+    int openImpl (std::string hostname, std::string port);
+    int open (std::string hostname, std::string port);
+
+    virtual int process(size_t messageid, ACE_Message_Block* mb);
+
+    void set_cloud_controller(GadgetCloudController<JobType>* controller);
+
+    // if jobID==-1, all jobs for this node are set to be completed
+    int setJobTobeCompletedAndNoticeController(int jobID=-1);
+
+    virtual int putq  (  ACE_Message_Block * mb ,  ACE_Time_Value *  timeout = 0);
+
+    virtual int register_reader(size_t slot, GadgetMessageReader* reader);
+    virtual int register_writer(size_t slot, GadgetMessageWriter* writer);
+
+    int close()
+    {
+        GDEBUG("Into GadgetronCloudConnector:close() ... \n");
+        GDEBUG("Closing socket \n");
+        peer().close();
+        GDEBUG("Socket closed \n");
+        cloud_writer_task_.flush();
+        cloud_reader_task_.close(0);
+        cloud_writer_task_.close(0);
+        return this->wait();
+    }
+
+    virtual int wait()
+    {
+        GDEBUG("Into GadgetronCloudConnector:wait() ... \n");
+
+        int retval;
+        GDEBUG("Waiting for cloud reader task:\n");
+        retval = cloud_reader_task_.wait();
+        GDEBUG("Reader task done\n");
+
+        GDEBUG("Waiting for cloud writer task\n");
+        retval = cloud_writer_task_.wait();
+        GDEBUG("Writer task done\n");
+
+        return retval;
+    }
+
+    CloudWriterTask<JobType>& writer_task()
+    {
+        return cloud_writer_task_;
+    }
+
+    CloudReaderTask<JobType>& reader_task()
+    {
+        return cloud_reader_task_;
+    }
+
+    bool status()
+     {
+        bool ret_val;
+        mtx_.acquire();
+        ret_val = status_;
+        mtx_.release();
+        return ret_val;
+    }
+
+    void set_status(bool s)
+    {
+        mtx_.acquire();
+        status_ = s;
+        mtx_.release();
+    }
+
+    int send_gadgetron_configuration_file(std::string config_xml_name);
+    int send_gadgetron_configuration_script(std::string config_xml_name);
+    int send_gadgetron_parameters(std::string xml_string);
+
+    ACE_SOCK_Stream& peer()
+    {
+        return peer_;
+    }
+
+    unsigned int nodeID_;
+
+protected:
+
+    ACE_Thread_Mutex mtx_;
+    bool status_;
+
+    std::string hostname_;
+    std::string port_;
+
+    GadgetCloudController<JobType>* cloud_controller_;
+    CloudWriterTask<JobType> cloud_writer_task_;
+    CloudReaderTask<JobType> cloud_reader_task_;
+
+    ACE_SOCK_Stream peer_;
+};
+
+template<typename JobType> 
+GadgetronCloudConnector<JobType>::GadgetronCloudConnector() : cloud_controller_(NULL), 
+                                                            nodeID_(0), 
+                                                            cloud_writer_task_(&this->peer()), 
+                                                            cloud_reader_task_(&this->peer()), 
+                                                            status_(false), 
+                                                            mtx_("CLOUDCONNECTOR_MTX")
+{
+    GDEBUG("Into GadgetronCloudConnector:GadgetronCloudConnector() ... \n");
+}
+
+template<typename JobType> 
+GadgetronCloudConnector<JobType>::~GadgetronCloudConnector()
+{
+    GDEBUG("Into GadgetronCloudConnector:~GadgetronCloudConnector() ... \n");
+    cloud_writer_task_.msg_queue()->deactivate();
+    cloud_reader_task_.msg_queue()->deactivate();
+    this->wait();
+}
+
+template<typename JobType> 
+int GadgetronCloudConnector<JobType>::openImpl(std::string hostname, std::string port)
+{
+    hostname_= hostname;
+    port_ = port;
+
+    ACE_INET_Addr server(port_.c_str(),hostname_.c_str());
+    ACE_SOCK_Connector connector;
+
+    if (connector.connect(this->peer(),server) == -1)
+    {
+      GERROR("connect error");
+      return -1;
+    }
+
+    ACE_TCHAR peer_name[MAXHOSTNAMELENGTH];
+    ACE_INET_Addr peer_addr;
+    if (peer().get_remote_addr (peer_addr) == 0 && peer_addr.addr_to_string (peer_name, MAXHOSTNAMELENGTH) == 0)
+    {
+      GDEBUG("Connection from %s\n", peer_name);
+    }
+
+    return 0;
+}
+
+template<typename JobType> 
+int GadgetronCloudConnector<JobType>::open(std::string hostname, std::string port)
+{
+    this->cloud_writer_task_.set_cloud_connector(this);
+    this->cloud_reader_task_.set_cloud_connector(this);
+
+    if ( this->openImpl(hostname, port) == 0 )
+    {
+        status_ = true;
+        this->cloud_writer_task_.open();
+        this->cloud_reader_task_.open();
+    }
+    else
+    {
+        status_ = false;
+        return -1;
+    }
+
+    return 0;
+}
+
+template<typename JobType> 
+int GadgetronCloudConnector<JobType>::process(size_t messageid, ACE_Message_Block* mb)
+{
+    // insert message into the queue of cloud controller
+    if ( cloud_controller_ == NULL )
+    {
+      GERROR("GadgetronCloudConnector, pointer of cloud controller is null ...\n");
+      mb->release();
+      return -1;
+    }
+
+    if ( cloud_controller_->putq(mb) == -1)
+    {
+      GERROR("Unable to put received message into the queue of cloud controller %d\n", messageid);
+      mb->release();
+      return -1;
+    }
+
+    return 0;
+}
+
+template<typename JobType> 
+void GadgetronCloudConnector<JobType>::set_cloud_controller(GadgetCloudController<JobType>* controller)
+{
+    cloud_controller_ = controller;
+}
+
+template<typename JobType> 
+int GadgetronCloudConnector<JobType>::putq(ACE_Message_Block* mb ,  ACE_Time_Value* timeout)
+{
+    return cloud_writer_task_.putq(mb,timeout);
+    /*int retval = cloud_writer_task_.svcImpl(mb);
+    if ( retval != 0 )
+    {
+        ACE_Time_Value tv(GADGETRON_TIMEOUT_PERIOD);
+        ACE_OS::sleep(tv);
+    }
+    return retval;*/
+}
+
+template<typename JobType> 
+int GadgetronCloudConnector<JobType>::register_reader(size_t slot, GadgetMessageReader* reader)
+{
+    return cloud_reader_task_.register_reader(slot, reader);
+}
+
+template<typename JobType> 
+int GadgetronCloudConnector<JobType>::register_writer(size_t slot, GadgetMessageWriter* writer)
+{
+    return cloud_writer_task_.register_writer(slot,writer);
+}
+
+template<typename JobType> 
+int GadgetronCloudConnector<JobType>::setJobTobeCompletedAndNoticeController(int jobID)
+{
+    ACE_GUARD_RETURN(ACE_Thread_Mutex, guard, mtx_, -1);
+
+    GDEBUG("GadgetronCloudConnector, into setJobTobeCompletedAndNoticeController(...) ... \n");
+
+    // set the job to be completed and invalidate the node
+    if ( cloud_controller_->setJobsTobeCompleted(nodeID_, jobID) < 0 )
+    {
+      GERROR("GadgetronCloudConnector, cloud_controller_->setJobsTobeCompleted(%d, %d) failed ... \n", nodeID_, jobID);
+      return -1;
+    }
+
+    // put an invalid jobID==-1 into the controller message queue to unblock the completion check
+    GadgetContainerMessage<int>* jobIDMsg = new GadgetContainerMessage<int>();
+    *(jobIDMsg->getObjectPtr()) = -1;
+
+    if (process(GADGET_MESSAGE_CLOUD_JOB, jobIDMsg) < 0)
+    {
+      GERROR("GadgetronCloudConnector, Failed to put jobIDMsg==-1 into the controller message queue\n");
+      return -1;
+    }
+
+    return 0;
+}
+
+template<typename JobType> 
+int GadgetronCloudConnector<JobType>::send_gadgetron_configuration_file(std::string config_xml_name)
+{
+    GadgetMessageIdentifier id;
+    id.id = GADGET_MESSAGE_CONFIG_FILE;
+
+    GadgetMessageConfigurationFile ini;
+    ACE_OS_String::strncpy(ini.configuration_file, config_xml_name.c_str(),1024);
+
+    if (this->peer().send_n(&id, sizeof(GadgetMessageIdentifier)) != sizeof(GadgetMessageIdentifier))
+    {
+      GERROR("Unable to send GadgetMessageIdentifier\n");
+      return -1;
+    }
+
+    if (this->peer().send_n(&ini, sizeof(GadgetMessageConfigurationFile)) != sizeof(GadgetMessageConfigurationFile))
+     {
+       GERROR("Unable to send GadgetMessageConfigurationFile\n");
+       return -1;
+    }
+
+    return 0;
+}
+
+template<typename JobType> 
+int GadgetronCloudConnector<JobType>::send_gadgetron_configuration_script(std::string config_xml)
+{
+    GadgetMessageIdentifier id;
+    id.id = GADGET_MESSAGE_CONFIG_SCRIPT;
+
+    GadgetMessageScript ini;
+    ini.script_length = config_xml.size()+1;
+
+    if (this->peer().send_n(&id, sizeof(GadgetMessageIdentifier)) != sizeof(GadgetMessageIdentifier))
+    {
+      GERROR("Unable to send GadgetMessageIdentifier\n");
+      return -1;
+    }
+
+    if (this->peer().send_n(&ini, sizeof(GadgetMessageScript)) != sizeof(GadgetMessageScript))
+    {
+      GERROR("Unable to send GadgetMessageScript\n");
+      return -1;
+    }
+
+    if (this->peer().send_n(config_xml.c_str(), ini.script_length) != ini.script_length)
+    {
+      GERROR("Unable to send parameter xml\n");
+      return -1;
+    }
+
+    return 0;
+}
+
+template<typename JobType> 
+int GadgetronCloudConnector<JobType>::send_gadgetron_parameters(std::string xml_string)
+{
+    GadgetMessageIdentifier id;
+    id.id = GADGET_MESSAGE_PARAMETER_SCRIPT;
+
+    GadgetMessageScript conf;
+    conf.script_length = xml_string.size()+1;
+    if (this->peer().send_n(&id, sizeof(GadgetMessageIdentifier)) != sizeof(GadgetMessageIdentifier))
+    {
+      GERROR("Unable to send GadgetMessageIdentifier\n");
+      return -1;
+    }
+
+    if (this->peer().send_n(&conf, sizeof(GadgetMessageScript)) != sizeof(GadgetMessageScript))
+    {
+      GERROR("Unable to send GadgetMessageScript\n");
+      return -1;
+    }
+
+    if (this->peer().send_n(xml_string.c_str(), conf.script_length) != conf.script_length)
+    {
+      GERROR("Unable to send parameter xml\n");
+      return -1;
+    }
+
+    return 0;
+}
+
+}
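
    A rough sketch of how a single cloud connector is driven, based only on the
    methods declared above; MyJob, MyJobReader, MyJobWriter, the slot id and the
    host/port values are placeholders, and the actual wiring is performed by
    GadgetCloudController elsewhere in the patch.

        // Sketch only: concrete job/reader/writer types are assumed to exist.
        GadgetronCloudConnector<MyJob> connector;
        connector.set_cloud_controller(&controller);   // results are pushed onto the controller queue
        connector.register_writer(GADGET_MESSAGE_CLOUD_JOB, new MyJobWriter());   // slot id illustrative
        connector.register_reader(GADGET_MESSAGE_CLOUD_JOB, new MyJobReader());

        if (connector.open("gt-node-01", "9002") != 0)  // hostname/port are placeholders
        {
            // connection failed; status() will report false
        }

        // ... putq() job packages (GadgetMessageIdentifier -> jobID -> JobType chain), then:
        connector.close();   // close the socket, flush the writer queue, wait for reader/writer tasks
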
diff --git a/toolboxes/gadgettools/GadgetronConnector.cpp b/toolboxes/gadgettools/GadgetronConnector.cpp
new file mode 100644
index 0000000..5302f55
--- /dev/null
+++ b/toolboxes/gadgettools/GadgetronConnector.cpp
@@ -0,0 +1,208 @@
+#include "GadgetronConnector.h"
+
+#include <ace/SOCK_Connector.h>
+#include "log.h"
+
+using namespace Gadgetron;
+
+GadgetronConnector::GadgetronConnector()
+    //: notifier_ (0, this, ACE_Event_Handler::WRITE_MASK)
+    : writer_task_(&this->peer())
+{
+}
+
+GadgetronConnector::~GadgetronConnector() {
+    readers_.clear();
+    //writers_.clear();
+}
+
+int GadgetronConnector::openImpl(std::string hostname, std::string port)
+{
+    hostname_= hostname;
+    port_ = port;
+
+    //We will add a notification strategy to the message queue to make sure that handle_output gets triggered when packages are on the queue
+    //this->notifier_.reactor (this->reactor ());
+    //this->msg_queue ()->notification_strategy (&this->notifier_);
+
+    ACE_INET_Addr server(port_.c_str(),hostname_.c_str());
+    ACE_SOCK_Connector connector;
+
+    if (connector.connect(this->peer(),server) == -1) {
+      GERROR("Failed to connect");
+      return -1;
+    }
+
+    ACE_TCHAR peer_name[MAXHOSTNAMELENGTH];
+    ACE_INET_Addr peer_addr;
+    if (peer().get_remote_addr (peer_addr) == 0 && peer_addr.addr_to_string (peer_name, MAXHOSTNAMELENGTH) == 0) {
+      GDEBUG("Connection from %s\n", peer_name);
+    }
+
+    return 0;
+}
+
+int GadgetronConnector::open(std::string hostname, std::string port)
+{
+    //Make sure we have a reactor, otherwise assign one from the singleton instance
+    if (!this->reactor()) {
+      GDEBUG("Setting reactor");
+      this->reactor(ACE_Reactor::instance());
+    }
+
+    this->openImpl(hostname, port);
+
+    this->writer_task_.open();
+
+    if (this->reactor ()->register_handler(this, ACE_Event_Handler::READ_MASK) != 0) {
+      GERROR("Failed to register read handler\n");
+      return -2;
+    }
+
+    return this->activate( THR_NEW_LWP | THR_JOINABLE, 1); //Run single threaded. TODO: Add multithreaded support
+}
+
+int GadgetronConnector::handle_input(ACE_HANDLE fd)
+{
+    ssize_t recv_count = 0;
+    GadgetMessageIdentifier mid;
+
+    if ((recv_count = peer().recv_n(&mid, sizeof(GadgetMessageIdentifier))) <= 0) {
+      GWARN("GadgetronConnector, failed to read message identifier\n");
+      return -1;
+    }
+
+    //Is this a shutdown message?
+    if (mid.id == GADGET_MESSAGE_CLOSE) {
+      GDEBUG("GadgetronConnector, Close Message received\n");
+      return close();
+    }
+
+    GadgetMessageReader* r = readers_.find(mid.id);
+    if (r == 0) {
+      GERROR("GadgetronConnector, Unknown message id %d received\n", mid.id);
+      return -1;
+    }
+
+    ACE_Message_Block* mb = r->read(&peer());
+
+    if (!mb) {
+      GERROR("GadgetronConnector, Failed to read message\n");
+      return -1;
+    }  else {
+      if (process(mid.id, mb) < 0) {
+	GERROR("GadgetronConnector, Failed to process message\n");
+	return -1;
+      }
+    }
+
+    return 0;
+}
+
+int GadgetronConnector::handle_close(ACE_HANDLE handle, ACE_Reactor_Mask close_mask)
+{
+  GDEBUG("Handling close...\n");
+  this->reactor()->end_reactor_event_loop();
+  return 0;//this->wait();
+}
+
+int GadgetronConnector::svc(void)
+{
+    //ACE_thread_t old_owner;
+
+    //Take ownership of Reactor
+    this->reactor()->owner(ACE_Thread::self ());//, &old_owner);
+
+    this->reactor()->reset_event_loop();
+
+    ACE_Time_Value initialDelay (3);
+    ACE_Time_Value interval (0,100);
+
+    //Handle the events
+    this->reactor()->run_reactor_event_loop();
+
+    //this->reactor()->owner(&old_owner);
+    
+    GDEBUG("GadgetronConnector svc done...\n");
+
+    return 0;
+}
+
+int GadgetronConnector::register_reader(size_t slot, GadgetMessageReader *reader)
+{
+    return readers_.insert( (unsigned short)slot,reader);
+}
+
+int GadgetronConnector::send_gadgetron_configuration_file(std::string config_xml_name)
+{
+    GadgetMessageIdentifier id;
+    id.id = GADGET_MESSAGE_CONFIG_FILE;
+
+    GadgetMessageConfigurationFile ini;
+    ACE_OS_String::strncpy(ini.configuration_file, config_xml_name.c_str(),1024);
+
+
+    if (this->peer().send_n(&id, sizeof(GadgetMessageIdentifier)) != sizeof(GadgetMessageIdentifier)) {
+      GERROR("Unable to send GadgetMessageIdentifier\n");
+      return -1;
+    }
+
+    if (this->peer().send_n(&ini, sizeof(GadgetMessageConfigurationFile)) != sizeof(GadgetMessageConfigurationFile)) {
+      GERROR("Unable to send GadgetMessageConfigurationFile\n");
+      return -1;
+    }
+
+    return 0;
+}
+
+int GadgetronConnector::send_gadgetron_configuration_script(std::string config_xml)
+{
+    GadgetMessageIdentifier id;
+    id.id = GADGET_MESSAGE_CONFIG_SCRIPT;
+
+    GadgetMessageScript ini;
+    ini.script_length = (ACE_UINT32)config_xml.size()+1;
+
+    if (this->peer().send_n(&id, sizeof(GadgetMessageIdentifier)) != sizeof(GadgetMessageIdentifier)) {
+      GERROR("Unable to send GadgetMessageIdentifier\n");
+      return -1;
+    }
+
+    if (this->peer().send_n(&ini, sizeof(GadgetMessageScript)) != sizeof(GadgetMessageScript)) {
+      GERROR("Unable to send GadgetMessageScript\n");
+      return -1;
+    }
+
+    if (this->peer().send_n(config_xml.c_str(), ini.script_length) != ini.script_length) {
+      GERROR("Unable to send parameter xml\n");
+      return -1;
+    }
+
+    return 0;
+}
+
+int GadgetronConnector::send_gadgetron_parameters(std::string xml_string)
+{
+    GadgetMessageIdentifier id;
+    id.id = GADGET_MESSAGE_PARAMETER_SCRIPT;
+
+    GadgetMessageScript conf;
+    conf.script_length = (ACE_UINT32)xml_string.size()+1;
+    if (this->peer().send_n(&id, sizeof(GadgetMessageIdentifier)) != sizeof(GadgetMessageIdentifier)) {
+      GERROR("Unable to send GadgetMessageIdentifier\n");
+      return -1;
+    }
+
+    if (this->peer().send_n(&conf, sizeof(GadgetMessageScript)) != sizeof(GadgetMessageScript)) {
+      GERROR("Unable to send GadgetMessageScript\n");
+      return -1;
+    }
+
+    if (this->peer().send_n(xml_string.c_str(), conf.script_length) != conf.script_length) {
+      GERROR("Unable to send parameter xml\n");
+      return -1;
+    }
+
+    return 0;
+}
+
diff --git a/toolboxes/gadgettools/GadgetronConnector.h b/toolboxes/gadgettools/GadgetronConnector.h
new file mode 100644
index 0000000..42d5d1f
--- /dev/null
+++ b/toolboxes/gadgettools/GadgetronConnector.h
@@ -0,0 +1,157 @@
+#ifndef GADGETRONCONNECTOR_H_
+#define GADGETRONCONNECTOR_H_
+
+#include "GadgetronSlotContainer.h"
+#include "GadgetMessageInterface.h"
+#include "gadgettools_export.h"
+
+#include <ace/Svc_Handler.h>
+#include <ace/Reactor.h>
+#include <ace/SOCK_Stream.h>
+#include <ace/Reactor_Notification_Strategy.h>
+#include <string>
+
+#define MAXHOSTNAMELENGTH 1024
+
+namespace Gadgetron{
+
+class WriterTask : public ACE_Task<ACE_MT_SYNCH>
+{
+
+public:
+	typedef ACE_Task<ACE_MT_SYNCH> inherited;
+
+	WriterTask(ACE_SOCK_Stream* socket)
+	: inherited()
+	, socket_(socket)
+	{
+	}
+
+	virtual ~WriterTask()
+	{
+	  writers_.clear();
+	}
+
+	virtual int init(void)
+	{
+	  return 0;
+	}
+
+	virtual int open(void* = 0)
+	{
+	  return this->activate( THR_NEW_LWP | THR_JOINABLE, 1 );
+	}
+
+	int register_writer(size_t slot, GadgetMessageWriter* writer) {
+		return writers_.insert( (unsigned int)slot,writer);
+	}
+
+	virtual int close(unsigned long flags)
+	{
+		int rval = 0;
+		if (flags == 1) {
+			ACE_Message_Block *hangup = new ACE_Message_Block();
+			hangup->msg_type( ACE_Message_Block::MB_HANGUP );
+			if (this->putq(hangup) == -1) {
+				hangup->release();
+				GERROR("WriterTask::close, putq\n");
+				return -1;
+			}
+			rval = this->wait();
+		}
+		return rval;
+	}
+
+	virtual int svc(void)
+	{
+		ACE_Message_Block *mb = 0;
+		ACE_Time_Value nowait (ACE_OS::gettimeofday ());
+
+
+		//Send a package if we have one
+		while (this->getq (mb) != -1) {
+			GadgetContainerMessage<GadgetMessageIdentifier>* mid =
+					AsContainerMessage<GadgetMessageIdentifier>(mb);
+
+
+			if (!mid) {
+			  GERROR("Invalid message on output queue\n");
+			  mb->release();
+			  return -1;
+			}
+
+			//Is this a shutdown message?
+			if (mid->getObjectPtr()->id == GADGET_MESSAGE_CLOSE) {
+				socket_->send_n(mid->getObjectPtr(),sizeof(GadgetMessageIdentifier));
+				return 0;
+			}
+
+			GadgetMessageWriter* w = writers_.find(mid->getObjectPtr()->id);
+
+			if (!w) {
+			  GERROR("Unrecognized Message ID received: %d\n",mid->getObjectPtr()->id);
+			  mb->release();
+			  return -1;
+			}
+
+			if (w->write(socket_,mb->cont()) < 0) {
+			  GERROR("Failed to write message to Gadgetron\n");
+			  mb->release ();
+			  return -1;
+			}
+
+			mb->release();
+		}
+
+		return 0;
+
+	}
+
+protected:
+	ACE_SOCK_Stream* socket_;
+	GadgetronSlotContainer<GadgetMessageWriter> writers_;
+};
+
+class EXPORTGADGETTOOLS GadgetronConnector: public ACE_Svc_Handler<ACE_SOCK_STREAM, ACE_MT_SYNCH> {
+
+public:
+	GadgetronConnector();
+	virtual ~GadgetronConnector();
+
+    int openImpl (std::string hostname, std::string port);
+	int open (std::string hostname, std::string port);
+	virtual int handle_input (ACE_HANDLE fd = ACE_INVALID_HANDLE);
+	//virtual int handle_output (ACE_HANDLE fd = ACE_INVALID_HANDLE);
+	virtual int handle_close (ACE_HANDLE handle, ACE_Reactor_Mask close_mask);
+	virtual int svc(void);
+
+	virtual int putq  (  ACE_Message_Block * mb ,  ACE_Time_Value *  timeout = 0) {
+		return writer_task_.putq(mb,timeout);
+	}
+
+	virtual int process(size_t messageid, ACE_Message_Block* mb) {
+		mb->release();
+		return 0;
+	}
+
+	virtual int register_reader(size_t slot, GadgetMessageReader* reader);
+	virtual int register_writer(size_t slot, GadgetMessageWriter* writer) {
+		return writer_task_.register_writer(slot,writer);
+	}
+
+	int send_gadgetron_configuration_file(std::string config_xml_name);
+	int send_gadgetron_configuration_script(std::string config_xml_name);
+	int send_gadgetron_parameters(std::string xml_string);
+
+protected:
+	//ACE_Reactor_Notification_Strategy notifier_;
+	std::string hostname_;
+	std::string port_;
+
+	GadgetronSlotContainer<GadgetMessageReader> readers_;
+	WriterTask writer_task_;
+	//GadgetronSlotContainer<GadgetMessageWriter> writers_;
+};
+
+}
+#endif /* GADGETRONCONNECTOR_H_ */
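
    For reference, the client-side sequence this interface suggests, reduced to a
    hedged sketch; the slot id, configuration file name and xml_header string are
    illustrative and not taken from the patch.

        // Sketch only.
        GadgetronConnector con;
        con.register_reader(GADGET_MESSAGE_IMAGE_REAL_FLOAT, new GadgetImageMessageReader<float>());

        if (con.open("localhost", "9002") != 0)
        {
            // connect or reactor handler registration failed
        }

        con.send_gadgetron_configuration_file("default.xml");  // remote config name (placeholder)
        con.send_gadgetron_parameters(xml_header);              // ISMRMRD header string, assumed to exist
        // ... putq() data packages and finally a GADGET_MESSAGE_CLOSE identifier, then:
        con.wait();                                              // join the reactor thread started by open()
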
diff --git a/toolboxes/gadgettools/GadgetronOSUtil.cpp b/toolboxes/gadgettools/GadgetronOSUtil.cpp
new file mode 100644
index 0000000..8d46068
--- /dev/null
+++ b/toolboxes/gadgettools/GadgetronOSUtil.cpp
@@ -0,0 +1,50 @@
+
+#include "GadgetronOSUtil.h"
+
+#ifdef _WIN32
+    #include <windows.h>
+#else
+    #include <sys/types.h>
+    #include <sys/stat.h>
+#endif // _WIN32
+
+#include <boost/filesystem.hpp>
+using namespace boost::filesystem;
+
+namespace Gadgetron{
+
+    bool create_folder_with_all_permissions(const std::string& workingdirectory)
+    {
+        if ( !boost::filesystem::exists(workingdirectory) )
+        {
+            boost::filesystem::path workingPath(workingdirectory);
+            if ( !boost::filesystem::create_directory(workingPath) )
+            {
+                GERROR_STREAM("Error creating the working directory " << workingdirectory);
+                return false;
+            }
+
+            // set the permission for the folder
+            #ifdef _WIN32
+                try
+                {
+                    boost::filesystem::permissions(workingPath, all_all);
+                }
+                catch(...)
+                {
+                    GERROR_STREAM("Error changing the permission of the working directory " << workingdirectory);
+                }
+            #else
+                // in case an older version of boost is used on a non-Windows system,
+                // fall back to the chmod system call
+                int res = chmod(workingPath.c_str(), S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IWGRP|S_IXGRP|S_IROTH|S_IWOTH|S_IXOTH);
+                if ( res != 0 )
+                {
+                    GERROR_STREAM("Error changing the permission of the working directory " << workingdirectory);
+                }
+            #endif // _WIN32
+        }
+
+        return true;
+    }
+}
diff --git a/toolboxes/gadgettools/GadgetronOSUtil.h b/toolboxes/gadgettools/GadgetronOSUtil.h
new file mode 100644
index 0000000..a4b890c
--- /dev/null
+++ b/toolboxes/gadgettools/GadgetronOSUtil.h
@@ -0,0 +1,21 @@
+#ifndef GADGETRONOSUTIL_H_
+#define GADGETRONOSUTIL_H_
+
+#include "gadgettools_export.h"
+#include "GadgetronCommon.h"
+#include <string>
+#include <iostream>
+
+#ifdef _WIN32
+    #include <windows.h>
+#else
+    
+#endif // _WIN32
+
+namespace Gadgetron{
+
+    EXPORTGADGETTOOLS bool create_folder_with_all_permissions(const std::string& workingdirectory);
+
+}
+
+#endif /* GADGETRONOSUTIL_H_ */
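
    A one-line usage sketch; the path is a placeholder.

        // Creates the directory if missing and opens its permissions to all users;
        // "/tmp/gadgetron_work" is only an illustrative path.
        if (!Gadgetron::create_folder_with_all_permissions("/tmp/gadgetron_work"))
        {
            // creation or permission change failed; details are logged via GERROR_STREAM
        }
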
diff --git a/toolboxes/gadgettools/GadgetronSlotContainer.h b/toolboxes/gadgettools/GadgetronSlotContainer.h
new file mode 100644
index 0000000..83a3d8c
--- /dev/null
+++ b/toolboxes/gadgettools/GadgetronSlotContainer.h
@@ -0,0 +1,53 @@
+#ifndef GADGETRONSLOTCONTAINER_H_
+#define GADGETRONSLOTCONTAINER_H_
+
+#include <algorithm>
+#include <vector>
+
+template <typename T> class GadgetronSlotContainer {
+
+public:
+	GadgetronSlotContainer() {}
+
+	virtual ~GadgetronSlotContainer()
+	{
+		clear();
+	}
+
+	T* find(unsigned int slot) {
+	    T* ret = 0;
+	    for (unsigned int i = 0; i < slots_.size(); i++) {
+	    	if (slots_[i] == slot) {
+	    		ret = items_[i];
+	    		break;
+	    	}
+	    }
+	    return ret;
+	  }
+
+	  int insert ( unsigned short slot, T* item) {
+		  if (this->find(slot)) {
+			  return -1;
+		  } else {
+			  slots_.push_back(slot);
+			  items_.push_back(item);
+		  }
+		  return 0;
+	  }
+
+	  int clear()
+	  {
+		  for (unsigned int i = 0; i < items_.size(); i++) {
+			  if (items_[i]) delete items_[i];
+		  }
+		  slots_.clear();
+		  items_.clear();
+		  return 0;
+	  }
+
+protected:
+	std::vector<unsigned int> slots_;
+	std::vector<T*> items_;
+};
+
+#endif /* GADGETRONSLOTCONTAINER_H_ */
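
    This container is how the connectors above map message ids to readers and
    writers; a minimal sketch (the slot constant and reader type are reused from
    elsewhere in this patch, purely for illustration):

        GadgetronSlotContainer<GadgetMessageReader> readers;
        readers.insert(GADGET_MESSAGE_IMAGE_REAL_FLOAT, new GadgetImageMessageReader<float>());  // container takes ownership

        GadgetMessageReader* r = readers.find(GADGET_MESSAGE_IMAGE_REAL_FLOAT);  // returns 0 for an unknown slot
        // clear() (also called from the destructor) deletes every registered item.
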
diff --git a/toolboxes/gadgettools/demo.xml b/toolboxes/gadgettools/demo.xml
new file mode 100644
index 0000000..0043c9c
--- /dev/null
+++ b/toolboxes/gadgettools/demo.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" ?>
+<gadgetron>
+  <blah>
+
+  </blah>
+
+  <encoding>
+    <kspace>
+      <matrix_size>
+	<comment>Acquired matrix size</comment>
+	<value>256.9078</value>
+	<value>128</value>
+      </matrix_size>
+    </kspace>
+
+    <image>
+      <matrix_size>
+	<comment>This is the reconstructed matrix size</comment>
+	<value>128</value>
+	<value>128</value>
+      </matrix_size>
+    </image>
+
+  </encoding>
+
+  <hardware>
+    <gradients>
+      <strength>
+	<comment>This is the maximum gradient strength</comment>
+	<type>double</type>
+	<units>mT/m</units>
+	<value>40.0</value>
+      </strength>
+      
+      <slew_rate>
+	<units>T/m/s</units>
+	<value>100.0</value>
+      </slew_rate>
+    </gradients>
+  </hardware>
+
+
+</gadgetron>
\ No newline at end of file
diff --git a/toolboxes/gadgettools/gadgettools_export.h b/toolboxes/gadgettools/gadgettools_export.h
new file mode 100644
index 0000000..c7257fc
--- /dev/null
+++ b/toolboxes/gadgettools/gadgettools_export.h
@@ -0,0 +1,20 @@
+/** \file gadgettools_export.h
+    \brief Required definitions for Windows, importing/exporting dll symbols 
+*/
+
+#ifndef GADGETTOOLS_EXPORT_H_
+#define GADGETTOOLS_EXPORT_H_
+
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_GADGETTOOLS__) || defined (gadgetron_toolbox_gadgettools_EXPORTS)
+#define EXPORTGADGETTOOLS __declspec(dllexport)
+#else
+#define EXPORTGADGETTOOLS __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETTOOLS
+#endif
+
+
+#endif /* GADGETTOOLS_EXPORT_H_ */
diff --git a/toolboxes/gadgettools/ismrmrd/CMakeLists.txt b/toolboxes/gadgettools/ismrmrd/CMakeLists.txt
new file mode 100644
index 0000000..33ba686
--- /dev/null
+++ b/toolboxes/gadgettools/ismrmrd/CMakeLists.txt
@@ -0,0 +1,4 @@
+install (FILES 
+            GadgetImageMessageReader.h 
+            GadgetImageMessageWriter.h
+            DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
diff --git a/toolboxes/gadgettools/ismrmrd/GadgetImageMessageReader.h b/toolboxes/gadgettools/ismrmrd/GadgetImageMessageReader.h
new file mode 100644
index 0000000..c9d710d
--- /dev/null
+++ b/toolboxes/gadgettools/ismrmrd/GadgetImageMessageReader.h
@@ -0,0 +1,196 @@
+#ifndef GADGETSOCKETRECEIVER_H
+#define GADGETSOCKETRECEIVER_H
+
+#include "ace/SOCK_Stream.h"
+#include "ace/Task.h"
+
+#include <complex>
+#include <iostream>
+
+#include "GadgetMRIHeaders.h"
+#include "ismrmrd/ismrmrd.h"
+#include "hoNDArray.h"
+#include "GadgetMessageInterface.h"
+#include "ismrmrd/meta.h"
+
+namespace Gadgetron
+{
+
+/**
+Default implementation of GadgetMessageReader for Image messages
+*/
+
+template <typename T> class GadgetImageMessageReader : public GadgetMessageReader
+{
+
+public:
+    virtual ACE_Message_Block* read(ACE_SOCK_Stream* stream) 
+    {
+        GadgetContainerMessage<ISMRMRD::ImageHeader>* imgh = 
+            new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+
+        ssize_t recv_count = 0;
+        if ((recv_count = stream->recv_n(imgh->getObjectPtr(), sizeof(ISMRMRD::ImageHeader))) <= 0) {
+	  GERROR("GadgetImageMessageReader, failed to read IMAGE Header\n");
+	  imgh->release();
+	  return 0;
+        }
+
+        std::vector<size_t> dims(3);
+        dims[0] = imgh->getObjectPtr()->matrix_size[0];
+        dims[1] = imgh->getObjectPtr()->matrix_size[1];
+        dims[2] = imgh->getObjectPtr()->matrix_size[2];
+
+        if (imgh->getObjectPtr()->channels > 1) {
+            dims.push_back(imgh->getObjectPtr()->channels);
+        } 
+
+        GadgetContainerMessage< hoNDArray< T > >* data =
+            new GadgetContainerMessage< hoNDArray< T > >();
+
+        try{ data->getObjectPtr()->create(&dims);}
+        catch (std::runtime_error &err){
+            GEXCEPTION(err,"GadgetImageMessageReader, failed to allocate memory\n");
+            imgh->release();
+            return 0;
+        }
+
+        imgh->cont(data);
+
+        if ((recv_count = stream->recv_n(data->getObjectPtr()->get_data_ptr(), sizeof(T)*data->getObjectPtr()->get_number_of_elements())) <= 0) {
+	  GERROR("GadgetImageMessageReader, failed to read data from socket\n");
+	  imgh->release();
+	  return 0;
+        }
+
+        return imgh;
+    }
+};
+
+// for images with attributes
+template <typename T> class GadgetImageAttribMessageReader : public GadgetMessageReader
+{
+public:
+
+    typedef unsigned long long size_t_type;
+
+    virtual ACE_Message_Block* read(ACE_SOCK_Stream* stream) 
+    {
+        GadgetContainerMessage<ISMRMRD::ImageHeader>* imgh = 
+            new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+
+        GadgetContainerMessage<ISMRMRD::MetaContainer>* imgAttrib = 
+            new GadgetContainerMessage<ISMRMRD::MetaContainer>();
+
+        // read in ISMRMRD image header
+        ssize_t recv_count = 0;
+        if ((recv_count = stream->recv_n( imgh->getObjectPtr(), sizeof(ISMRMRD::ImageHeader))) <= 0)
+        {
+	  GERROR("GadgetImageAttribMessageReader, failed to read IMAGE Header\n");
+	  imgh->release();
+	  imgAttrib->release();
+	  return 0;
+        }
+
+        // read in gadgetron image meta attributes
+        size_t_type len(0);
+        if ( ( recv_count = stream->recv_n( &len, sizeof(size_t_type)) ) <= 0 )
+        {
+	  GERROR("GadgetImageAttribMessageReader, failed to read IMAGE Meta Attributes length\n");
+	  imgh->release();
+	  imgAttrib->release();
+	  return 0;
+        }
+
+        char* buf = NULL;
+        try
+        {
+            buf = new char[len];
+            if ( buf == NULL )
+            {
+	      GERROR("GadgetImageAttribMessageReader, failed to allocate IMAGE Meta Attributes buffer\n");
+	      imgh->release();
+	      imgAttrib->release();
+	      return 0;
+            }
+
+            memset(buf, '\0', len);
+            memcpy(buf, &len, sizeof(size_t_type));
+        }
+        catch (std::runtime_error &err)
+        {
+            GEXCEPTION(err,"GadgetImageAttribMessageReader, failed to allocate IMAGE Meta Attributes buffer\n");
+            imgh->release();
+            imgAttrib->release();
+            return 0;
+        }
+
+        if ( ( recv_count = stream->recv_n( buf, len) ) <= 0 )
+        {
+	  GERROR("GadgetImageAttribMessageReader, failed to read IMAGE Meta Attributes\n");
+	  imgh->release();
+	  imgAttrib->release();
+	  delete [] buf;
+	  return 0;
+        }
+
+        try
+        {
+            ISMRMRD::deserialize(buf, *imgAttrib->getObjectPtr());
+        }
+        catch(...)
+        {
+	  GERROR("GadgetImageAttribMessageReader, failed to deserialize IMAGE Meta Attributes\n");
+	  imgh->release();
+	  imgAttrib->release();
+	  delete [] buf;
+	  return 0;
+        }
+
+        delete [] buf;
+
+        // read in image content
+        std::vector<size_t> dims(3);
+        dims[0] = imgh->getObjectPtr()->matrix_size[0];
+        dims[1] = imgh->getObjectPtr()->matrix_size[1];
+        dims[2] = imgh->getObjectPtr()->matrix_size[2];
+
+        if (imgh->getObjectPtr()->channels > 1)
+        {
+            dims.push_back(imgh->getObjectPtr()->channels);
+        }
+
+        GadgetContainerMessage< hoNDArray< T > >* data = new GadgetContainerMessage< hoNDArray< T > >();
+
+        try
+        {
+            data->getObjectPtr()->create(&dims);
+        }
+        catch (std::runtime_error &err)
+        {
+            GEXCEPTION(err,"GadgetImageAttribMessageReader, failed to allocate memory\n");
+            imgh->release();
+            imgAttrib->release();
+            data->release();
+            return 0;
+        }
+
+        imgh->cont(data);
+        data->cont(imgAttrib);
+
+        if ((recv_count = stream->recv_n(data->getObjectPtr()->get_data_ptr(), sizeof(T)*data->getObjectPtr()->get_number_of_elements())) <= 0)
+        {
+	  GERROR("GadgetImageAttribMessageReader, failed to read data from socket\n");
+	  imgh->release();
+	  imgAttrib->release();
+	  data->release();
+	  return 0;
+        }
+
+        return imgh;
+    }
+};
+
+}
+
+#endif //GADGETSOCKETRECEIVER_H
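
    The readers above hand back a chained message block: the ISMRMRD::ImageHeader,
    then the pixel array, then (for the attribute variant) the meta container. A
    sketch of how a caller might unpack the chain; the reader and socket variables
    are assumed to be set up already.

        // Sketch only: reader is a GadgetImageAttribMessageReader<float>, socket an open ACE_SOCK_Stream.
        ACE_Message_Block* mb = reader.read(&socket);
        if (mb)
        {
            GadgetContainerMessage<ISMRMRD::ImageHeader>* header = AsContainerMessage<ISMRMRD::ImageHeader>(mb);
            GadgetContainerMessage< hoNDArray<float> >* pixels   = AsContainerMessage< hoNDArray<float> >(mb->cont());
            GadgetContainerMessage<ISMRMRD::MetaContainer>* meta =
                pixels ? AsContainerMessage<ISMRMRD::MetaContainer>(pixels->cont()) : 0;

            // ... use header/pixels/meta, then release the whole chain in one call:
            mb->release();
        }
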
diff --git a/toolboxes/gadgettools/ismrmrd/GadgetImageMessageWriter.h b/toolboxes/gadgettools/ismrmrd/GadgetImageMessageWriter.h
new file mode 100644
index 0000000..2d268f2
--- /dev/null
+++ b/toolboxes/gadgettools/ismrmrd/GadgetImageMessageWriter.h
@@ -0,0 +1,78 @@
+#ifndef GADGETSOCKETSENDER_H
+#define GADGETSOCKETSENDER_H
+
+#include "ace/SOCK_Stream.h"
+#include "ace/Task.h"
+
+#include <complex>
+
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+#include "GadgetContainerMessage.h"
+#include "GadgetMessageInterface.h"
+
+namespace Gadgetron
+{
+
+/**
+Default implementation of GadgetMessageWriter for Image messages
+*/
+
+template <typename T> class GadgetImageMessageWriter : public GadgetMessageWriter
+{
+public:
+    virtual int write(ACE_SOCK_Stream* sock, ACE_Message_Block* mb) 
+    {
+        GadgetContainerMessage<ISMRMRD::ImageHeader>* imagemb = 
+            dynamic_cast< GadgetContainerMessage<ISMRMRD::ImageHeader>* >(mb);
+
+        GadgetContainerMessage< hoNDArray< T > >* datamb =
+            dynamic_cast< GadgetContainerMessage< hoNDArray< T > >* >(imagemb->cont());
+
+        if (!imagemb || !datamb) {
+	  GERROR("GadgetImageMessageWriter invalid image message objects");
+	  return -1;
+        }
+
+
+        ssize_t send_cnt = 0;
+        GadgetMessageIdentifier id;
+
+        switch (sizeof(T)) {
+        case 2: //Unsigned short
+            id.id = GADGET_MESSAGE_IMAGE_REAL_USHORT;
+            break;
+        case 4: //Float
+            id.id = GADGET_MESSAGE_IMAGE_REAL_FLOAT;
+            break;
+        case 8: //Complex float
+            id.id = GADGET_MESSAGE_IMAGE_CPLX_FLOAT;
+            break;
+        default:
+	  GERROR("GadgetImageMessageWriter Wrong data size detected\n");
+	  return -1;
+        }
+
+        if ((send_cnt = sock->send_n (&id, sizeof(GadgetMessageIdentifier))) <= 0) {
+	  GERROR("Unable to send image message identifier\n");
+	  return -1;
+        }
+
+        if ((send_cnt = sock->send_n (imagemb->getObjectPtr(), sizeof(ISMRMRD::ImageHeader))) <= 0) {
+	  GERROR("Unable to send image header\n");
+	  return -1;
+        }
+
+        if ((send_cnt = sock->send_n (datamb->getObjectPtr()->get_data_ptr(), sizeof(T)*datamb->getObjectPtr()->get_number_of_elements())) <= 0) {
+	  GERROR("Unable to send image data\n");
+	  return -1;
+        }
+
+        return 0;
+    }
+
+};
+
+}
+
+#endif //GADGETSOCKETSENDER_H
diff --git a/toolboxes/gadgettools/schema/gadgetron.xsd b/toolboxes/gadgettools/schema/gadgetron.xsd
new file mode 100644
index 0000000..ce9537a
--- /dev/null
+++ b/toolboxes/gadgettools/schema/gadgetron.xsd
@@ -0,0 +1,54 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<xs:schema xmlns="http://gadgetron.sf.net/gadgetron" xmlns:xs="http://www.w3.org/2001/XMLSchema" elementFormDefault="qualified" targetNamespace="http://gadgetron.sf.net/gadgetron">
+
+    <xs:element name="gadgetronConfiguration">
+        <xs:complexType>
+            <xs:sequence>
+                <xs:element name="port" type="xs:string"/>
+                <xs:element maxOccurs="1" minOccurs="0" name="workingdirectory" type="xs:string"/>
+            </xs:sequence>
+        </xs:complexType>
+    </xs:element>
+
+    <xs:element name="gadgetronStreamConfiguration">
+        <xs:complexType>
+            <xs:sequence>
+                <xs:element maxOccurs="unbounded" minOccurs="0" name="reader">
+                    <xs:complexType>
+                        <xs:sequence>
+                            <xs:element name="slot" type="xs:unsignedShort"/>
+                            <xs:element name="dll" type="xs:string"/>
+                            <xs:element name="classname" type="xs:string"/>
+                        </xs:sequence>
+                    </xs:complexType>
+                </xs:element>
+                <xs:element maxOccurs="unbounded" minOccurs="0" name="writer">
+                    <xs:complexType>
+                        <xs:sequence>
+                            <xs:element maxOccurs="1" minOccurs="1" name="slot" type="xs:unsignedShort"/>
+                            <xs:element maxOccurs="1" minOccurs="1"  name="dll" type="xs:string"/>
+                            <xs:element maxOccurs="1" minOccurs="1"  name="classname" type="xs:string"/>
+                        </xs:sequence>
+                    </xs:complexType>
+                </xs:element>
+                <xs:element maxOccurs="unbounded" minOccurs="0" name="gadget">
+                    <xs:complexType>
+                        <xs:sequence>
+                            <xs:element maxOccurs="1" minOccurs="1"  name="name" type="xs:string"/>
+                            <xs:element maxOccurs="1" minOccurs="1"  name="dll" type="xs:string"/>
+                            <xs:element maxOccurs="1" minOccurs="1"  name="classname" type="xs:string"/>
+                            <xs:element maxOccurs="unbounded" minOccurs="0" name="property">
+                                <xs:complexType>
+                                    <xs:sequence>
+                                        <xs:element maxOccurs="1" minOccurs="1" name="name" type="xs:string"/>
+                                        <xs:element maxOccurs="1" minOccurs="1" name="value" type="xs:string"/>
+                                    </xs:sequence>
+                                </xs:complexType>
+                            </xs:element>
+                        </xs:sequence>
+                    </xs:complexType>
+                </xs:element>
+            </xs:sequence>
+        </xs:complexType>
+    </xs:element>
+</xs:schema>
diff --git a/toolboxes/gadgettools/test_gadget_xml.cpp b/toolboxes/gadgettools/test_gadget_xml.cpp
new file mode 100644
index 0000000..c553f10
--- /dev/null
+++ b/toolboxes/gadgettools/test_gadget_xml.cpp
@@ -0,0 +1,32 @@
+#include "GadgetXml.h"
+
+#include <iostream>
+
+int main(int argc, char** argv)
+{
+  GDEBUG_STREAM("GadgetXML Test Program" << std::endl);
+
+  TiXmlDocument doc( "demo.xml" );
+  doc.LoadFile();
+
+  GadgetXMLNode n(&doc);
+
+  std::vector<long> vals = n.get<long>(std::string("gadgetron.encoding.kspace.matrix_size.value"));
+
+  GDEBUG_STREAM("Number of values: " << vals.size() << std::endl);
+  for (unsigned int i = 0; i < vals.size(); i++) {
+    GDEBUG_STREAM("   :" << vals[i] << std::endl);
+  }
+
+
+  //Let's add something to the document
+  n.add(std::string("gadgetron.encoding.mysection.value"), 6.789);
+  n.add(std::string("gadgetron.encoding.mysection.value"), 612);
+  n.add(std::string("gadgetron.encoding.mysection.value"), 512);
+  n.add(std::string("gadgetron.encoding.mysection.value"), vals);
+  
+  n.get_document()->Print();
+  
+
+  return 0;
+}
diff --git a/toolboxes/gtplus/CMakeLists.txt b/toolboxes/gtplus/CMakeLists.txt
new file mode 100644
index 0000000..a1314d8
--- /dev/null
+++ b/toolboxes/gtplus/CMakeLists.txt
@@ -0,0 +1,236 @@
+# gadgetron_toolbox_gtplus is a toolbox for general reconstruction to support all ISMRMRD dimensions
+
+if ( HAS_64_BIT )
+
+    if (WIN32)
+        ADD_DEFINITIONS(-D__BUILD_GADGETRON_PLUS__)
+    endif (WIN32)
+
+    if(WIN32)
+        link_directories(${Boost_LIBRARY_DIRS})
+    endif(WIN32)
+
+    if (MKL_FOUND)
+        MESSAGE("MKL Found for gadgetron_toolbox_gtplus ... ")
+        list(APPEND EXTRA_MKL_LIBRARIES mkl_core)
+        if ( USE_OPENMP )
+            list(APPEND EXTRA_MKL_LIBRARIES mkl_intel_thread)
+        endif ( USE_OPENMP )
+
+        INCLUDE_DIRECTORIES( ${MKL_INCLUDE_DIR} )
+        LINK_DIRECTORIES( ${MKL_LIB_DIR} ${MKL_COMPILER_LIB_DIR} )
+    endif (MKL_FOUND)
+
+    include_directories(
+        ${ACE_INCLUDE_DIR} 
+        ${Boost_INCLUDE_DIR}
+        ${ISMRMRD_INCLUDE_DIR}
+        ${FFTW3_INCLUDE_DIR}
+        ${ARMADILLO_INCLUDE_DIRS}
+        ${MKL_INCLUDE_DIR}
+        ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/core
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+        ${CMAKE_SOURCE_DIR}/toolboxes/fft/cpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/algorithm
+        ${CMAKE_SOURCE_DIR}/toolboxes/operators
+        ${CMAKE_SOURCE_DIR}/toolboxes/operators/cpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+        ${CMAKE_SOURCE_DIR}/toolboxes/solvers/cpu
+        ${HDF5_INCLUDE_DIR}
+        ${HDF5_INCLUDE_DIR}/cpp
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+        ${CMAKE_SOURCE_DIR}/toolboxes/mri_core
+        ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow
+        ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/gtplus
+        ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/util
+        ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/workflow
+        ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/algorithm 
+        ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/solver
+        ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools
+        ${CMAKE_SOURCE_DIR}/toolboxes/mri_core
+        ${CMAKE_SOURCE_DIR}/apps/gadgetron
+        ${CMAKE_SOURCE_DIR}/apps/matlab
+        ${CMAKE_SOURCE_DIR}/gadgets/mri_core 
+        ${CMAKE_SOURCE_DIR}/gadgets/gtPlus 
+    )
+
+    set( gtplus_io_header_files 
+        GtPlusIOExport.h 
+        util/gtPlusIOBase.h
+        util/gtPlusIOAnalyze.h)
+
+    set( gtplus_io_src_files 
+        util/gtPlusIOBase.cpp
+        util/gtPlusIOAnalyze.cpp)
+
+    set( util_header_files 
+        util/gtPlusUtil.h 
+        util/gtPlusUtil.hxx )
+
+    set( util_src_files )
+
+    if (MKL_FOUND OR ARMADILLO_FOUND)
+        set( workflow_header_files 
+            workflow/gtPlusISMRMRDReconWorkFlow.h
+            workflow/gtPlusISMRMRDReconWorkFlowCartesian.h
+            workflow/gtPlusISMRMRDReconWorkFlowCartesian2DT.h
+            workflow/gtPlusISMRMRDReconWorkFlowCartesian3DT.h
+            workflow/gtPlusISMRMRDReconUtil.h
+            workflow/gtPlusISMRMRDReconUtil.hxx
+            workflow/gtPlusISMRMRDReconCoilMapEstimation.h
+            workflow/gtPlusISMRMRDReconWorkOrder.h
+            workflow/gtPlusISMRMRDReconWorkOrder2DT.h
+            workflow/gtPlusISMRMRDReconWorkOrder3DT.h
+            workflow/gtPlusISMRMRDReconWorker.h
+            workflow/gtPlusISMRMRDReconWorker2DT.h
+            workflow/gtPlusISMRMRDReconWorker3DT.h
+            workflow/gtPlusISMRMRDReconWorker2DTGRAPPA.h
+            workflow/gtPlusISMRMRDReconWorker2DTSPIRIT.h
+            workflow/gtPlusISMRMRDReconWorker2DTL1SPIRITNCG.h
+            workflow/gtPlusISMRMRDReconWorker2DTNoAcceleration.h
+            workflow/gtPlusISMRMRDReconWorker3DTGRAPPA.h
+            workflow/gtPlusISMRMRDReconWorker3DTSPIRIT.h
+            workflow/gtPlusISMRMRDReconWorker3DTL1SPIRITNCG.h
+            workflow/gtPlusISMRMRDReconWorker3DTNoAcceleration.h
+            workflow/gtPlusCloudScheduler.h )
+
+        set( workflow_src_files 
+            workflow/gtPlusISMRMRDReconUtil.cpp
+            workflow/gtPlusCloudScheduler.cpp )
+
+        set( algorithm_header_files 
+            algorithm/gtPlusAlgorithmBase.h 
+            algorithm/gtPlusGRAPPA.h 
+            algorithm/gtPlusSPIRIT.h
+            algorithm/gtPlusOperator.h 
+            algorithm/gtPlusSPIRITOperator.h 
+            algorithm/gtPlusSPIRIT2DOperator.h 
+            algorithm/gtPlusSPIRIT3DOperator.h 
+            algorithm/gtPlusSPIRIT2DTOperator.h 
+            algorithm/gtPlusSPIRITNoNullSpaceOperator.h 
+            algorithm/gtPlusSPIRITNoNullSpace2DOperator.h 
+            algorithm/gtPlusSPIRITNoNullSpace3DOperator.h 
+            algorithm/gtPlusSPIRITNoNullSpace2DTOperator.h 
+            algorithm/gtPlusWaveletOperator.h 
+            algorithm/gtPlusWavelet2DOperator.h 
+            algorithm/gtPlusWavelet3DOperator.h 
+            algorithm/gtPlusWaveletNoNullSpace2DOperator.h 
+            algorithm/gtPlusWaveletNoNullSpace3DOperator.h 
+            algorithm/gtPlusDataFidelityOperator.h 
+            algorithm/FreeFormDeformation/gtplusFFDBase.h
+            algorithm/FreeFormDeformation/gtplusBSplineFFD.h
+            algorithm/FreeFormDeformation/gtplusMLFFD.h
+            algorithm/FreeFormDeformation/gtplusBSplineFFD2D.h
+            algorithm/FreeFormDeformation/gtplusBSplineFFD3D.h 
+            algorithm/FreeFormDeformation/gtplusBSplineFFD4D.h )
+
+        set( algorithm_src_files )
+
+        set( solver_header_files 
+            solver/gtPlusSolver.h 
+            solver/gtPlusLinearSolver.h 
+            solver/gtPlusNonLinearSolver.h
+            solver/gtPlusLSQRSolver.h 
+            solver/gtPlusNCGSolver.h )
+
+        set( solver_src_files )
+
+        set( application_header_files )
+
+        set( application_src_files )
+
+    endif (MKL_FOUND OR ARMADILLO_FOUND)
+
+    # matlab
+    if (MATLAB_FOUND)
+        message("MATLAB FOUND: ${MATLAB_INCLUDE_DIR}, Matlab gt interface is being compiled.")
+        SET(CMAKE_DEBUG_POSTFIX)
+        include_directories( ${MATLAB_INCLUDE_DIR} )
+        set( matlab_files matlab/gtMatlab.h 
+                          matlab/gtMatlabConverter.h
+                          matlab/gtMatlabConverterComplex.h 
+                          matlab/gtMatlabImage.h )
+
+    else(MATLAB_FOUND)
+        message("MATLAB NOT FOUND ...")
+        set( matlab_files )
+    endif(MATLAB_FOUND)
+
+    set(gtplus_files GtPlusExport.h 
+                ${util_header_files} 
+                ${util_src_files} )
+
+    source_group(util FILES ${util_header_files} ${util_src_files})
+
+    if (MKL_FOUND OR ARMADILLO_FOUND)
+        set(gtplus_files ${gtplus_files} 
+                ${workflow_header_files} 
+                ${workflow_src_files} 
+                ${algorithm_header_files} 
+                ${algorithm_src_files} 
+                ${solver_header_files} 
+                ${solver_src_files} 
+                ${application_header_files} 
+                ${application_src_files} 
+                ${matlab_files} )
+
+        source_group(workflow FILES     ${workflow_header_files}    ${workflow_src_files})
+        source_group(algorithm FILES    ${algorithm_header_files}   ${algorithm_src_files})
+        source_group(solver FILES       ${solver_header_files}      ${solver_src_files})
+        source_group(application FILES  ${application_header_files} ${application_src_files})
+
+        if (MATLAB_FOUND)
+            source_group(matlab FILES ${matlab_files})
+        endif(MATLAB_FOUND)
+    endif (MKL_FOUND OR ARMADILLO_FOUND)
+
+    add_library(gadgetron_toolbox_gtplus_io SHARED ${gtplus_io_header_files} ${gtplus_io_src_files} )
+    set_target_properties(gadgetron_toolbox_gtplus_io PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+    target_link_libraries(gadgetron_toolbox_gtplus_io gadgetron_toolbox_log)
+
+    add_library(gadgetron_toolbox_gtplus SHARED 
+                ${gtplus_files} )
+
+    set_target_properties(gadgetron_toolbox_gtplus PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+    target_link_libraries(gadgetron_toolbox_gtplus 
+                          gadgetron_toolbox_cpucore
+                          gadgetron_toolbox_log
+                          gadgetron_toolbox_cpucore_math 
+                          gadgetron_toolbox_cpufft
+                          gadgetron_toolbox_mri_core
+                          gadgetron_toolbox_gtplus_io )
+
+    install(TARGETS gadgetron_toolbox_gtplus DESTINATION lib COMPONENT main)
+    install(TARGETS gadgetron_toolbox_gtplus_io DESTINATION lib COMPONENT main)
+
+    # install gtplus files
+    install (FILES  
+            GtPlusExport.h 
+            ${util_header_files} 
+            ${workflow_header_files} 
+            ${algorithm_header_files} 
+            ${solver_header_files} 
+            ${application_header_files} 
+            ${gtplus_io_header_files} 
+            ${matlab_files}
+            DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
+    # This one depends on the gadgets being compiled
+    if (ACE_FOUND AND ISMRMRD_FOUND AND FFTW3_FOUND)
+      if (GTEST_FOUND)
+        add_subdirectory(ut)
+      endif (GTEST_FOUND)
+    endif (ACE_FOUND AND ISMRMRD_FOUND AND FFTW3_FOUND)
+
+endif ( HAS_64_BIT )
diff --git a/toolboxes/gtplus/GtPlusExport.h b/toolboxes/gtplus/GtPlusExport.h
new file mode 100644
index 0000000..dfa96c4
--- /dev/null
+++ b/toolboxes/gtplus/GtPlusExport.h
@@ -0,0 +1,16 @@
+/** \file       GtPlusExport.h
+    \brief      Implement Windows export/import for the GtPlus toolbox
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#if defined (WIN32)
+    #if defined (__BUILD_GADGETRON_PLUS__) || defined (gtplus_EXPORTS)
+        #define EXPORTGTPLUS __declspec(dllexport)
+    #else
+        #define EXPORTGTPLUS __declspec(dllimport)
+    #endif
+#else
+    #define EXPORTGTPLUS
+#endif
diff --git a/toolboxes/gtplus/GtPlusIOExport.h b/toolboxes/gtplus/GtPlusIOExport.h
new file mode 100644
index 0000000..77b28a1
--- /dev/null
+++ b/toolboxes/gtplus/GtPlusIOExport.h
@@ -0,0 +1,16 @@
+/** \file       GtPlusIOExport.h
+    \brief      Implement Windows export/import for the GtPlus IO toolbox
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#if defined (WIN32)
+    #if defined (__BUILD_GADGETRON_PLUS__) || defined (gtplus_io_EXPORTS)
+        #define EXPORTGTPLUSIO __declspec(dllexport)
+    #else
+        #define EXPORTGTPLUSIO __declspec(dllimport)
+    #endif
+#else
+    #define EXPORTGTPLUSIO
+#endif
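
The two export headers above follow the usual Windows DLL pattern: a symbol is marked dllexport while the library itself is being built (when __BUILD_GADGETRON_PLUS__ or the per-target *_EXPORTS define is set) and dllimport in consuming code, while on non-Windows platforms the macro expands to nothing. A minimal sketch of how such a macro is applied; the class name below is hypothetical and not part of the toolbox:

    #include "GtPlusExport.h"

    namespace Gadgetron { namespace gtPlus {

    // EXPORTGTPLUS resolves to __declspec(dllexport) while building the
    // gtplus library on WIN32, __declspec(dllimport) in its consumers,
    // and to nothing on other platforms.
    class EXPORTGTPLUS gtPlusExampleComponent
    {
    public:
        void process() {}
    };

    }}
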
diff --git a/toolboxes/gtplus/algorithm/FreeFormDeformation/gtplusBSplineFFD.h b/toolboxes/gtplus/algorithm/FreeFormDeformation/gtplusBSplineFFD.h
new file mode 100644
index 0000000..8671a29
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/FreeFormDeformation/gtplusBSplineFFD.h
@@ -0,0 +1,820 @@
+/** \file       gtplusBSplineFFD.h
+    \brief      Class for gtPlus BSpline FreeFormDeformation
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtplusFFDBase.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut>
+class gtplusBSplineFFD : public gtplusFFDBase<T, CoordType, DIn, DOut>
+{
+public:
+
+    typedef gtplusFFDBase<T, CoordType, DIn, DOut> BaseClass;
+    typedef gtplusBSplineFFD<T, CoordType, DIn, DOut> Self;
+
+    typedef typename BaseClass::real_value_type real_value_type;
+    typedef real_value_type bspline_float_type;
+
+    typedef typename BaseClass::coord_type coord_type;
+
+    using BaseClass::D;
+    enum { BSPLINELUTSIZE = 1000 };
+    enum { BSPLINEPADDINGSIZE = 4 };
+
+    typedef real_value_type LUTType[BSPLINELUTSIZE][BSPLINEPADDINGSIZE];
+
+    typedef typename BaseClass::CoordArrayType      CoordArrayType;
+    typedef typename BaseClass::ValueArrayType      ValueArrayType;
+    typedef typename BaseClass::ArrayType           ArrayType;
+    typedef typename BaseClass::FFDCtrlPtGridType   FFDCtrlPtGridType;
+    typedef typename BaseClass::PointType           PointType;
+    typedef typename BaseClass::ImageType           ImageType;
+    typedef typename BaseClass::MaskArrayType       MaskArrayType;
+
+    gtplusBSplineFFD();
+    virtual ~gtplusBSplineFFD();
+
+    /// evaluate the FFD at a grid location
+    virtual bool evaluateFFD(const CoordType pt[D], T r[DOut]) const = 0;
+
+    /// evaluate the 1st order derivative of FFD at a grid location
+    virtual bool evaluateFFDDerivative(const CoordType pt[D], T deriv[D][DOut]) const = 0;
+
+    /// evaluate the 2nd order derivative of FFD at a grid location
+    /// dderiv : D*D vector, stores dxx dxy dxz ...; dyx dyy dyz ...; dzx dzy dzz ...
+    virtual bool evaluateFFDSecondOrderDerivative(const CoordType pt[D], T dderiv[D*D][DOut]) const = 0;
+
+    /// compute the FFD approximation once
+    virtual bool ffdApprox(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N) = 0;
+
+    /// Although the BSpline grid carries padding, every index in the accessors below is defined on the unpadded grid.
+
+    /// get the size of control point arrays
+    virtual size_t get_size(size_t dimension) const { return ctrl_pt_[0].get_size(dimension)-2*BSPLINEPADDINGSIZE; }
+    virtual std::vector<size_t> get_dimensions() const
+    {
+        std::vector<size_t> dim;
+        ctrl_pt_[0].get_dimensions(dim);
+
+        unsigned int d;
+        for ( d=0; d<DIn; d++ )
+        {
+            dim[d] -= 2*BSPLINEPADDINGSIZE;
+        }
+
+        return dim;
+    }
+
+    /// get the spacing of the control point arrays
+    virtual coord_type get_spacing(size_t dimension) const { return ctrl_pt_[0].get_pixel_size(dimension); }
+    virtual void get_spacing(std::vector<coord_type>& spacing) const { ctrl_pt_[0].get_pixel_size(spacing); }
+
+    /// get/set a control point value
+    virtual T get(size_t x, size_t y, size_t d) const { return ctrl_pt_[d](x+BSPLINEPADDINGSIZE, y+BSPLINEPADDINGSIZE); }
+    virtual void set(size_t x, size_t y, size_t d, T v) { ctrl_pt_[d](x+BSPLINEPADDINGSIZE, y+BSPLINEPADDINGSIZE) = v; }
+
+    virtual T get(size_t x, size_t y, size_t z, size_t d) const { return ctrl_pt_[d](x+BSPLINEPADDINGSIZE, y+BSPLINEPADDINGSIZE, z+BSPLINEPADDINGSIZE); }
+    virtual void set(size_t x, size_t y, size_t z, size_t d, T v) { ctrl_pt_[d](x+BSPLINEPADDINGSIZE, y+BSPLINEPADDINGSIZE, z+BSPLINEPADDINGSIZE) = v; }
+
+    virtual T get(size_t x, size_t y, size_t z, size_t s, size_t d) const { return ctrl_pt_[d](x+BSPLINEPADDINGSIZE, y+BSPLINEPADDINGSIZE, z+BSPLINEPADDINGSIZE, s+BSPLINEPADDINGSIZE); }
+    virtual void set(size_t x, size_t y, size_t z, size_t s, size_t d, T v) { ctrl_pt_[d](x+BSPLINEPADDINGSIZE, y+BSPLINEPADDINGSIZE, z+BSPLINEPADDINGSIZE, s+BSPLINEPADDINGSIZE) = v; }
+
+    /// offset to/from indexes for control points
+    virtual size_t calculate_offset(size_t x, size_t y) const { return ctrl_pt_[0].calculate_offset(x+BSPLINEPADDINGSIZE, y+BSPLINEPADDINGSIZE); }
+
+    virtual void calculate_index( size_t offset, size_t& x, size_t& y ) const
+    {
+        ctrl_pt_[0].calculate_index(offset, x, y);
+        x -= BSPLINEPADDINGSIZE;
+        y -= BSPLINEPADDINGSIZE;
+    }
+
+    virtual size_t calculate_offset(size_t x, size_t y, size_t z) const { return ctrl_pt_[0].calculate_offset(x+BSPLINEPADDINGSIZE, y+BSPLINEPADDINGSIZE, z+BSPLINEPADDINGSIZE); }
+    virtual void calculate_index( size_t offset, size_t& x, size_t& y, size_t& z ) const
+    {
+        ctrl_pt_[0].calculate_index(offset, x, y, z);
+        x -= BSPLINEPADDINGSIZE;
+        y -= BSPLINEPADDINGSIZE;
+        z -= BSPLINEPADDINGSIZE;
+    }
+
+    virtual size_t calculate_offset(size_t x, size_t y, size_t z, size_t s) const { return ctrl_pt_[0].calculate_offset(x+BSPLINEPADDINGSIZE, y+BSPLINEPADDINGSIZE, z+BSPLINEPADDINGSIZE, s+BSPLINEPADDINGSIZE); }
+    virtual void calculate_index( size_t offset, size_t& x, size_t& y, size_t& z, size_t& s ) const
+    {
+        ctrl_pt_[0].calculate_index(offset, x, y, z, s);
+        x -= BSPLINEPADDINGSIZE;
+        y -= BSPLINEPADDINGSIZE;
+        z -= BSPLINEPADDINGSIZE;
+        s -= BSPLINEPADDINGSIZE;
+    }
+
+    /// compute the control point location in world coordinates
+    virtual void get_location(size_t x, size_t y, CoordType& sx, CoordType& sy) const { ctrl_pt_[0].image_to_world(x+BSPLINEPADDINGSIZE, y+BSPLINEPADDINGSIZE, sx, sy); }
+    virtual void get_location(size_t x, size_t y, size_t z, CoordType& sx, CoordType& sy, CoordType& sz) const { ctrl_pt_[0].image_to_world(x+BSPLINEPADDINGSIZE, y+BSPLINEPADDINGSIZE, z+BSPLINEPADDINGSIZE, sx, sy, sz); }
+    virtual void get_location(size_t x, size_t y, size_t z, size_t s, CoordType& sx, CoordType& sy, CoordType& sz, CoordType& ss) const { ctrl_pt_[0].image_to_world(x+BSPLINEPADDINGSIZE, y+BSPLINEPADDINGSIZE, z+BSPLINEPADDINGSIZE, s+BSPLINEPADDINGSIZE, sx, sy, sz, ss); }
+
+    /// convert a world coordinate point to FFD grid location
+    virtual bool world_to_grid(const CoordType pt_w[D], CoordType pt_g[D]) const;
+    virtual bool world_to_grid(CoordType px_w, CoordType py_w, CoordType& px_g, CoordType& py_g) const;
+    virtual bool world_to_grid(CoordType px_w, CoordType py_w, CoordType pz_w, CoordType& px_g, CoordType& py_g, CoordType& pz_g) const;
+    virtual bool world_to_grid(CoordType px_w, CoordType py_w, CoordType pz_w, CoordType ps_w, CoordType& px_g, CoordType& py_g, CoordType& pz_g, CoordType& ps_g) const;
+
+    virtual bool grid_to_world(const CoordType pt_g[D], CoordType pt_w[D]) const;
+    virtual bool grid_to_world(CoordType px_g, CoordType py_g, CoordType& px_w, CoordType& py_w) const;
+    virtual bool grid_to_world(CoordType px_g, CoordType py_g, CoordType pz_g, CoordType& px_w, CoordType& py_w, CoordType& pz_w) const;
+    virtual bool grid_to_world(CoordType px_g, CoordType py_g, CoordType pz_g, CoordType ps_g, CoordType& px_w, CoordType& py_w, CoordType& pz_w, CoordType& ps_w) const;
+
+    /// print info
+    virtual void print(std::ostream& os) const;
+
+    /// compute four BSpline basis functions
+    static bspline_float_type BSpline0(bspline_float_type t)
+    {
+        return (1-t)*(1-t)*(1-t)/(bspline_float_type)6.0;
+    }
+
+    static bspline_float_type BSpline1(bspline_float_type t)
+    {
+        return (3*t*t*t - 6*t*t + 4)/(bspline_float_type)6.0;
+    }
+
+    static bspline_float_type BSpline2(bspline_float_type t)
+    {
+        return (-3*t*t*t + 3*t*t + 3*t + 1)/(bspline_float_type)6.0;
+    }
+
+    static bspline_float_type BSpline3(bspline_float_type t)
+    {
+        return (t*t*t)/(bspline_float_type)6.0;
+    }
+
+    static bspline_float_type BSpline(size_t ind, bspline_float_type t)
+    {
+        switch (ind)
+        {
+        case 0:
+            return BSpline0(t);
+        case 1:
+            return BSpline1(t);
+        case 2:
+            return BSpline2(t);
+        case 3:
+            return BSpline3(t);
+        }
+
+        return 0;
+    }
+
+    /// compute 1st order derivatives of four BSpline basis functions
+    static bspline_float_type BSpline0FirstOrderDeriv(bspline_float_type t)
+    {
+        return -(1-t)*(1-t)/(bspline_float_type)2.0;
+    }
+
+    static bspline_float_type BSpline1FirstOrderDeriv(bspline_float_type t)
+    {
+        return (9*t*t - 12*t)/(bspline_float_type)6.0;
+    }
+
+    static bspline_float_type BSpline2FirstOrderDeriv(bspline_float_type t)
+    {
+        return (-9*t*t + 6*t + 3)/(bspline_float_type)6.0;
+    }
+
+    static bspline_float_type BSpline3FirstOrderDeriv(bspline_float_type t)
+    {
+        return (t*t)/(bspline_float_type)2.0;
+    }
+
+    static bspline_float_type BSplineFirstOrderDeriv(size_t ind, bspline_float_type t)
+    {
+        switch (ind)
+        {
+        case 0:
+            return BSpline0FirstOrderDeriv(t);
+        case 1:
+            return BSpline1FirstOrderDeriv(t);
+        case 2:
+            return BSpline2FirstOrderDeriv(t);
+        case 3:
+            return BSpline3FirstOrderDeriv(t);
+        }
+
+        return 0;
+    }
+
+    /// compute 2nd order derivatives of four BSpline basis functions
+    static bspline_float_type BSpline0SecondOrderDeriv(bspline_float_type t)
+    {
+        return 1 - t;
+    }
+
+    static bspline_float_type BSpline1SecondOrderDeriv(bspline_float_type t)
+    {
+        return 3*t - 2;
+    }
+
+    static bspline_float_type BSpline2SecondOrderDeriv(bspline_float_type t)
+    {
+        return -3*t + 1;
+    }
+
+    static bspline_float_type BSpline3SecondOrderDeriv(bspline_float_type t)
+    {
+        return t;
+    }
+
+    static bspline_float_type BSplineSecondOrderDeriv(size_t ind, bspline_float_type t)
+    {
+        switch (ind)
+        {
+        case 0:
+            return BSpline0SecondOrderDeriv(t);
+        case 1:
+            return BSpline1SecondOrderDeriv(t);
+        case 2:
+            return BSpline2SecondOrderDeriv(t);
+        case 3:
+            return BSpline3SecondOrderDeriv(t);
+        }
+
+        return 0;
+    }
+
+    using BaseClass::performTiming_;
+    using BaseClass::debugFolder_;
+
+protected:
+
+    using BaseClass::ctrl_pt_;
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+
+    /// load the look up table for BSpline functions
+    virtual bool loadLookUpTable();
+
+    /// initialize the FFD
+    /// define the FFD over a region
+    bool initializeBFFD(const PointType& start, const PointType& end, CoordType gridCtrlPtSpacing[DIn]);
+    bool initializeBFFD(const PointType& start, const PointType& end, size_t gridCtrlPtNum[DIn]);
+    /// define the FFD over the region covered by an image
+    bool initializeBFFD(const ImageType& im, const PointType& start, const PointType& end, CoordType gridCtrlPtSpacing[DIn]);
+    bool initializeBFFD(const ImageType& im, const PointType& start, const PointType& end, size_t gridCtrlPtNum[DIn]);
+
+    /// 2D
+    bool initializeBFFD(const PointType& start, const PointType& end, CoordType dx, CoordType dy);
+    bool initializeBFFD(const PointType& start, const PointType& end, size_t sx, size_t sy);
+    bool initializeBFFD(const ImageType& im, const PointType& start, const PointType& end, CoordType dx, CoordType dy);
+    bool initializeBFFD(const ImageType& im, const PointType& start, const PointType& end, size_t sx, size_t sy);
+
+    /// 3D
+    bool initializeBFFD(const PointType& start, const PointType& end, CoordType dx, CoordType dy, CoordType dz);
+    bool initializeBFFD(const PointType& start, const PointType& end, size_t sx, size_t sy, size_t sz);
+    bool initializeBFFD(const ImageType& im, const PointType& start, const PointType& end, CoordType dx, CoordType dy, CoordType dz);
+    bool initializeBFFD(const ImageType& im, const PointType& start, const PointType& end, size_t sx, size_t sy, size_t sz);
+
+    /// 4D
+    bool initializeBFFD(const PointType& start, const PointType& end, CoordType dx, CoordType dy, CoordType dz, CoordType ds);
+    bool initializeBFFD(const PointType& start, const PointType& end, size_t sx, size_t sy, size_t sz, size_t ss);
+    bool initializeBFFD(const ImageType& im, const PointType& start, const PointType& end, CoordType dx, CoordType dy, CoordType dz, CoordType ds);
+    bool initializeBFFD(const ImageType& im, const PointType& start, const PointType& end, size_t sx, size_t sy, size_t sz, size_t ss);
+
+    /// look up table for BSpline and its first and second order derivatives
+    LUTType LUT_;
+    LUTType LUT1_;
+    LUTType LUT2_;
+};
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+gtplusBSplineFFD<T, CoordType, DIn, DOut>::gtplusBSplineFFD()
+{
+    this->loadLookUpTable();
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+gtplusBSplineFFD<T, CoordType, DIn, DOut>::~gtplusBSplineFFD()
+{
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::loadLookUpTable()
+{
+    try
+    {
+        long long ii;
+        double gapInLUT = (double)(BSPLINELUTSIZE-1);
+
+        #pragma omp parallel for default(none) private(ii) shared(gapInLUT)
+        for (ii=0; ii<(long long)BSPLINELUTSIZE; ii++)
+        {
+            bspline_float_type g = (bspline_float_type)(ii/gapInLUT);
+
+            LUT_[ii][0]   = BSpline0(g);
+            LUT_[ii][1]   = BSpline1(g);
+            LUT_[ii][2]   = BSpline2(g);
+            LUT_[ii][3]   = BSpline3(g);
+
+            LUT1_[ii][0]  = BSpline0FirstOrderDeriv(g);
+            LUT1_[ii][1]  = BSpline1FirstOrderDeriv(g);
+            LUT1_[ii][2]  = BSpline2FirstOrderDeriv(g);
+            LUT1_[ii][3]  = BSpline3FirstOrderDeriv(g);
+
+            LUT2_[ii][0]  = BSpline0SecondOrderDeriv(g);
+            LUT2_[ii][1]  = BSpline1SecondOrderDeriv(g);
+            LUT2_[ii][2]  = BSpline2SecondOrderDeriv(g);
+            LUT2_[ii][3]  = BSpline3SecondOrderDeriv(g);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in gtplusBSplineFFD<T, CoordType, DIn, DOut>::loadLookUpTable() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::world_to_grid(const CoordType pt_w[D], CoordType pt_g[D]) const
+{
+    try
+    {
+        this->ctrl_pt_[0].world_to_image(pt_w, pt_g);
+        unsigned int d;
+        for ( d=0; d<D; d++ )
+        {
+            pt_g[d] -= BSPLINEPADDINGSIZE;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in world_to_grid(const CoordType pt_w[D], CoordType pt_g[D]) const ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::world_to_grid(CoordType px_w, CoordType py_w, CoordType& px_g, CoordType& py_g) const
+{
+    GADGET_CHECK_RETURN_FALSE(DIn==2);
+
+    try
+    {
+        this->ctrl_pt_[0].world_to_image(px_w, py_w, px_g, py_g);
+        px_g -= BSPLINEPADDINGSIZE;
+        py_g -= BSPLINEPADDINGSIZE;
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in world_to_grid(CoordType px_w, CoordType py_w, CoordType& px_g, CoordType& py_g) const ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::world_to_grid(CoordType px_w, CoordType py_w, CoordType pz_w, CoordType& px_g, CoordType& py_g, CoordType& pz_g) const
+{
+    GADGET_CHECK_RETURN_FALSE(DIn==3);
+
+    try
+    {
+        this->ctrl_pt_[0].world_to_image(px_w, py_w, pz_w, px_g, py_g, pz_g);
+        px_g -= BSPLINEPADDINGSIZE;
+        py_g -= BSPLINEPADDINGSIZE;
+        pz_g -= BSPLINEPADDINGSIZE;
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in world_to_grid(CoordType px_w, CoordType py_w, CoordType pz_w, CoordType& px_g, CoordType& py_g, CoordType& pz_g) const ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::world_to_grid(CoordType px_w, CoordType py_w, CoordType pz_w, CoordType ps_w, CoordType& px_g, CoordType& py_g, CoordType& pz_g, CoordType& ps_g) const
+{
+    GADGET_CHECK_RETURN_FALSE(DIn==4);
+
+    try
+    {
+        this->ctrl_pt_[0].world_to_image(px_w, py_w, pz_w, ps_w, px_g, py_g, pz_g, ps_g);
+        px_g -= BSPLINEPADDINGSIZE;
+        py_g -= BSPLINEPADDINGSIZE;
+        pz_g -= BSPLINEPADDINGSIZE;
+        ps_g -= BSPLINEPADDINGSIZE;
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in world_to_grid(CoordType px_w, CoordType py_w, CoordType pz_w, CoordType ps_w, CoordType& px_g, CoordType& py_g, CoordType& pz_g, CoordType& ps_g) const ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::grid_to_world(const CoordType pt_g[D], CoordType pt_w[D]) const
+{
+    try
+    {
+        CoordType pt_g_padded[D];
+        unsigned int d;
+        for ( d=0; d<D; d++ )
+        {
+            pt_g_padded[d] = pt_g[d] + BSPLINEPADDINGSIZE;
+        }
+
+        this->ctrl_pt_[0].image_to_world(pt_g_padded, pt_w);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in grid_to_world(const CoordType pt_g[D], CoordType pt_w[D]) const ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::grid_to_world(CoordType px_g, CoordType py_g, CoordType& px_w, CoordType& py_w) const
+{
+    GADGET_CHECK_RETURN_FALSE(DIn==2);
+
+    try
+    {
+        px_g += BSPLINEPADDINGSIZE;
+        py_g += BSPLINEPADDINGSIZE;
+        this->ctrl_pt_[0].image_to_world(px_g, py_g, px_w, py_w);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in grid_to_world(CoordType px_g, CoordType py_g, CoordType& px_w, CoordType& py_w) const ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::grid_to_world(CoordType px_g, CoordType py_g, CoordType pz_g, CoordType& px_w, CoordType& py_w, CoordType& pz_w) const
+{
+    GADGET_CHECK_RETURN_FALSE(DIn==3);
+
+    try
+    {
+        px_g += BSPLINEPADDINGSIZE;
+        py_g += BSPLINEPADDINGSIZE;
+        pz_g += BSPLINEPADDINGSIZE;
+        this->ctrl_pt_[0].image_to_world(px_g, py_g, pz_g, px_w, py_w, pz_w);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in grid_to_world(CoordType px_g, CoordType py_g, CoordType pz_g, CoordType& px_w, CoordType& py_w, CoordType& pz_w) const ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::grid_to_world(CoordType px_g, CoordType py_g, CoordType pz_g, CoordType ps_g, CoordType& px_w, CoordType& py_w, CoordType& pz_w, CoordType& ps_w) const
+{
+    GADGET_CHECK_RETURN_FALSE(DIn==4);
+
+    try
+    {
+        px_g += BSPLINEPADDINGSIZE;
+        py_g += BSPLINEPADDINGSIZE;
+        pz_g += BSPLINEPADDINGSIZE;
+        ps_g += BSPLINEPADDINGSIZE;
+        this->ctrl_pt_[0].image_to_world(px_g, py_g, pz_g, ps_g, px_w, py_w, pz_w, ps_w);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in grid_to_world(CoordType px_g, CoordType py_g, CoordType pz_g, CoordType ps_g, CoordType& px_w, CoordType& py_w, CoordType& pz_w, CoordType& ps_w) const ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::initializeBFFD(const PointType& start, const PointType& end, CoordType gridCtrlPtSpacing[DIn])
+{
+    try
+    {
+        unsigned int d;
+        for ( d=0; d<DIn; d++ )
+        {
+            GADGET_CHECK_RETURN_FALSE(end(d) > start(d));
+        }
+
+        std::vector<size_t> dim(DIn, 2);
+        std::vector<coord_type> pixelSize(DIn, 1);
+        std::vector<coord_type> origin(DIn, 0);
+
+        for ( d=0; d<DIn; d++ )
+        {
+            dim[d] = FFD_MKINT( (end(d)-start(d))/gridCtrlPtSpacing[d] ) + 1;
+            pixelSize[d] = (end(d)-start(d))/(dim[d]-1);
+
+            /// add the padding
+            dim[d] += 2*BSPLINEPADDINGSIZE;
+
+            origin[d] = -pixelSize[d]*BSPLINEPADDINGSIZE;
+        }
+
+        for ( d=0; d<DOut; d++ )
+        {
+            this->ctrl_pt_[d].create(dim, pixelSize, origin);
+            Gadgetron::clear(this->ctrl_pt_[d]);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in initializeBFFD(const PointType& start, const PointType& end, CoordType gridCtrlPtSpacing[DIn]) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::initializeBFFD(const PointType& start, const PointType& end, size_t gridCtrlPtNum[DIn])
+{
+    try
+    {
+        unsigned int d;
+        for ( d=0; d<DIn; d++ )
+        {
+            GADGET_CHECK_RETURN_FALSE(end(d) > start(d));
+        }
+
+        std::vector<size_t> dim(DIn, 2);
+        std::vector<coord_type> pixelSize(DIn, 1);
+        std::vector<coord_type> origin(DIn, 0);
+
+        for ( d=0; d<DIn; d++ )
+        {
+            dim[d] = gridCtrlPtNum[d];
+            if ( dim[d] < 3 ) dim[d] = 3;
+
+            pixelSize[d] = (end(d)-start(d))/(dim[d]-1);
+
+            /// add the padding
+            dim[d] += 2*BSPLINEPADDINGSIZE;
+
+            origin[d] = -pixelSize[d]*BSPLINEPADDINGSIZE;
+        }
+
+        for ( d=0; d<DOut; d++ )
+        {
+            this->ctrl_pt_[d].create(dim, pixelSize, origin);
+            Gadgetron::clear(this->ctrl_pt_[d]);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in initializeBFFD(const PointType& start, const PointType& end, CoordType gridCtrlPtNum[DIn]) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::initializeBFFD(const ImageType& im, const PointType& start, const PointType& end, CoordType gridCtrlPtSpacing[DIn])
+{
+    try
+    {
+        unsigned int d;
+        for ( d=0; d<DIn; d++ )
+        {
+            GADGET_CHECK_RETURN_FALSE(end(d) > start(d));
+        }
+
+        std::vector<size_t> dim(DIn, 2);
+        std::vector<coord_type> pixelSize(DIn, 1);
+        std::vector<coord_type> origin(DIn, 0);
+
+        std::vector<coord_type> firstCtrlPt(DIn);
+
+        for ( d=0; d<DIn; d++ )
+        {
+            dim[d] = FFD_MKINT( (end(d)-start(d))/gridCtrlPtSpacing[d] ) + 1;
+            pixelSize[d] = (end(d)-start(d))/(dim[d]-1);
+
+            /// add the padding
+            dim[d] += 2*BSPLINEPADDINGSIZE;
+
+            firstCtrlPt[d] = -pixelSize[d]*BSPLINEPADDINGSIZE/im.get_pixel_size(d);
+        }
+        im.image_to_world( firstCtrlPt, origin);
+
+        for ( d=0; d<DOut; d++ )
+        {
+            this->ctrl_pt_[d].create(dim, pixelSize, origin);
+            Gadgetron::clear(this->ctrl_pt_[d]);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in initializeBFFD(const ImageType& im, const PointType& start, const PointType& end, CoordType gridCtrlPtSpacing[DIn]) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::initializeBFFD(const ImageType& im, const PointType& start, const PointType& end, size_t gridCtrlPtNum[DIn])
+{
+    try
+    {
+        unsigned int d;
+        for ( d=0; d<DIn; d++ )
+        {
+            GADGET_CHECK_RETURN_FALSE(end(d) > start(d));
+        }
+
+        std::vector<size_t> dim(DIn, 2);
+        std::vector<coord_type> pixelSize(DIn, 1);
+        std::vector<coord_type> origin(DIn, 0);
+
+        std::vector<coord_type> firstCtrlPt(DIn);
+
+        for ( d=0; d<DIn; d++ )
+        {
+            dim[d] = gridCtrlPtNum[d];
+            if ( dim[d] < 3 ) dim[d] = 3;
+
+            pixelSize[d] = (end(d)-start(d))/(dim[d]-1);
+
+            /// add the padding
+            dim[d] += 2*BSPLINEPADDINGSIZE;
+
+            firstCtrlPt[d] = -pixelSize[d]*BSPLINEPADDINGSIZE/im.get_pixel_size(d);
+        }
+        im.image_to_world( firstCtrlPt, origin);
+
+        for ( d=0; d<DOut; d++ )
+        {
+            this->ctrl_pt_[d].create(dim, pixelSize, origin);
+            Gadgetron::clear(this->ctrl_pt_[d]);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in initializeBFFD(const ImageType& im, const PointType& start, const PointType& end, CoordType gridCtrlPtNum[DIn]) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::initializeBFFD(const PointType& start, const PointType& end, CoordType dx, CoordType dy)
+{
+    CoordType gridCtrlPtSpacing[2];
+    gridCtrlPtSpacing[0] = dx;
+    gridCtrlPtSpacing[1] = dy;
+    return this->initializeBFFD(start, end, gridCtrlPtSpacing);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::initializeBFFD(const PointType& start, const PointType& end, size_t sx, size_t sy)
+{
+    size_t gridCtrlPtNum[2];
+    gridCtrlPtNum[0] = sx;
+    gridCtrlPtNum[1] = sy;
+    return this->initializeBFFD(start, end, gridCtrlPtNum);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::initializeBFFD(const ImageType& im, const PointType& start, const PointType& end, CoordType dx, CoordType dy)
+{
+    CoordType gridCtrlPtSpacing[2];
+    gridCtrlPtSpacing[0] = dx;
+    gridCtrlPtSpacing[1] = dy;
+    return this->initializeBFFD(im, start, end, gridCtrlPtSpacing);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::initializeBFFD(const ImageType& im, const PointType& start, const PointType& end, size_t sx, size_t sy)
+{
+    size_t gridCtrlPtNum[2];
+    gridCtrlPtNum[0] = sx;
+    gridCtrlPtNum[1] = sy;
+    return this->initializeBFFD(im, start, end, gridCtrlPtNum);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::initializeBFFD(const PointType& start, const PointType& end, CoordType dx, CoordType dy, CoordType dz)
+{
+    CoordType gridCtrlPtSpacing[3];
+    gridCtrlPtSpacing[0] = dx;
+    gridCtrlPtSpacing[1] = dy;
+    gridCtrlPtSpacing[2] = dz;
+    return this->initializeBFFD(start, end, gridCtrlPtSpacing);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::initializeBFFD(const PointType& start, const PointType& end, size_t sx, size_t sy, size_t sz)
+{
+    size_t gridCtrlPtNum[3];
+    gridCtrlPtNum[0] = sx;
+    gridCtrlPtNum[1] = sy;
+    gridCtrlPtNum[2] = sz;
+    return this->initializeBFFD(start, end, gridCtrlPtNum);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::initializeBFFD(const ImageType& im, const PointType& start, const PointType& end, CoordType dx, CoordType dy, CoordType dz)
+{
+    CoordType gridCtrlPtSpacing[3];
+    gridCtrlPtSpacing[0] = dx;
+    gridCtrlPtSpacing[1] = dy;
+    gridCtrlPtSpacing[2] = dz;
+    return this->initializeBFFD(im, start, end, gridCtrlPtSpacing);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::initializeBFFD(const ImageType& im, const PointType& start, const PointType& end, size_t sx, size_t sy, size_t sz)
+{
+    size_t gridCtrlPtNum[3];
+    gridCtrlPtNum[0] = sx;
+    gridCtrlPtNum[1] = sy;
+    gridCtrlPtNum[2] = sz;
+    return this->initializeBFFD(im, start, end, gridCtrlPtNum);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::initializeBFFD(const PointType& start, const PointType& end, CoordType dx, CoordType dy, CoordType dz, CoordType ds)
+{
+    CoordType gridCtrlPtSpacing[4];
+    gridCtrlPtSpacing[0] = dx;
+    gridCtrlPtSpacing[1] = dy;
+    gridCtrlPtSpacing[2] = dz;
+    gridCtrlPtSpacing[3] = ds;
+    return this->initializeBFFD(start, end, gridCtrlPtSpacing);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::initializeBFFD(const PointType& start, const PointType& end, size_t sx, size_t sy, size_t sz, size_t ss)
+{
+    size_t gridCtrlPtNum[4];
+    gridCtrlPtNum[0] = sx;
+    gridCtrlPtNum[1] = sy;
+    gridCtrlPtNum[2] = sz;
+    gridCtrlPtNum[3] = ss;
+    return this->initializeBFFD(start, end, gridCtrlPtNum);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::initializeBFFD(const ImageType& im, const PointType& start, const PointType& end, CoordType dx, CoordType dy, CoordType dz, CoordType ds)
+{
+    CoordType gridCtrlPtSpacing[4];
+    gridCtrlPtSpacing[0] = dx;
+    gridCtrlPtSpacing[1] = dy;
+    gridCtrlPtSpacing[2] = dz;
+    gridCtrlPtSpacing[3] = ds;
+    return this->initializeBFFD(im, start, end, gridCtrlPtSpacing);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusBSplineFFD<T, CoordType, DIn, DOut>::initializeBFFD(const ImageType& im, const PointType& start, const PointType& end, size_t sx, size_t sy, size_t sz, size_t ss)
+{
+    size_t gridCtrlPtNum[4];
+    gridCtrlPtNum[0] = sx;
+    gridCtrlPtNum[1] = sy;
+    gridCtrlPtNum[2] = sz;
+    gridCtrlPtNum[3] = ss;
+    return this->initializeBFFD(im, start, end, gridCtrlPtNum);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+void gtplusBSplineFFD<T, CoordType, DIn, DOut>::print(std::ostream& os) const
+{
+    using namespace std;
+
+    os << "---------------------- GTPlus BSpline Free Form Deformation ------------------" << endl;
+    os << "Define the interface for BSpline Free Form Deformation (BFFD) " << endl;
+    os << "------------------------------------------------------------------------------" << endl;
+}
+
+}}
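
BSpline0 to BSpline3 above are the standard uniform cubic B-spline basis functions, and loadLookUpTable() pre-tabulates them (together with their first and second derivatives) so that evaluation becomes a fixed four-tap blend per dimension. The property the evaluation relies on is that the four weights form a partition of unity on [0,1]. The following standalone snippet, independent of the toolbox and written in double precision, checks this numerically:

    #include <cstdio>

    static double B0(double t) { return (1-t)*(1-t)*(1-t)/6.0; }
    static double B1(double t) { return (3*t*t*t - 6*t*t + 4)/6.0; }
    static double B2(double t) { return (-3*t*t*t + 3*t*t + 3*t + 1)/6.0; }
    static double B3(double t) { return (t*t*t)/6.0; }

    int main()
    {
        for (int i = 0; i <= 10; i++)
        {
            double t = i / 10.0;
            // The four cubic B-spline weights sum to 1 for every t in [0,1].
            std::printf("t=%.1f  sum=%.12f\n", t, B0(t) + B1(t) + B2(t) + B3(t));
        }
        return 0;
    }
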
diff --git a/toolboxes/gtplus/algorithm/FreeFormDeformation/gtplusBSplineFFD2D.h b/toolboxes/gtplus/algorithm/FreeFormDeformation/gtplusBSplineFFD2D.h
new file mode 100644
index 0000000..2160c96
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/FreeFormDeformation/gtplusBSplineFFD2D.h
@@ -0,0 +1,597 @@
+/** \file       gtplusBSplineFFD2D.h
+    \brief      Implement gtPlus 2D BSpline FreeFormDeformation
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtplusBSplineFFD.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T, typename CoordType, unsigned int DOut>
+class gtplusBSplineFFD2D : public gtplusBSplineFFD<T, CoordType, 2, DOut>
+{
+public:
+
+    typedef gtplusBSplineFFD<T, CoordType, 2, DOut> BaseClass;
+    typedef gtplusBSplineFFD2D<T, CoordType, DOut> Self;
+
+    typedef typename BaseClass::real_value_type real_value_type;
+    typedef real_value_type bspline_float_type;
+
+    typedef typename BaseClass::coord_type coord_type;
+
+    using BaseClass::D;
+    using BaseClass::BSPLINELUTSIZE;
+    using BaseClass::BSPLINEPADDINGSIZE;
+
+    typedef typename BaseClass::LUTType             LUTType;
+    typedef typename BaseClass::CoordArrayType      CoordArrayType;
+    typedef typename BaseClass::ValueArrayType      ValueArrayType;
+    typedef typename BaseClass::ArrayType           ArrayType;
+    typedef typename BaseClass::FFDCtrlPtGridType   FFDCtrlPtGridType;
+    typedef typename BaseClass::PointType           PointType;
+    typedef typename BaseClass::ImageType           ImageType;
+
+    /// constructors
+    gtplusBSplineFFD2D();
+    /// define the FFD over a region with specific control point spacing
+    gtplusBSplineFFD2D(const PointType& start, const PointType& end, CoordType dx, CoordType dy);
+    /// define the FFD over the image region with specific control point spacing
+    gtplusBSplineFFD2D(const ImageType& im, CoordType dx, CoordType dy);
+    /// define the FFD over the image region with specific number of control points
+    gtplusBSplineFFD2D(const ImageType& im, size_t sx, size_t sy);
+    /// define the FFD over an array region with specific number of control points
+    gtplusBSplineFFD2D(const ArrayType& a, size_t sx, size_t sy);
+    /// copy constructor
+    gtplusBSplineFFD2D(const Self& bffd);
+
+    virtual ~gtplusBSplineFFD2D();
+
+    /// evaluate the FFD at a grid location
+    virtual bool evaluateFFD(const CoordType pt[D], T r[DOut]) const;
+    virtual bool evaluateFFD(CoordType px, CoordType py, T r[DOut]) const;
+
+    virtual bool evaluateFFDDX(const CoordType pt[D], T dx[DOut]) const;
+    virtual bool evaluateFFDDY(const CoordType pt[D], T dy[DOut]) const;
+
+    virtual bool evaluateWorldDX(const CoordType pt[D], T dx[DOut]) const;
+    virtual bool evaluateWorldDY(const CoordType pt[D], T dy[DOut]) const;
+
+    /// evaluate the 1st order derivative of FFD at a grid location
+    virtual bool evaluateFFDDerivative(const CoordType pt[D], T deriv[D][DOut]) const;
+    virtual bool evaluateFFDDerivative(CoordType px, CoordType py, T deriv[D][DOut]) const;
+
+    /// evaluate the 2nd order derivative of FFD at a grid location
+    /// dderiv : D*D vector, stores dxx dxy dxz ...; dyx dyy dyz ...; dzx dzy dzz ...
+    virtual bool evaluateFFDSecondOrderDerivative(const CoordType pt[D], T dderiv[D*D][DOut]) const;
+    virtual bool evaluateFFDSecondOrderDerivative(CoordType px, CoordType py, T dderiv[D*D][DOut]) const;
+
+    /// compute the FFD approximation once
+    /// pos : the position of input points, 2 by N
+    /// value : the value on input points, DOut by N
+    /// residual : the approximation residual after computing FFD, DOut by N
+    virtual bool ffdApprox(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N);
+
+    /// As suggested in ref [2], the BSpline FFD can be refined to achieve a better approximation
+    virtual bool refine();
+
+    /// general print function
+    virtual void print(std::ostream& os) const;
+
+    using BaseClass::performTiming_;
+    using BaseClass::debugFolder_;
+
+protected:
+
+    using BaseClass::ctrl_pt_;
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+    using BaseClass::LUT_;
+    using BaseClass::LUT1_;
+    using BaseClass::LUT2_;
+
+    /// evaluate the FFD
+    /// px and py are at FFD grid
+    /// ordx, ordy indicates the order of derivative; 0/1/2 for 0/1st/2nd derivative
+    virtual bool evaluateFFD2D(CoordType px, CoordType py, size_t ordx, size_t ordy, T r[DOut]) const;
+};
+
+template <typename T, typename CoordType, unsigned int DOut>
+gtplusBSplineFFD2D<T, CoordType, DOut>::gtplusBSplineFFD2D() : BaseClass()
+{
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+gtplusBSplineFFD2D<T, CoordType, DOut>::gtplusBSplineFFD2D(const PointType& start, const PointType& end, CoordType dx, CoordType dy) : BaseClass()
+{
+    GADGET_CHECK_THROW(this->initializeBFFD(start, end, dx, dy));
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+gtplusBSplineFFD2D<T, CoordType, DOut>::gtplusBSplineFFD2D(const ImageType& im, CoordType dx, CoordType dy) : BaseClass()
+{
+    typename ImageType::coord_type x, y;
+
+    PointType start, end;
+
+    im.image_to_world( (size_t)0, (size_t)0, x, y);
+    start(0) = x;
+    start(1) = y;
+
+    im.image_to_world(im.get_size(0)-1, im.get_size(1)-1, x, y);
+    end(0) = x;
+    end(1) = y;
+
+    GADGET_CHECK_THROW(this->initializeBFFD(im, start, end, dx, dy));
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+gtplusBSplineFFD2D<T, CoordType, DOut>::gtplusBSplineFFD2D(const ImageType& im, size_t sx, size_t sy) : BaseClass()
+{
+    PointType start, end;
+
+    typename ImageType::coord_type x, y;
+
+    im.image_to_world( (size_t)0, (size_t)0, x, y);
+    start(0) = x;
+    start(1) = y;
+
+    im.image_to_world(im.get_size(0)-1, im.get_size(1)-1, x, y);
+    end(0) = x;
+    end(1) = y;
+
+    GADGET_CHECK_THROW(this->initializeBFFD(im, start, end, sx, sy));
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+gtplusBSplineFFD2D<T, CoordType, DOut>::gtplusBSplineFFD2D(const ArrayType& a, size_t sx, size_t sy) : BaseClass()
+{
+    PointType start, end;
+
+    start(0) = 0;
+    start(1) = 0;
+
+    end(0) = (CoordType)(a.get_size(0)-1);
+    end(1) = (CoordType)(a.get_size(1)-1);
+
+    GADGET_CHECK_THROW(this->initializeBFFD(start, end, sx, sy));
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+gtplusBSplineFFD2D<T, CoordType, DOut>::gtplusBSplineFFD2D(const Self& bffd) : BaseClass()
+{
+    unsigned int d;
+    for ( d=0; d<DOut; d++ )
+    {
+        this->ctrl_pt_[d].copyFrom( bffd.get_ctrl_pt(d) );
+    }
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+gtplusBSplineFFD2D<T, CoordType, DOut>::~gtplusBSplineFFD2D()
+{
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+bool gtplusBSplineFFD2D<T, CoordType, DOut>::evaluateFFD2D(CoordType px, CoordType py, size_t ordx, size_t ordy, T r[DOut]) const
+{
+    try
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE( (px>=-2) && (px<=this->get_size(0)+1) );
+        GADGET_DEBUG_CHECK_RETURN_FALSE( (py>=-2) && (py<=this->get_size(1)+1) );
+
+        GADGET_DEBUG_CHECK_RETURN_FALSE(ordx<=2);
+        GADGET_DEBUG_CHECK_RETURN_FALSE(ordy<=2);
+        GADGET_DEBUG_CHECK_RETURN_FALSE(ordx+ordy<=2);
+
+        long long ix = (long long)std::floor(px);
+        CoordType deltaX = px-(CoordType)ix;
+        long long lx = FFD_MKINT(BSPLINELUTSIZE*deltaX);
+
+        long long iy = (long long)std::floor(py);
+        CoordType deltaY = py-(CoordType)iy;
+        long long ly = FFD_MKINT(BSPLINELUTSIZE*deltaY);
+
+        unsigned int d, jj;
+        size_t offset[4];
+        offset[0] = this->calculate_offset(ix-1, iy-1);
+        offset[1] = this->calculate_offset(ix-1, iy);
+        offset[2] = this->calculate_offset(ix-1, iy+1);
+        offset[3] = this->calculate_offset(ix-1, iy+2);
+
+        const LUTType* p_xLUT= &this->LUT_;
+        const LUTType* p_yLUT= &this->LUT_;
+
+        if ( ordx == 1 )
+        {
+            p_xLUT= &this->LUT1_;
+        }
+        else if ( ordx == 2 )
+        {
+            p_xLUT= &this->LUT2_;
+        }
+
+        if ( ordy == 1 )
+        {
+            p_yLUT= &this->LUT1_;
+        }
+        else if ( ordy == 2 )
+        {
+            p_yLUT= &this->LUT2_;
+        }
+
+        const LUTType& xLUT= *p_xLUT;
+        const LUTType& yLUT= *p_yLUT;
+
+        for ( d=0; d<DOut; d++ )
+        {
+            r[d] = 0;
+
+            T v(0);
+            for (jj=0; jj<4; jj++)
+            {
+                v =   ( this->ctrl_pt_[d](offset[jj]  ) * xLUT[lx][0] )
+                    + ( this->ctrl_pt_[d](offset[jj]+1) * xLUT[lx][1] )
+                    + ( this->ctrl_pt_[d](offset[jj]+2) * xLUT[lx][2] )
+                    + ( this->ctrl_pt_[d](offset[jj]+3) * xLUT[lx][3] );
+
+                r[d] += v * yLUT[ly][jj];
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in evaluateFFD2D(CoordType px, CoordType py, size_t ordx, size_t ordy, T r[DOut]) const ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD2D<T, CoordType, DOut>::evaluateFFD(const CoordType pt[D], T r[DOut]) const
+{
+    return this->evaluateFFD2D(pt[0], pt[1], 0, 0, r);
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD2D<T, CoordType, DOut>::evaluateFFD(CoordType px, CoordType py, T r[DOut]) const
+{
+    return this->evaluateFFD2D(px, py, 0, 0, r);
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD2D<T, CoordType, DOut>::evaluateFFDDX(const CoordType pt[D], T dx[DOut]) const
+{
+    return this->evaluateFFD2D(pt[0], pt[1], 1, 0, dx);
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD2D<T, CoordType, DOut>::evaluateFFDDY(const CoordType pt[D], T dy[DOut]) const
+{
+    return this->evaluateFFD2D(pt[0], pt[1], 0, 1, dy);
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD2D<T, CoordType, DOut>::evaluateWorldDX(const CoordType pt[D], T dx[DOut]) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD2D(pt[0], pt[1], 1, 0, dx));
+    coord_type sx = coord_type(1.0)/this->get_spacing(0);
+    unsigned int d;
+    for ( d=0; d<DOut; d++ )
+    {
+        dx[d] *= sx;
+    }
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD2D<T, CoordType, DOut>::evaluateWorldDY(const CoordType pt[D], T dy[DOut]) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD2D(pt[0], pt[1], 0, 1, dy));
+    coord_type sy = coord_type(1.0)/this->get_spacing(1);
+    unsigned int d;
+    for ( d=0; d<DOut; d++ )
+    {
+        dy[d] *= sy;
+    }
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD2D<T, CoordType, DOut>::evaluateFFDDerivative(const CoordType pt[D], T deriv[D][DOut]) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD2D(pt[0], pt[1], 1, 0, deriv[0]));
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD2D(pt[0], pt[1], 0, 1, deriv[1]));
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD2D<T, CoordType, DOut>::evaluateFFDDerivative(CoordType px, CoordType py, T deriv[D][DOut]) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD2D(px, py, 1, 0, deriv[0]));
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD2D(px, py, 0, 1, deriv[1]));
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD2D<T, CoordType, DOut>::evaluateFFDSecondOrderDerivative(const CoordType pt[D], T dderiv[D*D][DOut]) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD2D(pt[0], pt[1], 2, 0, dderiv[0]));
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD2D(pt[0], pt[1], 1, 1, dderiv[1]));
+    memcpy(dderiv[2], dderiv[1], DOut*sizeof(T));
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD2D(pt[0], pt[1], 0, 2, dderiv[3]));
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD2D<T, CoordType, DOut>::evaluateFFDSecondOrderDerivative(CoordType px, CoordType py, T dderiv[D*D][DOut]) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD2D(px, py, 2, 0, dderiv[0]));
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD2D(px, py, 1, 1, dderiv[1]));
+    memcpy(dderiv[2], dderiv[1], DOut*sizeof(T));
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD2D(px, py, 0, 2, dderiv[3]));
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+bool gtplusBSplineFFD2D<T, CoordType, DOut>::ffdApprox(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(pos.get_size(0)==2);
+        GADGET_CHECK_RETURN_FALSE(pos.get_size(1)==N);
+
+        GADGET_CHECK_RETURN_FALSE(value.get_size(0)==DOut);
+        GADGET_CHECK_RETURN_FALSE(value.get_size(1)==N);
+
+        if ( !residual.dimensions_equal(&value) )
+        {
+            residual.create(value.get_dimensions());
+            Gadgetron::clear(residual);
+        }
+
+        size_t sx = this->get_size(0);
+        size_t sy = this->get_size(1);
+
+        /// following the definition of ref[2]
+        ho3DArray<T> dx(sx, sy, DOut), ds(sx, sy, DOut);
+        Gadgetron::clear(dx);
+        Gadgetron::clear(ds);
+
+        /// compute the current approximation values
+        ValueArrayType approxValue;
+        approxValue = value;
+
+        /// compute current residual
+        GADGET_CHECK_RETURN_FALSE(this->evaluateFFDArray(pos, approxValue));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::subtract(value, approxValue, residual));
+
+        /// compute the update of control points
+        unsigned int d;
+
+        long long n;
+        for (n=0; n<(long long)N; n++)
+        {
+            coord_type px = pos(0, n);
+            coord_type py = pos(1, n);
+
+            if ( px<-2 || px>sx+2 || py<-2 || py>sy+2 )
+            {
+                continue;
+            }
+
+            long long ix = (long long)std::floor(px);
+            CoordType deltaX = px-(CoordType)ix;
+
+            long long iy = (long long)std::floor(py);
+            CoordType deltaY = py-(CoordType)iy;
+
+            long long i, j, I, J;
+
+            T dist=0, v, vv, vvv;
+            for (j=0; j<4; j++)
+            {
+                for (i=0; i<4; i++)
+                {
+                    v = this->BSpline(i, deltaX) * this->BSpline(j, deltaY);
+                    dist += v*v;
+                }
+            }
+
+            for (j=0; j<4; j++)
+            {
+                J = j + iy - 1;
+                if ( (J>=0) && (J<(long long)sy) )
+                {
+                    for (i=0; i<4; i++)
+                    {
+                        I = i + ix - 1;
+                        if ( (I>=0) && (I<(long long)sx) )
+                        {
+                            v = this->BSpline(i, deltaX) * this->BSpline(j, deltaY);
+                            vv = v*v;
+                            vvv = vv*v;
+
+                            for ( d=0; d<DOut; d++ )
+                            {
+                                dx(I, J, d) += vvv*residual(d, n)/dist;
+                                ds(I, J, d) += vv;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        /// update the control point values
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::addEpsilon(ds));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::divide(dx, ds, dx));
+
+        std::vector<size_t> startND(2, BSPLINEPADDINGSIZE), size(2);
+        size[0] = sx;
+        size[1] = sy;
+
+        hoNDArray<T> ctrlPtWithoutPadding(sx, sy);
+
+        for ( d=0; d<DOut; d++ )
+        {
+            hoNDArray<T> dx2D(sx, sy, dx.begin()+d*sx*sy);
+
+            std::vector<size_t> dim;
+            this->ctrl_pt_[d].get_dimensions(dim);
+            hoNDArray<T> tmpCtrlPt(dim, this->ctrl_pt_[d].begin(), false);
+            Gadgetron::cropUpTo11DArray(tmpCtrlPt, ctrlPtWithoutPadding, startND, size);
+            Gadgetron::add(ctrlPtWithoutPadding, dx2D, ctrlPtWithoutPadding);
+            Gadgetron::setSubArrayUpTo11DArray(ctrlPtWithoutPadding, tmpCtrlPt, startND, size);
+        }
+
+        /*for (j=0; j<sy; j++)
+        {
+            for (i=0; i<sx; i++)
+            {
+                for ( d=0; d<DOut; d++ )
+                {
+                    if ( ds(i, j, d) > 0)
+                    {
+                        this->ctrl_pt_[d](i, j) += dx(i, j, d)/ds(i, j, d);
+                    }
+                }
+            }
+        }*/
+
+        /// calculate residual error
+        totalResidual = 0;
+        GADGET_CHECK_RETURN_FALSE(this->evaluateFFDArray(pos, approxValue));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::subtract(value, approxValue, residual));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::norm2(residual, totalResidual));
+        totalResidual = totalResidual / (real_value_type)N;
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in ffdApprox(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+bool gtplusBSplineFFD2D<T, CoordType, DOut>::refine()
+{
+    try
+    {
+        size_t sx = this->get_size(0);
+        size_t sy = this->get_size(1);
+
+        /// the refined control point grid definition
+
+        std::vector<size_t> dim(2);
+        dim[0] = 2*sx-1 + 2*BSPLINEPADDINGSIZE;
+        dim[1] = 2*sy-1 + 2*BSPLINEPADDINGSIZE;
+
+        std::vector<coord_type> spacing;
+        this->get_spacing(spacing);
+        spacing[0] /= 2;
+        spacing[1] /= 2;
+
+        std::vector<coord_type> oldOrigin;
+        this->ctrl_pt_[0].get_origin(oldOrigin);
+
+        std::vector<coord_type> gridOrigin(2);
+        this->ctrl_pt_[0].image_to_world( (CoordType)(BSPLINEPADDINGSIZE), (CoordType)(BSPLINEPADDINGSIZE), gridOrigin[0], gridOrigin[1]);
+
+        std::vector<coord_type> origin(2);
+        origin[0] = (oldOrigin[0] + gridOrigin[0])/2;
+        origin[1] = (oldOrigin[1] + gridOrigin[1])/2;
+
+        typename ImageType::axis_type axis;
+        this->ctrl_pt_[0].get_axis(axis);
+
+        /// allocate new control points
+        FFDCtrlPtGridType new_ctrl_pt[DOut];
+
+        unsigned int d;
+        for( d=0; d<DOut; d++ )
+        {
+            new_ctrl_pt[d].create(dim, spacing, origin, axis);
+            Gadgetron::clear(new_ctrl_pt[d]);
+        }
+
+        /// refinement weights, see ref[2]
+        T w[2][3];
+
+        w[0][0] = T(0.125); w[0][1] = T(0.75);  w[0][2] = T(0.125);
+        w[1][0] = 0;        w[1][1] = T(0.5);   w[1][2] = T(0.5);
+
+        /// compute refined control point values
+        int x, y, i_new, j_new, i_old, j_old;
+        for (y=0; y<sy; y++)
+        {
+            for (x=0; x<sx; x++)
+            {
+                for (j_new=0; j_new<2; j_new++)
+                {
+                    for (i_new=0; i_new<2; i_new++)
+                    {
+                        size_t offsetNew = new_ctrl_pt[0].calculate_offset(2*x+i_new+BSPLINEPADDINGSIZE, 2*y+j_new+BSPLINEPADDINGSIZE);
+
+                        for (j_old=0; j_old<3; j_old++)
+                        {
+                            for (i_old=0; i_old<3; i_old++)
+                            {
+                                size_t offsetOld = this->calculate_offset(x+i_old-1, y+j_old-1);
+                                for ( d=0; d<DOut; d++ )
+                                {
+                                    new_ctrl_pt[d](offsetNew) += w[i_new][i_old]*w[j_new][j_old] * this->ctrl_pt_[d](offsetOld);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
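+        /// move the refined grids into ctrl_pt_ without copying: create() is handed the
+        /// new_ctrl_pt buffer together with ownership, and the temporary grids give up
+        /// their deletion rights so the memory is freed exactly once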
+        for ( d=0; d<DOut; d++ )
+        {
+            this->ctrl_pt_[d].create(dim, spacing, origin, axis, new_ctrl_pt[d].begin(), true);
+            new_ctrl_pt[d].delete_data_on_destruct(false);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in refine() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+void gtplusBSplineFFD2D<T, CoordType, DOut>::print(std::ostream& os) const
+{
+    using namespace std;
+
+    os << "---------------------- GTPlus BSpline 2D Free Form Deformation ------------------" << endl;
+    os << "Implement 2D BSpline Free Form Deformation (BFFD) " << endl;
+
+    std::string elemTypeName = std::string(typeid(T).name());
+    os << "FFD value type is : " << elemTypeName << endl;
+
+    elemTypeName = std::string(typeid(CoordType).name());
+    os << "FFD coord type is : " << elemTypeName << endl;
+
+    os << "Output dimension is : " << DOut << endl;
+    os << "---------------------------------------------------" << endl;
+    os << "BFFD grid information : " << endl;
+    this->ctrl_pt_[0].printContent(os);
+    os << "------------------------------------------------------------------------------" << endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/FreeFormDeformation/gtplusBSplineFFD3D.h b/toolboxes/gtplus/algorithm/FreeFormDeformation/gtplusBSplineFFD3D.h
new file mode 100644
index 0000000..1f602c3
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/FreeFormDeformation/gtplusBSplineFFD3D.h
@@ -0,0 +1,740 @@
+/** \file       gtplusBSplineFFD3D.h
+    \brief      Implement gtPlus 3D BSpline FreeFormDeformation
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtplusBSplineFFD.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T, typename CoordType, unsigned int DOut>
+class gtplusBSplineFFD3D : public gtplusBSplineFFD<T, CoordType, 3, DOut>
+{
+public:
+
+    typedef gtplusBSplineFFD<T, CoordType, 3, DOut> BaseClass;
+    typedef gtplusBSplineFFD3D<T, CoordType, DOut> Self;
+
+    typedef typename BaseClass::real_value_type real_value_type;
+    typedef real_value_type bspline_float_type;
+
+    typedef typename BaseClass::coord_type coord_type;
+
+    enum { D = 3 };
+    using BaseClass::BSPLINELUTSIZE;
+    using BaseClass::BSPLINEPADDINGSIZE;
+
+    typedef typename BaseClass::LUTType             LUTType;
+    typedef typename BaseClass::CoordArrayType      CoordArrayType;
+    typedef typename BaseClass::ValueArrayType      ValueArrayType;
+    typedef typename BaseClass::ArrayType           ArrayType;
+    typedef typename BaseClass::FFDCtrlPtGridType   FFDCtrlPtGridType;
+    typedef typename BaseClass::PointType           PointType;
+    typedef typename BaseClass::ImageType           ImageType;
+
+    /// constructors
+    gtplusBSplineFFD3D();
+    /// define the FFD over a region with specific control point spacing
+    gtplusBSplineFFD3D(const PointType& start, const PointType& end, CoordType dx, CoordType dy, CoordType dz);
+    /// define the FFD over the image region with specific control point spacing
+    gtplusBSplineFFD3D(const ImageType& im, CoordType dx, CoordType dy, CoordType dz);
+    /// define the FFD over the image region with specific number of control points
+    gtplusBSplineFFD3D(const ImageType& im, size_t sx, size_t sy, size_t sz);
+    /// define the FFD over an array region with specific number of control points
+    gtplusBSplineFFD3D(const ArrayType& a, size_t sx, size_t sy, size_t sz);
+    /// copy constructor
+    gtplusBSplineFFD3D(const Self& bffd);
+
+    virtual ~gtplusBSplineFFD3D();
+
+    /// evaluate the FFD at a grid location
+    virtual bool evaluateFFD(const CoordType pt[D], T r[DOut]) const;
+    virtual bool evaluateFFD(CoordType px, CoordType py, CoordType pz, T r[DOut]) const;
+
+    virtual bool evaluateFFDDX(const CoordType pt[D], T dx[DOut]) const;
+    virtual bool evaluateFFDDY(const CoordType pt[D], T dy[DOut]) const;
+    virtual bool evaluateFFDDZ(const CoordType pt[D], T dz[DOut]) const;
+
+    virtual bool evaluateWorldDX(const CoordType pt[D], T dx[DOut]) const;
+    virtual bool evaluateWorldDY(const CoordType pt[D], T dy[DOut]) const;
+    virtual bool evaluateWorldDZ(const CoordType pt[D], T dz[DOut]) const;
+
+    /// evaluate the 1st order derivative of FFD at a grid location
+    virtual bool evaluateFFDDerivative(const CoordType pt[D], T deriv[D][DOut]) const;
+    virtual bool evaluateFFDDerivative(CoordType px, CoordType py, CoordType pz, T deriv[D][DOut]) const;
+
+    /// evaluate the 2nd order derivative of FFD at a grid location
+    /// dderiv : D*D vector, stores dxx dxy dxz ...; dyx dyy dyz ...; dzx dzy dzz ...
+    virtual bool evaluateFFDSecondOrderDerivative(const CoordType pt[D], T dderiv[D*D][DOut]) const;
+    virtual bool evaluateFFDSecondOrderDerivative(CoordType px, CoordType py, CoordType pz, T dderiv[D*D][DOut]) const;
+
+    /// compute the FFD approximation once
+    /// pos : the position of input points, D by N
+    /// value : the value on input points, DOut by N
+    /// residual : the approximation residual after computing FFD, DOut by N
+    virtual bool ffdApprox(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N);
+
+    /// As suggested in ref [2], the BSpline FFD can be refined to achieve better approximation
+    virtual bool refine();
+
+    /// general print function
+    virtual void print(std::ostream& os) const;
+
+    using BaseClass::performTiming_;
+    using BaseClass::debugFolder_;
+
+protected:
+
+    using BaseClass::ctrl_pt_;
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+    using BaseClass::LUT_;
+    using BaseClass::LUT1_;
+    using BaseClass::LUT2_;
+
+    /// evaluate the FFD
+    /// px, py and pz are at the FFD grid
+    /// ordx, ordy, ordz indicate the order of derivative along each axis; 0/1/2 for 0/1st/2nd derivative
+    virtual bool evaluateFFD3D(CoordType px, CoordType py, CoordType pz, size_t ordx, size_t ordy, size_t ordz, T r[DOut]) const;
+};
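+
+/// ---------------------------------------------------------------------------------
+/// Minimal usage sketch (illustrative only; im, pos, value, residual, N, px, py, pz
+/// are placeholders, and T = float, CoordType = float, DOut = 3 is assumed):
+///
+///     gtplusBSplineFFD3D<float, float, 3> bffd(im, 16, 16, 16);  // 16x16x16 control points over image im
+///     float totalResidual = 0;
+///     bffd.ffdApprox(pos, value, residual, totalResidual, N);    // one approximation pass
+///     bffd.refine();                                             // halve the control point spacing
+///     bffd.ffdApprox(pos, value, residual, totalResidual, N);    // approximate again on the finer grid
+///
+///     float r[3];
+///     bffd.evaluateFFD(px, py, pz, r);                           // sample the deformation at a grid point
+/// ---------------------------------------------------------------------------------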
+
+template <typename T, typename CoordType, unsigned int DOut>
+gtplusBSplineFFD3D<T, CoordType, DOut>::gtplusBSplineFFD3D() : BaseClass()
+{
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+gtplusBSplineFFD3D<T, CoordType, DOut>::gtplusBSplineFFD3D(const PointType& start, const PointType& end, CoordType dx, CoordType dy, CoordType dz) : BaseClass()
+{
+    GADGET_CHECK_THROW(this->initializeBFFD(start, end, dx, dy, dz));
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+gtplusBSplineFFD3D<T, CoordType, DOut>::gtplusBSplineFFD3D(const ImageType& im, CoordType dx, CoordType dy, CoordType dz) : BaseClass()
+{
+    PointType start, end;
+
+    typename ImageType::coord_type x, y, z;
+
+    im.image_to_world( (size_t)0, (size_t)0, (size_t)0, x, y, z);
+    start(0) = x;
+    start(1) = y;
+    start(2) = z;
+
+    im.image_to_world(im.get_size(0)-1, im.get_size(1)-1, im.get_size(2)-1, x, y, z);
+    end(0) = x;
+    end(1) = y;
+    end(2) = z;
+
+    GADGET_CHECK_THROW(this->initializeBFFD(im, start, end, dx, dy, dz));
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+gtplusBSplineFFD3D<T, CoordType, DOut>::gtplusBSplineFFD3D(const ImageType& im, size_t sx, size_t sy, size_t sz) : BaseClass()
+{
+    PointType start, end;
+
+    typename ImageType::coord_type x, y, z;
+
+    im.image_to_world( (size_t)0, (size_t)0, (size_t)0, x, y, z);
+    start(0) = x;
+    start(1) = y;
+    start(2) = z;
+
+    im.image_to_world(im.get_size(0)-1, im.get_size(1)-1, im.get_size(2)-1, x, y, z);
+    end(0) = x;
+    end(1) = y;
+    end(2) = z;
+
+    GADGET_CHECK_THROW(this->initializeBFFD(im, start, end, sx, sy, sz));
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+gtplusBSplineFFD3D<T, CoordType, DOut>::gtplusBSplineFFD3D(const ArrayType& a, size_t sx, size_t sy, size_t sz) : BaseClass()
+{
+    PointType start, end;
+
+    start(0) = 0;
+    start(1) = 0;
+    start(2) = 0;
+
+    end(0) = a.get_size(0)-1;
+    end(1) = a.get_size(1)-1;
+    end(2) = a.get_size(2)-1;
+
+    GADGET_CHECK_THROW(this->initializeBFFD(start, end, sx, sy, sz));
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+gtplusBSplineFFD3D<T, CoordType, DOut>::gtplusBSplineFFD3D(const Self& bffd) : BaseClass()
+{
+    unsigned int d;
+    for ( d=0; d<DOut; d++ )
+    {
+        this->ctrl_pt_[d].copyFrom( bffd.get_ctrl_pt(d) );
+    }
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+gtplusBSplineFFD3D<T, CoordType, DOut>::~gtplusBSplineFFD3D()
+{
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+bool gtplusBSplineFFD3D<T, CoordType, DOut>::evaluateFFD3D(CoordType px, CoordType py, CoordType pz, size_t ordx, size_t ordy, size_t ordz, T r[DOut]) const
+{
+    try
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE( (px>=-2) && (px<=this->get_size(0)+1) );
+        GADGET_DEBUG_CHECK_RETURN_FALSE( (py>=-2) && (py<=this->get_size(1)+1) );
+        GADGET_DEBUG_CHECK_RETURN_FALSE( (pz>=-2) && (pz<=this->get_size(2)+1) );
+
+        GADGET_DEBUG_CHECK_RETURN_FALSE(ordx<=2);
+        GADGET_DEBUG_CHECK_RETURN_FALSE(ordy<=2);
+        GADGET_DEBUG_CHECK_RETURN_FALSE(ordz<=2);
+        GADGET_DEBUG_CHECK_RETURN_FALSE(ordx+ordy+ordz<=2);
+
+        long long ix = (long long)std::floor(px);
+        CoordType deltaX = px-(CoordType)ix;
+        long long lx = FFD_MKINT(BSPLINELUTSIZE*deltaX);
+
+        long long iy = (long long)std::floor(py);
+        CoordType deltaY = py-(CoordType)iy;
+        long long ly = FFD_MKINT(BSPLINELUTSIZE*deltaY);
+
+        long long iz = (long long)std::floor(pz);
+        CoordType deltaZ = pz-(CoordType)iz;
+        long long lz = FFD_MKINT(BSPLINELUTSIZE*deltaZ);
+
+        unsigned int d, jj, kk;
+        size_t offset[4][4]; // z, y
+        offset[0][0] = this->calculate_offset(ix-1, iy-1, iz-1);
+        offset[0][1] = this->calculate_offset(ix-1, iy  , iz-1);
+        offset[0][2] = this->calculate_offset(ix-1, iy+1, iz-1);
+        offset[0][3] = this->calculate_offset(ix-1, iy+2, iz-1);
+
+        offset[1][0] = this->calculate_offset(ix-1, iy-1, iz);
+        offset[1][1] = this->calculate_offset(ix-1, iy  , iz);
+        offset[1][2] = this->calculate_offset(ix-1, iy+1, iz);
+        offset[1][3] = this->calculate_offset(ix-1, iy+2, iz);
+
+        offset[2][0] = this->calculate_offset(ix-1, iy-1, iz+1);
+        offset[2][1] = this->calculate_offset(ix-1, iy  , iz+1);
+        offset[2][2] = this->calculate_offset(ix-1, iy+1, iz+1);
+        offset[2][3] = this->calculate_offset(ix-1, iy+2, iz+1);
+
+        offset[3][0] = this->calculate_offset(ix-1, iy-1, iz+2);
+        offset[3][1] = this->calculate_offset(ix-1, iy  , iz+2);
+        offset[3][2] = this->calculate_offset(ix-1, iy+1, iz+2);
+        offset[3][3] = this->calculate_offset(ix-1, iy+2, iz+2);
+
+        const LUTType* p_xLUT= &this->LUT_;
+        const LUTType* p_yLUT= &this->LUT_;
+        const LUTType* p_zLUT= &this->LUT_;
+
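+        /// LUT_ tabulates the B-spline basis values at BSPLINELUTSIZE sub-grid positions;
+        /// LUT1_ and LUT2_ tabulate its 1st and 2nd derivatives. Pick the table that matches
+        /// the requested derivative order along each axis.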
+        if ( ordx == 1 )
+        {
+            p_xLUT= &this->LUT1_;
+        }
+        else if ( ordx == 2 )
+        {
+            p_xLUT= &this->LUT2_;
+        }
+
+        if ( ordy == 1 )
+        {
+            p_yLUT= &this->LUT1_;
+        }
+        else if ( ordy == 2 )
+        {
+            p_yLUT= &this->LUT2_;
+        }
+
+        if ( ordz == 1 )
+        {
+            p_zLUT= &this->LUT1_;
+        }
+        else if ( ordz == 2 )
+        {
+            p_zLUT= &this->LUT2_;
+        }
+
+        const LUTType& xLUT= *p_xLUT;
+        const LUTType& yLUT= *p_yLUT;
+        const LUTType& zLUT= *p_zLUT;
+
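+        /// separable tensor-product evaluation over the 4x4x4 control point support:
+        /// the inner sum applies the x weights to one row of four control points, the
+        /// middle sum weights the rows by the y LUT, and the outer sum weights the
+        /// resulting planes by the z LUT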
+        for ( d=0; d<DOut; d++ )
+        {
+            r[d] = 0;
+            for (kk=0; kk<4; kk++)
+            {
+                T rv = 0;
+                for (jj=0; jj<4; jj++)
+                {
+                    T v  =  ( this->ctrl_pt_[d](offset[kk][jj])   * xLUT[lx][0] )
+                        + ( this->ctrl_pt_[d](offset[kk][jj]+1) * xLUT[lx][1] )
+                        + ( this->ctrl_pt_[d](offset[kk][jj]+2) * xLUT[lx][2] )
+                        + ( this->ctrl_pt_[d](offset[kk][jj]+3) * xLUT[lx][3] );
+
+                    rv += v * yLUT[ly][jj];
+                }
+
+                r[d] += rv * zLUT[lz][kk];
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in evaluateFFD3D(CoordType px, CoordType py, CoordType pz, size_t ordx, size_t ordy, size_t ordz, T r[DOut]) const ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD3D<T, CoordType, DOut>::evaluateFFD(const CoordType pt[D], T r[DOut]) const
+{
+    return this->evaluateFFD3D(pt[0], pt[1], pt[2], 0, 0, 0, r);
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD3D<T, CoordType, DOut>::evaluateFFD(CoordType px, CoordType py, CoordType pz, T r[DOut]) const
+{
+    return this->evaluateFFD3D(px, py, pz, 0, 0, 0, r);
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD3D<T, CoordType, DOut>::evaluateFFDDX(const CoordType pt[D], T dx[DOut]) const
+{
+    return this->evaluateFFD3D(pt[0], pt[1], pt[2], 1, 0, 0, dx);
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD3D<T, CoordType, DOut>::evaluateFFDDY(const CoordType pt[D], T dy[DOut]) const
+{
+    return this->evaluateFFD3D(pt[0], pt[1], pt[2], 0, 1, 0, dy);
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD3D<T, CoordType, DOut>::evaluateFFDDZ(const CoordType pt[D], T dz[DOut]) const
+{
+    return this->evaluateFFD3D(pt[0], pt[1], pt[2], 0, 0, 1, dz);
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD3D<T, CoordType, DOut>::evaluateWorldDX(const CoordType pt[D], T dx[DOut]) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD3D(pt[0], pt[1], pt[2], 1, 0, 0, dx));
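+    /// convert the per-grid-unit derivative into a per-world-unit derivative by dividing
+    /// by the control point spacing along x (assumes the grid axes follow the world axes)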
+    coord_type sx = coord_type(1.0)/this->get_spacing(0);
+    unsigned int d;
+    for ( d=0; d<DOut; d++ )
+    {
+        dx[d] *= sx;
+    }
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD3D<T, CoordType, DOut>::evaluateWorldDY(const CoordType pt[D], T dy[DOut]) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD3D(pt[0], pt[1], pt[2], 0, 1, 0, dy));
+    coord_type sy = coord_type(1.0)/this->get_spacing(1);
+    unsigned int d;
+    for ( d=0; d<DOut; d++ )
+    {
+        dy[d] *= sy;
+    }
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD3D<T, CoordType, DOut>::evaluateWorldDZ(const CoordType pt[D], T dz[DOut]) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD3D(pt[0], pt[1], pt[2], 0, 0, 1, dz));
+    coord_type sz = coord_type(1.0)/this->get_spacing(2);
+    unsigned int d;
+    for ( d=0; d<DOut; d++ )
+    {
+        dz[d] *= sz;
+    }
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD3D<T, CoordType, DOut>::evaluateFFDDerivative(const CoordType pt[D], T deriv[D][DOut]) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD3D(pt[0], pt[1], pt[2], 1, 0, 0, deriv[0]));
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD3D(pt[0], pt[1], pt[2], 0, 1, 0, deriv[1]));
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD3D(pt[0], pt[1], pt[2], 0, 0, 1, deriv[2]));
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD3D<T, CoordType, DOut>::evaluateFFDDerivative(CoordType px, CoordType py, CoordType pz, T deriv[D][DOut]) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD3D(px, py, pz, 1, 0, 0, deriv[0]));
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD3D(px, py, pz, 0, 1, 0, deriv[1]));
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD3D(px, py, pz, 0, 0, 1, deriv[2]));
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD3D<T, CoordType, DOut>::evaluateFFDSecondOrderDerivative(const CoordType pt[D], T dderiv[D*D][DOut]) const
+{
+    // dxx
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD3D(pt[0], pt[1], pt[2], 2, 0, 0, dderiv[0]));
+    // dxy
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD3D(pt[0], pt[1], pt[2], 1, 1, 0, dderiv[1]));
+    // dxz
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD3D(pt[0], pt[1], pt[2], 1, 0, 1, dderiv[2]));
+    // dyx
+    memcpy(dderiv[3], dderiv[1], DOut*sizeof(T));
+    // dyy
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD3D(pt[0], pt[1], pt[2], 0, 2, 0, dderiv[4]));
+    // dyz
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD3D(pt[0], pt[1], pt[2], 0, 1, 1, dderiv[5]));
+    // dzx
+    memcpy(dderiv[6], dderiv[2], DOut*sizeof(T));
+    // dzy
+    memcpy(dderiv[7], dderiv[5], DOut*sizeof(T));
+    // dzz
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD3D(pt[0], pt[1], pt[2], 0, 0, 2, dderiv[8]));
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD3D<T, CoordType, DOut>::evaluateFFDSecondOrderDerivative(CoordType px, CoordType py, CoordType pz, T dderiv[D*D][DOut]) const
+{
+    // dxx
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD3D(px, py, pz, 2, 0, 0, dderiv[0]));
+    // dxy
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD3D(px, py, pz, 1, 1, 0, dderiv[1]));
+    // dxz
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD3D(px, py, pz, 1, 0, 1, dderiv[2]));
+    // dyx
+    memcpy(dderiv[3], dderiv[1], DOut*sizeof(T));
+    // dyy
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD3D(px, py, pz, 0, 2, 0, dderiv[4]));
+    // dyz
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD3D(px, py, pz, 0, 1, 1, dderiv[5]));
+    // dzx
+    memcpy(dderiv[6], dderiv[2], DOut*sizeof(T));
+    // dzy
+    memcpy(dderiv[7], dderiv[5], DOut*sizeof(T));
+    // dzz
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD3D(px, py, pz, 0, 0, 2, dderiv[8]));
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+bool gtplusBSplineFFD3D<T, CoordType, DOut>::ffdApprox(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(pos.get_size(0)==D);
+        GADGET_CHECK_RETURN_FALSE(pos.get_size(1)==N);
+
+        GADGET_CHECK_RETURN_FALSE(value.get_size(0)==DOut);
+        GADGET_CHECK_RETURN_FALSE(value.get_size(1)==N);
+
+        if ( !residual.dimensions_equal(&value) )
+        {
+            residual.create(value.get_dimensions());
+            Gadgetron::clear(residual);
+        }
+
+        size_t sx = this->get_size(0);
+        size_t sy = this->get_size(1);
+        size_t sz = this->get_size(2);
+
+        /// following the definition of ref[2]
+        ho4DArray<T> dx(sx, sy, sz, DOut), ds(sx, sy, sz, DOut);
+        Gadgetron::clear(dx);
+        Gadgetron::clear(ds);
+
+        /// compute the current approximation values
+        ValueArrayType approxValue;
+        approxValue = value;
+
+        /// compute current residual
+        GADGET_CHECK_RETURN_FALSE(this->evaluateFFDArray(pos, approxValue));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::subtract(value, approxValue, residual));
+
+        /// compute the update of control points
+        unsigned int d;
+
+        long long n;
+        for (n=0; n<N; n++)
+        {
+            coord_type px = pos(0, n);
+            coord_type py = pos(1, n);
+            coord_type pz = pos(2, n);
+
+            if ( px<-2 || px>sx+2
+                || py<-2 || py>sy+2
+                || pz<-2 || pz>sz+2 )
+            {
+                continue;
+            }
+
+            long long ix = (long long)std::floor(px);
+            CoordType deltaX = px-(CoordType)ix;
+
+            long long iy = (long long)std::floor(py);
+            CoordType deltaY = py-(CoordType)iy;
+
+            long long iz = (long long)std::floor(pz);
+            CoordType deltaZ = pz-(CoordType)iz;
+
+            long long i, j, k, I, J, K;
+
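+            /// dist is the sum of squared B-spline weights over the 4x4x4 support of this
+            /// sample; following ref[2], each control point then accumulates w^3 * residual / dist
+            /// into dx and w^2 into ds, and the final update is the weighted average dx/ds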
+            T dist=0, v, vv, vvv;
+            for (k=0; k<4; k++)
+            {
+                for (j=0; j<4; j++)
+                {
+                    for (i=0; i<4; i++)
+                    {
+                        v = this->BSpline(i, deltaX) * this->BSpline(j, deltaY) * this->BSpline(k, deltaZ);
+                        dist += v*v;
+                    }
+                }
+            }
+
+            for (k=0; k<4; k++)
+            {
+                K = k + iz - 1;
+                if ( (K>=0) && (K<(long long)sz) )
+                {
+                    for (j=0; j<4; j++)
+                    {
+                        J = j + iy - 1;
+                        if ( (J>=0) && (J<(long long)sy) )
+                        {
+                            for (i=0; i<4; i++)
+                            {
+                                I = i + ix - 1;
+                                if ( (I>=0) && (I<(long long)sx) )
+                                {
+                                    v = this->BSpline(i, deltaX) * this->BSpline(j, deltaY) * this->BSpline(k, deltaZ);
+                                    vv = v*v;
+                                    vvv = vv*v;
+
+                                    for ( d=0; d<DOut; d++ )
+                                    {
+                                        dx(I, J, K, d) += vvv*residual(d, n)/dist;
+                                        ds(I, J, K, d) += vv;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        /// update the control point values
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::addEpsilon(ds));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::divide(dx, ds, dx));
+
+        std::vector<size_t> startND(3, BSPLINEPADDINGSIZE), size(3);
+        size[0] = sx;
+        size[1] = sy;
+        size[2] = sz;
+
+        hoNDArray<T> ctrlPtWithoutPadding(sx, sy, sz);
+
+        for ( d=0; d<DOut; d++ )
+        {
+            hoNDArray<T> dx3D(sx, sy, sz, dx.begin()+d*sx*sy*sz); // dx.begin() is a T*, so the per-component offset is in elements
+
+            std::vector<size_t> dim;
+            this->ctrl_pt_[d].get_dimensions(dim);
+            hoNDArray<T> tmpCtrlPt(dim, this->ctrl_pt_[d].begin(), false);
+            Gadgetron::cropUpTo11DArray(tmpCtrlPt, ctrlPtWithoutPadding, startND, size);
+            Gadgetron::add(ctrlPtWithoutPadding, dx3D, ctrlPtWithoutPadding);
+            Gadgetron::setSubArrayUpTo11DArray(ctrlPtWithoutPadding, tmpCtrlPt, startND, size);
+        }
+
+        /// calculate residual error
+        totalResidual = 0;
+        GADGET_CHECK_RETURN_FALSE(this->evaluateFFDArray(pos, approxValue));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::subtract(value, approxValue, residual));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::norm2(residual, totalResidual));
+        totalResidual = totalResidual / (real_value_type)N;
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in ffdApprox(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+bool gtplusBSplineFFD3D<T, CoordType, DOut>::refine()
+{
+    try
+    {
+        size_t sx = this->get_size(0);
+        size_t sy = this->get_size(1);
+        size_t sz = this->get_size(2);
+
+        /// the refined control point grid definition
+
+        std::vector<size_t> dim(3);
+        dim[0] = 2*sx-1 + 2*BSPLINEPADDINGSIZE;
+        dim[1] = 2*sy-1 + 2*BSPLINEPADDINGSIZE;
+        dim[2] = 2*sz-1 + 2*BSPLINEPADDINGSIZE;
+
+        std::vector<coord_type> spacing;
+        this->get_spacing(spacing);
+        spacing[0] /= 2;
+        spacing[1] /= 2;
+        if ( sz > 1 ) spacing[2] /= 2;
+
+        std::vector<coord_type> oldOrigin;
+        this->ctrl_pt_[0].get_origin(oldOrigin);
+
+        std::vector<coord_type> gridOrigin(3);
+        this->ctrl_pt_[0].image_to_world( (CoordType)(BSPLINEPADDINGSIZE), (CoordType)(BSPLINEPADDINGSIZE), (CoordType)(BSPLINEPADDINGSIZE), gridOrigin[0], gridOrigin[1], gridOrigin[2]);
+
+        std::vector<coord_type> origin(3);
+        origin[0] = (oldOrigin[0] + gridOrigin[0])/2;
+        origin[1] = (oldOrigin[1] + gridOrigin[1])/2;
+        origin[2] = (oldOrigin[2] + gridOrigin[2])/2;
+
+        typename ImageType::axis_type axis;
+        this->ctrl_pt_[0].get_axis(axis);
+
+        /// allocate new control points
+        FFDCtrlPtGridType new_ctrl_pt[DOut];
+
+        unsigned int d;
+        for( d=0; d<DOut; d++ )
+        {
+            new_ctrl_pt[d].create(dim, spacing, origin, axis);
+            Gadgetron::clear(new_ctrl_pt[d]);
+        }
+
+        /// refinement weights, see ref[2]
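+        /// w[0][*] are the weights for a new control point that coincides with an existing
+        /// grid node (contributions from nodes x-1, x, x+1); w[1][*] are the weights for a
+        /// new point inserted half way between two existing nodes (nodes x and x+1)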
+        T w[2][3];
+
+        w[0][0] = T(0.125); w[0][1] = T(0.75);  w[0][2] = T(0.125);
+        w[1][0] = 0;        w[1][1] = T(0.5);   w[1][2] = T(0.5);
+
+        /// compute refined control point values
+        int x, y, z, i_new, j_new, k_new, i_old, j_old, k_old;
+        if ( sz > 1 )
+        {
+            for (z=0; z<sz; z++)
+            {
+                for (y=0; y<sy; y++)
+                {
+                    for (x=0; x<sx; x++)
+                    {
+                        for (k_new=0; k_new<2; k_new++)
+                        {
+                            for (j_new=0; j_new<2; j_new++)
+                            {
+                                for (i_new=0; i_new<2; i_new++)
+                                {
+                                    size_t offsetNew = new_ctrl_pt[0].calculate_offset(2*x+i_new+BSPLINEPADDINGSIZE, 2*y+j_new+BSPLINEPADDINGSIZE, 2*z+k_new+BSPLINEPADDINGSIZE);
+
+                                    for (k_old=0; k_old<3; k_old++)
+                                    {
+                                        for (j_old=0; j_old<3; j_old++)
+                                        {
+                                            for (i_old=0; i_old<3; i_old++)
+                                            {
+                                                size_t offsetOld = this->calculate_offset(x+i_old-1, y+j_old-1, z+k_old-1);
+                                                for ( d=0; d<DOut; d++ )
+                                                {
+                                                    new_ctrl_pt[d](offsetNew) += w[i_new][i_old]*w[j_new][j_old]* w[k_new][k_old] * this->ctrl_pt_[d](offsetOld);
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            for (y=0; y<sy; y++)
+            {
+                for (x=0; x<sx; x++)
+                {
+                    for (j_new=0; j_new<2; j_new++)
+                    {
+                        for (i_new=0; i_new<2; i_new++)
+                        {
+                            size_t offsetNew = new_ctrl_pt[0].calculate_offset(2*x+i_new+BSPLINEPADDINGSIZE, 2*y+j_new+BSPLINEPADDINGSIZE, BSPLINEPADDINGSIZE);
+
+                            for (j_old=0; j_old<3; j_old++)
+                            {
+                                for (i_old=0; i_old<3; i_old++)
+                                {
+                                    size_t offsetOld = this->calculate_offset(x+i_old-1, y+j_old-1, 0);
+                                    for ( d=0; d<DOut; d++ )
+                                    {
+                                        new_ctrl_pt[d](offsetNew) += w[i_new][i_old]*w[j_new][j_old] * this->ctrl_pt_[d](offsetOld);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        for ( d=0; d<DOut; d++ )
+        {
+            this->ctrl_pt_[d].create(dim, spacing, origin, axis, new_ctrl_pt[d].begin(), true);
+            new_ctrl_pt[d].delete_data_on_destruct(false);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in refine() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+void gtplusBSplineFFD3D<T, CoordType, DOut>::print(std::ostream& os) const
+{
+    using namespace std;
+
+    os << "---------------------- GTPlus BSpline 3D Free Form Deformation ------------------" << endl;
+    os << "Implement 3D BSpline Free Form Deformation (BFFD) " << endl;
+
+    std::string elemTypeName = std::string( typeid(T).name() );
+    os << "FFD value type is : " << elemTypeName << endl;
+
+    elemTypeName = std::string( typeid(CoordType).name() );
+    os << "FFD coord type is : " << elemTypeName << endl;
+
+    os << "Output dimension is : " << DOut << endl;
+    os << "---------------------------------------------------" << endl;
+    os << "BFFD grid information : " << endl;
+    this->ctrl_pt_[0].printContent(os);
+    os << "---------------------------------------------------------------------------------" << endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/FreeFormDeformation/gtplusBSplineFFD4D.h b/toolboxes/gtplus/algorithm/FreeFormDeformation/gtplusBSplineFFD4D.h
new file mode 100644
index 0000000..25a9521
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/FreeFormDeformation/gtplusBSplineFFD4D.h
@@ -0,0 +1,905 @@
+/** \file       gtplusBSplineFFD4D.h
+    \brief      Implement gtPlus 4D BSpline FreeFormDeformation
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtplusFFDBase.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T, typename CoordType, unsigned int DOut>
+class gtplusBSplineFFD4D : public gtplusBSplineFFD<T, CoordType, 4, DOut>
+{
+public:
+
+    typedef gtplusBSplineFFD<T, CoordType, 4, DOut> BaseClass;
+    typedef gtplusBSplineFFD4D<T, CoordType, DOut> Self;
+
+    typedef typename BaseClass::real_value_type real_value_type;
+    typedef real_value_type bspline_float_type;
+
+    typedef typename BaseClass::coord_type coord_type;
+
+    enum { D = 4 };
+    using BaseClass::BSPLINELUTSIZE;
+    using BaseClass::BSPLINEPADDINGSIZE;
+
+    typedef typename BaseClass::LUTType             LUTType;
+    typedef typename BaseClass::CoordArrayType      CoordArrayType;
+    typedef typename BaseClass::ValueArrayType      ValueArrayType;
+    typedef typename BaseClass::ArrayType           ArrayType;
+    typedef typename BaseClass::FFDCtrlPtGridType   FFDCtrlPtGridType;
+    typedef typename BaseClass::PointType           PointType;
+    typedef typename BaseClass::ImageType           ImageType;
+
+    /// constructors
+    gtplusBSplineFFD4D();
+    /// define the FFD over a region with specific control point spacing
+    gtplusBSplineFFD4D(const PointType& start, const PointType& end, CoordType dx, CoordType dy, CoordType dz, CoordType ds);
+    /// define the FFD over the image region with specific control point spacing
+    gtplusBSplineFFD4D(const ImageType& im, CoordType dx, CoordType dy, CoordType dz, CoordType ds);
+    /// define the FFD over the image region with specific number of control points
+    gtplusBSplineFFD4D(const ImageType& im, size_t sx, size_t sy, size_t sz, size_t ss);
+    /// define the FFD over an array region with specific number of control points
+    gtplusBSplineFFD4D(const ArrayType& a, size_t sx, size_t sy, size_t sz, size_t ss);
+    /// copy constructor
+    gtplusBSplineFFD4D(const Self& bffd);
+
+    virtual ~gtplusBSplineFFD4D();
+
+    /// evaluate the FFD at a grid location
+    virtual bool evaluateFFD(const CoordType pt[D], T r[DOut]) const;
+    virtual bool evaluateFFD(CoordType px, CoordType py, CoordType pz, CoordType ps, T r[DOut]) const;
+
+    virtual bool evaluateFFDDX(const CoordType pt[D], T dx[DOut]) const;
+    virtual bool evaluateFFDDY(const CoordType pt[D], T dy[DOut]) const;
+    virtual bool evaluateFFDDZ(const CoordType pt[D], T dz[DOut]) const;
+    virtual bool evaluateFFDDS(const CoordType pt[D], T ds[DOut]) const;
+
+    virtual bool evaluateWorldDX(const CoordType pt[D], T dx[DOut]) const;
+    virtual bool evaluateWorldDY(const CoordType pt[D], T dy[DOut]) const;
+    virtual bool evaluateWorldDZ(const CoordType pt[D], T dz[DOut]) const;
+    virtual bool evaluateWorldDS(const CoordType pt[D], T ds[DOut]) const;
+
+    /// evaluate the 1st order derivative of FFD at a grid location
+    virtual bool evaluateFFDDerivative(const CoordType pt[D], T deriv[D][DOut]) const;
+    virtual bool evaluateFFDDerivative(CoordType px, CoordType py, CoordType pz, CoordType ps, T deriv[D][DOut]) const;
+
+    /// evaluate the 2nd order derivative of FFD at a grid location
+    /// dderiv : D*D vector, stores dxx dxy dxz dxs; dyx dyy dyz dys; dzx dzy dzz dzs; dsx dsy dsz dss
+    virtual bool evaluateFFDSecondOrderDerivative(const CoordType pt[D], T dderiv[D*D][DOut]) const;
+    virtual bool evaluateFFDSecondOrderDerivative(CoordType px, CoordType py, CoordType pz, CoordType ps, T dderiv[D*D][DOut]) const;
+
+    /// compute the FFD approximation once
+    /// pos : the position of input points, D by N
+    /// value : the value on input points, DOut by N
+    /// residual : the approximation residual after computing FFD, DOut by N
+    virtual bool ffdApprox(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N);
+
+    /// As suggested in ref [2], the BSpline FFD can be refined to achieve better approximation
+    virtual bool refine();
+
+    /// general print function
+    virtual void print(std::ostream& os) const;
+
+    using BaseClass::performTiming_;
+    using BaseClass::debugFolder_;
+
+protected:
+
+    using BaseClass::ctrl_pt_;
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+    using BaseClass::LUT_;
+    using BaseClass::LUT1_;
+    using BaseClass::LUT2_;
+
+    /// evaluate the FFD
+    /// px, py, pz and ps are at the FFD grid
+    /// ordx, ordy, ordz, ords indicate the order of derivative along each axis; 0/1/2 for 0/1st/2nd derivative
+    virtual bool evaluateFFD4D(CoordType px, CoordType py, CoordType pz, CoordType ps, size_t ordx, size_t ordy, size_t ordz, size_t ords, T r[DOut]) const;
+};
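+
+/// Usage mirrors gtplusBSplineFFD3D above, with an extra s coordinate in the constructors
+/// and evaluation calls (illustrative only; im, px, py, pz, ps are placeholders):
+///
+///     gtplusBSplineFFD4D<float, float, 3> bffd(im, 16, 16, 16, 8);
+///     float r[3];
+///     bffd.evaluateFFD(px, py, pz, ps, r);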
+
+template <typename T, typename CoordType, unsigned int DOut>
+gtplusBSplineFFD4D<T, CoordType, DOut>::gtplusBSplineFFD4D() : BaseClass()
+{
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+gtplusBSplineFFD4D<T, CoordType, DOut>::gtplusBSplineFFD4D(const PointType& start, const PointType& end, CoordType dx, CoordType dy, CoordType dz, CoordType ds) : BaseClass()
+{
+    GADGET_CHECK_THROW(this->initializeBFFD(start, end, dx, dy, dz, ds));
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+gtplusBSplineFFD4D<T, CoordType, DOut>::gtplusBSplineFFD4D(const ImageType& im, CoordType dx, CoordType dy, CoordType dz, CoordType ds) : BaseClass()
+{
+    PointType start, end;
+
+    typename ImageType::coord_type x, y, z, s;
+
+    im.image_to_world( (size_t)0, (size_t)0, (size_t)0, (size_t)0, x, y, z, s);
+    start(0) = x;
+    start(1) = y;
+    start(2) = z;
+    start(3) = s;
+
+    im.image_to_world(im.get_size(0)-1, im.get_size(1)-1, im.get_size(2)-1, im.get_size(3)-1, x, y, z, s);
+    end(0) = x;
+    end(1) = y;
+    end(2) = z;
+    end(3) = s;
+
+    GADGET_CHECK_THROW(this->initializeBFFD(im, start, end, dx, dy, dz, ds));
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+gtplusBSplineFFD4D<T, CoordType, DOut>::gtplusBSplineFFD4D(const ImageType& im, size_t sx, size_t sy, size_t sz, size_t ss) : BaseClass()
+{
+    PointType start, end;
+
+    typename ImageType::coord_type x, y, z, s;
+
+    im.image_to_world( (size_t)0, (size_t)0, (size_t)0, (size_t)0, x, y, z, s);
+    start(0) = x;
+    start(1) = y;
+    start(2) = z;
+    start(3) = s;
+
+    im.image_to_world(im.get_size(0)-1, im.get_size(1)-1, im.get_size(2)-1, im.get_size(3)-1, x, y, z, s);
+    end(0) = x;
+    end(1) = y;
+    end(2) = z;
+    end(3) = s;
+
+    GADGET_CHECK_THROW(this->initializeBFFD(im, start, end, sx, sy, sz, ss));
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+gtplusBSplineFFD4D<T, CoordType, DOut>::gtplusBSplineFFD4D(const ArrayType& a, size_t sx, size_t sy, size_t sz, size_t ss) : BaseClass()
+{
+    PointType start, end;
+
+    start(0) = 0;
+    start(1) = 0;
+    start(2) = 0;
+    start(3) = 0;
+
+    end(0) = a.get_size(0)-1;
+    end(1) = a.get_size(1)-1;
+    end(2) = a.get_size(2)-1;
+    end(3) = a.get_size(3)-1;
+
+    GADGET_CHECK_THROW(this->initializeBFFD(start, end, sx, sy, sz, ss));
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+gtplusBSplineFFD4D<T, CoordType, DOut>::gtplusBSplineFFD4D(const Self& bffd) : BaseClass()
+{
+    unsigned int d;
+    for ( d=0; d<DOut; d++ )
+    {
+        this->ctrl_pt_[d].copyFrom( bffd.get_ctrl_pt(d) );
+    }
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+gtplusBSplineFFD4D<T, CoordType, DOut>::~gtplusBSplineFFD4D()
+{
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+bool gtplusBSplineFFD4D<T, CoordType, DOut>::evaluateFFD4D(CoordType px, CoordType py, CoordType pz, CoordType ps, size_t ordx, size_t ordy, size_t ordz, size_t ords, T r[DOut]) const
+{
+    try
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE( (px>=-2) && (px<=this->get_size(0)+1) );
+        GADGET_DEBUG_CHECK_RETURN_FALSE( (py>=-2) && (py<=this->get_size(1)+1) );
+        GADGET_DEBUG_CHECK_RETURN_FALSE( (pz>=-2) && (pz<=this->get_size(2)+1) );
+        GADGET_DEBUG_CHECK_RETURN_FALSE( (ps>=-2) && (ps<=this->get_size(3)+1) );
+
+        GADGET_DEBUG_CHECK_RETURN_FALSE(ordx<=2);
+        GADGET_DEBUG_CHECK_RETURN_FALSE(ordy<=2);
+        GADGET_DEBUG_CHECK_RETURN_FALSE(ordz<=2);
+        GADGET_DEBUG_CHECK_RETURN_FALSE(ords<=2);
+        GADGET_DEBUG_CHECK_RETURN_FALSE(ordx+ordy+ordz+ords<=2);
+
+        long long ix = (long long)std::floor(px);
+        CoordType deltaX = px-(CoordType)ix;
+        long long lx = FFD_MKINT(BSPLINELUTSIZE*deltaX);
+
+        long long iy = (long long)std::floor(py);
+        CoordType deltaY = py-(CoordType)iy;
+        long long ly = FFD_MKINT(BSPLINELUTSIZE*deltaY);
+
+        long long iz = (long long)std::floor(pz);
+        CoordType deltaZ = pz-(CoordType)iz;
+        long long lz = FFD_MKINT(BSPLINELUTSIZE*deltaZ);
+
+        long long is = (long long)std::floor(ps);
+        CoordType deltaS = ps-(CoordType)is;
+        long long ls = FFD_MKINT(BSPLINELUTSIZE*deltaS);
+
+        unsigned int d, jj, kk, ss;
+        size_t offset[4][4][4]; // s, z, y
+
+        for ( ss=0; ss<4; ss++ )
+        {
+            offset[ss][0][0] = this->calculate_offset(ix-1, iy-1, iz-1, is+ss-1);
+            offset[ss][0][1] = this->calculate_offset(ix-1, iy  , iz-1, is+ss-1);
+            offset[ss][0][2] = this->calculate_offset(ix-1, iy+1, iz-1, is+ss-1);
+            offset[ss][0][3] = this->calculate_offset(ix-1, iy+2, iz-1, is+ss-1);
+
+            offset[ss][1][0] = this->calculate_offset(ix-1, iy-1, iz, is+ss-1);
+            offset[ss][1][1] = this->calculate_offset(ix-1, iy  , iz, is+ss-1);
+            offset[ss][1][2] = this->calculate_offset(ix-1, iy+1, iz, is+ss-1);
+            offset[ss][1][3] = this->calculate_offset(ix-1, iy+2, iz, is+ss-1);
+
+            offset[ss][2][0] = this->calculate_offset(ix-1, iy-1, iz+1, is+ss-1);
+            offset[ss][2][1] = this->calculate_offset(ix-1, iy  , iz+1, is+ss-1);
+            offset[ss][2][2] = this->calculate_offset(ix-1, iy+1, iz+1, is+ss-1);
+            offset[ss][2][3] = this->calculate_offset(ix-1, iy+2, iz+1, is+ss-1);
+
+            offset[ss][3][0] = this->calculate_offset(ix-1, iy-1, iz+2, is+ss-1);
+            offset[ss][3][1] = this->calculate_offset(ix-1, iy  , iz+2, is+ss-1);
+            offset[ss][3][2] = this->calculate_offset(ix-1, iy+1, iz+2, is+ss-1);
+            offset[ss][3][3] = this->calculate_offset(ix-1, iy+2, iz+2, is+ss-1);
+        }
+
+        const LUTType* p_xLUT= &this->LUT_;
+        const LUTType* p_yLUT= &this->LUT_;
+        const LUTType* p_zLUT= &this->LUT_;
+        const LUTType* p_sLUT= &this->LUT_;
+
+        if ( ordx == 1 )
+        {
+            p_xLUT= &this->LUT1_;
+        }
+        else if ( ordx == 2 )
+        {
+            p_xLUT= &this->LUT2_;
+        }
+
+        if ( ordy == 1 )
+        {
+            p_yLUT= &this->LUT1_;
+        }
+        else if ( ordy == 2 )
+        {
+            p_yLUT= &this->LUT2_;
+        }
+
+        if ( ordz == 1 )
+        {
+            p_zLUT= &this->LUT1_;
+        }
+        else if ( ordz == 2 )
+        {
+            p_zLUT= &this->LUT2_;
+        }
+
+        if ( ords == 1 )
+        {
+            p_sLUT= &this->LUT1_;
+        }
+        else if ( ords == 2 )
+        {
+            p_sLUT= &this->LUT2_;
+        }
+
+        const LUTType& xLUT= *p_xLUT;
+        const LUTType& yLUT= *p_yLUT;
+        const LUTType& zLUT= *p_zLUT;
+        const LUTType& sLUT= *p_sLUT;
+
+        for ( d=0; d<DOut; d++ )
+        {
+            r[d] = 0;
+            for (ss=0; ss<4; ss++)
+            {
+                T rs=0;
+                for (kk=0; kk<4; kk++)
+                {
+                    T rv = 0;
+                    for (jj=0; jj<4; jj++)
+                    {
+                        T v  =  ( this->ctrl_pt_[d](offset[ss][kk][jj])   * xLUT[lx][0] )
+                            + ( this->ctrl_pt_[d](offset[ss][kk][jj]+1) * xLUT[lx][1] )
+                            + ( this->ctrl_pt_[d](offset[ss][kk][jj]+2) * xLUT[lx][2] )
+                            + ( this->ctrl_pt_[d](offset[ss][kk][jj]+3) * xLUT[lx][3] );
+
+                        rv += v * yLUT[ly][jj];
+                    }
+
+                    rs += rv * zLUT[lz][kk];
+                }
+
+                r[d] += rs * sLUT[ls][ss];
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in evaluateFFD4D(CoordType px, CoordType py, CoordType pz, CoordType ps, size_t ordx, size_t ordy, size_t ordz, size_t ords, T r[DOut]) const ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD4D<T, CoordType, DOut>::evaluateFFD(const CoordType pt[D], T r[DOut]) const
+{
+    return this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 0, 0, 0, 0, r);
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD4D<T, CoordType, DOut>::evaluateFFD(CoordType px, CoordType py, CoordType pz, CoordType ps, T r[DOut]) const
+{
+    return this->evaluateFFD4D(px, py, pz, ps, 0, 0, 0, 0, r);
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD4D<T, CoordType, DOut>::evaluateFFDDX(const CoordType pt[D], T dx[DOut]) const
+{
+    return this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 1, 0, 0, 0, dx);
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD4D<T, CoordType, DOut>::evaluateFFDDY(const CoordType pt[D], T dy[DOut]) const
+{
+    return this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 0, 1, 0, 0, dy);
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD4D<T, CoordType, DOut>::evaluateFFDDZ(const CoordType pt[D], T dz[DOut]) const
+{
+    return this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 0, 0, 1, 0, dz);
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD4D<T, CoordType, DOut>::evaluateFFDDS(const CoordType pt[D], T ds[DOut]) const
+{
+    return this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 0, 0, 0, 1, ds);
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD4D<T, CoordType, DOut>::evaluateWorldDX(const CoordType pt[D], T dx[DOut]) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 1, 0, 0, 0, dx));
+    coord_type sx = coord_type(1.0)/this->get_spacing(0);
+    unsigned int d;
+    for ( d=0; d<DOut; d++ )
+    {
+        dx[d] *= sx;
+    }
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD4D<T, CoordType, DOut>::evaluateWorldDY(const CoordType pt[D], T dy[DOut]) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 0, 1, 0, 0, dy));
+    coord_type sy = coord_type(1.0)/this->get_spacing(1);
+    unsigned int d;
+    for ( d=0; d<DOut; d++ )
+    {
+        dy[d] *= sy;
+    }
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD4D<T, CoordType, DOut>::evaluateWorldDZ(const CoordType pt[D], T dz[DOut]) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 0, 0, 1, 0, dz));
+    coord_type sz = coord_type(1.0)/this->get_spacing(2);
+    unsigned int d;
+    for ( d=0; d<DOut; d++ )
+    {
+        dz[d] *= sz;
+    }
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD4D<T, CoordType, DOut>::evaluateWorldDS(const CoordType pt[D], T ds[DOut]) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 0, 0, 0, 1, ds));
+    coord_type ss = coord_type(1.0)/this->get_spacing(3);
+    unsigned int d;
+    for ( d=0; d<DOut; d++ )
+    {
+        ds[d] *= ss;
+    }
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD4D<T, CoordType, DOut>::evaluateFFDDerivative(const CoordType pt[D], T deriv[D][DOut]) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 1, 0, 0, 0, deriv[0]));
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 0, 1, 0, 0, deriv[1]));
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 0, 0, 1, 0, deriv[2]));
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 0, 0, 0, 1, deriv[3]));
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD4D<T, CoordType, DOut>::evaluateFFDDerivative(CoordType px, CoordType py, CoordType pz, CoordType ps, T deriv[D][DOut]) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(px, py, pz, ps, 1, 0, 0, 0, deriv[0]));
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(px, py, pz, ps, 0, 1, 0, 0, deriv[1]));
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(px, py, pz, ps, 0, 0, 1, 0, deriv[2]));
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(px, py, pz, ps, 0, 0, 0, 1, deriv[3]));
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD4D<T, CoordType, DOut>::evaluateFFDSecondOrderDerivative(const CoordType pt[D], T dderiv[D*D][DOut]) const
+{
+    // dxx
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 2, 0, 0, 0, dderiv[0]));
+    // dxy
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 1, 1, 0, 0, dderiv[1]));
+    // dxz
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 1, 0, 1, 0, dderiv[2]));
+    // dxs
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 1, 0, 0, 1, dderiv[3]));
+
+    // dyx
+    memcpy(dderiv[4], dderiv[1], DOut*sizeof(T));
+    // dyy
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 0, 2, 0, 0, dderiv[5]));
+    // dyz
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 0, 1, 1, 0, dderiv[6]));
+    // dys
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 0, 1, 0, 1, dderiv[7]));
+
+    // dzx
+    memcpy(dderiv[8], dderiv[2], DOut*sizeof(T));
+    // dzy
+    memcpy(dderiv[9], dderiv[6], DOut*sizeof(T));
+    // dzz
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 0, 0, 2, 0, dderiv[10]));
+    // dzs
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 0, 0, 1, 1, dderiv[11]));
+
+    // dsx
+    memcpy(dderiv[12], dderiv[3], DOut*sizeof(T));
+    // dsy
+    memcpy(dderiv[13], dderiv[7], DOut*sizeof(T));
+    // dsz
+    memcpy(dderiv[14], dderiv[11], DOut*sizeof(T));
+    // dss
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(pt[0], pt[1], pt[2], pt[3], 0, 0, 0, 2, dderiv[15]));
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+inline bool gtplusBSplineFFD4D<T, CoordType, DOut>::evaluateFFDSecondOrderDerivative(CoordType px, CoordType py, CoordType pz, CoordType ps, T dderiv[D*D][DOut]) const
+{
+    // dxx
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(px, py, pz, ps, 2, 0, 0, 0, dderiv[0]));
+    // dxy
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(px, py, pz, ps, 1, 1, 0, 0, dderiv[1]));
+    // dxz
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(px, py, pz, ps, 1, 0, 1, 0, dderiv[2]));
+    // dxs
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(px, py, pz, ps, 1, 0, 0, 1, dderiv[3]));
+
+    // dyx
+    memcpy(dderiv[4], dderiv[1], DOut*sizeof(T));
+    // dyy
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(px, py, pz, ps, 0, 2, 0, 0, dderiv[5]));
+    // dyz
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(px, py, pz, ps, 0, 1, 1, 0, dderiv[6]));
+    // dys
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(px, py, pz, ps, 0, 1, 0, 1, dderiv[7]));
+
+    // dzx
+    memcpy(dderiv[8], dderiv[2], DOut*sizeof(T));
+    // dzy
+    memcpy(dderiv[9], dderiv[6], DOut*sizeof(T));
+    // dzz
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(px, py, pz, ps, 0, 0, 2, 0, dderiv[10]));
+    // dzs
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(px, py, pz, ps, 0, 0, 1, 1, dderiv[11]));
+
+    // dsx
+    memcpy(dderiv[12], dderiv[3], DOut*sizeof(T));
+    // dsy
+    memcpy(dderiv[13], dderiv[7], DOut*sizeof(T));
+    // dsz
+    memcpy(dderiv[14], dderiv[11], DOut*sizeof(T));
+    // dss
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD4D(px, py, pz, ps, 0, 0, 0, 2, dderiv[15]));
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+bool gtplusBSplineFFD4D<T, CoordType, DOut>::ffdApprox(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(pos.get_size(0)==D);
+        GADGET_CHECK_RETURN_FALSE(pos.get_size(1)==N);
+
+        GADGET_CHECK_RETURN_FALSE(value.get_size(0)==DOut);
+        GADGET_CHECK_RETURN_FALSE(value.get_size(1)==N);
+
+        std::vector<size_t> dim;
+        value.get_dimensions(dim);
+        if ( !residual.dimensions_equal(&dim) )
+        {
+            residual.create(value.get_dimensions());
+            Gadgetron::clear(residual);
+        }
+
+        size_t sx = this->get_size(0);
+        size_t sy = this->get_size(1);
+        size_t sz = this->get_size(2);
+        size_t ss = this->get_size(3);
+
+        /// following the definition of ref[2]
+        ho5DArray<T> dx(sx, sy, sz, ss, DOut), ds(sx, sy, sz, ss, DOut);
+        Gadgetron::clear(dx);
+        Gadgetron::clear(ds);
+
+        /// compute the current approximation values
+        ValueArrayType approxValue;
+        approxValue = value;
+
+        /// compute current residual
+        GADGET_CHECK_RETURN_FALSE(this->evaluateFFDArray(pos, approxValue));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::subtract(value, approxValue, residual));
+
+        /// compute the update of control points
+        unsigned int d;
+
+        long long n;
+        for (n=0; n<N; n++)
+        {
+            coord_type px = pos(0, n);
+            coord_type py = pos(1, n);
+            coord_type pz = pos(2, n);
+            coord_type ps = pos(3, n);
+
+            if ( px<-2 || px>sx+2
+                || py<-2 || py>sy+2
+                || pz<-2 || pz>sz+2
+                || ps<-2 || ps>ss+2 )
+            {
+                continue;
+            }
+
+            long long ix = (long long)std::floor(px);
+            CoordType deltaX = px-(CoordType)ix;
+
+            long long iy = (long long)std::floor(py);
+            CoordType deltaY = py-(CoordType)iy;
+
+            long long iz = (long long)std::floor(pz);
+            CoordType deltaZ = pz-(CoordType)iz;
+
+            long long is = (long long)std::floor(ps);
+            CoordType deltaS = ps-(CoordType)is;
+
+            long long i, j, k, s, I, J, K, S;
+
+            T dist=0, v, vv, vvv;
+            for (s=0; s<4; s++)
+            {
+                for (k=0; k<4; k++)
+                {
+                    for (j=0; j<4; j++)
+                    {
+                        for (i=0; i<4; i++)
+                        {
+                            v = (this->BSpline(i, deltaX) * this->BSpline(j, deltaY)) * (this->BSpline(k, deltaZ) * this->BSpline(s, deltaS));
+                            dist += v*v;
+                        }
+                    }
+                }
+            }
+
+            for (s=0; s<4; s++)
+            {
+                S = s + is - 1;
+                if ( (S>=0) && (S<(long long)ss) )
+                {
+                    for (k=0; k<4; k++)
+                    {
+                        K = k + iz - 1;
+                        if ( (K>=0) && (K<(long long)sz) )
+                        {
+                            for (j=0; j<4; j++)
+                            {
+                                J = j + iy - 1;
+                                if ( (J>=0) && (J<(long long)sy) )
+                                {
+                                    for (i=0; i<4; i++)
+                                    {
+                                        I = i + ix - 1;
+                                        if ( (I>=0) && (I<(long long)sx) )
+                                        {
+                                            v = this->BSpline(i, deltaX) * this->BSpline(j, deltaY) * this->BSpline(k, deltaZ) * this->BSpline(s, deltaS);
+                                            vv = v*v;
+                                            vvv = vv*v;
+
+                                            for ( d=0; d<DOut; d++ )
+                                            {
+                                                dx(I, J, K, S, d) += vvv*residual(d, n)/dist;
+                                                ds(I, J, K, S, d) += vv;
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        /// update the control point values
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::addEpsilon(ds));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::divide(dx, ds, dx));
+
+        std::vector<size_t> startND(4, BSPLINEPADDINGSIZE), size(4);
+        size[0] = sx;
+        size[1] = sy;
+        size[2] = sz;
+        size[3] = ss;
+
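+        /// ctrl_pt_[d] carries BSPLINEPADDINGSIZE padding points on every side; crop out the
+        /// interior region, add the per-dimension update dx, and write the result back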
+        hoNDArray<T> ctrlPtWithoutPadding(sx, sy, sz, ss);
+
+        for ( d=0; d<DOut; d++ )
+        {
+            hoNDArray<T> dx4D(sx, sy, sz, ss, dx.begin()+d*sx*sy*sz*ss);
+
+            std::vector<size_t> dim;
+            this->ctrl_pt_[d].get_dimensions(dim);
+            hoNDArray<T> tmpCtrlPt(dim, this->ctrl_pt_[d].begin(), false);
+            Gadgetron::cropUpTo11DArray(tmpCtrlPt, ctrlPtWithoutPadding, startND, size);
+            Gadgetron::add(ctrlPtWithoutPadding, dx4D, ctrlPtWithoutPadding);
+            Gadgetron::setSubArrayUpTo11DArray(ctrlPtWithoutPadding, tmpCtrlPt, startND, size);
+        }
+
+        /// calculate residual error
+        totalResidual = 0;
+        GADGET_CHECK_RETURN_FALSE(this->evaluateFFDArray(pos, approxValue));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::subtract(value, approxValue, residual));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::norm2(residual, totalResidual));
+        totalResidual = totalResidual / (real_value_type)N;
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in ffdApprox(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+bool gtplusBSplineFFD4D<T, CoordType, DOut>::refine()
+{
+    try
+    {
+        size_t sx = this->get_size(0);
+        size_t sy = this->get_size(1);
+        size_t sz = this->get_size(2);
+        size_t ss = this->get_size(3);
+
+        /// the refined control point grid definition
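+        /// every cell is split in two along each dimension, so the interior grid grows from s
+        /// to 2*s-1 points per dimension while the BSPLINEPADDINGSIZE padding is kept on both ends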
+
+        std::vector<size_t> dim(4);
+        dim[0] = 2*sx-1 + 2*BSPLINEPADDINGSIZE;
+        dim[1] = 2*sy-1 + 2*BSPLINEPADDINGSIZE;
+        dim[2] = 2*sz-1 + 2*BSPLINEPADDINGSIZE;
+        dim[3] = 2*ss-1 + 2*BSPLINEPADDINGSIZE;
+
+        std::vector<coord_type> spacing;
+        this->get_spacing(spacing);
+        spacing[0] /= 2;
+        spacing[1] /= 2;
+        if ( sz > 1 ) spacing[2] /= 2;
+        if ( ss > 1 ) spacing[3] /= 2;
+
+        std::vector<coord_type> oldOrigin;
+        this->ctrl_pt_[0].get_origin(oldOrigin);
+
+        std::vector<coord_type> gridOrigin(4);
+        this->ctrl_pt_[0].image_to_world( (CoordType)(BSPLINEPADDINGSIZE),
+                                          (CoordType)(BSPLINEPADDINGSIZE),
+                                          (CoordType)(BSPLINEPADDINGSIZE),
+                                          (CoordType)(BSPLINEPADDINGSIZE),
+                                          gridOrigin[0], gridOrigin[1],
+                                          gridOrigin[2], gridOrigin[3]);
+
+        std::vector<coord_type> origin(4);
+        origin[0] = (oldOrigin[0] + gridOrigin[0])/2;
+        origin[1] = (oldOrigin[1] + gridOrigin[1])/2;
+        origin[2] = (oldOrigin[2] + gridOrigin[2])/2;
+        origin[3] = (oldOrigin[3] + gridOrigin[3])/2;
+
+        typename ImageType::axis_type axis;
+        this->ctrl_pt_[0].get_axis(axis);
+
+        /// allocate new control points
+        FFDCtrlPtGridType new_ctrl_pt[DOut];
+
+        unsigned int d;
+        for( d=0; d<DOut; d++ )
+        {
+            new_ctrl_pt[d].create(dim, spacing, origin, axis);
+            Gadgetron::clear(new_ctrl_pt[d]);
+        }
+
+        /// refinement weights, see ref[2]
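+        /// w[0][*] weights the refined points that coincide with existing control points,
+        /// w[1][*] weights the refined points inserted halfway between existing control points
+        /// (cubic B-spline subdivision)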
+        T w[2][3];
+
+        w[0][0] = T(0.125); w[0][1] = T(0.75);  w[0][2] = T(0.125);
+        w[1][0] = 0;        w[1][1] = T(0.5);   w[1][2] = T(0.5);
+
+        /// compute refined control point values
+        int x, y, z, s, i_new, j_new, k_new, s_new, i_old, j_old, k_old, s_old;
+
+        if ( ss>1 && sz>1 )
+        {
+            for (s=0; s<ss; s++)
+            {
+                for (z=0; z<sz; z++)
+                {
+                    for (y=0; y<sy; y++)
+                    {
+                        for (x=0; x<sx; x++)
+                        {
+                            for (s_new=0; s_new<2; s_new++)
+                            {
+                                for (k_new=0; k_new<2; k_new++)
+                                {
+                                    for (j_new=0; j_new<2; j_new++)
+                                    {
+                                        for (i_new=0; i_new<2; i_new++)
+                                        {
+                                            size_t offsetNew = new_ctrl_pt[0].calculate_offset(2*x+i_new+BSPLINEPADDINGSIZE, 2*y+j_new+BSPLINEPADDINGSIZE, 2*z+k_new+BSPLINEPADDINGSIZE, 2*s+s_new+BSPLINEPADDINGSIZE);
+
+                                            for (s_old=0; s_old<3; s_old++)
+                                            {
+                                                for (k_old=0; k_old<3; k_old++)
+                                                {
+                                                    for (j_old=0; j_old<3; j_old++)
+                                                    {
+                                                        for (i_old=0; i_old<3; i_old++)
+                                                        {
+                                                            size_t offsetOld = this->calculate_offset(x+i_old-1, y+j_old-1, z+k_old-1, s+s_old-1);
+                                                            for ( d=0; d<DOut; d++ )
+                                                            {
+                                                                new_ctrl_pt[d](offsetNew) += w[i_new][i_old]*w[j_new][j_old]*w[k_new][k_old]*w[s_new][s_old] * this->ctrl_pt_[d](offsetOld);
+                                                            }
+                                                        }
+                                                    }
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else if ( ss==1 && sz>1 )
+        {
+            for (z=0; z<sz; z++)
+            {
+                for (y=0; y<sy; y++)
+                {
+                    for (x=0; x<sx; x++)
+                    {
+                        for (k_new=0; k_new<2; k_new++)
+                        {
+                            for (j_new=0; j_new<2; j_new++)
+                            {
+                                for (i_new=0; i_new<2; i_new++)
+                                {
+                                    size_t offsetNew = new_ctrl_pt[0].calculate_offset(2*x+i_new+BSPLINEPADDINGSIZE, 2*y+j_new+BSPLINEPADDINGSIZE, 2*z+k_new+BSPLINEPADDINGSIZE, BSPLINEPADDINGSIZE);
+
+                                    for (k_old=0; k_old<3; k_old++)
+                                    {
+                                        for (j_old=0; j_old<3; j_old++)
+                                        {
+                                            for (i_old=0; i_old<3; i_old++)
+                                            {
+                                                size_t offsetOld = this->calculate_offset(x+i_old-1, y+j_old-1, z+k_old-1, 0);
+                                                for ( d=0; d<DOut; d++ )
+                                                {
+                                                    new_ctrl_pt[d](offsetNew) += w[i_new][i_old]*w[j_new][j_old]*w[k_new][k_old] * this->ctrl_pt_[d](offsetOld);
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else if ( ss==1 && sz==1 )
+        {
+            for (y=0; y<sy; y++)
+            {
+                for (x=0; x<sx; x++)
+                {
+                    for (j_new=0; j_new<2; j_new++)
+                    {
+                        for (i_new=0; i_new<2; i_new++)
+                        {
+                            size_t offsetNew = new_ctrl_pt[0].calculate_offset(2*x+i_new+BSPLINEPADDINGSIZE, 2*y+j_new+BSPLINEPADDINGSIZE, BSPLINEPADDINGSIZE, BSPLINEPADDINGSIZE);
+
+                            for (j_old=0; j_old<3; j_old++)
+                            {
+                                for (i_old=0; i_old<3; i_old++)
+                                {
+                                    size_t offsetOld = this->calculate_offset(x+i_old-1, y+j_old-1, 0, 0);
+                                    for ( d=0; d<DOut; d++ )
+                                    {
+                                        new_ctrl_pt[d](offsetNew) += w[i_new][i_old]*w[j_new][j_old] * this->ctrl_pt_[d](offsetOld);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        for ( d=0; d<DOut; d++ )
+        {
+            this->ctrl_pt_[d].create(dim, spacing, origin, axis, new_ctrl_pt[d].begin(), true);
+            new_ctrl_pt[d].delete_data_on_destruct(false);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in refine() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DOut>
+void gtplusBSplineFFD4D<T, CoordType, DOut>::print(std::ostream& os) const
+{
+    using namespace std;
+
+    os << "---------------------- GTPlus BSpline 4D Free Form Deformation ------------------" << endl;
+    os << "Implement 4D BSpline Free Form Deformation (BFFD) " << endl;
+
+    std::string elemTypeName = std::string( typeid(T).name() );
+    os << "FFD value type is : " << elemTypeName << endl;
+
+    elemTypeName = std::string( typeid(CoordType).name() );
+    os << "FFD coord type is : " << elemTypeName << endl;
+
+    os << "Output dimension is : " << DOut << endl;
+    os << "---------------------------------------------------" << endl;
+    os << "BFFD grid information : " << endl;
+    this->ctrl_pt_[0].printContent(os);
+    os << "---------------------------------------------------------------------------------" << endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/FreeFormDeformation/gtplusFFDBase.h b/toolboxes/gtplus/algorithm/FreeFormDeformation/gtplusFFDBase.h
new file mode 100644
index 0000000..b440481
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/FreeFormDeformation/gtplusFFDBase.h
@@ -0,0 +1,1978 @@
+/** \file       gtplusFFDBase.h
+    \brief      Base class for gtPlus FreeFormDeformation package
+
+                FreeFormDeformation (FFD) is a general purpose scattered data interpolation algorithm. It is widely used in numerical applications
+                such as image registration, data interpolation and geometric modelling.
+
+                [1] http://en.wikipedia.org/wiki/Free-form_deformation
+
+                [2] S Lee, G Wolberg, SY Shin. Scattered data interpolation with multilevel B-splines. IEEE 
+                    Transactions on Visualization and Computer Graphics, Volume 3, Issue 3, 1997.
+
+                [3] D Rueckert, LI Sonoda, C Hayes, DLG Hill, MO Leach, DJ Hawkes. Nonrigid registration using free-form deformations: application to breast MR images. IEEE 
+                    Transactions on Medical Imaging, Volume 18, Issue 8, 1999.
+
+    \author     Hui Xue
+*/
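+
+/*
+    Minimal usage sketch (illustrative only): fit a scalar field to scattered points and evaluate
+    it afterwards. It assumes a concrete 2D B-spline subclass (here called gtplusBSplineFFD2D with
+    its control point grid set up beforehand; the actual class name and construction may differ):
+
+        size_t N = 100;                          // number of scattered points
+        hoNDArray<float> pos(2, N);              // DIn by N world coordinates
+        hoNDArray<float> value(1, N);            // DOut by N sampled values
+        // ... fill pos and value ...
+
+        gtplusBSplineFFD2D<float, float, 1> ffd; // concrete FFD, control grid defined elsewhere
+
+        hoNDArray<float> residual;
+        float totalResidual = 0;
+        size_t numOfRefinement = 0;
+        ffd.ffdApproxW(pos, value, residual, totalResidual, N, numOfRefinement, 1e-4f, 5);
+
+        float r[1];
+        ffd.evaluateFFDW(0.5f, 0.5f, r);         // evaluate the fitted FFD at a world location
+*/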
+
+#pragma once
+
+#include <typeinfo>
+#include <cmath>
+#include <cfloat>
+#include "GadgetronTimer.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusIOAnalyze.h"
+
+#ifdef USE_OMP
+    #include "omp.h"
+#endif // USE_OMP
+
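+/// round to the nearest integer (half away from zero)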
+#define FFD_MKINT(a) (((a)>=0)?((int)((a)+0.5)):((int)((a)-0.5)))
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut>
+class gtplusFFDBase
+{
+public:
+
+    typedef gtplusFFDBase<T, CoordType, DIn, DOut> Self;
+
+    typedef typename realType<T>::Type real_value_type;
+
+    typedef CoordType coord_type;
+
+    enum { D = DIn };
+
+    /// array to store the coordinates of spatial points
+    /// has the dimension of DIn by N for N points
+    typedef hoNDArray<CoordType> CoordArrayType;
+
+    /// array to store the point value
+    /// for N points, the dimension of array is DOut by N
+    /// DOut is equal to or larger than 1; if larger than 1, the 
+    /// vectorized FFD is computed
+    typedef hoNDArray<T> ValueArrayType;
+    typedef ValueArrayType ArrayType;
+
+    typedef hoNDArray<float> MaskArrayType;
+
+    /// control point grid type
+    typedef hoNDImage<T, DIn> FFDCtrlPtGridType;
+
+    /// point type
+    typedef hoNDPoint<CoordType, DIn> PointType;
+
+    /// image type
+    typedef hoNDImage<T, DIn> ImageType;
+
+    gtplusFFDBase();
+    virtual ~gtplusFFDBase();
+
+    /// evaluate the FFD at a grid location
+    /// the input points are in the FFD grid
+    virtual bool evaluateFFD(const CoordType pt[D], T r[DOut]) const = 0;
+    virtual bool evaluateFFD(const CoordType* pt[D], T* r[DOut], size_t N) const;
+    virtual bool evaluateFFD(const PointType& pt, T r[DOut]) const;
+    virtual bool evaluateFFDArray(const CoordArrayType& pts, ValueArrayType& r) const;
+
+    /// evaluate the 1st order derivative of FFD at a grid location
+    /// deriv: derivative for all D dimensions and all DOut values
+    virtual bool evaluateFFDDerivative(const CoordType pt[D], T deriv[D][DOut]) const = 0;
+    virtual bool evaluateFFDDerivative(const PointType& pt, T deriv[D][DOut]) const;
+
+    virtual bool evaluateFFDDX(const CoordType pt[D], T dx[DOut]) const;
+    virtual bool evaluateFFDDY(const CoordType pt[D], T dy[DOut]) const;
+    virtual bool evaluateFFDDZ(const CoordType pt[D], T dz[DOut]) const;
+    virtual bool evaluateFFDDS(const CoordType pt[D], T ds[DOut]) const;
+
+    /// calculate the 1st order derivative of FFD at a world coordinate location with the world coordinate unit
+    virtual bool evaluateWorldDerivative(const CoordType pt[D], T deriv[D][DOut]) const;
+
+    virtual bool evaluateWorldDX(const CoordType pt[D], T dx[DOut]) const;
+    virtual bool evaluateWorldDY(const CoordType pt[D], T dy[DOut]) const;
+    virtual bool evaluateWorldDZ(const CoordType pt[D], T dz[DOut]) const;
+    virtual bool evaluateWorldDS(const CoordType pt[D], T ds[DOut]) const;
+
+    /// evaluate the 2nd order derivative of FFD at a grid location
+    /// dderiv : D*D vector, stores dxx dxy dxz ...; dyx dyy dyz ...; dzx dzy dzz ...
+    virtual bool evaluateFFDSecondOrderDerivative(const CoordType pt[D], T dderiv[D*D][DOut]) const = 0;
+    virtual bool evaluateFFDSecondOrderDerivative(const PointType& pt, T dderiv[D*D][DOut]) const;
+
+    /// evaluate the FFD at a world location
+    virtual bool evaluateFFDW(const CoordType pt[D], T r[DOut]) const;
+    virtual bool evaluateFFDW(CoordType px, CoordType py, T r[DOut]) const;
+    virtual bool evaluateFFDW(CoordType px, CoordType py, CoordType pz, T r[DOut]) const;
+    virtual bool evaluateFFDW(CoordType px, CoordType py, CoordType pz, CoordType ps, T r[DOut]) const;
+
+    virtual bool evaluateFFDDerivativeW(const CoordType pt[D], T deriv[D][DOut]) const;
+    virtual bool evaluateFFDDerivativeW(CoordType px, CoordType py, T deriv[D][DOut]) const;
+    virtual bool evaluateFFDDerivativeW(CoordType px, CoordType py, CoordType pz, T deriv[D][DOut]) const;
+    virtual bool evaluateFFDDerivativeW(CoordType px, CoordType py, CoordType pz, CoordType ps, T deriv[D][DOut]) const;
+
+    virtual bool evaluateFFDSecondOrderDerivativeW(const CoordType pt[D], T dderiv[D*D][DOut]) const;
+    virtual bool evaluateFFDSecondOrderDerivativeW(CoordType px, CoordType py, T dderiv[D*D][DOut]) const;
+    virtual bool evaluateFFDSecondOrderDerivativeW(CoordType px, CoordType py, CoordType pz, T dderiv[D*D][DOut]) const;
+    virtual bool evaluateFFDSecondOrderDerivativeW(CoordType px, CoordType py, CoordType pz, CoordType ps, T dderiv[D*D][DOut]) const;
+
+    /// compute the FFD approximation once
+    /// pos : the position of input points, DIn by N
+    /// value : the value on input points, DOut by N
+    /// residual : the approximation residual after computing FFD, DOut by N
+    /// N : the number of points
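+    /// e.g. for DIn=2 and DOut=1, pos(0, n) and pos(1, n) hold the grid coordinates of point n
+    /// and value(0, n) holds the sampled value at that point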
+    virtual bool ffdApprox(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N) = 0;
+
+    /// compute the FFD approximation with refinement, see ref [2]
+    /// numOfRefinement : number of grid refinement
+    virtual bool ffdApprox(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N, size_t numOfRefinement);
+
+    /// keep refining the FFD until either the maximal refinement level is reached or the total residual is less than a threshold
+    virtual bool ffdApprox(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement);
+
+    /// FFD approximation with input in the world coordinates
+    virtual bool ffdApproxW(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N, size_t numOfRefinement);
+    virtual bool ffdApproxW(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement);
+
+    /// easy-to-use function calls for image and array
+
+    /// convert every pixel in the image to FFD point inputs with world coordinates
+    virtual bool imageToFFDInputsW(ImageType target[DOut], CoordArrayType& pos, ValueArrayType& value);
+    /// mask == 0 means this point is excluded from approximation
+    virtual bool imageToFFDInputsW(ImageType target[DOut], const MaskArrayType& mask, CoordArrayType& pos, ValueArrayType& value);
+
+    /// convert every pixel in the array to FFD point inputs with world coordinates
+    virtual bool arrayToFFDInputsW(ArrayType target[DOut], CoordArrayType& pos, ValueArrayType& value);
+    /// mask == 0 means this point is excluded from approximation
+    virtual bool arrayToFFDInputsW(ArrayType target[DOut], const MaskArrayType& mask, CoordArrayType& pos, ValueArrayType& value);
+
+    /// for Image type
+    virtual bool ffdApproxImage(ImageType target[DOut], real_value_type& totalResidual, size_t numOfRefinement);
+    virtual bool ffdApproxImage(ImageType target[DOut], real_value_type& totalResidual, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement);
+    virtual bool ffdApproxImage(ImageType& target, real_value_type& totalResidual, size_t numOfRefinement);
+    virtual bool ffdApproxImage(ImageType& target, real_value_type& totalResidual, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement);
+
+    virtual bool ffdApproxImage(ImageType target[DOut], const MaskArrayType& mask, real_value_type& totalResidual, size_t numOfRefinement);
+    virtual bool ffdApproxImage(ImageType target[DOut], const MaskArrayType& mask, real_value_type& totalResidual, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement);
+
+    /// for Array type
+    virtual bool ffdApproxArray(ArrayType target[DOut], real_value_type& totalResidual, size_t numOfRefinement);
+    virtual bool ffdApproxArray(ArrayType target[DOut], real_value_type& totalResidual, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement);
+    virtual bool ffdApproxArray(ArrayType& target, real_value_type& totalResidual, size_t numOfRefinement);
+    virtual bool ffdApproxArray(ArrayType& target, real_value_type& totalResidual, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement);
+
+    virtual bool ffdApproxArray(ArrayType target[DOut], const MaskArrayType& mask, real_value_type& totalResidual, size_t numOfRefinement);
+    virtual bool ffdApproxArray(ArrayType target[DOut], const MaskArrayType& mask, real_value_type& totalResidual, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement);
+
+    /// As suggested in ref [2], the BSpline FFD can be refined to achieve better approximation
+    virtual bool refine() = 0;
+
+    /// utility functions for ease of use
+
+    /// get control points
+    FFDCtrlPtGridType& get_ctrl_pt(unsigned int d) { return this->ctrl_pt_[d]; }
+    const FFDCtrlPtGridType& get_ctrl_pt(unsigned int d) const { return this->ctrl_pt_[d]; }
+
+    /// get the size of control point arrays
+    virtual size_t get_size(size_t dimension) const { return ctrl_pt_[0].get_size(dimension); }
+    virtual std::vector<size_t> get_dimensions() const { std::vector<size_t> dim; ctrl_pt_[0].get_dimensions(dim); return dim; }
+
+    /// get the spacing of the control point arrays
+    virtual coord_type get_spacing(size_t dimension) const { return ctrl_pt_[0].get_pixel_size(dimension); }
+    virtual void get_spacing(std::vector<coord_type>& spacing) const { ctrl_pt_[0].get_pixel_size(spacing); }
+
+    /// get/set a control point value
+    virtual T get(size_t x, size_t y, size_t d) const { return ctrl_pt_[d](x, y); }
+    virtual void set(size_t x, size_t y, size_t d, T v) { ctrl_pt_[d](x, y) = v; }
+
+    virtual T get(size_t x, size_t y, size_t z, size_t d) const { return ctrl_pt_[d](x, y, z); }
+    virtual void set(size_t x, size_t y, size_t z, size_t d, T v) { ctrl_pt_[d](x, y, z) = v; }
+
+    virtual T get(size_t x, size_t y, size_t z, size_t s, size_t d) const { return ctrl_pt_[d](x, y, z, s); }
+    virtual void set(size_t x, size_t y, size_t z, size_t s, size_t d, T v) { ctrl_pt_[d](x, y, z, s) = v; }
+
+    /// offset to/from indexes for control points
+    virtual size_t calculate_offset(size_t x, size_t y) const { return ctrl_pt_[0].calculate_offset(x, y); }
+    virtual void calculate_index( size_t offset, size_t& x, size_t& y ) const { ctrl_pt_[0].calculate_index(offset, x, y); }
+
+    virtual size_t calculate_offset(size_t x, size_t y, size_t z) const { return ctrl_pt_[0].calculate_offset(x, y, z); }
+    virtual void calculate_index( size_t offset, size_t& x, size_t& y, size_t& z ) const { ctrl_pt_[0].calculate_index(offset, x, y, z); }
+
+    virtual size_t calculate_offset(size_t x, size_t y, size_t z, size_t s) const { return ctrl_pt_[0].calculate_offset(x, y, z, s); }
+    virtual void calculate_index( size_t offset, size_t& x, size_t& y, size_t& z, size_t& s ) const { ctrl_pt_[0].calculate_index(offset, x, y, z, s); }
+
+    /// compute the control point location in world coordinates
+    virtual void get_location(size_t x, size_t y, CoordType& sx, CoordType& sy) const { ctrl_pt_[0].image_to_world(x, y, sx, sy); }
+    virtual void get_location(size_t x, size_t y, size_t z, CoordType& sx, CoordType& sy, CoordType& sz) const { ctrl_pt_[0].image_to_world(x, y, z, sx, sy, sz); }
+    virtual void get_location(size_t x, size_t y, size_t z, size_t s, CoordType& sx, CoordType& sy, CoordType& sz, CoordType& ss) const { ctrl_pt_[0].image_to_world(x, y, z, s, sx, sy, sz, ss); }
+
+    /// convert a world coordinate point to FFD grid location
+    virtual bool world_to_grid(const CoordArrayType& pt_w, CoordArrayType& pt_g) const;
+    virtual bool world_to_grid(const CoordType pt_w[D], CoordType pt_g[D]) const;
+    virtual bool world_to_grid(CoordType px_w, CoordType py_w, CoordType& px_g, CoordType& py_g) const;
+    virtual bool world_to_grid(CoordType px_w, CoordType py_w, CoordType pz_w, CoordType& px_g, CoordType& py_g, CoordType& pz_g) const;
+    virtual bool world_to_grid(CoordType px_w, CoordType py_w, CoordType pz_w, CoordType ps_w, CoordType& px_g, CoordType& py_g, CoordType& pz_g, CoordType& ps_g) const;
+
+    virtual bool grid_to_world(const CoordArrayType& pt_g, CoordArrayType& pt_w) const;
+    virtual bool grid_to_world(const CoordType pt_g[D], CoordType pt_w[D]) const;
+    virtual bool grid_to_world(CoordType px_g, CoordType py_g, CoordType& px_w, CoordType& py_w) const;
+    virtual bool grid_to_world(CoordType px_g, CoordType py_g, CoordType pz_g, CoordType& px_w, CoordType& py_w, CoordType& pz_w) const;
+
+    /// evaluate FFD for every pixel in the target image
+    /// the image pixel will first be converted to world-coordinate
+    /// and then converted to FFD grid location
+    virtual bool evaluateFFDOnImage(ImageType& target) const;
+    virtual bool evaluateFFDOnImage(ImageType target[DOut]) const;
+
+    /// evaluate FFD for every element in an array
+    /// the point indexes will be taken as the FFD grid location
+    virtual bool evaluateFFDOnArray(hoNDArray<T>& target) const;
+    virtual bool evaluateFFDOnArray(hoNDArray<T> target[DOut]) const;
+
+    /// clear the control points
+    virtual bool clear(T v=0);
+
+    /// print info
+    virtual void print(std::ostream& os) const;
+
+    /// whether to perform timing and print out messages
+    bool performTiming_;
+
+    /// debug folder
+    std::string debugFolder_;
+
+protected:
+
+    /// control points
+    FFDCtrlPtGridType ctrl_pt_[DOut];
+
+    /// clock for timing
+    Gadgetron::GadgetronTimer gt_timer1_;
+    Gadgetron::GadgetronTimer gt_timer2_;
+    Gadgetron::GadgetronTimer gt_timer3_;
+
+    /// exporter
+    Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+    /// util
+    gtPlusISMRMRDReconUtil<T> gtPlus_util_;
+    gtPlusISMRMRDReconUtilComplex<T> gtPlus_util_complex_;
+};
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+gtplusFFDBase<T, CoordType, DIn, DOut>::gtplusFFDBase()
+{
+    gt_timer1_.set_timing_in_destruction(false);
+    gt_timer2_.set_timing_in_destruction(false);
+    gt_timer3_.set_timing_in_destruction(false);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+gtplusFFDBase<T, CoordType, DIn, DOut>::~gtplusFFDBase()
+{
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFD(const CoordType* pt[D], T* r[DOut], size_t N) const
+{
+    try
+    {
+        long long n;
+        #pragma omp parallel for private(n) shared(N, pt, r)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            this->evaluateFFD(pt[n], r[n]);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in evaluateFFD(const CoordType* pt[D], T* r[DOut], size_t N) const ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFD(const PointType& pt, T r[DOut]) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFD(pt.begin(), r));
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDArray(const CoordArrayType& pts, ValueArrayType& r) const
+{
+    try
+    {
+        size_t N = pts.get_size(1);
+        GADGET_CHECK_RETURN_FALSE(pts.get_size(0)==DIn);
+
+        if ( r.get_size(1)!=N || r.get_size(0)!=DOut )
+        {
+            r.create(DOut, N);
+        }
+
+        const CoordType* pPts = pts.begin();
+        T* pR = r.begin();
+
+        long long n;
+        #pragma omp parallel for private(n) shared(N, pPts, pR, r)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            this->evaluateFFD(pPts+n*DIn, pR+n*DOut);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in evaluateFFDArray(const CoordArrayType& pts, ValueArrayType& r) const ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDDerivative(const PointType& pt, T deriv[D][DOut]) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFDDerivative(pt.begin(), deriv));
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDDX(const CoordType pt[D], T dx[DOut]) const
+{
+    T deriv[D][DOut];
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFDDerivative(pt, deriv));
+    memcpy(dx, deriv, sizeof(T)*DOut);
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDDY(const CoordType pt[D], T dy[DOut]) const
+{
+    T deriv[D][DOut];
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFDDerivative(pt, deriv));
+    memcpy(dy, deriv[1], sizeof(T)*DOut);
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDDZ(const CoordType pt[D], T dz[DOut]) const
+{
+    T deriv[D][DOut];
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFDDerivative(pt, deriv));
+    memcpy(dz, deriv[2], sizeof(T)*DOut);
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDDS(const CoordType pt[D], T ds[DOut]) const
+{
+    T deriv[D][DOut];
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFDDerivative(pt, deriv));
+    memcpy(ds, deriv[3], sizeof(T)*DOut);
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateWorldDerivative(const CoordType pt[D], T deriv[D][DOut]) const
+{
+    CoordType pt_g[D];
+    this->world_to_grid(pt, pt_g);
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFDDerivative(pt_g, deriv));
+
+    std::vector<coord_type> spacing;
+    this->get_spacing(spacing);
+
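+    /// convert grid-space derivatives to world units; only the grid spacing is compensated here,
+    /// any rotation of the grid axes is not taken into account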
+    unsigned int d, d2;
+    for ( d=0; d<DIn; d++ )
+    {
+        for ( d2=0; d2<DOut; d2++ )
+        {
+            deriv[d][d2] /= spacing[d];
+        }
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateWorldDX(const CoordType pt[D], T dx[DOut]) const
+{
+    CoordType pt_g[D];
+    this->world_to_grid(pt, pt_g);
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFDDX(pt_g, dx));
+
+    coord_type sx = coord_type(1.0)/this->get_spacing(0);
+
+    unsigned int d;
+    for ( d=0; d<DOut; d++ )
+    {
+        dx[d] *= sx;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateWorldDY(const CoordType pt[D], T dy[DOut]) const
+{
+    CoordType pt_g[D];
+    this->world_to_grid(pt, pt_g);
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFDDY(pt_g, dy));
+
+    coord_type sy = coord_type(1.0)/this->get_spacing(1);
+
+    unsigned int d;
+    for ( d=0; d<DOut; d++ )
+    {
+        dy[d] *= sy;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateWorldDZ(const CoordType pt[D], T dz[DOut]) const
+{
+    CoordType pt_g[D];
+    this->world_to_grid(pt, pt_g);
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFDDZ(pt_g, dz));
+
+    coord_type sz = coord_type(1.0)/this->get_spacing(2);
+
+    unsigned int d;
+    for ( d=0; d<DOut; d++ )
+    {
+        dz[d] *= sz;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateWorldDS(const CoordType pt[D], T ds[DOut]) const
+{
+    CoordType pt_g[D];
+    this->world_to_grid(pt, pt_g);
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFDDS(pt_g, ds));
+
+    coord_type ss = coord_type(1.0)/this->get_spacing(3);
+
+    unsigned int d;
+    for ( d=0; d<DOut; d++ )
+    {
+        ds[d] *= ss;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDSecondOrderDerivative(const PointType& pt, T dderiv[D*D][DOut]) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->evaluateFFDSecondOrderDerivative(pt.begin(), dderiv));
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDW(const CoordType pt[D], T r[DOut]) const
+{
+    CoordType pg[D];
+    this->world_to_grid(pt, pg);
+    return this->evaluateFFD(pg, r);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDW(CoordType px, CoordType py, T r[DOut]) const
+{
+    CoordType pg[2];
+    this->world_to_grid(px, py, pg[0], pg[1]);
+    return this->evaluateFFD(pg, r);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDW(CoordType px, CoordType py, CoordType pz, T r[DOut]) const
+{
+    CoordType pg[3];
+    this->world_to_grid(px, py, pz, pg[0], pg[1], pg[2]);
+    return this->evaluateFFD(pg, r);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDW(CoordType px, CoordType py, CoordType pz, CoordType ps, T r[DOut]) const
+{
+    CoordType pg[4];
+    this->world_to_grid(px, py, pz, ps, pg[0], pg[1], pg[2], pg[3]);
+    return this->evaluateFFD(pg, r);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDDerivativeW(const CoordType pt[D], T deriv[D][DOut]) const
+{
+    CoordType pg[D];
+    this->world_to_grid(pt, pg);
+    return this->evaluateFFDDerivative(pg, deriv);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDDerivativeW(CoordType px, CoordType py, T deriv[D][DOut]) const
+{
+    CoordType pg[2];
+    this->world_to_grid(px, py, pg[0], pg[1]);
+    return this->evaluateFFDDerivative(pg, deriv);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDDerivativeW(CoordType px, CoordType py, CoordType pz, T deriv[D][DOut]) const
+{
+    CoordType pg[3];
+    this->world_to_grid(px, py, pz, pg[0], pg[1], pg[2]);
+    return this->evaluateFFDDerivative(pg, deriv);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDDerivativeW(CoordType px, CoordType py, CoordType pz, CoordType ps, T deriv[D][DOut]) const
+{
+    CoordType pg[4];
+    this->world_to_grid(px, py, pz, ps, pg[0], pg[1], pg[2], pg[3]);
+    return this->evaluateFFDDerivative(pg, deriv);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDSecondOrderDerivativeW(const CoordType pt[D], T dderiv[D*D][DOut]) const
+{
+    CoordType pg[D];
+    this->world_to_grid(pt, pg);
+    return this->evaluateFFDSecondOrderDerivative(pg, dderiv);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDSecondOrderDerivativeW(CoordType px, CoordType py, T dderiv[D*D][DOut]) const
+{
+    CoordType pg[2];
+    this->world_to_grid(px, py, pg[0], pg[1]);
+    return this->evaluateFFDSecondOrderDerivative(pg, dderiv);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDSecondOrderDerivativeW(CoordType px, CoordType py, CoordType pz, T dderiv[D*D][DOut]) const
+{
+    CoordType pg[3];
+    this->world_to_grid(px, py, pz, pg[0], pg[1], pg[2]);
+    return this->evaluateFFDSecondOrderDerivative(pg, dderiv);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDSecondOrderDerivativeW(CoordType px, CoordType py, CoordType pz, CoordType ps, T dderiv[D*D][DOut]) const
+{
+    CoordType pg[4];
+    this->world_to_grid(px, py, pz, ps, pg[0], pg[1], pg[2], pg[3]);
+    return this->evaluateFFDSecondOrderDerivative(pg, dderiv);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::world_to_grid(const CoordArrayType& pt_w, CoordArrayType& pt_g) const
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(pt_w.get_size(0)==DIn);
+
+        if ( !pt_g.dimensions_equal(&pt_w) )
+        {
+            pt_g = pt_w;
+        }
+
+        const CoordType* pW = pt_w.begin();
+        CoordType* pG = pt_g.begin();
+
+        size_t N = pt_w.get_size(1);
+
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, pW, pG)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            this->world_to_grid(pW+n*DIn, pG+n*DIn);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in world_to_grid(const CoordArrayType& pt_w, CoordArrayType& pt_g) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::world_to_grid(const CoordType pt_w[D], CoordType pt_g[D]) const
+{
+    try
+    {
+        this->ctrl_pt_[0].world_to_image(pt_w, pt_g);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in world_to_grid(const CoordType pt_w[D], CoordType pt_g[D]) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::world_to_grid(CoordType px_w, CoordType py_w, CoordType& px_g, CoordType& py_g) const
+{
+    GADGET_CHECK_RETURN_FALSE(DIn==2);
+
+    try
+    {
+        this->ctrl_pt_[0].world_to_image(px_w, py_w, px_g, py_g);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in world_to_grid(CoordType px_w, CoordType py_w, CoordType& px_g, CoordType& py_g) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::world_to_grid(CoordType px_w, CoordType py_w, CoordType pz_w, CoordType& px_g, CoordType& py_g, CoordType& pz_g) const
+{
+    GADGET_CHECK_RETURN_FALSE(DIn==3);
+
+    try
+    {
+        this->ctrl_pt_[0].world_to_image(px_w, py_w, pz_w, px_g, py_g, pz_g);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in world_to_grid(CoordType px_w, CoordType py_w, CoordType pz_w, CoordType& px_g, CoordType& py_g, CoordType& pz_g) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::world_to_grid(CoordType px_w, CoordType py_w, CoordType pz_w, CoordType ps_w, CoordType& px_g, CoordType& py_g, CoordType& pz_g, CoordType& ps_g) const
+{
+    GADGET_CHECK_RETURN_FALSE(DIn==4);
+
+    try
+    {
+        this->ctrl_pt_[0].world_to_image(px_w, py_w, pz_w, ps_w, px_g, py_g, pz_g, ps_g);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in world_to_grid(CoordType px_w, CoordType py_w, CoordType pz_w, CoordType ps_w, CoordType& px_g, CoordType& py_g, CoordType& pz_g, CoordType& ps_g) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::grid_to_world(const CoordArrayType& pt_g, CoordArrayType& pt_w) const
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(pt_g.get_size(0)==DIn);
+
+        if ( !pt_w.dimensions_equal(&pt_g) )
+        {
+            pt_w = pt_g;
+        }
+
+        const CoordType* pG = pt_g.begin();
+        CoordType* pW = pt_w.begin();
+
+        size_t N = pt_g.get_size(1);
+
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, pG, pW)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            this->grid_to_world(pG+n*DIn, pW+n*DIn);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in grid_to_world(const CoordArrayType& pt_g, CoordArrayType& pt_w) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::grid_to_world(const CoordType pt_g[D], CoordType pt_w[D]) const
+{
+    try
+    {
+        this->ctrl_pt_[0].image_to_world(pt_g, pt_w);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in grid_to_world(const CoordType pt_g[D], CoordType pt_w[D]) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::grid_to_world(CoordType px_g, CoordType py_g, CoordType& px_w, CoordType& py_w) const
+{
+    GADGET_CHECK_RETURN_FALSE(DIn==2);
+
+    try
+    {
+        this->ctrl_pt_[0].image_to_world(px_g, py_g, px_w, py_w);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in grid_to_world(CoordType px_g, CoordType py_g, CoordType& px_w, CoordType& py_w) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::grid_to_world(CoordType px_g, CoordType py_g, CoordType pz_g, CoordType& px_w, CoordType& py_w, CoordType& pz_w) const
+{
+    GADGET_CHECK_RETURN_FALSE(DIn==3);
+
+    try
+    {
+        this->ctrl_pt_[0].image_to_world(px_g, py_g, pz_g, px_w, py_w, pz_w);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in grid_to_world(CoordType px_g, CoordType py_g, CoordType pz_g, CoordType& px_w, CoordType& py_w, CoordType& pz_w) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::ffdApprox(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N, size_t numOfRefinement)
+{
+    size_t num;
+    return this->ffdApprox(pos, value, residual, totalResidual, N, num, FLT_EPSILON, numOfRefinement);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::ffdApprox(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(pos.get_size(0)==DIn);
+        GADGET_CHECK_RETURN_FALSE(pos.get_size(1)==N);
+
+        GADGET_CHECK_RETURN_FALSE(value.get_size(0)==DOut);
+        GADGET_CHECK_RETURN_FALSE(value.get_size(1)==N);
+
+        totalResidual = 0;
+
+        if ( !residual.dimensions_equal(&value) )
+        {
+            residual.create(value.get_dimensions());
+            Gadgetron::clear(residual);
+        }
+
+        CoordArrayType posg(pos);
+        CoordArrayType posw(pos);
+        GADGET_CHECK_RETURN_FALSE(this->grid_to_world(posg, posw));
+
+        size_t num;
+        for ( num=0; num<maxNumOfRefinement; num++ )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->ffdApprox(posg, value, residual, totalResidual, N));
+
+            GDEBUG_CONDITION_STREAM(performTiming_, "BSpline FFD refinement " << num << " has residual of " << totalResidual);
+
+            if ( totalResidual < thresResidual )
+            {
+                GDEBUG_STREAM("BSpline FFD residual is too small : " << totalResidual);
+                GDEBUG_STREAM("No further refinement will be computed ... ");
+                break;
+            }
+
+            if ( num<maxNumOfRefinement-1 )
+            {
+                GADGET_CHECK_RETURN_FALSE(this->refine());
+                GADGET_CHECK_RETURN_FALSE(this->world_to_grid(posw, posg));
+            }
+        }
+
+        numOfRefinement = num;
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in ffdApprox(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::ffdApproxW(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N, size_t numOfRefinement)
+{
+    size_t num;
+    return this->ffdApproxW(pos, value, residual, totalResidual, N, num, FLT_EPSILON, numOfRefinement);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::ffdApproxW(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(pos.get_size(0)==DIn);
+        GADGET_CHECK_RETURN_FALSE(pos.get_size(1)==N);
+
+        GADGET_CHECK_RETURN_FALSE(value.get_size(0)==DOut);
+        GADGET_CHECK_RETURN_FALSE(value.get_size(1)==N);
+
+        totalResidual = 0;
+
+        if ( !residual.dimensions_equal(&value) )
+        {
+            residual.create(value.get_dimensions());
+            Gadgetron::clear(residual);
+        }
+
+        CoordArrayType posg(pos);
+        GADGET_CHECK_RETURN_FALSE(this->world_to_grid(pos, posg));
+
+        size_t num;
+        for ( num=0; num<maxNumOfRefinement; num++ )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->ffdApprox(posg, value, residual, totalResidual, N));
+
+            GDEBUG_CONDITION_STREAM(performTiming_, "BSpline FFD refinement " << num << " has residual of " << totalResidual);
+
+            if ( totalResidual < thresResidual )
+            {
+                GDEBUG_STREAM("BSpline FFD residual is too small : " << totalResidual);
+                GDEBUG_STREAM("No further refinement will be computed ... ");
+                break;
+            }
+
+            if ( num<maxNumOfRefinement-1 )
+            {
+                GADGET_CHECK_RETURN_FALSE(this->refine());
+                GADGET_CHECK_RETURN_FALSE(this->world_to_grid(pos, posg));
+            }
+        }
+
+        numOfRefinement = num;
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in ffdApproxW(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDOnImage(ImageType& target) const
+{
+    GADGET_CHECK_RETURN_FALSE(DOut==1);
+    return this->evaluateFFDOnImage(&target);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDOnImage(ImageType target[DOut]) const
+{
+    try
+    {
+        if ( DIn==2 )
+        {
+            size_t sx = target[0].get_size(0);
+            size_t sy = target[0].get_size(1);
+
+            long long y;
+
+            #pragma omp parallel private(y) shared(sx, sy, target)
+            {
+                coord_type px, py, pg[2];
+                T v[DOut];
+                unsigned int d;
+
+                #pragma omp for 
+                for ( y=0; y<(long long)sy; y++ )
+                {
+                    for ( size_t x=0; x<sx; x++ )
+                    {
+                        size_t offset = x + y*sx;
+
+                        // target to world
+                        target[0].image_to_world(x, size_t(y), px, py);
+
+                        // world to grid
+                        this->world_to_grid(px, py, pg[0], pg[1]);
+
+                        // evaluate the FFD
+                        this->evaluateFFD(pg, v);
+
+                        if ( DOut == 1 )
+                        {
+                            target[0](offset) = v[0];
+                        }
+                        else if ( DOut == 2 )
+                        {
+                            target[0](offset) = v[0];
+                            target[1](offset) = v[1];
+                        }
+                        else if ( DOut == 3 )
+                        {
+                            target[0](offset) = v[0];
+                            target[1](offset) = v[1];
+                            target[2](offset) = v[2];
+                        }
+                        else
+                        {
+                            for ( d=0; d<DOut; d++ )
+                            {
+                                target[d](offset) = v[d];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else if ( DIn==3 )
+        {
+            size_t sx = target[0].get_size(0);
+            size_t sy = target[0].get_size(1);
+            size_t sz = target[0].get_size(2);
+
+            long long z;
+
+            #pragma omp parallel private(z) shared(sx, sy, sz, target)
+            {
+                coord_type px, py, pz, pg[3];
+                T v[DOut];
+                unsigned int d;
+
+                #pragma omp for 
+                for ( z=0; z<(long long)sz; z++ )
+                {
+                    for ( size_t y=0; y<sy; y++ )
+                    {
+                        size_t offset = y*sx + z*sx*sy;
+
+                        for ( size_t x=0; x<sx; x++ )
+                        {
+                            // target to world
+                            target[0].image_to_world(x, y, size_t(z), px, py, pz);
+
+                            // world to grid
+                            this->world_to_grid(px, py, pz, pg[0], pg[1], pg[2]);
+
+                            // evaluate the FFD
+                            this->evaluateFFD(pg, v);
+
+                            if ( DOut == 1 )
+                            {
+                                target[0](offset+x) = v[0];
+                            }
+                            else if ( DOut == 2 )
+                            {
+                                target[0](offset+x) = v[0];
+                                target[1](offset+x) = v[1];
+                            }
+                            else if ( DOut == 3 )
+                            {
+                                target[0](offset+x) = v[0];
+                                target[1](offset+x) = v[1];
+                                target[2](offset+x) = v[2];
+                            }
+                            else
+                            {
+                                for ( d=0; d<DOut; d++ )
+                                {
+                                    target[d](offset+x) = v[d];
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            size_t numOfPixels = target[0].get_number_of_elements();
+
+            long long n;
+
+            #pragma omp parallel private(n) shared(numOfPixels, target)
+            {
+                size_t ind_target[DIn];
+                coord_type pt_target[DIn];
+                coord_type pt_grid[DIn];
+                T v[DOut];
+                unsigned int d;
+
+                #pragma omp for 
+                for ( n=0; n<(long long)numOfPixels; n++ )
+                {
+                    // target to world
+                    target[0].calculate_index( size_t(n), ind_target );
+
+                    target[0].image_to_world(ind_target, pt_target);
+
+                    // world to grid
+                    this->world_to_grid(pt_target, pt_grid);
+
+                    // evaluate the FFD
+                    this->evaluateFFD(pt_grid, v);
+
+                    if ( DOut == 1 )
+                    {
+                        target[0](n) = v[0];
+                    }
+                    else if ( DOut == 2 )
+                    {
+                        target[0](n) = v[0];
+                        target[1](n) = v[1];
+                    }
+                    else if ( DOut == 3 )
+                    {
+                        target[0](n) = v[0];
+                        target[1](n) = v[1];
+                        target[2](n) = v[2];
+                    }
+                    else
+                    {
+                        for ( d=0; d<DOut; d++ )
+                        {
+                            target[d](n) = v[d];
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in evaluateFFDOnImage(ImageType target[DOut]) const ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDOnArray(ArrayType& target) const
+{
+    GADGET_CHECK_RETURN_FALSE(DOut==1);
+    return this->evaluateFFDOnArray(&target);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+bool gtplusFFDBase<T, CoordType, DIn, DOut>::evaluateFFDOnArray(ArrayType target[DOut]) const
+{
+    try
+    {
+        if ( DIn==2 )
+        {
+            size_t sx = target[0].get_size(0);
+            size_t sy = target[0].get_size(1);
+
+            long long y;
+
+            #pragma omp parallel private(y) shared(sx, sy, target)
+            {
+                coord_type pg[2];
+                T v[DOut];
+                unsigned int d;
+
+                #pragma omp for 
+                for ( y=0; y<(long long)sy; y++ )
+                {
+                    for ( size_t x=0; x<sx; x++ )
+                    {
+                        size_t offset = x + y*sx;
+
+                        this->world_to_grid((CoordType)x, (CoordType)y, pg[0], pg[1]);
+
+                        // evaluate the FFD
+                        this->evaluateFFD(pg, v);
+
+                        if ( DOut == 1 )
+                        {
+                            target[0](offset) = v[0];
+                        }
+                        else if ( DOut == 2 )
+                        {
+                            target[0](offset) = v[0];
+                            target[1](offset) = v[1];
+                        }
+                        else if ( DOut == 3 )
+                        {
+                            target[0](offset) = v[0];
+                            target[1](offset) = v[1];
+                            target[2](offset) = v[2];
+                        }
+                        else
+                        {
+                            for ( d=0; d<DOut; d++ )
+                            {
+                                target[d](offset) = v[d];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else if ( DIn==3 )
+        {
+            size_t sx = target[0].get_size(0);
+            size_t sy = target[0].get_size(1);
+            size_t sz = target[0].get_size(2);
+
+            long long z;
+
+            #pragma omp parallel private(z) shared(sx, sy, sz, target)
+            {
+                coord_type pg[3];
+                T v[DOut];
+                unsigned int d;
+
+                #pragma omp for 
+                for ( z=0; z<(long long)sz; z++ )
+                {
+                    for ( size_t y=0; y<sy; y++ )
+                    {
+                        size_t offset = y*sx + z*sx*sy;
+
+                        for ( size_t x=0; x<sx; x++ )
+                        {
+                            this->world_to_grid((CoordType)x, (CoordType)y, (CoordType)z, pg[0], pg[1], pg[2]);
+
+                            // evaluate the FFD
+                            this->evaluateFFD(pg, v);
+
+                            if ( DOut == 1 )
+                            {
+                                target[0](offset+x) = v[0];
+                            }
+                            else if ( DOut == 2 )
+                            {
+                                target[0](offset+x) = v[0];
+                                target[1](offset+x) = v[1];
+                            }
+                            else if ( DOut == 3 )
+                            {
+                                target[0](offset+x) = v[0];
+                                target[1](offset+x) = v[1];
+                                target[2](offset+x) = v[2];
+                            }
+                            else
+                            {
+                                for ( d=0; d<DOut; d++ )
+                                {
+                                    target[d](offset+x) = v[d];
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            size_t numOfPixels = target[0].get_number_of_elements();
+
+            long long n;
+
+            #pragma omp parallel private(n) shared(numOfPixels, target)
+            {
+                std::vector<size_t> ind_target(DIn);
+                coord_type pt_target[DIn];
+                coord_type pt_grid[DIn];
+                T v[DOut];
+                unsigned int d;
+
+                #pragma omp for 
+                for ( n=0; n<(long long)numOfPixels; n++ )
+                {
+                    ind_target = target[0].calculate_index( size_t(n) );
+
+                    for ( d=0; d<DIn; d++ )
+                    {
+                        pt_target[d] = (CoordType)ind_target[d];
+                    }
+
+                    this->world_to_grid(pt_target, pt_grid);
+
+                    // evaluate the FFD
+                    this->evaluateFFD(pt_grid, v);
+
+                    if ( DOut == 1 )
+                    {
+                        target[0](n) = v[0];
+                    }
+                    else if ( DOut == 2 )
+                    {
+                        target[0](n) = v[0];
+                        target[1](n) = v[1];
+                    }
+                    else if ( DOut == 3 )
+                    {
+                        target[0](n) = v[0];
+                        target[1](n) = v[1];
+                        target[2](n) = v[2];
+                    }
+                    else
+                    {
+                        for ( d=0; d<DOut; d++ )
+                        {
+                            target[d](n) = v[d];
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in evaluateFFD(ArrayType target[DOut]) const ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+bool gtplusFFDBase<T, CoordType, DIn, DOut>::imageToFFDInputsW(ImageType target[DOut], CoordArrayType& pos, ValueArrayType& value)
+{
+    try
+    {
+        size_t N = target[0].get_number_of_elements();
+        pos.create(DIn, N);
+        value.create(DOut, N);
+
+        if ( DIn==2 )
+        {
+            size_t sx = target[0].get_size(0);
+            size_t sy = target[0].get_size(1);
+
+            long long y;
+ 
+            #pragma omp parallel private(y) shared(sx, sy, target, pos, value)
+            {
+                coord_type px, py;
+                unsigned int d;
+
+                #pragma omp for 
+                for ( y=0; y<(long long)sy; y++ )
+                {
+                    for ( size_t x=0; x<sx; x++ )
+                    {
+                        size_t offset = x + y*sx;
+
+                        // target to world
+                        target[0].image_to_world(x, size_t(y), px, py);
+
+                        pos(0, offset) = px;
+                        pos(1, offset) = py;
+
+                        if ( DOut == 1 )
+                        {
+                            value(0, offset) = target[0](offset);
+                        }
+                        else if ( DOut == 2 )
+                        {
+                            value(0, offset) = target[0](offset);
+                            value(1, offset) = target[1](offset);
+                        }
+                        else if ( DOut == 3 )
+                        {
+                            value(0, offset) = target[0](offset);
+                            value(1, offset) = target[1](offset);
+                            value(2, offset) = target[2](offset);
+                        }
+                        else
+                        {
+                            for ( d=0; d<DOut; d++ )
+                            {
+                                value(d, offset) = target[d](offset);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else if ( DIn==3 )
+        {
+            size_t sx = target[0].get_size(0);
+            size_t sy = target[0].get_size(1);
+            size_t sz = target[0].get_size(2);
+
+            long long z;
+
+            #pragma omp parallel private(z) shared(sx, sy, sz, target, pos, value)
+            {
+                coord_type px, py, pz;
+                unsigned int d;
+
+                #pragma omp for 
+                for ( z=0; z<(long long)sz; z++ )
+                {
+                    for ( size_t y=0; y<sy; y++ )
+                    {
+                        size_t offset = y*sx + z*sx*sy;
+
+                        for ( size_t x=0; x<sx; x++ )
+                        {
+                            // target to world
+                            target[0].image_to_world(x, y, size_t(z), px, py, pz);
+
+                            pos(0, offset) = px;
+                            pos(1, offset) = py;
+                            pos(2, offset) = pz;
+
+                            if ( DOut == 1 )
+                            {
+                                value(0, offset) = target[0](offset);
+                            }
+                            else if ( DOut == 2 )
+                            {
+                                value(0, offset) = target[0](offset);
+                                value(1, offset) = target[1](offset);
+                            }
+                            else if ( DOut == 3 )
+                            {
+                                value(0, offset) = target[0](offset);
+                                value(1, offset) = target[1](offset);
+                                value(2, offset) = target[2](offset);
+                            }
+                            else
+                            {
+                                for ( d=0; d<DOut; d++ )
+                                {
+                                    value(d, offset) = target[d](offset);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            long long n;
+
+            #pragma omp parallel private(n) shared(N, target)
+            {
+                size_t ind_target[DIn];
+                coord_type pt_target[DIn];
+                unsigned int d;
+
+                #pragma omp for 
+                for ( n=0; n<(long long)N; n++ )
+                {
+                    // target to world
+                    target[0].calculate_index( size_t(n), ind_target );
+
+                    target[0].image_to_world(ind_target, pt_target);
+
+                    for ( d=0; d<DIn; d++ )
+                    {
+                        pos(d, n) = pt_target[d];
+                    }
+
+                    if ( DOut == 1 )
+                    {
+                        value(0, n) = target[0](n);
+                    }
+                    else if ( DOut == 2 )
+                    {
+                        value(0, n) = target[0](n);
+                        value(1, n) = target[1](n);
+                    }
+                    else if ( DOut == 3 )
+                    {
+                        value(0, n) = target[0](n);
+                        value(1, n) = target[1](n);
+                        value(2, n) = target[2](n);
+                    }
+                    else
+                    {
+                        for ( d=0; d<DOut; d++ )
+                        {
+                            value(d, n) = target[d](n);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in imageToFFDInputsW(ImageType target[DOut], CoordArrayType& pos, ValueArrayType& value) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+bool gtplusFFDBase<T, CoordType, DIn, DOut>::imageToFFDInputsW(ImageType target[DOut], const MaskArrayType& mask, CoordArrayType& pos, ValueArrayType& value)
+{
+    try
+    {
+        size_t N = target[0].get_number_of_elements();
+        if ( mask.get_number_of_elements() != N ) return true;
+
+        size_t n, d;
+        size_t numOfPixels = 0;
+        for ( n=0; n<N; n++ )
+        {
+            if ( mask(n)!=0 ) numOfPixels++;
+        }
+
+        CoordArrayType posTmp;
+        ValueArrayType valueTmp;
+
+        GADGET_CHECK_RETURN_FALSE(this->imageToFFDInputsW(target, posTmp, valueTmp));
+
+        pos.create(DIn, numOfPixels);
+        value.create(DOut, numOfPixels);
+
+        numOfPixels = 0;
+        for ( n=0; n<N; n++ )
+        {
+            if ( mask(n)!=0 )
+            {
+                // copy the DIn coordinates of point n; pos stores CoordType entries
+                memcpy(pos.begin()+numOfPixels*DIn, posTmp.begin()+n*DIn, sizeof(CoordType)*DIn);
+
+                if ( DOut == 1 )
+                {
+                    value(0, numOfPixels) = valueTmp(0, n);
+                }
+                else if ( DOut == 2 )
+                {
+                    value(0, numOfPixels) = valueTmp(0, n);
+                    value(1, numOfPixels) = valueTmp(1, n);
+                }
+                else if ( DOut == 3 )
+                {
+                    value(0, numOfPixels) = valueTmp(0, n);
+                    value(1, numOfPixels) = valueTmp(1, n);
+                    value(2, numOfPixels) = valueTmp(2, n);
+                }
+                else
+                {
+                    for ( d=0; d<DOut; d++ )
+                    {
+                        value(d, numOfPixels) = valueTmp(d, n);
+                    }
+                }
+
+                numOfPixels++;
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in imageToFFDInputsW(ImageType target[DOut], const MaskArrayType& mask, CoordArrayType& pos, ValueArrayType& value) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+bool gtplusFFDBase<T, CoordType, DIn, DOut>::arrayToFFDInputsW(ArrayType target[DOut], CoordArrayType& pos, ValueArrayType& value)
+{
+    try
+    {
+        size_t N = target[0].get_number_of_elements();
+        pos.create(DIn, N);
+        value.create(DOut, N);
+
+        if ( DIn==2 )
+        {
+            size_t sx = target[0].get_size(0);
+            size_t sy = target[0].get_size(1);
+
+            long long y;
+ 
+            #pragma omp parallel private(y) shared(sx, sy, target, pos, value)
+            {
+                unsigned int d;
+
+                #pragma omp for 
+                for ( y=0; y<(long long)sy; y++ )
+                {
+                    for ( size_t x=0; x<sx; x++ )
+                    {
+                        size_t offset = x + y*sx;
+
+                        pos(0, offset) = (CoordType)x;
+                        pos(1, offset) = (CoordType)y;
+
+                        if ( DOut == 1 )
+                        {
+                            value(0, offset) = target[0](offset);
+                        }
+                        else if ( DOut == 2 )
+                        {
+                            value(0, offset) = target[0](offset);
+                            value(1, offset) = target[1](offset);
+                        }
+                        else if ( DOut == 3 )
+                        {
+                            value(0, offset) = target[0](offset);
+                            value(1, offset) = target[1](offset);
+                            value(2, offset) = target[2](offset);
+                        }
+                        else
+                        {
+                            for ( d=0; d<DOut; d++ )
+                            {
+                                value(d, offset) = target[d](offset);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else if ( DIn==3 )
+        {
+            size_t sx = target[0].get_size(0);
+            size_t sy = target[0].get_size(1);
+            size_t sz = target[0].get_size(2);
+
+            long long z;
+
+            #pragma omp parallel private(z) shared(sx, sy, sz, target, pos, value)
+            {
+                unsigned int d;
+
+                #pragma omp for 
+                for ( z=0; z<(long long)sz; z++ )
+                {
+                    for ( size_t y=0; y<sy; y++ )
+                    {
+                        size_t offset = y*sx + z*sx*sy;
+
+                        for ( size_t x=0; x<sx; x++ )
+                        {
+                            pos(0, offset) = (CoordType)x;
+                            pos(1, offset) = (CoordType)y;
+                            pos(2, offset) = (CoordType)z;
+
+                            if ( DOut == 1 )
+                            {
+                                value(0, offset) = target[0](offset);
+                            }
+                            else if ( DOut == 2 )
+                            {
+                                value(0, offset) = target[0](offset);
+                                value(1, offset) = target[1](offset);
+                            }
+                            else if ( DOut == 3 )
+                            {
+                                value(0, offset) = target[0](offset);
+                                value(1, offset) = target[1](offset);
+                                value(2, offset) = target[2](offset);
+                            }
+                            else
+                            {
+                                for ( d=0; d<DOut; d++ )
+                                {
+                                    value(d, offset) = target[d](offset);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            long long n;
+
+            #pragma omp parallel private(n) shared(N, target)
+            {
+                std::vector<size_t> ind_target(DIn);
+                unsigned int d;
+
+                #pragma omp for 
+                for ( n=0; n<(long long)N; n++ )
+                {
+                    ind_target = target[0].calculate_index( size_t(n) );
+
+                    for ( d=0; d<DIn; d++ )
+                    {
+                        pos(d, n) = (CoordType)ind_target[d];
+                    }
+
+                    if ( DOut == 1 )
+                    {
+                        value(0, n) = target[0](n);
+                    }
+                    else if ( DOut == 2 )
+                    {
+                        value(0, n) = target[0](n);
+                        value(1, n) = target[1](n);
+                    }
+                    else if ( DOut == 3 )
+                    {
+                        value(0, n) = target[0](n);
+                        value(1, n) = target[1](n);
+                        value(2, n) = target[2](n);
+                    }
+                    else
+                    {
+                        for ( d=0; d<DOut; d++ )
+                        {
+                            value(d, n) = target[d](n);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in imageToFFDInputsW(ImageType target[DOut], CoordArrayType& pos, ValueArrayType& value) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+bool gtplusFFDBase<T, CoordType, DIn, DOut>::arrayToFFDInputsW(ArrayType target[DOut], const MaskArrayType& mask, CoordArrayType& pos, ValueArrayType& value)
+{
+    try
+    {
+        size_t N = target[0].get_number_of_elements();
+        if ( mask.get_number_of_elements() != N ) return true;
+
+        size_t n, d;
+        size_t numOfPixels = 0;
+        for ( n=0; n<N; n++ )
+        {
+            if ( mask(n)!=0 ) numOfPixels++;
+        }
+
+        CoordArrayType posTmp;
+        ValueArrayType valueTmp;
+
+        GADGET_CHECK_RETURN_FALSE(this->arrayToFFDInputsW(target, posTmp, valueTmp));
+
+        pos.create(DIn, numOfPixels);
+        value.create(DOut, numOfPixels);
+
+        numOfPixels = 0;
+        for ( n=0; n<N; n++ )
+        {
+            if ( mask(n)!=0 )
+            {
+                // copy the DIn coordinates of point n; pos stores CoordType entries
+                memcpy(pos.begin()+numOfPixels*DIn, posTmp.begin()+n*DIn, sizeof(CoordType)*DIn);
+
+                if ( DOut == 1 )
+                {
+                    value(0, numOfPixels) = valueTmp(0, n);
+                }
+                else if ( DOut == 2 )
+                {
+                    value(0, numOfPixels) = valueTmp(0, n);
+                    value(1, numOfPixels) = valueTmp(1, n);
+                }
+                else if ( DOut == 3 )
+                {
+                    value(0, numOfPixels) = valueTmp(0, n);
+                    value(1, numOfPixels) = valueTmp(1, n);
+                    value(2, numOfPixels) = valueTmp(2, n);
+                }
+                else
+                {
+                    for ( d=0; d<DOut; d++ )
+                    {
+                        value(d, numOfPixels) = valueTmp(d, n);
+                    }
+                }
+
+                numOfPixels++;
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in imageToFFDInputsW(ImageType target[DOut], const MaskArrayType& mask, CoordArrayType& pos, ValueArrayType& value) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::ffdApproxImage(ImageType target[DOut], real_value_type& totalResidual, size_t numOfRefinement)
+{
+    try
+    {
+        size_t N = target[0].get_number_of_elements();
+
+        CoordArrayType pos(DIn, N);
+        ValueArrayType value(DOut, N);
+        ValueArrayType residual(DOut, N);
+
+        GADGET_CHECK_RETURN_FALSE(this->imageToFFDInputsW(target, pos, value));
+        GADGET_CHECK_RETURN_FALSE(this->ffdApproxW(pos, value, residual, totalResidual, N, numOfRefinement));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in ffdApproxImage(ImageType target[DOut], real_value_type& totalResidual, size_t numOfRefinement) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::ffdApproxImage(ImageType target[DOut], real_value_type& totalResidual, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement)
+{
+    try
+    {
+        size_t N = target[0].get_number_of_elements();
+
+        CoordArrayType pos(DIn, N);
+        ValueArrayType value(DOut, N);
+        ValueArrayType residual(DOut, N);
+
+        GADGET_CHECK_RETURN_FALSE(this->imageToFFDInputsW(target, pos, value));
+        GADGET_CHECK_RETURN_FALSE(this->ffdApproxW(pos, value, residual, totalResidual, N, numOfRefinement, thresResidual, maxNumOfRefinement));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in ffdApproxImage(ImageType target[DOut], real_value_type& totalResidual, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::ffdApproxImage(ImageType& target, real_value_type& totalResidual, size_t numOfRefinement)
+{
+    GADGET_CHECK_RETURN_FALSE(DOut==1);
+    return this->ffdApproxImage(&target, totalResidual, numOfRefinement);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::ffdApproxImage(ImageType& target, real_value_type& totalResidual, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement)
+{
+    GADGET_CHECK_RETURN_FALSE(DOut==1);
+    return this->ffdApproxImage(&target, totalResidual, numOfRefinement, thresResidual, maxNumOfRefinement);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::ffdApproxImage(ImageType target[DOut], const MaskArrayType& mask, real_value_type& totalResidual, size_t numOfRefinement)
+{
+    try
+    {
+        CoordArrayType pos;
+        ValueArrayType value;
+        ValueArrayType residual;
+
+        GADGET_CHECK_RETURN_FALSE(this->imageToFFDInputsW(target, mask, pos, value));
+
+        size_t N = pos.get_size(1);
+        GADGET_CHECK_RETURN_FALSE(this->ffdApproxW(pos, value, residual, totalResidual, N, numOfRefinement));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in ffdApproxImage(ImageType target[DOut], const MaskArrayType& mask, real_value_type& totalResidual, size_t numOfRefinement) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::ffdApproxImage(ImageType target[DOut], const MaskArrayType& mask, real_value_type& totalResidual, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement)
+{
+    try
+    {
+        CoordArrayType pos;
+        ValueArrayType value;
+        ValueArrayType residual;
+
+        GADGET_CHECK_RETURN_FALSE(this->imageToFFDInputsW(target, mask, pos, value));
+
+        size_t N = pos.get_size(1);
+        GADGET_CHECK_RETURN_FALSE(this->ffdApproxW(pos, value, residual, totalResidual, N, numOfRefinement, thresResidual, maxNumOfRefinement));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in ffdApproxImage(ImageType target[DOut], const MaskArrayType& mask, real_value_type& totalResidual, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::ffdApproxArray(ArrayType target[DOut], real_value_type& totalResidual, size_t numOfRefinement)
+{
+    try
+    {
+        size_t N = target[0].get_number_of_elements();
+
+        CoordArrayType pos(DIn, N);
+        ValueArrayType value(DOut, N);
+        ValueArrayType residual(DOut, N);
+
+        GADGET_CHECK_RETURN_FALSE(this->arrayToFFDInputsW(target, pos, value));
+        GADGET_CHECK_RETURN_FALSE(this->ffdApproxW(pos, value, residual, totalResidual, N, numOfRefinement));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in ffdApproxArray(ArrayType target[DOut], real_value_type& totalResidual, size_t numOfRefinement) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::ffdApproxArray(ArrayType target[DOut], real_value_type& totalResidual, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement)
+{
+    try
+    {
+        size_t N = target[0].get_number_of_elements();
+
+        CoordArrayType pos(DIn, N);
+        ValueArrayType value(DOut, N);
+        ValueArrayType residual(DOut, N);
+
+        GADGET_CHECK_RETURN_FALSE(this->arrayToFFDInputsW(target, pos, value));
+        GADGET_CHECK_RETURN_FALSE(this->ffdApproxW(pos, value, residual, totalResidual, N, numOfRefinement, thresResidual, maxNumOfRefinement));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in ffdApproxArray(ArrayType target[DOut], real_value_type& totalResidual, size_t numOfRefinement) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::ffdApproxArray(ArrayType& target, real_value_type& totalResidual, size_t numOfRefinement)
+{
+    GADGET_CHECK_RETURN_FALSE(DOut==1);
+    return this->ffdApproxArray(&target, totalResidual, numOfRefinement);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::ffdApproxArray(ArrayType& target, real_value_type& totalResidual, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement)
+{
+    GADGET_CHECK_RETURN_FALSE(DOut==1);
+    return this->ffdApproxArray(&target, totalResidual, numOfRefinement, thresResidual, maxNumOfRefinement);
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::ffdApproxArray(ArrayType target[DOut], const MaskArrayType& mask, real_value_type& totalResidual, size_t numOfRefinement)
+{
+    try
+    {
+        CoordArrayType pos;
+        ValueArrayType value;
+        ValueArrayType residual;
+
+        GADGET_CHECK_RETURN_FALSE(this->arrayToFFDInputsW(target, mask, pos, value));
+
+        size_t N = pos.get_size(1);
+        GADGET_CHECK_RETURN_FALSE(this->ffdApproxW(pos, value, residual, totalResidual, N, numOfRefinement));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in ffdApproxArray(ArrayType target[DOut], const MaskArrayType& mask, real_value_type& totalResidual, size_t numOfRefinement) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusFFDBase<T, CoordType, DIn, DOut>::ffdApproxArray(ArrayType target[DOut], const MaskArrayType& mask, real_value_type& totalResidual, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement)
+{
+    try
+    {
+        CoordArrayType pos;
+        ValueArrayType value, residual;
+
+        GADGET_CHECK_RETURN_FALSE(this->arrayToFFDInputsW(target, mask, pos, value));
+
+        size_t N = pos.get_size(1);
+        GADGET_CHECK_RETURN_FALSE(this->ffdApproxW(pos, value, residual, totalResidual, N, numOfRefinement, thresResidual, maxNumOfRefinement));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in ffdApproxArray(ArrayType target[DOut], const MaskArrayType& mask, real_value_type& totalResidual, size_t numOfRefinement) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+bool gtplusFFDBase<T, CoordType, DIn, DOut>::clear(T v)
+{
+    try
+    {
+        unsigned int d;
+
+        if ( std::abs(v) == 0 )
+        {
+            for ( d=0; d<DOut; d++ )
+            {
+                Gadgetron::clear(ctrl_pt_[d]);
+            }
+        }
+        else
+        {
+            for ( d=0; d<DOut; d++ )
+            {
+                Gadgetron::fill(ctrl_pt_[d], v);
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in gtplusFFDBase<T, CoordType, DIn, DOut>::clear(T v) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+void gtplusFFDBase<T, CoordType, DIn, DOut>::print(std::ostream& os) const
+{
+    using namespace std;
+
+    os << "---------------------- GTPlus Free Form Deformation ------------------" << endl;
+    os << "Define the interface for Free Form Deformation (FFD) " << endl;
+    os << "----------------------------------------------------------------------" << endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/FreeFormDeformation/gtplusMLFFD.h b/toolboxes/gtplus/algorithm/FreeFormDeformation/gtplusMLFFD.h
new file mode 100644
index 0000000..9fcaf6c
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/FreeFormDeformation/gtplusMLFFD.h
@@ -0,0 +1,436 @@
+/** \file       gtplusMLFFD.h
+
+    \brief      Implement the gtPlus multi-level FreeFormDeformation (FFD).
+                At every level, the fitting residual from the previous level is approximated;
+                the final fitted value is the sum of all levels.
+
+    \author     Hui Xue
+*/
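+
+/*
+    In short, writing the levels as F_0 ... F_{L-1}: F_0 is fitted to the input values,
+    every further level F_l is fitted to the residual left by the previous levels, and
+    evaluating the multi-level FFD at a point p returns the accumulated sum
+    F_0(p) + F_1(p) + ... + F_{L-1}(p).
+*/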
+
+#pragma once
+
+#include "gtplusFFDBase.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut>
+class gtplusMLFFD : public gtplusFFDBase<T, CoordType, DIn, DOut>
+{
+public:
+
+    typedef gtplusFFDBase<T, CoordType, DIn, DOut> BaseClass;
+    typedef gtplusFFDBase<T, CoordType, DIn, DOut> Self;
+
+    typedef typename BaseClass::bspline_float_type real_value_type;
+    typedef real_value_type bspline_float_type;
+
+    typedef typename BaseClass::coord_type coord_type;
+
+    enum { D = DIn };
+
+    typedef typename BaseClass::CoordArrayType      CoordArrayType;
+    typedef typename BaseClass::ValueArrayType      ValueArrayType;
+    typedef typename BaseClass::ArrayType           ArrayType;
+    typedef typename BaseClass::FFDCtrlPtGridType   FFDCtrlPtGridType;
+    typedef typename BaseClass::PointType           PointType;
+    typedef typename BaseClass::ImageType           ImageType;
+
+    typedef std::vector<BaseClass*> FFDArrayType;
+
+    gtplusMLFFD(bool delete_data_on_destruct=false);
+    gtplusMLFFD(const FFDArrayType& a, bool delete_data_on_destruct=false);
+    gtplusMLFFD(const Self& a);
+
+    virtual ~gtplusMLFFD();
+
+    size_t get_size() const { return ml_ffd_.size(); }
+
+    /// get the FFD array
+    FFDArrayType& getFFDArray();
+    const FFDArrayType& getFFDArray() const;
+
+    /// set the delete flag
+    bool delete_data_on_destruct() const { return delete_data_on_destruct_; }
+    void delete_data_on_destruct(bool flag) { delete_data_on_destruct_ = flag; }
+
+    /// evaluate the FFD at a grid location
+    /// the input points are in the FFD grid
+    virtual bool evaluateFFD(const CoordType pt[D], T r[DOut]) const;
+
+    /// evaluate the 1st order derivative of FFD at a grid location
+    /// deriv: derivative for all D dimensions and all DOut values
+    virtual bool evaluateFFDDerivative(const CoordType pt[D], T deriv[D][DOut]) const;
+
+    virtual bool evaluateFFDDX(const CoordType pt[D], T dx[DOut]) const;
+    virtual bool evaluateFFDDY(const CoordType pt[D], T dy[DOut]) const;
+    virtual bool evaluateFFDDZ(const CoordType pt[D], T dz[DOut]) const;
+    virtual bool evaluateFFDDS(const CoordType pt[D], T ds[DOut]) const;
+
+    /// evaluate the 2nd order derivative of FFD at a grid location
+    /// dderiv : D*D vector, stores dxx dxy dxz ...; dyx dyy dyz ...; dzx dzy dzz ...
+    virtual bool evaluateFFDSecondOrderDerivative(const CoordType pt[D], T dderiv[D*D][DOut]) const;
+
+    /// compute the FFD approximation once
+    /// pos : the position of input points, DIn by N
+    /// value : the value on input points, DOut by N
+    /// residual : the approximation residual after computing FFD, DOut by N
+    /// N : the number of points
+    virtual bool ffdApprox(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, T& totalResidual, size_t N);
+    virtual bool ffdApprox(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement);
+
+    /// refine the FFD
+    virtual bool refine();
+
+    /// general print function
+    virtual void print(std::ostream& os) const;
+
+protected: 
+
+    FFDArrayType ml_ffd_;
+
+    /// if true, all stored FFD levels will be deleted on destruction
+    bool delete_data_on_destruct_;
+
+};
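+
+/*
+    Usage sketch (illustrative only; "ConcreteFFDType" is a hypothetical placeholder for
+    any gtplusFFDBase-derived implementation and is not defined in this file):
+
+    typedef gtplusMLFFD<float, double, 2, 1> MLFFDType;
+
+    MLFFDType::FFDArrayType levels(2);
+    levels[0] = new ConcreteFFDType(...);   // coarse level, fitted to the data
+    levels[1] = new ConcreteFFDType(...);   // finer level, fitted to the residual of level 0
+
+    MLFFDType mlffd(levels, true);          // delete the stored levels on destruction
+
+    double pt[2] = { 12.5, 30.0 };          // point in the FFD grid
+    float r[1];
+    mlffd.evaluateFFD(pt, r);               // r[0] is the sum of both levels at pt
+*/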
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+gtplusMLFFD<T, CoordType, DIn, DOut>::gtplusMLFFD(bool delete_data_on_destruct) : delete_data_on_destruct_(delete_data_on_destruct)
+{
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+gtplusMLFFD<T, CoordType, DIn, DOut>::gtplusMLFFD(const FFDArrayType& a, bool delete_data_on_destruct) : delete_data_on_destruct_(delete_data_on_destruct)
+{
+    ml_ffd_.resize(a.size());
+    for ( size_t ii=0; ii<a.size(); ii++ )
+    {
+        if ( a[ii] != NULL )
+        {
+            ml_ffd_[ii] = a[ii];
+        }
+    }
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+gtplusMLFFD<T, CoordType, DIn, DOut>::
+gtplusMLFFD(const Self& a)
+{
+    delete_data_on_destruct_ = false;
+    ml_ffd_ = a.getFFDArray();
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+gtplusMLFFD<T, CoordType, DIn, DOut>::~gtplusMLFFD()
+{
+    if ( delete_data_on_destruct_ )
+    {
+        for ( size_t ii=0; ii<ml_ffd_.size(); ii++ )
+        {
+            if ( ml_ffd_[ii] != NULL )
+            {
+                delete ml_ffd_[ii];
+                ml_ffd_[ii] = NULL;
+            }
+        }
+    }
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusMLFFD<T, CoordType, DIn, DOut>::evaluateFFD(const CoordType pt[D], T r[DOut]) const
+{
+    unsigned int d;
+    for ( d=0; d<DOut; d++ )
+    {
+        r[d] = 0;
+    }
+
+    T rLevel[DOut];
+
+    size_t ii;
+    for (ii=0; ii<ml_ffd_.size(); ii++)
+    {
+        if ( ml_ffd_[ii] == NULL ) continue;
+
+        GADGET_CHECK_RETURN_FALSE(ml_ffd_[ii]->evaluateFFD(pt, rLevel));
+        for ( d=0; d<DOut; d++ )
+        {
+            r[d] += rLevel[d];
+        }
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusMLFFD<T, CoordType, DIn, DOut>::evaluateFFDDerivative(const CoordType pt[D], T deriv[D][DOut]) const
+{
+    unsigned int d, d2;
+    for ( d=0; d<D; d++ )
+    {
+        for ( d2=0; d2<DOut; d2++ )
+        {
+            deriv[d][d2] = 0;
+        }
+    }
+
+    T derivLevel[D][DOut];
+
+    size_t ii;
+    for (ii=0; ii<ml_ffd_.size(); ii++)
+    {
+        if ( ml_ffd_[ii] == NULL ) continue;
+
+        GADGET_CHECK_RETURN_FALSE(ml_ffd_[ii]->evaluateFFDDerivative(pt, derivLevel));
+
+        for ( d=0; d<D; d++ )
+        {
+            for ( d2=0; d2<DOut; d2++ )
+            {
+                deriv[d][d2] += derivLevel[d][d2];
+            }
+        }
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusMLFFD<T, CoordType, DIn, DOut>::evaluateFFDDX(const CoordType pt[D], T dx[DOut]) const
+{
+    unsigned int d;
+    for ( d=0; d<DOut; d++ )
+    {
+        dx[d] = 0;
+    }
+
+    T dxLevel[DOut];
+
+    size_t ii;
+    for (ii=0; ii<ml_ffd_.size(); ii++)
+    {
+        if ( ml_ffd_[ii] == NULL ) continue;
+
+        GADGET_CHECK_RETURN_FALSE(ml_ffd_[ii]->evaluateFFDDX(pt, dxLevel));
+
+        for ( d=0; d<DOut; d++ )
+        {
+            dx[d] += dxLevel[d];
+        }
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusMLFFD<T, CoordType, DIn, DOut>::evaluateFFDDY(const CoordType pt[D], T dy[DOut]) const
+{
+    unsigned int d;
+    for ( d=0; d<DOut; d++ )
+    {
+        dy[d] = 0;
+    }
+
+    T dyLevel[DOut];
+
+    size_t ii;
+    for (ii=0; ii<ml_ffd_.size(); ii++)
+    {
+        if ( ml_ffd_[ii] == NULL ) continue;
+
+        GADGET_CHECK_RETURN_FALSE(ml_ffd_[ii]->evaluateFFDDY(pt, dyLevel));
+
+        for ( d=0; d<DOut; d++ )
+        {
+            dy[d] += dyLevel[d];
+        }
+    }
+
+    return true;
+}
+
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusMLFFD<T, CoordType, DIn, DOut>::evaluateFFDDZ(const CoordType pt[D], T dz[DOut]) const
+{
+    unsigned int d;
+    for ( d=0; d<DOut; d++ )
+    {
+        dz[d] = 0;
+    }
+
+    T dzLevel[DOut];
+
+    size_t ii;
+    for (ii=0; ii<ml_ffd_.size(); ii++)
+    {
+        if ( ml_ffd_[ii] == NULL ) continue;
+
+        GADGET_CHECK_RETURN_FALSE(ml_ffd_[ii]->evaluateFFDDZ(pt, dzLevel));
+
+        for ( d=0; d<DOut; d++ )
+        {
+            dz[d] += dzLevel[d];
+        }
+    }
+
+    return true;
+}
+
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusMLFFD<T, CoordType, DIn, DOut>::evaluateFFDDS(const CoordType pt[D], T ds[DOut]) const
+{
+    unsigned int d;
+    for ( d=0; d<DOut; d++ )
+    {
+        ds[d] = 0;
+    }
+
+    T dsLevel[DOut];
+
+    size_t ii;
+    for (ii=0; ii<ml_ffd_.size(); ii++)
+    {
+        if ( ml_ffd_[ii] == NULL ) continue;
+
+        GADGET_CHECK_RETURN_FALSE(ml_ffd_[ii]->evaluateFFDDS(pt, dsLevel));
+
+        for ( d=0; d<DOut; d++ )
+        {
+            ds[d] += dsLevel[d];
+        }
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusMLFFD<T, CoordType, DIn, DOut>::evaluateFFDSecondOrderDerivative(const CoordType pt[D], T dderiv[D*D][DOut]) const
+{
+    unsigned int d, d2;
+    for ( d=0; d<D*D; d++ )
+    {
+        for ( d2=0; d2<DOut; d2++ )
+        {
+            dderiv[d][d2] = 0;
+        }
+    }
+
+    T dderivLevel[D*D][DOut];
+
+    size_t ii;
+    for (ii=0; ii<ml_ffd_.size(); ii++)
+    {
+        if ( ml_ffd_[ii] == NULL ) continue;
+
+        GADGET_CHECK_RETURN_FALSE(ml_ffd_[ii]->evaluateFFDSecondOrderDerivative(pt, dderivLevel));
+
+        for ( d=0; d<D*D; d++ )
+        {
+            for ( d2=0; d2<DOut; d2++ )
+            {
+                dderiv[d][d2] += dderivLevel[d][d2];
+            }
+        }
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusMLFFD<T, CoordType, DIn, DOut>::ffdApprox(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, T& totalResidual, size_t N)
+{
+    ValueArrayType valueLevel(value);
+
+    size_t ii;
+    for (ii=0; ii<ml_ffd_.size(); ii++)
+    {
+        if ( ml_ffd_[ii] == NULL ) continue;
+
+        GADGET_CHECK_RETURN_FALSE(ml_ffd_[ii]->ffdApprox(pos, valueLevel, residual, totalResidual, N));
+        valueLevel = residual;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusMLFFD<T, CoordType, DIn, DOut>::ffdApprox(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(pos.get_size(0)==DIn);
+        GADGET_CHECK_RETURN_FALSE(pos.get_size(1)==N);
+
+        GADGET_CHECK_RETURN_FALSE(value.get_size(0)==DOut);
+        GADGET_CHECK_RETURN_FALSE(value.get_size(1)==N);
+
+        totalResidual = 0;
+
+        if ( !residual.dimensions_equal(value) )
+        {
+            residual.create(value.get_dimensions());
+            Gadgetron::clear(residual);
+        }
+
+        ValueArrayType valueLevel(value);
+        size_t numOfRefinementLevel(0);
+
+        size_t ii;
+        for (ii=0; ii<ml_ffd_.size(); ii++)
+        {
+            if ( ml_ffd_[ii] == NULL ) continue;
+
+            GADGET_CHECK_RETURN_FALSE(ml_ffd_[ii]->ffdApprox(pos, valueLevel, residual, totalResidual, N, numOfRefinementLevel, thresResidual, maxNumOfRefinement));
+            numOfRefinement += numOfRefinementLevel;
+            valueLevel = residual;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in ffdApprox(const CoordArrayType& pos, ValueArrayType& value, ValueArrayType& residual, real_value_type& totalResidual, size_t N, size_t& numOfRefinement, real_value_type thresResidual, size_t maxNumOfRefinement) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+inline bool gtplusMLFFD<T, CoordType, DIn, DOut>::refine()
+{
+    size_t ii;
+    for (ii=0; ii<ml_ffd_.size(); ii++)
+    {
+        if ( ml_ffd_[ii] == NULL ) continue;
+
+        GADGET_CHECK_RETURN_FALSE(ml_ffd_[ii]->refine());
+    }
+
+    return true;
+}
+
+template <typename T, typename CoordType, unsigned int DIn, unsigned int DOut> 
+void gtplusMLFFD<T, CoordType, DIn, DOut>::print(std::ostream& os) const
+{
+    using namespace std;
+
+    os << "---------------------- GTPlus Multi-level Free Form Deformation ------------------" << endl;
+    os << "Number of level is : " << ml_ffd_.size() << endl;
+
+    size_t ii;
+    for (ii=0; ii<ml_ffd_.size(); ii++)
+    {
+        os << "Level " << ii << " : " << endl;
+        if ( ml_ffd_[ii]!=NULL )
+        {
+            ml_ffd_[ii]->print(os);
+        }
+        else
+        {
+            os << "--> Pointer is NULL ... " << endl;
+        }
+    }
+    os << "------------------------------------------------------------------------------" << endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusAlgorithmBase.h b/toolboxes/gtplus/algorithm/gtPlusAlgorithmBase.h
new file mode 100644
index 0000000..421ad2f
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusAlgorithmBase.h
@@ -0,0 +1,70 @@
+/** \file       gtPlusAlgorithmBase.h
+    \brief      Base class for GtPlus algorithm
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "ismrmrd/ismrmrd.h"
+#include "GadgetronTimer.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusIOAnalyze.h"
+
+#ifdef USE_OMP
+    #include "omp.h"
+#endif // USE_OMP
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusAlgorithmBase
+{
+public:
+
+    gtPlusAlgorithmBase();
+    virtual ~gtPlusAlgorithmBase();
+
+    virtual void printInfo(std::ostream& os);
+
+    // clock for timing
+    Gadgetron::GadgetronTimer gt_timer1_;
+    Gadgetron::GadgetronTimer gt_timer2_;
+    Gadgetron::GadgetronTimer gt_timer3_;
+
+    bool performTiming_;
+
+    // exporter
+    Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+    // debug folder
+    std::string debugFolder_;
+
+    // util
+    gtPlusISMRMRDReconUtil<T> gtPlus_util_;
+    gtPlusISMRMRDReconUtilComplex<T> gtPlus_util_complex_;
+};
+
+template <typename T> 
+gtPlusAlgorithmBase<T>::gtPlusAlgorithmBase() : performTiming_(false)
+{
+    gt_timer1_.set_timing_in_destruction(false);
+    gt_timer2_.set_timing_in_destruction(false);
+    gt_timer3_.set_timing_in_destruction(false);
+}
+
+template <typename T> 
+gtPlusAlgorithmBase<T>::~gtPlusAlgorithmBase()
+{
+}
+
+template <typename T> 
+void gtPlusAlgorithmBase<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD Algorithm ------------------" << endl;
+    os << "Implementation of algorithms for ISMRMRD package" << endl;
+    os << "----------------------------------------------------------" << endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusDataFidelityOperator.h b/toolboxes/gtplus/algorithm/gtPlusDataFidelityOperator.h
new file mode 100644
index 0000000..7fb6b55
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusDataFidelityOperator.h
@@ -0,0 +1,161 @@
+/** \file       gtPlusDataFidelityOperator.h
+    \brief      Implement data fidelity operator
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusDataFidelityOperator : public gtPlusOperator<T>
+{
+public:
+
+    typedef gtPlusOperator<T> BaseClass;
+
+    gtPlusDataFidelityOperator();
+    virtual ~gtPlusDataFidelityOperator();
+
+    virtual void printInfo(std::ostream& os);
+
+    // forward operator
+    // D*x
+    virtual bool forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y);
+
+    // adjoint operator
+    // D'x
+    virtual bool adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y);
+
+    // gradient of ||Dx-y||2
+    // 2*D'*(Dx-y)
+    virtual bool grad(const hoNDArray<T>& x, hoNDArray<T>& g);
+
+    // L2 norm of ||Dx-y||2
+    virtual bool obj(const hoNDArray<T>& x, T& obj);
+
+    virtual bool unitary() const { return true; }
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+
+protected:
+
+    using BaseClass::acquired_points_;
+    using BaseClass::acquired_points_indicator_;
+    using BaseClass::unacquired_points_indicator_;
+    using BaseClass::coil_senMap_;
+
+    // helper memory
+    using BaseClass::kspace_;
+    using BaseClass::complexIm_;
+    using BaseClass::res_after_apply_kernel_;
+    using BaseClass::res_after_apply_kernel_sum_over_;
+
+    using BaseClass::kspace_Managed_;
+    using BaseClass::complexIm_Managed_;
+    using BaseClass::res_after_apply_kernel_Managed_;
+    using BaseClass::res_after_apply_kernel_sum_over_Managed_;
+};
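+
+/*
+    Usage sketch (illustrative only; it assumes the acquired k-space data and the sampling
+    indicator have already been set through the gtPlusOperator base class, whose setters are
+    not part of this file, and "kspaceDims" stands for the dimensions of the acquired data):
+
+    typedef std::complex<float> ValueType;
+
+    gtPlusDataFidelityOperator<ValueType> D;
+    // ... set acquired_points_ / acquired_points_indicator_ via the base class ...
+
+    hoNDArray<ValueType> x(kspaceDims);     // current k-space estimate
+    hoNDArray<ValueType> g;
+
+    D.grad(x, g);                           // g = 2 * D' * (D*x - y)
+
+    ValueType obj;
+    D.obj(x, obj);                          // obj = || D*x - y ||_2^2
+*/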
+
+template <typename T> 
+gtPlusDataFidelityOperator<T>::gtPlusDataFidelityOperator() : BaseClass()
+{
+
+}
+
+template <typename T> 
+gtPlusDataFidelityOperator<T>::~gtPlusDataFidelityOperator()
+{
+}
+
+template <typename T> 
+bool gtPlusDataFidelityOperator<T>::
+forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y)
+{
+    try
+    {
+        Gadgetron::multiply(acquired_points_indicator_, x, y);
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusDataFidelityOperator<T>::forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool gtPlusDataFidelityOperator<T>::
+adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y)
+{
+    try
+    {
+        Gadgetron::multiply(acquired_points_indicator_, x, y);
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusDataFidelityOperator<T>::adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool gtPlusDataFidelityOperator<T>::
+grad(const hoNDArray<T>& x, hoNDArray<T>& g)
+{
+    try
+    {
+        // 2D'*(Dx-y)
+        Gadgetron::multiply(acquired_points_indicator_, x, g);
+        Gadgetron::subtract(g, *acquired_points_, g);
+        Gadgetron::scal(T(2.0), g);
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusDataFidelityOperator<T>::grad(const hoNDArray<T>& x, hoNDArray<T>& g) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusDataFidelityOperator<T>::
+obj(const hoNDArray<T>& x, T& obj)
+{
+    try
+    {
+        Gadgetron::multiply(acquired_points_indicator_, x, kspace_);
+        Gadgetron::subtract(kspace_, *acquired_points_, kspace_);
+        Gadgetron::dotc(kspace_, kspace_, obj);
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusDataFidelityOperator<T>::obj(const hoNDArray<T>& x, T& obj) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+void gtPlusDataFidelityOperator<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD data fidelity operator -----------------------" << endl;
+    os << "Data fidelity operator for gtPlus ISMRMRD package" << endl;
+    os << "----------------------------------------------------------------------------" << endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusGRAPPA.h b/toolboxes/gtplus/algorithm/gtPlusGRAPPA.h
new file mode 100644
index 0000000..e42e7e2
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusGRAPPA.h
@@ -0,0 +1,1016 @@
+
+/** \file   gtPlusGRAPPA.h
+    \brief  GRAPPA implementation for 2D and 3D MRI parallel imaging
+    \author Hui Xue
+
+    References to the implementation can be found in:
+
+    Griswold MA, Jakob PM, Heidemann RM, Nittka M, Jellus V, Wang J, Kiefer B, Haase A. 
+    Generalized autocalibrating partially parallel acquisitions (GRAPPA). 
+    Magnetic Resonance in Medicine 2002;47(6):1202-1210.
+
+    Kellman P, Epstein FH, McVeigh ER. 
+    Adaptive sensitivity encoding incorporating temporal filtering (TSENSE). 
+    Magnetic Resonance in Medicine 2001;45(5):846-852.
+
+    Breuer FA, Kellman P, Griswold MA, Jakob PM.
+    Dynamic autocalibrated parallel imaging using temporal GRAPPA (TGRAPPA). 
+    Magnetic Resonance in Medicine 2005;53(4):981-985.
+
+    Saybasili H, Kellman P, Griswold MA, Derbyshire JA, Guttman MA.
+    HTGRAPPA: Real-time B1-weighted image domain TGRAPPA reconstruction. 
+    Magnetic Resonance in Medicine 2009;61(6): 1425-1433. 
+*/
+
+#pragma once
+
+#include "gtPlusAlgorithmBase.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusGRAPPA : public gtPlusAlgorithmBase<T>
+{
+public:
+
+    typedef gtPlusAlgorithmBase<T> BaseClass;
+
+    gtPlusGRAPPA() : calib_use_gpu_(true), BaseClass() {}
+    virtual ~gtPlusGRAPPA() {}
+
+    virtual void printInfo(std::ostream& os);
+
+    // get the kernel pattern, given the acceleration factor and kernel size
+    bool kerPattern(std::vector<int>& kE1, std::vector<int>& oE1, size_t accelFactor, size_t kNE1, bool fitItself);
+
+    // grappa calibration for 2D case
+    // acsSrc : [RO E1 srcCHA]
+    // acsDst : [RO E1 dstCHA]
+    // ker : [kRO kE1 srcCHA dstCHA oE1]
+    bool calib(const ho3DArray<T>& acsSrc, const ho3DArray<T>& acsDst, double thres, 
+            size_t kRO, const std::vector<int>& kE1, const std::vector<int>& oE1, ho5DArray<T>& ker);
+
+    // image domain kernel for 2D kernel
+    // kIm: image domain kernel [RO E1 srcCHA dstCHA]
+    bool imageDomainKernel(const ho5DArray<T>& ker, size_t kRO, const std::vector<int>& kE1, const std::vector<int>& oE1, size_t ro, size_t e1, hoNDArray<T>& kIm);
+
+    // grappa calibration for 3D case
+    // acsSrc : [RO E1 E2 srcCHA]
+    // acsDst : [RO E1 E2 dstCHA]
+    // ker : [kRO kE1 kE2 srcCHA dstCHA oE1 oE2]
+    bool calib3D(const ho4DArray<T>& acsSrc, const ho4DArray<T>& acsDst, double thres, double overDetermineRatio, 
+            size_t kRO, const std::vector<int>& kE1, const std::vector<int>& kE2, const std::vector<int>& oE1, const std::vector<int>& oE2, ho7DArray<T>& ker);
+
+    // image domain kernel for 3D kernel
+    // kIm: image domain kernel [RO E1 E2 srcCHA dstCHA]
+    bool imageDomainKernel3D(const ho7DArray<T>& ker, size_t kRO, const std::vector<int>& kE1, const std::vector<int>& kE2, const std::vector<int>& oE1, const std::vector<int>& oE2, size_t ro, size_t e1, size_t e2, hoNDArray<T>& kIm);
+
+    // convert the calibrated kernel to the convolution kernel in kspace
+    // if ROis3rdDim == true, the kernel dimension is [E1 E2 RO], otherwise [RO E1 E2]
+    bool kspaceDomainConvKernel3D(const ho7DArray<T>& ker, size_t kRO, const std::vector<int>& kE1, const std::vector<int>& kE2, const std::vector<int>& oE1, const std::vector<int>& oE2, ho5DArray<T>& convKerFlip, bool ROis3rdDim=true);
+
+    // image domain kernel for 3D kernel, only RO direction is converted to image domain
+    // E1 and E2 stay in the kspace domain
+    // kImRO: kspace-image hybrid kernel [convE1 convE2 RO srcCHA dstCHA]
+    bool imageDomainKernelRO3D(const ho7DArray<T>& ker, size_t kRO, const std::vector<int>& kE1, const std::vector<int>& kE2, const std::vector<int>& oE1, const std::vector<int>& oE2, size_t ro, hoNDArray<T>& kImRO);
+
+    // image domain kernel for 3D kernel, E1 and E2 directions are converted to image domain
+    // kImRO : kspace-image hybrid kernel whose first two dimensions are E1 and E2 and remain in kspace
+    bool imageDomainKernelE1E2RO(const hoNDArray<T>& kImRO, size_t e1, size_t e2, hoNDArray<T>& kImE1E2RO);
+
+    // use gpu in the kernel calibration
+    bool calib_use_gpu_;
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+};
+
+template <typename T> 
+void gtPlusGRAPPA<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD GRAPPA reconstruction ------------------" << endl;
+    os << "Implementation of GRAPPA algorithms for ISMRMRD package" << endl;
+    os << "Both 2D and 3D version are implemented" << endl;
+    os << "Algorithms are published at:" << endl;
+    os << "Generalized autocalibrating partially parallel acquisitions (GRAPPA), Magnetic Resonance in Medicine, Volume 47, Issue 6, pages 1202�1210, June 2002" << endl;
+    os << "HTGRAPPA: Real-time B1-weighted image domain TGRAPPA reconstruction, Magnetic Resonance in Medicine, Volume 61, Issue 6, pages 1425�1433, June 2009" << endl;
+    os << "----------------------------------------------------------------------" << endl;
+}
+
+template <typename T> 
+bool gtPlusGRAPPA<T>::
+kerPattern(std::vector<int>& kE1, std::vector<int>& oE1, size_t accelFactor, size_t kNE1, bool fitItself)
+{
+    kE1.resize(kNE1, 0);
+    if ( kNE1%2 == 0 )
+    {
+        long long k;
+        for ( k=-((long long)kNE1/2-1); k<=(long long)kNE1/2; k++ )
+        {
+            kE1[k+kNE1/2-1] = (int)(k*accelFactor);
+        }
+    }
+    else
+    {
+        long long k;
+        for ( k=-(long long)kNE1/2; k<=(long long)kNE1/2; k++ )
+        {
+            kE1[k+kNE1/2] = (int)(k*accelFactor);
+        }
+    }
+
+    if ( fitItself )
+    {
+        oE1.resize(accelFactor);
+        for ( long long a=0; a<(long long)accelFactor; a++ )
+        {
+            oE1[a] = (int)a;
+        }
+    }
+    else
+    {
+        oE1.resize(accelFactor-1);
+        for ( long long a=1; a<(long long)accelFactor; a++ )
+        {
+            oE1[a-1] = (int)a;
+        }
+    }
+
+    return true;
+}
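+
+// Worked example of the pattern above: accelFactor=4, kNE1=4, fitItself=false gives
+// kE1 = {-4, 0, 4, 8} (the acquired source lines relative to the current E1 position)
+// and oE1 = {1, 2, 3} (the skipped lines being fitted); with fitItself=true,
+// oE1 = {0, 1, 2, 3}, i.e. the acquired line itself is refitted as well.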
+
+template <typename T> 
+bool gtPlusGRAPPA<T>::
+calib(const ho3DArray<T>& acsSrc, const ho3DArray<T>& acsDst, double thres, 
+    size_t kRO, const std::vector<int>& kE1, const std::vector<int>& oE1, ho5DArray<T>& ker)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(0)==acsDst.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(1)==acsDst.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(2)>=acsDst.get_size(2));
+
+        size_t RO = acsSrc.get_size(0);
+        size_t E1 = acsSrc.get_size(1);
+        size_t srcCHA = acsSrc.get_size(2);
+        size_t dstCHA = acsDst.get_size(2);
+
+        const T* pSrc = acsSrc.begin();
+        const T* pDst = acsDst.begin();
+
+        long long kROhalf = kRO/2;
+        if ( 2*kROhalf == kRO )
+        {
+            GWARN_STREAM("gtPlusGRAPPA<T>::calib(...) - 2*kROhalf == kRO " << kRO);
+        }
+        kRO = 2*kROhalf + 1;
+
+        size_t kNE1 = kE1.size();
+        size_t oNE1 = oE1.size();
+
+        // allocate kernel
+        GADGET_CHECK_RETURN_FALSE(ker.createArray(kRO, kNE1, srcCHA, dstCHA, oNE1));
+
+        // loop over the calibration region and assemble the equation
+        // Ax = b
+
+        size_t eRO = RO - kROhalf -1;
+        size_t sE1 = std::abs(kE1[0]);
+        size_t eE1 = E1 -1 - kE1[kNE1-1];
+
+        size_t lenRO = eRO-kROhalf+1;
+
+        size_t rowA = (eE1-sE1+1)*lenRO;
+        size_t colA = kRO*kNE1*srcCHA;
+        size_t colB = dstCHA*oNE1;
+
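+        // Each row of A holds one source neighbourhood (kRO x kNE1 points from every source
+        // channel) taken from the calibration region, and the matching row of B holds the target
+        // samples (one per output offset oE1 and destination channel); solving A x = B gives the
+        // GRAPPA weights, e.g. colA = 5*4*8 = 160 unknowns per target for kRO = 5, kNE1 = 4 and
+        // 8 source channels (illustrative numbers only).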
+        hoMatrix<T> A;
+        hoMatrix<T> B;
+        hoMatrix<T> x( colA, colB );
+
+        if ( performTiming_ ) { gt_timer3_.start("grappa 2D calibration - allocate matrix storage ... "); }
+        A.createMatrix( rowA, colA);
+        T* pA = A.begin();
+
+        B.createMatrix( A.rows(), colB);
+        T* pB = B.begin();
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        long long e1;
+        for ( e1=(long long)sE1; e1<=(long long)eE1; e1++ )
+        {
+            for ( long long ro=kROhalf; ro<=(long long)eRO; ro++ )
+            {
+                long long rInd = (e1-sE1)*lenRO+ro-kROhalf;
+
+                size_t src, dst, ke1, oe1;
+                long long kro;
+
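+                // A and B are stored column-major (row index fastest), hence element (rInd, col)
+                // lives at pA[rInd + col*rowA]; likewise acsSrc is addressed through pSrc as
+                // ro + e1*RO + cha*RO*E1 via the precomputed offset.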
+                // fill matrix A
+                size_t col = 0;
+                size_t offset = 0;
+                for ( src=0; src<srcCHA; src++ )
+                {
+                    for ( ke1=0; ke1<kNE1; ke1++ )
+                    {
+                        offset = src*RO*E1 + (e1+kE1[ke1])*RO;
+                        for ( kro=-kROhalf; kro<=kROhalf; kro++ )
+                        {
+                            // A(rInd, col++) = acsSrc(ro+kro, e1+kE1[ke1], src);
+                            pA[rInd + col*rowA] = pSrc[ro+kro+offset];
+                            col++;
+                        }
+                    }
+                }
+
+                // fill matrix B
+                col = 0;
+                for ( oe1=0; oe1<oNE1; oe1++ )
+                {
+                    for ( dst=0; dst<dstCHA; dst++ )
+                    {
+                        B(rInd, col++) = acsDst(ro, e1+oE1[oe1], dst);
+                    }
+                }
+            }
+        }
+
+        //typename realType<T>::Type v;
+
+        //Gadgetron::norm2(A, v);
+        //GDEBUG_STREAM("A = " << v);
+
+        //Gadgetron::norm2(B, v);
+        //GDEBUG_STREAM("B = " << v);
+
+        //if ( performTiming_ ) { gt_timer2_.start("SolveLinearSystem_Tikhonov"); }
+        //#ifdef USE_CUDA
+        //    // go to device
+        //    try
+        //    {
+        //        if ( typeid(typename realType<T>::Type)==typeid(float) && calib_use_gpu_ )
+        //        {
+        //            GDEBUG_STREAM("grappa 2D - calling GPU kernel estimation ... ");
+        //            hoNDArray<float_complext> A_tmp(A.get_dimensions(), reinterpret_cast<float_complext*>(A.begin()));
+        //            hoNDArray<float_complext> B_tmp(B.get_dimensions(), reinterpret_cast<float_complext*>(B.begin()));
+
+        //            int ret(0);
+        //            boost::shared_ptr< hoNDArray<complext<float> > > host_x;
+
+        //            #pragma omp critical(inverse)
+        //            {
+        //                cuNDArray<float_complext> device_A(A_tmp);
+        //                cuNDArray<float_complext> device_B(B_tmp);
+        //                cuNDArray<float_complext> device_x;
+
+        //                ret = Gadgetron::inverse_clib_matrix(&device_A, &device_B, &device_x, thres);
+        //                if ( ret == 0 )
+        //                {
+        //                    host_x = device_x.to_host();
+        //                }
+        //            }
+
+        //            if ( ret != 0 )
+        //            {
+        //                GERROR_STREAM("failed in Gadgetron::inverse_clib_matrix(&device_A, &device_B, &device_x, thres) ... ");
+        //                SolveLinearSystem_Tikhonov(A, B, x, thres);
+        //            }
+        //            else
+        //            {
+        //                memcpy(x.begin(), host_x->begin(), x.get_number_of_bytes());
+        //            }
+        //        }
+        //        else
+        //        {
+        //            if ( calib_use_gpu_ )
+        //            {
+        //                GWARN_STREAM("GPU inverse_clib_matrix for grappa is only available for single-precision, calling the CPU version ... ");
+        //            }
+
+        //            GADGET_CHECK_RETURN_FALSE(SolveLinearSystem_Tikhonov(A, B, x, thres));
+        //        }
+        //    }
+        //    catch(...)
+        //    {
+        //        GERROR_STREAM("failed in GPU inverse_clib_matrix for grappa, calling the CPU version ... ");
+        //        GADGET_CHECK_RETURN_FALSE(SolveLinearSystem_Tikhonov(A, B, x, thres));
+        //    }
+
+        //#else
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(SolveLinearSystem_Tikhonov(A, B, x, thres));
+        //#endif // USE_CUDA
+
+        // GADGET_CHECK_RETURN_FALSE(SolveLinearSystem_Tikhonov(A, B, x, thres));
+        //if ( performTiming_ ) { gt_timer2_.stop(); }
+
+        //Gadgetron::norm2(x, v);
+        //GDEBUG_STREAM("x = " << v);
+
+        // the matrix dimensions match exactly, so x can be copied straight into the kernel array
+        // hoMatrix<T> xt(x.cols(), x.rows(), ker.begin());
+        // GADGET_CHECK_RETURN_FALSE(Gadgetron::trans(x, xt));
+        memcpy(ker.begin(), x.begin(), ker.get_number_of_bytes());
+
+        //Gadgetron::norm2(ker, v);
+        //GDEBUG_STREAM("ker = " << v);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusGRAPPA<T>::calib(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusGRAPPA<T>::
+imageDomainKernel(const ho5DArray<T>& ker, size_t kRO, const std::vector<int>& kE1, const std::vector<int>& oE1, size_t ro, size_t e1, hoNDArray<T>& kIm)
+{
+    try
+    {
+        long long srcCHA = (long long)(ker.get_size(2));
+        long long dstCHA = (long long)(ker.get_size(3));
+        long long kNE1 = (long long)(kE1.size());
+        long long oNE1 = (long long)(oE1.size());
+
+        long long kROhalf = kRO/2;
+        if ( 2*kROhalf == kRO )
+        {
+            GWARN_STREAM("gtPlusGRAPPA<T>::imageDomainKernel(...) - 2*kROhalf == kRO " << kRO);
+        }
+        kRO = 2*kROhalf + 1;
+
+        // allocate image domain kernel
+        kIm.create(ro, e1, srcCHA, dstCHA);
+
+        /// fill the convolution kernels
+        long long convKRO = 2*kRO+3;
+
+        long long maxKE1 = std::abs(kE1[0]);
+        if ( std::abs(kE1[kNE1-1]) > maxKE1 )
+        {
+            maxKE1 = std::abs(kE1[kNE1-1]);
+        }
+        long long convKE1 = 2*maxKE1+1;
+
+        /// allocate the convolution kernel
+        hoNDArray<T> convKer(convKRO, convKE1, srcCHA, dstCHA);
+        Gadgetron::clear(&convKer);
+
+        /// index
+        long long oe1, kro, ke1, src, dst;
+
+        /// fill the convolution kernel and sum up multiple kernels
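+        // Each GRAPPA weight is written at the negated (source - target) offset relative to the
+        // kernel centre (index kRO+1 along RO, maxKE1 along E1), so that convolving the k-space
+        // data with convKer applies the fitted prediction; the kernels for all output offsets
+        // oE1 are combined into this single convolution kernel.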
+        for ( oe1=0; oe1<oNE1; oe1++ )
+        {
+            for ( ke1=0; ke1<kNE1; ke1++ )
+            {
+                for ( kro=-kROhalf; kro<=kROhalf; kro++ )
+                {
+                    for ( dst=0; dst<dstCHA; dst++ )
+                    {
+                        for ( src=0; src<srcCHA; src++ )
+                        {
+                            convKer(-kro+kRO+1, oE1[oe1]-kE1[ke1]+maxKE1, src, dst) = ker(kro+kROhalf, ke1, src, dst, oe1);
+                        }
+                    }
+
+                }
+            }
+        }
+
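+        // When 0 is not among the output offsets (fitItself == false), acquired lines are not
+        // predicted by the fitted weights; placing a unit impulse on the channel diagonal at the
+        // kernel centre makes the convolution pass those acquired samples through unchanged.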
+        if ( (oE1[0]!=0) && (srcCHA==dstCHA) )
+        {
+            for ( dst=0; dst<dstCHA; dst++ )
+            {
+                convKer(kRO+1, maxKE1, dst, dst) = 1.0;
+            }
+        }
+
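+        // The sqrt(ro*e1) factor presumably cancels the 1/sqrt(ro*e1) normalisation of the
+        // centred ifft2c below, so that multiplying images by kIm matches the plain k-space
+        // convolution with convKer; the kernel is then zero-padded to the image size and
+        // transformed to the image domain.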
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::scal( (typename realType<T>::Type)( std::sqrt((double)(ro*e1)) ), convKer ));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::pad(ro, e1, &convKer, &kIm));
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(kIm);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusGRAPPA<T>::imageDomainKernel(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusGRAPPA<T>::
+calib3D(const ho4DArray<T>& acsSrc, const ho4DArray<T>& acsDst, 
+        double thres, double overDetermineRatio, 
+        size_t kRO, const std::vector<int>& kE1, const std::vector<int>& kE2, 
+        const std::vector<int>& oE1, const std::vector<int>& oE2, 
+        ho7DArray<T>& ker)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(0)==acsDst.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(1)==acsDst.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(2)>=acsDst.get_size(2));
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(3)>=acsDst.get_size(3));
+
+        size_t RO = acsSrc.get_size(0);
+        size_t E1 = acsSrc.get_size(1);
+        size_t E2 = acsSrc.get_size(2);
+        size_t srcCHA = acsSrc.get_size(3);
+        size_t dstCHA = acsDst.get_size(3);
+
+        const T* pSrc = acsSrc.begin();
+        const T* pDst = acsDst.begin();
+
+        long long kROhalf = (long long)kRO/2;
+        if ( 2*kROhalf == kRO )
+        {
+            GWARN_STREAM("gtPlusGRAPPA<T>::calib3D(...) - 2*kROhalf == kRO " << kRO);
+        }
+
+        kRO = 2*kROhalf + 1;
+
+        size_t kNE1 = kE1.size();
+        size_t oNE1 = oE1.size();
+
+        size_t kNE2 = kE2.size();
+        size_t oNE2 = oE2.size();
+
+        // allocate kernel
+        GADGET_CHECK_RETURN_FALSE(ker.createArray(kRO, kNE1, kNE2, srcCHA, dstCHA, oNE1, oNE2));
+
+        // loop over the calibration region and assemble the equation
+        // Ax = b
+
+        size_t sRO = kROhalf;
+        size_t eRO = RO - kROhalf -1;
+
+        size_t sE1 = std::abs(kE1[0]);
+        size_t eE1 = E1 -1 - kE1[kNE1-1];
+
+        size_t sE2 = std::abs(kE2[0]);
+        size_t eE2 = E2 -1 - kE2[kNE2-1];
+
+        size_t lenRO = eRO-kROhalf+1;
+        size_t lenE1 = eE1-sE1+1;
+        size_t lenE2 = eE2-sE2+1;
+
+        size_t colA = kRO*kNE1*kNE2*srcCHA;
+        size_t colB = dstCHA*oNE1*oNE2;
+
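+        // overDetermineRatio caps the number of calibration equations at roughly
+        // overDetermineRatio * colA by restricting the RO range to a window centred on the
+        // readout position with the strongest ACS signal (found via the first-channel profile
+        // computed below); if the full RO range already satisfies the cap, nothing is changed.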
+        if ( overDetermineRatio > 1.0 )
+        {
+            size_t maxRowA = (size_t)std::ceil(overDetermineRatio*colA);
+            size_t maxROUsed = maxRowA/(lenE1*lenE2);
+            if ( maxROUsed < lenRO )
+            {
+                // find the peak signal of acsSrc
+                hoNDArray<T> acsSrc1stCha(RO, E1, E2, const_cast<T*>(acsSrc.begin()));
+                hoNDArray<T> acsSrc1stChaSumE2(RO, E1, 1), acsSrc1stChaSumE2E1(RO, 1, 1);
+
+                try
+                {
+                    Gadgetron::sum_over_dimension(acsSrc1stCha, acsSrc1stChaSumE2, 2);
+                    Gadgetron::sum_over_dimension(acsSrc1stChaSumE2, acsSrc1stChaSumE2E1, 1);
+
+                    T maxSignal;
+                    size_t roInd(0);
+                    try
+                    {
+                        Gadgetron::maxAbsolute(acsSrc1stChaSumE2E1, maxSignal, roInd);
+
+                        if ( roInd > maxROUsed/2+kROhalf )
+                        {
+                            sRO = roInd - maxROUsed/2;
+                        }
+                        else
+                        {
+                            sRO = kROhalf;
+                        }
+
+                        if( sRO+maxROUsed-1 <= RO-kROhalf-1 )
+                        {
+                            eRO = sRO + maxROUsed - 1;
+                        }
+                        else
+                        {
+                            eRO = RO - kROhalf -1;
+                        }
+
+                        lenRO = eRO-sRO+1;
+                        GDEBUG_STREAM("gtPlusGRAPPA<T>::calib3D(...) - overDetermineRatio = " << overDetermineRatio << " ; RO data range used : [" << sRO << " " << eRO << "] ...");
+                    }
+                    catch(...)
+                    {
+                        GWARN_STREAM("gtPlusGRAPPA<T>::calib3D(...) - overDetermineRatio is ignored ... ");
+                        throw;
+                    }
+                }
+                catch (...)
+                {
+                    GWARN_STREAM("gtPlusGRAPPA<T>::calib3D(...) - overDetermineRatio is ignored ... ");
+                    throw;
+                }
+            }
+        }
+
+        size_t rowA = lenRO*lenE1*lenE2;
+
+        hoMatrix<T> A, B, x( colA, colB );
+
+        if ( performTiming_ ) { gt_timer3_.start("grappa 3D calibration - allocate matrix storage ... "); }
+        hoNDArray<T> A_mem(rowA, colA);
+        A.createMatrix( rowA, colA, A_mem.begin() );
+        T* pA = A.begin();
+
+        hoNDArray<T> B_mem(rowA, colB);
+        B.createMatrix( rowA, colB, B_mem.begin() );
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+        T* pB = B.begin();
+
+        long long e2;
+
+        if ( performTiming_ ) { gt_timer3_.start("grappa 3D calibration - fill calib matrices ... "); }
+        #pragma omp parallel for default(none) private(e2) shared(sE2, eE2, sE1, eE1, kROhalf, sRO, eRO, lenRO, lenE1, srcCHA, kNE2, kNE1, A, rowA, pA, acsSrc, kE1, kE2, oNE2, oNE1, dstCHA, B, pB, acsDst, oE1, oE2)
+        for ( e2=(long long)sE2; e2<=(long long)eE2; e2++ )
+        {
+            long long e1;
+            for ( e1=(long long)sE1; e1<=(long long)eE1; e1++ )
+            {
+                for ( long long ro=(long long)sRO; ro<=(long long)eRO; ro++ )
+                {
+                    size_t rInd = (e2-sE2)*lenRO*lenE1 + (e1-sE1)*lenRO + ro-sRO;
+
+                    size_t src, dst, ke1, ke2, oe1, oe2;
+                    long long kro;
+
+                    // fill matrix A
+                    size_t col = 0;
+                    for ( src=0; src<srcCHA; src++ )
+                    {
+                        for ( ke2=0; ke2<kNE2; ke2++ )
+                        {
+                            for ( ke1=0; ke1<kNE1; ke1++ )
+                            {
+                                for ( kro=-kROhalf; kro<=kROhalf; kro++ )
+                                {
+                                    // A(rInd, col++) = acsSrc(ro+kro, e1+kE1[ke1], e2+kE2[ke2], src);
+                                    pA[rInd + col*rowA] = acsSrc(ro+kro, e1+kE1[ke1], e2+kE2[ke2], src);
+                                    col++;
+                                }
+                            }
+                        }
+                    }
+
+                    // fill matrix B
+                    col = 0;
+                    for ( oe2=0; oe2<oNE2; oe2++ )
+                    {
+                        for ( oe1=0; oe1<oNE1; oe1++ )
+                        {
+                            for ( dst=0; dst<dstCHA; dst++ )
+                            {
+                                // B(rInd, col++) = acsDst(ro, e1+oE1[oe1], e2+oE2[oe2], dst);
+                                pB[rInd + col*rowA] = acsDst(ro, e1+oE1[oe1], e2+oE2[oe2], dst);
+                                col++;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        //typename realType<T>::Type v;
+
+        //Gadgetron::norm2(A, v);
+        //GDEBUG_STREAM("A = " << v);
+
+        //Gadgetron::norm2(B, v);
+        //GDEBUG_STREAM("B = " << v);
+
+        if ( performTiming_ ) { gt_timer3_.start("grappa 3D calibration - solve linear system ... "); }
+        //#ifdef USE_CUDA
+        //    // go to device
+        //    try
+        //    {
+        //        if ( typeid(typename realType<T>::Type)==typeid(float) && calib_use_gpu_ )
+        //        {
+        //            GDEBUG_STREAM("grappa 3D - calling GPU kernel estimation ... ");
+        //            //hoNDArray<float_complext> A_tmp(A.get_dimensions(), reinterpret_cast<float_complext*>(A.begin()));
+        //            //hoNDArray<float_complext> B_tmp(B.get_dimensions(), reinterpret_cast<float_complext*>(B.begin()));
+
+        //            //cuNDArray<float_complext> device_A(A_tmp);
+        //            //cuNDArray<float_complext> device_B(B_tmp);
+        //            //cuNDArray<float_complext> device_x;
+        //            //if ( Gadgetron::inverse_clib_matrix(&device_A, &device_B, &device_x, thres) != 0 )
+        //            //{
+        //            //    GERROR_STREAM("failed in Gadgetron::inverse_clib_matrix(&device_A, &device_B, &device_x, thres) ... ");
+        //            //    SolveLinearSystem_Tikhonov(A, B, x, thres);
+        //            //}
+        //            //else
+        //            //{
+        //            //    // go back to host
+        //            //    boost::shared_ptr< hoNDArray<complext<float> > > host_x = device_x.to_host();
+        //            //    memcpy(x.begin(), host_x->begin(), x.get_number_of_bytes());
+        //            //}
+
+        //            hoNDArray<float_complext> A_tmp(A.get_dimensions(), reinterpret_cast<float_complext*>(A.begin()));
+        //            hoNDArray<float_complext> B_tmp(B.get_dimensions(), reinterpret_cast<float_complext*>(B.begin()));
+
+        //            int ret(0);
+        //            boost::shared_ptr< hoNDArray<complext<float> > > host_x;
+
+        //            #pragma omp critical(inverse3D)
+        //            {
+        //                cuNDArray<float_complext> device_A(A_tmp);
+        //                cuNDArray<float_complext> device_B(B_tmp);
+        //                cuNDArray<float_complext> device_x;
+
+        //                ret = Gadgetron::inverse_clib_matrix(&device_A, &device_B, &device_x, thres);
+        //                if ( ret == 0 )
+        //                {
+        //                    host_x = device_x.to_host();
+        //                }
+        //            }
+
+        //            if ( ret != 0 )
+        //            {
+        //                GERROR_STREAM("failed in Gadgetron::inverse_clib_matrix(&device_A, &device_B, &device_x, thres) ... ");
+        //                SolveLinearSystem_Tikhonov(A, B, x, thres);
+        //            }
+        //            else
+        //            {
+        //                memcpy(x.begin(), host_x->begin(), x.get_number_of_bytes());
+        //            }
+        //        }
+        //        else
+        //        {
+        //            GWARN_STREAM("GPU inverse_clib_matrix for grappa is only available for single-precision, calling the CPU version ... ");
+        //            GADGET_CHECK_RETURN_FALSE(SolveLinearSystem_Tikhonov(A, B, x, thres));
+        //        }
+        //    }
+        //    catch(...)
+        //    {
+        //        GERROR_STREAM("failed in GPU inverse_clib_matrix for grappa, calling the CPU version ... ");
+        //        GADGET_CHECK_RETURN_FALSE(SolveLinearSystem_Tikhonov(A, B, x, thres));
+        //    }
+        //#else
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(SolveLinearSystem_Tikhonov(A, B, x, thres));
+        //#endif // USE_CUDA
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        //Gadgetron::norm2(x, v);
+        //GDEBUG_STREAM("x = " << v);
+
+        // the matrix dimensions match exactly, so x can be copied straight into the kernel array
+        //hoMatrix<T> xt(x.cols(), x.rows(), ker.begin());
+        //GADGET_CHECK_RETURN_FALSE(Gadgetron::trans(x, xt));
+        memcpy(ker.begin(), x.begin(), ker.get_number_of_bytes());
+
+        //Gadgetron::norm2(ker, v);
+        //GDEBUG_STREAM("ker = " << v);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusGRAPPA<T>::calib3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusGRAPPA<T>::
+kspaceDomainConvKernel3D(const ho7DArray<T>& ker, size_t kRO, const std::vector<int>& kE1, const std::vector<int>& kE2, const std::vector<int>& oE1, const std::vector<int>& oE2, ho5DArray<T>& convKer, bool ROis3rdDim)
+{
+    try
+    {
+        long long srcCHA = (long long)(ker.get_size(3));
+        long long dstCHA = (long long)(ker.get_size(4));
+
+        long long kNE1 = (long long)(kE1.size());
+        long long oNE1 = (long long)(oE1.size());
+
+        long long kNE2 = (long long)(kE2.size());
+        long long oNE2 = (long long)(oE2.size());
+
+        long long kROhalf = kRO/2;
+        if ( 2*kROhalf == kRO )
+        {
+            GWARN_STREAM("gtPlusGRAPPA<T>::kspaceDomainConvKernel3D(...) - 2*kROhalf == kRO " << kRO);
+        }
+        kRO = 2*kROhalf + 1;
+
+        if ( performTiming_ ) { gt_timer3_.start("grappa 3D calibration - convert to conv kernel ... "); }
+        /// fill the convolution kernels
+        long long convKRO = 2*kRO+3;
+
+        long long maxKE1 = std::abs(kE1[0]);
+        if ( std::abs(kE1[kNE1-1]) > maxKE1 )
+        {
+            maxKE1 = std::abs(kE1[kNE1-1]);
+        }
+        long long convKE1 = 2*maxKE1+1;
+
+        long long maxKE2 = std::abs(kE2[0]);
+        if ( std::abs(kE2[kNE2-1]) > maxKE2 )
+        {
+            maxKE2 = std::abs(kE2[kNE2-1]);
+        }
+        long long convKE2 = 2*maxKE2+1;
+
+        /// allocate the convolution kernel
+        if ( ROis3rdDim )
+        {
+            convKer.createArray(convKE1, convKE2, convKRO, srcCHA, dstCHA);
+        }
+        else
+        {
+            convKer.createArray(convKRO, convKE1, convKE2, srcCHA, dstCHA);
+        }
+        Gadgetron::clear(&convKer);
+
+        /// index
+        long long oe1, oe2, kro, ke1, ke2, src, dst;
+
+        /// fill the convolution kernel and sum up multiple kernels
+        for ( oe2=0; oe2<oNE2; oe2++ )
+        {
+            for ( oe1=0; oe1<oNE1; oe1++ )
+            {
+                for ( ke2=0; ke2<kNE2; ke2++ )
+                {
+                    for ( ke1=0; ke1<kNE1; ke1++ )
+                    {
+                        for ( kro=-kROhalf; kro<=kROhalf; kro++ )
+                        {
+                            for ( dst=0; dst<dstCHA; dst++ )
+                            {
+                                if ( ROis3rdDim )
+                                {
+                                    for ( src=0; src<srcCHA; src++ )
+                                    {
+                                        convKer(oE1[oe1]-kE1[ke1]+maxKE1, oE2[oe2]-kE2[ke2]+maxKE2, -kro+kRO+1, src, dst) = ker(kro+kROhalf, ke1, ke2, src, dst, oe1, oe2);
+                                    }
+                                }
+                                else
+                                {
+                                    for ( src=0; src<srcCHA; src++ )
+                                    {
+                                        convKer(-kro+kRO+1, oE1[oe1]-kE1[ke1]+maxKE1, oE2[oe2]-kE2[ke2]+maxKE2, src, dst) = ker(kro+kROhalf, ke1, ke2, src, dst, oe1, oe2);
+                                    }
+                                }
+                            }
+
+                        }
+                    }
+                }
+            }
+        }
+
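+        // As in the 2D case: when 0 is not among the output offsets, a unit impulse on the
+        // channel diagonal at the kernel centre lets acquired samples pass through the
+        // convolution unchanged.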
+        if ( (oE1[0]!=0) && (oE2[0]!=0) && (srcCHA==dstCHA) )
+        {
+            for ( dst=0; dst<dstCHA; dst++ )
+            {
+                if ( ROis3rdDim )
+                {
+                    for ( src=0; src<srcCHA; src++ )
+                    {
+                        if ( src == dst )
+                        {
+                            convKer(maxKE1, maxKE2, kRO+1, src, dst) = 1.0;
+                        }
+                        else
+                        {
+                            convKer(maxKE1, maxKE2, kRO+1, src, dst) = 0.0;
+                        }
+                    }
+                }
+                else
+                {
+                    for ( src=0; src<srcCHA; src++ )
+                    {
+                        if ( src == dst )
+                        {
+                            convKer(kRO+1, maxKE1, maxKE2, src, dst) = 1.0;
+                        }
+                        else
+                        {
+                            convKer(kRO+1, maxKE1, maxKE2, src, dst) = 0.0;
+                        }
+                    }
+                }
+            }
+        }
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusGRAPPA<T>::kspaceDomainConvKernel3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusGRAPPA<T>::
+imageDomainKernel3D(const ho7DArray<T>& ker, size_t kRO, const std::vector<int>& kE1, const std::vector<int>& kE2, const std::vector<int>& oE1, const std::vector<int>& oE2, size_t ro, size_t e1, size_t e2, hoNDArray<T>& kIm)
+{
+    try
+    {
+        long long srcCHA = (long long)(ker.get_size(3));
+        long long dstCHA = (long long)(ker.get_size(4));
+
+        long long kNE1 = (long long)(kE1.size());
+        long long oNE1 = (long long)(oE1.size());
+
+        long long kNE2 = (long long)(kE2.size());
+        long long oNE2 = (long long)(oE2.size());
+
+        long long kROhalf = kRO/2;
+        if ( 2*kROhalf == kRO )
+        {
+            GWARN_STREAM("gtPlusGRAPPA<T>::imageDomainKernel3D(...) - 2*kROhalf == kRO " << kRO);
+        }
+        kRO = 2*kROhalf + 1;
+
+        // allocate image domain kernel
+        if ( performTiming_ ) { gt_timer3_.start("grappa 3D calibration - create kIm array ... "); }
+        if ( kIm.get_number_of_elements() < (size_t)ro*e1*e2*srcCHA*dstCHA )
+        {
+            kIm.create(ro, e1, e2, srcCHA, dstCHA);
+        }
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        ho5DArray<T> convKer;
+        bool ROis3rdDim = false;
+        GADGET_CHECK_RETURN_FALSE(this->kspaceDomainConvKernel3D(ker, kRO, kE1, kE2, oE1, oE2, convKer, ROis3rdDim));
+
+        if ( performTiming_ ) { gt_timer3_.start("grappa 3D calibration - SNR unit scaling ... "); }
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::scal( (typename realType<T>::Type)( std::sqrt((double)(ro*e1*e2)) ), convKer ));
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        if ( performTiming_ ) { gt_timer3_.start("grappa 3D calibration - zero padding ... "); }
+        // GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().zeropad3DNoPresetZeros(convKer, ro, e1, e2, kIm));
+        // GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::zeropad3D(convKer, ro, e1, e2, kIm, false));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::pad(ro, e1, e2, &convKer, &kIm, false));
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        if ( performTiming_ ) { gt_timer3_.start("grappa 3D calibration - convert to image domain ... "); }
+
+        long long n;
+
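+        // Each (src, dst) channel pair owns an independent ro x e1 x e2 block inside kIm, so the
+        // centred 3D inverse FFTs can run in parallel; kImTmp and kImRes are per-thread buffers.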
+        #pragma omp parallel default(none) private(n) shared(ro, e1, e2, srcCHA, dstCHA, kIm)
+        {
+            hoNDArray<T> kImTmp(ro, e1, e2);
+            hoNDArray<T> kImRes(ro, e1, e2);
+
+            #pragma omp for 
+            for (n = 0; n < srcCHA*dstCHA; n++)
+            {
+                long long d = n / srcCHA;
+                long long s = n - d*srcCHA;
+
+                T* pkImCha = kIm.begin() + d*ro*e1*e2*srcCHA + s*ro*e1*e2;
+
+                hoNDArray<T> kImCha(ro, e1, e2, pkImCha);
+                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(kImCha, kImRes, kImTmp);
+                memcpy(pkImCha, kImRes.begin(), kImRes.get_number_of_bytes());
+            }
+        }
+
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusGRAPPA<T>::imageDomainKernel3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusGRAPPA<T>::
+imageDomainKernelRO3D(const ho7DArray<T>& ker, size_t kRO, const std::vector<int>& kE1, const std::vector<int>& kE2, const std::vector<int>& oE1, const std::vector<int>& oE2, size_t ro, hoNDArray<T>& kImRO)
+{
+    try
+    {
+        long long srcCHA = (long long)(ker.get_size(3));
+        long long dstCHA = (long long)(ker.get_size(4));
+
+        GADGET_CHECK_RETURN_FALSE(kRO==ker.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(kE1.size()==ker.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(kE2.size()==ker.get_size(2));
+        GADGET_CHECK_RETURN_FALSE(oE1.size()==ker.get_size(5));
+        GADGET_CHECK_RETURN_FALSE(oE2.size()==ker.get_size(6));
+
+        bool ROat3rdDim = false;
+        ho5DArray<T> convKer;
+        GADGET_CHECK_RETURN_FALSE(this->kspaceDomainConvKernel3D(ker, kRO, kE1,  kE2, oE1, oE2, convKer, ROat3rdDim));
+
+        // allocate image domain kernel
+        size_t kConvE1 = convKer.get_size(1);
+        size_t kConvE2 = convKer.get_size(2);
+
+        kImRO.create(kConvE1, kConvE2, ro, srcCHA, dstCHA);
+
+        hoNDArray<T> kImROTemp(ro, kConvE1, kConvE2, srcCHA, dstCHA);
+        Gadgetron::clear(kImROTemp);
+
+        if ( performTiming_ ) { gt_timer3_.start("grappa 3D calibration - SNR unit scaling ... "); }
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::scal( (typename realType<T>::Type)( std::sqrt((double)(ro)) ), convKer ));
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(convKer, debugFolder_+"convKer_scal_RO"); }
+
+        if ( performTiming_ ) { gt_timer3_.start("grappa 3D calibration - zero padding only for RO ... "); }
+        // GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().zeropad3DNoPresetZeros(convKer, ro, kConvE1, kConvE2, kImROTemp));
+        // GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::zeropad3D(convKer, ro, kConvE1, kConvE2, kImROTemp, false));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::pad(ro, kConvE1, kConvE2, &convKer, &kImROTemp, false));
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kImROTemp, debugFolder_+"convKer_scal_RO_zeropadded"); }
+
+        if ( performTiming_ ) { gt_timer3_.start("grappa 3D calibration - convert to image domain only for RO ... "); }
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft1c(kImROTemp);
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        if ( performTiming_ ) { gt_timer3_.start("grappa 3D calibration - permute kernel dimensions to be [kE1 kE2 RO ...]  ... "); }
+
+        std::vector<size_t> dim_order(3);
+        dim_order[0] = 1;
+        dim_order[1] = 2;
+        dim_order[2] = 0;
+
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(&kImROTemp, &kImRO, &dim_order));
+
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusGRAPPA<T>::imageDomainKernelRO3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusGRAPPA<T>::
+imageDomainKernelE1E2RO(const hoNDArray<T>& kImRO, size_t e1, size_t e2, hoNDArray<T>& kImE1E2RO)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dim = kImRO.get_dimensions();
+
+        std::vector<size_t> dimR(*dim);
+        dimR[0] = e1;
+        dimR[1] = e2;
+
+        kImE1E2RO.create(&dimR);
+        Gadgetron::clear(kImE1E2RO);
+
+        hoNDArray<T> kImROScaled(kImRO);
+
+        if ( performTiming_ ) { gt_timer3_.start("grappa 3D calibration - SNR unit scaling for E1 and E2 ... "); }
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::scal( (typename realType<T>::Type)( std::sqrt((double)(e1*e2)) ), kImROScaled ));
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kImROScaled, debugFolder_+"kImROScaledE1E2"); }
+
+        if ( performTiming_ ) { gt_timer3_.start("grappa 3D calibration - zero padding for E1 and E2 ... "); }
+        // GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().zeropad3DNoPresetZeros(kImROScaled, e1, e2, dimR[2], kImE1E2RO));
+        // GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::zeropad3D(kImROScaled, e1, e2, dimR[2], kImE1E2RO, false));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::pad(e1, e2, dimR[2], &kImROScaled, &kImE1E2RO, false));
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kImE1E2RO, debugFolder_+"kImE1E2RO_zeropadded_E1E2"); }
+
+        if ( performTiming_ ) { gt_timer3_.start("grappa 3D calibration - convert to image domain for E1 and E2 ... "); }
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(kImE1E2RO);
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusGRAPPA<T>::imageDomainKernelE1E2RO(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusOperator.h b/toolboxes/gtplus/algorithm/gtPlusOperator.h
new file mode 100644
index 0000000..bc705d9
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusOperator.h
@@ -0,0 +1,284 @@
+/** \file       gtPlusOperator.h
+    \brief      Base class for gtPlus operators
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "ismrmrd/ismrmrd.h"
+#include "GadgetronTimer.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusIOAnalyze.h"
+
+#ifdef USE_OMP
+    #include "omp.h"
+#endif // USE_OMP
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusOperator
+{
+public:
+
+    typedef typename realType<T>::Type value_type;
+
+    gtPlusOperator();
+    virtual ~gtPlusOperator();
+
+    virtual void printInfo(std::ostream& os);
+
+    // forward operator
+    virtual bool forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y) = 0;
+
+    // adjoint operator
+    virtual bool adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y) = 0;
+
+    // adjoint - forward operator
+    virtual bool adjointforwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y);
+
+    // compute gradient
+    virtual bool grad(const hoNDArray<T>& x, hoNDArray<T>& g) = 0;
+
+    // compute cost value
+    virtual bool obj(const hoNDArray<T>& x, T& obj) = 0;
+
+    // perform the proximity operation
+    virtual bool proximity(hoNDArray<T>& x, value_type thres);
+
+    // indicate the operator is unitary or not
+    // unitary operator, AA' = I
+    virtual bool unitary() const = 0;
+
+    // restore acquired kspace points to x
+    virtual bool restoreAcquiredKSpace(const hoNDArray<T>& acquired, hoNDArray<T>& y);
+    virtual bool restoreAcquiredKSpace(hoNDArray<T>& y);
+
+    // set the acquired kspace, unacquired points are set to be zero
+    virtual bool setAcquiredPoints(boost::shared_ptr< hoNDArray<T> >& kspace);
+
+    // set the coil sensivity map
+    virtual bool setCoilSenMap(boost::shared_ptr< hoNDArray<T> >& senMap);
+
+    // clock for timing
+    Gadgetron::GadgetronTimer gt_timer1_;
+    Gadgetron::GadgetronTimer gt_timer2_;
+    Gadgetron::GadgetronTimer gt_timer3_;
+
+    bool performTiming_;
+
+    // exporter
+    Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+    // debug folder
+    std::string debugFolder_;
+
+    // util
+    gtPlusISMRMRDReconUtil<T> gtPlus_util_;
+    gtPlusISMRMRDReconUtilComplex<T> gtPlus_util_complex_;
+
+public:
+
+    // acquired kspace (unacquired points are zeros)
+    boost::shared_ptr< hoNDArray<T> > acquired_points_;
+    // acquired point indicator array: acquired points are 1, otherwise 0
+    hoNDArray<T> acquired_points_indicator_;
+    // unacquired point indicator array
+    hoNDArray<T> unacquired_points_indicator_;
+
+    // coil map
+    boost::shared_ptr< hoNDArray<T> > coil_senMap_;
+
+    // helper memory
+    hoNDArray<T> kspace_;
+    hoNDArray<T> complexIm_;
+    hoNDArray<T> res_after_apply_kernel_;
+    hoNDArray<T> res_after_apply_kernel_sum_over_;
+
+    hoNDArray<T> kspace_Managed_;
+    hoNDArray<T> complexIm_Managed_;
+    hoNDArray<T> res_after_apply_kernel_Managed_;
+    hoNDArray<T> res_after_apply_kernel_sum_over_Managed_;
+
+    bool performSumOverSrcChannel(const hoNDArray<T>& x, hoNDArray<T>& r);
+};
+
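+// Usage sketch (hypothetical, for illustration only): a solver working with a concrete operator
+// derived from this class would typically call setAcquiredPoints(kspace) once to cache the
+// measured samples and build the acquired/unacquired indicator arrays, and then call
+// restoreAcquiredKSpace(y) after each update so that measured k-space samples overwrite the
+// corresponding estimates while unacquired samples keep their estimated values.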
+template <typename T> 
+gtPlusOperator<T>::gtPlusOperator() : performTiming_(false)
+{
+    gt_timer1_.set_timing_in_destruction(false);
+    gt_timer2_.set_timing_in_destruction(false);
+    gt_timer3_.set_timing_in_destruction(false);
+}
+
+template <typename T> 
+gtPlusOperator<T>::~gtPlusOperator()
+{
+}
+
+template <typename T> 
+void gtPlusOperator<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD operator ------------------" << endl;
+    os << "Operator for gtPlus ISMRMRD package" << endl;
+    os << "----------------------------------------------------------------------" << endl;
+}
+
+template <typename T> 
+bool gtPlusOperator<T>::adjointforwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y)
+{
+    hoNDArray<T> a(x);
+    GADGET_CHECK_RETURN_FALSE(this->forwardOperator(x, a));
+    GADGET_CHECK_RETURN_FALSE(this->adjointOperator(a, y));
+    return true;
+}
+
+template <typename T> 
+bool gtPlusOperator<T>::restoreAcquiredKSpace(hoNDArray<T>& y)
+{
+    return this->restoreAcquiredKSpace(*acquired_points_, y);
+}
+
+template <typename T> 
+bool gtPlusOperator<T>::restoreAcquiredKSpace(const hoNDArray<T>& acquired, hoNDArray<T>& y)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(acquired.get_number_of_elements()==y.get_number_of_elements());
+
+        size_t N = acquired.get_number_of_elements();
+
+        const T* pA = acquired.get_data_ptr();
+        T* pY = y.get_data_ptr();
+
+        int n;
+        #pragma omp parallel for default(none) private(n) shared(N, pA, pY)
+        for ( n=0; n<(int)N; n++ )
+        {
+            if ( std::abs(pA[n]) > 0 )
+            {
+                pY[n] = pA[n];
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in gtPlusOperator<T>::restoreAcquiredKSpace(const hoNDArray<T>& acquired, hoNDArray<T>& y) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusOperator<T>::
+setAcquiredPoints(boost::shared_ptr< hoNDArray<T> >& kspace)
+{
+    try
+    {
+        acquired_points_ = kspace;
+
+        acquired_points_indicator_.create(kspace->get_dimensions());
+        Gadgetron::clear(acquired_points_indicator_);
+
+        unacquired_points_indicator_.create(kspace->get_dimensions());
+        Gadgetron::clear(unacquired_points_indicator_);
+
+        size_t N = kspace->get_number_of_elements();
+
+        long long ii;
+
+        #pragma omp parallel for default(shared) private(ii) shared(N, kspace)
+        for ( ii=0; ii<(long long)N; ii++ )
+        {
+            if ( std::abs( (*kspace)(ii) ) < DBL_EPSILON )
+            {
+                unacquired_points_indicator_(ii) = T(1.0);
+            }
+            else
+            {
+                acquired_points_indicator_(ii) = T(1.0);
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusOperator<T>::setAcquiredPoints(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusOperator<T>::
+setCoilSenMap(boost::shared_ptr< hoNDArray<T> >& senMap)
+{
+    try
+    {
+        coil_senMap_ = senMap;
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusOperator<T>::setCoilSenMap(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusOperator<T>::
+proximity(hoNDArray<T>& /*x*/, value_type /*thres*/)
+{
+    return true;
+}
+
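+// Sums x over its second-to-last dimension (presumably the source-channel dimension, given the
+// name), producing r with that dimension removed; e.g. an input of size [RO E1 srcCHA dstCHA]
+// yields an output of size [RO E1 dstCHA].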
+template<typename T>
+bool gtPlusOperator<T>::
+performSumOverSrcChannel(const hoNDArray<T>& x, hoNDArray<T>& r)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dim = x.get_dimensions();
+        size_t NDim = dim->size();
+
+        if (NDim < 2) return true;
+
+        std::vector<size_t> dimR(NDim - 1);
+        std::vector<size_t> dimRInternal = *dim;
+        dimRInternal[NDim - 2] = 1;
+
+        size_t d;
+        for (d = 0; d<NDim - 2; d++)
+        {
+            dimR[d] = (*dim)[d];
+        }
+        dimR[NDim - 2] = (*dim)[NDim - 1];
+
+        if (!r.dimensions_equal(&dimR))
+        {
+            r.create(&dimR);
+        }
+
+        if (x.get_size(NDim - 2) <= 1)
+        {
+            memcpy(r.begin(), x.begin(), x.get_number_of_bytes());
+            return true;
+        }
+
+        hoNDArray<T> rSum(dimRInternal, r.begin());
+
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::sum_over_dimension(x, rSum, NDim - 2));
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in performSumOverSrcChannel(const hoNDArray<T>& x, hoNDArray<T>& r) ... ");
+        return false;
+    }
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusSPIRIT.h b/toolboxes/gtplus/algorithm/gtPlusSPIRIT.h
new file mode 100644
index 0000000..467e14b
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusSPIRIT.h
@@ -0,0 +1,1258 @@
+
+/** \file   gtPlusSPIRIT.h
+    \brief  SPIRIT kernel estimation for 2D and 3D MRI parallel imaging
+    \author Hui Xue
+
+    References to the implementation can be found in:
+
+    Lustig M, Pauly JM. 
+    SPIRiT: Iterative self-consistent parallel imaging reconstruction from arbitrary k-space. 
+    Magnetic Resonance in Medicine 2010;64(2):457-471.
+
+    ISMRM 2013 sunrise course on Parallel Imaging
+    Michael S. Hansen, Philip Beatty
+    http://gadgetron.sourceforge.net/sunrise/
+    http://cds.ismrm.org/protected/13MPresentations/abstracts/7059.pdf
+*/
+
+#pragma once
+
+#include "gtPlusAlgorithmBase.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusSPIRIT : public gtPlusAlgorithmBase<T>
+{
+public:
+
+    typedef gtPlusAlgorithmBase<T> BaseClass;
+
+    typedef typename realType<T>::Type ValueType;
+
+    gtPlusSPIRIT() : calib_use_gpu_(true), BaseClass() {}
+    virtual ~gtPlusSPIRIT() {}
+
+    virtual void printInfo(std::ostream& os);
+
+    // SPIRIT calibration for 2D case
+    // acsSrc : [RO E1 srcCHA]
+    // acsDst : [RO E1 dstCHA]
+    // ker : [kRO kE1 srcCHA dstCHA oRO oE1]
+    bool calib(const ho3DArray<T>& acsSrc, const ho3DArray<T>& acsDst, double thres, 
+            size_t kRO, size_t kE1, size_t oRO, size_t oE1, ho6DArray<T>& ker);
+
+    // image domain kernel for 2D kernel
+    // kIm: image domain kernel [RO E1 srcCHA dstCHA]
+    // if minusI==true, compute image domain G-I kernel
+    bool imageDomainKernel(const ho6DArray<T>& ker, size_t kRO, size_t kE1, 
+        size_t oRO, size_t oE1, size_t ro, size_t e1, hoNDArray<T>& kIm, bool minusI=false);
+
+    // SPIRIT calibration for 3D case
+    // acsSrc : [RO E1 E2 srcCHA]
+    // acsDst : [RO E1 E2 dstCHA]
+    // ker : [kRO kE1 kE2 srcCHA dstCHA oRO oE1 oE2]
+    // overDetermineRatio : over-determination ratio of the calibration matrix; if < 1, all data are used
+    bool calib3D(const ho4DArray<T>& acsSrc, const ho4DArray<T>& acsDst, double thres, double overDetermineRatio, 
+            size_t kRO, size_t kE1, size_t kE2, size_t oRO, size_t oE1, size_t oE2, hoNDArray<T>& ker);
+
+    // convert the calibrated kernel to the convolution kernel in kspace
+    // if ROis3rdDim == true, the kernel dimension is [E1 E2 RO], otherwise [RO E1 E2]
+    bool kspaceDomainConvKernel3D(const hoNDArray<T>& ker, size_t kRO, size_t kE1, size_t kE2, size_t oRO, size_t oE1, size_t oE2, ho5DArray<T>& convKerFlip, bool minusI=true, bool ROis3rdDim=true);
+
+    // image domain kernel for 3D kernel
+    // kIm: image domain kernel [E1 E2 RO srcCHA dstCHA]
+    // if minusI==true, compute image domain G-I kernel
+    bool imageDomainKernel3D(const hoNDArray<T>& ker, size_t kRO, size_t kE1, size_t kE2, 
+        size_t oRO, size_t oE1, size_t oE2, size_t ro, size_t e1, size_t e2, hoNDArray<T>& kIm, bool minusI=false);
+
+    // image domain kernel for 3D kernel, only RO direction is converted to image domain
+    // E1 and E2 stay in the kspace domain
+    // kImRO: kspace-image hybrid kernel [convE1 convE2 RO srcCHA dstCHA]
+    bool imageDomainKernelRO3D(const hoNDArray<T>& ker, size_t kRO, size_t kE1, size_t kE2, 
+        size_t oRO, size_t oE1, size_t oE2, size_t ro, hoNDArray<T>& kImRO, bool minusI=false);
+
+    // image domain kernel for 3D kernel, E1 and E2 directions are converted to image domain
+    // kImRO : kspace-image hybrid kernel whose first two dimensions are E1 and E2 and remain in kspace
+    bool imageDomainKernelE1E2RO(const hoNDArray<T>& kImRO, size_t e1, size_t e2, hoNDArray<T>& kImE1E2RO);
+
+    // compute the image domain adjoint kernel
+    bool imageDomainAdjointKernel(const hoNDArray<T>& kIm, hoNDArray<T>& adjkIm);
+
+    // compute the (G-I)'*(G-I)
+    bool AdjointForwardKernel(const hoNDArray<T>& kImS2D, const hoNDArray<T>& kImD2S, hoNDArray<T>& kIm);
+
+    // use gpu in the kernel calibration
+    bool calib_use_gpu_;
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+};
+
+template <typename T> 
+void gtPlusSPIRIT<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD SPIRIT reconstruction ------------------" << endl;
+    os << "Implementation of SPIRIT algorithms for ISMRMRD package" << endl;
+    os << "Both 2D and 3D versions are implemented" << endl;
+    os << "Algorithms are published at:" << endl;
+    os << "Lustig, M. and Pauly, J. M. (2010), SPIRiT: Iterative self-consistent parallel imaging reconstruction from arbitrary k-space. Magn Reson Med, 64: 457-471. doi: 10.1002/mrm.22428" << endl;
+    os << "----------------------------------------------------------------------" << endl;
+}
+
+template <typename T> 
+bool gtPlusSPIRIT<T>::
+calib(const ho3DArray<T>& acsSrc, const ho3DArray<T>& acsDst, double thres, 
+            size_t kRO, size_t kE1, size_t oRO, size_t oE1, ho6DArray<T>& ker)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(0)==acsDst.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(1)==acsDst.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(2)>=acsDst.get_size(2));
+
+        size_t RO = acsSrc.get_size(0);
+        size_t E1 = acsSrc.get_size(1);
+        size_t srcCHA = acsSrc.get_size(2);
+        size_t dstCHA = acsDst.get_size(2);
+
+        long long kROhalf = kRO/2;
+        if ( 2*kROhalf == kRO )
+        {
+            GWARN_STREAM("gtPlusSPIRIT<T>::calib(...) - 2*kROhalf == kRO " << kRO);
+        }
+        kRO = 2*kROhalf + 1;
+
+        long long kE1half = kE1/2;
+        if ( 2*kE1half == kE1 )
+        {
+            GWARN_STREAM("gtPlusSPIRIT<T>::calib(...) - 2*kE1half == kE1 " << kE1);
+        }
+        kE1 = 2*kE1half + 1;
+
+        if ( oRO > kRO ) oRO = kRO;
+        if ( oE1 > kE1 ) oE1 = kE1;
+
+        long long oROhalf = oRO/2;
+        if ( 2*oROhalf == oRO )
+        {
+            GWARN_STREAM("gtPlusSPIRIT<T>::calib(...) - 2*oROhalf == oRO " << oRO);
+        }
+        oRO = 2*oROhalf + 1;
+
+        long long oE1half = oE1/2;
+        if ( 2*oE1half == oE1 )
+        {
+            GWARN_STREAM("gtPlusSPIRIT<T>::calib(...) - 2*oE1half == oE1 " << oE1);
+        }
+        oE1 = 2*oE1half + 1;
+
+        // allocate kernel
+        GADGET_CHECK_RETURN_FALSE(ker.createArray(kRO, kE1, srcCHA, dstCHA, oRO, oE1));
+
+        // loop over the calibration region and assemble the equation
+        // Ax = b
+
+        size_t sRO = kROhalf;
+        size_t eRO = RO - kROhalf -1;
+
+        size_t sE1 = kE1half;
+        size_t eE1 = E1 - kE1half -1;
+
+        size_t lenRO = eRO-sRO+1;
+        size_t lenE1 = eE1-sE1+1;
+
+        size_t rowA = lenE1*lenRO;
+        size_t colA = (kRO*kE1-1)*srcCHA;
+        size_t colB = dstCHA;
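+        // Note that colA excludes one point per source channel: the fill loop below skips the
+        // position (kro == oro, ke1 == oe1), i.e. the target sample itself, reflecting the SPIRiT
+        // self-consistency constraint that every k-space point is predicted from its neighbours
+        // across all channels but not from itself.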
+
+        bool useGPU = (typeid(typename realType<T>::Type)==typeid(float) && calib_use_gpu_);
+        //if ( useGPU )
+        //{
+        //    GDEBUG_STREAM("spirit 2D - calling GPU kernel estimation ... "); 
+        //}
+
+        const T* pAcsSrc = acsSrc.begin();
+
+        #pragma omp parallel default(none) shared(RO, E1, sRO, eRO, sE1, eE1, oRO, oE1, lenRO, lenE1, rowA, colA, colB, kRO, kE1, kROhalf, kE1half, oROhalf, oE1half, pAcsSrc, acsSrc, acsDst, srcCHA, dstCHA, thres, ker, useGPU, std::cout) num_threads( (int)(oRO*oE1) ) if (oRO*oE1>=3)
+        {
+            hoMatrix<T> A(rowA, colA);
+            T* pA = A.begin();
+
+            hoMatrix<T> B(rowA, colB);
+            T* pB = B.begin();
+
+            hoMatrix<T> x( A.cols(), B.cols() );
+
+            long long kInd = 0;
+            #pragma omp for
+            for ( kInd=0; kInd<(long long)(oRO*oE1); kInd++ )
+            {
+                long long oe1 = kInd/oRO;
+                long long oro = kInd - oe1*oRO;
+
+                oe1 -=oE1half;
+                oro -=oROhalf;
+
+                long long dRO, dE1;
+
+                for ( long long e1=(long long)sE1; e1<=(long long)eE1; e1++ )
+                {
+                    dE1 = e1 + oe1;
+
+                    for ( long long ro=sRO; ro<=(long long)eRO; ro++ )
+                    {
+                        dRO = ro + oro;
+
+                        long long rInd = (e1-sE1)*lenRO+ro-sRO;
+
+                        // fill matrix A
+                        size_t col = 0;
+                        for ( size_t src=0; src<srcCHA; src++ )
+                        {
+                            for ( long long ke1=-kE1half; ke1<=kE1half; ke1++ )
+                            {
+                                for ( long long kro=-kROhalf; kro<=kROhalf; kro++ )
+                                {
+                                    if ( kro!=oro || ke1!=oe1 )
+                                    {
+                                        //A(rInd, col++) = acsSrc(ro+kro, e1+ke1, src);
+                                        // pA[rInd + col*rowA] = acsSrc(ro+kro, e1+ke1, src);
+                                        pA[rInd + col*rowA] = pAcsSrc[ro+kro + (e1+ke1)*RO + src*RO*E1];
+                                        col++;
+                                    }
+                                }
+                            }
+                        }
+
+                        // fill matrix B
+                        for ( size_t dst=0; dst<dstCHA; dst++ )
+                        {
+                            //B(rInd, dst) = acsDst(dRO, dE1, dst);
+                            pB[rInd+dst*rowA] = acsDst(dRO, dE1, dst);
+                        }
+                    }
+                }
+
+                // GADGET_CHECK_RETURN_FALSE(SolveLinearSystem_Tikhonov(A, B, x, thres));
+
+                //if ( performTiming_ ) { gt_timer3_.start("grappa 3D calibration - solve linear system ... "); }
+                //#ifdef USE_CUDA
+                //    // go to device
+                //    try
+                //    {
+                //        if ( useGPU )
+                //        {
+                //            hoNDArray<float_complext> A_tmp(A.get_dimensions(), reinterpret_cast<float_complext*>(A.begin()));
+                //            hoNDArray<float_complext> B_tmp(B.get_dimensions(), reinterpret_cast<float_complext*>(B.begin()));
+
+                //            int ret(0);
+                //            boost::shared_ptr< hoNDArray<complext<float> > > host_x;
+
+                //            #pragma omp critical(inverse_spirit)
+                //            {
+                //                cuNDArray<float_complext> device_A(A_tmp);
+                //                cuNDArray<float_complext> device_B(B_tmp);
+                //                cuNDArray<float_complext> device_x;
+
+                //                ret = Gadgetron::inverse_clib_matrix(&device_A, &device_B, &device_x, thres);
+                //                if ( ret == 0 )
+                //                {
+                //                    host_x = device_x.to_host();
+                //                }
+                //            }
+
+                //            if ( ret != 0 )
+                //            {
+                //                GERROR_STREAM("failed in Gadgetron::inverse_clib_matrix(&device_A, &device_B, &device_x, thres) ... ");
+                //                SolveLinearSystem_Tikhonov(A, B, x, thres);
+                //            }
+                //            else
+                //            {
+                //                memcpy(x.begin(), host_x->begin(), host_x->get_number_of_bytes());
+                //            }
+                //        }
+                //        else
+                //        {
+                //            GWARN_STREAM("GPU inverse_clib_matrix is only available for single-precision, calling the CPU version ... ");
+                //            SolveLinearSystem_Tikhonov(A, B, x, thres);
+                //        }
+                //    }
+                //    catch(...)
+                //    {
+                //        GERROR_STREAM("failed in GPU inverse_clib_matrix for grappa, calling the CPU version ... ");
+                //        SolveLinearSystem_Tikhonov(A, B, x, thres);
+                //    }
+                //#else
+                    SolveLinearSystem_Tikhonov(A, B, x, thres);
+                //#endif // USE_CUDA
+                //if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                //SolveLinearSystem_Tikhonov(A, B, x, thres);
+
+                long long ind(0);
+                for ( size_t src=0; src<srcCHA; src++ )
+                {
+                    for ( long long ke1=-kE1half; ke1<=kE1half; ke1++ ) 
+                    {
+                        for ( long long kro=-kROhalf; kro<=kROhalf; kro++ ) 
+                        {
+                            if ( kro!=oro || ke1!=oe1 )
+                            {
+                                for ( size_t dst=0; dst<dstCHA; dst++ )
+                                {
+                                    ker(kro+kROhalf, ke1+kE1half, src, dst, oro+oROhalf, oe1+oE1half) = x(ind, dst);
+                                }
+                                ind++;
+                            }
+                            else
+                            {
+                                for ( size_t dst=0; dst<dstCHA; dst++ )
+                                {
+                                    ker(kro+kROhalf, ke1+kE1half, src, dst, oro+oROhalf, oe1+oE1half) = 0;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRIT<T>::calib(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRIT<T>::
+imageDomainKernel(const ho6DArray<T>& ker, size_t kRO, size_t kE1, size_t oRO, size_t oE1, size_t ro, size_t e1, hoNDArray<T>& kIm, bool minusI)
+{
+    try
+    {
+        long long srcCHA = (long long)(ker.get_size(2));
+        long long dstCHA = (long long)(ker.get_size(3));
+
+        GADGET_CHECK_RETURN_FALSE(kRO==ker.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(kE1==ker.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(oRO==ker.get_size(4));
+        GADGET_CHECK_RETURN_FALSE(oE1==ker.get_size(5));
+
+        long long kROhalf = kRO/2;
+        long long kE1half = kE1/2;
+        long long oROhalf = oRO/2;
+        long long oE1half = oE1/2;
+
+        // allocate image domain kernel
+        kIm.create(ro, e1, srcCHA, dstCHA);
+
+        /// fill the convolution kernels
+        long long convKRO = 2*kRO-1;
+        long long convKE1 = 2*kE1-1;
+
+        /// fill in convolution kernel
+        ho6DArray<T> convKer(convKRO, convKE1, srcCHA, dstCHA, oRO, oE1);
+        Gadgetron::clear(&convKer);
+
+        long long oro, oe1, kro, ke1, src, dst;
+        for ( oe1=-oE1half; oe1<=oE1half; oe1++ )
+        {
+            for ( oro=-oROhalf; oro<=oROhalf; oro++ )
+            {
+                for ( ke1=-kE1half; ke1<=kE1half; ke1++ )
+                {
+                    for ( kro=-kROhalf; kro<=kROhalf; kro++ )
+                    {
+                        long long iro = kro - oro + kRO -1;
+                        long long ie1 = ke1 - oe1 + kE1 -1;
+
+                        for ( dst=0; dst<dstCHA; dst++ )
+                        {
+                            for ( src=0; src<srcCHA; src++ )
+                            {
+                                convKer(iro, ie1, src, dst, oro+oROhalf, oe1+oE1half) = ker(kro+kROhalf, ke1+kE1half, src, dst, oro+oROhalf, oe1+oE1half);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        hoNDArray<T> convKer2;
+        hoNDArray<T> conKerMean(convKRO, convKE1, srcCHA, dstCHA, 1, 1);
+
+        //GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(convKer, convKer2));
+        //GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(convKer2, conKerMean));
+
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::sum_over_dimension(convKer, convKer2, 5));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::sum_over_dimension(convKer2, conKerMean, 4));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::scal( (typename realType<T>::Type)(1.0/(oRO*oE1)), conKerMean) );
+
+        // flip the kernel
+        ho4DArray<T> convKerFlip(convKRO, convKE1, srcCHA, dstCHA);
+        Gadgetron::clear(&convKerFlip);
+        for ( ke1=0; ke1<convKE1; ke1++ )
+        {
+            for ( kro=0; kro<convKRO; kro++ )
+            {
+                for ( dst=0; dst<dstCHA; dst++ )
+                {
+                    for ( src=0; src<srcCHA; src++ )
+                    {
+                        convKerFlip( kro, ke1, src, dst) = conKerMean(convKRO-1-kro, convKE1-1-ke1, src, dst, 0, 0);
+                    }
+                }
+            }
+        }
+
+        // minus I
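+        // Subtracting 1 from the central tap of each dst==dst kernel turns the interpolation
+        // kernel G into (G - I), the operator used in the SPIRIT self-consistency term (G - I)x = 0.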
+        if ( minusI )
+        {
+            for ( dst=0; dst<dstCHA; dst++ )
+            {
+                T value = convKerFlip(kRO -1, kE1 -1, dst, dst);
+                convKerFlip(kRO -1, kE1 -1, dst, dst) = value - T(1.0);
+            }
+        }
+
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::scal( (typename realType<T>::Type)( std::sqrt((double)(ro*e1)) ), convKerFlip ));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::pad(ro, e1, &convKerFlip, &kIm));
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(kIm);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRIT<T>::imageDomainKernel(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRIT<T>::
+calib3D(const ho4DArray<T>& acsSrc, const ho4DArray<T>& acsDst, double thres, double overDetermineRatio, 
+            size_t kRO, size_t kE1, size_t kE2, size_t oRO, size_t oE1, size_t oE2, hoNDArray<T>& ker)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(0)==acsDst.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(1)==acsDst.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(2)==acsDst.get_size(2));
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(3)>=acsDst.get_size(3));
+
+        size_t RO = acsSrc.get_size(0);
+        size_t E1 = acsSrc.get_size(1);
+        size_t E2 = acsSrc.get_size(2);
+        size_t srcCHA = acsSrc.get_size(3);
+        size_t dstCHA = acsDst.get_size(3);
+
+        long long kROhalf = kRO/2;
+        if ( 2*kROhalf == kRO )
+        {
+            GWARN_STREAM("gtPlusSPIRIT<T>::calib3D(...) - 2*kROhalf == kRO " << kRO);
+        }
+        kRO = 2*kROhalf + 1;
+
+        long long kE1half = kE1/2;
+        if ( 2*kE1half == kE1 )
+        {
+            GWARN_STREAM("gtPlusSPIRIT<T>::calib3D(...) - 2*kE1half == kE1 " << kE1);
+        }
+        kE1 = 2*kE1half + 1;
+
+        long long kE2half = kE2/2;
+        if ( 2*kE2half == kE2 )
+        {
+            GWARN_STREAM("gtPlusSPIRIT<T>::calib3D(...) - 2*kE2half == kE2 " << kE2);
+        }
+        kE2 = 2*kE2half + 1;
+
+        if ( oRO > kRO ) oRO = kRO;
+        if ( oE1 > kE1 ) oE1 = kE1;
+        if ( oE2 > kE2 ) oE2 = kE2;
+
+        long long oROhalf = oRO/2;
+        if ( 2*oROhalf == oRO )
+        {
+            GWARN_STREAM("gtPlusSPIRIT<T>::calib3D(...) - 2*oROhalf == oRO " << oRO);
+        }
+        oRO = 2*oROhalf + 1;
+
+        long long oE1half = oE1/2;
+        if ( 2*oE1half == oE1 )
+        {
+            GWARN_STREAM("gtPlusSPIRIT<T>::calib3D(...) - 2*oE1half == oE1 " << oE1);
+        }
+        oE1 = 2*oE1half + 1;
+
+        long long oE2half = oE2/2;
+        if ( 2*oE2half == oE2 )
+        {
+            GWARN_STREAM("gtPlusSPIRIT<T>::calib3D(...) - 2*oE2half == oE2 " << oE2);
+        }
+        oE2 = 2*oE2half + 1;
+
+        // allocate kernel
+        ker.create(kRO, kE1, kE2, srcCHA, dstCHA, oRO, oE1, oE2);
+
+        // loop over the calibration region and assemble the equation
+        // Ax = b
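+        // Each row of A collects the source-channel neighbors of one calibration point, excluding
+        // the output offset itself; the matching row of B holds the target-channel values at that
+        // point. Solving the Tikhonov-regularized system A x = B per output offset gives the
+        // SPIRIT kernel weights.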
+
+        size_t sRO = kROhalf;
+        size_t eRO = RO - kROhalf -1;
+        size_t lenRO = eRO-sRO+1;
+
+        size_t sE1 = kE1half;
+        size_t eE1 = E1 - kE1half -1;
+        size_t lenE1 = eE1-sE1+1;
+
+        size_t sE2 = kE2half;
+        size_t eE2 = E2 - kE2half -1;
+        size_t lenE2 = eE2-sE2+1;
+
+        size_t colA = (kRO*kE1*kE2-1)*srcCHA;
+        if ( overDetermineRatio > 1.0 )
+        {
+            size_t maxRowA = (size_t)std::ceil(overDetermineRatio*colA);
+            size_t maxROUsed = maxRowA/(lenE1*lenE2);
+            if ( maxROUsed < lenRO )
+            {
+                // find the peak signal of acsSrc
+                hoNDArray<T> acsSrc1stCha(RO, E1, E2, const_cast<T*>(acsSrc.begin()));
+                hoNDArray<T> acsSrc1stChaSumE2(RO, E1, 1), acsSrc1stChaSumE2E1(RO, 1, 1);
+
+                try
+                {
+                    Gadgetron::sum_over_dimension(acsSrc1stCha, acsSrc1stChaSumE2, 2);
+                    Gadgetron::sum_over_dimension(acsSrc1stChaSumE2, acsSrc1stChaSumE2E1, 1);
+
+                    T maxSignal;
+                    size_t roInd(0);
+                    try
+                    {
+                        Gadgetron::maxAbsolute(acsSrc1stChaSumE2E1, maxSignal, roInd);
+
+                        if ( roInd > maxROUsed/2+kROhalf )
+                        {
+                            sRO = roInd - maxROUsed/2;
+                        }
+                        else
+                        {
+                            sRO = kROhalf;
+                        }
+
+                        if( sRO+maxROUsed-1 <= RO-kROhalf-1 )
+                        {
+                            eRO = sRO + maxROUsed - 1;
+                        }
+                        else
+                        {
+                            eRO = RO - kROhalf -1;
+                        }
+
+                        lenRO = eRO-sRO+1;
+                        GDEBUG_STREAM("gtPlusSPIRIT<T>::calib3D(...) - overDetermineRatio = " << overDetermineRatio << " ; RO data range used : [" << sRO << " " << eRO << "] ...");
+                    }
+                    catch(...)
+                    {
+                        GWARN_STREAM("gtPlusSPIRIT<T>::calib3D(...) - overDetermineRatio is ignored ... ");
+                        throw;
+                    }
+                }
+                catch (...)
+                {
+                    GWARN_STREAM("gtPlusSPIRIT<T>::calib3D(...) - overDetermineRatio is ignored ... ");
+                    throw;
+                }
+            }
+        }
+
+        size_t rowA = lenRO*lenE1*lenE2;
+        size_t colB = dstCHA;
+
+        bool useGPU = (typeid(typename realType<T>::Type)==typeid(float) && calib_use_gpu_);
+        if ( useGPU )
+        {
+            GDEBUG_STREAM("spirit 3D - calling GPU kernel estimation ... ");
+        }
+
+        #pragma omp parallel default(none) shared(sRO, eRO, sE1, eE1, sE2, eE2, oRO, oE1, oE2, lenRO, lenE1, lenE2, rowA, colA, colB, kROhalf, kE1half, kE2half, oROhalf, oE1half, oE2half, acsSrc, acsDst, srcCHA, dstCHA, thres, ker, useGPU, std::cout) num_threads( (int)(oRO*oE1*oE2) ) if (oRO*oE1*oE2>=3 && oRO*oE1*oE2<9)
+        {
+            hoMatrix<T> A(rowA, colA);
+            hoMatrix<T> B(rowA, colB);
+            hoMatrix<T> x( A.cols(), B.cols() );
+
+            T* pA = A.begin();
+            T* pB = B.begin();
+
+            long long kInd = 0;
+            #pragma omp for
+            for ( kInd=0; kInd<(long long)(oRO*oE1*oE2); kInd++ )
+            {
+                long long oe2 = kInd/(oRO*oE1);
+                long long oe1 = kInd - oe2*oRO*oE1;
+                oe1 /= oRO;
+                long long oro = kInd - oe2*oRO*oE1 - oe1*oRO;
+
+                oe2 -=oE2half;
+                oe1 -=oE1half;
+                oro -=oROhalf;
+
+                long long dRO, dE1, dE2;
+
+                for ( long long e2=(long long)sE2; e2<=(long long)eE2; e2++ )
+                {
+                    dE2 = e2 + oe2;
+
+                    for ( long long e1=(long long)sE1; e1<=(long long)eE1; e1++ )
+                    {
+                        dE1 = e1 + oe1;
+
+                        for ( long long ro=sRO; ro<=(long long)eRO; ro++ )
+                        {
+                            dRO = ro + oro;
+
+                            long long rInd = (e2-sE2)*lenRO*lenE1 + (e1-sE1)*lenRO + ro-sRO;
+
+                            // fill matrix A
+                            size_t col = 0;
+                            for ( size_t src=0; src<srcCHA; src++ )
+                            {
+                                for ( long long ke2=-kE2half; ke2<=kE2half; ke2++ )
+                                {
+                                    for ( long long ke1=-kE1half; ke1<=kE1half; ke1++ )
+                                    {
+                                        for ( long long kro=-kROhalf; kro<=kROhalf; kro++ )
+                                        {
+                                            if ( kro!=oro || ke1!=oe1 || ke2!=oe2 )
+                                            {
+                                                //A(rInd, col++) = acsSrc(ro+kro, e1+ke1, e2+ke2, src);
+                                                pA[rInd+col*rowA] = acsSrc(ro+kro, e1+ke1, e2+ke2, src);
+                                                col++;
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+
+                            // fill matrix B
+                            for ( size_t dst=0; dst<dstCHA; dst++ )
+                            {
+                                //B(rInd, dst) = acsDst(dRO, dE1, dE2, dst);
+                                pB[rInd+dst*rowA] = acsDst(dRO, dE1, dE2, dst);
+                            }
+                        }
+                    }
+                }
+
+                //GADGET_CHECK_RETURN_FALSE(SolveLinearSystem_Tikhonov(A, B, x, thres));
+
+                //if ( performTiming_ ) { gt_timer3_.start("grappa 3D calibration - solve linear system ... "); }
+                //#ifdef USE_CUDA
+                //    // go to device
+                //    try
+                //    {
+                //        if ( useGPU )
+                //        {
+                //            hoNDArray<float_complext> A_tmp(A.get_dimensions(), reinterpret_cast<float_complext*>(A.begin()));
+                //            hoNDArray<float_complext> B_tmp(B.get_dimensions(), reinterpret_cast<float_complext*>(B.begin()));
+
+                //            int ret(0);
+                //            boost::shared_ptr< hoNDArray<complext<float> > > host_x;
+                //            #pragma omp critical(inverse_spirit3D)
+                //            {
+                //                cuNDArray<float_complext> device_A(A_tmp);
+                //                cuNDArray<float_complext> device_B(B_tmp);
+                //                cuNDArray<float_complext> device_x;
+
+                //                ret = Gadgetron::inverse_clib_matrix(&device_A, &device_B, &device_x, thres);
+                //                if ( ret == 0 )
+                //                {
+                //                    host_x = device_x.to_host();
+                //                }
+                //            }
+
+                //            if ( ret != 0 )
+                //            {
+                //                GERROR_STREAM("failed in Gadgetron::inverse_clib_matrix(&device_A, &device_B, &device_x, thres) ... ");
+                //                SolveLinearSystem_Tikhonov(A, B, x, thres);
+                //            }
+                //            else
+                //            {
+                //                memcpy(x.begin(), host_x->begin(), x.get_number_of_bytes());
+                //            }
+                //        }
+                //        else
+                //        {
+                //            GWARN_STREAM("GPU inverse_clib_matrix is only available for single-precision, calling the CPU version ... ");
+                //            SolveLinearSystem_Tikhonov(A, B, x, thres);
+                //        }
+                //    }
+                //    catch(...)
+                //    {
+                //        GERROR_STREAM("failed in GPU inverse_clib_matrix for grappa, calling the CPU version ... ");
+                //        SolveLinearSystem_Tikhonov(A, B, x, thres);
+                //    }
+                //#else
+                    SolveLinearSystem_Tikhonov(A, B, x, thres);
+                //#endif // USE_CUDA
+                //if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                // SolveLinearSystem_Tikhonov(A, B, x, thres);
+
+                long long ind(0);
+
+                std::vector<size_t> kerInd(8);
+                kerInd[7] = oe2+oE2half;
+                kerInd[6] = oe1+oE1half;
+                kerInd[5] = oro+oROhalf;
+
+                for ( size_t src=0; src<srcCHA; src++ )
+                {
+                    kerInd[3] = src;
+                    for ( long long ke2=-kE2half; ke2<=kE2half; ke2++ ) 
+                    {
+                        kerInd[2] = ke2+kE2half;
+                        for ( long long ke1=-kE1half; ke1<=kE1half; ke1++ ) 
+                        {
+                            kerInd[1] = ke1+kE1half;
+                            for ( long long kro=-kROhalf; kro<=kROhalf; kro++ ) 
+                            {
+                                kerInd[0] = kro+kROhalf;
+
+                                if ( kro!=0 || ke1!=0 || ke2!=0 )
+                                {
+                                    for ( size_t dst=0; dst<dstCHA; dst++ )
+                                    {
+                                        kerInd[4] = dst;
+                                        size_t offset = ker.calculate_offset(kerInd);
+                                        ker(offset) = x(ind, dst);
+                                    }
+                                    ind++;
+                                }
+                                else
+                                {
+                                    for ( size_t dst=0; dst<dstCHA; dst++ )
+                                    {
+                                        kerInd[4] = dst;
+                                        size_t offset = ker.calculate_offset(kerInd);
+                                        ker(offset) = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRIT<T>::calib3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRIT<T>::
+kspaceDomainConvKernel3D(const hoNDArray<T>& ker, size_t kRO, size_t kE1, size_t kE2, size_t oRO, size_t oE1, size_t oE2, ho5DArray<T>& convKerFlip, bool minusI, bool ROis3rdDim)
+{
+    try
+    {
+        long long srcCHA = (long long)(ker.get_size(3));
+        long long dstCHA = (long long)(ker.get_size(4));
+
+        GADGET_CHECK_RETURN_FALSE(kRO==ker.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(kE1==ker.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(kE2==ker.get_size(2));
+        GADGET_CHECK_RETURN_FALSE(oRO==ker.get_size(5));
+        GADGET_CHECK_RETURN_FALSE(oE1==ker.get_size(6));
+        GADGET_CHECK_RETURN_FALSE(oE2==ker.get_size(7));
+
+        long long kROhalf = kRO/2;
+        long long kE1half = kE1/2;
+        long long kE2half = kE2/2;
+        long long oROhalf = oRO/2;
+        long long oE1half = oE1/2;
+        long long oE2half = oE2/2;
+
+        // size of the full convolution kernel: 2*k - 1 along RO, E1 and E2
+        long long convKRO = 2*kRO-1;
+        long long convKE1 = 2*kE1-1;
+        long long convKE2 = 2*kE2-1;
+
+        /// fill in convolution kernel
+        if ( performTiming_ ) { gt_timer3_.start("spirit 3D calibration - convert to conv kernel ... "); }
+
+        hoNDArray<T> convKer(convKRO, convKE1, convKE2, srcCHA, dstCHA, oRO, oE1, oE2);
+        Gadgetron::clear(&convKer);
+
+        long long oro, oe1, oe2, kro, ke1, ke2, src, dst;
+        std::vector<size_t> kerInd(8), convKerInd(8);
+        for ( oe2=-oE2half; oe2<=oE2half; oe2++ )
+        {
+            kerInd[7] = oe2+oE2half;
+            convKerInd[7] = oe2+oE2half;
+
+            for ( oe1=-oE1half; oe1<=oE1half; oe1++ )
+            {
+                kerInd[6] = oe1+oE1half;
+                convKerInd[6] = oe1+oE1half;
+
+                for ( oro=-oROhalf; oro<=oROhalf; oro++ )
+                {
+                    kerInd[5] = oro+oROhalf;
+                    convKerInd[5] = oro+oROhalf;
+
+                    for ( ke2=-kE2half; ke2<=kE2half; ke2++ )
+                    {
+                        kerInd[2] = ke2+kE2half;
+
+                        for ( ke1=-kE1half; ke1<=kE1half; ke1++ )
+                        {
+                            kerInd[1] = ke1+kE1half;
+
+                            for ( kro=-kROhalf; kro<=kROhalf; kro++ )
+                            {
+                                long long iro = kro - oro + kRO -1;
+                                long long ie1 = ke1 - oe1 + kE1 -1;
+                                long long ie2 = ke2 - oe2 + kE2 -1;
+
+                                kerInd[0] = kro+kROhalf;
+
+                                convKerInd[0] = iro;
+                                convKerInd[1] = ie1;
+                                convKerInd[2] = ie2;
+
+                                for ( dst=0; dst<dstCHA; dst++ )
+                                {
+                                    kerInd[4] = dst;
+                                    convKerInd[4] = dst;
+
+                                    for ( src=0; src<srcCHA; src++ )
+                                    {
+                                        kerInd[3] = src;
+                                        convKerInd[3] = src;
+
+                                        size_t offsetKer = ker.calculate_offset(kerInd);
+                                        size_t offsetConvKer = convKer.calculate_offset(convKerInd);
+
+                                        convKer(offsetConvKer) = ker(offsetKer);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        if ( performTiming_ ) { gt_timer3_.start("spirit 3D calibration - sum over output dimensions ... "); }
+        hoNDArray<T> convKer2, convKer3;
+
+        //ho5DArray<T> convKernMean(convKRO, convKE1, convKE2, srcCHA, dstCHA);
+        //GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(convKer, convKer2));
+        //GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(convKer2, convKer3));
+        //GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(convKer3, convKernMean));
+
+        hoNDArray<T> convKernMean(convKRO, convKE1, convKE2, srcCHA, dstCHA, 1, 1, 1);
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::sum_over_dimension(convKer, convKer2, 7));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::sum_over_dimension(convKer2, convKer3, 6));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::sum_over_dimension(convKer3, convKernMean, 5));
+
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::scal( (typename realType<T>::Type)(1.0/(oRO*oE1*oE2)), convKernMean) );
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        if ( performTiming_ ) { gt_timer3_.start("spirit 3D calibration - flip along dimensions ... "); }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(convKernMean, debugFolder_+"convKernMean"); }
+
+        // flip the kernel
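+        // The averaged kernel is flipped along RO, E1 and E2; depending on ROis3rdDim it is stored
+        // either as [E1 E2 RO srcCHA dstCHA] or as [RO E1 E2 srcCHA dstCHA].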
+        if ( ROis3rdDim ) // E1, E2, RO
+        {
+            convKerFlip.createArray(convKE1, convKE2, convKRO, srcCHA, dstCHA);
+            Gadgetron::clear(&convKerFlip);
+
+            for ( ke2=0; ke2<convKE2; ke2++ )
+            {
+                for ( ke1=0; ke1<convKE1; ke1++ )
+                {
+                    for ( kro=0; kro<convKRO; kro++ )
+                    {
+                        for ( dst=0; dst<dstCHA; dst++ )
+                        {
+                            for ( src=0; src<srcCHA; src++ )
+                            {
+                                T value = convKernMean(convKRO-1-kro, convKE1-1-ke1, convKE2-1-ke2, src, dst, 0, 0, 0);
+                                convKerFlip(ke1, ke2, kro, src, dst) = value;
+                            }
+                        }
+                    }
+                }
+            }
+            if ( performTiming_ ) { gt_timer3_.stop(); }
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(convKerFlip, debugFolder_+"convKerFlip"); }
+
+            // minus I
+            if ( minusI )
+            {
+                for ( dst=0; dst<dstCHA; dst++ )
+                {
+                    T value = convKerFlip(kE1 -1, kE2 -1, kRO -1, dst, dst);
+                    convKerFlip(kE1 -1, kE2 -1, kRO -1, dst, dst) = value - T(1.0);
+                }
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(convKerFlip, debugFolder_+"convKerFlip_minusI"); }
+            }
+        }
+        else
+        {
+            // RO, E1, E2
+            convKerFlip.createArray(convKRO, convKE1, convKE2, srcCHA, dstCHA);
+            Gadgetron::clear(&convKerFlip);
+
+            for ( ke2=0; ke2<convKE2; ke2++ )
+            {
+                for ( ke1=0; ke1<convKE1; ke1++ )
+                {
+                    for ( kro=0; kro<convKRO; kro++ )
+                    {
+                        for ( dst=0; dst<dstCHA; dst++ )
+                        {
+                            for ( src=0; src<srcCHA; src++ )
+                            {
+                                T value = convKernMean(convKRO-1-kro, convKE1-1-ke1, convKE2-1-ke2, src, dst);
+                                convKerFlip(kro, ke1, ke2, src, dst) = value;
+                            }
+                        }
+                    }
+                }
+            }
+            if ( performTiming_ ) { gt_timer3_.stop(); }
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(convKerFlip, debugFolder_+"convKerFlip"); }
+
+            // minus I
+            if ( minusI )
+            {
+                for ( dst=0; dst<dstCHA; dst++ )
+                {
+                    T value = convKerFlip(kRO -1, kE1 -1, kE2 -1, dst, dst);
+                    convKerFlip(kRO -1, kE1 -1, kE2 -1, dst, dst) = value - T(1.0);
+                }
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(convKerFlip, debugFolder_+"convKerFlip_minusI"); }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRIT<T>::kspaceDomainConvKernel3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRIT<T>::
+imageDomainKernel3D(const hoNDArray<T>& ker, size_t kRO, size_t kE1, size_t kE2, size_t oRO, size_t oE1, size_t oE2, size_t ro, size_t e1, size_t e2, hoNDArray<T>& kIm, bool minusI)
+{
+    try
+    {
+        long long srcCHA = (long long)(ker.get_size(3));
+        long long dstCHA = (long long)(ker.get_size(4));
+
+        GADGET_CHECK_RETURN_FALSE(kRO==ker.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(kE1==ker.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(kE2==ker.get_size(2));
+        GADGET_CHECK_RETURN_FALSE(oRO==ker.get_size(5));
+        GADGET_CHECK_RETURN_FALSE(oE1==ker.get_size(6));
+        GADGET_CHECK_RETURN_FALSE(oE2==ker.get_size(7));
+
+        // allocate image domain kernel
+        kIm.create(e1, e2, ro, srcCHA, dstCHA);
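+        // Note the [E1 E2 RO srcCHA dstCHA] layout: the conv kernel is built with RO as the third
+        // dimension (ROat3rdDim==true below), zero-padded to the image size and transformed with a
+        // centered 3D inverse FFT per src/dst channel pair.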
+
+        bool ROat3rdDim = true;
+        ho5DArray<T> convKerFlip;
+        GADGET_CHECK_RETURN_FALSE(this->kspaceDomainConvKernel3D(ker, kRO, kE1,  kE2, oRO, oE1, oE2, convKerFlip, minusI, ROat3rdDim));
+
+        if ( performTiming_ ) { gt_timer3_.start("spirit 3D calibration - SNR unit scaling ... "); }
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::scal( (typename realType<T>::Type)( std::sqrt((double)(ro*e1*e2)) ), convKerFlip ));
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(convKerFlip, debugFolder_+"convKerFlip_scal"); }
+
+        if ( performTiming_ ) { gt_timer3_.start("spirit 3D calibration - zero padding ... "); }
+        // GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().zeropad3D(convKerFlip, e1, e2, ro, kIm));
+        // GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().zeropad3DNoPresetZeros(convKerFlip, e1, e2, ro, kIm));
+        // GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::zeropad3D(convKerFlip, e1, e2, ro, kIm, false));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::pad(e1, e2, ro, &convKerFlip, &kIm, false));
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kIm, debugFolder_+"convKerFlip_scal_zeropadded"); }
+
+        if ( performTiming_ ) { gt_timer3_.start("spirit 3D calibration - convert to image domain ... "); }
+
+        long long n;
+
+        #pragma omp parallel default(none) private(n) shared(ro, e1, e2, srcCHA, dstCHA, kIm)
+        {
+            hoNDArray<T> kImTmp(ro, e1, e2);
+            hoNDArray<T> kImRes(ro, e1, e2);
+
+            #pragma omp for 
+            for (n = 0; n < srcCHA*dstCHA; n++)
+            {
+                long long d = n / srcCHA;
+                long long s = n - d*srcCHA;
+
+                T* pkImCha = kIm.begin() + d*ro*e1*e2*srcCHA + s*ro*e1*e2;
+
+                hoNDArray<T> kImCha(ro, e1, e2, pkImCha);
+                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(kImCha, kImRes, kImTmp);
+                memcpy(pkImCha, kImRes.begin(), kImRes.get_number_of_bytes());
+            }
+        }
+
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRIT<T>::imageDomainKernel3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRIT<T>::
+imageDomainKernelRO3D(const hoNDArray<T>& ker, size_t kRO, size_t kE1, size_t kE2, size_t oRO, size_t oE1, size_t oE2, size_t ro, hoNDArray<T>& kImRO, bool minusI)
+{
+    try
+    {
+        long long srcCHA = (long long)(ker.get_size(3));
+        long long dstCHA = (long long)(ker.get_size(4));
+
+        GADGET_CHECK_RETURN_FALSE(kRO==ker.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(kE1==ker.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(kE2==ker.get_size(2));
+        GADGET_CHECK_RETURN_FALSE(oRO==ker.get_size(5));
+        GADGET_CHECK_RETURN_FALSE(oE1==ker.get_size(6));
+        GADGET_CHECK_RETURN_FALSE(oE2==ker.get_size(7));
+
+        bool ROat3rdDim = false;
+        ho5DArray<T> convKerFlip;
+        GADGET_CHECK_RETURN_FALSE(this->kspaceDomainConvKernel3D(ker, kRO, kE1,  kE2, oRO, oE1, oE2, convKerFlip, minusI, ROat3rdDim));
+
+        // allocate image domain kernel
+        size_t kConvE1 = convKerFlip.get_size(1);
+        size_t kConvE2 = convKerFlip.get_size(2);
+
+        kImRO.create(kConvE1, kConvE2, ro, srcCHA, dstCHA);
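+        // Hybrid-space kernel: only the RO direction is transformed to the image domain here
+        // (centered 1D inverse FFT along RO); E1 and E2 remain in convolution-kernel space and are
+        // handled later by imageDomainKernelE1E2RO(...).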
+
+        hoNDArray<T> kImROTemp(ro, kConvE1, kConvE2, srcCHA, dstCHA);
+        Gadgetron::clear(kImROTemp);
+
+        if ( performTiming_ ) { gt_timer3_.start("spirit 3D calibration - SNR unit scaling ... "); }
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::scal( (typename realType<T>::Type)( std::sqrt((double)(ro)) ), convKerFlip ));
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(convKerFlip, debugFolder_+"convKerFlip_scal_RO"); }
+
+        if ( performTiming_ ) { gt_timer3_.start("spirit 3D calibration - zero padding only for RO ... "); }
+        // GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().zeropad3DNoPresetZeros(convKerFlip, ro, kConvE1, kConvE2, kImROTemp));
+        // GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::zeropad3D(convKerFlip, ro, kConvE1, kConvE2, kImROTemp, false));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::pad(ro, kConvE1, kConvE2, &convKerFlip, &kImROTemp, false));
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kImROTemp, debugFolder_+"convKerFlip_scal_RO_zeropadded"); }
+
+        if ( performTiming_ ) { gt_timer3_.start("spirit 3D calibration - convert to image domain only for RO ... "); }
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft1c(kImROTemp);
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        if ( performTiming_ ) { gt_timer3_.start("spirit 3D calibration - permute kernel dimensions to be [kE1 kE2 RO ...]  ... "); }
+
+        std::vector<size_t> dim_order(3);
+        dim_order[0] = 1;
+        dim_order[1] = 2;
+        dim_order[2] = 0;
+
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(&kImROTemp, &kImRO, &dim_order));
+
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRIT<T>::imageDomainKernelRO3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRIT<T>::
+imageDomainKernelE1E2RO(const hoNDArray<T>& kImRO, size_t e1, size_t e2, hoNDArray<T>& kImE1E2RO)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dim = kImRO.get_dimensions();
+
+        std::vector<size_t> dimR(*dim);
+        dimR[0] = e1;
+        dimR[1] = e2;
+
+        kImE1E2RO.create(&dimR);
+        Gadgetron::clear(kImE1E2RO);
+
+        hoNDArray<T> kImROScaled(kImRO);
+
+        if ( performTiming_ ) { gt_timer3_.start("spirit 3D calibration - SNR unit scaling for E1 and E2 ... "); }
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::scal( (typename realType<T>::Type)( std::sqrt((double)(e1*e2)) ), kImROScaled ));
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kImROScaled, debugFolder_+"kImROScaledE1E2"); }
+
+        if ( performTiming_ ) { gt_timer3_.start("spirit 3D calibration - zero padding for E1 and E2 ... "); }
+        // GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().zeropad3DNoPresetZeros(kImROScaled, e1, e2, dimR[2], kImE1E2RO));
+        // GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::zeropad3D(kImROScaled, e1, e2, dimR[2], kImE1E2RO, false));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::pad(e1, e2, dimR[2], &kImROScaled, &kImE1E2RO, false));
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kImE1E2RO, debugFolder_+"kImE1E2RO_zeropadded_E1E2"); }
+
+        if ( performTiming_ ) { gt_timer3_.start("spirit 3D calibration - convert to image domain for E1 and E2 ... "); }
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(kImE1E2RO);
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRIT<T>::imageDomainKernelE1E2RO(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRIT<T>::
+imageDomainAdjointKernel(const hoNDArray<T>& kIm, hoNDArray<T>& adjkIm)
+{
+    try
+    {
+        std::vector<size_t> dim, dimAdj, dimOrder;
+        kIm.get_dimensions(dim);
+
+        size_t NDim = dim.size();
+
+        dimAdj = dim;
+        dimAdj[NDim - 1] = dim[NDim - 2];
+        dimAdj[NDim - 2] = dim[NDim - 1];
+
+        if (!adjkIm.dimensions_equal(&dimAdj))
+        {
+            adjkIm.create(dimAdj);
+        }
+
+        dimOrder.resize(NDim);
+
+        for (size_t d = 0; d < NDim; d++)
+        {
+            dimOrder[d] = d;
+        }
+        dimOrder[NDim - 2] = NDim - 1;
+        dimOrder[NDim - 1] = NDim - 2;
+
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(const_cast< hoNDArray<T>* >(&kIm), &adjkIm, &dimOrder));
+
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::conjugate(adjkIm, adjkIm));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRIT<T>::imageDomainAdjointKernel(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRIT<T>::AdjointForwardKernel(const hoNDArray<T>& kImS2D, const hoNDArray<T>& kImD2S, hoNDArray<T>& kIm)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dimS2D = kImS2D.get_dimensions();
+
+        size_t NDim = kImS2D.get_number_of_dimensions();
+
+        long long srcCHA = (*dimS2D)[NDim-2];
+        long long dstCHA = (*dimS2D)[NDim-1];
+
+        GADGET_CHECK_RETURN_FALSE(kImD2S.get_number_of_dimensions()==NDim);
+        GADGET_CHECK_RETURN_FALSE(kImD2S.get_number_of_elements()==kImS2D.get_number_of_elements());
+
+        std::vector<size_t> dimRes(*dimS2D);
+        dimRes[NDim-2] = dstCHA;
+
+        kIm.create(&dimRes);
+        Gadgetron::clear(&kIm);
+
+        size_t N = kImS2D.get_number_of_elements()/srcCHA/dstCHA;
+
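+        // Compose the two image-domain kernels channel by channel: the (d, dprime) entry of the
+        // result accumulates, over source channels s, the product of kImD2S(d, s) and kImS2D(s, dprime).
+        // setForwardKernel(...) uses this to build the combined (G-I)'(G-I) kernel.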
+        long long d;
+        #pragma omp parallel default(none) private(d) shared(N, dstCHA, srcCHA, kIm, kImS2D, kImD2S) num_threads( (int)dstCHA ) if (dstCHA > 4)
+        {
+            hoNDArray<T> ker(N);
+
+            std::vector<size_t> dim(1);
+            dim[0] = N;
+
+            hoNDArray<T> dKer, kerS2D, kerD2S;
+
+            #pragma omp for
+            for ( d=0; d<dstCHA; d++ )
+            {
+                for ( long long dprime=0; dprime<dstCHA; dprime++ )
+                {
+                    dKer.create(&dim, kIm.begin()+d*N+dprime*N*dstCHA);
+
+                    for ( long long s=0; s<srcCHA; s++ )
+                    {
+                        kerS2D.create(&dim, const_cast<T*>(kImS2D.begin())+s*N+dprime*N*srcCHA);
+                        kerD2S.create(&dim, const_cast<T*>(kImD2S.begin())+d*N+s*N*dstCHA);
+
+                        Gadgetron::multiply(kerS2D, kerD2S, ker);
+                        Gadgetron::add(dKer, ker, dKer);
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRIT<T>::AdjointForwardKernel(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusSPIRIT2DOperator.h b/toolboxes/gtplus/algorithm/gtPlusSPIRIT2DOperator.h
new file mode 100644
index 0000000..6f4e6b9
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusSPIRIT2DOperator.h
@@ -0,0 +1,239 @@
+/** \file       gtPlusSPIRIT2DOperator.h
+    \brief      Base class for gtPlus 2D operators
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusSPIRITOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusSPIRIT2DOperator : public gtPlusSPIRITOperator<T>
+{
+public:
+
+    typedef gtPlusSPIRITOperator<T> BaseClass;
+
+    gtPlusSPIRIT2DOperator() : BaseClass() {}
+    virtual ~gtPlusSPIRIT2DOperator() {}
+
+    virtual void printInfo(std::ostream& os);
+
+    // convert to image domain or back to kspace
+    virtual bool convertToImage(const hoNDArray<T>& x, hoNDArray<T>& im);
+    virtual bool convertToKSpace(const hoNDArray<T>& im, hoNDArray<T>& x);
+
+    // forward
+    virtual bool forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y);
+
+    // adjoint operator
+    virtual bool adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y);
+
+    using BaseClass::use_symmetric_spirit_;
+    using BaseClass::use_non_centered_fft_;
+    using BaseClass::calib_use_gpu_;
+
+public:
+
+    // [RO E1 srcCHA dstCHA]
+    using BaseClass::forward_kernel_;
+    using BaseClass::adjoint_kernel_;
+    using BaseClass::adjoint_forward_kernel_;
+    using BaseClass::acquired_points_;
+    using BaseClass::acquired_points_indicator_;
+    using BaseClass::unacquired_points_indicator_;
+
+    // helper memory
+    using BaseClass::kspace_;
+    using BaseClass::complexIm_;
+    using BaseClass::res_after_apply_kernel_;
+    using BaseClass::res_after_apply_kernel_sum_over_;
+
+    using BaseClass::kspace_Managed_;
+    using BaseClass::complexIm_Managed_;
+    using BaseClass::res_after_apply_kernel_Managed_;
+    using BaseClass::res_after_apply_kernel_sum_over_Managed_;
+};
+
+template <typename T> 
+void gtPlusSPIRIT2DOperator<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD SPIRIT 2D operator ------------------" << endl;
+    os << "Implementation of SPIRIT 2D operator for ISMRMRD package" << endl;
+    os << "----------------------------------------------------------------------" << endl;
+}
+
+template <typename T> 
+inline bool gtPlusSPIRIT2DOperator<T>::convertToImage(const hoNDArray<T>& x, hoNDArray<T>& im)
+{
+    if ( this->use_non_centered_fft_ )
+    {
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2(x, im);
+    }
+    else
+    {
+        if ( !complexIm_Managed_.dimensions_equal(&x) )
+        {
+            complexIm_Managed_.create(x.get_dimensions());
+        }
+
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(x, im, complexIm_Managed_);
+    }
+
+    return true;
+}
+
+template <typename T> 
+inline bool gtPlusSPIRIT2DOperator<T>::convertToKSpace(const hoNDArray<T>& im, hoNDArray<T>& x)
+{
+    if ( this->use_non_centered_fft_ )
+    {
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2(im, x);
+    }
+    else
+    {
+        if ( !kspace_Managed_.dimensions_equal(&im) )
+        {
+            kspace_Managed_.create(im.get_dimensions());
+        }
+
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(im, x, kspace_Managed_);
+    }
+
+    return true;
+}
+
+template <typename T> 
+inline bool gtPlusSPIRIT2DOperator<T>::forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y)
+{
+    try
+    {
+        Gadgetron::multiply(unacquired_points_indicator_, x, y);
+
+        // x to image domain
+        this->convertToImage(y, complexIm_);
+
+        size_t ro = x.get_size(0);
+        size_t e1 = x.get_size(1);
+        size_t CHA = x.get_size(2);
+
+        if ( res_after_apply_kernel_sum_over_.get_number_of_elements() < ro*e1*CHA )
+        {
+            res_after_apply_kernel_sum_over_.create(ro, e1, CHA);
+        }
+
+        hoNDArray<T>* kerArray;
+        if ( use_symmetric_spirit_ )
+        {
+            kerArray = this->adjoint_forward_kernel_.get();
+        }
+        else
+        {
+            kerArray = this->forward_kernel_.get();
+        }
+
+        Gadgetron::imageDomainUnwrapping2D(complexIm_, *kerArray, res_after_apply_kernel_sum_over_, y);
+
+        this->convertToKSpace(y, res_after_apply_kernel_sum_over_);
+
+        // apply Dc
+        if ( use_symmetric_spirit_ )
+        {
+            Gadgetron::multiply(unacquired_points_indicator_, res_after_apply_kernel_sum_over_, y);
+        }
+        else
+        {
+            memcpy(y.begin(), res_after_apply_kernel_sum_over_.begin(), sizeof(T)*ro*e1*CHA);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRIT2DOperator<T>::forwardOperator(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+inline bool gtPlusSPIRIT2DOperator<T>::adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y)
+{
+    try
+    {
+        if ( use_symmetric_spirit_ )
+        {
+            // Dc(G-I)'(G-I)Dc' is symmetric
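+            // In the symmetric formulation the operator equals its own adjoint, so the forward
+            // computation is reused directly.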
+            GADGET_CHECK_RETURN_FALSE(this->forwardOperator(x, y));
+        }
+        else
+        {
+            // Dc(G-I)'x
+
+            // x to image domain
+            this->convertToImage(x, complexIm_);
+
+            // apply kernel and sum
+            size_t ro = x.get_size(0);
+            size_t e1 = x.get_size(1);
+            size_t CHA = x.get_size(2);
+
+            if ( res_after_apply_kernel_sum_over_.get_number_of_elements() < ro*e1*CHA )
+            {
+                res_after_apply_kernel_sum_over_.create(ro, e1, CHA);
+            }
+
+            Gadgetron::imageDomainUnwrapping2D(complexIm_, *adjoint_kernel_, res_after_apply_kernel_sum_over_, y);
+
+            //long long dCha;
+
+            ////#pragma omp parallel default(shared)
+            //{
+            //    //#ifdef WIN32
+            //    //    int tid = omp_get_thread_num();
+            //    //    DWORD_PTR mask = (1 << tid);
+            //    //    // GDEBUG_STREAM("thread id : " << tid << " - mask : " << mask);
+            //    //    SetThreadAffinityMask( GetCurrentThread(), mask );
+            //    //#endif // WIN32
+
+            //    //#pragma omp for
+
+            //    if ( typeid(T)==typeid( std::complex<float> ) )
+            //    {
+            //        for ( dCha=0; dCha<CHA; dCha++ )
+            //        {
+            //            vcMul(ro*e1*CHA, reinterpret_cast<MKL_Complex8*>(pIm), 
+            //                reinterpret_cast<MKL_Complex8*>(ker+dCha*ro*e1*CHA), 
+            //                reinterpret_cast<MKL_Complex8*>(ptt));
+
+            //            memcpy(pY+dCha*ro*e1, ptt, sizeof(T)*ro*e1);
+            //            for ( size_t sCha=1; sCha<CHA; sCha++ )
+            //            {
+            //                vcAdd(ro*e1, reinterpret_cast<MKL_Complex8*>(pY+dCha*ro*e1), 
+            //                    reinterpret_cast<MKL_Complex8*>(ptt+sCha*ro*e1), 
+            //                    reinterpret_cast<MKL_Complex8*>(pY+dCha*ro*e1));
+            //            }
+            //        }
+            //    }
+
+            //}
+
+            // go back to kspace 
+            this->convertToKSpace(y, res_after_apply_kernel_sum_over_);
+
+            // apply Dc
+            Gadgetron::multiply(unacquired_points_indicator_, res_after_apply_kernel_sum_over_, y);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRIT2DOperator<T>::adjointOperator(...) ... ");
+        return false;
+    }
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusSPIRIT2DTOperator.h b/toolboxes/gtplus/algorithm/gtPlusSPIRIT2DTOperator.h
new file mode 100644
index 0000000..f071508
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusSPIRIT2DTOperator.h
@@ -0,0 +1,353 @@
+/** \file       gtPlusSPIRIT2DTOperator.h
+    \brief      Base class for gtPlus 2DT operators
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusSPIRIT2DOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusSPIRIT2DTOperator : public gtPlusSPIRIT2DOperator<T>
+{
+public:
+
+    typedef gtPlusSPIRIT2DOperator<T> BaseClass;
+
+    gtPlusSPIRIT2DTOperator() : BaseClass() {}
+    virtual ~gtPlusSPIRIT2DTOperator() {}
+
+    virtual void printInfo(std::ostream& os);
+
+    // set forward kernel, compute the adjoint and adjoint_forward kernel
+    bool setForwardKernel(boost::shared_ptr< hoNDArray<T> >& forward_kernel, bool computeAdjForwardKernel=true);
+    bool setAdjointForwardKernel(boost::shared_ptr< hoNDArray<T> >& adjoint_forward_kernel);
+    // set the acquired kspace, unacquired points are set to be zero
+    bool setAcquiredPoints(boost::shared_ptr< hoNDArray<T> >& kspace);
+
+    // compute gradient of ||(G-I)(Dc'x+D'y)||2
+    virtual bool grad(const hoNDArray<T>& x, hoNDArray<T>& g);
+
+    // compute cost value of L2 norm ||(G-I)(Dc'x+D'y)||2
+    virtual bool obj(const hoNDArray<T>& x, T& obj);
+
+    //using BaseClass::gt_timer1_;
+    //using BaseClass::gt_timer2_;
+    //using BaseClass::gt_timer3_;
+    //using BaseClass::performTiming_;
+    //using BaseClass::gt_exporter_;
+    //using BaseClass::debugFolder_;
+    //using BaseClass::gtPlus_util_;
+    //using BaseClass::gtPlus_util_complex_;
+    //using BaseClass::gtPlus_mem_manager_;
+    //using BaseClass::use_symmetric_spirit_;
+
+protected:
+
+    // G-I, [RO E1 srcCHA dstCHA N]
+    //using BaseClass::forward_kernel_;
+    //using BaseClass::adjoint_kernel_;
+    //using BaseClass::adjoint_forward_kernel_;
+    //using BaseClass::acquired_points_;
+    //using BaseClass::acquired_points_indicator_;
+    //using BaseClass::unacquired_points_indicator_;
+
+    // helper memory
+    //using BaseClass::kspace_;
+    //using BaseClass::complexIm_;
+    //using BaseClass::res_after_apply_kernel_;
+    //using BaseClass::res_after_apply_kernel_sum_over_;
+
+    //using BaseClass::kspace_Managed_;
+    //using BaseClass::complexIm_Managed_;
+    //using BaseClass::res_after_apply_kernel_Managed_;
+    //using BaseClass::res_after_apply_kernel_sum_over_Managed_;
+};
+
+template <typename T> 
+void gtPlusSPIRIT2DTOperator<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD SPIRIT 2DT operator ------------------" << endl;
+    os << "Implementation of SPIRIT 2DT operator for ISMRMRD package" << endl;
+    os << "----------------------------------------------------------------------" << endl;
+}
+
+template <typename T> 
+bool gtPlusSPIRIT2DTOperator<T>::
+setForwardKernel(boost::shared_ptr< hoNDArray<T> >& forward_kernel, bool computeAdjForwardKernel)
+{
+    try
+    {
+        this->forward_kernel_ = forward_kernel;
+
+        size_t RO = this->forward_kernel_->get_size(0);
+        size_t E1 = this->forward_kernel_->get_size(1);
+        size_t srcCHA = this->forward_kernel_->get_size(2);
+        size_t dstCHA = this->forward_kernel_->get_size(3);
+        size_t N = this->forward_kernel_->get_size(4);
+
+        this->adjoint_kernel_ = boost::shared_ptr< hoNDArray<T> >(new hoNDArray<T>(RO, E1, dstCHA, srcCHA, N));
+
+        bool computeAdjointForwardKernel = (computeAdjForwardKernel || this->use_symmetric_spirit_);
+
+        if ( computeAdjointForwardKernel )
+        {
+            this->adjoint_forward_kernel_ = boost::shared_ptr< hoNDArray<T> >(new hoNDArray<T>(RO, E1, dstCHA, dstCHA, N));
+        }
+
+        size_t n;
+        for ( n=0; n<N; n++ )
+        {
+            hoNDArray<T> kerCurr(RO, E1, srcCHA, dstCHA, this->forward_kernel_->begin()+n*RO*E1*srcCHA*dstCHA);
+            hoNDArray<T> adjKerCurr(RO, E1, dstCHA, srcCHA, this->adjoint_kernel_->begin()+n*RO*E1*dstCHA*srcCHA);
+
+            GADGET_CHECK_RETURN_FALSE(this->imageDomainAdjointKernel(kerCurr, adjKerCurr));
+
+            if ( computeAdjointForwardKernel )
+            {
+                hoNDArray<T> adjForwardKerCurr(RO, E1, dstCHA, dstCHA, this->adjoint_forward_kernel_->begin()+n*RO*E1*dstCHA*dstCHA);
+                GADGET_CHECK_RETURN_FALSE(this->AdjointForwardKernel(adjKerCurr, kerCurr, adjForwardKerCurr));
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRIT2DTOperator<T>::setForwardKernel(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRIT2DTOperator<T>::
+setAdjointForwardKernel(boost::shared_ptr< hoNDArray<T> >& adjoint_forward_kernel)
+{
+    try
+    {
+        this->adjoint_forward_kernel_ = adjoint_forward_kernel;
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRIT2DTOperator<T>::setAdjointForwardKernel(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRIT2DTOperator<T>::
+setAcquiredPoints(boost::shared_ptr< hoNDArray<T> >& kspace)
+{
+    try
+    {
+        this->acquired_points_ = kspace;
+
+        size_t RO = this->acquired_points_->get_size(0);
+        size_t E1 = this->acquired_points_->get_size(1);
+        size_t srcCHA = this->acquired_points_->get_size(2);
+        size_t E2 = this->acquired_points_->get_size(3);
+
+        this->acquired_points_indicator_.create(kspace->get_dimensions());
+        Gadgetron::clear(this->acquired_points_indicator_);
+
+        this->unacquired_points_indicator_.create(kspace->get_dimensions());
+        Gadgetron::clear(this->unacquired_points_indicator_);
+
+        size_t N = kspace->get_number_of_elements();
+
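+        // Build the sampling masks: a sample whose real part is below DBL_EPSILON in magnitude is
+        // treated as unacquired. acquired_points_indicator_ plays the role of D and
+        // unacquired_points_indicator_ the role of Dc in the (G-I)(D'y+Dc'x) expressions used below.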
+        long long ii;
+
+        #pragma omp parallel for default(none) private(ii) shared(N, kspace)
+        for ( ii=0; ii<(long long)N; ii++ )
+        {
+            // if ( std::abs( (*kspace)(ii) ) < DBL_EPSILON )
+            if ( std::abs((*kspace)(ii).real()) < DBL_EPSILON )
+            {
+                this->unacquired_points_indicator_(ii) = 1.0;
+            }
+            else
+            {
+                this->acquired_points_indicator_(ii) = 1.0;
+            }
+        }
+
+        // allocate the helper memory
+        this->kspace_.create(RO, E1, srcCHA, E2);
+        this->complexIm_.create(RO, E1, srcCHA, E2);
+
+        if ( this->forward_kernel_ )
+        {
+            size_t dstCHA = this->forward_kernel_->get_size(3);
+            this->res_after_apply_kernel_.create(RO, E1, srcCHA, dstCHA);
+            this->res_after_apply_kernel_sum_over_.create(RO, E1, dstCHA, E2);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRIT2DTOperator<T>::setAcquiredPoints(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRIT2DTOperator<T>::grad(const hoNDArray<T>& x, hoNDArray<T>& g)
+{
+    try
+    {
+        // gradient of L2 norm is
+        // 2*Dc*(G-I)'(G-I)(D'y+Dc'x)
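+        // With f(x) = || (G-I)(D'y + Dc'x) ||_2^2, the gradient with respect to the unacquired
+        // samples x is 2*Dc*(G-I)'(G-I)(D'y + Dc'x): form the combined k-space D'y + Dc'x, apply
+        // the precomputed (G-I)'(G-I) kernel in the image domain, go back to k-space, keep only
+        // the unacquired locations (Dc) and scale by 2.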
+
+        // D'y+Dc'x
+        //gt_timer1_.start("1");
+        Gadgetron::multiply(this->unacquired_points_indicator_, x, this->kspace_);
+        //gt_timer1_.stop();
+
+        //gt_timer1_.start("2");
+        Gadgetron::add(*this->acquired_points_, this->kspace_, this->kspace_);
+        //gt_timer1_.stop();
+
+        // x to image domain
+        //gt_timer1_.start("3");
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(this->kspace_, this->complexIm_));
+        //gt_timer1_.stop();
+
+        //gt_timer1_.start("4");
+
+        // apply kernel and sum
+        size_t RO = x.get_size(0);
+        size_t E1 = x.get_size(1);
+        size_t CHA = x.get_size(2);
+        size_t N = x.get_size(3);
+
+        size_t dstCHA = this->adjoint_forward_kernel_->get_size(3);
+        size_t kernelN = this->adjoint_forward_kernel_->get_size(4);
+
+        if ( this->res_after_apply_kernel_sum_over_.get_number_of_elements() < RO*E1*dstCHA*N )
+        {
+            this->res_after_apply_kernel_sum_over_.create(RO, E1, dstCHA, N);
+        }
+
+        //Gadgetron::imageDomainUnwrapping2DT(this->complexIm_, *(this->adjoint_forward_kernel_), this->res_after_apply_kernel_sum_over_, g);
+
+        size_t n;
+        for ( n=0; n<N; n++)
+        {
+            hoNDArray<T> currComplexIm(RO, E1, CHA, this->complexIm_.begin()+n*RO*E1*CHA);
+
+            hoNDArray<T> curr_adjoint_forward_kernel;
+
+            if ( n < kernelN )
+            {
+                curr_adjoint_forward_kernel.create(RO, E1, CHA, dstCHA, this->adjoint_forward_kernel_->begin()+n*RO*E1*CHA*dstCHA);
+            }
+            else
+            {
+                curr_adjoint_forward_kernel.create(RO, E1, CHA, dstCHA, this->adjoint_forward_kernel_->begin()+(kernelN-1)*RO*E1*CHA*dstCHA);
+            }
+
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(curr_adjoint_forward_kernel, currComplexIm, this->res_after_apply_kernel_));
+
+            hoNDArray<T> sumResCurr(RO, E1, 1, dstCHA, this->res_after_apply_kernel_sum_over_.begin() + n*RO*E1*dstCHA);
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::sum_over_dimension(this->res_after_apply_kernel_, sumResCurr, 2));
+        }
+
+        //gt_timer1_.stop();
+
+        // go back to kspace 
+        //gt_timer1_.start("5");
+        GADGET_CHECK_RETURN_FALSE(this->convertToKSpace(this->res_after_apply_kernel_sum_over_, g));
+        //gt_timer1_.stop();
+
+        // apply Dc
+        //gt_timer1_.start("6");
+        Gadgetron::multiply(this->unacquired_points_indicator_, g, g);
+        //gt_timer1_.stop();
+
+        // multiply by 2
+        //gt_timer1_.start("7");
+        Gadgetron::scal(T(2.0), g);
+        //gt_timer1_.stop();
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRIT2DTOperator<T>::grad(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRIT2DTOperator<T>::obj(const hoNDArray<T>& x, T& obj)
+{
+    try
+    {
+        // L2 norm
+        // ||(G-I)(D'y+Dc'x)||2
+
+        // D'y+Dc'x
+        Gadgetron::multiply(this->unacquired_points_indicator_, x, this->kspace_);
+        Gadgetron::add(*this->acquired_points_, this->kspace_, this->kspace_);
+
+        // x to image domain
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(this->kspace_, this->complexIm_));
+
+        // apply kernel and sum
+        size_t RO = x.get_size(0);
+        size_t E1 = x.get_size(1);
+        size_t CHA = x.get_size(2);
+        size_t N = x.get_size(3);
+
+        size_t dstCHA = this->forward_kernel_->get_size(3);
+        size_t kernelN = this->forward_kernel_->get_size(4);
+
+        if ( this->res_after_apply_kernel_sum_over_.get_number_of_elements() < RO*E1*dstCHA*N )
+        {
+            this->res_after_apply_kernel_sum_over_.create(RO, E1, dstCHA, N);
+        }
+
+        //Gadgetron::imageDomainUnwrapping2DT(this->complexIm_, *(this->forward_kernel_), this->res_after_apply_kernel_sum_over_, this->kspace_);
+
+        size_t n;
+        for ( n=0; n<N; n++)
+        {
+            hoNDArray<T> currComplexIm(RO, E1, CHA, this->complexIm_.begin()+n*RO*E1*CHA);
+
+            hoNDArray<T> curr_forward_kernel;
+
+            if ( n < kernelN )
+            {
+                curr_forward_kernel.create(RO, E1, CHA, dstCHA, this->forward_kernel_->begin()+n*RO*E1*CHA*dstCHA);
+            }
+            else
+            {
+                curr_forward_kernel.create(RO, E1, CHA, dstCHA, this->forward_kernel_->begin()+(kernelN-1)*RO*E1*CHA*dstCHA);
+            }
+
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(curr_forward_kernel, currComplexIm, this->res_after_apply_kernel_));
+
+            hoNDArray<T> sumResCurr(RO, E1, 1, dstCHA, this->res_after_apply_kernel_sum_over_.begin() + n*RO*E1*dstCHA);
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::sum_over_dimension(this->res_after_apply_kernel_, sumResCurr, 2));
+        }
+
+        // L2 norm
+        Gadgetron::dotc(this->res_after_apply_kernel_sum_over_, this->res_after_apply_kernel_sum_over_, obj);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRIT2DTOperator<T>::obj(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusSPIRIT3DOperator.h b/toolboxes/gtplus/algorithm/gtPlusSPIRIT3DOperator.h
new file mode 100644
index 0000000..f52cdac
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusSPIRIT3DOperator.h
@@ -0,0 +1,98 @@
+/** \file       gtPlusSPIRIT3DOperator.h
+    \brief      Base class for gtPlus 3D operators
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusSPIRITOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusSPIRIT3DOperator : public gtPlusSPIRITOperator<T>
+{
+public:
+
+    typedef gtPlusSPIRITOperator<T> BaseClass;
+
+    gtPlusSPIRIT3DOperator() : BaseClass() {}
+    virtual ~gtPlusSPIRIT3DOperator() {}
+
+    virtual void printInfo(std::ostream& os);
+
+    // convert to image domain or back to kspace
+    virtual bool convertToImage(const hoNDArray<T>& x, hoNDArray<T>& im);
+    virtual bool convertToKSpace(const hoNDArray<T>& im, hoNDArray<T>& x);
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+    using BaseClass::gtPlus_mem_manager_;
+    using BaseClass::use_symmetric_spirit_;
+    using BaseClass::use_non_centered_fft_;
+
+protected:
+
+    // [... srcCHA dstCHA]
+    using BaseClass::forward_kernel_;
+    using BaseClass::adjoint_kernel_;
+    using BaseClass::adjoint_forward_kernel_;
+    using BaseClass::acquired_points_;
+    using BaseClass::acquired_points_indicator_;
+    using BaseClass::unacquired_points_indicator_;
+
+    // helper memory
+    using BaseClass::kspace_;
+    using BaseClass::complexIm_;
+    using BaseClass::res_after_apply_kernel_;
+    using BaseClass::res_after_apply_kernel_sum_over_;
+
+    using BaseClass::kspace_Managed_;
+    using BaseClass::complexIm_Managed_;
+    using BaseClass::res_after_apply_kernel_Managed_;
+    using BaseClass::res_after_apply_kernel_sum_over_Managed_;
+};
+
+template <typename T> 
+void gtPlusSPIRIT3DOperator<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD SPIRIT 3D operator ------------------" << endl;
+    os << "Implementation of SPIRIT 3D operator for ISMRMRD package" << endl;
+    os << "----------------------------------------------------------------------" << endl;
+}
+
+template <typename T> 
+inline bool gtPlusSPIRIT3DOperator<T>::convertToImage(const hoNDArray<T>& x, hoNDArray<T>& im)
+{
+    if ( !complexIm_Managed_.dimensions_equal(&x) )
+    {
+        complexIm_Managed_.create(x.get_dimensions());
+    }
+
+    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(x, im, complexIm_Managed_));
+
+    return true;
+}
+
+template <typename T> 
+inline bool gtPlusSPIRIT3DOperator<T>::convertToKSpace(const hoNDArray<T>& im, hoNDArray<T>& x)
+{
+    if ( !kspace_Managed_.dimensions_equal(&im) )
+    {
+        kspace_Managed_.create(im.get_dimensions());
+    }
+
+    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(im, x, kspace_Managed_));
+
+    return true;
+}
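+
+// A rough usage sketch for the two conversions above (illustrative only; the variable
+// names are hypothetical and not part of this header):
+//
+//     gtPlusSPIRIT3DOperator< std::complex<float> > op;
+//     hoNDArray< std::complex<float> > kspace(RO, E1, E2, CHA);   // filled elsewhere
+//     hoNDArray< std::complex<float> > im, kspace2;
+//     op.convertToImage(kspace, im);      // centered 3D IFFT (ifft3c)
+//     op.convertToKSpace(im, kspace2);    // centered 3D FFT  (fft3c)
+//
+// Since fft3c/ifft3c form an inverse pair, kspace2 is expected to match kspace up to
+// numerical precision.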
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpace2DOperator.h b/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpace2DOperator.h
new file mode 100644
index 0000000..9ca6d4c
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpace2DOperator.h
@@ -0,0 +1,68 @@
+/** \file       gtPlusSPIRITNoNullSpace2DOperator.h
+    \brief      Implement SPIRIT 2D operator without Null space
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusSPIRITNoNullSpaceOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusSPIRITNoNullSpace2DOperator : public gtPlusSPIRITNoNullSpaceOperator<T>
+{
+public:
+
+    typedef gtPlusSPIRITNoNullSpaceOperator<T> BaseClass;
+
+    gtPlusSPIRITNoNullSpace2DOperator() : BaseClass() {}
+    virtual ~gtPlusSPIRITNoNullSpace2DOperator() {}
+
+    virtual void printInfo(std::ostream& os);
+
+    // convert to image domain or back to kspace
+    virtual bool convertToImage(const hoNDArray<T>& x, hoNDArray<T>& im);
+    virtual bool convertToKSpace(const hoNDArray<T>& im, hoNDArray<T>& x);
+
+protected:
+
+};
+
+template <typename T> 
+void gtPlusSPIRITNoNullSpace2DOperator<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD SPIRIT 2D operator without null space constraint ------------------" << endl;
+    os << "Implementation of SPIRIT 2D operator for ISMRMRD package" << endl;
+    os << "-------------------------------------------------------------------------------------------------" << endl;
+}
+
+template <typename T> 
+inline bool gtPlusSPIRITNoNullSpace2DOperator<T>::convertToImage(const hoNDArray<T>& x, hoNDArray<T>& im)
+{
+    if ( !this->complexIm_Managed_.dimensions_equal(&x) )
+    {
+        this->complexIm_Managed_.create(x.get_dimensions());
+    }
+
+    Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(x, im, this->complexIm_Managed_);
+
+    return true;
+}
+
+template <typename T> 
+inline bool gtPlusSPIRITNoNullSpace2DOperator<T>::convertToKSpace(const hoNDArray<T>& im, hoNDArray<T>& x)
+{
+    if ( !this->kspace_Managed_.dimensions_equal(&im) )
+    {
+        this->kspace_Managed_.create(im.get_dimensions());
+    }
+
+    Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(im, x, this->kspace_Managed_);
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpace2DTOperator.h b/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpace2DTOperator.h
new file mode 100644
index 0000000..797ed10
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpace2DTOperator.h
@@ -0,0 +1,285 @@
+/** \file       gtPlusSPIRITNoNullSpace2DTOperator.h
+    \brief      Implement SPIRIT 2DT operator without Null space
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusSPIRITNoNullSpace2DOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusSPIRITNoNullSpace2DTOperator : public gtPlusSPIRITNoNullSpace2DOperator<T>
+{
+public:
+
+    typedef gtPlusSPIRITNoNullSpace2DOperator<T> BaseClass;
+
+    gtPlusSPIRITNoNullSpace2DTOperator() : BaseClass() {}
+    virtual ~gtPlusSPIRITNoNullSpace2DTOperator() {}
+
+    virtual void printInfo(std::ostream& os);
+
+    // set forward kernel, compute the adjoint and adjoint_forward kernel
+    bool setForwardKernel(boost::shared_ptr< hoNDArray<T> >& forward_kernel, bool computeAdjForwardKernel=true);
+    bool setAdjointForwardKernel(boost::shared_ptr< hoNDArray<T> >& adjoint_forward_kernel);
+    // set the acquired kspace, unacquired points are set to be zero
+    bool setAcquiredPoints(boost::shared_ptr< hoNDArray<T> >& kspace);
+
+    // compute gradient of ||(G-I)x||2
+    virtual bool grad(const hoNDArray<T>& x, hoNDArray<T>& g);
+
+    // compute cost value of L2 norm ||(G-I)x||2
+    virtual bool obj(const hoNDArray<T>& x, T& obj);
+
+protected:
+
+};
+
+template <typename T> 
+void gtPlusSPIRITNoNullSpace2DTOperator<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD SPIRIT 2DT operator without null space constraint ------------------" << endl;
+    os << "Implementation of SPIRIT 2DT operator for ISMRMRD package" << endl;
+    os << "--------------------------------------------------------------------------------------------------" << endl;
+}
+
+template <typename T> 
+bool gtPlusSPIRITNoNullSpace2DTOperator<T>::
+setForwardKernel(boost::shared_ptr< hoNDArray<T> >& forward_kernel, bool computeAdjForwardKernel)
+{
+    try
+    {
+        this->forward_kernel_ = forward_kernel;
+
+        size_t RO = this->forward_kernel_->get_size(0);
+        size_t E1 = this->forward_kernel_->get_size(1);
+        size_t srcCHA = this->forward_kernel_->get_size(2);
+        size_t dstCHA = this->forward_kernel_->get_size(3);
+        size_t N = this->forward_kernel_->get_size(4);
+
+        this->adjoint_kernel_ = boost::shared_ptr< hoNDArray<T> >(new hoNDArray<T>(RO, E1, dstCHA, srcCHA, N));
+
+        bool computeAdjointForwardKernel = (computeAdjForwardKernel || this->use_symmetric_spirit_);
+
+        if ( computeAdjointForwardKernel )
+        {
+            this->adjoint_forward_kernel_ = boost::shared_ptr< hoNDArray<T> >(new hoNDArray<T>(RO, E1, dstCHA, dstCHA, N));
+        }
+
+        size_t n;
+        for ( n=0; n<N; n++ )
+        {
+            hoNDArray<T> kerCurr(RO, E1, srcCHA, dstCHA, this->forward_kernel_->begin()+n*RO*E1*srcCHA*dstCHA);
+            hoNDArray<T> adjKerCurr(RO, E1, dstCHA, srcCHA, this->adjoint_kernel_->begin()+n*RO*E1*dstCHA*srcCHA);
+
+            GADGET_CHECK_RETURN_FALSE(this->imageDomainAdjointKernel(kerCurr, adjKerCurr));
+
+            if ( computeAdjointForwardKernel )
+            {
+                hoNDArray<T> adjForwardKerCurr(RO, E1, dstCHA, dstCHA, this->adjoint_forward_kernel_->begin()+n*RO*E1*dstCHA*dstCHA);
+                GADGET_CHECK_RETURN_FALSE(this->AdjointForwardKernel(adjKerCurr, kerCurr, adjForwardKerCurr));
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRITNoNullSpace2DTOperator<T>::setForwardKernel(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRITNoNullSpace2DTOperator<T>::
+setAdjointForwardKernel(boost::shared_ptr< hoNDArray<T> >& adjoint_forward_kernel)
+{
+    try
+    {
+        this->adjoint_forward_kernel_ = adjoint_forward_kernel;
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRITNoNullSpace2DTOperator<T>::setAdjointForwardKernel(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRITNoNullSpace2DTOperator<T>::
+setAcquiredPoints(boost::shared_ptr< hoNDArray<T> >& kspace)
+{
+    try
+    {
+        this->acquired_points_ = kspace;
+
+        size_t RO = this->acquired_points_->get_size(0);
+        size_t E1 = this->acquired_points_->get_size(1);
+        size_t srcCHA = this->acquired_points_->get_size(2);
+        size_t E2 = this->acquired_points_->get_size(3);
+
+        this->acquired_points_indicator_.create(kspace->get_dimensions());
+        Gadgetron::clear(this->acquired_points_indicator_);
+
+        this->unacquired_points_indicator_.create(kspace->get_dimensions());
+        Gadgetron::clear(this->unacquired_points_indicator_);
+
+        size_t N = kspace->get_number_of_elements();
+
+        long long ii;
+
+        #pragma omp parallel for default(none) private(ii) shared(N, kspace)
+        for ( ii=0; ii<(long long)N; ii++ )
+        {
+            if ( std::abs( (*kspace)(ii) ) < DBL_EPSILON )
+            {
+                this->unacquired_points_indicator_(ii) = 1.0;
+            }
+            else
+            {
+                this->acquired_points_indicator_(ii) = 1.0;
+            }
+        }
+
+        // allocate the helper memory
+        this->kspace_.create(RO, E1, srcCHA, E2);
+        this->complexIm_.create(RO, E1, srcCHA, E2);
+
+        if ( this->forward_kernel_ )
+        {
+            size_t dstCHA = this->forward_kernel_->get_size(3);
+            this->res_after_apply_kernel_.create(RO, E1, srcCHA, dstCHA);
+            this->res_after_apply_kernel_sum_over_.create(RO, E1, dstCHA, E2);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRITNoNullSpace2DTOperator<T>::setAcquiredPoints(...) ... ");
+        return false;
+    }
+
+    return true;
+}
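+
+// Sketch of the sampling-mask convention established above (a reading aid): a k-space
+// sample is treated as unacquired when its magnitude is below DBL_EPSILON, so the two
+// indicator arrays are complementary,
+//
+//     acquired_points_indicator_(ii) + unacquired_points_indicator_(ii) == 1  for all ii,
+//
+// and elementwise multiplication by them keeps, respectively, the acquired or the
+// missing samples (the D' / Dc' restrictions referred to in the SPIRIT operators).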
+
+template <typename T> 
+bool gtPlusSPIRITNoNullSpace2DTOperator<T>::grad(const hoNDArray<T>& x, hoNDArray<T>& g)
+{
+    try
+    {
+        // gradient of L2 norm is
+        // 2*(G-I)'(G-I)x
+
+        // x to image domain
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(x, this->complexIm_));
+
+        // apply kernel and sum
+        size_t RO = x.get_size(0);
+        size_t E1 = x.get_size(1);
+        size_t CHA = x.get_size(2);
+        size_t N = x.get_size(3);
+
+        size_t dstCHA = this->adjoint_forward_kernel_->get_size(3);
+        size_t kernelN = this->adjoint_forward_kernel_->get_size(4);
+
+        this->res_after_apply_kernel_sum_over_.create(RO, E1, dstCHA, N);
+
+        size_t n;
+        for ( n=0; n<N; n++)
+        {
+            hoNDArray<T> currComplexIm(RO, E1, CHA, this->complexIm_.begin()+n*RO*E1*CHA);
+
+            hoNDArray<T> curr_adjoint_forward_kernel;
+
+            if ( n < kernelN )
+            {
+                curr_adjoint_forward_kernel.create(RO, E1, CHA, dstCHA, this->adjoint_forward_kernel_->begin()+n*RO*E1*CHA*dstCHA);
+            }
+            else
+            {
+                curr_adjoint_forward_kernel.create(RO, E1, CHA, dstCHA, this->adjoint_forward_kernel_->begin()+(kernelN-1)*RO*E1*CHA*dstCHA);
+            }
+
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(curr_adjoint_forward_kernel, currComplexIm, this->res_after_apply_kernel_));
+
+            hoNDArray<T> sumResCurr(RO, E1, 1, dstCHA, this->res_after_apply_kernel_sum_over_.begin() + n*RO*E1*dstCHA);
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::sum_over_dimension(this->res_after_apply_kernel_, sumResCurr, 2));
+        }
+
+        // go back to kspace 
+        GADGET_CHECK_RETURN_FALSE(this->convertToKSpace(this->res_after_apply_kernel_sum_over_, g));
+
+        // multiply by 2
+        Gadgetron::scal(T(2.0), g);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRITNoNullSpace2DTOperator<T>::grad(...) ... ");
+        return false;
+    }
+
+    return true;
+}
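+
+// Derivation behind the final factor of 2 (restating the comment at the top of grad):
+// with A = (G-I), the cost is f(x) = ||A x||_2^2, whose gradient is 2 A'A x. The loop
+// applies the image-domain A'A kernel per frame (frames with n >= kernelN reuse the
+// last calibrated kernel), sums over the source channels, transforms back to k-space
+// and scales by 2.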
+
+template <typename T> 
+bool gtPlusSPIRITNoNullSpace2DTOperator<T>::obj(const hoNDArray<T>& x, T& obj)
+{
+    try
+    {
+        // L2 norm
+        // ||(G-I)x||2
+
+        // x to image domain
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(x, this->complexIm_));
+
+        // apply kernel and sum
+        size_t RO = x.get_size(0);
+        size_t E1 = x.get_size(1);
+        size_t CHA = x.get_size(2);
+        size_t N = x.get_size(3);
+
+        size_t dstCHA = this->forward_kernel_->get_size(3);
+        size_t kernelN = this->forward_kernel_->get_size(4);
+
+        this->res_after_apply_kernel_sum_over_.create(RO, E1, dstCHA, N);
+
+        size_t n;
+        for ( n=0; n<N; n++)
+        {
+            hoNDArray<T> currComplexIm(RO, E1, CHA, this->complexIm_.begin()+n*RO*E1*CHA);
+
+            hoNDArray<T> curr_forward_kernel;
+
+            if ( n < kernelN )
+            {
+                curr_forward_kernel.create(RO, E1, CHA, dstCHA, this->forward_kernel_->begin()+n*RO*E1*CHA*dstCHA);
+            }
+            else
+            {
+                curr_forward_kernel.create(RO, E1, CHA, dstCHA, this->forward_kernel_->begin()+(kernelN-1)*RO*E1*CHA*dstCHA);
+            }
+
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(curr_forward_kernel, currComplexIm, this->res_after_apply_kernel_));
+
+            hoNDArray<T> sumResCurr(RO, E1, 1, dstCHA, this->res_after_apply_kernel_sum_over_.begin() + n*RO*E1*dstCHA);
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::sum_over_dimension(this->res_after_apply_kernel_, sumResCurr, 2));
+        }
+
+        // L2 norm
+        Gadgetron::dotc(this->res_after_apply_kernel_sum_over_, this->res_after_apply_kernel_sum_over_, obj);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRITNoNullSpace2DTOperator<T>::grad(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpace3DOperator.h b/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpace3DOperator.h
new file mode 100644
index 0000000..062bc35
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpace3DOperator.h
@@ -0,0 +1,64 @@
+/** \file       gtPlusSPIRITNoNullSpace3DOperator.h
+    \brief      Implement SPIRIT 3D operator without Null space
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusSPIRITNoNullSpaceOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusSPIRITNoNullSpace3DOperator : public gtPlusSPIRITNoNullSpaceOperator<T>
+{
+public:
+
+    typedef gtPlusSPIRITNoNullSpaceOperator<T> BaseClass;
+
+    gtPlusSPIRITNoNullSpace3DOperator() : BaseClass() {}
+    virtual ~gtPlusSPIRITNoNullSpace3DOperator() {}
+
+    virtual void printInfo(std::ostream& os);
+
+    // convert to image domain or back to kspace
+    virtual bool convertToImage(const hoNDArray<T>& x, hoNDArray<T>& im);
+    virtual bool convertToKSpace(const hoNDArray<T>& im, hoNDArray<T>& x);
+
+protected:
+
+};
+
+template <typename T> 
+void gtPlusSPIRITNoNullSpace3DOperator<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD SPIRIT 3D operator without null space constraint ------------------" << endl;
+    os << "Implementation of SPIRIT 3D operator for ISMRMRD package" << endl;
+    os << "-------------------------------------------------------------------------------------------------" << endl;
+}
+
+template <typename T> 
+inline bool gtPlusSPIRITNoNullSpace3DOperator<T>::convertToImage(const hoNDArray<T>& x, hoNDArray<T>& im)
+{
+    if ( !this->complexIm_Managed_.dimensions_equal(&x) )
+    {
+        this->complexIm_Managed_.create(x.get_dimensions());
+    }
+
+    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(x, im, this->complexIm_Managed_));
+
+    return true;
+}
+
+template <typename T> 
+inline bool gtPlusSPIRITNoNullSpace3DOperator<T>::convertToKSpace(const hoNDArray<T>& im, hoNDArray<T>& x)
+{
+    if ( !this->kspace_Managed_.dimensions_equal(&im) )
+    {
+        this->kspace_Managed_.create(im.get_dimensions());
+    }
+
+    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(im, x, this->kspace_Managed_));
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpaceOperator.h b/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpaceOperator.h
new file mode 100644
index 0000000..58a0431
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpaceOperator.h
@@ -0,0 +1,130 @@
+/** \file       gtPlusSPIRITNoNullSpaceOperator.h
+    \brief      Base class for SPIRIT operators without Null space
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusSPIRITOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusSPIRITNoNullSpaceOperator : public gtPlusSPIRITOperator<T>
+{
+public:
+
+    typedef gtPlusSPIRITOperator<T> BaseClass;
+
+    gtPlusSPIRITNoNullSpaceOperator() : BaseClass() {}
+    virtual ~gtPlusSPIRITNoNullSpaceOperator() {}
+
+    virtual void printInfo(std::ostream& os);
+
+    // compute gradient of ||(G-I)x||2
+    virtual bool grad(const hoNDArray<T>& x, hoNDArray<T>& g);
+
+    // compute cost value of L2 norm ||(G-I)x||2
+    virtual bool obj(const hoNDArray<T>& x, T& obj);
+
+    //using BaseClass::gt_timer1_;
+    //using BaseClass::gt_timer2_;
+    //using BaseClass::gt_timer3_;
+    //using BaseClass::performTiming_;
+    //using BaseClass::gt_exporter_;
+    //using BaseClass::debugFolder_;
+    //using BaseClass::gtPlus_util_;
+    //using BaseClass::gtPlus_util_complex_;
+    //using BaseClass::gtPlus_mem_manager_;
+    //using BaseClass::use_symmetric_spirit_;
+
+protected:
+
+    // [... srcCHA dstCHA]
+    //using BaseClass::forward_kernel_;
+    //using BaseClass::adjoint_kernel_;
+    //using BaseClass::adjoint_forward_kernel_;
+    //using BaseClass::acquired_points_;
+    //using BaseClass::acquired_points_indicator_;
+    //using BaseClass::unacquired_points_indicator_;
+
+    // helper memory
+    //using BaseClass::kspace_;
+    //using BaseClass::complexIm_;
+    //using BaseClass::res_after_apply_kernel_;
+    //using BaseClass::res_after_apply_kernel_sum_over_;
+
+    //using BaseClass::kspace_Managed_;
+    //using BaseClass::complexIm_Managed_;
+    //using BaseClass::res_after_apply_kernel_Managed_;
+    //using BaseClass::res_after_apply_kernel_sum_over_Managed_;
+};
+
+template <typename T> 
+void gtPlusSPIRITNoNullSpaceOperator<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD SPIRIT operator without null space constraint ------------------" << endl;
+    os << "Implementation of SPIRIT operator for ISMRMRD package" << endl;
+    os << "----------------------------------------------------------------------------------------------" << endl;
+}
+
+template <typename T> 
+bool gtPlusSPIRITNoNullSpaceOperator<T>::grad(const hoNDArray<T>& x, hoNDArray<T>& g)
+{
+    try
+    {
+        // gradient of L2 norm is
+        // 2*(G-I)'(G-I)x
+
+        // x to image domain
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(x, this->complexIm_));
+
+        // apply kernel and sum
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(*this->adjoint_forward_kernel_, this->complexIm_, this->res_after_apply_kernel_));
+        GADGET_CHECK_RETURN_FALSE(this->performSumOverSrcChannel(this->res_after_apply_kernel_, this->res_after_apply_kernel_sum_over_));
+
+        // go back to kspace 
+        GADGET_CHECK_RETURN_FALSE(this->convertToKSpace(this->res_after_apply_kernel_sum_over_, g));
+
+        // multiply by 2
+        Gadgetron::scal(T(2.0), g);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRITNoNullSpaceOperator<T>::grad(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRITNoNullSpaceOperator<T>::obj(const hoNDArray<T>& x, T& obj)
+{
+    try
+    {
+        // L2 norm
+        // ||(G-I)x||2
+
+        // x to image domain
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(x, this->complexIm_));
+
+        // apply kernel and sum
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(*this->forward_kernel_, this->complexIm_, this->res_after_apply_kernel_));
+        GADGET_CHECK_RETURN_FALSE(this->performSumOverSrcChannel(this->res_after_apply_kernel_, this->res_after_apply_kernel_sum_over_));
+
+        // L2 norm
+        Gadgetron::dotc(this->res_after_apply_kernel_sum_over_, this->res_after_apply_kernel_sum_over_, obj);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRITNoNullSpaceOperator<T>::grad(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusSPIRITOperator.h b/toolboxes/gtplus/algorithm/gtPlusSPIRITOperator.h
new file mode 100644
index 0000000..aafbc2a
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusSPIRITOperator.h
@@ -0,0 +1,396 @@
+/** \file       gtPlusSPIRITOperator.h
+    \brief      Implement SPIRIT operator functionalities
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusSPIRIT.h"
+#include "gtPlusOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusSPIRITOperator : public gtPlusSPIRIT<T>, public gtPlusOperator<T>
+{
+public:
+
+    typedef gtPlusOperator<T> BaseClass;
+
+    gtPlusSPIRITOperator() : use_symmetric_spirit_(false), use_non_centered_fft_(false), BaseClass() {}
+    virtual ~gtPlusSPIRITOperator() {}
+
+    virtual void printInfo(std::ostream& os);
+
+    // set forward kernel, compute the adjoint and adjoint_forward kernel
+    bool setForwardKernel(boost::shared_ptr< hoNDArray<T> >& forward_kernel, bool computeAdjForwardKernel=true);
+    bool setAdjointForwardKernel(boost::shared_ptr< hoNDArray<T> >& adjoint_forward_kernel);
+
+    hoNDArray<T>* getAdjointKernel();
+    hoNDArray<T>* getAdjointForwardKernel();
+
+    // apply Dc(G-I)'(G-I)Dc' to x
+    virtual bool forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y);
+
+    // adjoint operator
+    virtual bool adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y);
+
+    // compute right hand side
+    // b = -Dc(G-I)'(G-I)D'x
+    virtual bool computeRighHandSide(const hoNDArray<T>& x, hoNDArray<T>& b);
+
+    // compute gradient of ||(G-I)(Dc'x+D'y)||2
+    virtual bool grad(const hoNDArray<T>& x, hoNDArray<T>& g);
+
+    // compute cost value of L2 norm ||(G-I)(Dc'x+D'y)||2
+    virtual bool obj(const hoNDArray<T>& x, T& obj);
+
+    // indicate the operator is unitary or not
+    // unitary operator, AA' = I
+    virtual bool unitary() const { return false; }
+
+    // convert to image domain or back to kspace
+    virtual bool convertToImage(const hoNDArray<T>& x, hoNDArray<T>& im) = 0;
+    virtual bool convertToKSpace(const hoNDArray<T>& im, hoNDArray<T>& x) = 0;
+
+    // whether to use symmetric spirit equation
+    // symmetric equation: A = Dc(G-I)'(G-I)Dc'
+    // non-symmetric equation: A = (G-I)Dc'
+    bool use_symmetric_spirit_;
+
+    // if true, use the non-centered fft instead of the centered fftc
+    bool use_non_centered_fft_;
+
+    using gtPlusSPIRIT<T>::calib_use_gpu_;
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+
+protected:
+
+    // G-I, [... srcCHA dstCHA]
+    boost::shared_ptr< hoNDArray<T> > forward_kernel_;
+    // (G-I)', [... dstCHA srcCHA]
+    boost::shared_ptr< hoNDArray<T> > adjoint_kernel_;
+    // (G-I)'(G-I), [... dstCHA dstCHA]
+    boost::shared_ptr< hoNDArray<T> > adjoint_forward_kernel_;
+
+    using BaseClass::acquired_points_;
+    using BaseClass::acquired_points_indicator_;
+    using BaseClass::unacquired_points_indicator_;
+
+    // helper memory
+    using BaseClass::kspace_;
+    using BaseClass::complexIm_;
+    using BaseClass::res_after_apply_kernel_;
+    using BaseClass::res_after_apply_kernel_sum_over_;
+
+    using BaseClass::kspace_Managed_;
+    using BaseClass::complexIm_Managed_;
+    using BaseClass::res_after_apply_kernel_Managed_;
+    using BaseClass::res_after_apply_kernel_sum_over_Managed_;
+};
+
+template <typename T> 
+void gtPlusSPIRITOperator<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD SPIRIT operator ------------------" << endl;
+    os << "Implementation of SPIRIT operator for ISMRMRD package" << endl;
+    os << "----------------------------------------------------------------------" << endl;
+}
+
+template <typename T> 
+inline hoNDArray<T>* gtPlusSPIRITOperator<T>::getAdjointKernel()
+{
+    return adjoint_kernel_.get();
+}
+
+template <typename T> 
+inline hoNDArray<T>* gtPlusSPIRITOperator<T>::getAdjointForwardKernel()
+{
+    return adjoint_forward_kernel_.get();
+}
+
+template <typename T> 
+bool gtPlusSPIRITOperator<T>::
+setForwardKernel(boost::shared_ptr< hoNDArray<T> >& forward_kernel, bool computeAdjForwardKernel)
+{
+    try
+    {
+        forward_kernel_ = forward_kernel;
+
+        adjoint_kernel_ = boost::shared_ptr< hoNDArray<T> >(new hoNDArray<T>());
+        GADGET_CHECK_RETURN_FALSE(this->imageDomainAdjointKernel(*forward_kernel_, *adjoint_kernel_));
+
+        if ( computeAdjForwardKernel || use_symmetric_spirit_ )
+        {
+            adjoint_forward_kernel_ = boost::shared_ptr< hoNDArray<T> >(new hoNDArray<T>());
+            GADGET_CHECK_RETURN_FALSE(this->AdjointForwardKernel(*adjoint_kernel_, *forward_kernel_, *adjoint_forward_kernel_));
+        }
+
+        // allocate the helper memory
+        boost::shared_ptr< std::vector<size_t> > dims = forward_kernel->get_dimensions();
+        size_t NDim = dims->size();
+
+        std::vector<size_t> dimSrc(NDim-1), dimDst(NDim-1);
+        size_t ii;
+        for ( ii=0; ii<NDim-2; ii++ )
+        {
+            dimSrc[ii] = (*dims)[ii];
+            dimDst[ii] = (*dims)[ii];
+        }
+
+        dimSrc[NDim-2] = (*dims)[NDim-2];
+        dimDst[NDim-2] = (*dims)[NDim-1];
+
+        kspace_.create(dimSrc);
+        complexIm_.create(dimSrc);
+        res_after_apply_kernel_.create(dims);
+        res_after_apply_kernel_sum_over_.create(dimDst);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRITOperator<T>::setForwardKernel(...) ... ");
+        return false;
+    }
+
+    return true;
+}
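+
+// Dimension sketch for the helper buffers allocated above (illustrative; concrete sizes
+// follow the kernel passed in). For an image-domain forward kernel of size
+// [RO E1 srcCHA dstCHA]:
+//
+//     kspace_ / complexIm_              : [RO E1 srcCHA]
+//     res_after_apply_kernel_           : [RO E1 srcCHA dstCHA]
+//     res_after_apply_kernel_sum_over_  : [RO E1 dstCHA]
+//
+// i.e. the source-channel arrays drop the last kernel dimension and the summed result
+// keeps only the destination channels.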
+
+template <typename T> 
+bool gtPlusSPIRITOperator<T>::
+setAdjointForwardKernel(boost::shared_ptr< hoNDArray<T> >& adjoint_forward_kernel)
+{
+    try
+    {
+        adjoint_forward_kernel_ = adjoint_forward_kernel;
+
+        // allocate the helper memory
+        boost::shared_ptr< std::vector<size_t> > dims = adjoint_forward_kernel_->get_dimensions();
+        size_t NDim = dims->size();
+
+        std::vector<size_t> dimSrc(NDim-1), dimDst(NDim-1);
+        size_t ii;
+        for ( ii=0; ii<NDim-2; ii++ )
+        {
+            dimSrc[ii] = (*dims)[ii];
+            dimDst[ii] = (*dims)[ii];
+        }
+
+        dimSrc[NDim-2] = (*dims)[NDim-2];
+        dimDst[NDim-2] = (*dims)[NDim-1];
+
+        kspace_.create(dimSrc);
+        complexIm_.create(dimSrc);
+        res_after_apply_kernel_.create(dims);
+        res_after_apply_kernel_sum_over_.create(dimDst);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRITOperator<T>::setAdjointForwardKernel(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRITOperator<T>::forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y)
+{
+    try
+    {
+        // Dc(G-I)'(G-I)Dc'x
+
+        Gadgetron::multiply(unacquired_points_indicator_, x, y);
+
+        // x to image domain
+        this->convertToImage(y, complexIm_);
+
+        // apply kernel and sum
+        if ( use_symmetric_spirit_ )
+        {
+            Gadgetron::multiply(*adjoint_forward_kernel_, complexIm_, res_after_apply_kernel_);
+        }
+        else
+        {
+            Gadgetron::multiply(*forward_kernel_, complexIm_, res_after_apply_kernel_);
+        }
+
+        this->performSumOverSrcChannel(res_after_apply_kernel_, res_after_apply_kernel_sum_over_);
+
+        // go back to kspace 
+        this->convertToKSpace(res_after_apply_kernel_sum_over_, y);
+
+        // apply Dc
+        if ( use_symmetric_spirit_ )
+        {
+            Gadgetron::multiply(unacquired_points_indicator_, y, y);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRITOperator<T>::forwardOperator(...) ... ");
+        return false;
+    }
+
+    return true;
+}
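+
+// Note on the image-domain kernel application above (a reading aid, no new behaviour):
+// multiplying the [... srcCHA dstCHA] kernel with the [... srcCHA] coil images and then
+// summing over the source-channel dimension implements, per pixel, the matrix-vector
+// product of the SPIRIT kernel with the coil image vector. With use_symmetric_spirit_
+// the combined (G-I)'(G-I) kernel is applied and the output is restricted to the
+// unacquired points, giving A = Dc(G-I)'(G-I)Dc'; otherwise A = (G-I)Dc'.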
+
+template <typename T> 
+bool gtPlusSPIRITOperator<T>::adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y)
+{
+    try
+    {
+        if ( use_symmetric_spirit_ )
+        {
+            // Dc(G-I)'(G-I)Dc' is symmetric
+            GADGET_CHECK_RETURN_FALSE(this->forwardOperator(x, y));
+        }
+        else
+        {
+            // Dc(G-I)'x
+
+            // x to image domain
+            this->convertToImage(x, complexIm_);
+
+            // apply kernel and sum
+            Gadgetron::multiply(*adjoint_kernel_, complexIm_, res_after_apply_kernel_);
+            this->performSumOverSrcChannel(res_after_apply_kernel_, res_after_apply_kernel_sum_over_);
+
+            // go back to kspace 
+            this->convertToKSpace(res_after_apply_kernel_sum_over_, y);
+
+            // apply Dc
+            Gadgetron::multiply(unacquired_points_indicator_, y, y);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRITOperator<T>::adjointOperator(...) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRITOperator<T>::computeRighHandSide(const hoNDArray<T>& x, hoNDArray<T>& b)
+{
+    try
+    {
+        // symmetric: -Dc(G-I)'(G-I)D'x
+        // non-symmetric: -(G-I)D'x
+
+        // D'x: nothing to do here, the acquired points are already in place
+
+        // x to image domain
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(x, complexIm_));
+
+        // apply kernel and sum
+        if ( use_symmetric_spirit_ )
+        {
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(*adjoint_forward_kernel_, complexIm_, res_after_apply_kernel_));
+        }
+        else
+        {
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(*forward_kernel_, complexIm_, res_after_apply_kernel_));
+        }
+
+        GADGET_CHECK_RETURN_FALSE(this->performSumOverSrcChannel(res_after_apply_kernel_, res_after_apply_kernel_sum_over_));
+
+        // go back to kspace 
+        GADGET_CHECK_RETURN_FALSE(this->convertToKSpace(res_after_apply_kernel_sum_over_, b));
+
+        // apply Dc
+        if ( use_symmetric_spirit_ )
+        {
+            Gadgetron::multiply(unacquired_points_indicator_, b, b);
+        }
+
+        // multiply by -1
+        Gadgetron::scal( (typename realType<T>::Type)(-1.0), b);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRITOperator<T>::computeRighHandSide(...) ... ");
+        return false;
+    }
+
+    return true;
+}
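+
+// Why no explicit D' multiplication appears above (a brief note): the input x is the
+// acquired k-space with the sampled points already in place, so applying D' would not
+// change it. The remaining steps mirror forwardOperator, and the final sign flip yields
+// the documented right-hand side, b = -Dc(G-I)'(G-I)D'x in the symmetric case and
+// b = -(G-I)D'x otherwise.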
+
+template <typename T> 
+bool gtPlusSPIRITOperator<T>::grad(const hoNDArray<T>& x, hoNDArray<T>& g)
+{
+    try
+    {
+        // gradient of L2 norm is
+        // 2*Dc*(G-I)'(G-I)(D'y+Dc'x)
+
+        // D'y+Dc'x
+        Gadgetron::multiply(unacquired_points_indicator_, x, kspace_);
+        Gadgetron::add(*acquired_points_, kspace_, kspace_);
+
+        // x to image domain
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(kspace_, complexIm_));
+
+        // apply kernel and sum
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(*adjoint_forward_kernel_, complexIm_, res_after_apply_kernel_));
+        GADGET_CHECK_RETURN_FALSE(this->performSumOverSrcChannel(res_after_apply_kernel_, res_after_apply_kernel_sum_over_));
+
+        // go back to kspace 
+        GADGET_CHECK_RETURN_FALSE(this->convertToKSpace(res_after_apply_kernel_sum_over_, g));
+
+        // apply Dc
+        Gadgetron::multiply(unacquired_points_indicator_, g, g);
+
+        // multiply by 2
+        Gadgetron::scal( (typename realType<T>::Type)(2.0), g);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRITOperator<T>::grad(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRITOperator<T>::obj(const hoNDArray<T>& x, T& obj)
+{
+    try
+    {
+        // L2 norm
+        // ||(G-I)(D'y+Dc'x)||2
+
+        // D'y+Dc'x
+        Gadgetron::multiply(unacquired_points_indicator_, x, kspace_);
+        Gadgetron::add(*acquired_points_, kspace_, kspace_);
+
+        // x to image domain
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(kspace_, complexIm_));
+
+        // apply kernel and sum
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(*forward_kernel_, complexIm_, res_after_apply_kernel_));
+        GADGET_CHECK_RETURN_FALSE(this->performSumOverSrcChannel(res_after_apply_kernel_, res_after_apply_kernel_sum_over_));
+
+        // L2 norm
+        Gadgetron::dotc(res_after_apply_kernel_sum_over_, res_after_apply_kernel_sum_over_, obj);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusSPIRITOperator<T>::grad(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusWavelet2DOperator.h b/toolboxes/gtplus/algorithm/gtPlusWavelet2DOperator.h
new file mode 100644
index 0000000..7b214a8
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusWavelet2DOperator.h
@@ -0,0 +1,374 @@
+/** \file       gtPlusWavelet2DOperator.h
+    \brief      Implement 2D wavelet operator for L1 regularization
+    \author     Hui Xue
+
+    Redundant Haar wavelet transformation is implemented here.
+*/
+
+#pragma once
+
+#include "gtPlusWaveletOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusWavelet2DOperator : public gtPlusWaveletOperator<T>
+{
+public:
+
+    typedef gtPlusWaveletOperator<T> BaseClass;
+    typedef typename BaseClass::value_type value_type;
+
+    gtPlusWavelet2DOperator();
+    virtual ~gtPlusWavelet2DOperator();
+
+    virtual void printInfo(std::ostream& os);
+
+    // forward operator, perform wavelet transform
+    // x: [RO E1 ...]
+    // y: [RO E1 W ...]
+    virtual bool forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y);
+
+    // adjoint operator, perform inverse transform
+    // x: [RO E1 W ...]
+    // y: [RO E1 ...]
+    virtual bool adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y);
+
+    // perform the redundant haar wavelet forward transform
+    // in : [RO E1], out : [RO E1 1+3*level]
+    bool dwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level);
+
+    // perform the redundant haar wavelet inverse transform
+    // in : [RO E1 1+3*level], out : [RO E1]
+    bool idwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level);
+
+    virtual bool unitary() const { return true; }
+
+    using BaseClass::scale_factor_first_dimension_;
+    using BaseClass::scale_factor_second_dimension_;
+    using BaseClass::numOfWavLevels_;
+    using BaseClass::with_approx_coeff_;
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+
+protected:
+
+    using BaseClass::acquired_points_;
+    using BaseClass::acquired_points_indicator_;
+    using BaseClass::unacquired_points_indicator_;
+    using BaseClass::coil_senMap_;
+
+    // helper memory
+    using BaseClass::kspace_;
+    using BaseClass::complexIm_;
+    using BaseClass::res_after_apply_kernel_;
+    using BaseClass::res_after_apply_kernel_sum_over_;
+
+    using BaseClass::wav_coeff_norm_;
+    using BaseClass::kspace_wav_;
+    using BaseClass::complexIm_wav_;
+
+    using BaseClass::kspace_Managed_;
+    using BaseClass::complexIm_Managed_;
+    using BaseClass::res_after_apply_kernel_Managed_;
+    using BaseClass::res_after_apply_kernel_sum_over_Managed_;
+};
+
+template <typename T> 
+gtPlusWavelet2DOperator<T>::gtPlusWavelet2DOperator() : BaseClass()
+{
+
+}
+
+template <typename T> 
+gtPlusWavelet2DOperator<T>::~gtPlusWavelet2DOperator()
+{
+}
+
+template <typename T> 
+bool gtPlusWavelet2DOperator<T>::
+forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dims = x.get_dimensions();
+        size_t NDim = dims->size();
+
+        size_t RO = (*dims)[0];
+        size_t E1 = (*dims)[1];
+        size_t W = 1+3*numOfWavLevels_;
+
+        std::vector<size_t> dimR(NDim+1);
+        dimR[0] = RO;
+        dimR[1] = E1;
+        dimR[2] = W;
+
+        size_t n;
+        for ( n=2; n<NDim; n++ )
+        {
+            dimR[n+1] = (*dims)[n];
+        }
+
+        if ( !y.dimensions_equal(&dimR) )
+        {
+            y.create(&dimR);
+        }
+
+        size_t num = x.get_number_of_elements()/(RO*E1);
+
+        T* pX = const_cast<T*>(x.begin());
+        T* pY = y.begin();
+
+        long long t;
+
+        #pragma omp parallel for default(none) private(t) shared(num, RO, E1, W, pX, pY)
+        for ( t=0; t<(long long)num; t++ )
+        {
+            hoNDArray<T> in(RO, E1, pX+t*RO*E1);
+            hoNDArray<T> out(RO, E1, W, pY+t*RO*E1*W);
+            this->dwtRedundantHaar(in, out, numOfWavLevels_);
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWavelet2DOperator<T>::forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y) ... ");
+        return false;
+    }
+    return true;
+}
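+
+// Layout note for the result above (restating the class documentation): each [RO E1]
+// slice is transformed independently into W = 1 + 3*numOfWavLevels_ undecimated bands,
+// so, for example, an input x of size [RO E1 N] becomes y of size [RO E1 W N], with any
+// further trailing dimensions carried over unchanged.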
+
+template <typename T> 
+bool gtPlusWavelet2DOperator<T>::
+adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dims = x.get_dimensions();
+        size_t NDim = dims->size();
+
+        size_t RO = (*dims)[0];
+        size_t E1 = (*dims)[1];
+        size_t W = (*dims)[2];
+
+        std::vector<size_t> dimR(NDim-1);
+        dimR[0] = RO;
+        dimR[1] = E1;
+
+        size_t n;
+        for ( n=2; n<NDim-1; n++ )
+        {
+            dimR[n] = (*dims)[n+1];
+        }
+
+        if ( !y.dimensions_equal(&dimR) )
+        {
+            y.create(&dimR);
+        }
+
+        size_t num = x.get_number_of_elements()/(RO*E1*W);
+
+        T* pX = const_cast<T*>(x.begin());
+        T* pY = y.begin();
+
+        long long t;
+
+        #pragma omp parallel for default(none) private(t) shared(num, RO, E1, W, pX, pY)
+        for ( t=0; t<(long long)num; t++ )
+        {
+            hoNDArray<T> in(RO, E1, W, pX+t*RO*E1*W);
+            hoNDArray<T> out(RO, E1, pY+t*RO*E1);
+            this->idwtRedundantHaar(in, out, numOfWavLevels_);
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWavelet2DOperator<T>::adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool gtPlusWavelet2DOperator<T>::
+dwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level)
+{
+    try
+    {
+        size_t RO = in.get_size(0);
+        size_t E1 = in.get_size(1);
+
+        T scaleFactor = 0.5;
+
+        T* pOut = out.begin();
+        memcpy(pOut, in.begin(), sizeof(T)*RO*E1);
+
+        for (size_t n=0; n<level; n++)
+        {
+            T* LH = pOut + (3*n+1)*RO*E1;
+
+            long long ro;
+            #pragma omp parallel for default(none) private(ro) shared(RO, E1, pOut, LH)
+            for (ro=0; ro<(long long)RO; ro++)
+            {
+                T v1 = pOut[ro];
+
+                long long ii=ro, e1;
+                for (e1=0; e1<(long long)E1-1; e1++)
+                {
+                    LH[ii] = pOut[ii] - pOut[ii+RO];
+                    pOut[ii] += pOut[ii+RO];
+                    ii+=RO;
+                }
+
+                LH[ii] = pOut[ii] - v1;
+                pOut[ii] += v1;
+            }
+
+            this->scal( RO*E1, scaleFactor, pOut );
+            this->scal( RO*E1, scaleFactor, LH );
+
+            T* HL = LH + RO*E1;
+            T* HH = HL + RO*E1;
+
+            long long e1;
+            #pragma omp parallel for default(none) private(e1) shared(RO, E1, pOut, LH, HL, HH)
+            for (e1=0; e1<(long long)E1; e1++)
+            {
+                T v1 = pOut[e1*RO];
+                T v2 = LH[e1*RO];
+
+                size_t ii = e1*RO;
+                for (long long ro=0; ro<(long long)RO-1; ro++)
+                {
+                    HH[ii] = LH[ii] - LH[ii+1];
+                    LH[ii] += LH[ii+1];
+
+                    HL[ii] = pOut[ii] - pOut[ii+1];
+                    pOut[ii] += pOut[ii+1];
+
+                    ii++;
+                }
+
+                HH[ii] = LH[ii] - v2;
+                LH[ii] += v2;
+
+                HL[ii] = pOut[ii] - v1;
+                pOut[ii] += v1;
+            }
+
+            this->scal( RO*E1, scaleFactor, pOut);
+            this->scal( RO*E1, scaleFactor, LH);
+            this->scal( RO*E1, scaleFactor, HL);
+            this->scal( RO*E1, scaleFactor, HH);
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWavelet2DOperator<T>::dwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level) ... ");
+        return false;
+    }
+    return true;
+}
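+
+// Worked example for one redundant Haar level (numbers are illustrative only). Along
+// one dimension the code forms circular pairwise sums and differences and scales by 0.5;
+// for the samples [4, 2, 6, 8]:
+//
+//     low  (approx) : 0.5*[4+2, 2+6, 6+8, 8+4] = [3,  4,  7, 6]
+//     high (detail) : 0.5*[4-2, 2-6, 6-8, 8-4] = [1, -2, -1, 2]
+//
+// The second pass repeats the same pairing along the other dimension on both bands,
+// producing the LL/LH/HL/HH layout; since nothing is decimated, every band keeps the
+// full RO x E1 size, hence out is [RO E1 1+3*level].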
+
+template <typename T> 
+bool gtPlusWavelet2DOperator<T>::
+idwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level)
+{
+    try
+    {
+        size_t RO = in.get_size(0);
+        size_t E1 = in.get_size(1);
+
+        T* pIn = const_cast<T*>(in.begin());
+        T* pOut = out.begin();
+        memcpy(pOut, in.begin(), sizeof(T)*RO*E1);
+
+        hoNDArray<T> tmp(RO*E1);
+        T* pTmp = tmp.begin();
+
+        T scaleFactor = 0.5;
+
+        long long n;
+        for (n=(long long)level-1; n>=0; n--)
+        {
+            T* LH = pIn + (3*n+1)*RO*E1;
+            T* HL = LH + RO*E1;
+            T* HH = HL + RO*E1;
+
+            long long e1;
+            #pragma omp parallel for default(none) private(e1) shared(RO, E1, pOut, LH, HL, HH, pTmp)
+            for (e1=0; e1<(long long)E1; e1++)
+            {
+                size_t ii = e1*RO+RO-1;
+
+                T vLL = pOut[ii];
+                T vLH = LH[ii];
+                T vHL = HL[ii];
+                T vHH = HH[ii];
+
+                for (long long ro=RO-1; ro>0; ro--)
+                {
+                    // ii = e1*RO + ro;
+                    pOut[ii] += pOut[ii-1] + HL[ii] - HL[ii-1];
+                    pTmp[ii] = LH[ii] + LH[ii-1] + HH[ii] - HH[ii-1];
+
+                    ii--;
+                }
+
+                // ii -= 1;
+                /*pOut[ii] += HL[ii] + vLL - vLH;
+                pTmp [ii] = LH[ii] + HH[ii] + vHL - vHH;*/
+
+                pOut[ii] += vLL + HL[ii] - vHL;
+                pTmp [ii] = LH[ii] + vLH + HH[ii] - vHH;
+            }
+
+            this->scal( RO*E1, scaleFactor, pOut );
+            this->scal( RO*E1, scaleFactor, pTmp );
+
+            long long ro;
+            #pragma omp parallel for default(none) private(ro) shared(RO, E1, pOut, pTmp)
+            for (ro=0; ro<(long long)RO; ro++)
+            {
+                size_t ii = (E1-1)*RO+ro;
+                T vLL = pOut[ii];
+                T vLH = pTmp [ii];
+
+                for (long long e1=E1-1; e1>0; e1--)
+                {
+                    // ii = e1*RO + ro;
+                    pOut[ii] += pTmp[ii] + pOut[ii-RO] - pTmp[ii-RO];
+                    ii -= RO;
+                }
+
+                pOut[ro] += pTmp[ro] + vLL - vLH;
+            }
+
+            this->scal( RO*E1, scaleFactor, pOut );
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWavelet2DOperator<T>::idwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level) ... ");
+        return false;
+    }
+    return true;
+}
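+
+// Reconstruction note (derived from the forward/inverse pair above): with the 0.5
+// scaling, every sample has two consistent estimates from neighbouring coefficient
+// pairs, L_i + H_i and L_{i-1} - H_{i-1}; the inverse averages them (another factor
+// of 0.5), so idwtRedundantHaar(dwtRedundantHaar(x)) reproduces x for this redundant
+// Haar transform.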
+
+template <typename T> 
+void gtPlusWavelet2DOperator<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD wavelet 2D operator --------------------" << endl;
+    os << "Wavelet 2D operator for gtPlus ISMRMRD package" << endl;
+    os << "----------------------------------------------------------------------" << endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusWavelet3DOperator.h b/toolboxes/gtplus/algorithm/gtPlusWavelet3DOperator.h
new file mode 100644
index 0000000..a86f5f4
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusWavelet3DOperator.h
@@ -0,0 +1,1450 @@
+/** \file       gtPlusWavelet3DOperator.h
+    \brief      Implement 3D wavelet operator for L1 regularization
+    \author     Hui Xue
+
+    Redundant Haar wavelet transformation is implemented here.
+*/
+
+#pragma once
+
+#include "gtPlusWaveletOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusWavelet3DOperator : public gtPlusWaveletOperator<T>
+{
+public:
+
+    typedef gtPlusWaveletOperator<T> BaseClass;
+    typedef typename BaseClass::value_type value_type;
+
+    gtPlusWavelet3DOperator();
+    virtual ~gtPlusWavelet3DOperator();
+
+    virtual void printInfo(std::ostream& os);
+
+    // forward operator
+    // x : [RO E1 CHA E2 ...]
+    // y : [RO E1 E2 W CHA ...]
+    virtual bool forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y);
+
+    // adjoint operator
+    virtual bool adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y);
+
+    // perform the redundant haar wavelet forward transform
+    // in : [RO E1 E2], out : [RO E1 E2 1+7*level]
+    bool dwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level);
+
+    // perform the redundant haar wavelet inverse transform
+    // in : [RO E1 E2 1+7*level], out : [RO E1 E2]
+    bool idwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level);
+
+    virtual bool unitary() const { return true; }
+
+    // compute L1 norm of wavelet coefficients across CHA
+    // waveCoeff: [RO E1 E2 W CHA ...], W is the wavelet coefficient dimension (e.g. for a 1-level wavelet decomposition, W=4 for 2D and W=8 for 3D)
+    // the first coefficient along W holds the lowest-frequency (approximation) coefficients
+    virtual bool L1Norm(const hoNDArray<T>& wavCoeff, hoNDArray<T>& wavCoeffNorm);
+
+    // to compute the gradient of wavelet term, divide the wavelet coefficients by its norm
+    // if processApproxCoeff = true, the lowest-frequency (approximation) coefficients are changed as well; otherwise, they remain unchanged
+    virtual bool divideWavCoeffByNorm(hoNDArray<T>& wavCoeff, const hoNDArray<T>& wavCoeffNorm, T mu, T p, bool processApproxCoeff=false);
+
+    // soft-threshold or shrink the wavelet coefficients
+    // the threshold actually applied is mask.*thres
+    virtual bool shrinkWavCoeff(hoNDArray<T>& wavCoeff, const hoNDArray<value_type>& wavCoeffNorm, value_type thres, const hoNDArray<T>& mask, bool processApproxCoeff=false);
+    virtual bool proximity(hoNDArray<T>& wavCoeff, value_type thres);
+
+    // if the sensitivity S is set, compute gradient of ||wav*F'*S'*(Dc'x+D'y)||1
+    // if not, compute gradient of ||wav*F'*(Dc'x+D'y)||1
+    // x represents the unacquired kspace points [RO E1 CHA E2]
+    // virtual bool grad(const hoNDArray<T>& x, hoNDArray<T>& g);
+
+    // if the sensitivity S is set, compute cost value of L2 norm ||wav*F'*S'*(Dc'x+D'y)||1
+    // if not, compute cost value of L2 norm ||wav*F'*(Dc'x+D'y)||1
+    // virtual bool obj(const hoNDArray<T>& x, T& obj);
+
+    // scaling along RO
+    bool firstDimensionScale(hoNDArray<T>& wavCoeff, T& scaleFactor);
+    // scaling along E1
+    bool secondDimensionScale(hoNDArray<T>& wavCoeff, T& scaleFactor);
+    // scaling along E2
+    bool thirdDimensionScale(hoNDArray<T>& wavCoeff, T& scaleFactor);
+
+    // because the spatial resolution of images is often different in the through-plane dimension than in the other two dimensions,
+    // it is sometimes good to take this into account, so the regularization effects are more isotropic
+    // here only simple scaling factors are used
+    // more generally, a weighting matrix can be concatenated with the wavelet coefficients to enhance or suppress regularization effects as needed
+    // the regularization term then becomes ||W*wav*F'*(Dc'x+D'y)||1, where W is the general weighting matrix
+    // in the next version, we shall extend this class with a more general weighting strategy
+    T scale_factor_third_dimension_;
+
+    // in some cases, the boundary high frequency coefficients of the 3rd dimension should not be changed
+    bool change_coeffcients_third_dimension_boundary_;
+
+    using BaseClass::scale_factor_first_dimension_;
+    using BaseClass::scale_factor_second_dimension_;
+    using BaseClass::numOfWavLevels_;
+    using BaseClass::with_approx_coeff_;
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+
+public:
+
+    // compute gradient on the assembled kspace
+    virtual bool gradTask(const hoNDArray<T>& x, hoNDArray<T>& g);
+
+    // compute the obj on the assembled kspace
+    virtual bool objTask(const hoNDArray<T>& x, T& obj);
+
+    // helper memory
+    hoNDArray<T> mask_;
+    hoNDArray<T> forward_buf_;
+    hoNDArray<T> adjoint_buf_;
+
+    using BaseClass::acquired_points_;
+    using BaseClass::acquired_points_indicator_;
+    using BaseClass::unacquired_points_indicator_;
+    using BaseClass::coil_senMap_;
+
+    // helper memory
+    using BaseClass::kspace_;
+    using BaseClass::complexIm_;
+    using BaseClass::complexIm_norm_;
+    using BaseClass::res_after_apply_kernel_;
+    using BaseClass::res_after_apply_kernel_sum_over_;
+
+    using BaseClass::wav_coeff_norm_;
+    using BaseClass::wav_coeff_norm_mag_;
+    using BaseClass::wav_coeff_norm_approx_;
+
+    hoNDArray<value_type> wav_coeff_norm_mag_sumCHA_;
+
+    using BaseClass::kspace_wav_;
+    using BaseClass::complexIm_wav_;
+
+    using BaseClass::kspace_Managed_;
+    using BaseClass::complexIm_Managed_;
+    using BaseClass::res_after_apply_kernel_Managed_;
+    using BaseClass::res_after_apply_kernel_sum_over_Managed_;
+};
+
+template <typename T> 
+gtPlusWavelet3DOperator<T>::gtPlusWavelet3DOperator() : 
+        scale_factor_third_dimension_(1.0), 
+        change_coeffcients_third_dimension_boundary_(true), 
+        BaseClass()
+{
+
+}
+
+template <typename T> 
+gtPlusWavelet3DOperator<T>::~gtPlusWavelet3DOperator()
+{
+}
+
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dims = x.get_dimensions();
+        size_t NDim = dims->size();
+
+        size_t RO = (*dims)[0];
+        size_t E1 = (*dims)[1];
+        size_t CHA = (*dims)[2];
+        size_t E2 = (*dims)[3];
+        size_t W = 1+7*numOfWavLevels_;
+
+        std::vector<size_t> dimR(NDim+1);
+        dimR[0] = RO;
+        dimR[1] = E1;
+        dimR[2] = E2;
+        dimR[3] = W;
+        dimR[4] = CHA;
+
+        size_t n;
+        for ( n=4; n<NDim; n++ )
+        {
+            dimR[n+1] = (*dims)[n];
+        }
+
+        if ( !y.dimensions_equal(&dimR) )
+        {
+            y.create(&dimR);
+        }
+
+        size_t num = x.get_number_of_elements()/(RO*E1*E2*CHA);
+
+        T* pX = const_cast<T*>(x.begin());
+        T* pY = y.begin();
+
+        int t;
+
+        if ( CHA == 1 )
+        {
+            #pragma omp parallel for default(none) private(t) shared(num, RO, E1, E2, W, pX, pY) if ( num > 1)
+            for ( t=0; t<num; t++ )
+            {
+                hoNDArray<T> in(RO, E1, E2, pX+t*RO*E1*E2);
+                hoNDArray<T> out(RO, E1, E2, W, pY+t*RO*E1*E2*W);
+                this->dwtRedundantHaar(in, out, numOfWavLevels_);
+            }
+        }
+        else
+        {
+            // #pragma omp parallel default(none) private(t) shared(num, RO, E1, CHA, E2, W, pX, pY) if ( num > 1 )
+            {
+                // hoNDArray<T> inPermute(RO, E1, E2, CHA);
+                forward_buf_.create(RO, E1, E2, CHA);
+
+                std::vector<size_t> dimOrder(4);
+                dimOrder[0] = 0;
+                dimOrder[1] = 1;
+                dimOrder[2] = 3;
+                dimOrder[3] = 2;
+
+                // #pragma omp for
+                for ( t=0; t<num; t++ )
+                {
+                    hoNDArray<T> in(RO, E1, CHA, E2, pX+t*RO*E1*CHA*E2);
+                    Gadgetron::permute(&in, &forward_buf_, &dimOrder);
+
+                    long long cha;
+
+                    #pragma omp parallel for default(none) private(cha) shared(num, RO, E1, CHA, E2, W, pY, t) if ( CHA > 4 )
+                    for ( cha=0; cha<(long long)CHA; cha++ )
+                    {
+                        hoNDArray<T> in_dwt(RO, E1, E2, forward_buf_.begin()+cha*RO*E1*E2);
+                        hoNDArray<T> out(RO, E1, E2, W, pY+t*RO*E1*E2*W*CHA+cha*RO*E1*E2*W);
+
+                        this->dwtRedundantHaar(in_dwt, out, numOfWavLevels_);
+                    }
+                }
+            }
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWavelet3DOperator<T>::forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y) ... ");
+        return false;
+    }
+    return true;
+}
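+
+// Layout note for the 3D case (restating the code above): the input is expected as
+// [RO E1 CHA E2 ...]; when CHA > 1 each frame is first permuted into forward_buf_ with
+// order [RO E1 E2 CHA], so every channel can be transformed as a contiguous 3D volume,
+// and the output is [RO E1 E2 W CHA ...] with W = 1 + 7*numOfWavLevels_.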
+
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dims = x.get_dimensions();
+        size_t NDim = dims->size();
+
+        size_t RO = (*dims)[0];
+        size_t E1 = (*dims)[1];
+        size_t E2 = (*dims)[2];
+        size_t W = (*dims)[3];
+        size_t CHA = (*dims)[4];
+
+        std::vector<size_t> dimR(NDim-1);
+        dimR[0] = RO;
+        dimR[1] = E1;
+        dimR[2] = CHA;
+        dimR[3] = E2;
+
+        size_t n;
+        for ( n=4; n<NDim-1; n++ )
+        {
+            dimR[n] = (*dims)[n+1];
+        }
+
+        if ( !y.dimensions_equal(&dimR) )
+        {
+            y.create(&dimR);
+        }
+
+        size_t num = x.get_number_of_elements()/(RO*E1*E2*W*CHA);
+
+        T* pX = const_cast<T*>(x.begin());
+        T* pY = y.begin();
+
+        int t;
+
+        if ( CHA == 1 )
+        {
+            #pragma omp parallel for default(none) private(t) shared(num, RO, E1, E2, W, pX, pY) if ( num > 1)
+            for ( t=0; t<num; t++ )
+            {
+                hoNDArray<T> in(RO, E1, E2, W, pX+t*RO*E1*E2*W);
+                hoNDArray<T> out(RO, E1, E2, pY+t*RO*E1*E2);
+                this->idwtRedundantHaar(in, out, numOfWavLevels_);
+            }
+        }
+        else
+        {
+            // #pragma omp parallel default(none) private(t) shared(num, RO, E1, CHA, E2, W, pX, pY) if ( num > 1 ) num_threads( (int)((num>16) ? 16 : num))
+            {
+                // hoNDArray<T> outPermute(RO, E1, E2, CHA);
+                adjoint_buf_.create(RO, E1, E2, CHA);
+
+                std::vector<size_t> dimOrder(4);
+                dimOrder[0] = 0;
+                dimOrder[1] = 1;
+                dimOrder[2] = 3;
+                dimOrder[3] = 2;
+
+                // #pragma omp for
+                for ( t=0; t<num; t++ )
+                {
+                    hoNDArray<T> out(RO, E1, CHA, E2, pY+t*RO*E1*CHA*E2);
+
+                    long long cha;
+                    #pragma omp parallel for default(none) private(cha) shared(RO, E1, CHA, E2, W, pX) if ( CHA > 4 )
+                    for ( cha=0; cha<(long long)CHA; cha++ )
+                    {
+                        hoNDArray<T> in(RO, E1, E2, W, pX+cha*RO*E1*E2*W);
+                        hoNDArray<T> out_idwt(RO, E1, E2, adjoint_buf_.begin()+cha*RO*E1*E2);
+
+                        this->idwtRedundantHaar(in, out_idwt, numOfWavLevels_);
+                    }
+
+                    Gadgetron::permute(&adjoint_buf_, &out, &dimOrder);
+                }
+            }
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWavelet3DOperator<T>::adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+L1Norm(const hoNDArray<T>& wavCoeff, hoNDArray<T>& wavCoeffNorm)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dims = wavCoeff.get_dimensions();
+
+        std::vector<size_t> dimR(*dims);
+        dimR[4] = 1;
+
+        if ( !wavCoeffNorm.dimensions_equal(&dimR) )
+        {
+            wavCoeffNorm.create(&dimR);
+        }
+
+        size_t RO = (*dims)[0];
+        size_t E1 = (*dims)[1];
+        size_t E2 = (*dims)[2];
+        size_t W = (*dims)[3];
+        size_t CHA = (*dims)[4];
+
+        // squared magnitudes of the coefficients (c .* conj(c))
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiplyConj(wavCoeff, wavCoeff, complexIm_norm_));
+        // sum over CHA
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(complexIm_norm_, wavCoeffNorm, 4));
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWavelet3DOperator<T>::L1Norm(const hoNDArray<T>& wavCoeff, hoNDArray<T>& wavCoeffNorm) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+divideWavCoeffByNorm(hoNDArray<T>& wavCoeff, const hoNDArray<T>& wavCoeffNorm, T mu, T p, bool processApproxCoeff)
+{
+    try
+    {
+        long long RO = (long long)wavCoeff.get_size(0);
+        long long E1 = (long long)wavCoeff.get_size(1);
+        long long E2 = (long long)wavCoeff.get_size(2);
+        long long W = (long long)wavCoeff.get_size(3);
+        long long CHA = (long long)wavCoeff.get_size(4);
+
+        if ( !wav_coeff_norm_approx_.dimensions_equal( &wavCoeffNorm ) )
+        {
+            wav_coeff_norm_approx_.create( wavCoeffNorm.get_dimensions() );
+        }
+
+        long long ii;
+        long long N = (long long)wavCoeffNorm.get_number_of_elements();
+
+        const T* pCoeffNorm = wavCoeffNorm.begin();
+        T* pBuf = wav_coeff_norm_approx_.begin();
+
+        if ( std::abs(std::abs(p) - 1.0) < 0.001 )
+        {
+            #pragma omp parallel for default(none) private(ii) shared(N, pBuf, pCoeffNorm, mu)
+            for ( ii=0; ii<N; ii++ )
+            {
+                pBuf[ii] = (value_type)( 1.0 / std::sqrt( pCoeffNorm[ii].real() + mu.real() ) );
+            }
+        }
+        else
+        {
+            #pragma omp parallel for default(none) private(ii) shared(N, pBuf, pCoeffNorm, mu, p)
+            for ( ii=0; ii<N; ii++ )
+            {
+                pBuf[ii] = (value_type)std::pow( (double)(pCoeffNorm[ii].real() + mu.real()), (double)(p.real()/2.0-1.0) );
+            }
+        }
+
+        if ( processApproxCoeff )
+        {
+            long long num = wavCoeff.get_number_of_elements() / (RO*E1*E2*W*CHA);
+
+            #pragma omp parallel default(none) private(ii) shared(RO, E1, E2, num, wavCoeffNorm, wavCoeff, W, CHA) if ( num > 1 )
+            {
+
+                #pragma omp for
+                for (ii = 0; ii<num; ii++)
+                {
+                    hoNDArray<T> wavCoeffNormCurr(RO, E1, E2, W, wav_coeff_norm_approx_.begin() + ii*RO*E1*E2*W);
+
+                    for (long long cha = 0; cha<CHA; cha++)
+                    {
+                        hoNDArray<T> wavCoeffCurr(RO, E1, E2, W, wavCoeff.begin() + ii*RO*E1*E2*W*CHA + cha*RO*E1*E2*W);
+                        Gadgetron::multiply(wavCoeffNormCurr, wavCoeffCurr, wavCoeffCurr);
+                    }
+                }
+            }
+        }
+        else
+        {
+            long long num = wavCoeff.get_number_of_elements()/(RO*E1*E2*W*CHA);
+
+            #pragma omp parallel default(none) private(ii) shared(RO, E1, E2, num, wavCoeffNorm, wavCoeff, W, CHA) if ( num > 1 )
+            {
+
+                #pragma omp for
+                for ( ii=0; ii<num; ii++ )
+                {
+                    hoNDArray<T> wavCoeffNormCurr(RO, E1, E2, W-1, wav_coeff_norm_approx_.begin()+ii*RO*E1*E2*W+RO*E1*E2);
+
+                    for ( long long cha=0; cha<CHA; cha++ )
+                    {
+                        hoNDArray<T> wavCoeffCurr(RO, E1, E2, W-1, wavCoeff.begin()+ii*RO*E1*E2*W*CHA+cha*RO*E1*E2*W+RO*E1*E2);
+                        Gadgetron::multiply(wavCoeffNormCurr, wavCoeffCurr, wavCoeffCurr);
+                    }
+                }
+            }
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWavelet3DOperator<T>::divideWavCoeffByNorm(hoNDArray<T>& wavCoeff, const hoNDArray<T>& wavCoeffNorm, T mu, T p, bool processApproxCoeff) ... ");
+        return false;
+    }
+    return true;
+}
+
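+// A reading of the weighting above (a sketch inferred from the code, not an authoritative
+// statement of the algorithm): each element of wav_coeff_norm_approx_ is
+// w = (|c|^2_CHA + mu)^(p/2 - 1), where |c|^2_CHA is the channel-summed squared magnitude from
+// L1Norm(). For p = 1 this reduces to 1/sqrt(|c|^2_CHA + mu), so multiplying the coefficients by
+// w gives the gradient of the smoothed L1 term sum_voxels sqrt(|c|^2_CHA + mu); with
+// processApproxCoeff = false the approximation band is excluded from the weighting.
+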
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+proximity(hoNDArray<T>& wavCoeff, value_type thres)
+{
+    try
+    {
+        // GADGET_CHECK_RETURN_FALSE(this->L1Norm(wavCoeff, wav_coeff_norm_));
+
+        // GADGET_CHECK_RETURN_FALSE(Gadgetron::multiplyConj(wavCoeff, wavCoeff, wav_coeff_norm_));
+        Gadgetron::abs(wavCoeff, wav_coeff_norm_mag_);
+
+        if ( !mask_.dimensions_equal(&wavCoeff) )
+        {
+            mask_.create(wavCoeff.get_dimensions());
+        }
+
+        Gadgetron::fill(mask_, T(thres) );
+
+        if ( std::abs(std::abs(scale_factor_first_dimension_)-1.0) > 1e-6 )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->firstDimensionScale(mask_, scale_factor_first_dimension_));
+        }
+
+        if ( std::abs(std::abs(scale_factor_second_dimension_)-1.0) > 1e-6 )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->secondDimensionScale(mask_, scale_factor_second_dimension_));
+        }
+
+        if ( std::abs(std::abs(scale_factor_third_dimension_)-1.0) > 1e-6 )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->thirdDimensionScale(mask_, scale_factor_third_dimension_));
+        }
+
+        GADGET_CHECK_RETURN_FALSE(this->shrinkWavCoeff(wavCoeff, wav_coeff_norm_mag_, thres, mask_, this->with_approx_coeff_));
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWavelet3DOperator<T>::proximity(hoNDArray<T>& wavCoeff, T thres) ... ");
+        return false;
+    }
+    return true;
+}
+
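+// proximity() acts as the soft-thresholding (proximal) step for the L1 wavelet term: mask_ is
+// filled with thres, optionally re-weighted per dimension so some bands are thresholded more or
+// less aggressively, and shrinkWavCoeff below then shrinks the coefficient magnitudes against
+// that mask.
+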
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+shrinkWavCoeff(hoNDArray<T>& wavCoeff, const hoNDArray<value_type>& wavCoeffNorm, value_type thres, const hoNDArray<T>& mask, bool processApproxCoeff)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dims = wavCoeff.get_dimensions();
+
+        long long RO = (long long)(*dims)[0];
+        long long E1 = (long long)(*dims)[1];
+        long long E2 = (long long)(*dims)[2];
+        long long W = (long long)(*dims)[3];
+        long long CHA = (long long)(*dims)[4];
+
+        if ( !wav_coeff_norm_approx_.dimensions_equal(&wavCoeffNorm) )
+        {
+            wav_coeff_norm_approx_.create(wavCoeffNorm.get_dimensions());
+        }
+
+        long long ii;
+        long long N = (long long)wavCoeffNorm.get_number_of_elements();
+        long long N4D = RO*E1*E2*W;
+
+        long long num = N/N4D;
+
+        value_type* pCoeffNorm = const_cast<value_type*>(wavCoeffNorm.begin());
+        T* pMag = wav_coeff_norm_approx_.begin();
+
+        if ( wavCoeffNorm.dimensions_equal(&wavCoeff) )
+        {
+            #pragma omp parallel for default(none) private(ii) shared(N, pMag, pCoeffNorm)
+            for ( ii=0; ii<N; ii++ )
+            {
+                pMag[ii] = pCoeffNorm[ii];
+            }
+
+            Gadgetron::divide(wavCoeff, wav_coeff_norm_approx_, complexIm_);
+        }
+        else
+        {
+            if ( !res_after_apply_kernel_.dimensions_equal(&wavCoeffNorm) )
+            {
+                res_after_apply_kernel_.create(wavCoeffNorm.get_dimensions());
+            }
+
+            T* pMagInv = res_after_apply_kernel_.begin();
+
+            #pragma omp parallel for default(none) private(ii) shared(N, pMag, pMagInv, pCoeffNorm)
+            for ( ii=0; ii<N; ii++ )
+            {
+                pMag[ii] = pCoeffNorm[ii];
+                pMagInv[ii] = 1/(pCoeffNorm[ii]+FLT_EPSILON);
+            }
+
+            // Gadgetron::inv(wav_coeff_norm_approx_, res_after_apply_kernel_);
+
+            // phase does not change
+            if ( res_after_apply_kernel_.dimensions_equal(&wavCoeff) )
+            {
+                Gadgetron::multiply(res_after_apply_kernel_, wavCoeff, complexIm_);
+            }
+            else
+            {
+                long long num = wavCoeff.get_number_of_elements() / (RO*E1*E2*W*CHA);
+
+                #pragma omp parallel default(none) private(ii) shared(RO, E1, E2, num, wavCoeff, W, CHA) if ( num > 1 )
+                {
+
+                    #pragma omp for
+                    for (ii = 0; ii<num; ii++)
+                    {
+                        hoNDArray<T> magInvCurr(RO, E1, E2, W, res_after_apply_kernel_.begin() + ii*RO*E1*E2*W);
+
+                        for (long long cha = 0; cha<CHA; cha++)
+                        {
+                            hoNDArray<T> wavCoeffCurr(RO, E1, E2, W, wavCoeff.begin() + ii*RO*E1*E2*W*CHA + cha*RO*E1*E2*W);
+                            hoNDArray<T> resCurr(RO, E1, E2, W, complexIm_.begin() + ii*RO*E1*E2*W*CHA + cha*RO*E1*E2*W);
+
+                            Gadgetron::multiply(magInvCurr, wavCoeffCurr, resCurr);
+                        }
+                    }
+                }
+            }
+        }
+
+        // shrink the magnitude
+        if ( mask.dimensions_equal(&wavCoeffNorm) )
+        {
+            const T* pMask = mask.begin();
+
+            // value_type* pMagCHA = wav_coeff_norm_mag_sumCHA_.begin();
+
+            long long n = 0;
+            for ( n=0; n<num; n++ )
+            {
+                long long s=RO*E1*E2; 
+                if ( processApproxCoeff )
+                {
+                    s = 0;
+                }
+
+                const T* pMaskCurr = pMask + n*N4D;
+                T* pMagCurr = pMag + n*N4D;
+
+                if ( change_coeffcients_third_dimension_boundary_ )
+                {
+                    long long nn;
+                    #pragma omp parallel for private(nn) shared(s, N4D, pMagCurr, pMaskCurr, thres)
+                    for ( nn=s; nn<N4D; nn++ )
+                    {
+                        // if ( std::abs(pMagCurr[nn]) < std::abs(thres*pMaskCurr[nn]) )
+                        if ( pMagCurr[nn].real() < pMaskCurr[nn].real() )
+                        // if ( pMagCHA[nn] < pMaskCurr[nn].real() )
+                        {
+                            pMagCurr[nn] = 0;
+                        }
+                        else
+                        {
+                            pMagCurr[nn] -= pMaskCurr[nn];
+                        }
+                    }
+                }
+                else
+                {
+                    // approximation coefficients
+                    long long nn;
+                    #pragma omp parallel for private(nn) shared(s, N4D, pMagCurr, pMaskCurr, thres)
+                    for ( nn=s; nn<RO*E1*E2; nn++ )
+                    {
+                        //if ( std::abs(pMagCurr[nn]) < std::abs(thres*pMaskCurr[nn]) )
+                        if ( pMagCurr[nn].real() < pMaskCurr[nn].real() )
+                        {
+                            pMagCurr[nn] = 0;
+                        }
+                        else
+                        {
+                            pMagCurr[nn] -= pMaskCurr[nn];
+                        }
+                    }
+
+                    size_t level;
+                    for ( level=0; level<numOfWavLevels_; level++ )
+                    {
+                        size_t start = RO*E1*E2 + 7*level*RO*E1*E2; // start of this level's detail bands
+
+                        size_t w;
+                        for ( w=0; w<7; w++ )
+                        {
+                            size_t startW = start+w*RO*E1*E2;
+                            size_t endW = startW+RO*E1*E2;
+
+                            if ( w >= 3 )
+                            {
+                                startW += RO*E1;
+                                endW -= RO*E1;
+                            }
+
+                            long long nn;
+                            #pragma omp parallel for private(nn) shared(s, N4D, pMagCurr, pMaskCurr, thres)
+                            for ( nn=(long long)startW; nn<(long long)endW; nn++ )
+                            {
+                                // if ( std::abs(pMagCurr[nn]) < std::abs(thres*pMaskCurr[nn]) )
+                                if ( pMagCurr[nn].real() < pMaskCurr[nn].real() )
+                                {
+                                    pMagCurr[nn] = 0;
+                                }
+                                else
+                                {
+                                    pMagCurr[nn] -= pMaskCurr[nn];
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            long long n = 0;
+            for ( n=0; n<num; n++ )
+            {
+                long long s=RO*E1*E2; 
+                if ( processApproxCoeff )
+                {
+                    s = 0;
+                }
+
+                T* pMagCurr = pMag + n*N4D;
+
+                if ( change_coeffcients_third_dimension_boundary_ )
+                {
+                    long long nn;
+                    #pragma omp parallel for private(nn) shared(s, N4D, pMagCurr, thres)
+                    for ( nn=s; nn<N4D; nn++ )
+                    {
+                        // if ( std::abs(pMagCurr[nn]) < std::abs(thres) )
+                        if ( pMagCurr[nn].real() < thres )
+                        {
+                            pMagCurr[nn] = 0;
+                        }
+                        else
+                        {
+                            pMagCurr[nn] -= thres;
+                        }
+                    }
+                }
+                else
+                {
+                    // approximation coefficients
+                    long long nn;
+                    #pragma omp parallel for private(nn) shared(s, N4D, pMagCurr, thres)
+                    for ( nn=s; nn<RO*E1*E2; nn++ )
+                    {
+                        // if ( std::abs(pMagCurr[nn]) < std::abs(thres) )
+                        if ( pMagCurr[nn].real() < thres )
+                        {
+                            pMagCurr[nn] = 0;
+                        }
+                        else
+                        {
+                            pMagCurr[nn] -= thres;
+                        }
+                    }
+
+                    size_t level;
+                    for ( level=0; level<numOfWavLevels_; level++ )
+                    {
+                        size_t start = RO*E1*E2 + 7*level*RO*E1*E2; // start of this level's detail bands
+
+                        size_t w;
+                        for ( w=0; w<7; w++ )
+                        {
+                            size_t startW = start+w*RO*E1*E2;
+                            size_t endW = startW+RO*E1*E2;
+
+                            if ( w >= 3 )
+                            {
+                                startW += RO*E1;
+                                endW -= RO*E1;
+                            }
+
+                            long long nn;
+                            #pragma omp parallel for private(nn) shared(s, N4D, pMagCurr, thres)
+                            for ( nn=(long long)startW; nn<(long long)endW; nn++ )
+                            {
+                                // if ( std::abs(pMagCurr[nn]) < std::abs(thres) )
+                                if ( pMagCurr[nn].real() < thres )
+                                {
+                                    pMagCurr[nn] = 0;
+                                }
+                                else
+                                {
+                                    pMagCurr[nn] -= thres;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        if ( processApproxCoeff )
+        {
+            if ( wav_coeff_norm_approx_.dimensions_equal(&complexIm_) )
+            {
+                Gadgetron::multiply(wav_coeff_norm_approx_, complexIm_, wavCoeff);
+            }
+            else
+            {
+                long long num = wavCoeff.get_number_of_elements() / (RO*E1*E2*W*CHA);
+
+                #pragma omp parallel default(none) private(ii) shared(RO, E1, E2, num, wavCoeffNorm, wavCoeff, W, CHA) if ( num > 1 )
+                {
+
+                    #pragma omp for
+                    for (ii = 0; ii<num; ii++)
+                    {
+                        hoNDArray<T> magCurr(RO, E1, E2, W, wav_coeff_norm_approx_.begin() + ii*RO*E1*E2*W);
+
+                        for (long long cha = 0; cha<CHA; cha++)
+                        {
+                            hoNDArray<T> phaseCurr(RO, E1, E2, W, complexIm_.begin() + ii*RO*E1*E2*W*CHA + cha*RO*E1*E2*W);
+                            hoNDArray<T> wavCoeffCurr(RO, E1, E2, W, wavCoeff.begin() + ii*RO*E1*E2*W*CHA + cha*RO*E1*E2*W);
+
+                            Gadgetron::multiply(magCurr, phaseCurr, wavCoeffCurr);
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            if ( wav_coeff_norm_approx_.dimensions_equal(&wavCoeff) )
+            {
+                #pragma omp parallel default(none) private(ii) shared(RO, E1, E2, wavCoeffNorm, wavCoeff, W, CHA) if ( CHA > 1 )
+                {
+
+                    #pragma omp for
+                    for ( ii=0; ii<CHA; ii++ )
+                    {
+                        hoNDArray<T> magCurr(RO, E1, E2, W-1, wav_coeff_norm_approx_.begin()+ii*RO*E1*E2*W+RO*E1*E2);
+                        hoNDArray<T> phaseCurr(RO, E1, E2, W-1, complexIm_.begin()+ii*RO*E1*E2*W+RO*E1*E2);
+                        hoNDArray<T> wavCoeffCurr(RO, E1, E2, W-1, wavCoeff.begin()+ii*RO*E1*E2*W+RO*E1*E2);
+
+                        Gadgetron::multiply(magCurr, phaseCurr, wavCoeffCurr);
+                    }
+                }
+            }
+            else
+            {
+                long long num = wavCoeff.get_number_of_elements()/(RO*E1*E2*W*CHA);
+
+                #pragma omp parallel default(none) private(ii) shared(RO, E1, E2, num, wavCoeffNorm, wavCoeff, W, CHA) if ( num > 1 )
+                {
+
+                    #pragma omp for
+                    for ( ii=0; ii<num; ii++ )
+                    {
+                        hoNDArray<T> magCurr(RO, E1, E2, W-1, wav_coeff_norm_approx_.begin()+ii*RO*E1*E2*W+RO*E1*E2);
+
+                        for ( long long cha=0; cha<CHA; cha++ )
+                        {
+                            hoNDArray<T> phaseCurr(RO, E1, E2, W-1, complexIm_.begin()+ii*RO*E1*E2*W*CHA+cha*RO*E1*E2*W+RO*E1*E2);
+                            hoNDArray<T> wavCoeffCurr(RO, E1, E2, W-1, wavCoeff.begin()+ii*RO*E1*E2*W*CHA+cha*RO*E1*E2*W+RO*E1*E2);
+
+                            Gadgetron::multiply(magCurr, phaseCurr, wavCoeffCurr);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWavelet3DOperator<T>::shrinkWavCoeff(hoNDArray<T>& wavCoeff, const hoNDArray<T>& wavCoeffNorm, T thres, const hoNDArray<T>& mask, bool processApproxCoeff) ... ");
+        return false;
+    }
+    return true;
+}
+
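+// Element-wise, the shrinkage above amounts to complex soft-thresholding (a sketch of what the
+// code does, written out for readability):
+//
+//   mag   = |c|                          // wav_coeff_norm_mag_, computed in proximity()
+//   phase = c / mag                      // kept in complexIm_ (one code path adds FLT_EPSILON)
+//   mag'  = (mag < t) ? 0 : mag - t      // t is the mask value (thres, pre-scaled per band) or thres
+//   c'    = mag' * phase
+//
+// The approximation band (the first RO*E1*E2 block of each W stack) is only shrunk when
+// processApproxCoeff is true.
+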
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+dwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level)
+{
+    try
+    {
+        long long RO = (long long)in.get_size(0);
+        long long E1 = (long long)in.get_size(1);
+        long long E2 = (long long)in.get_size(2);
+
+        T* pOut = out.begin();
+        memcpy(pOut, in.begin(), sizeof(T)*RO*E1*E2);
+
+        long long N2D = RO*E1;
+        long long N3D = RO*E1*E2;
+
+        for (size_t n=0; n<level; n++)
+        {
+            T* lll = pOut;
+            T* llh = lll + n*7*N3D + N3D;
+            T* lhl = llh + N3D;
+            T* lhh = lhl + N3D;
+            T* hll = lhh + N3D;
+            T* hlh = hll + N3D;
+            T* hhl = hlh + N3D;
+            T* hhh = hhl + N3D;
+
+            long long e2;
+            #pragma omp parallel for default(none) private(e2) shared(RO, E1, E2, N2D, lll, llh)
+            for (e2=0; e2<E2; e2++)
+            {
+                long long ind3D = e2 * N2D;
+                for (long long ro=0; ro<RO; ro++)
+                {
+                    T v1 = lll[ro + ind3D];
+
+                    long long ind = ro + ind3D;
+                    for (long long e1=0; e1<E1-1; e1++)
+                    {
+                        llh[ind] = lll[ind] - lll[ind+RO];
+                        lll[ind] += lll[ind+RO];
+                        ind += RO;
+                    }
+
+                    llh[ind] = lll[ind] - v1;
+                    lll[ind] += v1;
+                }
+            }
+
+            this->scal( N3D, T(0.5), lll);
+            this->scal( N3D, T(0.5), llh );
+
+            #pragma omp parallel for default(none) private(e2) shared(RO, E1, E2, N2D, lll, llh, lhh, lhl)
+            for (e2=0; e2<E2; e2++)
+            {
+                long long ind3D = e2*N2D;
+                for (long long e1=0; e1<E1; e1++)
+                {
+                    T v1 = lll[e1*RO + ind3D];
+                    T v2 = llh[e1*RO + ind3D];
+
+                    long long ind = e1*RO + ind3D;
+                    for (long long ro=0; ro<RO-1; ro++)
+                    {
+                        lhh[ind] = llh[ind] - llh[ind + 1];
+                        llh[ind] += llh[ind + 1];
+
+                        lhl[ind] = lll[ind] - lll[ind + 1];
+                        lll[ind] += lll[ind + 1];
+
+                        ind++;
+                    }
+
+                    lhl[ind] = lll[ind] - v1;
+                    lll[ind] += v1;
+
+                    lhh[ind] = llh[ind] - v2;
+                    llh[ind] += v2;
+                }
+            }
+
+            #pragma omp parallel sections
+            {
+                #pragma omp section
+                this->scal( N3D, T(0.5), lll );
+
+                #pragma omp section
+                this->scal( N3D, T(0.5), lhl );
+
+                #pragma omp section
+                this->scal( N3D, T(0.5), llh );
+
+                #pragma omp section
+                this->scal( N3D, T(0.5), lhh );
+            }
+
+            long long e1;
+            #pragma omp parallel for default(none) private(e1) shared(RO, E1, E2, N2D, lll, hll, lhl, hhl, llh, hlh, lhh, hhh)
+            for (e1=0; e1<E1; e1++)
+            {
+                for (long long ro=0; ro<RO; ro++)
+                {
+                    long long ind2D = e1*RO + ro;
+
+                    T v1 = lll[ind2D];
+                    T v2 = lhl[ind2D];
+                    T v3 = llh[ind2D];
+                    T v4 = lhh[ind2D];
+
+                    long long ind = ind2D;
+                    for (long long e2=0; e2<E2-1; e2++)
+                    {
+                        hll[ind] = lll[ind] - lll[ind + N2D];
+                        lll[ind] += lll[ind + N2D];
+
+                        hhl[ind] = lhl[ind] - lhl[ind + N2D];
+                        lhl[ind] += lhl[ind + N2D];
+
+                        hlh[ind] = llh[ind] - llh[ind + N2D];
+                        llh[ind] += llh[ind + N2D];
+
+                        hhh[ind] = lhh[ind] - lhh[ind + N2D];
+                        lhh[ind] += lhh[ind + N2D];
+
+                        ind += N2D;
+                    }
+
+                    if ( E2 > 1 )
+                    {
+                        hll[ind] = lll[ind] - v1;
+                        lll[ind] += v1;
+
+                        hhl[ind] = lhl[ind] - v2;
+                        lhl[ind] += v2;
+
+                        hlh[ind] = llh[ind] - v3;
+                        llh[ind] += v3;
+
+                        hhh[ind] = lhh[ind] - v4;
+                        lhh[ind] += v4;
+                    }
+                }
+            }
+
+            #pragma omp parallel sections
+            {
+                #pragma omp section
+                this->scal( N3D, T(0.5), lll);
+
+                #pragma omp section
+                this->scal( N3D, T(0.5), hll);
+
+                #pragma omp section
+                this->scal( N3D, T(0.5), lhl);
+
+                #pragma omp section
+                this->scal( N3D, T(0.5), hhl);
+
+                #pragma omp section
+                this->scal( N3D, T(0.5), llh);
+
+                #pragma omp section
+                this->scal( N3D, T(0.5), hlh);
+
+                #pragma omp section
+                this->scal( N3D, T(0.5), lhh);
+
+                #pragma omp section
+                this->scal( N3D, T(0.5), hhh);
+            }
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWavelet3DOperator<T>::dwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level) ... ");
+        return false;
+    }
+    return true;
+}
+
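+// Band layout written by the loop above (read off the pointer arithmetic): the approximation band
+// stays at the start of the output, and the seven detail bands of level n occupy the seven
+// RO*E1*E2 blocks starting at offset n*7*N3D + N3D, filtered first along E1, then RO, then E2.
+// Each Haar pair is formed as a sum/difference and scaled by 0.5 per processed axis, which matches
+// the averaging done by the inverse transform below.
+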
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+idwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level)
+{
+    try
+    {
+        long long RO = (long long)in.get_size(0);
+        long long E1 = (long long)in.get_size(1);
+        long long E2 = (long long)in.get_size(2);
+
+        T* pIn = const_cast<T*>(in.begin());
+        T* pOut = out.begin();
+        memcpy(pOut, in.begin(), sizeof(T)*RO*E1*E2);
+
+        long long N2D = RO*E1;
+        long long N3D = RO*E1*E2;
+
+        hoNDArray<T> LL(N3D);
+        T* pLL = LL.begin();
+
+        hoNDArray<T> HL(N3D);
+        T* pHL = HL.begin();
+
+        hoNDArray<T> LH(N3D);
+        T* pLH = LH.begin();
+
+        hoNDArray<T> HH(N3D);
+        T* pHH = HH.begin();
+
+        long long n;
+        for (n=(long long)level-1; n>=0; n--)
+        {
+            T* lll = pOut;
+            T* llh = pIn + n*7*N3D + N3D;
+            T* lhl = llh + N3D;
+            T* lhh = lhl + N3D;
+            T* hll = lhh + N3D;
+            T* hlh = hll + N3D;
+            T* hhl = hlh + N3D;
+            T* hhh = hhl + N3D;
+
+            long long e1;
+            #pragma omp parallel for default(none) private(e1) shared(RO, E1, E2, N2D, lll, hll, lhl, hhl, llh, hlh, lhh, hhh, pLL, pHL, pLH, pHH) 
+            for (e1=0; e1<E1; e1++)
+            {
+                for (long long ro=0; ro<RO; ro++)
+                {
+                    long long ind2D = e1*RO + ro;
+
+                    long long ind;
+                    for (long long e2=E2-1; e2>0; e2--)
+                    {
+                        ind = ind2D + e2*N2D;
+                        pLL[ind] = (lll[ind]+lll[ind-N2D]) + (hll[ind]-hll[ind-N2D]);
+                        pHL[ind] = (lhl[ind]+lhl[ind-N2D]) + (hhl[ind]-hhl[ind-N2D]);
+                        pLH[ind] = (llh[ind]+llh[ind-N2D]) + (hlh[ind]-hlh[ind-N2D]);
+                        pHH[ind] = (lhh[ind]+lhh[ind-N2D]) + (hhh[ind]-hhh[ind-N2D]);
+                    }
+
+                    if ( E2 > 1 )
+                    {
+                        ind = ind2D + (E2-1)*N2D;
+                        pLL[ind2D] = (lll[ind2D]+lll[ind]) + (hll[ind2D]-hll[ind]);
+                        pHL[ind2D] = (lhl[ind2D]+lhl[ind]) + (hhl[ind2D]-hhl[ind]);
+                        pLH[ind2D] = (llh[ind2D]+llh[ind]) + (hlh[ind2D]-hlh[ind]);
+                        pHH[ind2D] = (lhh[ind2D]+lhh[ind]) + (hhh[ind2D]-hhh[ind]);
+                    }
+                }
+            }
+
+            #pragma omp parallel sections
+            {
+                #pragma omp section
+                this->scal( N3D, T(0.5), pLL );
+
+                #pragma omp section
+                this->scal( N3D, T(0.5), pHL );
+
+                #pragma omp section
+                this->scal( N3D, T(0.5), pLH );
+
+                #pragma omp section
+                this->scal( N3D, T(0.5), pHH );
+            }
+
+            long long e2;
+            #pragma omp parallel for default(none) private(e2) shared(RO, E1, E2, N2D, pLL, pHL, pLH, pHH) 
+            for (e2=0; e2<(long long)E2; e2++)
+            {
+                long long ind3D = e2*N2D;
+                for (long long e1=0; e1<(long long)E1; e1++)
+                {
+                    long long ind = e1*RO + RO-1 + ind3D;
+
+                    T v1 = pLL[ind];
+                    T v2 = pLH[ind];
+
+                    for (long long ro=(long long)RO-1; ro>0; ro--)
+                    {
+                        pLL[ind] = (pLL[ind]+pLL[ind-1]) + (pHL[ind]-pHL[ind-1]);
+                        pLH[ind] = (pLH[ind]+pLH[ind-1]) + (pHH[ind]-pHH[ind-1]);
+                        ind--;
+                    }
+
+                    pLL[ind] = (pLL[ind]+v1) + (pHL[ind]-pHL[ind+RO-1]);
+                    pLH[ind] = (pLH[ind]+v2) + (pHH[ind]-pHH[ind+RO-1]);
+                }
+            }
+
+            this->scal( N3D, T(0.5), pLL );
+            this->scal( N3D, T(0.5), pLH );
+
+            #pragma omp parallel for default(none) private(e2) shared(RO, E1, E2, N2D, pLL,pLH, pOut) 
+            for (e2=0; e2<(long long)E2; e2++)
+            {
+                long long ind3D = e2*N2D;
+                for (long long ro=0; ro<(long long)RO; ro++)
+                {
+                    long long ind = (E1-1)*RO + ro + ind3D;
+                    for (long long e1=(long long)E1-1; e1>0; e1--)
+                    {
+                        pOut[ind] = (pLL[ind]+pLL[ind-RO]) + (pLH[ind]-pLH[ind-RO]);
+                        ind -= RO;
+                    }
+
+                    pOut[ind] = (pLL[ind]+pLL[ind+(E1-1)*RO]) + (pLH[ind]-pLH[ind+(E1-1)*RO]);
+                }
+            }
+
+            this->scal( N3D, T(0.5), pOut );
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWavelet3DOperator<T>::idwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level) ... ");
+        return false;
+    }
+    return true;
+}
+
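+// Reconstruction check (a sketch of why the 0.5 factors appear on both sides): for the
+// undecimated Haar pair L_i = (x_i + x_{i+1})/2, H_i = (x_i - x_{i+1})/2 with circular wrap,
+// both (L_i + H_i) and (L_{i-1} - H_{i-1}) equal x_i, and the loops above average these two
+// shifted estimates. So idwtRedundantHaar(dwtRedundantHaar(x)) should reproduce x up to
+// floating-point rounding.
+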
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+gradTask(const hoNDArray<T>& x, hoNDArray<T>& g)
+{
+    try
+    {
+        // x to image domain
+        //gt_timer2_.start("3");
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(x, complexIm_));
+        //gt_timer2_.stop();
+
+        size_t RO = complexIm_.get_size(0);
+        size_t E1 = complexIm_.get_size(1);
+        size_t CHA = complexIm_.get_size(2);
+        size_t E2 = complexIm_.get_size(3);
+
+        // compute the gradient
+        if ( coil_senMap_ && coil_senMap_->get_size(0)==RO && coil_senMap_->get_size(1)==E1 && coil_senMap_->get_size(2)==CHA )
+        {
+            // perform coil combination
+            //gt_timer2_.start("4");
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_complex_.coilCombine(complexIm_, *coil_senMap_, res_after_apply_kernel_));
+            //gt_timer2_.stop();
+
+            //gt_timer2_.start("5");
+            hoNDArray<T> combined(RO, E1, 1, E2, res_after_apply_kernel_.begin());
+            //gt_timer2_.stop();
+
+            // compute wavelet transform
+            //gt_timer2_.start("6");
+            GADGET_CHECK_RETURN_FALSE(this->forwardOperator(combined, res_after_apply_kernel_sum_over_));
+            //gt_timer2_.stop();
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(this->forwardOperator(complexIm_, res_after_apply_kernel_sum_over_));
+        }
+
+        // modify coefficients
+        //gt_timer2_.start("7");
+        GADGET_CHECK_RETURN_FALSE(this->L1Norm(res_after_apply_kernel_sum_over_, wav_coeff_norm_));
+        //gt_timer2_.stop();
+
+        //gt_timer2_.start("8");
+        GADGET_CHECK_RETURN_FALSE(this->divideWavCoeffByNorm(res_after_apply_kernel_sum_over_, wav_coeff_norm_, (value_type)(1e-15), (value_type)(1.0), with_approx_coeff_));
+        //gt_timer2_.stop();
+
+        // first dimension scaling
+        //gt_timer2_.start("9");
+        if ( std::abs(std::abs(scale_factor_first_dimension_)-1.0) > 1e-6 )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->firstDimensionScale(res_after_apply_kernel_sum_over_, scale_factor_first_dimension_));
+        }
+
+        // second dimension scaling
+        if ( std::abs(std::abs(scale_factor_second_dimension_)-1.0) > 1e-6 )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->secondDimensionScale(res_after_apply_kernel_sum_over_, scale_factor_second_dimension_));
+        }
+
+        // third dimension scaling
+        if ( std::abs(std::abs(scale_factor_third_dimension_)-1.0) > 1e-6 )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->thirdDimensionScale(res_after_apply_kernel_sum_over_, scale_factor_third_dimension_));
+        }
+        //gt_timer2_.stop();
+
+        // go back to image
+        //gt_timer2_.start("10");
+        GADGET_CHECK_RETURN_FALSE(this->adjointOperator(res_after_apply_kernel_sum_over_, complexIm_wav_));
+        //gt_timer2_.stop();
+
+        if ( coil_senMap_ && coil_senMap_->get_size(0)==RO && coil_senMap_->get_size(1)==E1 && coil_senMap_->get_size(2)==CHA )
+        {
+            // apply coil sensitivity
+            //gt_timer2_.start("11");
+            if ( !kspace_wav_.dimensions_equal(&complexIm_) )
+            {
+                kspace_wav_.create(RO, E1, CHA, E2);
+            }
+
+            for ( size_t e2=0; e2<E2; e2++ )
+            {
+                hoNDArray<T> complexImE2(RO, E1, complexIm_wav_.begin()+e2*RO*E1);
+                hoNDArray<T> kspace_wavE2(RO, E1, CHA, kspace_wav_.begin()+e2*RO*E1*CHA);
+
+                if ( coil_senMap_->get_size(3) == E2 )
+                {
+                    hoNDArray<T> coilMapE2(RO, E1, CHA, coil_senMap_->begin()+e2*RO*E1*CHA);
+                    GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(coilMapE2, complexImE2, kspace_wavE2));
+                }
+                else
+                {
+                    GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(*coil_senMap_, complexImE2, kspace_wavE2));
+                }
+            }
+            //gt_timer2_.stop();
+
+            // go to kspace
+            //gt_timer2_.start("12");
+            GADGET_CHECK_RETURN_FALSE(this->convertToKSpace(kspace_wav_, g));
+            //gt_timer2_.stop();
+        }
+        else
+        {
+            // go to kspace
+            GADGET_CHECK_RETURN_FALSE(this->convertToKSpace(complexIm_wav_, g));
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWavelet3DOperator<T>::gradTask(const hoNDArray<T>& x, hoNDArray<T>& g) ... ");
+        return false;
+    }
+
+    return true;
+}
+
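+// In operator form, the gradient assembled by gradTask reads roughly (inferred from the call
+// sequence, with F'/F = convertToImage/convertToKSpace, S'/S = coil combination and its adjoint,
+// Wav/Wav' = redundant Haar transform and its adjoint, and D = the optional per-dimension scalings):
+//
+//   g = F S Wav' D [ Wav(S' F' x) ./ sqrt( |Wav(S' F' x)|^2_CHA + 1e-15 ) ]
+//
+// When no coil map is set, the S terms drop out and the transform is applied per channel.
+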
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+objTask(const hoNDArray<T>& x, T& obj)
+{
+    try
+    {
+        // x to image domain
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(x, complexIm_));
+
+        size_t RO = complexIm_.get_size(0);
+        size_t E1 = complexIm_.get_size(1);
+        size_t CHA = complexIm_.get_size(2);
+        size_t E2 = complexIm_.get_size(3);
+
+        // apply sensitivity
+        if (  coil_senMap_ && coil_senMap_->get_size(0)==RO && coil_senMap_->get_size(1)==E1 && coil_senMap_->get_size(2)==CHA )
+        {
+            // perform coil combination
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_complex_.coilCombine(complexIm_, *coil_senMap_, res_after_apply_kernel_));
+
+            hoNDArray<T> combined(RO, E1, 1, E2, res_after_apply_kernel_.begin());
+
+            // compute wavelet transform
+            GADGET_CHECK_RETURN_FALSE(this->forwardOperator(combined, res_after_apply_kernel_sum_over_));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(this->forwardOperator(complexIm_, res_after_apply_kernel_sum_over_));
+        }
+
+        if ( std::abs(std::abs(scale_factor_third_dimension_)-1.0) > 1e-6 )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->thirdDimensionScale(res_after_apply_kernel_sum_over_, scale_factor_third_dimension_));
+        }
+
+        GADGET_CHECK_RETURN_FALSE(this->L1NormTotal(res_after_apply_kernel_sum_over_, wav_coeff_norm_, obj));
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWavelet3DOperator<T>::objTask(const hoNDArray<T>& x, T& obj) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+firstDimensionScale(hoNDArray<T>& wavCoeff, T& scaleFactor)
+{
+    try
+    {
+        size_t RO = wavCoeff.get_size(0);
+        size_t E1 = wavCoeff.get_size(1);
+        size_t E2 = wavCoeff.get_size(2);
+        size_t W = wavCoeff.get_size(3);
+
+        size_t num = wavCoeff.get_number_of_elements()/(RO*E1*E2*W);
+
+        // coeff 2, 3, 6, 7 are for RO high frequency
+
+        size_t ii;
+        for ( ii=0; ii<num; ii++ )
+        {
+            for ( size_t n=0; n<numOfWavLevels_; n++ )
+            {
+                // 2, 3
+                hoNDArray<T> coeff(RO, E1, E2, 2, wavCoeff.begin()+ii*RO*E1*E2*W+(7*n+2)*RO*E1*E2);
+                Gadgetron::scal(scaleFactor, coeff);
+
+                // 6, 7
+                hoNDArray<T> coeff2(RO, E1, E2, 2, wavCoeff.begin()+ii*RO*E1*E2*W+(7*n+6)*RO*E1*E2);
+                Gadgetron::scal(scaleFactor, coeff2);
+            }
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWavelet3DOperator<T>::firstDimensionScale(hoNDArray<T>& wavCoeff, T& scaleFactor) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+secondDimensionScale(hoNDArray<T>& wavCoeff, T& scaleFactor)
+{
+    try
+    {
+        size_t RO = wavCoeff.get_size(0);
+        size_t E1 = wavCoeff.get_size(1);
+        size_t E2 = wavCoeff.get_size(2);
+        size_t W = wavCoeff.get_size(3);
+
+        size_t num = wavCoeff.get_number_of_elements()/(RO*E1*E2*W);
+
+        // coeff 1, 3, 5, 7 are for E1 high frequency
+
+        size_t ii;
+        for ( ii=0; ii<num; ii++ )
+        {
+            for ( size_t n=0; n<numOfWavLevels_; n++ )
+            {
+                hoNDArray<T> coeff(RO, E1, E2, 1, wavCoeff.begin()+ii*RO*E1*E2*W+(7*n+1)*RO*E1*E2);
+                Gadgetron::scal(scaleFactor, coeff);
+
+                hoNDArray<T> coeff1(RO, E1, E2, 1, wavCoeff.begin()+ii*RO*E1*E2*W+(7*n+3)*RO*E1*E2);
+                Gadgetron::scal(scaleFactor, coeff1);
+
+                hoNDArray<T> coeff2(RO, E1, E2, 1, wavCoeff.begin()+ii*RO*E1*E2*W+(7*n+5)*RO*E1*E2);
+                Gadgetron::scal(scaleFactor, coeff2);
+
+                hoNDArray<T> coeff3(RO, E1, E2, 1, wavCoeff.begin()+ii*RO*E1*E2*W+(7*n+7)*RO*E1*E2);
+                Gadgetron::scal(scaleFactor, coeff3);
+            }
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWavelet3DOperator<T>::secondDimensionScale(hoNDArray<T>& wavCoeff, T& scaleFactor) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+thirdDimensionScale(hoNDArray<T>& wavCoeff, T& scaleFactor)
+{
+    try
+    {
+        size_t RO = wavCoeff.get_size(0);
+        size_t E1 = wavCoeff.get_size(1);
+        size_t E2 = wavCoeff.get_size(2);
+        size_t W = wavCoeff.get_size(3);
+
+        size_t num = wavCoeff.get_number_of_elements()/(RO*E1*E2*W);
+
+        // coeff 4, 5, 6, 7 are for E2 high frequency
+        size_t ii;
+        for ( ii=0; ii<num; ii++ )
+        {
+            for ( size_t n=0; n<numOfWavLevels_; n++ )
+            {
+                hoNDArray<T> coeff(RO, E1, E2, 4, wavCoeff.begin()+ii*RO*E1*E2*W+(7*n+4)*RO*E1*E2);
+                Gadgetron::scal(scaleFactor, coeff);
+            }
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWavelet3DOperator<T>::thirdDimensionScale(hoNDArray<T>& wavCoeff, T& scaleFactor) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+void gtPlusWavelet3DOperator<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD wavelet 3D operator -----------------------" << endl;
+    os << "Wavelet operator for gtPlus ISMRMRD package" << endl;
+    os << "-------------------------------------------------------------------------" << endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusWaveletNoNullSpace2DOperator.h b/toolboxes/gtplus/algorithm/gtPlusWaveletNoNullSpace2DOperator.h
new file mode 100644
index 0000000..50640c6
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusWaveletNoNullSpace2DOperator.h
@@ -0,0 +1,119 @@
+/** \file       gtPlusWaveletNoNullSpace2DOperator.h
+    \brief      Implement the 2D wavelet operator for cases without a null space
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusWavelet2DOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusWaveletNoNullSpace2DOperator : public gtPlusWavelet2DOperator<T>
+{
+public:
+
+    typedef gtPlusWavelet2DOperator<T> BaseClass;
+
+    gtPlusWaveletNoNullSpace2DOperator();
+    virtual ~gtPlusWaveletNoNullSpace2DOperator();
+
+    virtual void printInfo(std::ostream& os);
+
+    // if the sensitivity S is set, compute gradient of ||wav*S'*F'*x||1
+    // if not, compute gradient of ||wav*F'*x||1
+    // x represents the unacquired kspace points [RO E1 CHA]
+    virtual bool grad(const hoNDArray<T>& x, hoNDArray<T>& g);
+
+    // if the sensitivity S is set, compute the cost value of the L1 norm ||wav*S'*F'*x||1
+    // if not, compute the cost value of the L1 norm ||wav*F'*x||1
+    virtual bool obj(const hoNDArray<T>& x, T& obj);
+
+    using BaseClass::scale_factor_first_dimension_;
+    using BaseClass::scale_factor_second_dimension_;
+    using BaseClass::numOfWavLevels_;
+    using BaseClass::with_approx_coeff_;
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+
+protected:
+
+    // helper memory
+    using BaseClass::kspace_;
+    using BaseClass::complexIm_;
+    using BaseClass::res_after_apply_kernel_;
+    using BaseClass::res_after_apply_kernel_sum_over_;
+
+    using BaseClass::wav_coeff_norm_;
+    using BaseClass::kspace_wav_;
+    using BaseClass::complexIm_wav_;
+
+    using BaseClass::kspace_Managed_;
+    using BaseClass::complexIm_Managed_;
+    using BaseClass::res_after_apply_kernel_Managed_;
+    using BaseClass::res_after_apply_kernel_sum_over_Managed_;
+};
+
+template <typename T> 
+gtPlusWaveletNoNullSpace2DOperator<T>::gtPlusWaveletNoNullSpace2DOperator() : BaseClass()
+{
+
+}
+
+template <typename T> 
+gtPlusWaveletNoNullSpace2DOperator<T>::~gtPlusWaveletNoNullSpace2DOperator()
+{
+}
+
+template <typename T> 
+bool gtPlusWaveletNoNullSpace2DOperator<T>::
+grad(const hoNDArray<T>& x, hoNDArray<T>& g)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(this->gradTask(x, g));
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWaveletNoNullSpace2DOperator<T>::grad(const hoNDArray<T>& x, hoNDArray<T>& g) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusWaveletNoNullSpace2DOperator<T>::
+obj(const hoNDArray<T>& x, T& obj)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(this->objTask(x, obj));
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWaveletNoNullSpace2DOperator<T>::obj(const hoNDArray<T>& x, T& obj) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+void gtPlusWaveletNoNullSpace2DOperator<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD wavelet 2D operator --------------------" << endl;
+    os << "Wavelet 2D operator for gtPlus ISMRMRD package" << endl;
+    os << "----------------------------------------------------------------------" << endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusWaveletNoNullSpace3DOperator.h b/toolboxes/gtplus/algorithm/gtPlusWaveletNoNullSpace3DOperator.h
new file mode 100644
index 0000000..7503687
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusWaveletNoNullSpace3DOperator.h
@@ -0,0 +1,121 @@
+/** \file       gtPlusWaveletNoNullSpace3DOperator.h
+    \brief      Implement the 3D wavelet operator for cases without a null space
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusWavelet3DOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusWaveletNoNullSpace3DOperator : public gtPlusWavelet3DOperator<T>
+{
+public:
+
+    typedef gtPlusWavelet3DOperator<T> BaseClass;
+
+    gtPlusWaveletNoNullSpace3DOperator();
+    virtual ~gtPlusWaveletNoNullSpace3DOperator();
+
+    virtual void printInfo(std::ostream& os);
+
+    // if the sensitivity S is set, compute gradient of ||wav*S'*F'*x||1
+    // if not, compute gradient of ||wav*F'*x||1
+    // x represents the unacquired kspace points [RO E1 CHA E2]
+    virtual bool grad(const hoNDArray<T>& x, hoNDArray<T>& g);
+
+    // if the sensitivity S is set, compute the cost value of the L1 norm ||wav*S'*F'*x||1
+    // if not, compute the cost value of the L1 norm ||wav*F'*x||1
+    virtual bool obj(const hoNDArray<T>& x, T& obj);
+
+    using BaseClass::scale_factor_first_dimension_;
+    using BaseClass::scale_factor_second_dimension_;
+    using BaseClass::scale_factor_third_dimension_;
+    using BaseClass::change_coeffcients_third_dimension_boundary_;
+    using BaseClass::numOfWavLevels_;
+    using BaseClass::with_approx_coeff_;
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+
+protected:
+
+    // helper memory
+    using BaseClass::kspace_;
+    using BaseClass::complexIm_;
+    using BaseClass::res_after_apply_kernel_;
+    using BaseClass::res_after_apply_kernel_sum_over_;
+
+    using BaseClass::wav_coeff_norm_;
+    using BaseClass::kspace_wav_;
+    using BaseClass::complexIm_wav_;
+
+    using BaseClass::kspace_Managed_;
+    using BaseClass::complexIm_Managed_;
+    using BaseClass::res_after_apply_kernel_Managed_;
+    using BaseClass::res_after_apply_kernel_sum_over_Managed_;
+};
+
+template <typename T> 
+gtPlusWaveletNoNullSpace3DOperator<T>::gtPlusWaveletNoNullSpace3DOperator() : BaseClass()
+{
+
+}
+
+template <typename T> 
+gtPlusWaveletNoNullSpace3DOperator<T>::~gtPlusWaveletNoNullSpace3DOperator()
+{
+}
+
+template <typename T> 
+inline bool gtPlusWaveletNoNullSpace3DOperator<T>::
+grad(const hoNDArray<T>& x, hoNDArray<T>& g)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(this->gradTask(x, g));
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWaveletNoNullSpace3DOperator<T>::grad(const hoNDArray<T>& x, hoNDArray<T>& g) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+inline bool gtPlusWaveletNoNullSpace3DOperator<T>::
+obj(const hoNDArray<T>& x, T& obj)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(this->objTask(x, obj));
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWaveletNoNullSpace3DOperator<T>::obj(const hoNDArray<T>& x, T& obj) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+void gtPlusWaveletNoNullSpace3DOperator<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD wavelet 3D operator -----------------------" << endl;
+    os << "Wavelet operator for gtPlus ISMRMRD package" << endl;
+    os << "-------------------------------------------------------------------------" << endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusWaveletOperator.h b/toolboxes/gtplus/algorithm/gtPlusWaveletOperator.h
new file mode 100644
index 0000000..9f0835b
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusWaveletOperator.h
@@ -0,0 +1,746 @@
+/** \file       gtPlusWaveletOperator.h
+    \brief      Implement wavelet operator for L1 regularization
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusWaveletOperator : public gtPlusOperator<T>
+{
+public:
+
+    typedef gtPlusOperator<T> BaseClass;
+    typedef typename BaseClass::value_type value_type;
+
+    gtPlusWaveletOperator();
+    virtual ~gtPlusWaveletOperator();
+
+    virtual void printInfo(std::ostream& os);
+
+    // forward operator
+    virtual bool forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y) = 0;
+
+    // adjoint operator
+    virtual bool adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y) = 0;
+
+    // compute L1 norm of wavelet coefficients across CHA
+    // wavCoeff: [RO E1 W CHA ...], where W is the wavelet coefficient dimension (e.g. for a 1-level decomposition, W=4 in 2D and W=8 in 3D)
+    // the first block along W holds the lowest-frequency (approximation) coefficients
+    virtual bool L1Norm(const hoNDArray<T>& wavCoeff, hoNDArray<T>& wavCoeffNorm);
+    virtual bool L1NormTotal(const hoNDArray<T>& wavCoeff, hoNDArray<T>& wavCoeffNorm, T& L1CoeffNorm);
+
+    // to compute the gradient of the wavelet term, divide the wavelet coefficients by their norm
+    // if processApproxCoeff = true, the lowest-frequency (approximation) coefficients are processed as well; otherwise they remain unchanged
+    virtual bool divideWavCoeffByNorm(hoNDArray<T>& wavCoeff, const hoNDArray<T>& wavCoeffNorm, T mu, T p, bool processApproxCoeff=false);
+
+    // soft-threshold (shrink) the wavelet coefficients
+    // the threshold actually applied is mask.*thres
+    virtual bool shrinkWavCoeff(hoNDArray<T>& wavCoeff, const hoNDArray<T>& wavCoeffNorm, value_type thres, const hoNDArray<T>& mask, bool processApproxCoeff=false);
+    virtual bool proximity(hoNDArray<T>& wavCoeff, value_type thres);
+
+    // if the sensitivity S is set, compute gradient of ||wav*S'*F'*(Dc'x+D'y)||1
+    // if not, compute gradient of ||wav*F'*(Dc'x+D'y)||1
+    // x represents the unacquired kspace points [RO E1 CHA]
+    virtual bool grad(const hoNDArray<T>& x, hoNDArray<T>& g);
+
+    // if the sensitivity S is set, compute the cost value of the L1 norm ||wav*S'*F'*(Dc'x+D'y)||1
+    // if not, compute the cost value of the L1 norm ||wav*F'*(Dc'x+D'y)||1
+    virtual bool obj(const hoNDArray<T>& x, T& obj);
+
+    // number of transformation levels
+    size_t numOfWavLevels_;
+
+    // whether to include low frequency approximation coefficients
+    bool with_approx_coeff_;
+
+    T scale_factor_first_dimension_;
+    T scale_factor_second_dimension_;
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+
+public:
+
+    // convert to image domain or back to kspace
+    virtual bool convertToImage(const hoNDArray<T>& x, hoNDArray<T>& im);
+    virtual bool convertToKSpace(const hoNDArray<T>& im, hoNDArray<T>& x);
+
+    // compute gradient on the assembled kspace
+    virtual bool gradTask(const hoNDArray<T>& x, hoNDArray<T>& g);
+
+    // compute the obj on the assembled kspace
+    virtual bool objTask(const hoNDArray<T>& x, T& obj);
+
+    // utility functions
+    void scal(size_t N, float a, float* x);
+    void scal(size_t N, double a, double* x);
+    void scal(size_t N, std::complex<float> a, std::complex<float>* x);
+    void scal(size_t N, std::complex<double> a, std::complex<double>* x);
+
+    using BaseClass::acquired_points_;
+    using BaseClass::acquired_points_indicator_;
+    using BaseClass::unacquired_points_indicator_;
+    using BaseClass::coil_senMap_;
+
+    // helper memory
+    using BaseClass::kspace_;
+    using BaseClass::complexIm_;
+    using BaseClass::res_after_apply_kernel_;
+    using BaseClass::res_after_apply_kernel_sum_over_;
+
+    hoNDArray<T> wav_coeff_norm_;
+    hoNDArray<T> wav_coeff_norm_approx_;
+
+    hoNDArray<value_type> wav_coeff_norm_mag_;
+
+    hoNDArray<T> kspace_wav_;
+    hoNDArray<T> complexIm_wav_;
+    hoNDArray<T> complexIm_norm_;
+
+    using BaseClass::kspace_Managed_;
+    using BaseClass::complexIm_Managed_;
+    using BaseClass::res_after_apply_kernel_Managed_;
+    using BaseClass::res_after_apply_kernel_sum_over_Managed_;
+};
+
+template <typename T> 
+gtPlusWaveletOperator<T>::gtPlusWaveletOperator() : numOfWavLevels_(1), with_approx_coeff_(false), scale_factor_first_dimension_(1.0), scale_factor_second_dimension_(1.0), BaseClass()
+{
+
+}
+
+template <typename T> 
+gtPlusWaveletOperator<T>::~gtPlusWaveletOperator()
+{
+}
+
+template <typename T> 
+void gtPlusWaveletOperator<T>::scal(size_t N, float a, float* x)
+{
+    long long n;
+    #pragma omp parallel for default(none) private(n) shared(N, x, a) if (N>64*1024)
+    for (n = 0; n < (long long)N; n++)
+    {
+        x[n] *= a;
+    }
+}
+
+template <typename T> 
+void gtPlusWaveletOperator<T>::scal(size_t N, double a, double* x)
+{
+    long long n;
+    #pragma omp parallel for default(none) private(n) shared(N, x, a) if (N>64*1024)
+    for (n = 0; n < (long long)N; n++)
+    {
+        x[n] *= a;
+    }
+}
+
+template <typename T> 
+void gtPlusWaveletOperator<T>::scal(size_t N,  std::complex<float>  a,  std::complex<float> * x)
+{
+    long long n;
+
+    #pragma omp parallel for default(none) private(n) shared(N, x, a) if (N>64*1024)
+    for (n = 0; n < (long long)N; n++)
+    {
+        const  std::complex<float> & c = x[n];
+        const float re = c.real();
+        const float im = c.imag();
+
+        const float ar = a.real();
+        const float ai = a.imag();
+
+        reinterpret_cast<float(&)[2]>(x[n])[0] = re*ar-im*ai;
+        reinterpret_cast<float(&)[2]>(x[n])[1] = re*ai+im*ar;
+    }
+}
+
+template <typename T> 
+void gtPlusWaveletOperator<T>::scal(size_t N,  std::complex<double>  a,  std::complex<double> * x)
+{
+    long long n;
+
+    #pragma omp parallel for default(none) private(n) shared(N, x, a) if (N>64*1024)
+    for (n = 0; n < (long long)N; n++)
+    {
+        const  std::complex<double> & c = x[n];
+        const double re = c.real();
+        const double im = c.imag();
+
+        const double ar = a.real();
+        const double ai = a.imag();
+
+        reinterpret_cast<double(&)[2]>(x[n])[0] = re*ar-im*ai;
+        reinterpret_cast<double(&)[2]>(x[n])[1] = re*ai+im*ar;
+    }
+}
+
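+// The two complex overloads above write the product through a reinterpret_cast to a two-element
+// real array; std::complex<T> is layout-compatible with T[2], so this is equivalent to x[n] *= a
+// while keeping the loop body trivial for the OpenMP parallelization.
+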
+template <typename T> 
+bool gtPlusWaveletOperator<T>::
+L1Norm(const hoNDArray<T>& wavCoeff, hoNDArray<T>& wavCoeffNorm)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dims = wavCoeff.get_dimensions();
+
+        std::vector<size_t> dimR(*dims);
+        dimR[3] = 1;
+
+        if ( !wavCoeffNorm.dimensions_equal(&dimR) )
+        {
+            wavCoeffNorm.create(&dimR);
+        }
+
+        size_t RO = (*dims)[0];
+        size_t E1 = (*dims)[1];
+        size_t W = (*dims)[2];
+        size_t CHA = (*dims)[3];
+
+        // compute the squared magnitude of the coefficients
+        Gadgetron::multiplyConj(wavCoeff, wavCoeff, complexIm_norm_);
+        // sum over CHA
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(complexIm_norm_, wavCoeffNorm, 3));
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWaveletOperator<T>::L1Norm(const hoNDArray<T>& wavCoeff, hoNDArray<T>& wavCoeffNorm) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool gtPlusWaveletOperator<T>::
+L1NormTotal(const hoNDArray<T>& wavCoeff, hoNDArray<T>& wavCoeffNorm, T& L1CoeffNorm)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(this->L1Norm(wavCoeff, wavCoeffNorm));
+
+        Gadgetron::sqrt(wavCoeffNorm, wav_coeff_norm_approx_);
+
+        L1CoeffNorm = Gadgetron::asum(&wav_coeff_norm_approx_);
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWaveletOperator<T>::L1NormTotal(const hoNDArray<T>& wavCoeff, hoNDArray<T>& wavCoeffNorm, T& L1CoeffNorm) ... ");
+        return false;
+    }
+    return true;
+}
+
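+// Editor's note: in effect, for every (ro, e1, w) location the two functions above compute
+//
+//     wavCoeffNorm(ro, e1, w) = sum_cha |wavCoeff(ro, e1, w, cha)|^2
+//     L1CoeffNorm             = sum_{ro, e1, w} sqrt( wavCoeffNorm(ro, e1, w) )
+//
+// i.e. an L1 norm of the channel-combined wavelet coefficient magnitudes
+// (the notation above is illustrative shorthand, not upstream code).
+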
+template <typename T> 
+bool gtPlusWaveletOperator<T>::
+divideWavCoeffByNorm(hoNDArray<T>& wavCoeff, const hoNDArray<T>& wavCoeffNorm, T mu, T p, bool processApproxCoeff)
+{
+    try
+    {
+        size_t RO = wavCoeff.get_size(0);
+        size_t E1 = wavCoeff.get_size(1);
+        size_t W = wavCoeff.get_size(2);
+        size_t CHA = wavCoeff.get_size(3);
+
+        if ( !wav_coeff_norm_approx_.dimensions_equal( &wavCoeffNorm ) )
+        {
+            wav_coeff_norm_approx_.create( wavCoeffNorm.get_dimensions() );
+        }
+
+        long long ii;
+        long long N = (long long)wavCoeffNorm.get_number_of_elements();
+
+        const T* pCoeffNorm = wavCoeffNorm.begin();
+        T* pBuf = wav_coeff_norm_approx_.begin();
+
+        if ( std::abs(std::abs(p) - 1.0) < 0.001 )
+        {
+            #pragma omp parallel for default(none) private(ii) shared(N, pBuf, pCoeffNorm, mu)
+            for ( ii=0; ii<N; ii++ )
+            {
+                pBuf[ii] = (value_type)(1.0 / std::sqrt( pCoeffNorm[ii].real() + mu.real() ));
+            }
+        }
+        else
+        {
+            #pragma omp parallel for default(none) private(ii) shared(N, pBuf, pCoeffNorm, mu, p)
+            for ( ii=0; ii<N; ii++ )
+            {
+                pBuf[ii] = (value_type)std::pow( (double)(pCoeffNorm[ii].real() + mu.real()), (double)(p.real()/2.0-1.0) );
+            }
+        }
+
+        if ( processApproxCoeff )
+        {
+            size_t num = wavCoeff.get_number_of_elements() / (RO*E1*W*CHA);
+
+#pragma omp parallel default(none) private(ii) shared(RO, E1, num, wavCoeffNorm, wavCoeff, W, CHA) if ( num > 1 )
+            {
+
+#pragma omp for
+                for (ii = 0; ii<(long long)num; ii++)
+                {
+                    hoNDArray<T> wavCoeffNormCurr(RO, E1, W, wav_coeff_norm_approx_.begin() + ii*RO*E1*W);
+
+                    for (size_t cha = 0; cha<CHA; cha++)
+                    {
+                        hoNDArray<T> wavCoeffCurr(RO, E1, W, wavCoeff.begin() + ii*RO*E1*W*CHA + cha*RO*E1*W);
+                        Gadgetron::multiply(wavCoeffNormCurr, wavCoeffCurr, wavCoeffCurr);
+                    }
+                }
+            }
+        }
+        else
+        {
+            size_t num = wavCoeff.get_number_of_elements()/(RO*E1*W*CHA);
+
+            #pragma omp parallel default(none) private(ii) shared(RO, E1, num, wavCoeffNorm, wavCoeff, W, CHA) if ( num > 1 )
+            {
+
+                #pragma omp for
+                for ( ii=0; ii<(long long)num; ii++ )
+                {
+                    hoNDArray<T> wavCoeffNormCurr(RO, E1, W-1, wav_coeff_norm_approx_.begin()+ii*RO*E1*W+RO*E1);
+
+                    for ( size_t cha=0; cha<CHA; cha++ )
+                    {
+                        hoNDArray<T> wavCoeffCurr(RO, E1, W-1, wavCoeff.begin()+ii*RO*E1*W*CHA+cha*RO*E1*W+RO*E1);
+                        Gadgetron::multiply(wavCoeffNormCurr, wavCoeffCurr, wavCoeffCurr);
+                    }
+                }
+            }
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWaveletOperator<T>::divideWavCoeffByNorm(hoNDArray<T>& wavCoeff, const hoNDArray<T>& wavCoeffNorm, T mu, T p, bool processApproxCoeff) ... ");
+        return false;
+    }
+    return true;
+}
+
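+// Editor's note on the weighting computed above: with n = wavCoeffNorm (the
+// channel-summed squared magnitude from L1Norm) and regularization mu, every
+// coefficient is scaled by
+//
+//     (n + mu)^(p/2 - 1),   which reduces to 1/sqrt(n + mu) for p = 1,
+//
+// i.e. the reweighting used for an approximate L_p (p close to 1) wavelet penalty.
+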
+template <typename T> 
+bool gtPlusWaveletOperator<T>::
+proximity(hoNDArray<T>& wavCoeff, value_type thres)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(this->L1Norm(wavCoeff, wav_coeff_norm_));
+        hoNDArray<T> mask;
+
+        GADGET_CHECK_RETURN_FALSE(this->shrinkWavCoeff(wavCoeff, wav_coeff_norm_, thres, mask, with_approx_coeff_));
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWaveletOperator<T>::proximity(hoNDArray<T>& wavCoeff, value_type thres) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool gtPlusWaveletOperator<T>::
+shrinkWavCoeff(hoNDArray<T>& wavCoeff, const hoNDArray<T>& wavCoeffNorm, value_type thres, const hoNDArray<T>& mask, bool processApproxCoeff)
+{
+    try
+    {
+        size_t RO = wavCoeff.get_size(0);
+        size_t E1 = wavCoeff.get_size(1);
+        size_t W = wavCoeff.get_size(2);
+        size_t CHA = wavCoeff.get_size(3);
+
+        if ( !wav_coeff_norm_approx_.dimensions_equal(&wavCoeffNorm) )
+        {
+            wav_coeff_norm_approx_.create(wavCoeffNorm.get_dimensions());
+        }
+
+        if ( !res_after_apply_kernel_.dimensions_equal(&wavCoeffNorm) )
+        {
+            res_after_apply_kernel_.create(wavCoeffNorm.get_dimensions());
+        }
+
+        long long ii;
+        long long N = (long long)wavCoeffNorm.get_number_of_elements();
+        long long N3D = RO*E1*W;
+
+        long long num = N/N3D;
+
+        const T* pCoeffNorm = wavCoeffNorm.begin();
+        T* pMag = wav_coeff_norm_approx_.begin();
+        T* pMagInv = res_after_apply_kernel_.begin();
+
+        #pragma omp parallel for default(none) private(ii) shared(N, pMag, pMagInv, pCoeffNorm)
+        for ( ii=0; ii<N; ii++ )
+        {
+            pMag[ii] = (value_type)std::sqrt( pCoeffNorm[ii].real() );
+            pMagInv[ii] = (value_type)(1.0/(pMag[ii].real()+DBL_EPSILON));
+        }
+
+        // the phase does not change: store coeff/|coeff| (the unit-magnitude phase term) in complexIm_
+#pragma omp parallel default(none) private(ii) shared(RO, E1, num, wavCoeff, W, CHA) if ( num > 1 )
+        {
+#pragma omp for
+            for (ii = 0; ii<num; ii++)
+            {
+                hoNDArray<T> MagInvCurr(RO, E1, W, res_after_apply_kernel_.begin() + ii*RO*E1*W);
+
+                for (size_t cha = 0; cha<CHA; cha++)
+                {
+                    hoNDArray<T> wavCoeffCurr(RO, E1, W, wavCoeff.begin() + ii*RO*E1*W*CHA + cha*RO*E1*W);
+                    hoNDArray<T> resCurr(RO, E1, W, complexIm_.begin() + ii*RO*E1*W*CHA + cha*RO*E1*W);
+
+                    Gadgetron::multiply(MagInvCurr, wavCoeffCurr, resCurr);
+                }
+            }
+        }
+
+        // shrink the magnitude
+        if ( mask.dimensions_equal(&wavCoeffNorm) )
+        {
+            const T* pMask = mask.begin();
+
+            long long n = 0;
+            for ( n=0; n<num; n++ )
+            {
+                long long s=RO*E1; 
+                if ( processApproxCoeff )
+                {
+                    s = 0;
+                }
+
+                const T* pMaskCurr = pMask + n*N3D;
+                T* pMagCurr = pMag + n*N3D;
+
+                long long nn;
+
+                #pragma omp parallel for private(nn) shared(s, N3D, pMagCurr, pMaskCurr, thres)
+                for ( nn=s; nn<N3D; nn++ )
+                {
+                    // if ( std::abs(pMagCurr[nn]) < std::abs(thres*pMaskCurr[nn]) )
+                    if ( pMagCurr[nn].real() < thres*pMaskCurr[nn].real() )
+                    {
+                        pMagCurr[nn] = 0;
+                    }
+                    else
+                    {
+                        pMagCurr[nn] -= thres*pMaskCurr[nn];
+                    }
+                }
+            }
+        }
+        else
+        {
+            long long n = 0;
+            for ( n=0; n<num; n++ )
+            {
+                long long s=RO*E1; 
+                if ( processApproxCoeff )
+                {
+                    s = 0;
+                }
+
+                T* pMagCurr = pMag + n*N3D;
+
+                long long nn;
+                #pragma omp parallel for private(nn) shared(s, N3D, pMagCurr, thres)
+                for ( nn=s; nn<N3D; nn++ )
+                {
+                    // if ( std::abs(pMagCurr[nn]) < std::abs(thres) )
+                    if ( pMagCurr[nn].real() < thres )
+                    {
+                        pMagCurr[nn] = 0;
+                    }
+                    else
+                    {
+                        pMagCurr[nn] -= thres;
+                    }
+                }
+            }
+        }
+
+        size_t W_Start = W;
+        if ( processApproxCoeff )
+        {
+            num = wavCoeff.get_number_of_elements() / (RO*E1*W*CHA);
+
+#pragma omp parallel default(none) private(ii) shared(RO, E1, num, wavCoeff, W, CHA) if ( num > 1 )
+            {
+#pragma omp for
+                for (ii = 0; ii<num; ii++)
+                {
+                    hoNDArray<T> MagCurr(RO, E1, W, wav_coeff_norm_approx_.begin() + ii*RO*E1*W);
+
+                    for (size_t cha = 0; cha<CHA; cha++)
+                    {
+                        hoNDArray<T> phaseCurr(RO, E1, W, complexIm_.begin() + ii*RO*E1*W*CHA + cha*RO*E1*W);
+                        hoNDArray<T> wavCoeffCurr(RO, E1, W, wavCoeff.begin() + ii*RO*E1*W*CHA + cha*RO*E1*W);
+
+                        Gadgetron::multiply(MagCurr, phaseCurr, wavCoeffCurr);
+                    }
+                }
+            }
+        }
+        else
+        {
+            num = wavCoeff.get_number_of_elements()/(RO*E1*W*CHA);
+
+            #pragma omp parallel default(none) private(ii) shared(RO, E1, num, wavCoeff, W, CHA) if ( num > 1 )
+            {
+                #pragma omp for
+                for ( ii=0; ii<num; ii++ )
+                {
+                    hoNDArray<T> MagCurr(RO, E1, W-1, wav_coeff_norm_approx_.begin()+ii*RO*E1*W+RO*E1);
+
+                    for ( size_t cha=0; cha<CHA; cha++ )
+                    {
+                        hoNDArray<T> phaseCurr(RO, E1, W-1, complexIm_.begin()+ii*RO*E1*W*CHA+cha*RO*E1*W+RO*E1);
+                        hoNDArray<T> wavCoeffCurr(RO, E1, W-1, wavCoeff.begin()+ii*RO*E1*W*CHA+cha*RO*E1*W+RO*E1);
+
+                        Gadgetron::multiply(MagCurr, phaseCurr, wavCoeffCurr);
+                    }
+                }
+            }
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWaveletOperator<T>::shrinkWavCoeff(hoNDArray<T>& wavCoeff, const hoNDArray<T>& wavCoeffNorm, T thres, const hoNDArray<T>& mask, bool processApproxCoeff) ... ");
+        return false;
+    }
+    return true;
+}
+
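+// A minimal sketch of the per-coefficient shrinkage rule applied by proximity()/
+// shrinkWavCoeff() above (editor's illustration only; the member functions operate
+// on whole hoNDArray buffers and use the joint across-channel magnitude):
+//
+//     #include <complex>
+//
+//     inline std::complex<float> soft_threshold(const std::complex<float>& w, float thres)
+//     {
+//         const float mag = std::abs(w);
+//         if (mag <= thres) return std::complex<float>(0.0f, 0.0f);
+//         return ((mag - thres) / mag) * w;   // shrink the magnitude, keep the phase
+//     }
+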
+template <typename T> 
+bool gtPlusWaveletOperator<T>::
+grad(const hoNDArray<T>& x, hoNDArray<T>& g)
+{
+    try
+    {
+        // D'y+Dc'x
+        //gt_timer1_.start("1");
+        Gadgetron::multiply(unacquired_points_indicator_, x, kspace_);
+        //gt_timer1_.stop();
+
+        //gt_timer1_.start("2");
+        Gadgetron::add(*acquired_points_, kspace_, kspace_);
+        //gt_timer1_.stop();
+
+        // compute the gradient on assembled kspace
+        GADGET_CHECK_RETURN_FALSE(this->gradTask(kspace_, g));
+
+        // only unacquired points are kept
+        //gt_timer1_.start("12");
+        Gadgetron::multiply(unacquired_points_indicator_, g, g);
+        //gt_timer1_.stop();
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWaveletOperator<T>::grad(const hoNDArray<T>& x, hoNDArray<T>& g) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusWaveletOperator<T>::
+obj(const hoNDArray<T>& x, T& obj)
+{
+    try
+    {
+        // D'y+Dc'x
+        //gt_timer1_.start("1");
+        Gadgetron::multiply(unacquired_points_indicator_, x, kspace_);
+        //gt_timer1_.stop();
+
+        //gt_timer1_.start("2");
+        Gadgetron::add(*acquired_points_, kspace_, kspace_);
+        //gt_timer1_.stop();
+
+        // compute the objective function on assembled kspace
+        GADGET_CHECK_RETURN_FALSE(this->objTask(kspace_, obj));
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWaveletOperator<T>::obj(const hoNDArray<T>& x, T& obj) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusWaveletOperator<T>::
+gradTask(const hoNDArray<T>& x, hoNDArray<T>& g)
+{
+    try
+    {
+        size_t RO = x.get_size(0);
+        size_t E1 = x.get_size(1);
+        size_t CHA = x.get_size(2);
+
+        // x to image domain
+        //gt_timer2_.start("3");
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(x, complexIm_));
+        //gt_timer2_.stop();
+
+        // compute the gradient
+        if ( coil_senMap_ && coil_senMap_->get_size(0)==RO && coil_senMap_->get_size(1)==E1 && coil_senMap_->get_size(2)==CHA )
+        {
+            // perform coil combination
+            //gt_timer2_.start("4");
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_complex_.coilCombine(complexIm_, *coil_senMap_, res_after_apply_kernel_));
+            //gt_timer2_.stop();
+
+            //gt_timer2_.start("5");
+            hoNDArray<T> combined(RO, E1, 1, res_after_apply_kernel_.begin());
+            //gt_timer2_.stop();
+
+            // compute wavelet transform
+            //gt_timer2_.start("6");
+            GADGET_CHECK_RETURN_FALSE(this->forwardOperator(combined, res_after_apply_kernel_sum_over_));
+            //gt_timer2_.stop();
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(this->forwardOperator(complexIm_, res_after_apply_kernel_sum_over_));
+        }
+
+        // modify coefficients
+        //gt_timer2_.start("7");
+        GADGET_CHECK_RETURN_FALSE(this->L1Norm(res_after_apply_kernel_sum_over_, wav_coeff_norm_));
+        //gt_timer2_.stop();
+
+        //gt_timer2_.start("8");
+        GADGET_CHECK_RETURN_FALSE(this->divideWavCoeffByNorm(res_after_apply_kernel_sum_over_, wav_coeff_norm_, T( (value_type)1e-15), T( (value_type)1.0 ), with_approx_coeff_));
+        //gt_timer2_.stop();
+
+        // go back to image
+        //gt_timer2_.start("9");
+        GADGET_CHECK_RETURN_FALSE(this->adjointOperator(res_after_apply_kernel_sum_over_, complexIm_wav_));
+        //gt_timer2_.stop();
+
+        if ( coil_senMap_ && coil_senMap_->get_size(0)==RO && coil_senMap_->get_size(1)==E1 && coil_senMap_->get_size(2)==CHA )
+        {
+            // apply coil sensitivity
+            //gt_timer2_.start("10");
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(*coil_senMap_, complexIm_wav_, kspace_wav_));
+            //gt_timer2_.stop();
+
+            // go to kspace
+            //gt_timer2_.start("11");
+            GADGET_CHECK_RETURN_FALSE(this->convertToKSpace(kspace_wav_, g));
+            //gt_timer2_.stop();
+        }
+        else
+        {
+            // go to kspace
+            GADGET_CHECK_RETURN_FALSE(this->convertToKSpace(complexIm_wav_, g));
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWaveletOperator<T>::gradTask(const hoNDArray<T>& x, hoNDArray<T>& g) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusWaveletOperator<T>::
+objTask(const hoNDArray<T>& x, T& obj)
+{
+    try
+    {
+        size_t RO = x.get_size(0);
+        size_t E1 = x.get_size(1);
+        size_t CHA = x.get_size(2);
+
+        // x to image domain
+        //gt_timer3_.start("3");
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(x, complexIm_));
+        //gt_timer3_.stop();
+
+        // apply sensitivity
+        if ( coil_senMap_ && coil_senMap_->get_size(0)==RO && coil_senMap_->get_size(1)==E1 && coil_senMap_->get_size(2)==CHA )
+        {
+            // perform coil combination
+            //gt_timer3_.start("4");
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_complex_.coilCombine(complexIm_, *coil_senMap_, res_after_apply_kernel_));
+            //gt_timer3_.stop();
+
+            //gt_timer3_.start("5");
+            hoNDArray<T> combined(RO, E1, 1, res_after_apply_kernel_.begin());
+            //gt_timer3_.stop();
+
+            // compute wavelet transform
+            //gt_timer3_.start("6");
+            GADGET_CHECK_RETURN_FALSE(this->forwardOperator(combined, res_after_apply_kernel_sum_over_));
+            //gt_timer3_.stop();
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(this->forwardOperator(complexIm_, res_after_apply_kernel_sum_over_));
+        }
+
+        //gt_timer3_.start("7");
+        GADGET_CHECK_RETURN_FALSE(this->L1NormTotal(res_after_apply_kernel_sum_over_, wav_coeff_norm_, obj));
+        //gt_timer3_.stop();
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusWaveletOperator<T>::objTask(const hoNDArray<T>& x, T& obj) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+inline bool gtPlusWaveletOperator<T>::convertToImage(const hoNDArray<T>& x, hoNDArray<T>& im)
+{
+    if ( !complexIm_Managed_.dimensions_equal(&x) )
+    {
+        complexIm_Managed_.create(x.get_dimensions());
+    }
+
+    Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(x, im, complexIm_Managed_);
+
+    return true;
+}
+
+template <typename T> 
+inline bool gtPlusWaveletOperator<T>::convertToKSpace(const hoNDArray<T>& im, hoNDArray<T>& x)
+{
+    if ( !kspace_Managed_.dimensions_equal(&im) )
+    {
+        kspace_Managed_.create(im.get_dimensions());
+    }
+
+    Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(im, x, kspace_Managed_);
+
+    return true;
+}
+
+template <typename T> 
+void gtPlusWaveletOperator<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD wavelet operator -----------------------" << endl;
+    os << "Wavelet operator for gtPlus ISMRMRD package" << endl;
+    os << "----------------------------------------------------------------------" << endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/matlab/gtMatlab.h b/toolboxes/gtplus/matlab/gtMatlab.h
new file mode 100644
index 0000000..b465bdd
--- /dev/null
+++ b/toolboxes/gtplus/matlab/gtMatlab.h
@@ -0,0 +1,88 @@
+/********************************************************************
+    created:    2013/10/03
+    created:    3:10:2013   16:15
+    author:     Hui Xue
+
+    purpose:    Header to enable Matlab print-out of info
+*********************************************************************/
+
+#pragma once 
+
+#include <sstream>
+#include <strstream>
+
+#ifdef GDEBUG_STREAM
+    #undef GDEBUG_STREAM
+#endif // GDEBUG_STREAM
+
+#ifdef GERROR_STREAM
+    #undef GERROR_STREAM
+#endif // GERROR_STREAM
+
+#ifdef GWARN_STREAM
+    #undef GWARN_STREAM
+#endif // GWARN_STREAM
+
+#ifdef _DEBUG
+    #define GDEBUG_STREAM(message) { std::ostrstream outs; outs << " (" << __FILE__ << ", " << __LINE__ << "): " << message << std::endl << '\0'; mexPrintf("%s", outs.str()); }
+#else
+    #define GDEBUG_STREAM(message) { std::ostrstream outs; outs << message << std::endl << '\0'; mexPrintf("%s", outs.str()); }
+#endif // _DEBUG
+
+#ifdef _DEBUG
+    #define GWARN_STREAM(message) { std::ostrstream outs; outs << " (" << __FILE__ << ", " << __LINE__ << "): " << message << std::endl << '\0'; mexWarnMsgTxt(outs.str()); }
+#else
+    #define GWARN_STREAM(message) { std::ostrstream outs; outs << message << std::endl << '\0'; mexWarnMsgTxt(outs.str()); }
+#endif // _DEBUG
+
+#define GERROR_STREAM(message) GDEBUG_STREAM(message) 
+
+#ifdef GADGET_CHECK_RETURN_FALSE
+    #undef GADGET_CHECK_RETURN_FALSE
+#endif // GADGET_CHECK_RETURN_FALSE
+#define GADGET_CHECK_RETURN_FALSE(con) { if ( ! (con) ) { return false; } }
+
+#ifdef GADGET_DEBUG_MODE
+#define GADGET_DEBUG_CHECK_THROW(con) GADGET_CHECK_THROW(con)
+#define GADGET_DEBUG_CHECK_RETURN(con, value) GADGET_CHECK_RETURN(con, value)
+#define GADGET_DEBUG_CHECK_RETURN_FALSE(con) GADGET_CHECK_RETURN_FALSE(con)
+#else
+#define GADGET_DEBUG_CHECK_THROW(con)
+#define GADGET_DEBUG_CHECK_RETURN(con, value)
+#define GADGET_DEBUG_CHECK_RETURN_FALSE(con)
+#endif // GADGET_DEBUG_MODE
+
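+/*
+    Illustrative usage (editor's note, not part of the upstream sources): once this
+    header is included in a mex source file, the usual Gadgetron logging macros print
+    through the Matlab runtime, e.g.
+
+        GDEBUG_STREAM("processing slice " << slice << " of " << nSlices);
+        GWARN_STREAM("coil map not provided");
+
+    which reach the Matlab command window via mexPrintf / mexWarnMsgTxt; the variable
+    names above are placeholders.
+*/
+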
+template <typename ObjType> void matlab_printInfo(const ObjType& obj)
+{
+    std::ostrstream outs;
+    obj.print(outs);
+    outs << std::ends;
+    std::string msg(outs.str());
+    GDEBUG_STREAM(msg.c_str());
+}
+
+inline void printAuthorInfo(std::stringstream& outs)
+{
+    using namespace std;
+    outs << "---------------------------------------------------------------------" << endl;
+    outs << "This software is made by: " << endl;
+    outs << endl;
+    outs << "\t\tHui Xue " << endl;
+    outs << "Magnetic Resonance Technology Program" << endl;
+    outs << "National Heart, Lung and Blood Institute" << endl;
+    outs << "National Institutes of Health" << endl;
+    outs << "Email: hui.xue at nih.gov" << endl;
+    outs << endl;
+    outs << "\t\tPeter Kellman " << endl;
+    outs << "Medical Signal and Image Processing Program" << endl;
+    outs << "National Heart, Lung and Blood Institute" << endl;
+    outs << "National Institutes of Health" << endl;
+    outs << "Email: kellmanp at nhlbi.nih.gov" << endl;
+    outs << endl;
+    outs << "\t\tMichael Hansen " << endl;
+    outs << "Medical Signal and Image Processing Program" << endl;
+    outs << "National Heart, Lung and Blood Institute" << endl;
+    outs << "National Institutes of Health" << endl;
+    outs << "Email: michael.hansen at nih.gov" << endl;
+    outs << "---------------------------------------------------------------------" << endl;
+}
diff --git a/toolboxes/gtplus/matlab/gtMatlabConverter.h b/toolboxes/gtplus/matlab/gtMatlabConverter.h
new file mode 100644
index 0000000..64b81bb
--- /dev/null
+++ b/toolboxes/gtplus/matlab/gtMatlabConverter.h
@@ -0,0 +1,266 @@
+/********************************************************************
+    created:    2013/10/03
+    created:    3:10:2013   14:06
+    author:     Hui Xue
+
+    purpose:    Gadgetron data structure to matlab conversion
+*********************************************************************/
+
+#pragma once
+
+#include "gtMatlabImage.h"
+
+namespace Gadgetron
+{
+
+template <typename T> 
+class gtMatlabConverter
+{
+public:
+
+    typedef gtMatlabConverter<T> Self;
+
+    gtMatlabConverter() {}
+    virtual ~gtMatlabConverter() {}
+
+    virtual bool hoNDArray2Matlab(const hoNDArray<T>& a, mxArray*& aMx);
+    virtual bool Matlab2hoNDArray(const mxArray* aMx, hoNDArray<T>& a);
+
+    virtual bool Vec2Matlab(const std::vector<T>& vec, mxArray*& aMx);
+    virtual bool Matlab2Vec(const mxArray* aMx, std::vector<T>& vec);
+
+    virtual bool Str2Matlab(const std::string& str, mxArray*& aMx);
+    virtual bool Matlab2Str(const mxArray* aMx, std::string& str);
+
+    template <unsigned int D> 
+    bool hoNDImage2Matlab(const hoNDImage<T, D>& a, mxArray*& aMx, mxArray*& aHeader)
+    {
+        std::vector<size_t> dim(D);
+        a.get_dimensions(dim);
+
+        hoNDArray<T> buf(dim, const_cast<T*>(a.get_data_ptr()), false);
+        GADGET_CHECK_RETURN_FALSE(hoNDArray2Matlab(buf, aMx));
+
+        gtMatlabImageHeader<T, D> header(a);
+        GADGET_CHECK_RETURN_FALSE(header.toMatlab(aHeader));
+
+        return true;
+    }
+
+    template <unsigned int D> 
+    bool Matlab2hoNDImage(const mxArray* aMx, const mxArray* aHeader, hoNDImage<T, D>& a)
+    {
+        mwSize ndim = mxGetNumberOfDimensions(aMx);
+        if ( ndim != D ) return false;
+
+        hoNDArray<T> buf;
+        GADGET_CHECK_RETURN_FALSE(Matlab2hoNDArray(aMx, buf));
+        GADGET_CHECK_RETURN_FALSE(buf.get_number_of_dimensions()<=D);
+
+        a.from_NDArray(buf);
+
+        gtMatlabImageHeader<T, D> header;
+        GADGET_CHECK_RETURN_FALSE(header.fromMatlab(aHeader));
+
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            a.set_pixel_size(ii, header.pixelSize_[ii]);
+            a.set_origin(ii, header.origin_[ii]);
+            a.set_axis(ii, header.axis_[ii]);
+        }
+
+        return true;
+    }
+
+    virtual void printInfo(std::ostream& os) const;
+};
+
+template <typename T> 
+bool gtMatlabConverter<T>::
+hoNDArray2Matlab(const hoNDArray<T>& a, mxArray*& aMx)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dim = a.get_dimensions();
+
+        mwSize ndim = dim->size();
+        mwSize* dims = new mwSize[ndim];
+
+        mwSize ii;
+        for ( ii=0; ii<ndim; ii++ )
+        {
+            dims[ii] = static_cast<mwSize>( (*dim)[ii] );
+        }
+
+        mwSize N = a.get_number_of_elements();
+        const T* pA = a.begin();
+
+        if ( typeid(T) == typeid(float) )
+        {
+            aMx = mxCreateNumericArray(ndim, dims, mxSINGLE_CLASS, mxREAL);
+            float* ptr = static_cast<float*>(mxGetData(aMx));
+
+            for ( ii=0; ii<N; ii++ )
+            {
+                ptr[ii] = (float)(pA[ii]);
+            }
+        }
+        else
+        {
+            aMx = mxCreateNumericArray(ndim, dims, mxDOUBLE_CLASS, mxREAL);
+            double* ptr = static_cast<double*>(mxGetData(aMx));
+
+            for ( ii=0; ii<N; ii++ )
+            {
+                ptr[ii] = (double)(pA[ii]);
+            }
+        }
+
+        delete [] dims;
+    }
+    catch(...)
+    {
+        mexErrMsgTxt("Errors happened in gtMatlabConverter::hoNDArray2Matlab(const hoNDArray<T>& a, mxArray*& aMx) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtMatlabConverter<T>::
+Matlab2hoNDArray(const mxArray* aMx, hoNDArray<T>& a)
+{
+    try
+    {
+        mwSize ndim = mxGetNumberOfDimensions(aMx);
+        const mwSize* dims = mxGetDimensions(aMx);
+
+        std::vector<size_t> dim(ndim);
+
+        size_t ii;
+        for ( ii=0; ii<ndim; ii++ )
+        {
+            dim[ii] = static_cast<size_t>(dims[ii]);
+        }
+
+        a.create(&dim);
+        size_t N = a.get_number_of_elements();
+        T* pA = a.begin();
+
+        if ( mxIsSingle(aMx) )
+        {
+            float* ptr = static_cast<float*>(mxGetData(aMx));
+            for ( ii=0; ii<N; ii++ )
+            {
+                pA[ii] = static_cast<T>(ptr[ii]);
+            }
+        }
+        else
+        {
+            double* ptr = static_cast<double*>(mxGetData(aMx));
+            for ( ii=0; ii<N; ii++ )
+            {
+                pA[ii] = static_cast<T>(ptr[ii]);
+            }
+        }
+    }
+    catch(...)
+    {
+        mexErrMsgTxt("Errors happened in gtMatlabConverter::Matlab2hoNDArray(const mxArray* aMx, hoNDArray<T>& a) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtMatlabConverter<T>::
+Vec2Matlab(const std::vector<T>& vec, mxArray*& aMx)
+{
+    try
+    {
+        aMx = mxCreateNumericMatrix(vec.size(), 1, mxDOUBLE_CLASS, mxREAL);
+        double* ptr = static_cast<double*>(mxGetData(aMx));
+        for ( size_t ii=0; ii<vec.size(); ii++ )
+        {
+            ptr[ii] = static_cast<double>(vec[ii]);
+        }
+    }
+    catch(...)
+    {
+        mexErrMsgTxt("Errors happened in gtMatlabConverter::Vec2Matlab(const std::vector<T>& vec, mxArray*& aMx) ... ");
+        return false;
+    }
+
+    return true; 
+}
+
+template <typename T> 
+bool gtMatlabConverter<T>::
+Matlab2Vec(const mxArray* aMx, std::vector<T>& vec)
+{
+    try
+    {
+        mwSize N = mxGetNumberOfElements(aMx);
+        vec.resize(N);
+
+        if ( mxIsSingle(aMx) )
+        {
+            float* ptr = static_cast<float*>(mxGetData(aMx));
+            for ( size_t ii=0; ii<N; ii++ )
+            {
+                vec[ii] = static_cast<T>(ptr[ii]);
+            }
+        }
+        else
+        {
+            double* ptr = static_cast<double*>(mxGetData(aMx));
+            for ( size_t ii=0; ii<N; ii++ )
+            {
+                vec[ii] = static_cast<T>(ptr[ii]);
+            }
+        }
+    }
+    catch(...)
+    {
+        mexErrMsgTxt("Errors happened in gtMatlabConverter::Matlab2Vec(const mxArray* aMx, std::vector<T>& vec) ... ");
+        return false;
+    }
+
+    return true; 
+}
+
+template <typename T> 
+bool gtMatlabConverter<T>::
+Str2Matlab(const std::string& str, mxArray*& aMx)
+{
+    aMx = mxCreateString(str.c_str());
+    return (aMx != NULL);
+}
+
+template <typename T> 
+bool gtMatlabConverter<T>::
+Matlab2Str(const mxArray* aMx, std::string& str)
+{
+    mwSize N = mxGetNumberOfElements(aMx) + 1;
+
+    std::vector<char> buf(N, '\0');
+    if (mxGetString(aMx, &buf[0], N) != 0)
+    {
+        return false;
+    }
+    str = std::string(&buf[0]);
+
+    return true;
+}
+
+template <typename T> 
+void gtMatlabConverter<T>::printInfo(std::ostream& os) const
+{
+    using namespace std;
+    os << "--------------------------------------------------" << endl;
+    os << "Gadgetron matlab Converter ..." << endl;
+    os << "--------------------------------------------------" << endl;
+}
+
+}
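+
+/*
+    Illustrative mex gateway using this converter (editor's sketch; the surrounding
+    mex file and the variable names are assumptions, not upstream code):
+
+        #include "gtMatlabConverter.h"
+
+        void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[])
+        {
+            Gadgetron::gtMatlabConverter<float> converter;
+
+            Gadgetron::hoNDArray<float> a;
+            converter.Matlab2hoNDArray(prhs[0], a);    // Matlab -> Gadgetron
+
+            // ... process a ...
+
+            converter.hoNDArray2Matlab(a, plhs[0]);    // Gadgetron -> Matlab
+        }
+*/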
diff --git a/toolboxes/gtplus/matlab/gtMatlabConverterComplex.h b/toolboxes/gtplus/matlab/gtMatlabConverterComplex.h
new file mode 100644
index 0000000..dfda4b7
--- /dev/null
+++ b/toolboxes/gtplus/matlab/gtMatlabConverterComplex.h
@@ -0,0 +1,184 @@
+/********************************************************************
+    created:    2013/10/03
+    created:    3:10:2013   14:06
+    author:     Hui Xue
+
+    purpose:    Gadgetron complex data structure to matlab conversion
+*********************************************************************/
+
+#pragma once
+
+#include "gtMatlabImage.h"
+
+namespace Gadgetron
+{
+
+template <typename T> 
+class gtMatlabConverterComplex
+{
+public:
+
+    gtMatlabConverterComplex() {}
+    virtual ~gtMatlabConverterComplex() {}
+
+    virtual bool hoNDArray2Matlab(const hoNDArray<T>& a, mxArray*& aMx);
+    virtual bool Matlab2hoNDArray(const mxArray* aMx, hoNDArray<T>& a);
+
+    template <unsigned int D> 
+    bool hoNDImage2Matlab(const hoNDImage<T, D>& a, mxArray*& aMx, mxArray*& aHeader)
+    {
+        std::vector<size_t> dim(D);
+        a.get_dimensions(dim);
+
+        hoNDArray<T> buf(dim, const_cast<T*>(a.get_data_ptr()), false);
+        GADGET_CHECK_RETURN_FALSE(hoNDArray2Matlab(buf, aMx));
+
+        gtMatlabImageHeader<T, D> header(a);
+        GADGET_CHECK_RETURN_FALSE(header.toMatlab(aHeader));
+
+        return true;
+    }
+
+    template <unsigned int D> 
+    bool Matlab2hoNDImage(const mxArray* aMx, const mxArray* aHeader, hoNDImage<T, D>& a)
+    {
+        hoNDArray<T> buf;
+        GADGET_CHECK_RETURN_FALSE(Matlab2hoNDArray(aMx, buf));
+        GADGET_CHECK_RETURN_FALSE(buf.get_number_of_dimensions()==D);
+
+        a.from_NDArray(buf);
+
+        gtMatlabImageHeader<T, D> header;
+        GADGET_CHECK_RETURN_FALSE(header.fromMatlab(aHeader));
+
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            a.set_pixel_size(ii, header.pixelSize_[ii]);
+            a.set_origin(ii, header.origin_[ii]);
+            a.set_axis(ii, header.axis_[ii]);
+        }
+
+        return true;
+    }
+
+    virtual void printInfo(std::ostream& os) const;
+
+protected:
+};
+
+template <typename T> 
+bool gtMatlabConverterComplex<T>::
+hoNDArray2Matlab(const hoNDArray<T>& a, mxArray*& aMx)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dim = a.get_dimensions();
+
+        mwSize ndim = static_cast<mwSize>(dim->size());
+        mwSize* dims = new mwSize[ndim];
+
+        size_t ii;
+        for ( ii=0; ii<ndim; ii++ )
+        {
+            dims[ii] = static_cast<mwSize>( (*dim)[ii] );
+        }
+
+        size_t N = a.get_number_of_elements();
+        const T* pA = a.begin();
+
+        if ( typeid(T) == typeid(std::complex<float>) )
+        {
+            aMx = mxCreateNumericArray(ndim, dims, mxSINGLE_CLASS, mxCOMPLEX);
+            float* pr = static_cast<float*>(mxGetData(aMx));
+            float* pi = static_cast<float*>(mxGetImagData(aMx));
+
+            for ( ii=0; ii<N; ii++ )
+            {
+                pr[ii] = static_cast<float>(pA[ii].real());
+                pi[ii] = static_cast<float>(pA[ii].imag());
+            }
+        }
+        else if ( typeid(T) == typeid(std::complex<double>) )
+        {
+            aMx = mxCreateNumericArray(ndim, dims, mxDOUBLE_CLASS, mxCOMPLEX);
+            double* pr = static_cast<double*>(mxGetData(aMx));
+            double* pi = static_cast<double*>(mxGetImagData(aMx));
+
+            for ( ii=0; ii<N; ii++ )
+            {
+                pr[ii] = static_cast<double>(pA[ii].real());
+                pi[ii] = static_cast<double>(pA[ii].imag());
+            }
+        }
+
+        delete [] dims;
+    }
+    catch(...)
+    {
+        mexErrMsgTxt("Errors happened in gtMatlabConverterComplex::hoNDArray2Matlab(const hoNDArray<T>& a, mxArray*& aMx) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtMatlabConverterComplex<T>::
+Matlab2hoNDArray(const mxArray* aMx, hoNDArray<T>& a)
+{
+    try
+    {
+        mwSize ndim = mxGetNumberOfDimensions(aMx);
+        const mwSize* dims = mxGetDimensions(aMx);
+
+        std::vector<size_t> dim(ndim);
+
+        size_t ii;
+        for ( ii=0; ii<ndim; ii++ )
+        {
+            dim[ii] = static_cast<size_t>(dims[ii]);
+        }
+
+        a.create(&dim);
+        size_t N = a.get_number_of_elements();
+        T* pA = a.begin();
+
+        if ( mxIsComplex(aMx) && mxIsDouble(aMx) )
+        {
+            double* pr = static_cast<double*>(mxGetData(aMx));
+            double* pi = static_cast<double*>(mxGetImagData(aMx));
+
+            for ( ii=0; ii<N; ii++ )
+            {
+                pA[ii] = T(pr[ii], pi[ii]);
+            }
+        }
+        else if ( mxIsComplex(aMx) && mxIsSingle(aMx) )
+        {
+            float* pr = static_cast<float*>(mxGetData(aMx));
+            float* pi = static_cast<float*>(mxGetImagData(aMx));
+
+            for ( ii=0; ii<N; ii++ )
+            {
+                pA[ii] = T(pr[ii], pi[ii]);
+            }
+        }
+    }
+    catch(...)
+    {
+        mexErrMsgTxt("Errors happened in gtMatlabConverterComplex::Matlab2hoNDArray(const mxArray* aMx, hoNDArray<T>& a) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+void gtMatlabConverterComplex<T>::printInfo(std::ostream& os) const
+{
+    using namespace std;
+    os << "--------------------------------------------------" << endl;
+    os << "Gadgetron matlab Converter for complex arrays ..." << endl;
+    os << "--------------------------------------------------" << endl;
+}
+
+}
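+
+/*
+    Editor's note: in contrast to gtMatlabConverter, this converter expects T to be
+    std::complex<float> or std::complex<double> and uses the separate real/imaginary
+    storage of Matlab complex arrays (mxGetData / mxGetImagData). Illustrative only:
+
+        Gadgetron::gtMatlabConverterComplex< std::complex<float> > converter;
+        Gadgetron::hoNDArray< std::complex<float> > a;
+        converter.Matlab2hoNDArray(prhs[0], a);    // prhs[0] must be a complex mxArray
+*/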
diff --git a/toolboxes/gtplus/matlab/gtMatlabImage.h b/toolboxes/gtplus/matlab/gtMatlabImage.h
new file mode 100644
index 0000000..0f4e041
--- /dev/null
+++ b/toolboxes/gtplus/matlab/gtMatlabImage.h
@@ -0,0 +1,258 @@
+/********************************************************************
+    created:    2014/02/25
+    author:     Hui Xue
+
+    purpose:    Gadgetron data structure for ND image matlab conversion
+*********************************************************************/
+
+#pragma once
+
+#include <matrix.h>
+#include <mat.h>
+#include <mexGT.h>
+#include <cmath>
+#include <vector>
+#include <string>
+#include <iostream>
+#include <strstream>
+#include <algorithm>
+
+#include "hoNDArray.h"
+#include "hoNDImage.h"
+#include "gtMatlab.h"
+
+namespace Gadgetron
+{
+
+template <typename T, unsigned int D>
+class gtMatlabImageHeader
+{
+public:
+
+    typedef hoNDImage<T, D> ImageType;
+
+    typedef typename ImageType::value_type value_type;
+    typedef typename ImageType::coord_type coord_type;
+    typedef typename ImageType::a_axis_type a_axis_type;
+    typedef typename ImageType::axis_type axis_type;
+
+    coord_type pixelSize_[D];
+    coord_type origin_[D];
+    hoNDPoint<coord_type, D> axis_[D];
+
+    gtMatlabImageHeader();
+    gtMatlabImageHeader(const ImageType& im);
+    virtual ~gtMatlabImageHeader();
+
+    /// for the axis, it will be a D*D rotation matrix;
+    /// every column is an orientation vector for a dimension
+    virtual bool toMatlab(mxArray*& header);
+    virtual bool fromMatlab(const mxArray* header);
+
+protected:
+
+    // the header field names
+    std::vector<char*> header_fields_;
+
+    void set_header_fields()
+    {
+        size_t num = 3; // origin, pixelSize, axis
+        header_fields_.resize(num);
+        header_fields_[0] = "origin";
+        header_fields_[1] = "pixelSize";
+        header_fields_[2] = "axis";
+    }
+};
+
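+// Editor's note on the Matlab structure produced by toMatlab() below:
+//
+//     header.origin    : D-element single-precision vector
+//     header.pixelSize : D-element single-precision vector
+//     header.axis      : D x D single-precision matrix of direction cosines
+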
+template <typename T, unsigned int D>
+gtMatlabImageHeader<T, D>::gtMatlabImageHeader()
+{
+    unsigned int ii;
+    for (ii=0;ii<D; ii++)
+    {
+        pixelSize_[ii] = 1;
+        origin_[ii] = 0;
+        axis_[ii].fill(0);
+        axis_[ii][ii] = coord_type(1.0);
+    }
+
+    this->set_header_fields();
+}
+
+template <typename T, unsigned int D>
+gtMatlabImageHeader<T, D>::gtMatlabImageHeader(const ImageType& im)
+{
+    std::vector<coord_type> pixelSize;
+    im.get_pixel_size(pixelSize);
+
+    std::vector<coord_type> origin;
+    im.get_origin(origin);
+
+    axis_type axis;
+    im.get_axis(axis);
+
+    unsigned int ii;
+    for (ii=0;ii<D; ii++)
+    {
+        pixelSize_[ii] = pixelSize[ii];
+        origin_[ii] = origin[ii];
+        axis_[ii] = axis[ii];
+    }
+
+    this->set_header_fields();
+}
+
+template <typename T, unsigned int D>
+gtMatlabImageHeader<T, D>::~gtMatlabImageHeader()
+{
+
+}
+
+template <typename T, unsigned int D>
+bool gtMatlabImageHeader<T, D>::toMatlab(mxArray*& header)
+{
+    try
+    {
+        unsigned int ii, jj;
+
+        mwSize num[2] = {1, 1};
+        header = mxCreateStructArray(2, num, (int)header_fields_.size(), const_cast<const char**>(&header_fields_[0]));
+
+        mwSize dims[1];
+        dims[0] = D;
+
+        mxArray* aMx = mxCreateNumericArray(1, dims, mxSINGLE_CLASS, mxREAL);
+        float* pr = static_cast<float*>(mxGetData(aMx));
+        for ( ii=0; ii<D; ii++ )
+        {
+            pr[ii] = origin_[ii];
+        }
+
+        mxSetField(header, 0, header_fields_[0], aMx);
+
+        aMx = mxCreateNumericArray(1, dims, mxSINGLE_CLASS, mxREAL);
+        pr = static_cast<float*>(mxGetData(aMx));
+        for ( ii=0; ii<D; ii++ )
+        {
+            pr[ii] = pixelSize_[ii];
+        }
+
+        mxSetField(header, 0, header_fields_[1], aMx);
+
+        mwSize dimsAxis[2];
+        dimsAxis[0] = D;
+        dimsAxis[1] = D;
+
+        aMx = mxCreateNumericMatrix(D, D, mxSINGLE_CLASS, mxREAL);
+        pr = static_cast<float*>(mxGetData(aMx));
+        for ( jj=0; jj<D; jj++ )
+        {
+            for ( ii=0; ii<D; ii++ )
+            {
+                pr[jj + ii*D] = axis_[jj][ii]; // stored in column-wise
+            }
+        }
+
+        mxSetField(header, 0, header_fields_[2], aMx);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in gtMatlabImageHeader<T, D>::toMatlab(mxArray*& header) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, unsigned int D>
+bool gtMatlabImageHeader<T, D>::fromMatlab(const mxArray* header)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(mxIsStruct(header));
+
+        unsigned int ii, jj;
+
+        mxArray* aMx = mxGetField(header, 0, header_fields_[0]);
+        size_t N = mxGetNumberOfElements(aMx);
+
+        unsigned int minDN = ( (D<N) ? D : N );
+
+        if ( mxIsSingle(aMx) )
+        {
+            float* pr = static_cast<float*>(mxGetData(aMx));
+
+            for ( ii=0; ii<minDN; ii++ )
+            {
+                origin_[ii] = (coord_type)pr[ii];
+            }
+        }
+        else
+        {
+            double* pr = static_cast<double*>(mxGetData(aMx));
+
+            for ( ii=0; ii<minDN; ii++ )
+            {
+                origin_[ii] = (coord_type)pr[ii];
+            }
+        }
+
+        aMx = mxGetField(header, 0, header_fields_[1]);
+        N = mxGetNumberOfElements(aMx);
+
+        if ( mxIsSingle(aMx) )
+        {
+            float* pr = static_cast<float*>(mxGetData(aMx));
+
+            for ( ii=0; ii<minDN; ii++ )
+            {
+                pixelSize_[ii] = (coord_type)pr[ii];
+            }
+        }
+        else
+        {
+            double* pr = static_cast<double*>(mxGetData(aMx));
+
+            for ( ii=0; ii<minDN; ii++ )
+            {
+                pixelSize_[ii] = (coord_type)pr[ii];
+            }
+        }
+
+        aMx = mxGetField(header, 0, header_fields_[2]);
+
+        if ( mxIsSingle(aMx) )
+        {
+            float* pr = static_cast<float*>(mxGetData(aMx));
+
+            for ( jj=0; jj<minDN; jj++ )
+            {
+                for ( ii=0; ii<minDN; ii++ )
+                {
+                    axis_[jj][ii] = (coord_type)pr[jj + ii*D];
+                }
+            }
+        }
+        else
+        {
+            double* pr = static_cast<double*>(mxGetData(aMx));
+
+            for ( jj=0; jj<minDN; jj++ )
+            {
+                for ( ii=0; ii<minDN; ii++ )
+                {
+                    axis_[jj][ii] = (coord_type)pr[jj + ii*D];
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in gtMatlabImageHeader<T, D>::fromMatlab(const mxArray* header) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}
diff --git a/toolboxes/gtplus/solver/gtPlusLSQRSolver.h b/toolboxes/gtplus/solver/gtPlusLSQRSolver.h
new file mode 100644
index 0000000..84dd19e
--- /dev/null
+++ b/toolboxes/gtplus/solver/gtPlusLSQRSolver.h
@@ -0,0 +1,293 @@
+/** \file       gtPlusLSQRSolver.h
+    \brief      Implement the LSQR linear solver for Ax=b
+    \author     Hui Xue
+
+    Ref to:
+    http://www.stanford.edu/group/SOL/software/lsqr.html
+    C. C. Paige and M. A. Saunders, LSQR: An algorithm for sparse linear equations and sparse least squares, TOMS 8(1), 43-71 (1982). 
+    C. C. Paige and M. A. Saunders, Algorithm 583; LSQR: Sparse linear equations and least-squares problems, TOMS 8(2), 195-209 (1982).
+*/
+
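+/*
+    Illustrative usage (editor's sketch; OperatorType is a placeholder for any type
+    providing forwardOperator()/adjointOperator(), e.g. the gtPlus operators):
+
+        typedef Gadgetron::hoNDArray< std::complex<float> > ArrayType;
+
+        Gadgetron::gtPlus::gtPlusLSQRSolver<ArrayType, ArrayType, OperatorType> solver;
+        solver.set(A);          // A : OperatorType instance
+        solver.x0_ = &x0;       // initial guess; solve() starts from *x0_
+        solver.iterMax_ = 70;   // defaults are set in the constructor below
+        solver.thres_ = 1e-4f;
+        solver.solve(b, x);     // approximately solves A x = b in the least-squares sense
+*/
+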
+#pragma once
+
+#include "gtPlusLinearSolver.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+class gtPlusLSQRSolver : public gtPlusLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>
+{
+public:
+
+    typedef gtPlusLinearSolver<Array_Type_I, Array_Type_O, Oper_Type> BaseClass;
+
+    typedef typename BaseClass::ValueType ValueType;
+
+    typedef typename realType<ValueType>::Type value_type;
+
+    gtPlusLSQRSolver();
+    virtual ~gtPlusLSQRSolver();
+
+    virtual bool solve(const Array_Type_I& b, Array_Type_O& x);
+
+    virtual void printInfo(std::ostream& os) const;
+
+    using BaseClass::iterMax_;
+    using BaseClass::thres_;
+    using BaseClass::x0_;
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::printIter_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+
+protected:
+
+    using BaseClass::callback_;
+    using BaseClass::oper_;
+};
+
+// ===================================================================================== //
+//                           Implementation of template function                         //
+// ===================================================================================== //
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+gtPlusLSQRSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+gtPlusLSQRSolver() : BaseClass()
+{
+    iterMax_ = 70;
+    thres_ = (value_type)1e-4;
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+gtPlusLSQRSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+~gtPlusLSQRSolver() 
+{
+
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+bool gtPlusLSQRSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+solve(const Array_Type_I& b, Array_Type_O& x)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(oper_!=NULL);
+
+        x = *x0_;
+
+        // Set up for the method
+        value_type n2b;
+        Gadgetron::norm2(b, n2b);
+
+        int flag = 1;
+
+        value_type tolb = thres_ * n2b;
+        Array_Type_I u(b);
+
+        // u = u - A(x, varargin{:}, 'notransp');
+        // u = b - A*x0
+        GADGET_CHECK_RETURN_FALSE(oper_->forwardOperator(x, u));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::subtract(b, u, u));
+
+        value_type beta;
+        Gadgetron::norm2(u, beta);
+
+        value_type normr(beta);
+        if (std::abs(beta)>0)
+        {
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::scal( value_type(1.0)/beta, u));
+        }
+
+        value_type c = 1;
+        value_type s = 0;
+        value_type phibar = beta;
+
+        // v = A(u, varargin{:},'transp');
+        Array_Type_I v(x);
+        GADGET_CHECK_RETURN_FALSE(oper_->adjointOperator(u, v));
+
+        value_type alpha;
+        Gadgetron::norm2(v, alpha);
+        if (std::abs(alpha)>0)
+        {
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::scal( value_type(1.0)/alpha, v));
+        }
+
+        Array_Type_I d(x);
+        Gadgetron::clear(d);
+
+        value_type normar;
+        normar = alpha * beta;
+
+        // Check for all zero solution
+        if ( std::abs(normar) < DBL_EPSILON )
+        {
+            Gadgetron::clear(x);
+            return true;
+        }
+
+        value_type norma(0);
+        value_type sumnormd2 = 0;
+        size_t stag = 0;
+        size_t iter = iterMax_;
+        size_t  maxstagsteps = 3;
+
+        // loop over maxit iterations (unless convergence or failure)
+
+        Array_Type_I z(v), dtmp(d), ztmp(v), vt(v), utmp(u);
+        Array_Type_I normaVec(3);
+
+        value_type thet, rhot, rho, phi, tmp, tmp2;
+
+        size_t ii;
+        for ( ii=0; ii<iterMax_; ii++ )
+        {
+            // z = v;
+            memcpy(z.begin(), v.begin(), v.get_number_of_bytes());
+
+            // u = A(z, varargin{:},'notransp') - alpha*u;
+            GADGET_CHECK_RETURN_FALSE(oper_->forwardOperator(z, utmp));
+            Gadgetron::scal( alpha, u);
+            Gadgetron::subtract( utmp, u, u);
+
+            Gadgetron::norm2(u, beta);
+            Gadgetron::scal( value_type(1.0)/beta, u);
+
+            normaVec(0) = norma;
+            normaVec(1) = alpha;
+            normaVec(2) = beta;
+            Gadgetron::norm2(normaVec, norma);
+
+            thet = - s * alpha;
+            rhot = c * alpha;
+            rho = (value_type)( std::sqrt( (double)(rhot*rhot + beta*beta) ));
+            c = rhot / rho;
+            s = - beta / rho;
+            phi = c * phibar;
+            if ( std::abs(phi)< DBL_EPSILON )
+            {
+                stag = 1;
+            }
+
+            phibar = s * phibar;
+
+            // d = (z - thet * d) / rho;
+            //dtmp = d;
+            memcpy(dtmp.begin(), d.begin(), d.get_number_of_bytes());
+            Gadgetron::scal( thet, dtmp);
+            Gadgetron::subtract( z, dtmp, ztmp);
+            Gadgetron::scal( value_type(1.0)/rho, ztmp);
+            //d = ztmp;
+            memcpy(d.begin(), ztmp.begin(), d.get_number_of_bytes());
+
+            // sumnormd2 = sumnormd2 + (norm(d(:)))^2;
+            Gadgetron::norm2(d, tmp);
+            sumnormd2 += (tmp*tmp);
+
+            // Check for stagnation of the method
+            Gadgetron::norm2(x, tmp2);
+
+            if ( std::abs(phi)*std::abs(tmp) < DBL_EPSILON*std::abs(tmp2) )
+            {
+                stag++;
+            }
+            else
+            {
+                stag = 0;
+            }
+
+            // check for convergence in min{|b-A*x|}
+            if ( std::abs(normar/(norma*normr)) <= thres_ )
+            {
+                flag = 0;
+                break;
+            }
+
+            // check for convergence in A*x=b
+            if (std::abs(normr) <= std::abs(tolb) )
+            {
+                flag = 0;
+                break;
+            }
+
+            if (stag >= maxstagsteps)
+            {
+                flag = 3;
+                break;
+            }
+
+            //if (printIter_)
+            //{
+            //    GDEBUG_STREAM("Iteration " << ii << " - normar/(norma*normr) = " << std::abs(normar/(norma*normr)) << " - normr = " << std::abs(normr) );
+            //}
+
+            // x = x + phi * d;
+            //dtmp = d;
+            memcpy(dtmp.begin(), d.begin(), d.get_number_of_bytes());
+            Gadgetron::scal( phi, dtmp);
+            Gadgetron::add( x, dtmp, x);
+
+            normr = (value_type)(std::abs( (double)s) * normr);
+
+            // vt = A(u, varargin{:},'transp');
+            GADGET_CHECK_RETURN_FALSE(oper_->adjointOperator(u, vt));
+
+            // v = vt - beta * v;
+            Gadgetron::scal( beta, v);
+            Gadgetron::subtract( vt, v, v);
+
+            Gadgetron::norm2(v, alpha);
+
+            Gadgetron::scal( value_type(1.0)/alpha, v);
+
+            normar = alpha * std::abs( (value_type)s * phi);
+        }
+
+        if (printIter_)
+        {
+            GDEBUG_STREAM("Total iteration number is  " << ii << " - relative norm is " << std::abs(normar/(norma*normr)) << " ... ");
+        }
+
+        if (flag == 1)
+        {
+            if ( normar/(norma*normr) <= thres_ )
+            {
+                flag = 0;
+            }
+
+            if (std::abs(normr) <= std::abs(tolb) )
+            {
+                flag = 0;
+            }
+        }
+
+        //if (printIter_)
+        //{
+        //    value_type relres = normr/n2b;
+        //    GDEBUG_STREAM("Flag = " << flag << " - relres = " << std::abs(relres) );
+        //}
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in gtPlusLSQRSolver<Array_Type_I, Array_Type_O, Oper_Type>::solve(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+void gtPlusLSQRSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+printInfo(std::ostream& os) const
+{
+    os << "-------------- GTPlus ISMRMRD linear LSQR solver -------------" << std::endl;
+    os << "The linear solver solves the Ax=b problem" << std::endl;
+    os << "------------------------------------------------------------" << std::endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/solver/gtPlusLinearSolver.h b/toolboxes/gtplus/solver/gtPlusLinearSolver.h
new file mode 100644
index 0000000..ff33576
--- /dev/null
+++ b/toolboxes/gtplus/solver/gtPlusLinearSolver.h
@@ -0,0 +1,93 @@
+/** \file       gtPlusLinearSolver.h
+    \brief      Define the base class for linear solver
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusSolver.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+class gtPlusLinearSolver : public gtPlusSolver<Array_Type_I, Array_Type_O>
+{
+public:
+
+    typedef gtPlusSolver<Array_Type_I, Array_Type_O> BaseClass;
+
+    typedef typename BaseClass::ValueType ValueType;
+    typedef typename realType<ValueType>::Type value_type;
+
+    gtPlusLinearSolver();
+    virtual ~gtPlusLinearSolver();
+
+    Oper_Type* get();
+    void set(Oper_Type& op);
+
+    virtual bool solve(const Array_Type_I& b, Array_Type_O& x) = 0;
+
+    virtual void printInfo(std::ostream& os) const;
+
+    // maximal number of iterations
+    size_t iterMax_;
+
+    // threshold for the delta change of the residual
+    value_type thres_;
+
+    // initial guess for the solver
+    Array_Type_O* x0_;
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::printIter_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+
+protected:
+
+    using BaseClass::callback_;
+    Oper_Type* oper_;
+};
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+gtPlusLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+gtPlusLinearSolver() : oper_(NULL)
+{
+
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+gtPlusLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+~gtPlusLinearSolver() 
+{
+
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+Oper_Type* gtPlusLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>::get()
+{
+    return oper_;
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+void gtPlusLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>::set(Oper_Type& op)
+{
+    oper_ = &op;
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+void gtPlusLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+printInfo(std::ostream& os) const
+{
+    using namespace std;
+    os << "-------------- GTPlus ISMRMRD linear solver --------------------" << endl;
+    os << "Linear solver for GtPlus ISMRMRD package ... " << endl;
+    os << "----------------------------------------------------------------" << endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/solver/gtPlusNCGSolver.h b/toolboxes/gtplus/solver/gtPlusNCGSolver.h
new file mode 100644
index 0000000..bd11a3b
--- /dev/null
+++ b/toolboxes/gtplus/solver/gtPlusNCGSolver.h
@@ -0,0 +1,394 @@
+/** \file       gtPlusNCGSolver.h
+    \brief      Implement the non-linear conjugate gradient solver for a scalar function minimization problem.
+                The function to be optimized is required to supply the gradient computation.
+
+                The secant line search is used with the non-linear CG solver.
+
+    \author     Hui Xue
+
+    Ref to:
+    http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method
+    http://en.wikipedia.org/wiki/Secant_method
+*/
+
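+/*
+    Editor's note: the secant line search referenced above picks the step size t along
+    the search direction d by driving phi(t) = grad(x + t*d) . d towards zero with the
+    textbook secant update
+
+        t_{k+1} = t_k - phi(t_k) * (t_k - t_{k-1}) / (phi(t_k) - phi(t_{k-1}))
+
+    (general form from the cited references; see solve() below for the exact variant
+    and the stopping rules actually used).
+*/
+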
+#pragma once
+
+#include "gtPlusNonLinearSolver.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+class gtPlusNCGSolver : public gtPlusNonLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>
+{
+public:
+
+    typedef gtPlusNonLinearSolver<Array_Type_I, Array_Type_O, Oper_Type> BaseClass;
+    typedef typename BaseClass::ValueType ValueType;
+    typedef typename BaseClass::value_type value_type;
+    typedef typename BaseClass::Oper_Elem_Type Oper_Elem_Type;
+    typedef typename BaseClass::Oper_List_Type Oper_List_Type;
+
+    gtPlusNCGSolver();
+    virtual ~gtPlusNCGSolver();
+
+    virtual bool solve(const Array_Type_I& b, Array_Type_O& x);
+
+    virtual bool grad(const Array_Type_I& x, Array_Type_I& grad);
+    virtual bool obj(const Array_Type_I& x, ValueType& obj);
+
+    virtual void printInfo(std::ostream& os) const;
+
+    /// maximal number of iterations
+    size_t iterMax_;
+
+    /// threshold for the delta change of the gradient
+    value_type gradThres_;
+
+    /// threshold for the delta change of the objective function
+    value_type objThres_;
+
+    /// scale factor for the initial step size of the line search
+    value_type beta_;
+
+    /// initial step size of the line search
+    value_type t0_;
+
+    /// maximal number of line search iterations (secant line search)
+    size_t secantIterMax_;
+
+    /// gradient threshold for the secant line search
+    value_type secantThres_;
+
+    /// the secant threshold can increase during the line search;
+    /// this is the maximal allowed increase relative to the previous secant iteration
+    value_type secantRatio_;
+
+    /// initial guess for the solver
+    Array_Type_O* x0_;
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::printIter_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+
+protected:
+
+    using BaseClass::callback_;
+    using BaseClass::operList_;
+
+    Array_Type_I gradBuf_;
+};
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+gtPlusNCGSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+gtPlusNCGSolver() : BaseClass()
+{
+    iterMax_ = 10;
+    gradThres_ = (value_type)1e-4;
+    objThres_ = (value_type)0.1;
+    beta_ = (value_type)0.5;
+    t0_ = (value_type)2.0;
+    secantIterMax_ = 10;
+    secantThres_ = (value_type)1e-3;
+    secantRatio_ = 2;
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+gtPlusNCGSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+~gtPlusNCGSolver() 
+{
+
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+bool gtPlusNCGSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+grad(const Array_Type_I& x, Array_Type_I& g)
+{
+    try
+    {
+        g.create(x.get_dimensions());
+
+        size_t N = operList_.size();
+        if ( N == 0 ) return true;
+
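+        // accumulate the weighted gradients of all operators: g = sum_i w_i * grad_i(x)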
+        GADGET_CHECK_RETURN_FALSE(operList_[0].first->grad(x, g));
+        Gadgetron::scal(operList_[0].second, g);
+
+        for ( size_t op=1; op<N; op++ )
+        {
+            GADGET_CHECK_RETURN_FALSE(operList_[op].first->grad(x, gradBuf_));
+            Gadgetron::scal(operList_[op].second, gradBuf_);
+            Gadgetron::add(gradBuf_, g, g);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in gtPlusNCGSolver<Array_Type_I, Array_Type_O, Oper_Type>::grad(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+bool gtPlusNCGSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+obj(const Array_Type_I& x, ValueType& ob)
+{
+    try
+    {
+        size_t N = operList_.size();
+        if ( N == 0 )
+        {
+            ob = 0;
+            return true;
+        }
+
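+        // accumulate the weighted objective values: ob = sum_i w_i * obj_i(x)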
+        GADGET_CHECK_RETURN_FALSE(operList_[0].first->obj(x, ob));
+        ob *= operList_[0].second;
+
+        ValueType v = 0;
+        for ( size_t op=1; op<N; op++ )
+        {
+            GADGET_CHECK_RETURN_FALSE(operList_[op].first->obj(x, v));
+            ob += operList_[op].second * v;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in gtPlusNCGSolver<Array_Type_I, Array_Type_O, Oper_Type>::obj(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+bool gtPlusNCGSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+solve(const Array_Type_I& /*b*/, Array_Type_O& x)
+{
+    try
+    {
+        if ( operList_.empty() ) return true;
+
+        // initial gradient
+        Array_Type_I g0(*x0_);
+        GADGET_CHECK_RETURN_FALSE(this->grad(*x0_, g0));
+
+        //Gadgetron::norm2(*x0_, v); GDEBUG_STREAM(v);
+        //Gadgetron::norm2(g0, v); GDEBUG_STREAM(v);
+
+        // dx = -g0;
+        Array_Type_I dx(g0);
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE( Gadgetron::scal( (value_type)(-1), dx ) );
+
+        //Gadgetron::norm2(dx, v); GDEBUG_STREAM(v);
+
+        // initialize x
+        x = *x0_;
+
+        // secant parameters
+        value_type bk, dxNorm, t0(t0_);
+        ValueType oriF, prevF, currF, deltaD, thresValue, prevThresValue, phiPrev(0), v1, v2, v3, phi, alpha(0);
+        Array_Type_I g1(g0), gTmp(g0), sx(*x0_), xTmp(*x0_), dxTmp(dx), prevX(*x0_);
+        size_t nIter(0);
+
+        // estimate a reasonable initial line-search step size t0_
+        this->obj(x, oriF);
+
+        dxTmp = dx;
+        Gadgetron::scal(t0, dxTmp);
+        Gadgetron::add(x, dxTmp, xTmp);
+
+        this->obj(xTmp, currF);
+
+        if (printIter_)
+        {
+            GDEBUG_STREAM("To determine t0, --- ori and curr obj: " << oriF << " - " << currF << " ... ");
+        }
+
+        unsigned int numOfTries = 0;
+
+        value_type changeRatio = std::abs(currF.real() - oriF.real())/currF.real();
+        value_type changeRatio2 = std::abs(currF.real() - oriF.real())/oriF.real();
+        value_type minChangeRatio = (value_type)0.05;
+        value_type maxChangeRatio = (value_type)6.0;
+        unsigned int maxNumOfTries = 4;
+
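+        // heuristic for the initial step size: if the objective barely changes, enlarge t0
+        // (divide by beta_ < 1); if it changes too much, shrink t0 (multiply by beta_)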
+        while ( ( (changeRatio<minChangeRatio)||(changeRatio2>maxChangeRatio) ) && (numOfTries < maxNumOfTries) )
+        {
+            numOfTries++;
+
+            if ( changeRatio<minChangeRatio )
+            {
+                t0 /= beta_;
+            }
+            else if ( changeRatio2>maxChangeRatio )
+            {
+                t0 *= beta_;
+            }
+
+            dxTmp = dx;
+            Gadgetron::scal(t0, dxTmp);
+            Gadgetron::add(x, dxTmp, xTmp);
+
+            this->obj(xTmp, currF);
+
+            GDEBUG_STREAM("t0 is " << t0 << " ... ");
+            GDEBUG_STREAM("To determine t0, --- ori and curr obj: " << oriF << " - " << currF << " ... ");
+
+            changeRatio = std::abs(currF.real() - oriF.real())/currF.real();
+            changeRatio2 = std::abs(currF.real() - oriF.real())/oriF.real();
+        }
+
+        prevF = oriF;
+        while (1)
+        {
+            // secant line-search
+            // wGradient(x+t0*dx);
+            dxTmp = dx;
+            Gadgetron::scal(t0, dxTmp);
+            Gadgetron::add(x, dxTmp, xTmp);
+
+            //Gadgetron::norm2(xTmp, v); GDEBUG_STREAM(v);
+
+            this->grad(xTmp, gTmp);
+
+            //Gadgetron::norm2(gTmp, v); GDEBUG_STREAM(v);
+
+            // phiPrev = gTmp(:)'*dx(:);
+            Gadgetron::dotc(gTmp, dx, phiPrev);
+            alpha = -t0;
+            Gadgetron::dotc(dx, dx, deltaD);
+
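+            // deltaD = ||dx||^2, so thresValue = |alpha|^2 * ||dx||^2 is the squared length of the
+            // proposed step; it is used as the stopping criterion of the secant line search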
+            thresValue = std::conj(alpha)*alpha*deltaD;
+            prevThresValue = thresValue;
+
+            size_t lsiter = 0;
+            sx = x;
+
+            while ( (lsiter<secantIterMax_) 
+                && (thresValue.real()>secantThres_) 
+                && (thresValue.real()<=secantRatio_*prevThresValue.real()) )
+            {
+                if ( lsiter == 0 )
+                {
+                    gTmp = g0;
+                }
+                else
+                {
+                    this->grad(sx, gTmp);
+                }
+
+                Gadgetron::dotc(gTmp, dx, phi);
+                // alpha = alpha * (phi.real()/(phiPrev.real()-phi.real()));
+                alpha = alpha * phi/(phiPrev-phi);
+                phiPrev = phi;
+                lsiter = lsiter+1;
+                prevThresValue = std::abs(thresValue);
+                thresValue = std::conj(alpha)*alpha*deltaD;
+
+                if ( thresValue.real() <= secantRatio_*prevThresValue.real() )
+                {
+                    dxTmp = dx;
+                    Gadgetron::scal(alpha, dxTmp);
+                    Gadgetron::add(sx, dxTmp, sx);
+                }
+            }
+
+            // control the number of line-search iterations by adapting the initial step size
+            if (lsiter>2)
+            {
+                t0 *= beta_;
+            }
+
+            if (lsiter<1)
+            {
+                t0 /= beta_;
+            }
+
+            prevX = x;
+            x = sx;
+
+            this->obj(x, currF);
+
+            // conjugate gradient calculation
+            this->grad(x, g1);
+
+            // Fletcher-Reeves update: bk = ||g1||^2 / ||g0||^2 (DBL_EPSILON guards against division by zero)
+            Gadgetron::dotc(g1, g1, v1);
+            Gadgetron::dotc(g0, g0, v2);
+            bk = (value_type)(v1.real()/(v2.real()+DBL_EPSILON));
+
+            g0 = g1;
+
+            // dx =  - g1 + bk.* dx;
+            dxTmp = dx;
+            Gadgetron::scal(bk, dxTmp);
+            Gadgetron::subtract(dxTmp, g1, dx);
+
+            if (printIter_)
+            {
+                GDEBUG_STREAM("Iteration " << nIter << " --- prev and curr obj: " << prevF << " - " << currF << " - line search: " << lsiter);
+            }
+
+            // perform the callback
+            if ( callback_ != NULL )
+            {
+                Gadgetron::norm2(dx, dxNorm);
+                if ( (nIter>iterMax_) || (dxNorm<gradThres_) || (callback_->exit() && (prevF.real()-currF.real()<objThres_)) )
+                {
+                    if ( prevF.real() < currF.real() )
+                    {
+                        x = prevX;
+                    }
+                    break;
+                }
+
+                GADGET_CHECK_RETURN_FALSE(callback_->callBack(nIter, x));
+                GDEBUG_STREAM("exit is " << callback_->exit());
+
+                nIter = nIter + 1;
+            }
+            else
+            {
+                nIter = nIter + 1;
+
+                Gadgetron::norm2(dx, dxNorm);
+                if ( (nIter>iterMax_) || (dxNorm<gradThres_) || (prevF.real()-currF.real()<objThres_) )
+                {
+                    if ( prevF.real() < currF.real() )
+                    {
+                        x = prevX;
+                    }
+                    break;
+                }
+            }
+
+            prevF = currF;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors happened in gtPlusNCGSolver<Array_Type_I, Array_Type_O, Oper_Type>::solve(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+void gtPlusNCGSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+printInfo(std::ostream& os) const
+{
+    using namespace std;
+    os << "-------------- GTPlus ISMRMRD ncg solver -------------" << endl;
+    os << "The non-linear cg solver " << std::endl;
+    os << "------------------------------------------------------------" << endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/solver/gtPlusNonLinearSolver.h b/toolboxes/gtplus/solver/gtPlusNonLinearSolver.h
new file mode 100644
index 0000000..b56313e
--- /dev/null
+++ b/toolboxes/gtplus/solver/gtPlusNonLinearSolver.h
@@ -0,0 +1,122 @@
+/** \file       gtPlusNonLinearSolver.h
+    \brief      Define the base class for GtPlus non-linear solvers
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusSolver.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+class gtPlusNonLinearSolver : public gtPlusSolver<Array_Type_I, Array_Type_O>
+{
+public:
+
+    typedef gtPlusSolver<Array_Type_I, Array_Type_O> BaseClass;
+
+    typedef typename BaseClass::ValueType ValueType;
+    typedef typename realType<ValueType>::Type value_type;
+
+    // each operator is paired with a weight
+    typedef std::pair<Oper_Type*, ValueType> Oper_Elem_Type;
+    // multiple operators can be added to a solver
+    typedef std::vector< Oper_Elem_Type > Oper_List_Type;
+
+    gtPlusNonLinearSolver();
+    virtual ~gtPlusNonLinearSolver();
+
+    Oper_List_Type getOperList();
+    void setOperList(Oper_List_Type& opero);
+
+    void add(Oper_Type& op, ValueType a);
+    void remove(Oper_Type*& op, ValueType& a);
+
+    bool set(size_t ind, Oper_Type& op, ValueType a);
+
+    // main function to run the solver
+    virtual bool solve(const Array_Type_I& b, Array_Type_O& x) = 0;
+
+    virtual void printInfo(std::ostream& os) const;
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::printIter_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+
+protected:
+
+    using BaseClass::callback_;
+    Oper_List_Type operList_;
+};
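+
+// Derived solvers such as gtPlusNCGSolver interpret the list as a weighted objective
+// f(x) = sum_i w_i * f_i(x). A minimal sketch of composing such an objective
+// (illustrative only; Op stands for any concrete operator type providing obj()/grad()):
+//
+//     Op dataFidelity, regularizer;
+//     solver.add(dataFidelity, ValueType(1.0));     // w_0 = 1
+//     solver.add(regularizer,  ValueType(0.0025));  // w_1 = regularization weight
+//
+//     Oper_Type* op; ValueType w;
+//     solver.remove(op, w);                         // pops the last (operator, weight) pair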
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+gtPlusNonLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+gtPlusNonLinearSolver() : BaseClass()
+{
+
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+gtPlusNonLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+~gtPlusNonLinearSolver() 
+{
+    operList_.clear();
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+void gtPlusNonLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+add(Oper_Type& op, ValueType a)
+{
+    operList_.push_back(Oper_Elem_Type(&op, a));
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+void gtPlusNonLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+remove(Oper_Type*& op, ValueType& a)
+{
+    if ( operList_.empty() )
+    {
+        op = NULL;
+        a = 0;
+    }
+    else
+    {
+        op = operList_[operList_.size()-1].first;
+        a = operList_[operList_.size()-1].second;
+        operList_.pop_back();
+    }
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+bool gtPlusNonLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+set(size_t ind, Oper_Type& op, ValueType a)
+{
+    if ( ind >= operList_.size() )
+    {
+        GWARN_STREAM("ind >= operList_.size()");
+        return false;
+    }
+
+    operList_[ind].first = &op;
+    operList_[ind].second = a;
+
+    return true;
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+void gtPlusNonLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+printInfo(std::ostream& os) const
+{
+    using namespace std;
+    os << "-------------- GTPlus ISMRMRD nonlinear solver -----------------" << endl;
+    os << "ISMRMRD nonlinear solver ... " << endl;
+    os << "----------------------------------------------------------------" << endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/solver/gtPlusSolver.h b/toolboxes/gtplus/solver/gtPlusSolver.h
new file mode 100644
index 0000000..e694331
--- /dev/null
+++ b/toolboxes/gtplus/solver/gtPlusSolver.h
@@ -0,0 +1,146 @@
+/** \file       gtPlusSolver.h
+    \brief      Define the base class for GtPlus solvers
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "ismrmrd/ismrmrd.h"
+#include "GadgetronTimer.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusIOAnalyze.h"
+
+#ifdef USE_OMP
+    #include "omp.h"
+#endif // USE_OMP
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename Array_Type_I, typename Array_Type_O>
+class gtPlusSolverCallBack
+{
+public:
+
+    gtPlusSolverCallBack();
+    virtual ~gtPlusSolverCallBack();
+
+    // if exit() returns true, the current solver will stop iterating
+    virtual bool exit();
+
+    virtual bool callBack(size_t iter, Array_Type_O& x);
+
+    virtual void printInfo(std::ostream& os) const;
+
+    bool exit_;
+};
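+
+// A minimal callback sketch (illustrative only): stop the solver after a fixed number of
+// iterations while still receiving the intermediate solutions. The NCG solver consults
+// exit() as part of its stopping test and calls callBack(iter, x) once per iteration.
+//
+//     template <typename ArrayI, typename ArrayO>
+//     class StopAfterNIterations : public gtPlusSolverCallBack<ArrayI, ArrayO>
+//     {
+//     public:
+//         size_t maxIter_;
+//         virtual bool callBack(size_t iter, ArrayO& /*x*/)
+//         {
+//             this->exit_ = (iter+1 >= maxIter_);
+//             return true;
+//         }
+//     };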
+
+template <typename Array_Type_I, typename Array_Type_O>
+class gtPlusSolver
+{
+public:
+
+    typedef gtPlusSolver<Array_Type_I, Array_Type_O> Self;
+
+    typedef typename Array_Type_I::value_type ValueType;
+
+    typedef gtPlusSolverCallBack<Array_Type_I, Array_Type_O> CBType;
+
+    gtPlusSolver();
+    virtual ~gtPlusSolver();
+
+    CBType* getCallBack();
+    void setCallBack(CBType* pCB);
+
+    virtual bool solve(const Array_Type_I& b, Array_Type_O& x) = 0;
+
+    virtual void printInfo(std::ostream& os) const;
+
+    // clock for timing
+    Gadgetron::GadgetronTimer gt_timer1_;
+    Gadgetron::GadgetronTimer gt_timer2_;
+    Gadgetron::GadgetronTimer gt_timer3_;
+
+    bool performTiming_;
+
+    bool printIter_;
+
+    // exporter
+    Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+    // debug folder
+    std::string debugFolder_;
+
+    // util
+    gtPlusISMRMRDReconUtil<ValueType> gtPlus_util_;
+    gtPlusISMRMRDReconUtilComplex<ValueType> gtPlus_util_complex_;
+
+protected:
+
+    CBType* callback_;
+};
+
+template <typename Array_Type_I, typename Array_Type_O>
+gtPlusSolverCallBack<Array_Type_I, Array_Type_O>::
+gtPlusSolverCallBack() : exit_(true)
+{
+
+}
+
+template <typename Array_Type_I, typename Array_Type_O>
+gtPlusSolverCallBack<Array_Type_I, Array_Type_O>::
+~gtPlusSolverCallBack() 
+{
+
+}
+
+template <typename Array_Type_I, typename Array_Type_O>
+bool gtPlusSolverCallBack<Array_Type_I, Array_Type_O>::
+exit()
+{
+    return exit_;
+}
+
+template <typename Array_Type_I, typename Array_Type_O>
+bool gtPlusSolverCallBack<Array_Type_I, Array_Type_O>::
+callBack(size_t /*iter*/, Array_Type_O& /*x*/) 
+{
+    return true;
+}
+
+template <typename Array_Type_I, typename Array_Type_O>
+void gtPlusSolverCallBack<Array_Type_I, Array_Type_O>::
+printInfo(std::ostream& os) const
+{
+    using namespace std;
+    os << "-------------- GTPlus ISMRMRD solver callback ------------------" << endl;
+    os << "A callback scheme for ISMRMRD solvers ... " << endl;
+    os << "----------------------------------------------------------------" << endl;
+}
+
+template <typename Array_Type_I, typename Array_Type_O>
+gtPlusSolver<Array_Type_I, Array_Type_O>::
+gtPlusSolver() : callback_(NULL), performTiming_(false), printIter_(false)
+{
+    gt_timer1_.set_timing_in_destruction(false);
+    gt_timer2_.set_timing_in_destruction(false);
+    gt_timer3_.set_timing_in_destruction(false);
+}
+
+template <typename Array_Type_I, typename Array_Type_O>
+gtPlusSolver<Array_Type_I, Array_Type_O>::
+~gtPlusSolver() 
+{
+
+}
+
+template <typename Array_Type_I, typename Array_Type_O>
+void gtPlusSolver<Array_Type_I, Array_Type_O>::
+printInfo(std::ostream& os) const
+{
+    using namespace std;
+    os << "-------------- GTPlus ISMRMRD solver ------------------" << endl;
+    os << "GtPlus ISMRMRD solvers ... " << endl;
+    os << "-------------------------------------------------------" << endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/ut/CMakeLists.txt b/toolboxes/gtplus/ut/CMakeLists.txt
new file mode 100644
index 0000000..b799b7c
--- /dev/null
+++ b/toolboxes/gtplus/ut/CMakeLists.txt
@@ -0,0 +1,57 @@
+ENABLE_TESTING()
+
+if(WIN32)
+    link_directories(${Boost_LIBRARY_DIRS})
+endif(WIN32)
+
+include_directories( ${GTEST_INCLUDE_DIRS}
+                     ${CMAKE_BINARY_DIR}/apps/gadgetron
+                     ${ACE_INCLUDE_DIR} 
+                     ${Boost_INCLUDE_DIR}
+                     ${FFTW3_INCLUDE_DIR}
+                     ${ISMRMRD_INCLUDE_DIR}
+                     ${CMAKE_SOURCE_DIR}/dependencies/tinyxml
+                     ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+                     ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+                     ${CMAKE_SOURCE_DIR}/toolboxes/core
+                     ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+                     ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+                     ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+                     ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+                     ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+                     ${CMAKE_SOURCE_DIR}/toolboxes/operators
+                     ${CMAKE_SOURCE_DIR}/toolboxes/operators/cpu
+                     ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+                     ${CMAKE_SOURCE_DIR}/toolboxes/solvers/cpu
+                     ${CMAKE_SOURCE_DIR}/apps/gadgetron 
+                     ${CMAKE_SOURCE_DIR}/toolboxes/core
+                     ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+                     ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+                     ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools 
+                     ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow
+                     ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu
+                     ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools
+                     ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools/ismrmrd )
+
+link_libraries(optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} 
+                ${GTEST_LIBRARIES} 
+                ${Boost_LIBRARIES} 
+                ${ISMRMRD_LIBRARIES} 
+                gadgetron_toolbox_cpucore 
+                gadgetron_toolbox_cpucore_math 
+                # cpureg
+                gadgetron_toolbox_cpufft
+                gadgetron_toolbox_gtplus 
+                gadgetron_toolbox_gadgettools 
+                gadgetron_toolbox_mri_core 
+                gadgetronPlus)
+
+if (CUDA_FOUND)
+    link_libraries(gadgetron_toolbox_gtplus gadgetron_toolbox_gpuparallelmri gadgetron_toolbox_gpucore)
+endif (CUDA_FOUND)
+
+add_executable(gtplus_ut_grappa 
+    gtplus_ut.cpp 
+    grappa_test.cpp )
+
+#add_test(gtplus_ut gtplus_ut_util)
diff --git a/toolboxes/gtplus/ut/grappa_test.cpp b/toolboxes/gtplus/ut/grappa_test.cpp
new file mode 100644
index 0000000..0756f2e
--- /dev/null
+++ b/toolboxes/gtplus/ut/grappa_test.cpp
@@ -0,0 +1,609 @@
+
+#ifdef USE_OMP
+#include "omp.h"
+#endif // USE_OMP
+
+#include "Gadget.h"
+#include "ismrmrd/ismrmrd.h"
+#include "complext.h"
+
+#include <gtest/gtest.h>
+
+#include "hoNDArray_utils.h"
+
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusISMRMRDReconUtil.h"
+// #include "gtPlusISMRMRDReconWorkOrder.h"
+#include "gtPlusISMRMRDReconWorker2DTGRAPPA.h"
+#include "gtPlusISMRMRDReconWorker2DTSPIRIT.h"
+#include "gtPlusISMRMRDReconWorker3DTSPIRIT.h"
+#include "gtPlusISMRMRDReconWorkFlowCartesian2DT.h"
+#include "gtPlusISMRMRDReconWorkFlowCartesian3DT.h"
+#include "gtPlusSPIRIT2DOperator.h"
+#include "gtPlusSPIRIT2DTOperator.h"
+#include "gtPlusSPIRIT3DOperator.h"
+#include "gtPlusSPIRITNoNullSpace2DOperator.h"
+#include "gtPlusSPIRITNoNullSpace2DTOperator.h"
+#include "gtPlusSPIRITNoNullSpace3DOperator.h"
+#include "gtPlusNCGSolver.h"
+
+#include "GadgetronTimer.h"
+
+#include <boost/thread/mutex.hpp>
+
+#ifdef max
+#undef max
+#endif // max
+
+using namespace Gadgetron;
+using namespace Gadgetron::gtPlus;
+using testing::Types;
+
+template <typename T> class gtPlus_grappa_Test : public ::testing::Test 
+{
+protected:
+    virtual void SetUp()
+    {
+        GDEBUG_STREAM("=============================================================================================");
+        gtPluse_ut_folder_ = std::string(::getenv("GTPLUS_UNITTEST_DIRECTORY"));
+        GDEBUG_STREAM("=============================================================================================");
+        GDEBUG_STREAM("Unit Test for GtPlus");
+        gtPluse_ut_data_folder_ = gtPluse_ut_folder_ + "/data/";
+        gtPluse_ut_res_folder_ = gtPluse_ut_folder_ + "/result/";
+        GDEBUG_STREAM("gtPluse_ut_data_folder_ is " << gtPluse_ut_data_folder_);
+        GDEBUG_STREAM("gtPluse_ut_res_folder_ is " << gtPluse_ut_res_folder_);
+
+        timer_.set_timing_in_destruction(false);
+
+#ifdef WIN32
+    #ifdef USE_OMP
+        /// pin each OpenMP thread to a CPU core
+        #pragma omp parallel default(shared)
+        {
+            int tid = omp_get_thread_num();
+            // GDEBUG_STREAM(tid << std::endl);
+            DWORD_PTR mask = (1 << tid);
+            SetThreadAffinityMask( GetCurrentThread(), mask );
+        }
+    #endif // USE_OMP
+#endif // WIN32
+    }
+
+    std::string gtPluse_ut_folder_;
+    std::string gtPluse_ut_data_folder_;
+    std::string gtPluse_ut_res_folder_;
+
+    gtPlusIOAnalyze gt_io_;
+    gtPlusISMRMRDReconUtil<T> util_;
+    gtPlusISMRMRDReconUtilComplex<T> utilCplx_;
+    GadgetronTimer timer_;
+};
+
+typedef Types<float, double> realImplementations;
+
+typedef Types< std::complex<float> > cpfloatImplementations;
+
+typedef Types<std::complex<float>, std::complex<double>, float_complext, double_complext> cplxImplementations;
+typedef Types<std::complex<float>, std::complex<double> > stdCplxImplementations;
+typedef Types<float_complext, double_complext> cplxtImplementations;
+
+TYPED_TEST_CASE(gtPlus_grappa_Test, cpfloatImplementations);
+
+TYPED_TEST(gtPlus_grappa_Test, reconWorker2DTGRAPPA_SNRUnit)
+{
+    typedef std::complex<float> T;
+
+    gtPlusIOAnalyze gt_io;
+
+    float v;
+
+    // image data
+    hoNDArray<std::complex<float> > data;
+    gt_io.importArrayComplex(data, this->gtPluse_ut_data_folder_ + "StdandardDataR2_Kspace_real", 
+        this->gtPluse_ut_data_folder_ + "StdandardDataR2_Kspace_imag");
+    data.print(std::cout);
+
+    unsigned long long RO = data.get_size(0);
+    unsigned long long E1 = data.get_size(1);
+    unsigned long long CHA = data.get_size(2);
+    unsigned long long PHS = data.get_size(3);
+
+    unsigned long long reconE1 = 144;
+
+    // [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+    unsigned long long SLC = 1;
+    unsigned long long E2 = 1;
+    unsigned long long CON = 1;
+    unsigned long long REP = 1;
+    unsigned long long SET = 1;
+    unsigned long long SEG = 1;
+
+    hoNDArray<std::complex<float> > kspace(RO, E1, CHA, SLC, E2, CON, PHS);
+    memcpy(kspace.begin(), data.begin(), data.get_number_of_bytes());
+
+    Gadgetron::norm2(kspace, v);
+    GDEBUG_STREAM("kspace = " << v);
+
+    // ref
+    hoNDArray<T> refTmp;
+    gt_io.importArrayComplex(refTmp, this->gtPluse_ut_data_folder_ + "StdandardDataR2_Ref_real", 
+        this->gtPluse_ut_data_folder_ + "StdandardDataR2_Ref_imag");
+
+    hoNDArray<T> ref(refTmp.get_size(0), refTmp.get_size(1), refTmp.get_size(2), SLC, E2, CON, PHS);
+    memcpy(ref.begin(), refTmp.begin(), refTmp.get_number_of_bytes());
+    ref.print(std::cout);
+
+    // noise
+    hoNDArray<T> noise;
+    gt_io.importArrayComplex(noise, this->gtPluse_ut_data_folder_ + "StdandardDataR2_Noise_real", 
+        this->gtPluse_ut_data_folder_ + "StdandardDataR2_Noise_imag");
+    noise.print(std::cout);
+
+    // call the recon
+    typedef std::complex<float> ValueType;
+    typedef Gadgetron::gtPlus::gtPlusReconWorkOrder2DT<ValueType> WorkOrderType;
+    typedef std::pair<Gadgetron::ISMRMRDDIM, unsigned long long> DimensionRecordType;
+
+    WorkOrderType* workOrder = new WorkOrderType;
+
+    boost::shared_ptr< std::vector<size_t> > dims = kspace.get_dimensions();
+
+    GDEBUG_STREAM("[Ro E1 Cha Slice E2 Con Phase Rep Set Seg] = [" 
+        << (*dims)[0] << " " << (*dims)[1] << " " << (*dims)[2] << " " << (*dims)[3] << " " << (*dims)[4] 
+        << " " << (*dims)[5] << " " << (*dims)[6] << " " << 1 << " " << 1 << " " << 1 << "]");
+
+    std::vector<size_t> dimensions_ = *dims;
+
+    // work flow
+    Gadgetron::gtPlus::gtPlusISMRMRDReconWorkFlowCartesian2DT<ValueType> workflow_;
+
+    // worker
+    Gadgetron::gtPlus::gtPlusReconWorker2DTGRAPPA<ValueType> worker_grappa_;
+
+    // parameters
+    Gadgetron::ISMRMRDDIM dim_4th_ = DIM_Phase;
+    Gadgetron::ISMRMRDDIM dim_5th_ = DIM_Slice;
+    Gadgetron::ISMRMRDDIM workOrder_ShareDim_ = DIM_NONE;
+
+    bool interleaved_same_combinationcoeff_allS_ = false;
+    int interleaved_whichS_combinationcoeff_ = 0;
+
+    bool embedded_averageall_ref_ = false;
+    bool embedded_fullres_coilmap_ = true;
+    bool embedded_same_combinationcoeff_allS_ = false;
+    int embedded_whichS_combinationcoeff_ = 0;
+    bool embedded_ref_fillback_ = true;
+
+    bool separate_averageall_ref_ = true;
+    bool separate_fullres_coilmap_ = false;
+    bool separate_same_combinationcoeff_allS_ = false;
+    int separate_whichS_combinationcoeff_ = 0;
+
+    bool same_coil_compression_coeff_allS_ = true;
+    bool downstream_coil_compression_ = true;
+    double coil_compression_thres_ = 1e-3;
+    int coil_compression_num_modesKept_ = -1;
+
+    unsigned long long csm_kSize_ = 7;
+    unsigned long long csm_powermethod_num_ = 3;
+
+    Gadgetron::ISMRMRDALGO recon_algorithm_ = ISMRMRD_GRAPPA;
+    bool recon_kspace_needed_ = true;
+
+    unsigned long long grappa_kSize_RO_ = 5;
+    unsigned long long grappa_kSize_E1_ = 4;
+    unsigned long long grappa_kSize_E2_ = 4;
+    double grappa_reg_lamda_ = 1e-4;
+
+    // recon
+    workflow_.setDataArray(kspace);
+    workflow_.setRefArray(ref);
+    workflow_.noise_ = &noise;
+
+    workflow_.noiseBW_ = 130;
+    workflow_.overSamplingRatioRO_ = 2.0;
+    workflow_.ADCSamplingTimeinSecond_ = 7800/1e9;
+
+    // for this ut data, the oversampling removal and noise prewhitening on ref are not needed
+    workflow_.ref_remove_oversampling_RO_ = false;
+    workflow_.ref_apply_noisePreWhitening_ = false;
+
+    workflow_.reconSizeRO_ = RO/2;
+    workflow_.reconSizeE1_ = reconE1;
+    workflow_.reconSizeE2_ = 1;
+    // workflow_.dataDimStartingIndexes_ = workOrder->dataDimStartingIndexes_;
+    workflow_.dim4th_ = dim_4th_;
+    workflow_.dim5th_ = dim_5th_;
+
+    workOrder->CalibMode_ = ISMRMRD_separate;
+    workOrder->acceFactorE1_ = 2;
+    workOrder->acceFactorE2_ = 1;
+
+    workOrder->downstream_coil_compression_ = downstream_coil_compression_;
+    workOrder->coil_compression_thres_ = coil_compression_thres_;
+    workOrder->coil_compression_num_modesKept_ = coil_compression_num_modesKept_;
+    workOrder->csm_kSize_ = csm_kSize_;
+    workOrder->csm_powermethod_num_ = csm_powermethod_num_;
+    workOrder->grappa_kSize_RO_ = grappa_kSize_RO_;
+    workOrder->grappa_kSize_E1_ = grappa_kSize_E1_;
+    workOrder->grappa_kSize_E2_ = grappa_kSize_E2_;
+    workOrder->grappa_reg_lamda_ = grappa_reg_lamda_;
+    workOrder->recon_kspace_needed_ = recon_kspace_needed_;
+
+    if ( coil_compression_thres_>0 || coil_compression_num_modesKept_>0 )
+    {
+        workOrder->coil_compression_ = true;
+    }
+    else
+    {
+        workOrder->coil_compression_ = false;
+    }
+
+    workOrder->same_coil_compression_coeff_allS_ = same_coil_compression_coeff_allS_;
+    workOrder->embedded_averageall_ref_ = embedded_averageall_ref_;
+    workOrder->embedded_fullres_coilmap_ = embedded_fullres_coilmap_;
+    workOrder->embedded_same_combinationcoeff_allS_ = embedded_same_combinationcoeff_allS_;
+    workOrder->embedded_whichS_combinationcoeff_ = embedded_whichS_combinationcoeff_;
+    workOrder->embedded_ref_fillback_ = embedded_ref_fillback_;
+    workOrder->separate_averageall_ref_ = separate_averageall_ref_;
+    workOrder->separate_fullres_coilmap_ = separate_fullres_coilmap_;
+    workOrder->separate_same_combinationcoeff_allS_ = separate_same_combinationcoeff_allS_;
+    workOrder->separate_whichS_combinationcoeff_ = separate_whichS_combinationcoeff_;
+    workOrder->interleaved_same_combinationcoeff_allS_ = interleaved_same_combinationcoeff_allS_;
+    workOrder->interleaved_whichS_combinationcoeff_ = interleaved_whichS_combinationcoeff_;
+
+    worker_grappa_.performTiming_ = true;
+    worker_grappa_.debugFolder_ = this->gtPluse_ut_res_folder_;
+
+    workflow_.debugFolder_ = this->gtPluse_ut_res_folder_;
+
+    workflow_.worker_ = &worker_grappa_;
+    workflow_.workOrder_ = workOrder;
+
+    workflow_.preProcessing();
+    workflow_.recon();
+    workflow_.postProcessing();
+
+    gt_io.exportArrayComplex(workflow_.res_, this->gtPluse_ut_res_folder_+"StdandardDataR2_res");
+
+    workflow_.res_.squeeze();
+    gt_io.export3DArrayComplex(workflow_.res_, this->gtPluse_ut_res_folder_+"StdandardDataR2_res_squeezed");
+
+    hoNDArray<T> std;
+    bool NMinusOne = true;
+    stdOver3rdDimension(workflow_.res_, std, NMinusOne);
+    gt_io.export2DArrayComplex(std, this->gtPluse_ut_res_folder_+"StdandardDataR2_res_squeezed_std");
+}
+
+TYPED_TEST(gtPlus_grappa_Test, reconWorker2DTGRAPPA)
+{
+    typedef std::complex<float> T;
+
+    gtPlusIOAnalyze gt_io;
+
+    float v;
+
+    // image data
+    hoNDArray<float> real_data;
+    std::string filename = this->gtPluse_ut_data_folder_ + "underSampledKSpace_real";
+    gt_io.importArray(real_data, filename);
+    real_data.print(std::cout);
+
+    hoNDArray<float> imag_data;
+    filename = this->gtPluse_ut_data_folder_ + "underSampledKSpace_imag";
+    gt_io.importArray(imag_data, filename);
+    imag_data.print(std::cout);
+
+    hoNDArray<std::complex<float> > tmp;
+    Gadgetron::real_imag_to_complex<std::complex<float> >(real_data, imag_data, tmp);
+
+    unsigned long long RO = tmp.get_size(0);
+    unsigned long long E1 = tmp.get_size(1);
+    unsigned long long CHA = tmp.get_size(2);
+    unsigned long long PHS = tmp.get_size(3);
+
+    unsigned long long reconE1 = 120;
+
+    // [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+    unsigned long long SLC = 1;
+    unsigned long long E2 = 1;
+    unsigned long long CON = 1;
+    unsigned long long REP = 1;
+    unsigned long long SET = 1;
+    unsigned long long SEG = 1;
+
+    hoNDArray<std::complex<float> > kspace(RO, E1, CHA, SLC, E2, CON, PHS, tmp.begin());
+
+    Gadgetron::norm2(kspace, v);
+    GDEBUG_STREAM("kspace = " << v);
+
+    // ref
+    hoNDArray<float> real_ref;
+    filename = this->gtPluse_ut_data_folder_ + "ref_real";
+    gt_io.importArray(real_ref, filename);
+    real_ref.print(std::cout);
+
+    hoNDArray<float> imag_ref;
+    filename = this->gtPluse_ut_data_folder_ + "ref_imag";
+    gt_io.importArray(imag_ref, filename);
+    imag_ref.print(std::cout);
+
+    hoNDArray<T> ref;
+    real_imag_to_complex<std::complex<float> >(real_ref, imag_ref, ref);
+
+    Gadgetron::norm2(ref, v);
+    GDEBUG_STREAM("ref = " << v);
+
+    // call the recon
+    typedef std::complex<float> ValueType;
+    typedef Gadgetron::gtPlus::gtPlusReconWorkOrder2DT<ValueType> WorkOrderType;
+    typedef std::pair<Gadgetron::ISMRMRDDIM, unsigned long long> DimensionRecordType;
+
+    WorkOrderType* workOrder = new WorkOrderType;
+
+    workOrder->data_ = kspace;
+    workOrder->ref_ = ref;
+
+    boost::shared_ptr< std::vector<size_t> > dims = workOrder->data_.get_dimensions();
+
+    GDEBUG_STREAM("[Ro E1 Cha Slice E2 Con Phase Rep Set Seg] = [" 
+        << (*dims)[0] << " " << (*dims)[1] << " " << (*dims)[2] << " " << (*dims)[3] << " " << (*dims)[4] 
+        << " " << (*dims)[5] << " " << (*dims)[6] << " " << 1 << " " << 1 << " " << 1 << "]");
+
+    std::vector<size_t> dimensions_ = *dims;
+
+    // work flow
+    Gadgetron::gtPlus::gtPlusISMRMRDReconWorkFlowCartesian2DT<ValueType> workflow_;
+
+    // worker
+    Gadgetron::gtPlus::gtPlusReconWorker2DTGRAPPA<ValueType> worker_grappa_;
+
+    // parameters
+    Gadgetron::ISMRMRDDIM dim_4th_ = DIM_Phase;
+    Gadgetron::ISMRMRDDIM dim_5th_ = DIM_Slice;
+    Gadgetron::ISMRMRDDIM workOrder_ShareDim_ = DIM_NONE;
+
+    bool interleaved_same_combinationcoeff_allS_ = false;
+    int interleaved_whichS_combinationcoeff_ = 0;
+
+    bool embedded_averageall_ref_ = false;
+    bool embedded_fullres_coilmap_ = true;
+    bool embedded_same_combinationcoeff_allS_ = false;
+    int embedded_whichS_combinationcoeff_ = 0;
+    bool embedded_ref_fillback_ = true;
+
+    bool separate_averageall_ref_ = false;
+    bool separate_fullres_coilmap_ = true;
+    bool separate_same_combinationcoeff_allS_ = false;
+    int separate_whichS_combinationcoeff_ = 0;
+
+    bool same_coil_compression_coeff_allS_ = true;
+    bool downstream_coil_compression_ = true;
+    double coil_compression_thres_ = 1e-3;
+    int coil_compression_num_modesKept_ = -1;
+
+    unsigned long long csm_kSize_ = 7;
+    unsigned long long csm_powermethod_num_ = 3;
+
+    Gadgetron::ISMRMRDALGO recon_algorithm_ = ISMRMRD_GRAPPA;
+    bool recon_kspace_needed_ = true;
+
+    unsigned long long grappa_kSize_RO_ = 5;
+    unsigned long long grappa_kSize_E1_ = 4;
+    unsigned long long grappa_kSize_E2_ = 4;
+    double grappa_reg_lamda_ = 1e-4;
+
+    // recon
+    workflow_.setDataArray(kspace);
+    workflow_.setRefArray(ref);
+
+    Gadgetron::norm2(workOrder->data_, v); GDEBUG_STREAM("workOrder->data_ = " << v);
+    Gadgetron::norm2(workOrder->ref_, v); GDEBUG_STREAM("workOrder->ref_ = " << v);
+
+    workflow_.reconSizeRO_ = RO;
+    workflow_.reconSizeE1_ = reconE1;
+    workflow_.reconSizeE2_ = 1;
+    // workflow_.dataDimStartingIndexes_ = workOrder->dataDimStartingIndexes_;
+    workflow_.dim4th_ = dim_4th_;
+    workflow_.dim5th_ = dim_5th_;
+
+    workOrder->CalibMode_ = ISMRMRD_separate;
+    workOrder->start_RO_ = 34;
+    workOrder->end_RO_ = (int)RO-1;
+    workOrder->acceFactorE1_ = 4;
+    workOrder->acceFactorE2_ = 1;
+
+    workOrder->downstream_coil_compression_ = downstream_coil_compression_;
+    workOrder->coil_compression_thres_ = coil_compression_thres_;
+    workOrder->coil_compression_num_modesKept_ = coil_compression_num_modesKept_;
+    workOrder->csm_kSize_ = csm_kSize_;
+    workOrder->csm_powermethod_num_ = csm_powermethod_num_;
+    workOrder->grappa_kSize_RO_ = grappa_kSize_RO_;
+    workOrder->grappa_kSize_E1_ = grappa_kSize_E1_;
+    workOrder->grappa_kSize_E2_ = grappa_kSize_E2_;
+    workOrder->grappa_reg_lamda_ = grappa_reg_lamda_;
+    workOrder->recon_kspace_needed_ = recon_kspace_needed_;
+
+    if ( coil_compression_thres_>0 || coil_compression_num_modesKept_>0 )
+    {
+        workOrder->coil_compression_ = true;
+    }
+    else
+    {
+        workOrder->coil_compression_ = false;
+    }
+
+    workOrder->same_coil_compression_coeff_allS_ = same_coil_compression_coeff_allS_;
+    workOrder->embedded_averageall_ref_ = embedded_averageall_ref_;
+    workOrder->embedded_fullres_coilmap_ = embedded_fullres_coilmap_;
+    workOrder->embedded_same_combinationcoeff_allS_ = embedded_same_combinationcoeff_allS_;
+    workOrder->embedded_whichS_combinationcoeff_ = embedded_whichS_combinationcoeff_;
+    workOrder->embedded_ref_fillback_ = embedded_ref_fillback_;
+    workOrder->separate_averageall_ref_ = separate_averageall_ref_;
+    workOrder->separate_fullres_coilmap_ = separate_fullres_coilmap_;
+    workOrder->separate_same_combinationcoeff_allS_ = separate_same_combinationcoeff_allS_;
+    workOrder->separate_whichS_combinationcoeff_ = separate_whichS_combinationcoeff_;
+    workOrder->interleaved_same_combinationcoeff_allS_ = interleaved_same_combinationcoeff_allS_;
+    workOrder->interleaved_whichS_combinationcoeff_ = interleaved_whichS_combinationcoeff_;
+
+    worker_grappa_.performTiming_ = true;
+    worker_grappa_.debugFolder_ = this->gtPluse_ut_res_folder_;
+
+    workflow_.debugFolder_ = this->gtPluse_ut_res_folder_;
+    workflow_.worker_ = &worker_grappa_;
+    workflow_.workOrder_ = workOrder;
+
+    gt_io.exportArrayComplex(workflow_.workOrder_->ref_, this->gtPluse_ut_res_folder_+"ref");
+
+    workflow_.preProcessing();
+    workflow_.recon();
+    workflow_.postProcessing();
+
+    gt_io.exportArrayComplex(workflow_.res_, this->gtPluse_ut_res_folder_+"grappa2D_gtPlus_res");
+}
+
+TYPED_TEST(gtPlus_grappa_Test, grappa2D)
+{
+    typedef std::complex<float> T;
+
+    gtPlusIOAnalyze gt_io;
+
+    float v;
+
+    // image data
+    hoNDArray<float> real_data;
+    std::string filename = this->gtPluse_ut_data_folder_ + "underSampledKSpace_real";
+    gt_io.importArray(real_data, filename);
+    real_data.print(std::cout);
+
+    hoNDArray<float> imag_data;
+    filename = this->gtPluse_ut_data_folder_ + "underSampledKSpace_imag";
+    gt_io.importArray(imag_data, filename);
+    imag_data.print(std::cout);
+
+    hoNDArray<std::complex<float> > tmp;
+    Gadgetron::real_imag_to_complex<std::complex<float> >(real_data, imag_data, tmp);
+
+    unsigned long long RO = tmp.get_size(0);
+    unsigned long long E1 = tmp.get_size(1);
+    unsigned long long CHA = tmp.get_size(2);
+    unsigned long long PHS = tmp.get_size(3);
+
+    hoNDArray<std::complex<float> > kspace(RO, E1, CHA, PHS, tmp.begin());
+
+    // ref
+    hoNDArray<float> real_ref;
+    filename = this->gtPluse_ut_data_folder_ + "ref_real";
+    gt_io.importArray(real_ref, filename);
+    real_ref.print(std::cout);
+
+    hoNDArray<float> imag_ref;
+    filename = this->gtPluse_ut_data_folder_ + "ref_imag";
+    gt_io.importArray(imag_ref, filename);
+    imag_ref.print(std::cout);
+
+    hoNDArray<T> ref;
+    real_imag_to_complex<std::complex<float> >(real_ref, imag_ref, ref);
+
+    Gadgetron::norm2(ref, v);
+    GDEBUG_STREAM("ref = " << v);
+
+    // recon
+    gtPlusISMRMRDReconUtil<std::complex<float> > util;
+    gtPlusISMRMRDReconUtilComplex<std::complex<float> > utilCplx;
+
+    // sum of square
+    hoNDArray<std::complex<float> > complexIm, sosIm;
+
+    GadgetronTimer timer(false);
+    timer.start("ifft2c");
+    hoNDFFT<float>::instance()->ifft2c(kspace, complexIm);
+    timer.stop();
+
+    timer.start("sumOfSquare");
+    utilCplx.sumOfSquare(complexIm, sosIm);
+    timer.stop();
+
+    hoNDArray<float> magSoS;
+    timer.start("absolute");
+    Gadgetron::abs(sosIm, magSoS);
+    timer.stop();
+
+    filename = this->gtPluse_ut_res_folder_ + "SoS";
+    gt_io.exportArray(magSoS, filename);
+
+    // coil map estimation
+    hoNDFFT<float>::instance()->ifft2c(ref, complexIm);
+
+    filename = this->gtPluse_ut_res_folder_ + "complexIm";
+    gt_io.export3DArrayComplex(complexIm, filename);
+
+    hoNDArray<std::complex<float> > coilMap;
+    timer.start("coilMap2DNIH");
+    utilCplx.coilMap2DNIH(complexIm, coilMap, ISMRMRD_SOUHEIL, 7, 3, 3, true);
+    timer.stop();
+
+    filename = this->gtPluse_ut_res_folder_ + "coilMap";
+    gt_io.export3DArrayComplex(coilMap, filename);
+
+    // grappa kernel estimation
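+    // the steps below: calibrate the GRAPPA convolution kernel from the ACS data, convert it
+    // to an image-domain kernel, derive unmixing coefficients and the g-factor map from the
+    // coil maps, and finally apply them to the undersampled k-space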
+    gtPlusReconWorker2DTGRAPPA<T> grappa;
+
+    unsigned long long kRO = 5;
+    unsigned long long kNE1 = 4;
+    unsigned long long srcCHA = CHA;
+    unsigned long long dstCHA = CHA;
+
+    double grappa_reg_lamda_ = 1e-4;
+
+    ho3DArray<T> acsSrc(RO, E1, CHA, const_cast<T*>(ref.begin()));
+    ho3DArray<T> acsDst(RO, E1, CHA, const_cast<T*>(ref.begin()));
+
+    Gadgetron::norm2(acsSrc, v);
+    GDEBUG_STREAM("acsSrc = " << v);
+
+    int accelFactor = 4;
+    bool fitItself = true;
+
+    ho4DArray<T> convKer;
+    timer.start("grappa2d_calib_convolution_kernel");
+    Gadgetron::grappa2d_calib_convolution_kernel(acsSrc, acsDst, accelFactor, grappa_reg_lamda_, kRO, kNE1, convKer);
+    timer.stop();
+
+    Gadgetron::norm2(convKer, v);
+    GDEBUG_STREAM("convKer = " << v);
+    gt_io.exportArrayComplex(convKer, this->gtPluse_ut_res_folder_ + "convKer");
+
+    ho4DArray<T> kIm(RO, E1, srcCHA, dstCHA);
+    timer.start("grappa2d_image_domain_kernel");
+    Gadgetron::grappa2d_image_domain_kernel(convKer, RO, E1, kIm);
+    timer.stop();
+    gt_io.exportArrayComplex(kIm, this->gtPluse_ut_res_folder_ + "kIm");
+
+    Gadgetron::norm2(kIm, v);
+    GDEBUG_STREAM("kIm = " << v);
+
+    ho3DArray<T> unmixC(RO, E1, srcCHA);
+    ho2DArray<float> gFactor(RO, E1);
+
+    ho3DArray<T> coilMap2(RO, E1, dstCHA, coilMap.begin());
+
+    Gadgetron::norm2(coilMap2, v);
+    GDEBUG_STREAM("coilMap2 = " << v);
+
+    Gadgetron::grappa2d_unmixing_coeff(kIm, coilMap2, accelFactor, unmixC, gFactor);
+
+    Gadgetron::norm2(unmixC, v);
+    GDEBUG_STREAM("unmixC = " << v);
+
+    gt_io.export3DArrayComplex(unmixC, this->gtPluse_ut_res_folder_ + "unmixC");
+    gt_io.export2DArray(gFactor, this->gtPluse_ut_res_folder_ + "gFactor");
+
+    // unwrapping
+    hoNDArray<T> res;
+    grappa.applyImageDomainKernel(kspace, kIm, res);
+    gt_io.export3DArrayComplex(res, this->gtPluse_ut_res_folder_ + "grappa2D_res");
+
+    Gadgetron::apply_unmix_coeff_kspace(kspace, unmixC, res);
+    gt_io.export2DArrayComplex(res, this->gtPluse_ut_res_folder_ + "res_unmixC");
+}
diff --git a/toolboxes/gtplus/ut/gtplus_ut.cpp b/toolboxes/gtplus/ut/gtplus_ut.cpp
new file mode 100644
index 0000000..f338482
--- /dev/null
+++ b/toolboxes/gtplus/ut/gtplus_ut.cpp
@@ -0,0 +1,16 @@
+/*
+ * tests.cpp
+ *
+ *  Created on: Feb 28, 2013
+ *      Author: Dae
+ */
+
+#include <gtest/gtest.h>
+
+int main(int argc, char **argv)
+{
+    //::testing::GTEST_FLAG(filter) = "*grappa*:*spirit*";
+
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
diff --git a/toolboxes/gtplus/util/gtPlusIOAnalyze.cpp b/toolboxes/gtplus/util/gtPlusIOAnalyze.cpp
new file mode 100644
index 0000000..67339d1
--- /dev/null
+++ b/toolboxes/gtplus/util/gtPlusIOAnalyze.cpp
@@ -0,0 +1,100 @@
+/** \file       gtPlusIOAnalyze.cpp
+    \brief      Implement the support for the Analyze75 medical image format
+    \author     Hui Xue
+
+    Ref to:
+    http://eeg.sourceforge.net/ANALYZE75.pdf
+*/
+
+#include <gtPlusIOAnalyze.h>
+
+// to support the ISMRMRD format
+// [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+
+namespace Gadgetron { namespace gtPlus {
+
+gtPlusIOAnalyze::gtPlusIOAnalyze() : BaseClass()
+{
+}
+
+gtPlusIOAnalyze::gtPlusIOAnalyze(float px, float py) : BaseClass(px, py)
+{
+}
+
+gtPlusIOAnalyze::gtPlusIOAnalyze(float px, float py, float pz) : BaseClass(px, py, pz)
+{
+}
+
+gtPlusIOAnalyze::gtPlusIOAnalyze(float px, float py, float pz, float pt) : BaseClass(px, py, pz, pt)
+{
+}
+
+gtPlusIOAnalyze::gtPlusIOAnalyze(float px, float py, float pz, float pt, float pr) : BaseClass(px, py, pz, pt, pr)
+{
+}
+
+gtPlusIOAnalyze::gtPlusIOAnalyze(float px, float py, float pz, float pt, float pr, float ps) : BaseClass(px, py, pz, pt, pr, ps)
+{
+}
+
+gtPlusIOAnalyze::gtPlusIOAnalyze(float px, float py, float pz, float pt, float pr, float ps, float pp) : BaseClass(px, py, pz, pt, pr, ps, pp)
+{
+}
+
+gtPlusIOAnalyze::gtPlusIOAnalyze(float px, float py, float pz, float pt, float pr, float ps, float pp, float pq) : BaseClass(px, py, pz, pt, pr, ps, pp, pq)
+{
+}
+
+void gtPlusIOAnalyze::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus Array input/output to Analyze75 format -------------" << endl;
+    os << "--------------------------------------------------------------------------" << endl;
+}
+
+bool gtPlusIOAnalyze::readHeader(const std::string& filename, HeaderType& header)
+{
+    try
+    {
+        std::string filenameData = filename;
+        filenameData.append(".hdr");
+
+        gtPlusIOWorker ioworker(filenameData, true);
+
+        GADGET_CHECK_RETURN_FALSE(ioworker.open());
+        GADGET_CHECK_RETURN_FALSE(ioworker.read(reinterpret_cast<char*>(&header), sizeof(dsr)));
+        GADGET_CHECK_RETURN_FALSE(ioworker.close());
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusIOAnalyze::readHeader(const std::string& filename, dsr& header) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+bool gtPlusIOAnalyze::writeHeader(const std::string& filename, const HeaderType& header)
+{
+    try
+    {
+        std::string filenameData = filename;
+        filenameData.append(".hdr");
+
+        gtPlusIOWorker ioworker(filenameData, false);
+
+        GADGET_CHECK_RETURN_FALSE(ioworker.open());
+        GADGET_CHECK_RETURN_FALSE(ioworker.write(reinterpret_cast<const char*>(&header), sizeof(dsr)));
+        GADGET_CHECK_RETURN_FALSE(ioworker.close());
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusIOAnalyze::writeHeader(const std::string& filename, const dsr& header) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/util/gtPlusIOAnalyze.h b/toolboxes/gtplus/util/gtPlusIOAnalyze.h
new file mode 100644
index 0000000..8c318c6
--- /dev/null
+++ b/toolboxes/gtplus/util/gtPlusIOAnalyze.h
@@ -0,0 +1,937 @@
+/** \file       gtPlusIOAnalyze.h
+    \brief      Implement the support for the Analyze75 medical image format
+    \author     Hui Xue
+
+    The ISMRMRD dimensions are mapped to Analyze75 format.
+
+    Ref to:
+    http://eeg.sourceforge.net/ANALYZE75.pdf
+    http://ismrmrd.sourceforge.net/
+*/
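+
+// A minimal usage sketch (illustrative only; the file names are placeholders):
+// arrays are written as an Analyze75 .hdr/.img pair.
+//
+//     Gadgetron::gtPlus::gtPlusIOAnalyze gt_io;
+//     hoNDArray<float> a;
+//     gt_io.exportArray(a, "/tmp/my_volume");      // writes my_volume.hdr and my_volume.img
+//
+//     hoNDArray< std::complex<float> > c;
+//     gt_io.importArrayComplex(c, "/tmp/data_real", "/tmp/data_imag");   // as used in the unit tests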
+
+#pragma once
+
+#include "gtPlusIOBase.h"
+
+// the file input/output utility functions for the Analyze format
+
+// the following Analyze75 data structures are defined following this online document: eeg.sourceforge.net/ANALYZE75.pdf
+
+// the official definition of Analyze 7.5 file format
+struct header_key
+{
+    int sizeof_hdr;
+    char data_type[10];
+    char db_name[18];
+    int extents;
+    short int session_error;
+    char regular;
+    char hkey_un0;
+};
+
+struct image_dimension
+{
+    short int dim[8];
+    short int unused8;
+    short int unused9;
+    short int unused10;
+    short int unused11;
+    short int unused12;
+    short int unused13;
+    short int unused14;
+    short int datatype;
+    short int bitpix;
+    short int dim_un0;
+    float pixdim[8];
+    float vox_offset;
+    float funused1;
+    float funused2;
+    float funused3;
+    float cal_max;
+    float cal_min;
+    float compressed;
+    float verified;
+    int glmax,glmin;
+};
+
+struct data_history
+{
+    char descrip[80];
+    char aux_file[24];
+    char orient;
+    char originator[10];
+    char generated[10];
+    char scannum[10];
+    char patient_id[10];
+    char exp_date[10];
+    char exp_time[10];
+    char hist_un0[3];
+    int views;
+    int vols_added;
+    int start_field;
+    int field_skip;
+    int omax, omin;
+    int smax, smin;
+};
+
+// Analyze75 header has 348 bytes
+struct dsr
+{
+    struct header_key hk;
+    struct image_dimension dime;
+    struct data_history hist;
+};
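+
+// A compile-time guard such as the following could catch unexpected struct padding
+// (sketch only; assumes a C++11 compiler and the usual 4-byte alignment):
+//
+//     static_assert(sizeof(dsr) == 348, "Analyze75 header (dsr) must be 348 bytes");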
+
+// to support the ISMRMRD format
+// [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+
+namespace Gadgetron { namespace gtPlus {
+
+class EXPORTGTPLUSIO gtPlusIOAnalyze : public gtPlusIOBase<dsr>
+{
+public:
+
+    typedef gtPlusIOBase<dsr> BaseClass;
+    typedef BaseClass::THeaderType HeaderType;
+
+    gtPlusIOAnalyze();
+    gtPlusIOAnalyze(float px, float py);
+    gtPlusIOAnalyze(float px, float py, float pz);
+    gtPlusIOAnalyze(float px, float py, float pz, float pt);
+    gtPlusIOAnalyze(float px, float py, float pz, float pt, float pr);
+    gtPlusIOAnalyze(float px, float py, float pz, float pt, float pr, float ps);
+    gtPlusIOAnalyze(float px, float py, float pz, float pt, float pr, float ps, float pp);
+    gtPlusIOAnalyze(float px, float py, float pz, float pt, float pr, float ps, float pp, float pq);
+
+    virtual ~gtPlusIOAnalyze() {}
+
+    virtual void printInfo(std::ostream& os);
+
+    virtual bool exportArray(const hoNDArray<short>& a, const std::string& filename) { return this->exportArrayImpl(a, filename); }
+
+    virtual bool exportArray(const hoNDArray<unsigned short>& a, const std::string& filename) { return this->exportArrayImpl(a, filename); }
+    virtual bool exportArray(const hoNDArray<int>& a, const std::string& filename) { return this->exportArrayImpl(a, filename); }
+    virtual bool exportArray(const hoNDArray<unsigned int>& a, const std::string& filename) { return this->exportArrayImpl(a, filename); }
+    virtual bool exportArray(const hoNDArray<size_t>& a, const std::string& filename) { return this->exportArrayImpl(a, filename); }
+    virtual bool exportArray(const hoNDArray<float>& a, const std::string& filename) { return this->exportArrayImpl(a, filename); }
+    virtual bool exportArray(const hoNDArray<double>& a, const std::string& filename) { return this->exportArrayImpl(a, filename); }
+    virtual bool exportArray(const hoNDArray< std::complex<float> >& a, const std::string& filename) { return this->exportArrayImpl(a, filename); }
+    virtual bool exportArray(const hoNDArray< std::complex<double> >& a, const std::string& filename) { return this->exportArrayImpl(a, filename); }
+
+    virtual bool importArray(hoNDArray<short>& a, const std::string& filename) { return this->importArrayImpl(a, filename); }
+    virtual bool importArray(hoNDArray<unsigned short>& a, const std::string& filename) { return this->importArrayImpl(a, filename); }
+    virtual bool importArray(hoNDArray<int>& a, const std::string& filename) { return this->importArrayImpl(a, filename); }
+    virtual bool importArray(hoNDArray<unsigned int>& a, const std::string& filename) { return this->importArrayImpl(a, filename); }
+    virtual bool importArray(hoNDArray<float>& a, const std::string& filename) { return this->importArrayImpl(a, filename); }
+    virtual bool importArray(hoNDArray<double>& a, const std::string& filename) { return this->importArrayImpl(a, filename); }
+    virtual bool importArray(hoNDArray< std::complex<float> >& a, const std::string& filename) { return this->importArrayImpl(a, filename); }
+    virtual bool importArray(hoNDArray< std::complex<double> >& a, const std::string& filename) { return this->importArrayImpl(a, filename); }
+
+    template <typename T> 
+    bool exportArrayImpl(const hoNDArray<T>& a, const std::string& filename)
+    {
+        try
+        {
+            HeaderType header;
+            GADGET_CHECK_RETURN_FALSE(this->array2Header(a, header));
+            GADGET_CHECK_RETURN_FALSE(this->writeHeader(filename, header));
+
+            std::string filenameData = filename;
+            filenameData.append(".img");
+            GADGET_CHECK_RETURN_FALSE(this->writeData(filenameData, a.begin(), a.get_number_of_bytes()));
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in gtPlusIOAnalyze::exportArrayImpl(const hoNDArray<T>& a, const std::string& filename) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T> 
+    bool importArrayImpl(hoNDArray<T>& a, const std::string& filename)
+    {
+        try
+        {
+            HeaderType header;
+            GADGET_CHECK_RETURN_FALSE(this->readHeader(filename, header));
+            GADGET_CHECK_RETURN_FALSE(this->header2Array(a, header));
+
+            std::string filenameData = filename;
+            filenameData.append(".img");
+            GADGET_CHECK_RETURN_FALSE(this->readData(filenameData, a.begin(), a.get_number_of_bytes()));
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in gtPlusIOAnalyze::importArrayImpl(const hoNDArray<T>& a, const std::string& filename) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T, unsigned int D> 
+    bool exportImage(const hoNDImage<T,D>& a, const std::string& filename)
+    {
+        try
+        {
+            HeaderType header;
+            GADGET_CHECK_RETURN_FALSE(this->image2Header(a, header));
+            GADGET_CHECK_RETURN_FALSE(this->writeHeader(filename, header));
+
+            std::string filenameData = filename;
+            filenameData.append(".img");
+            GADGET_CHECK_RETURN_FALSE(this->writeData(filenameData, a.begin(), a.get_number_of_bytes()));
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in exportImage(const hoNDImage<T,D>& a, const std::string& filename) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T, unsigned int D> 
+    bool importImage(hoNDImage<T,D>& a, const std::string& filename)
+    {
+        try
+        {
+            HeaderType header;
+            GADGET_CHECK_RETURN_FALSE(this->readHeader(filename, header));
+            GADGET_CHECK_RETURN_FALSE(this->header2Image(a, header));
+
+            std::string filenameData = filename;
+            filenameData.append(".img");
+            GADGET_CHECK_RETURN_FALSE(this->readData(filenameData, a.begin(), a.get_number_of_bytes()));
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in importImage(const hoNDImage<T,D>& a, const std::string& filename) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+/// image functions
+
+    template <typename T, unsigned int D> 
+    bool exportImageComplex(const hoNDImage<T,D>& a, const std::string& filename)
+    {
+        try
+        {
+            typedef typename Gadgetron::realType<T>::Type value_type;
+
+            //hoNDImage<value_type, D> buf;
+            //GADGET_CHECK_RETURN_FALSE(Gadgetron::complex_to_real(a, buf));
+
+            //std::string filenameReal = filename;
+            //filenameReal.append("_REAL");
+            //GADGET_CHECK_RETURN_FALSE(exportImage(buf, filenameReal));
+
+            //GADGET_CHECK_RETURN_FALSE(Gadgetron::complex_to_imag(a, buf));
+            //std::string filenameImag = filename;
+            //filenameImag.append("_IMAG");
+            //GADGET_CHECK_RETURN_FALSE(exportImage(buf, filenameImag));
+
+            //GADGET_CHECK_RETURN_FALSE(Gadgetron::abs(a, buf));
+            //std::string filenameMag = filename;
+            //filenameMag.append("_MAG");
+            //GADGET_CHECK_RETURN_FALSE(exportImage(buf, filenameMag));
+
+            //GADGET_CHECK_RETURN_FALSE(Gadgetron::argument(a, buf));
+            //std::string filenamePhase = filename;
+            //filenamePhase.append("_PHASE");
+            //GADGET_CHECK_RETURN_FALSE(exportImage(buf, filenamePhase));
+
+            long long num = (long long)a.get_number_of_elements();
+
+            long long n;
+
+            hoNDImage<value_type, D> rpart, ipart, mag, phs;
+            rpart.create( *a.get_dimensions() );
+            ipart.create( *a.get_dimensions() );
+            mag.create( *a.get_dimensions() );
+            phs.create( *a.get_dimensions() );
+
+            const T* pA = a.begin();
+
+            #pragma omp parallel for default(none) private(n) shared(num, pA, rpart, ipart, mag, phs)
+            for ( n=0; n<num; n++ )
+            {
+                rpart(n) = pA[n].real();
+                ipart(n) = pA[n].imag();
+                mag(n) = std::abs( pA[n] );
+                phs(n) = std::arg( pA[n] );
+            }
+
+            std::string filenameReal = filename;
+            filenameReal.append("_REAL");
+            GADGET_CHECK_RETURN_FALSE(exportImage(rpart, filenameReal));
+
+            std::string filenameImag = filename;
+            filenameImag.append("_IMAG");
+            GADGET_CHECK_RETURN_FALSE(exportImage(ipart, filenameImag));
+
+            std::string filenameMag = filename;
+            filenameMag.append("_MAG");
+            GADGET_CHECK_RETURN_FALSE(exportImage(mag, filenameMag));
+
+            std::string filenamePhase = filename;
+            filenamePhase.append("_PHASE");
+            GADGET_CHECK_RETURN_FALSE(exportImage(phs, filenamePhase));
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in exportImageComplex(const hoNDImage<T,D>& a, const std::string& filename) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T, unsigned int D> 
+    bool importImageComplex(hoNDImage<T,D>& a, const std::string& filename)
+    {
+        try
+        {
+            typedef typename T::value_type value_type;
+            hoNDImage<value_type, D> real, imag;
+
+            std::string filenameReal = filename;
+            filenameReal.append("_REAL");
+            GADGET_CHECK_RETURN_FALSE(importImage(real, filenameReal));
+
+            std::string filenameImag = filename;
+            filenameImag.append("_IMAG");
+            GADGET_CHECK_RETURN_FALSE(importImage(imag, filenameImag));
+
+            a.create(real.get_dimensions());
+            long long num = (long long)a.get_number_of_elements();
+
+            long long n;
+            T* pA = a.begin();
+
+            #pragma omp parallel for default(none) private(n) shared(num, pA, real, imag)
+            for ( n=0; n<num; n++ )
+            {
+                pA[n] = T( real(n), imag(n) );
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in importImageComplex(const hoNDImage<T,D>& a, const std::string& filename) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T, unsigned int D> 
+    bool importImageComplex(hoNDImage<T,D>& a, const std::string& filename_real, const std::string& filename_imag)
+    {
+        try
+        {
+            typedef typename realType<T>::Type value_type;
+            hoNDImage<value_type, D> real, imag;
+
+            GADGET_CHECK_RETURN_FALSE(importImage(real, filename_real));
+            GADGET_CHECK_RETURN_FALSE(importImage(imag, filename_imag));
+
+            a.create(real.get_dimensions());
+            long long num = (long long)a.get_number_of_elements();
+
+            long long n;
+            T* pA = a.begin();
+
+            #pragma omp parallel for default(none) private(n) shared(num, pA, real, imag)
+            for ( n=0; n<num; n++ )
+            {
+                pA[n] = T( real(n), imag(n) );
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in importImageComplex(hoNDImage<T,D>& a, const std::string& filename_real, const std::string& filename_imag) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T> 
+    bool export2DImage(const hoNDImage<T,2>& a, const std::string& filename)
+    {
+        return exportImage(a, filename);
+    }
+
+    template <typename T> 
+    bool import2DImage(hoNDImage<T,2>& a, const std::string& filename)
+    {
+        return importImage(a, filename);
+    }
+
+    template <typename T> 
+    bool export2DImageComplex(const hoNDImage<T,2>& a, const std::string& filename)
+    {
+        return exportImageComplex(a, filename);
+    }
+
+    template <typename T> 
+    bool import2DImageComplex(hoNDImage<T,2>& a, const std::string& filename)
+    {
+        return importImageComplex(a, filename);
+    }
+
+    template <typename T> 
+    bool export3DImage(const hoNDImage<T,3>& a, const std::string& filename)
+    {
+        return exportImage(a, filename);
+    }
+
+    template <typename T> 
+    bool import3DImage(hoNDImage<T,3>& a, const std::string& filename)
+    {
+        return importImage(a, filename);
+    }
+
+    template <typename T> 
+    bool export3DImageComplex(const hoNDImage<T,3>& a, const std::string& filename)
+    {
+        return exportImageComplex(a, filename);
+    }
+
+    template <typename T> 
+    bool import3DImageComplex(hoNDImage<T,3>& a, const std::string& filename)
+    {
+        return importImageComplex(a, filename);
+    }
+
+    template <typename T> 
+    bool export4DImage(const hoNDImage<T,4>& a, const std::string& filename)
+    {
+        try
+        {
+            size_t RO     = a.get_size(0);
+            size_t E1     = a.get_size(1);
+            size_t CHA    = a.get_size(2);
+            size_t N      = a.get_size(3);
+
+            size_t ii;
+            for (ii=0; ii<N; ii++ )
+            {
+                std::vector<size_t> dim(3);
+                dim[0] = RO;
+                dim[1] = E1;
+                dim[2] = CHA;
+
+                hoNDImage<T, 3> a3D(dim, const_cast<T*>(a.begin()+ii*RO*E1*CHA), false);
+
+                std::ostringstream ostr;
+                ostr << filename << "_" << ii << std::ends;
+                GADGET_CHECK_RETURN_FALSE(export3DImage(a3D, ostr.str()));
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in export4DImage(const hoNDImage<T>& a, const std::string& filename) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T> 
+    bool export4DImageComplex(const hoNDImage<T,4>& a, const std::string& filename)
+    {
+        try
+        {
+            size_t RO     = a.get_size(0);
+            size_t E1     = a.get_size(1);
+            size_t CHA    = a.get_size(2);
+            size_t N      = a.get_size(3);
+
+            size_t ii;
+            for (ii=0; ii<N; ii++ )
+            {
+                std::vector<size_t> dim(3);
+                dim[0] = RO;
+                dim[1] = E1;
+                dim[2] = CHA;
+
+                hoNDImage<T, 3> a3D(dim, const_cast<T*>(a.begin()+ii*RO*E1*CHA), false);
+
+                std::ostringstream ostr;
+                ostr << filename << "_" << ii << std::ends;
+                GADGET_CHECK_RETURN_FALSE(export3DImageComplex(a3D, ostr.str()));
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in export4DImageComplex(const hoNDImage<T>& a, const std::string& filename) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+protected:
+
+    template <typename T> bool array2Header(const hoNDArray<T>& a, HeaderType& header);
+    template <typename T> bool header2Array(hoNDArray<T>& a, const HeaderType& header);
+
+    template <typename T, unsigned int D> bool image2Header(const hoNDImage<T, D>& a, HeaderType& header);
+    template <typename T, unsigned int D> bool header2Image(hoNDImage<T, D>& a, const HeaderType& header);
+
+    // read/write the analyze header
+    bool readHeader(const std::string& filename, HeaderType& header);
+    bool writeHeader(const std::string& filename, const HeaderType& header);
+};
+
+template <typename T> 
+bool gtPlusIOAnalyze::array2Header(const hoNDArray<T>& a, HeaderType& header)
+{
+    try
+    {
+        // set everything to zero
+        memset(&header, 0, sizeof(dsr));
+
+        // header_key
+        header.hk.sizeof_hdr = 348;
+        size_t i;
+        for (i=0; i<10; i++ ) header.hk.data_type[i] = 0;
+        for (i=0; i<18; i++ ) header.hk.db_name[i] = 0;
+        header.hk.extents = 16384;
+        header.hk.session_error = 0;
+        header.hk.regular = 'r';
+        header.hk.hkey_un0 = 0;
+
+        // image_dimension
+        size_t NDim = a.get_number_of_dimensions();
+
+        header.dime.dim[0] = (short)(NDim);
+        header.dime.dim[1] = (short)(a.get_size(0));
+
+        if ( NDim > 1 )
+            header.dime.dim[2] = (short)(a.get_size(1));
+        else
+            header.dime.dim[2] = 1;
+
+        if ( NDim > 2 )
+            header.dime.dim[3] = (short)(a.get_size(2));
+        else
+            header.dime.dim[3] = 1;
+
+        if ( NDim > 3 )
+            header.dime.dim[4] = (short)(a.get_size(3));
+        else
+            header.dime.dim[4] = 1;
+
+        if ( NDim > 4 )
+            header.dime.dim[5] = (short)(a.get_size(4));
+        else
+            header.dime.dim[5] = 1;
+
+        if ( NDim > 5 )
+            header.dime.dim[6] = (short)(a.get_size(5));
+        else
+            header.dime.dim[6] = 1;
+
+        if ( NDim > 6 )
+            header.dime.dim[7] = (short)(a.get_size(6));
+        else
+            header.dime.dim[7] = 1;
+
+        if ( NDim > 7 )
+            header.dime.unused8 = (short)(a.get_size(7));
+        else
+            header.dime.unused8 = 1;
+
+        if ( NDim > 8 )
+            header.dime.unused9 = (short)(a.get_size(8));
+        else
+            header.dime.unused9 = 1;
+
+        if ( NDim > 9 )
+            header.dime.unused10 = (short)(a.get_size(9));
+        else
+            header.dime.unused10 = 1;
+
+        header.dime.unused11 = 0;
+        header.dime.unused12 = 0;
+        header.dime.unused13 = 0;
+        header.dime.unused14 = 0;
+
+        std::string rttiID = std::string(typeid(T).name());
+        header.dime.datatype = (short)getDataTypeFromRTTI(rttiID);
+        header.dime.bitpix = (short)(8*sizeof(T));
+        header.dime.dim_un0 = 0;
+
+        // since the NDArray does not carry the pixel spacing
+        header.dime.pixdim[0] = 0;
+        if ( pixelSize_.size() > 1 )
+            header.dime.pixdim[1] = pixelSize_[0];
+        if ( pixelSize_.size() > 2 )
+            header.dime.pixdim[2] = pixelSize_[1];
+        if ( pixelSize_.size() > 3 )
+            header.dime.pixdim[3] = pixelSize_[2];
+        if ( pixelSize_.size() > 4 )
+            header.dime.pixdim[4] = pixelSize_[3];
+        if ( pixelSize_.size() > 5 )
+            header.dime.pixdim[5] = pixelSize_[4];
+        if ( pixelSize_.size() > 6 )
+            header.dime.pixdim[6] = pixelSize_[5];
+        if ( pixelSize_.size() > 7 )
+            header.dime.pixdim[7] = pixelSize_[6];
+
+        header.dime.vox_offset = 0;
+        header.dime.funused1 = 0;
+        header.dime.funused2 = 0;
+        header.dime.funused3 = 0;
+        header.dime.cal_max = 0;
+        header.dime.cal_min = 0;
+        header.dime.compressed = 0;
+        header.dime.verified = 0;
+        header.dime.glmax = 0;
+        header.dime.glmin = 0;
+
+        // data history
+        for (i=0; i<80; i++ ) header.hist.descrip[i] = 0;
+        for (i=0; i<24; i++ ) header.hist.aux_file[i] = 0;
+        header.hist.orient = 0;
+        for (i=0; i<10; i++ ) header.hist.originator[i] = 0;
+        for (i=0; i<10; i++ ) header.hist.generated[i] = 0;
+        for (i=0; i<10; i++ ) header.hist.scannum[i] = 0;
+        for (i=0; i<10; i++ ) header.hist.patient_id[i] = 0;
+        for (i=0; i<10; i++ ) header.hist.exp_date[i] = 0;
+        for (i=0; i<10; i++ ) header.hist.exp_time[i] = 0;
+        for (i=0; i<3; i++ ) header.hist.hist_un0[i] = 0;
+        header.hist.views = 0;
+        header.hist.vols_added = 0;
+        header.hist.start_field = 0;
+        header.hist.field_skip = 0;
+        header.hist.omax = 0;
+        header.hist.omin = 0;
+        header.hist.smax = 0;
+        header.hist.smin = 0;
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusIOAnalyze::array2Analyze(const hoNDArray<T>& a, dsr& header) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusIOAnalyze::header2Array(hoNDArray<T>& a, const HeaderType& header)
+{
+    try
+    {
+        std::string rttiID = std::string(typeid(T).name());
+        GADGET_CHECK_RETURN_FALSE(rttiID==getRTTIFromDataType( (GtDataType)header.dime.datatype));
+
+        std::vector<size_t> dim(header.dime.dim[0]);
+        size_t ii;
+        for ( ii=0; ii<dim.size(); ii++ )
+        {
+            if ( ii == 7 )
+            {
+                dim[ii] = header.dime.unused8;
+            }
+            else if ( ii == 8 )
+            {
+                dim[ii] = header.dime.unused9;
+            }
+            else if ( ii == 9 ) 
+            {
+                dim[ii] = header.dime.unused10;
+            }
+            else
+            {
+                dim[ii] = header.dime.dim[ii+1];
+            }
+        }
+
+        pixelSize_.resize(dim.size());
+        for ( ii=0; ii<dim.size(); ii++ )
+        {
+            if ( ii < 7 )
+            {
+                pixelSize_[ii] = header.dime.pixdim[ii+1];
+            }
+        }
+
+        a.create(&dim);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusIOAnalyze::analyze2Array(hoNDArray<T>& a, const dsr& header) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, unsigned int D> 
+bool gtPlusIOAnalyze::image2Header(const hoNDImage<T,D>& a, HeaderType& header)
+{
+    try
+    {
+        typedef typename hoNDImage<T,D>::coord_type coord_type;
+
+        // set everything to zero
+        memset(&header, 0, sizeof(dsr));
+
+        // header_key
+        header.hk.sizeof_hdr = 348;
+        size_t i;
+        for (i=0; i<10; i++ ) header.hk.data_type[i] = 0;
+        for (i=0; i<18; i++ ) header.hk.db_name[i] = 0;
+        header.hk.extents = 16384;
+        header.hk.session_error = 0;
+        header.hk.regular = 'r';
+        header.hk.hkey_un0 = 0;
+
+        // image_dimension
+        size_t NDim = D;
+
+        header.dime.dim[0] = (short)(NDim);
+        header.dime.dim[1] = (short)(a.get_size(0));
+
+        if ( NDim > 1 )
+            header.dime.dim[2] = (short)(a.get_size(1));
+        else
+            header.dime.dim[2] = 1;
+
+        if ( NDim > 2 )
+            header.dime.dim[3] = (short)(a.get_size(2));
+        else
+            header.dime.dim[3] = 1;
+
+        if ( NDim > 3 )
+            header.dime.dim[4] = (short)(a.get_size(3));
+        else
+            header.dime.dim[4] = 1;
+
+        if ( NDim > 4 )
+            header.dime.dim[5] = (short)(a.get_size(4));
+        else
+            header.dime.dim[5] = 1;
+
+        if ( NDim > 5 )
+            header.dime.dim[6] = (short)(a.get_size(5));
+        else
+            header.dime.dim[6] = 1;
+
+        if ( NDim > 6 )
+            header.dime.dim[7] = (short)(a.get_size(6));
+        else
+            header.dime.dim[7] = 1;
+
+        if ( NDim > 7 )
+            header.dime.unused8 = (short)(a.get_size(7));
+        else
+            header.dime.unused8 = 1;
+
+        if ( NDim > 8 )
+            header.dime.unused9 = (short)(a.get_size(8));
+        else
+            header.dime.unused9 = 1;
+
+        if ( NDim > 9 )
+            header.dime.unused10 = (short)(a.get_size(9));
+        else
+            header.dime.unused10 = 1;
+
+        header.dime.unused11 = 0;
+        header.dime.unused12 = 0;
+        header.dime.unused13 = 0;
+        header.dime.unused14 = 0;
+
+        std::string rttiID = std::string(typeid(T).name());
+        header.dime.datatype = (short)getDataTypeFromRTTI(rttiID);
+        header.dime.bitpix = (short)(8*sizeof(T));
+        header.dime.dim_un0 = 0;
+
+        header.dime.pixdim[0] = 0;
+        header.dime.pixdim[1] = a.get_pixel_size(0);
+        header.dime.pixdim[2] = 1;
+        header.dime.pixdim[3] = 1;
+        if ( NDim > 1 )
+            header.dime.pixdim[2] = a.get_pixel_size(1);
+        if ( NDim > 2 )
+            header.dime.pixdim[3] = a.get_pixel_size(2);
+        if ( NDim > 3 )
+            header.dime.pixdim[4] = a.get_pixel_size(3);
+        if ( NDim > 4 )
+            header.dime.pixdim[5] = a.get_pixel_size(4);
+        if ( NDim > 5 )
+            header.dime.pixdim[6] = a.get_pixel_size(5);
+        if ( NDim > 6 )
+            header.dime.pixdim[7] = a.get_pixel_size(6);
+
+        header.dime.vox_offset = 0;
+        header.dime.funused1 = 0;
+        header.dime.funused2 = 0;
+        header.dime.funused3 = 0;
+        header.dime.cal_max = 0;
+        header.dime.cal_min = 0;
+        header.dime.compressed = 0;
+        header.dime.verified = 0;
+        header.dime.glmax = 0;
+        header.dime.glmin = 0;
+
+        // data history
+        for (i=0; i<80; i++ ) header.hist.descrip[i] = 0;
+        for (i=0; i<24; i++ ) header.hist.aux_file[i] = 0;
+        header.hist.orient = 0;
+        for (i=0; i<10; i++ ) header.hist.originator[i] = 0;
+        for (i=0; i<10; i++ ) header.hist.generated[i] = 0;
+        for (i=0; i<10; i++ ) header.hist.scannum[i] = 0;
+        for (i=0; i<10; i++ ) header.hist.patient_id[i] = 0;
+        for (i=0; i<10; i++ ) header.hist.exp_date[i] = 0;
+        for (i=0; i<10; i++ ) header.hist.exp_time[i] = 0;
+        for (i=0; i<3; i++ ) header.hist.hist_un0[i] = 0;
+        header.hist.views = 0;
+        header.hist.vols_added = 0;
+        header.hist.start_field = 0;
+        header.hist.field_skip = 0;
+        header.hist.omax = 0;
+        header.hist.omin = 0;
+        header.hist.smax = 0;
+        header.hist.smin = 0;
+
+        // store image origin and axis
+        // total number of bytes needed
+        size_t numOfBytes = sizeof(float)*(D+D*D);
+        if ( numOfBytes <= sizeof(data_history) )
+        {
+            std::vector<float> buf(D+D*D, 0);
+
+            unsigned int ii;
+            for ( ii=0; ii<D; ii++ )
+            {
+                buf[ii] = (float)a.get_origin(ii);
+            }
+
+            unsigned int jj;
+            for ( ii=0; ii<D; ii++ )
+            {
+                for ( jj=0; jj<D; jj++ )
+                {
+                    buf[D+ii*D+jj] = (float)a.get_axis(ii, jj);
+                }
+            }
+
+            memcpy(&header.hist, &buf[0], numOfBytes);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusIOAnalyze::image2Analyze(const hoNDImage<T>& a, dsr& header) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T, unsigned int D> 
+bool gtPlusIOAnalyze::header2Image(hoNDImage<T,D>& a, const HeaderType& header)
+{
+    try
+    {
+        std::string rttiID = std::string(typeid(T).name());
+        GADGET_CHECK_RETURN_FALSE(rttiID==getRTTIFromDataType( (GtDataType)header.dime.datatype));
+
+        std::vector<size_t> dim(header.dime.dim[0]);
+
+        if ( D > dim.size() ) return false;
+
+        size_t ii;
+        for ( ii=0; ii<dim.size(); ii++ )
+        {
+            if ( ii == 7 )
+            {
+                dim[ii] = header.dime.unused8;
+            }
+            else if ( ii == 8 )
+            {
+                dim[ii] = header.dime.unused9;
+            }
+            else if ( ii == 9 ) 
+            {
+                dim[ii] = header.dime.unused10;
+            }
+            else
+            {
+                dim[ii] = header.dime.dim[ii+1];
+            }
+        }
+
+        a.create(dim);
+
+        for ( ii=0; ii<dim.size(); ii++ )
+        {
+            if ( ii < 7 )
+            {
+                a.set_pixel_size(ii, header.dime.pixdim[ii+1]);
+            }
+        }
+
+        // get origin and axis
+        size_t numOfBytes = sizeof(float)*(D+D*D);
+        if ( numOfBytes <= sizeof(data_history) )
+        {
+            std::vector<float> buf(D+D*D);
+            memcpy(&buf[0], &header.hist, numOfBytes);
+
+            unsigned int ii;
+            for ( ii=0; ii<D; ii++ )
+            {
+                a.set_origin(ii, buf[ii]);
+            }
+
+            unsigned int jj;
+            for ( ii=0; ii<D; ii++ )
+            {
+                typename hoNDImage<T,D>::coord_type v(0);
+                typename hoNDImage<T,D>::coord_type mag(0);
+
+                for ( jj=0; jj<D; jj++ )
+                {
+                    v = buf[D+ii*D+jj];
+                    mag += v*v;
+                    a.set_axis(ii, jj, v);
+                }
+
+                if ( mag < FLT_EPSILON )
+                {
+                    for ( jj=0; jj<D; jj++ )
+                    {
+                        if ( ii != jj )
+                        {
+                            a.set_axis(ii, jj, 0);
+                        }
+                        else
+                        {
+                            a.set_axis(ii, jj, (typename hoNDImage<T,D>::coord_type)(1.0) );
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusIOAnalyze::analyze2Image(hoNDImage<T,D>& a, const dsr& header) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
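+
+// Example usage of the image import/export interface above (an illustrative
+// sketch only; it assumes gtPlusIOAnalyze can be default-constructed, as its
+// base class gtPlusIOBase can, and that the usual Gadgetron headers are included):
+//
+//     Gadgetron::gtPlus::gtPlusIOAnalyze io;
+//
+//     std::vector<size_t> dim(2, 128);
+//     Gadgetron::hoNDImage< std::complex<float>, 2 > im;
+//     im.create(dim);
+//
+//     // writes im_REAL, im_IMAG, im_MAG and im_PHASE (each as an Analyze header plus a .img data file)
+//     io.exportImageComplex(im, "im");
+//
+//     // reads im_REAL and im_IMAG back and recombines them into a complex image
+//     Gadgetron::hoNDImage< std::complex<float>, 2 > im2;
+//     io.importImageComplex(im2, "im");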
diff --git a/toolboxes/gtplus/util/gtPlusIOBase.cpp b/toolboxes/gtplus/util/gtPlusIOBase.cpp
new file mode 100644
index 0000000..c4cc50a
--- /dev/null
+++ b/toolboxes/gtplus/util/gtPlusIOBase.cpp
@@ -0,0 +1,200 @@
+/** \file       gtPlusIOBase.cpp
+    \brief      Define the base IO functionality for the GtPlus toolbox
+    \author     Hui Xue
+*/
+
+#include <gtPlusIOBase.h>
+
+namespace Gadgetron { namespace gtPlus {
+
+gtPlusIOWorker::gtPlusIOWorker(const std::string& ioTag, bool readFlag) : ioTag_(ioTag), readFlag_(readFlag)
+{
+}
+
+gtPlusIOWorker::~gtPlusIOWorker()
+{
+    if ( !close() )
+    {
+        GERROR_STREAM("Errors in gtPlusIOWorker::~gtPlusIOWorker() ... ");
+    }
+}
+
+bool gtPlusIOWorker::open()
+{
+    try
+    {
+        if ( fid_.is_open() )
+        {
+            fid_.close();
+        }
+
+        if ( readFlag_ )
+        {
+            fid_.open(ioTag_.c_str(), std::ios::in | std::ios::binary);
+        }
+        else
+        {
+            fid_.open(ioTag_.c_str(), std::ios::out | std::ios::binary);
+        }
+
+        if ( !fid_ )
+        {
+            GERROR_STREAM("gtPlusIOWorker::open() cannot open file stream : " << ioTag_);
+            return false;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusIOWorker::open() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+bool gtPlusIOWorker::close()
+{
+    try
+    {
+        if ( fid_.is_open() )
+        {
+            fid_.close();
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusIOWorker::close() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+long gtPlusIOWorker::tell()
+{
+    if ( !fid_.is_open() ) return -1;
+
+    if ( readFlag_ )
+    {
+        return (long)fid_.tellg();
+    }
+
+    return (long)fid_.tellp();
+}
+
+bool gtPlusIOWorker::seek(long long offset)
+{
+    if ( !fid_.is_open() ) return false;
+
+    if ( readFlag_ )
+    {
+        fid_.seekg(offset, std::ios::beg);
+        return this->IOinError();
+    }
+
+    fid_.seekp(offset, std::ios::beg);
+    return this->IOinError();
+}
+
+bool gtPlusIOWorker::IOinError()
+{
+    std::ios::iostate s;
+    s = fid_.rdstate();
+
+    if ( (s&std::ios::failbit) || (s&std::ios::badbit) )
+    {
+        return false;
+    }
+
+    return true;
+}
+
+bool gtPlusIOWorker::read(char* data, long long len)
+{
+    if ( !fid_.is_open() ) return false;
+    fid_.read(data, len*sizeof(char));
+    return IOinError();
+}
+
+bool gtPlusIOWorker::write(const char* data, long long len)
+{
+    if ( !fid_.is_open() ) return false;
+    fid_.write(data, len*sizeof(char));
+    return IOinError();
+}
+
+// --------------------------------------------------------------------------
+
+//void gtPlusIOBase::printInfo(std::ostream& os)
+//{
+//    using namespace std;
+//
+//    os << "-------------- GTPlus IO Util ---------------" << endl;
+//    os << "Implementation of file input/output operations" << endl;
+//    os << "---------------------------------------------" << endl;
+//}
+//
+//bool gtPlusIOBase::readFromFile(const std::string& filename, char*& data, long long& length)
+//{
+//    try
+//    {
+//        if (data!=NULL) delete [] data;
+//
+//        gtPlusIOWorker ioworker_(filename, true);
+//
+//        GADGET_CHECK_RETURN_FALSE(ioworker_.open());
+//
+//        // read the total length
+//        long long totalLen;
+//        GADGET_CHECK_RETURN_FALSE(ioworker_.read(reinterpret_cast<char*>(&totalLen), sizeof(long long)));
+//
+//        length = totalLen - sizeof(long long);
+//
+//        data = new char[length];
+//        GADGET_CHECK_RETURN_FALSE(data!=NULL);
+//
+//        GADGET_CHECK_RETURN_FALSE(ioworker_.read(data, length));
+//
+//        GADGET_CHECK_RETURN_FALSE(ioworker_.close());
+//    }
+//    catch (...)
+//    {
+//        GERROR_STREAM("Errors in gtPlusIOBase::readFromFile(const std::string& filename, char*& data, long long& length) ... ");
+//        return false;
+//    }
+//
+//    return true;
+//}
+//
+//bool gtPlusIOBase::writeToFile(const std::string& filename, char* data, long long length)
+//{
+//    try
+//    {
+//        if ( length == 0 ) return true;
+//
+//        GADGET_CHECK_RETURN_FALSE(data!=NULL);
+//
+//        gtPlusIOWorker ioworker_(filename, false);
+//
+//        GADGET_CHECK_RETURN_FALSE(ioworker_.open());
+//
+//        // write the total length
+//        const long long totalLen = length+sizeof(long long);
+//        GADGET_CHECK_RETURN_FALSE(ioworker_.write(reinterpret_cast<const char*>(&totalLen), sizeof(long long)));
+//
+//        // write the data
+//        GADGET_CHECK_RETURN_FALSE(ioworker_.write(data, length));
+//
+//        // close the file
+//        GADGET_CHECK_RETURN_FALSE(ioworker_.close());
+//    }
+//    catch (...)
+//    {
+//        GERROR_STREAM("Errors in gtPlusIOBase::writeToFile(const std::string& filename, char* data, long long length) ... ");
+//        return false;
+//    }
+//
+//    return true;
+//}
+
+}}
diff --git a/toolboxes/gtplus/util/gtPlusIOBase.h b/toolboxes/gtplus/util/gtPlusIOBase.h
new file mode 100644
index 0000000..e2d836c
--- /dev/null
+++ b/toolboxes/gtplus/util/gtPlusIOBase.h
@@ -0,0 +1,819 @@
+/** \file       gtPlusIOBase.h
+    \brief      Define the base IO functionality for the GtPlus toolbox
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include <iostream>
+#include <typeinfo>
+
+#include "GtPlusIOExport.h"
+
+#include "NDArray.h"
+#include "complext.h"
+#include "GadgetronException.h"
+
+#include "hoNDArray.h"
+#include "hoNDImage.h"
+
+#include "hoNDArray_fileio.h"
+
+namespace Gadgetron { 
+
+  struct rgb_type { unsigned char r,g,b; };
+  struct rgba_type { unsigned char r,g,b,a; };
+
+  namespace gtPlus {
+
+class EXPORTGTPLUSIO gtPlusIOWorker
+{
+public:
+
+    gtPlusIOWorker(const std::string& ioTag, bool readFlag=true);
+    virtual ~gtPlusIOWorker();
+
+    // open the file stream
+    // readFlag: true, read mode; false, write mode
+    virtual bool open();
+
+    // close the file stream
+    virtual bool close();
+
+    // the current file offset
+    long tell();
+
+    // set the file offset
+    bool seek(long long offset);
+
+    // reset the file to the beginning
+    bool reset() { return (this->seek(0)); }
+
+    // check the status of i/o operations
+    bool IOinError();
+
+    // read/write
+    // len: number of bytes in data
+    bool read(char* data, long long len);
+    bool write(const char* data, long long len);
+
+protected:
+
+    std::string ioTag_;
+    std::fstream fid_;
+    bool readFlag_;
+};
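+
+// Example usage of gtPlusIOWorker (an illustrative sketch; the file name
+// "data.bin" and the buffer contents are arbitrary):
+//
+//     std::string buf = "hello";
+//
+//     gtPlusIOWorker writer("data.bin", false);          // write mode
+//     if ( writer.open() )
+//     {
+//         writer.write(buf.c_str(), (long long)buf.size());
+//         writer.close();
+//     }
+//
+//     gtPlusIOWorker reader("data.bin", true);           // read mode
+//     std::vector<char> back(buf.size());
+//     if ( reader.open() )
+//     {
+//         reader.read(&back[0], (long long)back.size());
+//         reader.close();
+//     }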
+
+#ifdef DT_UNKNOWN
+    #undef DT_UNKNOWN
+#endif // DT_UNKNOWN
+
+enum GtDataType
+{
+    DT_ANA_UNKNOWN=0,
+    //DT_BINARY=1, 
+    //DT_UNSIGNED_CHAR=2,
+    //DT_SIGNED_SHORT=4,
+    //DT_UNSIGNED_SHORT=5,
+    //DT_SIGNED_INT=8,
+    //DT_UNSIGNED_INT=9,
+    //DT_FLOAT=16,
+    //DT_COMPLEX=32,
+    //DT_DOUBLE=64,
+    //DT_DOUBLECOMPLEX=96, // this type is added to support complex double
+    //DT_RGB=128,
+    //DT_ALL=255
+
+    DT_NONE                    =0,
+    DT_UNKNOWN                 =0,     /* what it says, dude           */
+    DT_BINARY                  =1,     /* binary (1 bit/voxel)         */
+    DT_UNSIGNED_CHAR           =2,     /* unsigned char (8 bits/voxel) */
+    DT_SIGNED_SHORT            =4,     /* signed short (16 bits/voxel) */
+    DT_UNSIGNED_SHORT          =5,
+    DT_SIGNED_INT              =8,     /* signed int (32 bits/voxel)   */
+    DT_UNSIGNED_INT            =9,
+    DT_FLOAT                  =16,     /* float (32 bits/voxel)        */
+    DT_COMPLEX                =32,     /* complex (64 bits/voxel)      */
+    DT_DOUBLE                 =64,     /* double (64 bits/voxel)       */
+    DT_RGB                   =128,     /* RGB triple (24 bits/voxel)   */
+    DT_ALL                   =255,     /* not very useful (?)          */
+
+                                /*----- another set of names for the same ---*/
+    DT_UINT8                   =2,
+    DT_INT16                   =4,
+    DT_INT32                   =8,
+    DT_FLOAT32                =16,
+    DT_COMPLEX64              =32,
+    DT_FLOAT64                =64,
+    DT_RGB24                 =128,
+
+                                /*------------------- new codes for NIFTI ---*/
+    DT_INT8                  =256,     /* signed char (8 bits)         */
+    DT_UINT16                =512,     /* unsigned short (16 bits)     */
+    DT_UINT32                =768,     /* unsigned int (32 bits)       */
+    DT_INT64                =1024,     /* long long (64 bits)          */
+    DT_UINT64               =1280,     /* unsigned long long (64 bits) */
+    DT_FLOAT128             =1536,     /* long double (128 bits)       */
+    DT_COMPLEX128           =1792,     /* double pair (128 bits)       */
+    DT_COMPLEX256           =2048,     /* long double pair (256 bits)  */
+    DT_RGBA32               =2304,     /* 4 byte RGBA (32 bits/voxel)  */
+};
+
+template <typename HeaderType>
+class gtPlusIOBase
+{
+public:
+
+    typedef HeaderType THeaderType;
+
+    gtPlusIOBase()
+    {
+        pixelSize_.resize(10, 1.0);
+    }
+
+    gtPlusIOBase(float px, float py)
+    {
+        pixelSize_.resize(2);
+        pixelSize_[0] = px;
+        pixelSize_[1] = py;
+    }
+
+    gtPlusIOBase(float px, float py, float pz)
+    {
+        pixelSize_.resize(3);
+        pixelSize_[0] = px;
+        pixelSize_[1] = py;
+        pixelSize_[2] = pz;
+    }
+
+    gtPlusIOBase(float px, float py, float pz, float pt)
+    {
+        pixelSize_.resize(4);
+        pixelSize_[0] = px;
+        pixelSize_[1] = py;
+        pixelSize_[2] = pz;
+        pixelSize_[3] = pt;
+    }
+
+    gtPlusIOBase(float px, float py, float pz, float pt, float pr)
+    {
+        pixelSize_.resize(5);
+        pixelSize_[0] = px;
+        pixelSize_[1] = py;
+        pixelSize_[2] = pz;
+        pixelSize_[3] = pt;
+        pixelSize_[4] = pr;
+    }
+
+    gtPlusIOBase(float px, float py, float pz, float pt, float pr, float ps)
+    {
+        pixelSize_.resize(6);
+        pixelSize_[0] = px;
+        pixelSize_[1] = py;
+        pixelSize_[2] = pz;
+        pixelSize_[3] = pt;
+        pixelSize_[4] = pr;
+        pixelSize_[5] = ps;
+    }
+
+    gtPlusIOBase(float px, float py, float pz, float pt, float pr, float ps, float pp)
+    {
+        pixelSize_.resize(7);
+        pixelSize_[0] = px;
+        pixelSize_[1] = py;
+        pixelSize_[2] = pz;
+        pixelSize_[3] = pt;
+        pixelSize_[4] = pr;
+        pixelSize_[5] = ps;
+        pixelSize_[6] = pp;
+    }
+
+    gtPlusIOBase(float px, float py, float pz, float pt, float pr, float ps, float pp, float pq)
+    {
+        pixelSize_.resize(8);
+        pixelSize_[0] = px;
+        pixelSize_[1] = py;
+        pixelSize_[2] = pz;
+        pixelSize_[3] = pt;
+        pixelSize_[4] = pr;
+        pixelSize_[5] = ps;
+        pixelSize_[6] = pp;
+        pixelSize_[7] = pq;
+    }
+
+    void setPixelSize(float px, float py, float pz=1.0f, float pt=1.0f, float pr=1.0f, float ps=1.0f, float pp=1.0f, float pq=1.0f)
+    {
+        pixelSize_.resize(8);
+        pixelSize_[0] = px;
+        pixelSize_[1] = py;
+        pixelSize_[2] = pz;
+        pixelSize_[3] = pt;
+        pixelSize_[4] = pr;
+        pixelSize_[5] = ps;
+        pixelSize_[6] = pp;
+        pixelSize_[7] = pq;
+    }
+
+    void setPixelSize(double px, double py, double pz=1.0, double pt=1.0, double pr=1.0, double ps=1.0, double pp=1.0, double pq=1.0)
+    {
+        pixelSize_.resize(8);
+        pixelSize_[0] = (float)px;
+        pixelSize_[1] = (float)py;
+        pixelSize_[2] = (float)pz;
+        pixelSize_[3] = (float)pt;
+        pixelSize_[4] = (float)pr;
+        pixelSize_[5] = (float)ps;
+        pixelSize_[6] = (float)pp;
+        pixelSize_[7] = (float)pq;
+    }
+
+    void printInfo(std::ostream& os)
+    {
+        using namespace std;
+
+        os << "-------------- GTPlus Array/Image input/output to medical image format -------------" << endl;
+        os << "--------------------------------------------------------------------------" << endl;
+    }
+
+    virtual ~gtPlusIOBase()
+    {
+    }
+
+    /// export/import for 2D/3D/4D arrays
+    /// filename should be given without extension
+
+    virtual bool exportArray(const hoNDArray<short>& a, const std::string& filename) = 0;
+    virtual bool exportArray(const hoNDArray<unsigned short>& a, const std::string& filename) = 0;
+    virtual bool exportArray(const hoNDArray<int>& a, const std::string& filename) = 0;
+    virtual bool exportArray(const hoNDArray<unsigned int>& a, const std::string& filename) = 0;
+    virtual bool exportArray(const hoNDArray<float>& a, const std::string& filename) = 0;
+    virtual bool exportArray(const hoNDArray<double>& a, const std::string& filename) = 0;
+    virtual bool exportArray(const hoNDArray< std::complex<float> >& a, const std::string& filename) = 0;
+    virtual bool exportArray(const hoNDArray< std::complex<double> >& a, const std::string& filename) = 0;
+
+    virtual bool importArray(hoNDArray<short>& a, const std::string& filename) = 0;
+    virtual bool importArray(hoNDArray<unsigned short>& a, const std::string& filename) = 0;
+    virtual bool importArray(hoNDArray<int>& a, const std::string& filename) = 0;
+    virtual bool importArray(hoNDArray<unsigned int>& a, const std::string& filename) = 0;
+    virtual bool importArray(hoNDArray<float>& a, const std::string& filename) = 0;
+    virtual bool importArray(hoNDArray<double>& a, const std::string& filename) = 0;
+    virtual bool importArray(hoNDArray< std::complex<float> >& a, const std::string& filename) = 0;
+    virtual bool importArray(hoNDArray< std::complex<double> >& a, const std::string& filename) = 0;
+
+    template <typename T> 
+    bool exportArrayComplexRealImag(const hoNDArray<T>& a, const std::string& filename)
+    {
+        try
+        {
+            typedef typename Gadgetron::realType<T>::Type value_type;
+
+            hoNDArray<value_type> buf(a.get_dimensions());
+
+            long long num = (long long)a.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for default(none) private(n) shared(num, a, buf)
+            for ( n=0; n<num; n++ )
+            {
+                buf(n) = a(n).real();
+            }
+
+            std::string filenameReal = filename;
+            filenameReal.append("_REAL");
+            GADGET_CHECK_RETURN_FALSE(exportArray(buf, filenameReal));
+
+            #pragma omp parallel for default(none) private(n) shared(num, a, buf)
+            for ( n=0; n<num; n++ )
+            {
+                buf(n) = a(n).imag();
+            }
+
+            std::string filenameImag = filename;
+            filenameImag.append("_IMAG");
+            GADGET_CHECK_RETURN_FALSE(exportArray(buf, filenameImag));
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in exportArrayComplexRealImag(const hoNDArray<T>& a, const std::string& filename) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T> 
+    bool exportArrayComplex(const hoNDArray<T>& a, const std::string& filename)
+    {
+        try
+        {
+            typedef typename Gadgetron::realType<T>::Type value_type;
+
+            /*hoNDArray<value_type> buf;
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::complex_to_real(a, buf));
+
+            std::string filenameReal = filename;
+            filenameReal.append("_REAL");
+            GADGET_CHECK_RETURN_FALSE(exportArray(buf, filenameReal));
+
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::complex_to_imag(a, buf));
+            std::string filenameImag = filename;
+            filenameImag.append("_IMAG");
+            GADGET_CHECK_RETURN_FALSE(exportArray(buf, filenameImag));
+
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::abs(a, buf));
+            std::string filenameMag = filename;
+            filenameMag.append("_MAG");
+            GADGET_CHECK_RETURN_FALSE(exportArray(buf, filenameMag));
+
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::argument(a, buf));
+            std::string filenamePhase = filename;
+            filenamePhase.append("_PHASE");
+            GADGET_CHECK_RETURN_FALSE(exportArray(buf, filenamePhase));*/
+
+            hoNDArray<value_type> rpart, ipart, mag, phs;
+            rpart.create(a.get_dimensions());
+            ipart.create(a.get_dimensions());
+            mag.create(a.get_dimensions());
+            phs.create(a.get_dimensions());
+
+            long long num = (long long)a.get_number_of_elements();
+
+            long long n;
+
+            #pragma omp parallel for default(none) private(n) shared(num, a, rpart, ipart, mag, phs)
+            for ( n=0; n<num; n++ )
+            {
+                rpart(n) = a(n).real();
+                ipart(n) = a(n).imag();
+                mag(n) = std::abs( a(n) );
+                phs(n) = std::arg( a(n) );
+            }
+
+            std::string filenameReal = filename;
+            filenameReal.append("_REAL");
+            GADGET_CHECK_RETURN_FALSE(exportArray(rpart, filenameReal));
+
+            std::string filenameImag = filename;
+            filenameImag.append("_IMAG");
+            GADGET_CHECK_RETURN_FALSE(exportArray(ipart, filenameImag));
+
+            std::string filenameMag = filename;
+            filenameMag.append("_MAG");
+            GADGET_CHECK_RETURN_FALSE(exportArray(mag, filenameMag));
+
+            std::string filenamePhase = filename;
+            filenamePhase.append("_PHASE");
+            GADGET_CHECK_RETURN_FALSE(exportArray(phs, filenamePhase));
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in exportArrayComplex(const hoNDArray<T>& a, const std::string& filename) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T> 
+    bool importArrayComplex(hoNDArray<T>& a, const std::string& filename)
+    {
+        try
+        {
+            typedef typename T::value_type value_type;
+            hoNDArray<value_type> real, imag;
+
+            std::string filenameReal = filename;
+            filenameReal.append("_REAL");
+            GADGET_CHECK_RETURN_FALSE(importArray(real, filenameReal));
+
+            std::string filenameImag = filename;
+            filenameImag.append("_IMAG");
+            GADGET_CHECK_RETURN_FALSE(importArray(imag, filenameImag));
+
+            a.create(real.get_dimensions());
+            long long num = (long long)real.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for private(n) shared(num, a, real, imag)
+            for ( n=0; n<num; n++ )
+            {
+                a(n) = T(real(n), imag(n));
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in importArrayComplex(const hoNDArray<T>& a, const std::string& filename) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T> 
+    bool importArrayComplex(hoNDArray<T>& a, const std::string& filename_real, const std::string& filename_imag)
+    {
+        try
+        {
+            typedef typename realType<T>::Type value_type;
+            hoNDArray<value_type> real, imag;
+
+            GADGET_CHECK_RETURN_FALSE(importArray(real, filename_real));
+            GADGET_CHECK_RETURN_FALSE(importArray(imag, filename_imag));
+
+            a.create(real.get_dimensions());
+            long long num = (long long)real.get_number_of_elements();
+
+            long long n;
+            #pragma omp parallel for private(n) shared(num, a, real, imag)
+            for ( n=0; n<num; n++ )
+            {
+                a(n) = T(real(n), imag(n));
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in importArrayComplex(hoNDArray<T>& a, const std::string& filename_real, const std::string& filename_imag) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T> 
+    bool export2DArray(const hoNDArray<T>& a, const std::string& filename)
+    {
+        return exportArray(a, filename);
+    }
+
+    template <typename T> 
+    bool import2DArray(hoNDArray<T>& a, const std::string& filename)
+    {
+        return importArray(a, filename);
+    }
+
+    template <typename T> 
+    bool export2DArrayComplex(const hoNDArray<T>& a, const std::string& filename)
+    {
+        return exportArrayComplex(a, filename);
+    }
+
+    template <typename T> 
+    bool import2DArrayComplex(hoNDArray<T>& a, const std::string& filename)
+    {
+        return importArrayComplex(a, filename);
+    }
+
+    template <typename T> 
+    bool export3DArray(const hoNDArray<T>& a, const std::string& filename)
+    {
+        return exportArray(a, filename);
+    }
+
+    template <typename T> 
+    bool import3DArray(hoNDArray<T>& a, const std::string& filename)
+    {
+        return importArray(a, filename);
+    }
+
+    template <typename T> 
+    bool export3DArrayComplex(const hoNDArray<T>& a, const std::string& filename)
+    {
+        return exportArrayComplex(a, filename);
+    }
+
+    template <typename T> 
+    bool import3DArrayComplex(hoNDArray<T>& a, const std::string& filename)
+    {
+        return importArrayComplex(a, filename);
+    }
+
+    template <typename T> 
+    bool export4DArray(const hoNDArray<T>& a, const std::string& filename)
+    {
+        try
+        {
+            size_t RO     = a.get_size(0);
+            size_t E1     = a.get_size(1);
+            size_t CHA    = a.get_size(2);
+            size_t N      = a.get_size(3);
+
+            size_t ii;
+            for (ii=0; ii<N; ii++ )
+            {
+                std::vector<size_t> dim(3);
+                dim[0] = RO;
+                dim[1] = E1;
+                dim[2] = CHA;
+
+                boost::shared_ptr< std::vector<size_t> > sDim(&dim);
+                hoNDArray<T> a3D(sDim, const_cast<T*>(a.begin()+ii*RO*E1*CHA), false);
+
+                std::ostringstream ostr;
+                ostr << filename << "_" << ii << std::ends;
+                GADGET_CHECK_RETURN_FALSE(export3DArray(a3D, ostr.str()));
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in export4DArray(const hoNDArray<T>& a, const std::string& filename) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T> 
+    bool export4DArrayComplex(const hoNDArray<T>& a, const std::string& filename)
+    {
+        try
+        {
+            size_t RO     = a.get_size(0);
+            size_t E1     = a.get_size(1);
+            size_t CHA    = a.get_size(2);
+            size_t N      = a.get_size(3);
+
+            size_t ii;
+            for (ii=0; ii<N; ii++ )
+            {
+                std::vector<size_t> dim(3);
+                dim[0] = RO;
+                dim[1] = E1;
+                dim[2] = CHA;
+
+                // wrap the 3D chunk without owning the data; pass the dimension vector
+                // by pointer rather than wrapping a stack address in a shared_ptr
+                hoNDArray<T> a3D(&dim, const_cast<T*>(a.begin()+ii*RO*E1*CHA), false);
+
+                std::ostringstream ostr;
+                ostr << filename << "_" << ii << std::ends;
+                GADGET_CHECK_RETURN_FALSE(export3DArrayComplex(a3D, ostr.str()));
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in export4DArrayComplex(const hoNDArray<T>& a, const std::string& filename) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    static bool readFromFile(const std::string& filename, char*& data, long long& length)
+    {
+        try
+        {
+            if (data!=NULL) delete [] data;
+
+            gtPlusIOWorker ioworker_(filename, true);
+
+            GADGET_CHECK_RETURN_FALSE(ioworker_.open());
+
+            // read the total length
+            long long totalLen;
+            GADGET_CHECK_RETURN_FALSE(ioworker_.read(reinterpret_cast<char*>(&totalLen), sizeof(long long)));
+
+            length = totalLen - sizeof(long long);
+
+            data = new char[length];
+            GADGET_CHECK_RETURN_FALSE(data!=NULL);
+
+            GADGET_CHECK_RETURN_FALSE(ioworker_.read(data, length));
+
+            GADGET_CHECK_RETURN_FALSE(ioworker_.close());
+        }
+        catch (...)
+        {
+            GERROR_STREAM("Errors in gtPlusIOBase::readFromFile(const std::string& filename, char*& data, long long& length) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    static bool writeToFile(const std::string& filename, char* data, long long length)
+    {
+        try
+        {
+            if ( length == 0 ) return true;
+
+            GADGET_CHECK_RETURN_FALSE(data!=NULL);
+
+            gtPlusIOWorker ioworker_(filename, false);
+
+            GADGET_CHECK_RETURN_FALSE(ioworker_.open());
+
+            // write the total length
+            const long long totalLen = length+sizeof(long long);
+            GADGET_CHECK_RETURN_FALSE(ioworker_.write(reinterpret_cast<const char*>(&totalLen), sizeof(long long)));
+
+            // write the data
+            GADGET_CHECK_RETURN_FALSE(ioworker_.write(data, length));
+
+            // close the file
+            GADGET_CHECK_RETURN_FALSE(ioworker_.close());
+        }
+        catch (...)
+        {
+            GERROR_STREAM("Errors in gtPlusIOBase::writeToFile(const std::string& filename, char* data, long long length) ... ");
+            return false;
+        }
+
+        return true;
+    }
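+
+    // On-disk layout used by writeToFile/readFromFile above: an 8-byte total
+    // length (sizeof(long long) plus the payload size) followed by the raw
+    // payload bytes. A usage sketch, using the Analyze dsr struct as an example
+    // HeaderType instantiation (the file name "blob.bin" is arbitrary):
+    //
+    //     std::string payload = "abc";
+    //     gtPlusIOBase<dsr>::writeToFile("blob.bin", const_cast<char*>(payload.data()), (long long)payload.size());
+    //
+    //     char* data = NULL;
+    //     long long len = 0;
+    //     gtPlusIOBase<dsr>::readFromFile("blob.bin", data, len);   // len == 3 on success
+    //     delete [] data;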
+
+protected:
+
+    std::vector<float> pixelSize_;
+
+    // get the run-time type ID from analyze data type or vice versa
+    std::string getRTTIFromDataType(GtDataType aDT)
+    {
+        std::string rttiID;
+
+        switch (aDT)
+        {
+        case DT_INT8 :
+            rttiID = typeid(char).name();
+            break;
+
+        case DT_UNSIGNED_CHAR :
+            rttiID = typeid(unsigned char).name();
+            break;
+
+        case DT_SIGNED_SHORT :
+            rttiID = typeid(short).name();
+            break;
+
+        case DT_UNSIGNED_SHORT :
+        case DT_UINT16 :
+            rttiID = typeid(unsigned short).name();
+            break;
+
+        case DT_SIGNED_INT :
+            rttiID = typeid(int).name();
+            break;
+
+        case DT_UINT32 :
+            rttiID = typeid(unsigned int).name();
+            break;
+
+        case DT_INT64 :
+            rttiID = typeid(long long).name();
+            break;
+
+        case DT_UINT64 :
+            rttiID = typeid(unsigned long long).name();
+            break;
+
+        case DT_FLOAT :
+            rttiID = typeid(float).name();
+            break;
+
+        case DT_DOUBLE :
+            rttiID = typeid(double).name();
+            break;
+
+        case DT_FLOAT128 :
+            rttiID = typeid(long double).name();
+            break;
+
+        case DT_COMPLEX :
+            rttiID = typeid( std::complex<float> ).name();
+            break;
+
+        case DT_COMPLEX128 :
+            rttiID = typeid( std::complex<double> ).name();
+            break;
+
+        case DT_COMPLEX256 :
+            rttiID = typeid( std::complex<long double> ).name();
+            break;
+
+        case DT_RGB :
+            rttiID = typeid( Gadgetron::rgb_type ).name();
+            break;
+
+        case DT_RGBA32 :
+            rttiID = typeid( Gadgetron::rgba_type ).name();
+            break;
+
+        default:
+            rttiID = "UNKOWN TYPE";
+        }
+
+        return rttiID;
+    }
+
+    GtDataType getDataTypeFromRTTI(const std::string& name)
+    {
+        GtDataType analyzeDT = DT_ANA_UNKNOWN;
+
+        if ( name == typeid(unsigned char).name() )
+        {
+            analyzeDT = DT_UNSIGNED_CHAR;
+        }
+
+        if ( name == typeid(short).name() )
+        {
+            analyzeDT = DT_SIGNED_SHORT;
+        }
+
+        if ( name == typeid(unsigned short).name() )
+        {
+            analyzeDT = DT_UINT16;
+        }
+
+        if ( name == typeid(int).name() )
+        {
+            analyzeDT = DT_SIGNED_INT;
+        }
+
+        if ( name == typeid(unsigned int).name() )
+        {
+            analyzeDT = DT_UINT32;
+        }
+
+        if ( name == typeid(float).name() )
+        {
+            analyzeDT = DT_FLOAT;
+        }
+
+        if ( name == typeid(double).name() )
+        {
+            analyzeDT = DT_DOUBLE;
+        }
+
+        if ( name == typeid(long double).name() )
+        {
+            analyzeDT = DT_FLOAT128;
+        }
+
+        if ( name == typeid( std::complex<float> ).name() )
+        {
+            analyzeDT = DT_COMPLEX;
+        }
+
+        if ( name == typeid( std::complex<double> ).name() )
+        {
+            analyzeDT = DT_COMPLEX128;
+        }
+
+        if ( name == typeid(std::complex<long double>).name() )
+        {
+            analyzeDT = DT_COMPLEX256;
+        }
+
+        if ( name == typeid(Gadgetron::rgb_type).name() )
+        {
+            analyzeDT = DT_RGB;
+        }
+
+        if ( name == typeid(Gadgetron::rgba_type).name() )
+        {
+            analyzeDT = DT_RGBA32;
+        }
+
+        return analyzeDT;
+    }
+
+    template <typename T> 
+    bool readData(const std::string& filename, T* data, long long len)
+    {
+        try
+        {
+            gtPlusIOWorker ioworker(filename, true);
+
+            GADGET_CHECK_RETURN_FALSE(ioworker.open());
+            GADGET_CHECK_RETURN_FALSE(ioworker.read(reinterpret_cast<char*>(data), len));
+            GADGET_CHECK_RETURN_FALSE(ioworker.close());
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in readData(const std::string& filename, T* data, long long len) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename T> 
+    bool writeData(const std::string& filename, const T* data, long long len)
+    {
+        try
+        {
+            gtPlusIOWorker ioworker(filename, false);
+
+            GADGET_CHECK_RETURN_FALSE(ioworker.open());
+            GADGET_CHECK_RETURN_FALSE(ioworker.write(reinterpret_cast<const char*>(data), len));
+            GADGET_CHECK_RETURN_FALSE(ioworker.close());
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in writeData(const std::string& filename, const T* data, long long len) ... ");
+            return false;
+        }
+
+        return true;
+    }
+};
+
+}}
diff --git a/toolboxes/gtplus/util/gtPlusUtil.h b/toolboxes/gtplus/util/gtPlusUtil.h
new file mode 100644
index 0000000..cf5f595
--- /dev/null
+++ b/toolboxes/gtplus/util/gtPlusUtil.h
@@ -0,0 +1,95 @@
+/** \file   gtPlusUtil.h
+    \brief  Define the symbols and implement common functionality for the GtPlus toolbox
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "GtPlusExport.h"
+
+#include "ho2DArray.h"
+#include "ho3DArray.h"
+#include "ho4DArray.h"
+#include "ho5DArray.h"
+#include "ho6DArray.h"
+#include "ho7DArray.h"
+#include "hoMatrix.h"
+#include "hoNDFFT.h"
+#include "hoNDArray_utils.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDImage_util.h"
+#include "gtPlusIOAnalyze.h"
+#include "GadgetronTimer.h"
+
+#ifdef _WIN32
+    #include <random>
+    #include <array>
+#endif // _WIN32
+
+#ifdef USE_OMP
+    #include <omp.h>
+#endif // USE_OMP
+
+#include "mri_core_def.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+// ------------------------------------------------------------------------
+// random generator
+// ------------------------------------------------------------------------
+
+#ifdef _WIN32
+
+/// normal distribution random number generator
+template <typename T> 
+class gtPlusRandNorm
+{
+public:
+
+    typedef std::mt19937 RandomGeneratorType;
+
+    gtPlusRandNorm();
+    gtPlusRandNorm(long long seed, T mean=0, T sigma=1);
+    ~gtPlusRandNorm();
+
+    void seed(unsigned long seed);
+    void setPara(T mean=0, T sigma=1);
+
+    RandomGeneratorType& getRandomer() { return rng_; }
+    const RandomGeneratorType& getRandomer() const { return rng_; }
+
+    bool gen(hoNDArray<T>& randNum);
+    bool gen(hoNDArray< std::complex<T> >& randNum);
+
+protected:
+
+    RandomGeneratorType rng_;
+    std::normal_distribution<T> dist_norm_;
+};
+
+#endif // _WIN32
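+
+// Example usage of gtPlusRandNorm (an illustrative sketch; only available on
+// _WIN32 builds, and the array size 256 is arbitrary):
+//
+//     std::vector<size_t> dims(1, 256);
+//     hoNDArray<float> noise;
+//     noise.create(&dims);
+//
+//     gtPlusRandNorm<float> randn(1234, 0.0f, 1.0f);   // seed, mean, sigma
+//     randn.gen(noise);                                // fills noise with N(0, 1) samples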
+
+template <typename T> 
+class gtPlusUtil
+{
+public:
+
+    gtPlusUtil() {}
+    ~gtPlusUtil() {}
+
+    // ------------------------------------------------------------------------
+    // utility functions for various things
+    // ------------------------------------------------------------------------
+
+    /// get the current system time
+    /// time stores year, month, date, hour, minute and second
+    bool getCurrentTime(size_t time[6]);
+
+    /// get UTC (Coordinated Universal Time) time from current time
+    bool convertTimeToUTC(size_t time[6], double& tmUTC);
+};
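+
+// Example usage of the time helpers (an illustrative sketch):
+//
+//     gtPlusUtil<float> util;
+//
+//     size_t tm[6];                       // year, month, day, hour, minute, second
+//     if ( util.getCurrentTime(tm) )
+//     {
+//         double tmUTC = 0;
+//         util.convertTimeToUTC(tm, tmUTC);
+//     }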
+
+}}
+
+#include "gtPlusUtil.hxx"
diff --git a/toolboxes/gtplus/util/gtPlusUtil.hxx b/toolboxes/gtplus/util/gtPlusUtil.hxx
new file mode 100644
index 0000000..2cab980
--- /dev/null
+++ b/toolboxes/gtplus/util/gtPlusUtil.hxx
@@ -0,0 +1,149 @@
+
+#include "gtPlusUtil.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+// ------------------------------------------------------------------------
+// random generator
+// ------------------------------------------------------------------------
+
+#ifdef _WIN32
+
+template <typename T> 
+gtPlusRandNorm<T>::gtPlusRandNorm()
+{
+    rng_.seed();
+    this->setPara(0, 1);
+}
+
+template <typename T> 
+gtPlusRandNorm<T>::gtPlusRandNorm(long long s, T mean, T sigma)
+{
+    this->seed(s);
+    this->setPara(mean, sigma);
+}
+
+template <typename T> 
+gtPlusRandNorm<T>::~gtPlusRandNorm()
+{
+}
+
+template <typename T> 
+void gtPlusRandNorm<T>::seed(unsigned long s)
+{
+    rng_.seed(s);
+}
+
+template <typename T> 
+void gtPlusRandNorm<T>::setPara(T mean, T sigma)
+{
+    typename std::normal_distribution<T>::param_type para(mean, sigma);
+    dist_norm_.param(para);
+}
+
+template <typename T> 
+inline bool gtPlusRandNorm<T>::gen(hoNDArray<T>& randNum)
+{
+    try
+    {
+        size_t N = randNum.get_number_of_elements();
+        size_t n;
+        for ( n=0; n<N; n++ )
+        {
+            randNum(n) = dist_norm_(rng_);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusRandNorm<T>::gen(hoNDArray<T>& randNum) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+inline bool gtPlusRandNorm<T>::gen(hoNDArray< std::complex<T> >& randNum)
+{
+    try
+    {
+        size_t N = randNum.get_number_of_elements();
+        size_t n;
+
+        T real, imag;
+        for ( n=0; n<N; n++ )
+        {
+            real = dist_norm_(rng_);
+            imag = dist_norm_(rng_);
+
+            randNum(n) = std::complex<T>(real, imag);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusRandNorm<T>::gen(hoNDArray< std::complex<T> >& randNum) ... ");
+        return false;
+    }
+
+    return true;
+}
+#endif // _WIN32
+
+// ------------------------------------------------------------------------
+// utility functions for various things
+// ------------------------------------------------------------------------
+
+template <typename T> 
+bool gtPlusUtil<T>::getCurrentTime(size_t time[6])
+{
+    try
+    {
+        time_t rawtime;
+        struct tm* timeinfo;
+
+        std::time(&rawtime);
+        timeinfo = std::gmtime (&rawtime);
+
+        time[0] = timeinfo->tm_year+1900;
+        time[1] = timeinfo->tm_mon+1;
+        time[2] = timeinfo->tm_mday;
+        time[3] = timeinfo->tm_hour;
+        time[4] = timeinfo->tm_min;
+        time[5] = timeinfo->tm_sec;
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in gtPlusUtil<T>::getCurrentTime(size_t time[6]) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusUtil<T>::convertTimeToUTC(size_t time[6], double& tmUTC)
+{
+    try
+    {
+        struct tm timeinfo;
+
+        timeinfo.tm_year   = time[0]-1900;
+        timeinfo.tm_mon    = time[1] - 1;
+        timeinfo.tm_mday   = time[2];
+        timeinfo.tm_hour   = time[3];
+        timeinfo.tm_min    = time[4];
+        timeinfo.tm_sec    = time[5];
+        timeinfo.tm_isdst  = 0;
+
+        tmUTC = (double)mktime(&timeinfo);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Error happened in gtPlusUtil<T>::convertTimeToUTC(size_t time[6], double& tmUTC) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusCloudScheduler.cpp b/toolboxes/gtplus/workflow/gtPlusCloudScheduler.cpp
new file mode 100644
index 0000000..09529bf
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusCloudScheduler.cpp
@@ -0,0 +1,157 @@
+/** \file   gtPlusCloudScheduler.cpp
+    \brief  Define and implement the GtPlus cloud job scheduler class
+            A simple scheduling strategy is implemented here. The number of job packages sent
+            to a node is proportional to the computing power index of that node.
+
+            This class may serve as the base class to implement more complicated job scheduling strategies.
+
+    \author Hui Xue
+*/
+
+#include "gtPlusCloudScheduler.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+gtPlusCloudScheduler::gtPlusCloudScheduler() : num_of_nodes_(0), num_of_jobs_(0)
+{
+}
+
+gtPlusCloudScheduler::~gtPlusCloudScheduler()
+{
+}
+
+void gtPlusCloudScheduler::printInfo(std::ostream& os) const
+{
+    using namespace std;
+
+    os << "-------------- GTPlus Cloud scheduler for jobs ---------------" << endl;
+    os << "This class implements the simple scheduling scheme for GtPlus cloud " << endl;
+    os << "The scheduler here tries to allocate nodes to jobs propotional to the power indexes provided " << endl;
+    os << "--------------------------------------------------------------" << endl;
+}
+
+void gtPlusCloudScheduler::setNumOfJobs(size_t numOfJobs)
+{
+    num_of_jobs_ = numOfJobs;
+}
+
+void gtPlusCloudScheduler::setUpNodes(size_t numOfNodes)
+{
+    num_of_nodes_ = numOfNodes;
+    if ( num_of_nodes_ > 0 )
+    {
+        node_id_computing_power_indexes_.resize(num_of_nodes_);
+        for ( size_t ii=0; ii<num_of_nodes_; ii++ )
+        {
+            node_id_computing_power_indexes_[ii].first = (int)ii;
+            node_id_computing_power_indexes_[ii].second = 1.0;
+        }
+    }
+}
+
+void gtPlusCloudScheduler::setUpNodes(const std::vector<double>& nodeComputingPowerIndexes)
+{
+    num_of_nodes_ = nodeComputingPowerIndexes.size();
+    node_id_computing_power_indexes_.resize(num_of_nodes_);
+
+    for ( size_t ii=0; ii<num_of_nodes_; ii++ )
+    {
+        node_id_computing_power_indexes_[ii].first = (int)ii;
+        node_id_computing_power_indexes_[ii].second = nodeComputingPowerIndexes[ii];
+    }
+}
+
+struct gtPlusCloudSchedulerNodeSorter
+{
+    gtPlusCloudSchedulerNodeSorter() {}
+    ~gtPlusCloudSchedulerNodeSorter() {}
+
+    bool operator()(const std::pair<int, double>& A, const std::pair<int, double>& B) const
+    {
+        return (A.second > B.second);
+    }
+};
+
+bool gtPlusCloudScheduler::schedulerJobs(std::vector<int>& nodeIDforJobs)
+{
+    try
+    {
+        size_t ii;
+
+        nodeIDforJobs.clear();
+
+        if ( num_of_nodes_==0 || num_of_jobs_==0 )
+        {
+            GWARN_STREAM("num_of_nodes_==0 || num_of_jobs_==0");
+            return true;
+        }
+
+        if ( node_id_computing_power_indexes_.size() < num_of_nodes_ )
+        {
+            GWARN_STREAM("node_computing_power_indexes_.size() < num_of_nodes_ : computing power indexes for all nodes are set to be equal ... ");
+            node_id_computing_power_indexes_.resize(num_of_nodes_, std::pair<int, double>(0, 1.0) );
+            for ( ii=0; ii<num_of_nodes_; ii++ )
+            {
+                node_id_computing_power_indexes_[ii].first = (int)ii;
+            }
+        }
+
+        nodeIDforJobs.resize(num_of_jobs_, -1);
+
+        // always sort the nodes so that higher computing power nodes come first
+        std::sort(node_id_computing_power_indexes_.begin(), node_id_computing_power_indexes_.end(), gtPlusCloudSchedulerNodeSorter() );
+
+        if ( num_of_jobs_ <= num_of_nodes_ )
+        {
+            for ( ii=0; ii<num_of_jobs_; ii++ )
+            {
+                nodeIDforJobs[ii] = node_id_computing_power_indexes_[ii].first;
+            }
+        }
+        else
+        {
+            double totalComputingPower = 0.0;
+            for ( ii=0; ii<num_of_nodes_; ii++ )
+            {
+                totalComputingPower += node_id_computing_power_indexes_[ii].second;
+            }
+
+            size_t totalJobAllocated = 0;
+            std::vector<size_t> jobPerNode(num_of_nodes_, 0);
+            for ( ii=0; ii<num_of_nodes_; ii++ )
+            {
+                jobPerNode[ii] = (size_t)(std::floor(num_of_jobs_ * node_id_computing_power_indexes_[ii].second/totalComputingPower));
+                totalJobAllocated += jobPerNode[ii];
+            }
+
+            if ( totalJobAllocated < num_of_jobs_ )
+            {
+                // give high computing power nodes more jobs
+                for ( ii=0; ii<(num_of_jobs_-totalJobAllocated); ii++ )
+                {
+                    jobPerNode[ii%num_of_nodes_]++;
+                }
+            }
+
+            size_t jobID = 0;
+            for ( ii=0; ii<num_of_nodes_; ii++ )
+            {
+                for ( size_t jj=0; jj<jobPerNode[ii]; jj++ )
+                {
+                    nodeIDforJobs[jobID++] = node_id_computing_power_indexes_[ii].first;
+                }
+            }
+
+            GADGET_CHECK_RETURN_FALSE(jobID==num_of_jobs_);
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in gtPlusCloudScheduler::schedulerJobs(std::vector<int>& nodeIDforJobs) ... ");
+        return false;
+    }
+
+    return true;
+}
+
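+// Worked example of the proportional allocation above (illustrative; the numbers are
+// assumed): with 10 jobs and two nodes of computing power 2.0 and 1.0, the floor step
+// gives floor(10*2/3) = 6 and floor(10*1/3) = 3 jobs, and the one remaining job goes
+// to the highest-power node, so node 0 receives 7 jobs and node 1 receives 3.
+//
+//     gtPlusCloudScheduler scheduler;
+//     scheduler.setNumOfJobs(10);
+//     std::vector<double> power(2); power[0] = 2.0; power[1] = 1.0;
+//     scheduler.setUpNodes(power);
+//     std::vector<int> nodeIDforJobs;
+//     scheduler.schedulerJobs(nodeIDforJobs); // expected: seven 0s followed by three 1s
+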
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusCloudScheduler.h b/toolboxes/gtplus/workflow/gtPlusCloudScheduler.h
new file mode 100644
index 0000000..68c6e7a
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusCloudScheduler.h
@@ -0,0 +1,54 @@
+/** \file   gtPlusCloudScheduler.h
+    \brief  Define and implement the GtPlus cloud job scheduler class
+            A simple scheduling strategy is implemented here. The number of job packages sent
+            to a node is proportional to the computing power index of that node.
+
+            This class may serve as the base class to implement more complicated job scheduling strategies.
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "GtPlusExport.h"
+#include "gtPlusISMRMRDReconUtil.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+/**
+The scheduler class for gadgetron cloud.
+This class can serve as the base class for more complicated scheduling strategies.
+*/
+
+class EXPORTGTPLUS gtPlusCloudScheduler
+{
+public:
+
+    gtPlusCloudScheduler();
+    virtual ~gtPlusCloudScheduler();
+
+    virtual void printInfo(std::ostream& os) const;
+
+    // compute the scheduling for every job
+    // nodeIDforJobs stores the node ID assigned to each job
+    // node ID starts from 0
+    virtual bool schedulerJobs(std::vector<int>& nodeIDforJobs);
+
+    void setNumOfJobs(size_t numOfJobs);
+
+    void setUpNodes(size_t numOfNodes);
+    void setUpNodes(const std::vector<double>& nodeComputingPowerIndexes);
+
+protected:
+
+    // number of nodes
+    size_t num_of_nodes_;
+
+    // number of jobs; for this simple scheduler, all jobs are assumed to have equal size
+    size_t num_of_jobs_;
+
+    // computing power index for every node; if not set, all nodes are treated as having equal computing power
+    std::vector<std::pair<int, double> > node_id_computing_power_indexes_;
+};
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconCoilMapEstimation.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconCoilMapEstimation.h
new file mode 100644
index 0000000..e12da9f
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconCoilMapEstimation.h
@@ -0,0 +1,137 @@
+/** \file   gtPlusISMRMRDReconCoilMapEstimation.h
+    \brief  Implement coil map estimation methods.
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "GtPlusExport.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusSPIRIT.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+// ================================================================================================== //
+
+template <typename T> 
+class gtPlusISMRMRDReconCoilMapEstimation
+{
+public:
+
+    typedef typename realType<T>::Type value_type;
+
+    gtPlusISMRMRDReconCoilMapEstimation();
+    virtual ~gtPlusISMRMRDReconCoilMapEstimation();
+
+    void printInfo(std::ostream& os);
+
+    // compute dual coil map
+    // data : ref kspace [RO E1 CHA]
+    // coilMap : [RO E1 CHA 2]
+    bool coilMap2DSPIRIT(const hoNDArray<T>& data, hoNDArray<T>& coilMap, hoNDArray<value_type>& eigD, size_t kRO, size_t kE1, value_type thres=0.01);
+};
+
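+// Illustrative usage sketch (the names 'ref', 'RO', 'E1' and 'CHA' are assumed; note
+// that coilMap must be pre-sized to the target [RO E1 ...] because RO and E1 are read
+// from it before it is re-created inside coilMap2DSPIRIT):
+//
+//     gtPlusISMRMRDReconCoilMapEstimation< std::complex<float> > est;
+//     hoNDArray< std::complex<float> > coilMap(RO, E1, CHA, 2);
+//     hoNDArray<float> eigD;
+//     est.coilMap2DSPIRIT(ref, coilMap, eigD, 7, 7, 0.01f); // 7x7 kernel size is an assumed choice
+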
+template <typename T> 
+gtPlusISMRMRDReconCoilMapEstimation<T>::gtPlusISMRMRDReconCoilMapEstimation()
+{
+}
+
+template <typename T> 
+gtPlusISMRMRDReconCoilMapEstimation<T>::~gtPlusISMRMRDReconCoilMapEstimation()
+{
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconCoilMapEstimation<T>::coilMap2DSPIRIT(const hoNDArray<T>& data, hoNDArray<T>& coilMap, hoNDArray<value_type>& eigD, size_t kRO, size_t kE1, value_type thres)
+{
+    try
+    {
+        gtPlusSPIRIT<T> spirit;
+
+        size_t oRO = 1;
+        size_t oE1 = 1;
+
+        size_t RO = coilMap.get_size(0);
+        size_t E1 = coilMap.get_size(1);
+        size_t CHA = data.get_size(2);
+
+        ho3DArray<T> acsSrc(data.get_size(0), data.get_size(1), CHA, const_cast<T*>(data.begin()));
+        ho3DArray<T> acsDst(data.get_size(0), data.get_size(1), CHA, const_cast<T*>(data.begin()));
+
+        ho6DArray<T> ker(kRO, kE1, CHA, CHA, oRO, oE1);
+
+        GADGET_CHECK_RETURN_FALSE(spirit.calib(acsSrc, acsDst, thres, kRO, kE1, oRO, oE1, ker));
+
+        // std::string debugFolder_ = "D:/gtuser/mrprogs/gadgetron/toolboxes/gtplus/ut/result/";
+        // Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+        // if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(ker, debugFolder_+"ker"); }
+
+        bool minusI = false;
+        hoNDArray<T> kIm(RO, E1, CHA, CHA);
+        GADGET_CHECK_RETURN_FALSE(spirit.imageDomainKernel(ker, kRO, kE1, oRO, oE1, RO, E1, kIm, minusI));
+        T* pkIm = kIm.begin();
+
+        // if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kIm, debugFolder_+"kIm"); }
+
+        coilMap.create(RO, E1, CHA, 2);
+        eigD.create(RO, E1, 2);
+
+        long long ro, e1, scha, dcha;
+
+        #pragma omp parallel default(none) private(ro, e1, scha, dcha) shared(RO, E1, CHA, pkIm, coilMap, eigD)
+        {
+            hoMatrix<T> R(CHA, CHA), RC(CHA, CHA), RRT(CHA, CHA);
+            Gadgetron::clear(RRT);
+
+            hoMatrix<value_type> eigenValue;
+
+            #pragma omp for 
+            for ( e1=0; e1<E1; e1++ )
+            {
+                for ( ro=0; ro<RO; ro++ )
+                {
+                    const size_t offset = e1*RO + ro;
+
+                    for ( dcha=0; dcha<CHA; dcha++ )
+                    {
+                        for ( scha=0; scha<CHA; scha++ )
+                        {
+                            // T v = kIm(ro, e1, scha, dcha);
+                            T v = pkIm[dcha*RO*E1*CHA + scha*RO*E1 + offset];
+                            if ( scha == dcha )
+                            {
+                                v -= 1;
+                            }
+
+                            R(scha, dcha) = v;
+                        }
+                    }
+
+                    memcpy(RC.begin(), R.begin(), sizeof(T)*CHA*CHA);
+                    Gadgetron::gemm(RRT, RC, false, R, true);
+
+                    Gadgetron::heev(RRT, eigenValue);
+
+                    for ( scha=0; scha<CHA; scha++ )
+                    {
+                        coilMap(ro, e1, scha, 0) = RRT(scha, 0);
+                        coilMap(ro, e1, scha, 1) = RRT(scha, 1);
+                        eigD(ro, e1, 0) = 1.0 - eigenValue(0, 0);
+                        eigD(ro, e1, 1) = 1.0 - eigenValue(1, 0);
+                    }
+                }
+            }
+        }
+
+        Gadgetron::conjugate(coilMap, coilMap);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconCoilMapEstimation<T>::coilMap2DSPIRIT(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconUtil.cpp b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconUtil.cpp
new file mode 100644
index 0000000..e40c79b
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconUtil.cpp
@@ -0,0 +1,864 @@
+
+#include "gtPlusISMRMRDReconUtil.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+//
+// Instantiation
+//
+
+template EXPORTGTPLUS class gtPlusISMRMRDReconUtil<float>;
+template EXPORTGTPLUS class gtPlusISMRMRDReconUtil<double>;
+template EXPORTGTPLUS class gtPlusISMRMRDReconUtil< std::complex<float> >;
+template EXPORTGTPLUS class gtPlusISMRMRDReconUtil< std::complex<double> >;
+
+template EXPORTGTPLUS class gtPlusISMRMRDReconUtilComplex< std::complex<float> >;
+template EXPORTGTPLUS class gtPlusISMRMRDReconUtilComplex< std::complex<double> >;
+
+}}
+
+namespace Gadgetron {
+
+    // ----------------------------------------------------------------------------------------
+    // templated functions
+    // ----------------------------------------------------------------------------------------
+
+    template <typename T> 
+    bool cropUpTo11DArray(const hoNDArray<T>& x, hoNDArray<T>& r, const std::vector<size_t>& startND, std::vector<size_t>& size)
+    {
+        GADGET_CHECK_RETURN_FALSE( startND.size() == size.size() );
+        GADGET_CHECK_RETURN_FALSE( startND.size() <= 11 );
+
+        r.create(&size);
+        if ( r.get_number_of_elements() == x.get_number_of_elements() )
+        {
+            r = x;
+            return true;
+        }
+
+        std::vector<size_t> start(11, 0);
+        std::vector<size_t> end(11, 0);
+
+        size_t ii;
+        for ( ii=0; ii<startND.size(); ii++ )
+        {
+            start[ii] = startND[ii];
+            end[ii] = start[ii] + size[ii] - 1;
+            GADGET_CHECK_RETURN_FALSE(end[ii] < x.get_size(ii));
+        }
+
+        // [Ro E1 Cha Slice E2 Con Phase Rep Set Seg Ave]
+        size_t e1, cha, n, s, con, phs, rep, set, seg, ave;
+
+        std::vector<size_t> srcInd(11), dstInd(11);
+
+        for ( ave=start[10]; ave<=end[10]; ave++ )
+        {
+            srcInd[10] = ave; dstInd[10] = ave-start[10];
+
+            for ( seg=start[9]; seg<=end[9]; seg++ )
+            {
+                srcInd[9] = seg; dstInd[9] = seg-start[9];
+
+                for ( set=start[8]; set<=end[8]; set++ )
+                {
+                    srcInd[8] = set; dstInd[8] = set-start[8];
+
+                    for ( rep=start[7]; rep<=end[7]; rep++ )
+                    {
+                        srcInd[7] = rep; dstInd[7] = rep-start[7];
+
+                        for ( phs=start[6]; phs<=end[6]; phs++ )
+                        {
+                            srcInd[6] = phs; dstInd[6] = phs-start[6];
+
+                            for ( con=start[5]; con<=end[5]; con++ )
+                            {
+                                srcInd[5] = con; dstInd[5] = con-start[5];
+
+                                for ( s=start[4]; s<=end[4]; s++ )
+                                {
+                                    srcInd[4] = s; dstInd[4] = s-start[4];
+
+                                    for ( n=start[3]; n<=end[3]; n++ )
+                                    {
+                                        srcInd[3] = n; dstInd[3] = n-start[3];
+
+                                        for ( cha=start[2]; cha<=end[2]; cha++ )
+                                        {
+                                            srcInd[2] = cha; dstInd[2] = cha-start[2];
+
+                                            for ( e1=start[1]; e1<=end[1]; e1++ )
+                                            {
+                                                srcInd[1] = e1; dstInd[1] = e1-start[1];
+
+                                                srcInd[0] = start[0];
+                                                dstInd[0] = 0;
+
+                                                size_t offsetSrc = x.calculate_offset(srcInd);
+                                                size_t offsetDst = r.calculate_offset(dstInd);
+
+                                                memcpy(r.begin()+offsetDst, x.begin()+offsetSrc, sizeof(T)*(end[0]-start[0]+1));
+
+                                                /*for ( ro=start[0]; ro<=end[0]; ro++ )
+                                                {
+                                                    srcInd[0] = ro;
+                                                    dstInd[0] = ro-start[0];
+
+                                                    int offsetSrc = x.calculate_offset(srcInd);
+                                                    int offsetDst = r.calculate_offset(dstInd);
+
+                                                    r(offsetDst) = x(offsetSrc);
+                                                }*/
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        return true;
+    }
+
+    template <typename T> 
+    bool setSubArrayUpTo11DArray(const hoNDArray<T>& x, hoNDArray<T>& r, const std::vector<size_t>& startND, std::vector<size_t>& size)
+    {
+        GADGET_CHECK_RETURN_FALSE( startND.size() == size.size() );
+        GADGET_CHECK_RETURN_FALSE( startND.size() <= 11 );
+
+        if ( r.get_number_of_elements() == x.get_number_of_elements() )
+        {
+            r = x;
+            return true;
+        }
+
+        std::vector<size_t> start(11, 0);
+        std::vector<size_t> end(11, 0);
+
+        size_t ii;
+        for ( ii=0; ii<startND.size(); ii++ )
+        {
+            start[ii] = startND[ii];
+            end[ii] = start[ii] + size[ii] - 1;
+            GADGET_CHECK_RETURN_FALSE(end[ii] < r.get_size(ii));
+        }
+
+        // [Ro E1 Cha Slice E2 Con Phase Rep Set Seg Ave]
+        size_t e1, cha, n, s, con, phs, rep, set, seg, ave;
+
+        std::vector<size_t> srcInd(11), dstInd(11);
+
+        for ( ave=start[10]; ave<=end[10]; ave++ )
+        {
+            dstInd[10] = ave; srcInd[10] = ave-start[10];
+
+            for ( seg=start[9]; seg<=end[9]; seg++ )
+            {
+                dstInd[9] = seg; srcInd[9] = seg-start[9];
+
+                for ( set=start[8]; set<=end[8]; set++ )
+                {
+                    dstInd[8] = set; srcInd[8] = set-start[8];
+
+                    for ( rep=start[7]; rep<=end[7]; rep++ )
+                    {
+                        dstInd[7] = rep; srcInd[7] = rep-start[7];
+
+                        for ( phs=start[6]; phs<=end[6]; phs++ )
+                        {
+                            dstInd[6] = phs; srcInd[6] = phs-start[6];
+
+                            for ( con=start[5]; con<=end[5]; con++ )
+                            {
+                                dstInd[5] = con; srcInd[5] = con-start[5];
+
+                                for ( s=start[4]; s<=end[4]; s++ )
+                                {
+                                    dstInd[4] = s; srcInd[4] = s-start[4];
+
+                                    for ( n=start[3]; n<=end[3]; n++ )
+                                    {
+                                        dstInd[3] = n; srcInd[3] = n-start[3];
+
+                                        for ( cha=start[2]; cha<=end[2]; cha++ )
+                                        {
+                                            dstInd[2] = cha; srcInd[2] = cha-start[2];
+
+                                            for ( e1=start[1]; e1<=end[1]; e1++ )
+                                            {
+                                                dstInd[1] = e1; srcInd[1] = e1-start[1];
+
+                                                dstInd[0] = start[0];
+                                                srcInd[0] = 0;
+
+                                                size_t offsetSrc = x.calculate_offset(srcInd);
+                                                size_t offsetDst = r.calculate_offset(dstInd);
+
+                                                memcpy(r.begin()+offsetDst, x.begin()+offsetSrc, sizeof(T)*(end[0]-start[0]+1));
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        return true;
+    }
+
+    template<typename T> 
+    bool extractSampledLinesUpTo11DArray(const hoNDArray<T>& x, hoNDArray<T>& r, const hoNDArray<float>& timeStamp, double acceFactorE1, double acceFactorE2)
+    {
+        try
+        {
+            std::vector<size_t> dim;
+            x.get_dimensions(dim);
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+            size_t CHA = x.get_size(2);
+            size_t SLC = x.get_size(3);
+            size_t E2 = x.get_size(4);
+            size_t CON = x.get_size(5);
+            size_t PHS = x.get_size(6);
+            size_t REP = x.get_size(7);
+            size_t SET = x.get_size(8);
+            size_t SEG = x.get_size(9);
+            size_t AVE = x.get_size(10);
+
+            size_t Num = AVE*SEG*SET*REP*PHS*SLC;
+
+            std::vector<size_t> dimRes(dim);
+
+            if ( acceFactorE1>1 && E1>1 )
+            {
+                dimRes[1] = (size_t)(E1/acceFactorE1) + 1;
+            }
+
+            size_t dstE1 = dimRes[1];
+
+            if ( acceFactorE2>1 && E2>1 )
+            {
+                dimRes[4] = (size_t)(E2/acceFactorE2) + 1;
+            }
+
+            r.create(&dimRes);
+            Gadgetron::clear(r);
+
+            // [Ro E1 Cha Slice E2 Con Phase Rep Set Seg Ave]
+
+            size_t ROLen = sizeof(T)*RO;
+            hoNDArray<T> dummyArray(SLC, CON, PHS, REP, SET, SEG, AVE);
+
+            long long n;
+            #pragma omp parallel default(none) private(n) shared(Num, dummyArray, RO, E1, CHA, SLC, E2, CON, PHS, REP, SET, SEG, AVE, timeStamp, x, r, ROLen, dstE1)
+            {
+
+                std::vector<size_t> indN;
+                std::vector<size_t> srcInd(11, 0), dstInd(11, 0);
+                size_t e1, cha, slc, e2, con, rep, phs, set, seg, ave;
+
+                #pragma omp for
+                for ( n=0; n<(long long)Num; n++ )
+                {
+                    indN = dummyArray.calculate_index(n);
+
+                    ave = indN[6];
+                    seg = indN[5];
+                    set = indN[4];
+                    rep = indN[3];
+                    phs = indN[2];
+                    con = indN[1];
+                    slc = indN[0];
+
+                    srcInd[10] = ave; dstInd[10] = ave;
+                    srcInd[9] = seg; dstInd[9] = seg;
+                    srcInd[8] = set; dstInd[8] = set;
+                    srcInd[7] = rep; dstInd[7] = rep;
+                    srcInd[6] = phs; dstInd[6] = phs;
+                    srcInd[5] = con; dstInd[5] = con;
+                    srcInd[4] = slc; dstInd[4] = slc;
+
+                    // ------------------
+                    size_t indE2(0);
+                    size_t prevE2(0);
+                    for ( e2=0; e2<E2; e2++ )
+                    {
+                        srcInd[3] = e2; dstInd[3] = indE2;
+
+                        size_t indE1(0);
+                        for ( e1=0; e1<E1; e1++ )
+                        {
+                            srcInd[1] = e1; 
+
+                            srcInd[2] = 0;
+                            if ( timeStamp(srcInd) > 0 )
+                            {
+                                dstInd[1] = indE1;
+                                indE1++;
+
+                                if ( e2 != prevE2 )
+                                {
+                                    prevE2 = e2;
+                                    indE2++;
+                                }
+
+                                // ------------------
+                                srcInd[2] = 0; dstInd[2] = 0;
+                                size_t offsetSrc = x.calculate_offset(srcInd);
+                                size_t offsetDst = r.calculate_offset(dstInd);
+
+                                for ( cha=0; cha<CHA; cha++ )
+                                {
+                                    memcpy(r.begin()+offsetDst, x.begin()+offsetSrc, ROLen);
+
+                                    offsetSrc += RO*E1;
+                                    offsetDst += RO*dstE1;
+                                }
+                                // ------------------
+                            }
+                        }
+                        // ------------------
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in extractSampledLinesUpTo11DArray(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename T> 
+    bool fillSampledLinesUpTo11DArray(const hoNDArray<T>& x, hoNDArray<T>& r, const hoNDArray<float>& timeStamp)
+    {
+        try
+        {
+            size_t RO = x.get_size(0);
+            size_t E1 = timeStamp.get_size(1);
+            size_t CHA = x.get_size(2);
+            size_t SLC = timeStamp.get_size(3);
+            size_t E2 = timeStamp.get_size(4);
+            size_t CON = timeStamp.get_size(5);
+            size_t PHS = timeStamp.get_size(6);
+            size_t REP = timeStamp.get_size(7);
+            size_t SET = timeStamp.get_size(8);
+            size_t SEG = timeStamp.get_size(9);
+            size_t AVE = timeStamp.get_size(10);
+
+            size_t srcE1 = x.get_size(1);
+
+            size_t Num = AVE*SEG*SET*REP*PHS*SLC;
+
+            std::vector<size_t> dimRes;
+            timeStamp.get_dimensions(dimRes);
+
+            dimRes[0] = RO;
+            dimRes[2] = CHA;
+            r.create(&dimRes);
+            Gadgetron::clear(r);
+
+            size_t ROLen = sizeof(T)*RO;
+            hoNDArray<T> dummyArray(SLC, CON, PHS, REP, SET, SEG, AVE);
+
+            long long n;
+            #pragma omp parallel default(none) private(n) shared(Num, dummyArray, RO, E1, CHA, SLC, E2, CON, PHS, REP, SET, SEG, AVE, timeStamp, x, r, ROLen, srcE1)
+            {
+
+                std::vector<size_t> indN;
+                std::vector<size_t> srcInd(11, 0), dstInd(11, 0);
+                size_t e1, cha, slc, e2, con, rep, phs, set, seg, ave;
+
+                #pragma omp for
+                for ( n=0; n<(long long)Num; n++ )
+                {
+                    indN = dummyArray.calculate_index(n);
+
+                    ave = indN[6];
+                    seg = indN[5];
+                    set = indN[4];
+                    rep = indN[3];
+                    phs = indN[2];
+                    con = indN[1];
+                    slc = indN[0];
+
+                    srcInd[10] = ave; dstInd[10] = ave;
+                    srcInd[9] = seg; dstInd[9] = seg;
+                    srcInd[8] = set; dstInd[8] = set;
+                    srcInd[7] = rep; dstInd[7] = rep;
+                    srcInd[6] = phs; dstInd[6] = phs;
+                    srcInd[5] = con; dstInd[5] = con;
+                    srcInd[4] = slc; dstInd[4] = slc;
+
+                    // ------------------
+                    size_t indE2(0);
+                    size_t prevE2(0);
+                    for ( e2=0; e2<E2; e2++ )
+                    {
+                        srcInd[3] = indE2; dstInd[3] = e2;
+
+                        size_t indE1(0);
+                        for ( e1=0; e1<E1; e1++ )
+                        {
+                            dstInd[1] = e1; 
+
+                            dstInd[2] = 0;
+                            if ( timeStamp(dstInd) > 0 )
+                            {
+                                srcInd[1] = indE1;
+                                indE1++;
+
+                                if ( e2 != prevE2 )
+                                {
+                                    prevE2 = e2;
+                                    indE2++;
+                                }
+
+                                // ------------------
+
+                                srcInd[2] = 0; dstInd[2] = 0;
+                                size_t offsetSrc = x.calculate_offset(srcInd);
+                                size_t offsetDst = r.calculate_offset(dstInd);
+
+                                for ( cha=0; cha<CHA; cha++ )
+                                {
+                                    memcpy(r.begin()+offsetDst, x.begin()+offsetSrc, ROLen);
+
+                                    offsetSrc += RO*srcE1;
+                                    offsetDst += RO*E1;
+                                }
+                                // ------------------
+                            }
+                        }
+                        // ------------------
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in fillSampledLinesUpTo11DArray(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename T> 
+    bool stdOver3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& std, bool NMinusOne)
+    {
+        try
+        {
+            typedef typename realType<T>::Type value_type;
+
+            GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_dimensions() >= 3);
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+            size_t CHA = x.get_size(2);
+
+            long long num = (long long)x.get_number_of_elements() / (RO*E1*CHA);
+
+            boost::shared_ptr< std::vector<size_t> > dim = x.get_dimensions();
+
+            std::vector<size_t> dimStd(*dim);
+            dimStd.erase(dimStd.begin()+2);
+            std.create(&dimStd);
+
+            std::vector<size_t> dim3D(3);
+            dim3D[0] = RO;
+            dim3D[1] = E1;
+            dim3D[2] = CHA;
+
+            T S( (value_type)CHA );
+            if ( NMinusOne )
+            {
+                S = T( (value_type)CHA-1);
+            }
+
+            T v(0), v1(0);
+            T S2 = T( (value_type)1.0 )/S;
+            T S3 = T( (value_type)1.0 )/T( (value_type)CHA );
+
+            long long n;
+
+            #pragma omp parallel for default(none) private(n) shared(num, RO, E1, CHA, x, std, S, S2, S3, v, v1)
+            for ( n=0; n<num; n++ )
+            {
+                hoNDArray<T> xTmp(RO, E1, CHA, const_cast<T*>(x.begin()+n*RO*E1*CHA));
+                hoNDArray<T> mean(RO, E1);
+                Gadgetron::clear(mean);
+
+                size_t ro, e1, cha;
+                for ( cha=0; cha<CHA; cha++ )
+                {
+                    for ( e1=0; e1<E1; e1++ )
+                    {
+                        for ( ro=0; ro<RO; ro++ )
+                        {
+                            mean(ro+e1*RO) += xTmp(cha*RO*E1+e1*RO+ro)*S3;
+                        }
+                    }
+                }
+
+                for ( e1=0; e1<E1; e1++ )
+                {
+                    for ( ro=0; ro<RO; ro++ )
+                    {
+                        size_t ind = e1*RO+ro;
+
+                        v = 0; v1 = 0;
+                        for ( cha=0; cha<CHA; cha++ )
+                        {
+                            v1 = std::abs(xTmp(cha*RO*E1+ind)-mean(ind));
+                            v += v1*v1;
+                        }
+
+                        v /= S;
+                        std(ind+n*RO*E1) = std::sqrt(v);
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in stdOver3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& std, bool NMinusOne) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename T> 
+    bool cropOver3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& r, size_t start, size_t end)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > dimX = x.get_dimensions();
+
+            size_t NDim = dimX->size();
+
+            if ( NDim <= 2 )
+            {
+                r = x;
+                return true;
+            }
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+            size_t E2 = x.get_size(2);
+
+            size_t E2_R = end-start+1;
+
+            if ( E2 <= E2_R )
+            {
+                r = x;
+                return true;
+            }
+
+            std::vector<size_t> dimR(*dimX);
+            dimR[2] = E2_R;
+
+            r.create(&dimR);
+
+            size_t N2D = RO*E1;
+            size_t N3D = RO*E1*E2;
+            size_t N3D_R = RO*E1*E2_R;
+
+            size_t N = x.get_number_of_elements()/N3D;
+
+            const T* pX = x.begin();
+            T* pR = r.begin();
+
+            size_t n;
+            for ( n=0; n<N; n++ )
+            {
+                long long e2;
+                #pragma omp parallel for default(none) private(e2) shared(N2D, N3D, N3D_R, pX, pR, RO, E1, E2, n, start, end)
+                for ( e2=(long long)start; e2<=(long long)end; e2++ )
+                {
+                    memcpy(pR+n*N3D_R+(e2-start)*N2D, pX+n*N3D+e2*N2D, sizeof(T)*N2D);
+                }
+            }
+        }
+        catch (...)
+        {
+            GERROR_STREAM("Errors in cropOver3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& r, size_t start, size_t end) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    template<typename T> bool setSubArrayOver3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& r, size_t start, size_t end)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > dimR = r.get_dimensions();
+
+            size_t NDim = dimR->size();
+
+            if ( NDim <= 2 )
+            {
+                r = x;
+                return true;
+            }
+
+            size_t RO = r.get_size(0);
+            size_t E1 = r.get_size(1);
+            size_t E2 = r.get_size(2);
+
+            size_t E2_X = end-start+1;
+            GADGET_CHECK_RETURN_FALSE( E2_X == x.get_size(2) );
+
+            if ( E2_X >= E2 )
+            {
+                r = x;
+                return true;
+            }
+
+            size_t N2D = RO*E1;
+            size_t N3D = RO*E1*E2;
+            size_t N3D_X = RO*E1*E2_X;
+
+            size_t N = r.get_number_of_elements()/N3D;
+
+            const T* pX = x.begin();
+            T* pR = r.begin();
+
+            size_t n;
+            for ( n=0; n<N; n++ )
+            {
+                long long e2;
+                #pragma omp parallel for default(none) private(e2) shared(N2D, N3D, N3D_X, pX, pR, RO, E1, E2, n, start, end)
+                for ( e2=(long long)start; e2<=(long long)end; e2++ )
+                {
+                    memcpy(pR+n*N3D+e2*N2D, pX+n*N3D_X+(e2-start)*N2D, sizeof(T)*N2D);
+                }
+            }
+        }
+        catch (...)
+        {
+            GERROR_STREAM("Errors in setSubArrayOver3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& r, size_t start, size_t end) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    template<typename T>
+    bool imageDomainUnwrapping2D(const hoNDArray<T>& x, const hoNDArray<T>& kernel, hoNDArray<T>& buf, hoNDArray<T>& y)
+    {
+        try
+        {
+            typedef typename realType<T>::Type value_type;
+
+            T* pX = const_cast<T*>(x.begin());
+            T* ker = const_cast<T*>(kernel.begin());
+            T* pY = y.begin();
+
+            size_t ro = x.get_size(0);
+            size_t e1 = x.get_size(1);
+            size_t srcCHA = x.get_size(2);
+            size_t dstCHA = kernel.get_size(3);
+
+            if (buf.get_number_of_elements() < ro*e1*srcCHA)
+            {
+                buf.create(ro, e1, srcCHA);
+            }
+            T* pBuf = buf.begin();
+
+            size_t dCha;
+
+            //#pragma omp parallel default(shared)
+            {
+                //#ifdef WIN32
+                //    int tid = omp_get_thread_num();
+                //    DWORD_PTR mask = (1 << tid);
+                //    // GADGET_MSG("thread id : " << tid << " - mask : " << mask);
+                //    SetThreadAffinityMask( GetCurrentThread(), mask );
+                //#endif // WIN32
+
+                //#pragma omp for
+
+                for (dCha = 0; dCha<dstCHA; dCha++)
+                {
+                    // multiplyCplx(ro*e1*srcCHA, pX, ker+dCha*ro*e1*srcCHA, pBuf);
+
+                    long long N = ro*e1*srcCHA;
+                    T* x = pX;
+                    T* y = ker + dCha*ro*e1*srcCHA;
+                    T* r = pBuf;
+
+                    long long n;
+#pragma omp parallel for default(none) private(n) shared(N, x, y, r) if (N>64*1024)
+                    for (n = 0; n < (long long)N; n++)
+                    {
+                        const T& a1 = x[n];
+                        const T& b1 = y[n];
+                        const value_type a = a1.real();
+                        const value_type b = a1.imag();
+                        const value_type c = b1.real();
+                        const value_type d = b1.imag();
+
+                        reinterpret_cast<value_type(&)[2]>(r[n])[0] = a*c - b*d;
+                        reinterpret_cast<value_type(&)[2]>(r[n])[1] = a*d + b*c;
+                    }
+
+                    memcpy(pY + dCha*ro*e1, pBuf, sizeof(T)*ro*e1);
+                    for (size_t sCha = 1; sCha<srcCHA; sCha++)
+                    {
+                        // Gadgetron::math::add(ro*e1, pY+dCha*ro*e1, pBuf+sCha*ro*e1, pY+dCha*ro*e1);
+
+                        size_t ii;
+                        size_t N2D = ro*e1;
+
+                        T* pY2D = pY + dCha*ro*e1;
+                        T* pBuf2D = pBuf + sCha*ro*e1;
+
+                        for (ii = 0; ii<N2D; ii++)
+                        {
+                            pY2D[ii] += pBuf2D[ii];
+                        }
+                    }
+                }
+            }
+        }
+        catch (...)
+        {
+            GERROR_STREAM("Errors in imageDomainUnwrapping2D(const hoNDArray<T>& x, const hoNDArray<T>& ker, hoNDArray<T>& buf, hoNDArray<T>& y) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    template<typename CoordType, typename T> 
+    bool computePeriodicBoundaryValues(const hoNDArray<CoordType>& x, const hoNDArray<T>& y, CoordType start, CoordType end, hoNDArray<CoordType>& vx, hoNDArray<T>& vy)
+    {
+        try
+        {
+            typedef typename realType<T>::Type real_value_type;
+
+            size_t N = x.get_size(0);
+            size_t M = y.get_size(1);
+
+            GADGET_CHECK_RETURN_FALSE(y.get_size(0)==N);
+            GADGET_CHECK_RETURN_FALSE(start<=x(0));
+            GADGET_CHECK_RETURN_FALSE(end>=x(N-1));
+
+            vx.create(N+2);
+            vy.create(N+2, M);
+
+            size_t m, n;
+
+            vx(0) = start;
+            for ( n=0; n<N; n++ )
+            {
+                vx(n+1) = x(n);
+            }
+            vx(N+1) = end;
+
+            CoordType dS = x(0) - start;
+            CoordType dE = end - x(N-1);
+
+            // start, end
+            CoordType wS;
+            if ( dE+dS > FLT_EPSILON )
+                wS = dE/(dE+dS);
+            else
+                wS = dE/FLT_EPSILON;
+
+            for ( m=0; m<M; m++ )
+            {
+                T a = y(0, m);
+                T b = y(N-1, m);
+
+                vy(0, m) = b + (real_value_type)wS * ( a - b );
+                vy(N+1, m) = vy(0, m);
+            }
+
+            // middle
+            for ( n=0; n<N; n++ )
+            {
+                for ( m=0; m<M; m++ )
+                {
+                    vy(n+1, m) = y(n, m);
+                }
+            }
+        }
+        catch (...)
+        {
+            GERROR_STREAM("Errors in computePeriodicBoundaryValues(const hoNDArray<CoordType>& x, const hoNDArray<T>& y, CoordType& start, CoordType& end, hoNDArray<T>& r) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    template EXPORTGTPLUS bool cropUpTo11DArray(const hoNDArray<short>& x, hoNDArray<short>& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+    template EXPORTGTPLUS bool cropUpTo11DArray(const hoNDArray<unsigned short>& x, hoNDArray<unsigned short>& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+    template EXPORTGTPLUS bool cropUpTo11DArray(const hoNDArray<float>& x, hoNDArray<float>& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+    template EXPORTGTPLUS bool cropUpTo11DArray(const hoNDArray<double>& x, hoNDArray<double>& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+    template EXPORTGTPLUS bool cropUpTo11DArray(const hoNDArray< std::complex<float> >& x, hoNDArray< std::complex<float> >& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+    template EXPORTGTPLUS bool cropUpTo11DArray(const hoNDArray< std::complex<double> >& x, hoNDArray< std::complex<double> >& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+
+    template EXPORTGTPLUS bool setSubArrayUpTo11DArray(const hoNDArray<short>& x, hoNDArray<short>& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+    template EXPORTGTPLUS bool setSubArrayUpTo11DArray(const hoNDArray<unsigned short>& x, hoNDArray<unsigned short>& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+    template EXPORTGTPLUS bool setSubArrayUpTo11DArray(const hoNDArray<float>& x, hoNDArray<float>& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+    template EXPORTGTPLUS bool setSubArrayUpTo11DArray(const hoNDArray<double>& x, hoNDArray<double>& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+    template EXPORTGTPLUS bool setSubArrayUpTo11DArray(const hoNDArray< std::complex<float> >& x, hoNDArray< std::complex<float> >& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+    template EXPORTGTPLUS bool setSubArrayUpTo11DArray(const hoNDArray< std::complex<double> >& x, hoNDArray< std::complex<double> >& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+
+    template EXPORTGTPLUS bool extractSampledLinesUpTo11DArray(const hoNDArray<short>& x, hoNDArray<short>& r, const hoNDArray<float>& timeStamp, double acceFactorE1, double acceFactorE2);
+    template EXPORTGTPLUS bool extractSampledLinesUpTo11DArray(const hoNDArray<unsigned short>& x, hoNDArray<unsigned short>& r, const hoNDArray<float>& timeStamp, double acceFactorE1, double acceFactorE2);
+    template EXPORTGTPLUS bool extractSampledLinesUpTo11DArray(const hoNDArray<float>& x, hoNDArray<float>& r, const hoNDArray<float>& timeStamp, double acceFactorE1, double acceFactorE2);
+    template EXPORTGTPLUS bool extractSampledLinesUpTo11DArray(const hoNDArray<double>& x, hoNDArray<double>& r, const hoNDArray<float>& timeStamp, double acceFactorE1, double acceFactorE2);
+    template EXPORTGTPLUS bool extractSampledLinesUpTo11DArray(const hoNDArray< std::complex<float> >& x, hoNDArray< std::complex<float> >& r, const hoNDArray<float>& timeStamp, double acceFactorE1, double acceFactorE2);
+    template EXPORTGTPLUS bool extractSampledLinesUpTo11DArray(const hoNDArray< std::complex<double> >& x, hoNDArray< std::complex<double> >& r, const hoNDArray<float>& timeStamp, double acceFactorE1, double acceFactorE2);
+
+    template EXPORTGTPLUS bool fillSampledLinesUpTo11DArray(const hoNDArray<short>& x, hoNDArray<short>& r, const hoNDArray<float>& timeStamp);
+    template EXPORTGTPLUS bool fillSampledLinesUpTo11DArray(const hoNDArray<unsigned short>& x, hoNDArray<unsigned short>& r, const hoNDArray<float>& timeStamp);
+    template EXPORTGTPLUS bool fillSampledLinesUpTo11DArray(const hoNDArray<float>& x, hoNDArray<float>& r, const hoNDArray<float>& timeStamp);
+    template EXPORTGTPLUS bool fillSampledLinesUpTo11DArray(const hoNDArray<double>& x, hoNDArray<double>& r, const hoNDArray<float>& timeStamp);
+    template EXPORTGTPLUS bool fillSampledLinesUpTo11DArray(const hoNDArray< std::complex<float> >& x, hoNDArray< std::complex<float> >& r, const hoNDArray<float>& timeStamp);
+    template EXPORTGTPLUS bool fillSampledLinesUpTo11DArray(const hoNDArray< std::complex<double> >& x, hoNDArray< std::complex<double> >& r, const hoNDArray<float>& timeStamp);
+
+    template EXPORTGTPLUS bool cropOver3rdDimension(const hoNDArray<short>& x, hoNDArray<short>& r, size_t start, size_t end);
+    template EXPORTGTPLUS bool cropOver3rdDimension(const hoNDArray<unsigned short>& x, hoNDArray<unsigned short>& r, size_t start, size_t end);
+    template EXPORTGTPLUS bool cropOver3rdDimension(const hoNDArray<float>& x, hoNDArray<float>& r, size_t start, size_t end);
+    template EXPORTGTPLUS bool cropOver3rdDimension(const hoNDArray<double>& x, hoNDArray<double>& r, size_t start, size_t end);
+    template EXPORTGTPLUS bool cropOver3rdDimension(const hoNDArray< std::complex<float> >& x, hoNDArray< std::complex<float> >& r, size_t start, size_t end);
+    template EXPORTGTPLUS bool cropOver3rdDimension(const hoNDArray< std::complex<double> >& x, hoNDArray< std::complex<double> >& r, size_t start, size_t end);
+
+    template EXPORTGTPLUS bool setSubArrayOver3rdDimension(const hoNDArray<short>& x, hoNDArray<short>& r, size_t start, size_t end);
+    template EXPORTGTPLUS bool setSubArrayOver3rdDimension(const hoNDArray<unsigned short>& x, hoNDArray<unsigned short>& r, size_t start, size_t end);
+    template EXPORTGTPLUS bool setSubArrayOver3rdDimension(const hoNDArray<float>& x, hoNDArray<float>& r, size_t start, size_t end);
+    template EXPORTGTPLUS bool setSubArrayOver3rdDimension(const hoNDArray<double>& x, hoNDArray<double>& r, size_t start, size_t end);
+    template EXPORTGTPLUS bool setSubArrayOver3rdDimension(const hoNDArray< std::complex<float> >& x, hoNDArray< std::complex<float> >& r, size_t start, size_t end);
+    template EXPORTGTPLUS bool setSubArrayOver3rdDimension(const hoNDArray< std::complex<double> >& x, hoNDArray< std::complex<double> >& r, size_t start, size_t end);
+
+    template EXPORTGTPLUS bool stdOver3rdDimension(const hoNDArray<float>& x, hoNDArray<float>& std, bool NMinusOne);
+    template EXPORTGTPLUS bool stdOver3rdDimension(const hoNDArray<double>& x, hoNDArray<double>& std, bool NMinusOne);
+    template EXPORTGTPLUS bool stdOver3rdDimension(const hoNDArray< std::complex<float> >& x, hoNDArray< std::complex<float> >& std, bool NMinusOne);
+    template EXPORTGTPLUS bool stdOver3rdDimension(const hoNDArray< std::complex<double> >& x, hoNDArray< std::complex<double> >& std, bool NMinusOne);
+
+    template EXPORTGTPLUS bool imageDomainUnwrapping2D(const hoNDArray< std::complex<float> >& x, const hoNDArray< std::complex<float> >& ker, hoNDArray< std::complex<float> >& buf, hoNDArray< std::complex<float> >& y);
+    template EXPORTGTPLUS bool imageDomainUnwrapping2D(const hoNDArray< std::complex<double> >& x, const hoNDArray< std::complex<double> >& ker, hoNDArray< std::complex<double> >& buf, hoNDArray< std::complex<double> >& y);
+
+    template EXPORTGTPLUS bool computePeriodicBoundaryValues(const hoNDArray<float>& x, const hoNDArray<float>& y, float start, float end, hoNDArray<float>& vx, hoNDArray<float>& vy);
+    template EXPORTGTPLUS bool computePeriodicBoundaryValues(const hoNDArray<float>& x, const hoNDArray<double>& y, float start, float end, hoNDArray<float>& vx, hoNDArray<double>& vy);
+    template EXPORTGTPLUS bool computePeriodicBoundaryValues(const hoNDArray<float>& x, const hoNDArray< std::complex<float> >& y, float start, float end, hoNDArray<float>& vx, hoNDArray< std::complex<float> >& vy);
+    template EXPORTGTPLUS bool computePeriodicBoundaryValues(const hoNDArray<float>& x, const hoNDArray< std::complex<double> >& y, float start, float end, hoNDArray<float>& vx, hoNDArray< std::complex<double> >& vy);
+
+    template EXPORTGTPLUS bool computePeriodicBoundaryValues(const hoNDArray<double>& x, const hoNDArray<double>& y, double start, double end, hoNDArray<double>& vx, hoNDArray<double>& vy);
+    template EXPORTGTPLUS bool computePeriodicBoundaryValues(const hoNDArray<double>& x, const hoNDArray<float>& y, double start, double end, hoNDArray<double>& vx, hoNDArray<float>& vy);
+    template EXPORTGTPLUS bool computePeriodicBoundaryValues(const hoNDArray<double>& x, const hoNDArray< std::complex<float> >& y, double start, double end, hoNDArray<double>& vx, hoNDArray< std::complex<float> >& vy);
+    template EXPORTGTPLUS bool computePeriodicBoundaryValues(const hoNDArray<double>& x, const hoNDArray< std::complex<double> >& y, double start, double end, hoNDArray<double>& vx, hoNDArray< std::complex<double> >& vy);
+}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconUtil.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconUtil.h
new file mode 100644
index 0000000..0b48ec0
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconUtil.h
@@ -0,0 +1,724 @@
+/** \file   gtPlusISMRMRDReconUtil.h
+    \brief  Define the symbols and implement common functionalities for GtPlus toolbox
+
+            The ISMRMRD format is fully supported in this toolbox.
+
+            Other functionalities implemented here include:
+            Karhunen-Loève Transform (KLT) or Principal Component Analysis (PCA)
+            KSpace filter
+            Several MR sensitivity map estimation methods
+
+            References:
+
+            http://en.wikipedia.org/wiki/Karhunen%E2%80%93Lo%C3%A8ve_theorem
+
+            ISMRMRD_SOUHEIL coil map estimation is based on:
+
+                Inati SJ, Hansen MS, Kellman P. 
+                A solution to the phase problem in adaptive coil combination. 
+                In: ISMRM Proceedings; April; Salt Lake City, Utah, USA; 2013. 2672.
+
+                Kellman P, McVeigh ER. 
+                Image reconstruction in SNR units: A general method for SNR measurement. 
+                Magnetic Resonance in Medicine 2005;54(6):1439-1447.
+
+            ISMRMRD_SOUHEIL_ITER coil map estimation is based on:
+
+                Inati SJ, Hansen MS, Kellman P. Unpublished algorithm.
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "GtPlusExport.h"
+
+#include "ismrmrd/ismrmrd.h"
+#include "ismrmrd/meta.h"
+
+#include "boost/tuple/tuple.hpp"
+#include "boost/tuple/tuple_comparison.hpp"
+#include "boost/tuple/tuple_io.hpp"
+
+#include "ho2DArray.h"
+#include "ho3DArray.h"
+#include "ho4DArray.h"
+#include "ho5DArray.h"
+#include "ho6DArray.h"
+#include "ho7DArray.h"
+#include "hoMatrix.h"
+#include "hoNDArray_linalg.h"
+#include "hoNDFFT.h"
+#include "hoNDArray_utils.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDImage_util.h"
+#include "hoNDArray_reductions.h"
+#include "hoNDArray_linalg.h"
+#include "gtPlusIOAnalyze.h"
+#include "GadgetronTimer.h"
+#include "log.h"
+
+#ifdef USE_OMP
+    #include <omp.h>
+#endif // USE_OMP
+
+#include "mri_core_def.h"
+#include "mri_core_utility.h"
+#include "mri_core_coil_map_estimation.h"
+
+namespace Gadgetron {
+
+    // define the dimensions of ISMRMRD
+    enum ISMRMRDDIM
+    {
+        DIM_ReadOut = 32,
+        DIM_Encoding1,
+        DIM_Channel,
+        DIM_Slice,
+        DIM_Encoding2,
+        DIM_Contrast,
+        DIM_Phase,
+        DIM_Repetition,
+        DIM_Set,
+        DIM_Segment,
+        DIM_Average,
+        DIM_other1,
+        DIM_other2,
+        DIM_other3,
+        DIM_NONE
+    };
+
+    // define the reconstruction algorithms
+    enum ISMRMRDALGO
+    {
+        ISMRMRD_GRAPPA = 64,
+        ISMRMRD_SENSE,
+        ISMRMRD_SPIRIT,
+        ISMRMRD_L1SPIRIT,
+        ISMRMRD_SOFTSENSE,
+        ISMRMRD_L1SOFTSENSE,
+        ISMRMRD_2DTBINNING,
+        ISMRMRD_2DTBINNING_FLOW,
+        ISMRMRD_L1SPIRIT_SLEP,
+        ISMRMRD_L1SPIRIT_SLEP_MOTION_COMP,
+        ISMRMRD_NONE
+    };
+
+    // define the coil sensitivity map estimation algorithms
+    enum ISMRMRDCOILMAPALGO
+    {
+        ISMRMRD_SOUHEIL = 96,
+        ISMRMRD_SOUHEIL_ITER
+    };
+
+    // define the partial fourier/asymmetric echo handling algorithms
+    enum ISMRMRDPFALGO
+    {
+        ISMRMRD_PF_HOMODYNE = 128,          // iterative homodyne
+        ISMRMRD_PF_POCS,                    // POCS
+        ISMRMRD_PF_FENGHUANG,               // convolution based method
+        ISMRMRD_PF_ZEROFILLING_FILTER,      // zero-filling with partial fourier filter
+        ISMRMRD_PF_ZEROFILLING,             // zero-filling without partial fourier filter
+        ISMRMRD_PF_NONE
+    };
+
+    // define the kspace filter type
+    enum ISMRMRDKSPACEFILTER
+    {
+        ISMRMRD_FILTER_GAUSSIAN = 160,
+        ISMRMRD_FILTER_HANNING,
+        ISMRMRD_FILTER_TUKEY,
+        ISMRMRD_FILTER_TAPERED_HANNING,
+        ISMRMRD_FILTER_NONE
+    };
+
+    // define the calibration mode of ISMRMRD
+    enum ISMRMRDCALIBMODE
+    {
+        ISMRMRD_embedded = 256,
+        ISMRMRD_interleaved,
+        ISMRMRD_separate,
+        ISMRMRD_external,
+        ISMRMRD_other,
+        ISMRMRD_noacceleration
+    };
+
+    // define the interpolation method
+    enum ISMRMRDINTERP
+    {
+        ISMRMRD_INTERP_LINEAR = 512,
+        ISMRMRD_INTERP_SPLINE,
+        ISMRMRD_INTERP_BSPLINE
+    };
+
+    // define the interpolation method for retro-gating
+    enum ISMRMRDINTERPRETROGATING
+    {
+        ISMRMRD_INTERP_RETRO_GATING_LINEAR = 600,
+        ISMRMRD_INTERP_RETRO_GATING_CUBIC, 
+        ISMRMRD_INTERP_RETRO_GATING_BSPLINE
+    };
+
+    /// data flow tag
+    /// if this flag is set to 1 for an image, the image is immediately passed to the next gadget
+    /// if this flag is 0, the image is stored by the accumulator
+    /// whether to pass a stored image to the next gadget is determined by the processing gadget itself
+    #define GADGETRON_PASS_IMMEDIATE                       "GT_PASSIMAGE_IMMEDIATE"
+
+    /// ISMRMRD Image fields
+    #define ISMRMRD_IMAGE_version                       "ISMRMRD_IMAGE_version"
+    #define ISMRMRD_IMAGE_flags                         "ISMRMRD_IMAGE_flags"
+    #define ISMRMRD_IMAGE_measurement_uid               "ISMRMRD_IMAGE_measurement_uid"
+    #define ISMRMRD_IMAGE_matrix_size                   "ISMRMRD_IMAGE_matrix_size"
+    #define ISMRMRD_IMAGE_field_of_view                 "ISMRMRD_IMAGE_field_of_view"
+    #define ISMRMRD_IMAGE_channels                      "ISMRMRD_IMAGE_channels"
+    #define ISMRMRD_IMAGE_position                      "ISMRMRD_IMAGE_position"
+    #define ISMRMRD_IMAGE_read_dir                      "ISMRMRD_IMAGE_read_dir"
+    #define ISMRMRD_IMAGE_phase_dir                     "ISMRMRD_IMAGE_phase_dir"
+    #define ISMRMRD_IMAGE_slice_dir                     "ISMRMRD_IMAGE_slice_dir"
+    #define ISMRMRD_IMAGE_patient_table_position        "ISMRMRD_IMAGE_patient_table_position"
+    #define ISMRMRD_IMAGE_average                       "ISMRMRD_IMAGE_average"
+    #define ISMRMRD_IMAGE_slice                         "ISMRMRD_IMAGE_slice"
+    #define ISMRMRD_IMAGE_contrast                      "ISMRMRD_IMAGE_contrast"
+    #define ISMRMRD_IMAGE_phase                         "ISMRMRD_IMAGE_phase"
+    #define ISMRMRD_IMAGE_repetition                    "ISMRMRD_IMAGE_repetition"
+    #define ISMRMRD_IMAGE_set                           "ISMRMRD_IMAGE_set"
+    #define ISMRMRD_IMAGE_acquisition_time_stamp        "ISMRMRD_IMAGE_acquisition_time_stamp"
+    #define ISMRMRD_IMAGE_physiology_time_stamp         "ISMRMRD_IMAGE_physiology_time_stamp"
+    #define ISMRMRD_IMAGE_image_data_type               "ISMRMRD_IMAGE_image_data_type"
+    #define ISMRMRD_IMAGE_image_type                    "ISMRMRD_IMAGE_image_type"
+    #define ISMRMRD_IMAGE_image_index                   "ISMRMRD_IMAGE_image_index"
+    #define ISMRMRD_IMAGE_image_series_index            "ISMRMRD_IMAGE_image_series_index"
+    #define ISMRMRD_IMAGE_user_int                      "ISMRMRD_IMAGE_user_int"
+    #define ISMRMRD_IMAGE_user_float                    "ISMRMRD_IMAGE_user_float"
+
+    /// dimension string
+    #define GADGETRON_RO                                    "RO"
+    #define GADGETRON_E1                                    "E1"
+    #define GADGETRON_CHA                                   "CHA"
+    #define GADGETRON_SLC                                   "SLC"
+    #define GADGETRON_E2                                    "E2"
+    #define GADGETRON_CONTRAST                              "CON"
+    #define GADGETRON_PHASE                                 "PHS"
+    #define GADGETRON_REP                                   "REP"
+    #define GADGETRON_SET                                   "SET"
+    #define GADGETRON_SEGMENT                               "SEG"
+    #define GADGETRON_AVERAGE                               "AVE"
+    #define GADGETRON_OTHER1                                "OTH1"
+    #define GADGETRON_OTHER2                                "OTH2"
+    #define GADGETRON_OTHER3                                "OTH3"
+    #define GADGETRON_NONE                                  "NONE"
+}
+
+namespace Gadgetron {
+
+    /**
+    * @brief copy the sub-array of x to r
+             the sub-array is defined by its starting index and array size
+    */
+    template<typename T> EXPORTGTPLUS bool cropUpTo11DArray(const hoNDArray<T>& x, hoNDArray<T>& r, const std::vector<size_t>& startND, std::vector<size_t>& size);
+
+    /**
+    * @brief set the sub-array of r from x
+             the sub-array is defined by its starting index and array size
+    */
+    template<typename T> EXPORTGTPLUS bool setSubArrayUpTo11DArray(const hoNDArray<T>& x, hoNDArray<T>& r, const std::vector<size_t>& startND, std::vector<size_t>& size);
+
+    /**
+    * @brief extract sampled lines from an NDArray
+             timeStamp indicates sampled lines; -1 for unsampled lines
+             x : [Ro E1 Cha Slice E2 Con Phase Rep Set Seg AVE]
+             timeStamp: [1 E1 1 Slice E2 Con Phase Rep Set Seg AVE]
+    */
+    template<typename T> EXPORTGTPLUS bool extractSampledLinesUpTo11DArray(const hoNDArray<T>& x, hoNDArray<T>& r, const hoNDArray<float>& timeStamp, double acceFactorE1, double acceFactorE2);
+
+    /**
+    * @brief fill sampled lines to an NDArray
+             timeStamp indicates sampled lines; -1 for unsampled lines
+    */
+    template<typename T> EXPORTGTPLUS bool fillSampledLinesUpTo11DArray(const hoNDArray<T>& x, hoNDArray<T>& r, const hoNDArray<float>& timeStamp);
+
+    /**
+    * @brief copy the sub-array of x to r only along the 3rd dimensions
+             e.g. x is [RO E1 D3 ...], r will be [RO E1 end-start+1 ... ]
+    */
+    template<typename T> EXPORTGTPLUS bool cropOver3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& r, size_t start, size_t end);
+
+    /**
+    * @brief set the sub-array of r from x only along the 3rd dimensions
+             e.g. r(:, :, start:end, :, ...) will be replaced by x
+    */
+    template<typename T> EXPORTGTPLUS bool setSubArrayOver3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& r, size_t start, size_t end);
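+
+    // Hypothetical usage sketch (added commentary, not part of the upstream code):
+    // given a complex-float array x of size [RO E1 D3 N], the two functions above
+    // would crop frames 2..5 of the 3rd dimension and later write them back:
+    //
+    //     hoNDArray< std::complex<float> > x(128, 128, 10, 4), sub;
+    //     cropOver3rdDimension(x, sub, 2, 5);            // sub becomes [128 128 4 4]
+    //     setSubArrayOver3rdDimension(sub, x, 2, 5);     // copy sub back into x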
+
+    /**
+    * @brief compute the standard deviation along the 3rd dimension; if NMinusOne == true, the variance is normalized by N-1, otherwise by N
+    */
+    template<typename T> EXPORTGTPLUS bool stdOver3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& std, bool NMinusOne);
+
+    /**
+    * @brief Image domain unwrapping for 2D
+             x : [RO E1 srcCHA], ker [RO E1 srcCHA dstCHA]
+             buf is a buffer for computation and needs to be pre-allocated as [RO E1 srcCHA]; y : [RO E1 dstCHA]
+             for the sake of speed, no check is made in this function
+    */
+    template<typename T> EXPORTGTPLUS bool imageDomainUnwrapping2D(const hoNDArray<T>& x, const hoNDArray<T>& ker, hoNDArray<T>& buf, hoNDArray<T>& y);
+
+    /**
+    * @brief compute periodic boundary values for an array
+             x : [N 1] the data point locations, y : [N M] data point values at x
+             vx, vy : the data point locations and values extended with the computed periodic boundary values; vy is [N+2 M]
+    */
+    template<typename CoordType, typename T> EXPORTGTPLUS bool computePeriodicBoundaryValues(const hoNDArray<CoordType>& x, const hoNDArray<T>& y, CoordType start, CoordType end, hoNDArray<CoordType>& vx, hoNDArray<T>& vy);
+}
+
+namespace Gadgetron { namespace gtPlus {
+
+// ================================================================================================== //
+
+template <typename T> 
+class gtPlusISMRMRDReconUtil
+{
+public:
+
+    typedef typename realType<T>::Type value_type;
+
+    gtPlusISMRMRDReconUtil();
+    virtual ~gtPlusISMRMRDReconUtil();
+
+    void printInfo(std::ostream& os);
+
+    typedef std::pair<ISMRMRDDIM, size_t> DimensionRecordType;
+
+    // ------------------------------------------------------------------------
+    // coil compression and KarhunenLoeverTransform
+    // ------------------------------------------------------------------------
+    // data: M rows and N cols matrix
+    // the KLT direction is along N (the columns)
+    // eigenVectors: N*N eigen vector matrix, every column is an eigen vector
+    // eigenValues: N*1 eigen values, ascending order (largest eigen value last)
+    bool KLT_eigenAnalysis(const hoMatrix<T>& data, hoMatrix<T>& eigenVectors, hoMatrix<T>& eigenValues);
+
+    // apply the eigen transform
+    // data: M*N data matrix
+    // eigenVectors: N*K eigen vector matrix, every column is an eigen vector
+    // dataEigen: M*K eigen data matrix
+    bool KLT_applyEigen(const hoMatrix<T>& data, hoMatrix<T>& dataEigen, const hoMatrix<T>& eigenVectors);
+    bool KLT_applyEigen(const hoNDArray<T>& data, hoNDArray<T>& dataEigen, const hoMatrix<T>& eigenVectors);
+
+    // number of kept eigen modes
+    // all modes with eigen values greater than thres*max(eigenValues) are kept
+    bool KLT_numberOfKeptModes(const hoMatrix<T>& eigenValues, double thres, long long& numOfModesKept);
+
+    // prune the eigen vector matrices to keep the last numOfModesKept columns
+    bool pruneEigenVectorMatrix(const hoMatrix<T>& eigenVectors, long long numOfModesKept, hoMatrix<T>& eigenVectorsPruned);
+
+    // KLT based coil compression
+    // data: at least 3D [RO E1 CHA ...]
+    // the KL transform is applied along CHA
+    // coeff: CHA*numOfModesKept eigen vector matrix
+    // eigenValues: CHA*1 eigen values
+    // if thres < 0 or numOfModesKept == -1, all modes are kept
+    // if isChaLastDim==true, the CHA is the last dimension
+    bool computeKLCoilCompressionCoeff(const hoNDArray<T>& data, double thres, hoMatrix<T>& coeff, hoMatrix<T>& eigenValues, bool isChaLastDim=false);
+    bool computeKLCoilCompressionCoeff(const hoNDArray<T>& data, int numOfModesKept, hoMatrix<T>& coeff, hoMatrix<T>& eigenValues, bool isChaLastDim=false);
+    // coeff: CHA*CHA eigen vector matrix
+    bool computeKLTCoeff(const hoNDArray<T>& data, hoMatrix<T>& coeff, hoMatrix<T>& eigenValues, bool isChaLastDim=false);
+
+    // dataEigen: [RO E1 numOfModesKept ...] 
+    bool computeKLCoilCompression(const hoNDArray<T>& data, double thres, hoMatrix<T>& coeff, hoMatrix<T>& eigenValues, hoNDArray<T>& dataEigen, bool isChaLastDim=false);
+    bool computeKLCoilCompression(const hoNDArray<T>& data, int numOfModesKept, hoMatrix<T>& coeff, hoMatrix<T>& eigenValues, hoNDArray<T>& dataEigen, bool isChaLastDim=false);
+
+    // apply coil compression coefficients
+    bool appyKLCoilCompressionCoeff(const hoNDArray<T>& data, const hoMatrix<T>& coeff, hoNDArray<T>& dataEigen, bool isChaLastDim=false);
+
+    // apply coil compression coefficients on array [RO E1 srcCHA ...]
+    // dataEigen: [RO E1 dstCHA ...]
+    // coeff: one [srcCHA dstCHA] matrix for every entry of the last dimension
+    // every entry of the last dimension can have different compression coefficients
+    bool applyKLCoilCompressionCoeff(const hoNDArray<T>& data, const std::vector<hoMatrix<T> >& coeff, hoNDArray<T>& dataEigen, bool isChaLastDim=false);
+
+    // compute KL transform and perform filtering
+    // the KL dimension is the last dimension
+    bool computeKLFilter(const hoNDArray<T>& data, size_t numOfModesKept, hoNDArray<T>& dataKLF);
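+
+    // Hypothetical usage sketch (added commentary, not part of the upstream header):
+    // given complex-float kspace data of size [RO E1 CHA N], compress the channels,
+    // keeping every eigen mode whose eigen value exceeds 1e-3 of the largest one:
+    //
+    //     gtPlusISMRMRDReconUtil< std::complex<float> > util;
+    //     hoMatrix< std::complex<float> > coeff, eigenValues;
+    //     hoNDArray< std::complex<float> > dataEigen;
+    //     util.computeKLCoilCompression(data, 1e-3, coeff, eigenValues, dataEigen);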
+
+    // ------------------------------------------------------------------------
+    // kspace filter
+    // ------------------------------------------------------------------------
+    bool compute2DFilterFromTwo1D(const hoNDArray<T>& fx, const hoNDArray<T>& fy, hoNDArray<T>& fxy);
+    bool compute2DFilterFromTwo1D(const hoNDArray<float>& fx, const hoNDArray<float>& fy, hoNDArray< std::complex<float> >& fxy);
+    bool compute2DFilterFromTwo1D(const hoNDArray<double>& fx, const hoNDArray<double>& fy, hoNDArray< std::complex<double> >& fxy);
+
+    bool compute3DFilterFromThree1D(const hoNDArray<T>& fx, const hoNDArray<T>& fy, const hoNDArray<T>& fz, hoNDArray<T>& fxyz);
+    bool compute3DFilterFromThree1D(const hoNDArray<float>& fx, const hoNDArray<float>& fy, const hoNDArray<float>& fz, hoNDArray< std::complex<float> >& fxyz);
+    bool compute3DFilterFromThree1D(const hoNDArray<double>& fx, const hoNDArray<double>& fy, const hoNDArray<double>& fz, hoNDArray< std::complex<double> >& fxyz);
+
+    // data: in kspace, [RO E1 E2 CHA SLC CON PHS REP SET]
+    bool kspacefilterRO(hoNDArray<T>& data, const hoNDArray<T>& fRO);
+    bool kspacefilterRO(const hoNDArray<T>& data, const hoNDArray<T>& fRO, hoNDArray<T>& dataFiltered);
+    bool kspacefilterROE1(const hoNDArray<T>& data, const hoNDArray<T>& fROE1, hoNDArray<T>& dataFiltered);
+    bool kspacefilterROE1(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE1, hoNDArray<T>& dataFiltered);
+    bool kspacefilterE1(const hoNDArray<T>& data, const hoNDArray<T>& fE1, hoNDArray<T>& dataFiltered);
+
+    // kspace filter for ISMRMRD dimension order
+    bool kspacefilterE2(const hoNDArray<T>& data, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered);
+    bool kspacefilterROE2(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered);
+    bool kspacefilterE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered);
+    bool kspacefilterROE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fROE1E2, hoNDArray<T>& dataFiltered);
+    bool kspacefilterROE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered);
+
+    // kspace filter for the array whose first three dimensions are RO, E1 and E2; 
+    bool kspace3DfilterE2(const hoNDArray<T>& data, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered);
+    bool kspace3DfilterROE2(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered);
+    bool kspace3DfilterE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered);
+    bool kspace3DfilterROE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fROE1E2, hoNDArray<T>& dataFiltered);
+    bool kspace3DfilterROE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered);
+
+    // ------------------------------------------------------------------------
+    // generate kspace filters
+    // ------------------------------------------------------------------------
+    // symmetric filter, used for image filtering
+    // sigma: for the Gaussian filter, in units of pixels
+    // width: for the Tukey filter etc., the length of the transition band
+    bool generateSymmetricFilter(size_t len, size_t start, size_t end, hoNDArray<T>& filter, ISMRMRDKSPACEFILTER filterType, double sigma, size_t width);
+
+    // asymmetric filter, used for partial fourier/asymmetric echo filtering
+    // start, end: the data range
+    // a tapered Hanning filter is implemented for this
+    // if filterType==ISMRMRD_FILTER_NONE and densityComp==true, the 0-1-2 filter will be generated
+    // if filterType==ISMRMRD_FILTER_TAPERED_HANNING and densityComp==true, the density compensation version of the tapered filter will be generated,
+    // where the unacquired region has filter value 0, the symmetric region 1 and the non-symmetric region 2
+    // if densityComp==false, the one-sided tapered filter will be generated
+    bool generateAsymmetricFilter(size_t len, size_t start, size_t end, hoNDArray<T>& filter, ISMRMRDKSPACEFILTER filterType, size_t width, bool densityComp=false);
+
+    // generate ref data filter
+    bool generateSymmetricFilterForRef(size_t len, size_t start, size_t end, hoNDArray<T>& filter, ISMRMRDKSPACEFILTER filterType, double sigma, size_t width);
+
+    // find the symmetric sampled region
+    bool findSymmetricSampledRegion(size_t start, size_t end, size_t center, size_t& startSym, size_t& endSym);
+
+    // compute the filter SNR unit scale factor
+    bool computeFilterSNRUnitScaleFactor(const hoNDArray<T>& filter, T& scalFactor);
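+
+    // Hypothetical usage sketch (added commentary, not part of the upstream header);
+    // the filter length, range, sigma and width below are made-up values, and
+    // ISMRMRD_FILTER_TAPERED_HANNING is the filter type referenced above:
+    //
+    //     hoNDArray< std::complex<float> > fRO, filtered;
+    //     util.generateSymmetricFilter(256, 0, 255, fRO, ISMRMRD_FILTER_TAPERED_HANNING, 1.5, 16);
+    //     util.kspacefilterRO(kspace, fRO, filtered);
+    //     std::complex<float> scaleFactor;
+    //     util.computeFilterSNRUnitScaleFactor(fRO, scaleFactor);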
+
+    // ------------------------------------------------------------------------
+    // detect sampled region
+    // ------------------------------------------------------------------------
+    // data : [RO E1 SLC E2 CON PHS REP SET] array
+    bool detectSampledRegion2D(const hoNDArray<T>& data, size_t& startRO, size_t& endRO, size_t& startE1, size_t& endE1);
+    bool detectSampledRegion3D(const hoNDArray<T>& data, size_t& startRO, size_t& endRO, size_t& startE1, size_t& endE1, size_t& startE2, size_t& endE2);
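+
+    // Hypothetical usage sketch (added commentary, not part of the upstream header):
+    // for a 2D acquisition stored as [RO E1 SLC E2 CON PHS REP SET], the sampled
+    // rectangle can be detected as
+    //
+    //     size_t startRO, endRO, startE1, endE1;
+    //     util.detectSampledRegion2D(kspace, startRO, endRO, startE1, endE1);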
+
+    // ------------------------------------------------------------------------
+    // coil sensitivity
+    // ------------------------------------------------------------------------
+    // average kspace along the 4th dimension
+    // data: [RO E1 CHA N S ...]
+    // ave: [RO E1 CHA 1 S ... ]
+    // simple average
+    bool averageKSpace4D(const hoNDArray<T>& data, hoNDArray<T>& ave);
+    // the sampled times are taken into account when averaging along the E1 dimension
+    // sampledTimes: [E1 1], recording the number of times each E1 line was sampled
+    bool averageKSpace4D(const hoNDArray<T>& data, hoNDArray<T>& ave, std::vector<size_t>& sampledTimes);
+
+    // average kspace along the 5th dimension
+    // data: [RO E1 E2 CHA N ...]
+    // ave: [RO E1 E2 CHA 1 ... ]
+    // simple average
+    bool averageKSpace5D(const hoNDArray<T>& data, hoNDArray<T>& ave);
+    // the sampled times are taken into account when averaging along the E1 and E2 dimensions
+    // sampledTimes: [E1 E2], recording the number of times each (e1, e2) line was sampled
+    bool averageKSpace5D(const hoNDArray<T>& data, hoNDArray<T>& ave, hoNDArray<size_t>& sampledTimes);
+
+    // sampled region along E1
+    // data: [RO E1 CHA N]
+    bool detectSampledRegionE1(const hoNDArray<T>& data, size_t& startE1, size_t& endE1);
+
+    // sampled times along E1, if not sampled, sampledTimes[e1] == 0
+    bool detectSampledTimesE1(const hoNDArray<T>& data4D, std::vector<size_t>& sampledTimes);
+
+    // sampled region along E1 and E2
+    // data: [RO E1 E2 CHA N]
+    bool detectSampledRegionE1E2(const hoNDArray<T>& data, size_t& startE1, size_t& endE1, size_t& startE2, size_t& endE2);
+
+    // sampled times along E1 and E2, if not sampled, sampledTimes(e1, e2) == 0
+    // data5D: [RO E1 E2 CHA N]
+    bool detectSampledTimesE1E2(const hoNDArray<T>& data5D, hoNDArray<size_t>& sampledTimes);
+
+    // copy along E1
+    bool copyAlongE1(const hoNDArray<T>& src, hoNDArray<T>& dst, size_t startE1, size_t endE1);
+
+    // copy along RO and E1
+    bool copyAlongROE1(const hoNDArray<T>& src, hoNDArray<T>& dst, size_t startRO, size_t endRO, size_t startE1, size_t endE1);
+
+    // copy along RO, E1 and E2
+    bool copyAlongROE1E2(const hoNDArray<T>& src, hoNDArray<T>& dst, size_t startRO, size_t endRO, size_t startE1, size_t endE1, size_t startE2, size_t endE2);
+
+    // copy along RO and E1, using a transition band to ensure a smooth transition in the dst kspace
+    // the transition band is achieved via a tapered Hanning filter
+    bool copyAlongROE1TransitionBand(const hoNDArray<T>& src, hoNDArray<T>& dst, size_t startRO, size_t endRO, 
+            size_t startE1, size_t endE1, size_t transBandRO, size_t transBandE1);
+
+    // copy along RO, E1 and E2, using a transition band to ensure a smooth transition in the dst kspace
+    // the transition band is achieved via a tapered Hanning filter
+    // src, dst: [RO E1 E2 ...]
+    bool copyAlongROE1E2TransitionBand(const hoNDArray<T>& src, hoNDArray<T>& dst, 
+                                    size_t startRO, size_t endRO, 
+                                    size_t startE1, size_t endE1, 
+                                    size_t startE2, size_t endE2, 
+                                    size_t transBandRO, size_t transBandE1, 
+                                    size_t transBandE2);
+
+    // ------------------------------------------------------------------------
+    // ISMRMRDDIM related functions
+    // ------------------------------------------------------------------------
+    // get the dimension name
+    std::string getISMRMRDDimName(const ISMRMRDDIM& dim);
+    ISMRMRDDIM getISMRMRDDimFromName(const std::string& name);
+
+    // get the dimension order index in the ISMRMRD format
+    // this function is for the kspace
+    //  [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+    //   0  1  2   3     4  5   6     7   8   9
+    bool getISMRMRDDimIndex(const ISMRMRDDIM& dim, long long& ind);
+
+    // find the dimension indexes
+    bool findDimIndex(const std::vector<DimensionRecordType>& dimStartingIndexes, ISMRMRDDIM dim, size_t ind);
+
+    // get recon algorithm from name
+    ISMRMRDALGO getISMRMRDReconAlgoFromName(const std::string& name);
+
+    // get coil map algorithm from name
+    ISMRMRDCOILMAPALGO getISMRMRDCoilMapAlgoFromName(const std::string& name);
+
+    // get the partial fourier/asymmetric echo handling algorithm from name
+    ISMRMRDPFALGO getISMRMRDPartialFourierReconAlgoFromName(const std::string& name);
+
+    // get the partial fourier/asymmetric echo handling algorithm name from algorithm
+    std::string getNameFromISMRMRDPartialFourierReconAlgo(ISMRMRDPFALGO algo);
+
+    // get the kspace filter algorithm from name
+    ISMRMRDKSPACEFILTER getISMRMRDKSpaceFilterFromName(const std::string& name);
+
+    // get retro-gating interpolation method from name
+    ISMRMRDINTERPRETROGATING getISMRMRDRetroGatingInterpFromName(const std::string& name);
+
+    // extract sub array for a dimension
+    // if lessEqual == true, [0:value] is extracted for dim
+    bool extractSubArrayForDim(const hoNDArray<T>& x, hoNDArray<T>& r, ISMRMRDDIM& dim, size_t value, bool lessEqual);
+    // if lessEqual == true, [0:value1] and [0:value2] are extracted for dim1 and dim2
+    bool extractSubArrayForDim(const hoNDArray<T>& x, hoNDArray<T>& r, ISMRMRDDIM& dim1, size_t value1, ISMRMRDDIM& dim2, size_t value2, bool lessEqual);
+
+    // extract sub array for two dimensions
+    // [0:value1] and [value2]
+    bool extractSubArrayForDim1LessEqualDim2Equal(const hoNDArray<T>& x, hoNDArray<T>& r, ISMRMRDDIM& dim1, size_t value1, ISMRMRDDIM& dim2, size_t value2);
+
+    // extract sub array limited by the max encoding counters
+    bool extractSubArrayForMaxEncodingCounters(const hoNDArray<T>& x, hoNDArray<T>& r, const ISMRMRD::EncodingCounters& maxIdx);
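+
+    // Hypothetical usage sketch (added commentary, not part of the upstream header);
+    // it is assumed here that the dimension name strings match the GADGETRON_*
+    // macros defined earlier in this file:
+    //
+    //     ISMRMRDDIM dim = util.getISMRMRDDimFromName(GADGETRON_E1);
+    //     long long ind;
+    //     util.getISMRMRDDimIndex(dim, ind);                       // expected to give 1 for E1
+    //     hoNDArray< std::complex<float> > r;
+    //     util.extractSubArrayForDim(kspace, r, dim, 32, true);    // keep E1 indexes [0:32]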
+
+    // ------------------------------------------------------------------------
+    // ISMRMRD acquisition header
+    // ------------------------------------------------------------------------
+    void clearAcquisitionHeaderISMRMRD(ISMRMRD::AcquisitionHeader& acqHeader);
+
+    // compare whether two acquisition headers have identical image geometry
+    bool hasIdenticalGeometryISMRMRD(const ISMRMRD::AcquisitionHeader& acqHeader1, const ISMRMRD::AcquisitionHeader& acqHeader2);
+
+    // add zeros pre/post data array
+    // 1 : pre zeros
+    // 2 : post zeros
+    // 0 : no zeros
+    long long addPrePostZeros(size_t centre_column, size_t samples);
+
+    // find RO ranges from centre_column and number of samples
+    void findStartEndRO(size_t centre_column, size_t samples, long long& startRO, long long& endRO);
+
+    // find RO ranges from centre_column and number of samples after zero-filling
+    void findStartEndROAfterZeroFilling(size_t centre_column, size_t samples_zerofilled, int& startRO, int& endRO);
+
+    // ------------------------------------------------------------------------
+    // ISMRMRD image header
+    // ------------------------------------------------------------------------
+    // set the meta attributes from the ISMRMRD image header
+    bool setMetaAttributesFromImageHeaderISMRMRD(const ISMRMRD::ImageHeader& imgHeader, ISMRMRD::MetaContainer& attrib);
+
+    // set the ISMRMRD image header from the meta attributes
+    bool setImageHeaderISMRMRDFromMetaAttributes(const ISMRMRD::MetaContainer& attrib, ISMRMRD::ImageHeader& imgHeader);
+
+    // ------------------------------------------------------------------------
+    // utility functions for various things
+    // ------------------------------------------------------------------------
+    // jobSchedule : for every valid device, it records the jobs allocated to it
+    // what is stored are the valid device ids and the job packages allocated to them
+    // one valid device can be given multiple job packages
+
+    // load two hoNDArray and compute differences
+    void compareAgainstGroundTruthArray(const std::string& gt_filename, const hoNDArray<T>& x, typename realType<T>::Type& normDiff, typename realType<T>::Type& maxNormDiff);
+    void compareAgainstGroundTruthArray(const hoNDArray<T>& gt, const hoNDArray<T>& x, typename realType<T>::Type& normDiff, typename realType<T>::Type& maxNormDiff);
+
+    template <typename T2, unsigned int D> void compareAgainstGroundTruthImage(const std::string& gt_filename, const hoNDImage<T2, D>& x, typename realType<T2>::Type& normDiff, typename realType<T2>::Type& maxNormDiff)
+    {
+        hoNDImage<T2, D> gt;
+
+        gtPlusIOAnalyze gt_io;
+        gt_io.importImage(gt, gt_filename);
+
+        compareAgainstGroundTruthImage(gt, x, normDiff, maxNormDiff);
+    }
+
+    template <typename T2, unsigned int D> void compareAgainstGroundTruthImage(const hoNDImage<T2, D>& gt, const hoNDImage<T2, D>& x, typename realType<T2>::Type& normDiff, typename realType<T2>::Type& maxNormDiff)
+    {
+        hoNDImage<T2, D> diff(x);
+        Gadgetron::subtract(gt, x, diff);
+
+        hoNDImage<T2, D> gtEps(gt);
+        Gadgetron::addEpsilon(gtEps);
+
+        Gadgetron::norm2(diff, normDiff);
+
+        Gadgetron::divide(diff, gtEps, diff);
+
+        T2 maxV;
+        size_t ind;
+        Gadgetron::maxAbsolute(diff, maxV, ind);
+        maxNormDiff = std::abs(maxV);
+    }
+
+    void getCurrentMoment(std::string& procTime)
+    {
+        char timestamp[100];
+        time_t mytime;
+        struct tm *mytm;
+        mytime=time(NULL);
+        mytm=localtime(&mytime);
+        strftime(timestamp, sizeof(timestamp),"%a, %b %d %Y, %H:%M:%S",mytm);
+        procTime = timestamp;
+    }
+
+    void getCurrentMomentForFileName(std::string& procTime)
+    {
+        char timestamp[100];
+        time_t mytime;
+        struct tm *mytm;
+        mytime=time(NULL);
+        mytm=localtime(&mytime);
+        strftime(timestamp, sizeof(timestamp),"%a_%b_%d_%Y_%H_%M_%S",mytm);
+        procTime = timestamp;
+    }
+};
+
+// utility functions only meaningful for complex data type
+template <typename T> 
+class gtPlusISMRMRDReconUtilComplex : public gtPlusISMRMRDReconUtil<T>
+{
+public:
+
+    typedef typename realType<T>::Type value_type;
+
+    gtPlusISMRMRDReconUtilComplex();
+    virtual ~gtPlusISMRMRDReconUtilComplex();
+
+    void printInfo(std::ostream& os);
+
+    // ------------------------------------------------------------------------
+    // noise prewhitening
+    // ------------------------------------------------------------------------
+    // compute the noise prewhitening matrix
+    // noise: the noise scan [RO E1 CHA]
+    // noiseBandWidth: the noise bandwidth, Hz/pixel
+    // receiverBWRatio: system receiver noise equivalent bandwidth ratio
+    // ADCSamplingTimeinSecond: ADC sampling time in seconds
+    // prewhiteningMatrix: the computed noise prewhitening matrix [CHA CHA]
+    bool computeNoisePrewhiteningMatrix(const hoNDArray<T>& noise, double noiseBandWidth, double receiverBWRatio, double ADCSamplingTimeinSecond, hoMatrix<T>& prewhiteningMatrix);
+
+    // apply the noise prewhitening matrix to the image/ref data
+    // result = prewhiteningMatrix * data
+    // data should have at least three dimensions [RO E1 CHA], up to 10D
+    bool performNoisePrewhitening(hoNDArray<T>& data, const hoMatrix<T>& prewhiteningMatrix);
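+
+    // Hypothetical usage sketch (added commentary, not part of the upstream header);
+    // the bandwidth, receiver BW ratio and ADC dwell time below are made-up numbers:
+    //
+    //     gtPlusISMRMRDReconUtilComplex< std::complex<float> > utilCplx;
+    //     hoMatrix< std::complex<float> > prewhiteningMatrix;
+    //     utilCplx.computeNoisePrewhiteningMatrix(noise, 130.0, 0.79, 5.0e-6, prewhiteningMatrix);
+    //     utilCplx.performNoisePrewhitening(kspace, prewhiteningMatrix);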
+
+    // ------------------------------------------------------------------------
+    // zero-padding resize
+    // ------------------------------------------------------------------------
+    // zero padding resize for kspace and complex image
+    // the data is first FFT'd to kspace, then zero padded, then IFFT'd back to the image domain
+    // the scaling is handled to preserve the noise variance
+    // data: the 1st and 2nd dimensions are resized
+    bool zpadResize2D(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, hoNDArray<T>& dataResized);
+    // data: [RO E1 SLC E2 CON PHS REP SET], the 1st, 2nd and 4th dimensions are resized
+    bool zpadResize3D(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, size_t sizeZ, hoNDArray<T>& dataResized);
+
+    // zero-padding resize with kspace as input
+    bool zpadResize2DOnKSpace(const hoNDArray<T>& kspace, size_t sizeX, size_t sizeY, hoNDArray<T>& dataResized);
+    bool zpadResize3DOnKSpace(const hoNDArray<T>& kspace, size_t sizeX, size_t sizeY, size_t sizeZ, hoNDArray<T>& dataResized);
+
+    // zero padding resize with filter
+    // the data is first FFT'd to kspace, then zero padded, then filtered and IFFT'd back to the image domain
+    // filter2D: 2D array for kspace filter
+    // data: the 1st and 2nd dimensions are resized
+    bool zpadResize2DFilter(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, const hoNDArray<T>& filter2D, hoNDArray<T>& dataResized);
+    // filter3D: 3D array for kspace filter
+    // data: [RO E1 SLC E2 CON PHS REP SET], the 1st, 2nd and 4th dimensions are resized
+    bool zpadResize3DFilter(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, size_t sizeZ, const hoNDArray<T>& filter3D, hoNDArray<T>& dataResized);
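+
+    // Hypothetical usage sketch (added commentary, not part of the upstream header),
+    // resizing a complex image to a 256x256 matrix via zero padding in kspace:
+    //
+    //     hoNDArray< std::complex<float> > imResized;
+    //     utilCplx.zpadResize2D(im, 256, 256, imResized);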
+
+    // ------------------------------------------------------------------------
+    // kspace filter in image domain
+    // ------------------------------------------------------------------------
+    // in image domain for ISMRMRD dimension order
+    bool kspacefilterROImage(hoNDArray<T>& data, const hoNDArray<T>& fRO);
+    bool kspacefilterROImage(const hoNDArray<T>& data, const hoNDArray<T>& fRO, hoNDArray<T>& dataFiltered);
+    bool kspacefilterE1Image(const hoNDArray<T>& data, const hoNDArray<T>& fE1, hoNDArray<T>& dataFiltered);
+    bool kspacefilterE2Image(const hoNDArray<T>& data, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered);
+    bool kspacefilterE1E2Image(const hoNDArray<T>& data, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered);
+    bool kspacefilterROE1E2Image(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered);
+
+    // ------------------------------------------------------------------------
+    // coil sensitivity
+    // ------------------------------------------------------------------------
+    // coil estimation using NIH method
+    // data: in image domain, at least 3D [RO E1 CHA]; the coil map will be estimated for every 2D [RO E1 CHA] frame
+    bool coilMap2DNIH(const hoNDArray<T>& data, hoNDArray<T>& coilMap, ISMRMRDCOILMAPALGO algo, size_t ks=11, size_t power=3, size_t iterNum=5, typename realType<T>::Type thres=1e-3, bool useGPU=true);
+
+    // data: in image domain, at least 4D [RO E1 E2 CHA]; the coil map will be estimated for every 2D [RO E1 CHA] frame across E2
+    bool coilMap3DNIH(const hoNDArray<T>& data, hoNDArray<T>& coilMap, ISMRMRDCOILMAPALGO algo, size_t ks=7, size_t power=3, size_t iterNum=5, typename realType<T>::Type thres=1e-3, bool true3D=false);
+
+    // the Souheil method
+    // data: [RO E1 CHA], only 3D array
+    // this function uses a 2D data correlation matrix
+    bool coilMap2DNIHInner(const hoNDArray<T>& data, hoNDArray<T>& coilMap, size_t ks, size_t power);
+
+    // data: [RO E1 E2 CHA], this function uses a true 3D data correlation matrix
+    bool coilMap3DNIHInner(const hoNDArray<T>& data, hoNDArray<T>& coilMap, size_t ks, size_t power);
+
+    // the Souheil iteration method
+    // data: [RO E1 CHA], only 3D array
+    bool coilMap2DNIH2Inner(const hoNDArray<T>& data, hoNDArray<T>& coilMap, size_t ks, size_t iterNum, typename realType<T>::Type thres);
+
+    // data: [RO E1 E2 CHA], true 3D coil map estimation
+    bool coilMap3DNIH2Inner(const hoNDArray<T>& data, hoNDArray<T>& coilMap, size_t ks, size_t kz, size_t iterNum, typename realType<T>::Type thres);
+
+    // sum of square coil combination
+    // data: in image domain, at least 3D [RO E1 CHA]
+    bool sumOfSquare(const hoNDArray<T>& data, hoNDArray<T>& sos);
+
+    // coil map weighted coil combination
+    // data: in image domain, at least 3D [RO E1 CHA ...]
+    // coilMap: [RO E1 CHA ... ]
+    bool coilCombine(const hoNDArray<T>& data, const hoNDArray<T>& coilMap, hoNDArray<T>& combined);
+
+    // data: in image domain, at least 4D [RO E1 E2 CHA ...]
+    // coilMap: [RO E1 E2 CHA ... ]
+    bool coilCombine3D(const hoNDArray<T>& data, const hoNDArray<T>& coilMap, hoNDArray<T>& combined);
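+
+    // Hypothetical usage sketch (added commentary, not part of the upstream header);
+    // the coil map algorithm name string is assumed to be one of the names understood
+    // by getISMRMRDCoilMapAlgoFromName:
+    //
+    //     hoNDArray< std::complex<float> > coilMap, combined;
+    //     ISMRMRDCOILMAPALGO algo = utilCplx.getISMRMRDCoilMapAlgoFromName(coilMapAlgoName);
+    //     utilCplx.coilMap2DNIH(im, coilMap, algo);      // default ks/power/iterNum/thres
+    //     utilCplx.coilCombine(im, coilMap, combined);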
+
+    // ------------------------------------------------------------------------
+    // kspace utility functions
+    // ------------------------------------------------------------------------
+    // get the conjugate symmetric kspace for 2D case
+    // kspace : [RO E1 ...]
+    // kspaceConj: [RO E1 ...]
+    bool conjugateSymmetry2D(const hoNDArray<T>& kspace, hoNDArray<T>& kspaceConj);
+
+    // kspace : [RO E1 E2 ...]
+    // kspaceConj: [RO E1 E2 ...]
+    bool conjugateSymmetry3D(const hoNDArray<T>& kspace, hoNDArray<T>& kspaceConj);
+};
+
+}}
+
+#include "gtPlusISMRMRDReconUtil.hxx"
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconUtil.hxx b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconUtil.hxx
new file mode 100644
index 0000000..79d4a83
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconUtil.hxx
@@ -0,0 +1,5359 @@
+
+#include "gtPlusISMRMRDReconUtil.h"
+#include "hoNDArray_elemwise.h"
+#include <algorithm>
+
+#define GT_IMAGING_GEOMETRY_DELTA 0.001
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+gtPlusISMRMRDReconUtil<T>::gtPlusISMRMRDReconUtil() {}
+
+template <typename T> 
+gtPlusISMRMRDReconUtil<T>::~gtPlusISMRMRDReconUtil() {}
+
+template <typename T> 
+void gtPlusISMRMRDReconUtil<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+    os << "-------------- GTPlus ISMRMRD Recon Util -------------" << endl;
+    os << "Implementation of recon utilities for ISMRMRD format" << endl;
+    os << "------------------------------------------------------" << endl;
+}
+
+// ------------------------------------------------------------------------
+// coil compression and KarhunenLoeverTransform
+// ------------------------------------------------------------------------
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+KLT_eigenAnalysis(const hoMatrix<T>& data, hoMatrix<T>& eigenVectors, hoMatrix<T>& eigenValues)
+{
+    try
+    {
+        typedef typename realType<T>::Type ValueType;
+
+        size_t M = data.rows();
+        size_t N = data.cols();
+
+        GADGET_CHECK_RETURN_FALSE(eigenVectors.createMatrix(N, N));
+        GADGET_CHECK_RETURN_FALSE(eigenValues.createMatrix(N, 1));
+        Gadgetron::clear(eigenVectors);
+        Gadgetron::clear(eigenValues);
+
+        //hoMatrix<T> dataCopy(data);
+        //GADGET_CHECK_RETURN_FALSE(Gadgetron::gemm(eigenVectors, data, true, dataCopy, false));
+
+        char uplo = 'L';
+        bool isAHA = true;
+        Gadgetron::herk(eigenVectors, data, uplo, isAHA);
+        eigenVectors.copyLowerTriToUpper();
+
+        //eigenVectors.print(std::cout);
+
+        hoMatrix<T> mean(N, 1);
+        GADGET_CHECK_RETURN_FALSE(data.sumOverCol(mean));
+        Gadgetron::scal((ValueType)1.0/M, mean);
+
+        //mean.print(std::cout);
+
+        hoMatrix<T> MMH(N, N);
+        Gadgetron::clear(MMH);
+
+        hoMatrix<T> meanCopy(mean);
+        Gadgetron::gemm(MMH, meanCopy, false, mean, true);
+        Gadgetron::scal((ValueType)M, MMH);
+        Gadgetron::subtract(eigenVectors, MMH, eigenVectors);
+        Gadgetron::scal((ValueType)1.0/(M-1), eigenVectors);
+
+        //MMH.print(std::cout);
+        //eigenVectors.print(std::cout);
+
+        hoMatrix<T> EH(eigenVectors);
+        conjugatetrans(eigenVectors, EH);
+        Gadgetron::add(eigenVectors, EH, eigenVectors);
+        Gadgetron::scal( (ValueType)(0.5), eigenVectors);
+
+        //eigenVectors.print(std::cout);
+
+        Gadgetron::heev(eigenVectors, eigenValues);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::KLT_eigenAnalysis(...) ... ");
+        return false;
+    }
+
+    return true;
+}
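+
+// Note (added commentary, not part of the upstream source): the routine above forms
+// the sample covariance of the N columns of the M x N data matrix A,
+//     C = (A^H * A - M * m * m^H) / (M - 1),   with m = (1/M) * sum over the M rows of A,
+// symmetrizes it as (C + C^H)/2 for numerical robustness, and then calls heev, so the
+// eigen values are expected in ascending order (largest mode last), which is how
+// KLT_numberOfKeptModes and pruneEigenVectorMatrix below interpret them.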
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+KLT_applyEigen(const hoMatrix<T>& data, hoMatrix<T>& dataEigen, const hoMatrix<T>& eigenVectors)
+{
+    try
+    {
+        size_t M = data.rows();
+        size_t N = data.cols();
+
+        GADGET_CHECK_RETURN_FALSE(eigenVectors.rows()==N);
+
+        size_t K = eigenVectors.cols();
+
+        GADGET_CHECK_RETURN_FALSE(dataEigen.createMatrix(M, K));
+        Gadgetron::clear(dataEigen);
+
+        // M*N multiplies N*K
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::gemm(dataEigen, data, false, eigenVectors, false));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::KLT_applyEigen(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+KLT_applyEigen(const hoNDArray<T>& data, hoNDArray<T>& dataEigen, const hoMatrix<T>& eigenVectors)
+{
+    try
+    {
+        size_t M = data.get_size(0);
+        size_t N = data.get_size(1);
+
+        GADGET_CHECK_RETURN_FALSE(eigenVectors.rows()==N);
+
+        size_t K = eigenVectors.cols();
+
+        dataEigen.create(M, K);
+
+        hoNDArray<T> eigenVec(eigenVectors.get_dimensions(), const_cast<T*>(eigenVectors.begin()));
+
+        // M*N multiplies N*K
+        Gadgetron::clear(dataEigen);
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::gemm(dataEigen, data, false, eigenVec, false));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::KLT_applyEigen(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+KLT_numberOfKeptModes(const hoMatrix<T>& eigenValues, double thres, long long& numOfModesKept)
+{
+    try
+    {
+        size_t M = eigenValues.rows();
+
+        if ( thres <= 0 )
+        {
+            numOfModesKept = (long long)M;
+            return true;
+        }
+
+        long long m;
+        for ( m=M-2; m>=0; m-- )
+        {
+            if ( std::abs(eigenValues(m,0)) < thres*std::abs(eigenValues(M-1,0)) )
+            {
+                break;
+            }
+        }
+
+        numOfModesKept = M - m -1;
+
+        if ( numOfModesKept <= 0 )
+        {
+            GWARN_STREAM("KLT_numberOfKeptModes(...) - numOfModesKept <= 0 : " << thres);
+            GWARN_STREAM("KLT_numberOfKeptModes(...) - keep all modes : " << M);
+            numOfModesKept = (long long)M;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::KLT_numberOfKeptModes(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+pruneEigenVectorMatrix(const hoMatrix<T>& eigenVectors, long long numOfModesKept, hoMatrix<T>& eigenVectorsPruned)
+{
+    try
+    {
+        size_t M = eigenVectors.rows();
+        size_t N = eigenVectors.cols();
+
+        if ( numOfModesKept<=0 || numOfModesKept>(long long)N )
+        {
+            GWARN_STREAM("gtPlusISMRMRDReconUtil<T>::pruneEigenVectorMatrix(...) - numOfModesKept<=0 || numOfModesKept>N : " << numOfModesKept);
+            eigenVectorsPruned = eigenVectors;
+            return true;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(eigenVectorsPruned.createMatrix(M, numOfModesKept));
+        GADGET_CHECK_RETURN_FALSE(eigenVectors.subMatrix(eigenVectorsPruned, 0, M-1, N-numOfModesKept, N-1));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::pruneEigenVectorMatrix(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+computeKLTCoeff(const hoNDArray<T>& data, hoMatrix<T>& coeff, hoMatrix<T>& eigenValues, bool isChaLastDim)
+{
+    try
+    {
+        size_t NDim = data.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=3);
+
+        hoMatrix<T> eigenVectors;
+        hoMatrix<T> A;
+
+        if ( isChaLastDim )
+        {
+            size_t CHA = data.get_size(NDim-1);
+            size_t N = data.get_number_of_elements()/CHA;
+
+            GADGET_CHECK_RETURN_FALSE(A.createMatrix(N, CHA, const_cast<T*>(data.begin())));
+            GADGET_CHECK_RETURN_FALSE(KLT_eigenAnalysis(A, coeff, eigenValues));
+        }
+        else
+        {
+            size_t RO = data.get_size(0);
+            size_t E1 = data.get_size(1);
+            size_t CHA = data.get_size(2);
+
+            if ( NDim == 3 )
+            {
+                GADGET_CHECK_RETURN_FALSE(A.createMatrix(RO*E1, CHA, const_cast<T*>(data.begin())));
+                GADGET_CHECK_RETURN_FALSE(KLT_eigenAnalysis(A, coeff, eigenValues));
+            }
+            else if ( NDim == 4 )
+            {
+                size_t N = data.get_size(3);
+                hoNDArray<T> dataP(RO, E1, N, CHA);
+
+                std::vector<size_t> dimOrder(4);
+                dimOrder[0] = 0;
+                dimOrder[1] = 1;
+                dimOrder[2] = 3;
+                dimOrder[3] = 2;
+
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(const_cast< hoNDArray<T>* >(&data), &dataP, &dimOrder));
+
+                GADGET_CHECK_RETURN_FALSE(A.createMatrix(RO*E1*N, CHA, dataP.begin()));
+
+                GADGET_CHECK_RETURN_FALSE(KLT_eigenAnalysis(A, coeff, eigenValues));
+            }
+            else if ( NDim >= 5 )
+            {
+                std::vector<size_t> dimOrder(NDim);
+                size_t l;
+                for ( l=0; l<NDim; l++ )
+                {
+                    dimOrder[l] = l;
+                }
+                dimOrder[2] = NDim-1;
+                dimOrder[NDim-1] = 2;
+
+                hoNDArray<T> dataP(data);
+                permute(&dataP, &dimOrder);
+
+                size_t num = data.get_number_of_elements()/CHA;
+                GADGET_CHECK_RETURN_FALSE(A.createMatrix(num, CHA, dataP.begin()));
+
+                GADGET_CHECK_RETURN_FALSE(KLT_eigenAnalysis(A, coeff, eigenValues));
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::computeKLTCoeff(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+computeKLCoilCompressionCoeff(const hoNDArray<T>& data, double thres, hoMatrix<T>& coeff, hoMatrix<T>& eigenValues, bool isChaLastDim)
+{
+    try
+    {
+        size_t NDim = data.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=3);
+
+        hoMatrix<T> eigenVectors;
+        GADGET_CHECK_RETURN_FALSE(computeKLTCoeff(data, eigenVectors, eigenValues, isChaLastDim));
+
+        long long numOfModesKept;
+        GADGET_CHECK_RETURN_FALSE(KLT_numberOfKeptModes(eigenValues, thres, numOfModesKept));
+        GADGET_CHECK_RETURN_FALSE(pruneEigenVectorMatrix(eigenVectors, numOfModesKept, coeff));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::computeKLCoilCompressionCoeff(thres) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+computeKLCoilCompressionCoeff(const hoNDArray<T>& data, int numOfModesKept, hoMatrix<T>& coeff, hoMatrix<T>& eigenValues, bool isChaLastDim)
+{
+    try
+    {
+        size_t NDim = data.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=3);
+
+        hoMatrix<T> eigenVectors;
+        GADGET_CHECK_RETURN_FALSE(computeKLTCoeff(data, eigenVectors, eigenValues, isChaLastDim));
+        GADGET_CHECK_RETURN_FALSE(pruneEigenVectorMatrix(eigenVectors, numOfModesKept, coeff));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::computeKLCoilCompressionCoeff(numOfModesKept) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+inline bool gtPlusISMRMRDReconUtil<T>::
+computeKLCoilCompression(const hoNDArray<T>& data, double thres, hoMatrix<T>& coeff, hoMatrix<T>& eigenValues, hoNDArray<T>& dataEigen, bool isChaLastDim)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(computeKLCoilCompressionCoeff(data, thres, coeff, eigenValues, isChaLastDim));
+        GADGET_CHECK_RETURN_FALSE(appyKLCoilCompressionCoeff(data, coeff, dataEigen, isChaLastDim));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::computeKLCoilCompression(thres) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+inline bool gtPlusISMRMRDReconUtil<T>::
+computeKLCoilCompression(const hoNDArray<T>& data, int numOfModesKept, hoMatrix<T>& coeff, hoMatrix<T>& eigenValues, hoNDArray<T>& dataEigen, bool isChaLastDim)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(computeKLCoilCompressionCoeff(data, numOfModesKept, coeff, eigenValues, isChaLastDim));
+        GADGET_CHECK_RETURN_FALSE(appyKLCoilCompressionCoeff(data, coeff, dataEigen, isChaLastDim));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::computeKLCoilCompression(numOfModesKept) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+appyKLCoilCompressionCoeff(const hoNDArray<T>& data, const hoMatrix<T>& coeff, hoNDArray<T>& dataEigen, bool isChaLastDim)
+{
+    try
+    {
+        size_t NDim = data.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=3);
+
+        boost::shared_ptr< std::vector<size_t> > dim = data.get_dimensions();
+
+        size_t dstCHA = coeff.cols();
+
+        // D = A * V
+        hoMatrix<T> A;
+        hoMatrix<T> D;
+
+        if ( isChaLastDim )
+        {
+            size_t CHA = data.get_size(NDim-1);
+            size_t N = data.get_number_of_elements()/CHA;
+
+            hoNDArray<T> A_tmp(N, CHA, const_cast<T*>(data.begin()));
+            // GADGET_CHECK_RETURN_FALSE(A.createMatrix(CHA, N, const_cast<T*>(data.begin())));
+
+            std::vector<size_t> dimEigen(*dim);
+            dimEigen[NDim-1] = dstCHA;
+            dataEigen.create(&dimEigen);
+
+            hoNDArray<T> D_tmp(N, dstCHA, dataEigen.begin());
+            // GADGET_CHECK_RETURN_FALSE(D.createMatrix(dstCHA, N, dataEigen.begin()));
+
+            GADGET_CHECK_RETURN_FALSE(KLT_applyEigen(A_tmp, D_tmp, coeff));
+        }
+        else
+        {
+            size_t RO = data.get_size(0);
+            size_t E1 = data.get_size(1);
+            size_t CHA = data.get_size(2);
+
+            if ( NDim == 3 )
+            {
+                GADGET_CHECK_RETURN_FALSE(A.createMatrix(RO*E1, CHA, const_cast<T*>(data.begin())));
+
+                dataEigen.create(RO, E1, dstCHA);
+                GADGET_CHECK_RETURN_FALSE(D.createMatrix(RO*E1, dstCHA, dataEigen.begin()));
+
+                GADGET_CHECK_RETURN_FALSE(KLT_applyEigen(A, D, coeff));
+            }
+            else if ( NDim == 4 )
+            {
+                size_t N = data.get_size(3);
+                hoNDArray<T> dataP(RO, E1, N, CHA);
+
+                std::vector<size_t> dimOrder(4);
+                dimOrder[0] = 0;
+                dimOrder[1] = 1;
+                dimOrder[2] = 3;
+                dimOrder[3] = 2;
+
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(const_cast< hoNDArray<T>* >(&data), &dataP, &dimOrder));
+
+                GADGET_CHECK_RETURN_FALSE(A.createMatrix(RO*E1*N, CHA, dataP.begin()));
+
+                hoNDArray<T> dataEigenP(RO, E1, N, dstCHA);
+                GADGET_CHECK_RETURN_FALSE(D.createMatrix(RO*E1*N, dstCHA, dataEigenP.begin()));
+
+                GADGET_CHECK_RETURN_FALSE(KLT_applyEigen(A, D, coeff));
+
+                dataEigen.create(RO, E1, dstCHA, N);
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(&dataEigenP, &dataEigen, &dimOrder));
+            }
+            else if ( NDim >= 5 )
+            {
+                std::vector<size_t> dimOrder(NDim);
+                size_t l;
+                for ( l=0; l<NDim; l++ )
+                {
+                    dimOrder[l] = l;
+                }
+                dimOrder[2] = NDim-1;
+                dimOrder[NDim-1] = 2;
+
+                boost::shared_ptr< hoNDArray<T> > dataP = permute(const_cast< hoNDArray<T>* >(&data), &dimOrder);
+
+                size_t num = data.get_number_of_elements()/CHA;
+                GADGET_CHECK_RETURN_FALSE(A.createMatrix(num, CHA, dataP->begin()));
+
+                boost::shared_ptr< std::vector<size_t> > dimP = dataP->get_dimensions();
+                (*dimP)[NDim-1] = dstCHA;
+
+                dataEigen.create(dimP);
+                GADGET_CHECK_RETURN_FALSE(D.createMatrix(num, dstCHA, dataEigen.begin()));
+
+                GADGET_CHECK_RETURN_FALSE(KLT_applyEigen(A, D, coeff));
+
+                dataP = permute(&dataEigen, &dimOrder);
+                dataEigen =  *dataP;
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::appyKLCoilCompressionCoeff(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+applyKLCoilCompressionCoeff(const hoNDArray<T>& data, const std::vector<hoMatrix<T> >& coeff, hoNDArray<T>& dataEigen, bool isChaLastDim)
+{
+    try
+    {
+        size_t NDim = data.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=3);
+
+        GADGET_CHECK_RETURN_FALSE(coeff.size()>=data.get_size(NDim-1));
+
+        size_t LastDim = coeff.size();
+        size_t dstCHA = coeff[0].cols();
+
+        size_t n;
+        for ( n=1; n<LastDim; n++ )
+        {
+            GADGET_CHECK_RETURN_FALSE(coeff[n].cols()==dstCHA);
+        }
+
+        size_t LastDimData = data.get_size(NDim-1);
+        boost::shared_ptr< std::vector<size_t> > dim = data.get_dimensions();
+        long long N = data.get_number_of_elements()/LastDimData;
+
+        std::vector<size_t> dimEigen(*dim);
+
+        if ( isChaLastDim )
+        {
+            dimEigen[NDim-2] = dstCHA;
+        }
+        else
+        {
+            dimEigen[2] = dstCHA;
+        }
+
+        dataEigen.create(&dimEigen);
+        long long eigenN = dataEigen.get_number_of_elements()/LastDimData;
+
+        std::vector<size_t> dimLastDim(NDim-1);
+        for ( n=0; n<NDim-1; n++ )
+        {
+            dimLastDim[n] = (*dim)[n];
+        }
+
+        hoNDArray<T> dataEigenLastDim;
+        for ( n=0; n<LastDimData; n++ )
+        {
+            hoNDArray<T> dataLastDim(&dimLastDim, const_cast<T*>(data.begin()+n*N));
+            GADGET_CHECK_RETURN_FALSE(appyKLCoilCompressionCoeff(dataLastDim, coeff[n], dataEigenLastDim, isChaLastDim));
+            memcpy(dataEigen.begin()+n*eigenN, dataEigenLastDim.begin(), dataEigenLastDim.get_number_of_bytes());
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::applyKLCoilCompressionCoeff(std::vector<hoMatrix<T> >& coeff) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::computeKLFilter(const hoNDArray<T>& data, size_t numOfModesKept, hoNDArray<T>& dataKLF)
+{
+    try
+    {
+        if ( !dataKLF.dimensions_equal(&data) )
+        {
+            dataKLF = data;
+        }
+
+        size_t NDim = data.get_number_of_dimensions();
+        size_t M = data.get_size(NDim-1);
+        size_t N = data.get_number_of_elements()/M;
+
+        if ( numOfModesKept > M ) numOfModesKept = M;
+
+        hoMatrix<T> A(N, M, const_cast<T*>(data.begin()));
+
+        hoMatrix<T> eigenVectors, eigenValues;
+        GADGET_CHECK_RETURN_FALSE(KLT_eigenAnalysis(A, eigenVectors, eigenValues));
+
+        hoMatrix<T> E(eigenVectors);
+        size_t r, c;
+        for ( c=0; c<M-numOfModesKept+1; c++ )
+        {
+            for ( r=0; r<M; r++ )
+            {
+                E(r, c) = T(0);
+            }
+        }
+
+        hoMatrix<T> ET(eigenVectors);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::conjugatetrans(eigenVectors, ET));
+
+        hoMatrix<T> EET(M, M);
+        Gadgetron::clear(EET);
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::gemm(EET, E, false, ET, false));
+
+        hoMatrix<T> R(N, M, dataKLF.begin());
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::gemm(R, A, false, EET, false));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::computeKLFilter(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// ------------------------------------------------------------------------
+// kspace filter
+// ------------------------------------------------------------------------
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+compute2DFilterFromTwo1D(const hoNDArray<T>& fx, const hoNDArray<T>& fy, hoNDArray<T>& fxy)
+{
+    try
+    {
+        size_t RO = fx.get_size(0);
+        size_t E1 = fy.get_size(0);
+
+        fxy.create(RO, E1);
+        T* pFxy = fxy.begin();
+
+        size_t x, y;
+
+        for ( y=0; y<E1; y++ )
+        {
+            for ( x=0; x<RO; x++ )
+            {
+                pFxy[y*RO+x] = fx(x) * fy(y);
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::compute2DFilterFromTwo1D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+compute2DFilterFromTwo1D(const hoNDArray<float>& fx, const hoNDArray<float>& fy, hoNDArray< std::complex<float> >& fxy)
+{
+    try
+    {
+        size_t RO = fx.get_size(0);
+        size_t E1 = fy.get_size(0);
+
+        fxy.create(RO, E1);
+         std::complex<float> * pFxy = fxy.begin();
+
+        size_t x, y;
+
+        for ( y=0; y<E1; y++ )
+        {
+            for ( x=0; x<RO; x++ )
+            {
+                pFxy[y*RO+x] =  std::complex<float> (fx(x) * fy(y));
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::compute2DFilterFromTwo1D(float) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+compute2DFilterFromTwo1D(const hoNDArray<double>& fx, const hoNDArray<double>& fy, hoNDArray< std::complex<double> >& fxy)
+{
+    try
+    {
+        size_t RO = fx.get_size(0);
+        size_t E1 = fy.get_size(0);
+
+        fxy.create(RO, E1);
+         std::complex<double> * pFxy = fxy.begin();
+
+        size_t x, y;
+
+        for ( y=0; y<E1; y++ )
+        {
+            for ( x=0; x<RO; x++ )
+            {
+                pFxy[y*RO+x] =  std::complex<double> (fx(x) * fy(y));
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::compute2DFilterFromTwo1D(double) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+compute3DFilterFromThree1D(const hoNDArray<T>& fx, const hoNDArray<T>& fy, const hoNDArray<T>& fz, hoNDArray<T>& fxyz)
+{
+    try
+    {
+        size_t RO = fx.get_size(0);
+        size_t E1 = fy.get_size(0);
+        size_t E2 = fz.get_size(0);
+
+        fxyz.create(RO, E1, E2);
+        T* pFxyz = fxyz.begin();
+
+        const T* px = fx.begin();
+        const T* py = fy.begin();
+        const T* pz = fz.begin();
+
+        size_t x, y, z;
+
+        T vz, vy, vx;
+
+        size_t ind = 0;
+        for ( z=0; z<E2; z++ )
+        {
+            vz = pz[z];
+            for ( y=0; y<E1; y++ )
+            {
+                vy = py[y];
+                for ( x=0; x<RO; x++ )
+                {
+                    vx = px[x];
+                    pFxyz[ind] = (vx*vz*vy);
+                    ind++;
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::compute3DFilterFromThree1D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+compute3DFilterFromThree1D(const hoNDArray<float>& fx, const hoNDArray<float>& fy, const hoNDArray<float>& fz, hoNDArray< std::complex<float> >& fxyz)
+{
+    try
+    {
+        size_t RO = fx.get_size(0);
+        size_t E1 = fy.get_size(0);
+        size_t E2 = fz.get_size(0);
+
+        fxyz.create(RO, E1, E2);
+         std::complex<float> * pFxyz = fxyz.begin();
+
+        size_t x, y, z;
+
+        for ( z=0; z<E2; z++ )
+        {
+            for ( y=0; y<E1; y++ )
+            {
+                for ( x=0; x<RO; x++ )
+                {
+                    pFxyz[z*RO*E1+y*RO+x] = std::complex<float>(fx(x)*fy(y)*fz(z));
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::compute3DFilterFromThree1D(float) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+compute3DFilterFromThree1D(const hoNDArray<double>& fx, const hoNDArray<double>& fy, const hoNDArray<double>& fz, hoNDArray< std::complex<double> >& fxyz)
+{
+    try
+    {
+        size_t RO = fx.get_size(0);
+        size_t E1 = fy.get_size(0);
+        size_t E2 = fz.get_size(0);
+
+        fxyz.create(RO, E1, E2);
+         std::complex<double> * pFxyz = fxyz.begin();
+
+        size_t x, y, z;
+
+        for ( z=0; z<E2; z++ )
+        {
+            for ( y=0; y<E1; y++ )
+            {
+                for ( x=0; x<RO; x++ )
+                {
+                    pFxyz[z*RO*E1+y*RO+x] = std::complex<double>(fx(x)*fy(y)*fz(z));
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::compute3DFilterFromThree1D(double) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspacefilterRO(hoNDArray<T>& data, const hoNDArray<T>& fRO)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==fRO.get_number_of_elements());
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(data, fRO, data));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::kspacefilterRO(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspacefilterRO(const hoNDArray<T>& data, const hoNDArray<T>& fRO, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==fRO.get_number_of_elements());
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(data, fRO, dataFiltered));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::kspacefilterRO(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspacefilterROE1(const hoNDArray<T>& data, const hoNDArray<T>& fROE1, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)*data.get_size(1)==fROE1.get_number_of_elements());
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(data, fROE1, dataFiltered));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::kspacefilterROE1(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspacefilterROE1(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE1, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==fRO.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(1)==fE1.get_number_of_elements());
+
+        hoNDArray<T> fxy;
+        compute2DFilterFromTwo1D(fRO, fE1, fxy);
+
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(data, fxy, dataFiltered));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::kspacefilterROE1(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspacefilterE1(const hoNDArray<T>& data, const hoNDArray<T>& fE1, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(1)==fE1.get_number_of_elements());
+
+        hoNDArray<T> fRO(data.get_size(0));
+        fRO.fill(T(1.0));
+
+        hoNDArray<T> fxy;
+        compute2DFilterFromTwo1D(fRO, fE1, fxy);
+
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(data, fxy, dataFiltered));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::kspacefilterE1(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspacefilterE2(const hoNDArray<T>& data, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(4)==fE2.get_number_of_elements());
+
+        hoNDArray<T> fRO(data.get_size(0));
+        fRO.fill(T(1.0));
+
+        hoNDArray<T> fE1(data.get_size(1));
+        fE1.fill(T(1.0));
+
+        hoNDArray<T> fxyz;
+        compute3DFilterFromThree1D(fRO, fE1, fE2, fxyz);
+
+        GADGET_CHECK_RETURN_FALSE(kspacefilterROE1E2(data, fxyz, dataFiltered));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::kspacefilterE2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspacefilterROE2(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==fRO.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(4)==fE2.get_number_of_elements());
+
+        hoNDArray<T> fE1(data.get_size(1));
+        fE1.fill(T(1.0));
+
+        hoNDArray<T> fxyz;
+        compute3DFilterFromThree1D(fRO, fE1, fE2, fxyz);
+
+        GADGET_CHECK_RETURN_FALSE(kspacefilterROE1E2(data, fxyz, dataFiltered));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::kspacefilterROE2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspacefilterE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(1)==fE1.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(4)==fE2.get_number_of_elements());
+
+        hoNDArray<T> fRO(data.get_size(0));
+        fRO.fill(T(1.0));
+
+        hoNDArray<T> fxyz;
+        compute3DFilterFromThree1D(fRO, fE1, fE2, fxyz);
+
+        GADGET_CHECK_RETURN_FALSE(kspacefilterROE1E2(data, fxyz, dataFiltered));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::kspacefilterE1E2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspacefilterROE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fROE1E2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        if ( data.get_size(2)==1 && data.get_size(3)==1 )
+        {
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(data, fROE1E2, dataFiltered));
+        }
+        else
+        {
+            size_t NDim = data.get_number_of_dimensions();
+            std::vector<size_t> order(data.get_number_of_dimensions(), 1);
+
+            size_t ii;
+            for ( ii=0; ii<NDim; ii++ )
+            {
+                order[ii] = ii;
+            }
+
+            order[0] = 0;
+            order[1] = 1;
+            order[2] = 4;
+            order[3] = 2;
+            order[4] = 3;
+
+            boost::shared_ptr< hoNDArray<T> > data_permuted = Gadgetron::permute(const_cast<hoNDArray<T>*>(&data), &order);
+
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(*data_permuted, fROE1E2, dataFiltered));
+
+            order[0] = 0;
+            order[1] = 1;
+            order[2] = 3;
+            order[3] = 4;
+            order[4] = 2;
+
+            data_permuted = Gadgetron::permute(&dataFiltered, &order);
+            dataFiltered = *data_permuted;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::kspacefilterROE1E2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
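+// Note on the permutation in the first kspacefilterROE1E2 overload above: the data
+// layout is taken to be [RO E1 CHA SLC E2 ...] (E2 at dimension 4, as in the
+// dimension checks of the other filter helpers), so E2 is first permuted next to
+// RO/E1, the permuted [RO E1 E2 CHA SLC ...] array is multiplied with the
+// [RO E1 E2] filter, and the result is permuted back to the original order.
+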
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspacefilterROE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==fRO.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(1)==fE1.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(4)==fE2.get_number_of_elements());
+
+        hoNDArray<T> fxyz;
+        compute3DFilterFromThree1D(fRO, fE1, fE2, fxyz);
+
+        GADGET_CHECK_RETURN_FALSE(kspacefilterROE1E2(data, fxyz, dataFiltered));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::kspacefilterROE1E2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspace3DfilterE2(const hoNDArray<T>& data, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(2)==fE2.get_number_of_elements());
+
+        hoNDArray<T> fRO(data.get_size(0));
+        fRO.fill(T(1.0));
+
+        hoNDArray<T> fE1(data.get_size(1));
+        fE1.fill(T(1.0));
+
+        hoNDArray<T> fxyz;
+        compute3DFilterFromThree1D(fRO, fE1, fE2, fxyz);
+
+        GADGET_CHECK_RETURN_FALSE(kspace3DfilterROE1E2(data, fxyz, dataFiltered));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::kspace3DfilterE2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspace3DfilterROE2(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==fRO.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(2)==fE2.get_number_of_elements());
+
+        hoNDArray<T> fE1(data.get_size(1));
+        fE1.fill(T(1.0));
+
+        hoNDArray<T> fxyz;
+        compute3DFilterFromThree1D(fRO, fE1, fE2, fxyz);
+
+        GADGET_CHECK_RETURN_FALSE(kspace3DfilterROE1E2(data, fxyz, dataFiltered));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::kspace3DfilterROE2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspace3DfilterE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(1)==fE1.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(2)==fE2.get_number_of_elements());
+
+        hoNDArray<T> fRO(data.get_size(0));
+        fRO.fill(T(1.0));
+
+        hoNDArray<T> fxyz;
+        compute3DFilterFromThree1D(fRO, fE1, fE2, fxyz);
+
+        GADGET_CHECK_RETURN_FALSE(kspace3DfilterROE1E2(data, fxyz, dataFiltered));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::kspace3DfilterE1E2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspace3DfilterROE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fROE1E2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==fROE1E2.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(data.get_size(1)==fROE1E2.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(data.get_size(2)==fROE1E2.get_size(2));
+
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(data, fROE1E2, dataFiltered));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::kspace3DfilterROE1E2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspace3DfilterROE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==fRO.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(1)==fE1.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(2)==fE2.get_number_of_elements());
+
+        hoNDArray<T> fxyz(fRO.get_number_of_elements(), fE1.get_number_of_elements(), fE2.get_number_of_elements());
+        compute3DFilterFromThree1D(fRO, fE1, fE2, fxyz);
+
+        GADGET_CHECK_RETURN_FALSE(kspace3DfilterROE1E2(data, fxyz, dataFiltered));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::kspace3DfilterROE1E2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// ------------------------------------------------------------------------
+// compute kspace filters
+// ------------------------------------------------------------------------
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+generateSymmetricFilter(size_t len, size_t start, size_t end, hoNDArray<T>& filter, ISMRMRDKSPACEFILTER filterType, double sigma, size_t width)
+{
+    try
+    {
+        if ( len == 0 ) return true;
+
+        if ( start > len-1 ) start = 0;
+        if ( end > len-1 ) end = len-1;
+
+        if ( start > end )
+        {
+            start = 0;
+            end = len-1;
+        }
+
+        filter.create(len);
+        Gadgetron::fill(filter, T(1.0));
+
+        if ( width==0 || width>=len ) width = 1;
+
+        size_t ii;
+        switch (filterType)
+        {
+            case ISMRMRD_FILTER_GAUSSIAN:
+                {
+                    double r = -1.0*sigma*sigma/2;
+
+                    if ( len%2 == 0 )
+                    {
+                        // to make sure the zero point matches and the filter boundaries are symmetric
+                        double stepSize = 2.0/(len-2);
+                        std::vector<double> x(len-1);
+
+                        for ( ii=0; ii<len-1; ii++ )
+                        {
+                            x[ii] = -1 + ii*stepSize;
+                        }
+
+                        for ( ii=0; ii<len-1; ii++ )
+                        {
+                            filter(ii+1) = T( (value_type)(std::exp(r*(x[ii]*x[ii]))) );
+                        }
+
+                        filter(0) = T(0);
+                    }
+                    else
+                    {
+                        double stepSize = 2.0/(len-1);
+                        std::vector<double> x(len);
+
+                        for ( ii=0; ii<len; ii++ )
+                        {
+                            x[ii] = -1 + ii*stepSize;
+                        }
+
+                        for ( ii=0; ii<len; ii++ )
+                        {
+                            filter(ii) = T( (value_type)(std::exp(r*(x[ii]*x[ii]))) );
+                        }
+                    }
+                }
+            break;
+
+            case ISMRMRD_FILTER_TAPERED_HANNING:
+                 {
+                    hoNDArray<T> w(width);
+
+                    for ( ii=1; ii<=width; ii++ )
+                    {
+                        w(ii-1) = T( (value_type)(0.5 * ( 1 - std::cos( 2.0*M_PI*ii/(2*width+1) ) )) );
+                    }
+
+                    if ( len%2 == 0 )
+                    {
+                        for ( ii=1; ii<=width; ii++ )
+                        {
+                            filter(ii) = w(ii-1);
+                            filter(len-ii) = filter(ii);
+                        }
+
+                        filter(0) = T(0);
+                    }
+                    else
+                    {
+                        for ( ii=1; ii<=width; ii++ )
+                        {
+                            filter(ii-1) = w(ii-1);
+                            filter(len-ii) = filter(ii-1);
+                        }
+                    }
+                }
+            break;
+
+            // symmetric Hanning window
+            // does not include the first and last zero samples
+            case ISMRMRD_FILTER_HANNING:
+                 {
+                    if ( len%2 == 0 )
+                    {
+                        size_t N = len-1;
+                        double halfLen = (double)( (N+1)/2 );
+                        for ( ii=1; ii<=halfLen; ii++ )
+                        {
+                            filter(ii) = T( (value_type)(0.5 * ( 1 - std::cos( 2.0*M_PI*ii/(N+1) ) )) );
+                        }
+
+                        for ( ii=(size_t)halfLen; ii<N; ii++ )
+                        {
+                            filter(ii+1) = filter(N-ii);
+                        }
+
+                        filter(0) = T(0);
+                    }
+                    else
+                    {
+                        double halfLen = (double)( (len+1)/2 );
+                        for ( ii=1; ii<=(size_t)halfLen; ii++ )
+                        {
+                            filter(ii-1) = T( (value_type)(0.5 * ( 1 - std::cos( 2.0*M_PI*ii/(len+1) ) )) );
+                        }
+
+                        for ( ii=(size_t)halfLen; ii<len; ii++ )
+                        {
+                            filter(ii) = filter(len-1-ii);
+                        }
+                    }
+                }
+            break;
+
+            default:
+            break;
+        }
+
+        T sos = 0.0f;
+        for ( ii=0; ii<len; ii++ )
+        {
+            sos += filter(ii)*filter(ii);
+        }
+
+        T r = (value_type)( 1.0/std::sqrt( std::abs(sos)/(len) ) );
+        for ( ii=0; ii<len; ii++ )
+        {
+            filter(ii) *= r;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::generateSymmetricFilter(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
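+// Minimal usage sketch for the symmetric filters (illustrative only; kspace,
+// kspaceFiltered and RO are assumed variables, namespace qualifiers omitted).
+// A full-width symmetric Hanning filter is generated and applied along the
+// readout; the normalization at the end of generateSymmetricFilter makes
+// sum(|f|^2)/len == 1, i.e. the filter is SNR-unit scaled:
+//
+//     gtPlusISMRMRDReconUtil< std::complex<float> > util;
+//     hoNDArray< std::complex<float> > filterRO;
+//     util.generateSymmetricFilter(RO, 0, RO-1, filterRO, ISMRMRD_FILTER_HANNING, 1.5, 16);
+//     util.kspacefilterRO(kspace, filterRO, kspaceFiltered);
+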
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+generateAsymmetricFilter(size_t len, size_t start, size_t end, hoNDArray<T>& filter, ISMRMRDKSPACEFILTER filterType, size_t width, bool densityComp)
+{
+    try
+    {
+        if ( len == 0 ) return true;
+
+        if ( start > len-1 ) start = 0;
+        if ( end > len-1 ) end = len-1;
+
+        if ( start > end )
+        {
+            start = 0;
+            end = len-1;
+        }
+
+        filter.create(len);
+        Gadgetron::clear(filter);
+
+        size_t ii;
+        for ( ii=start; ii<=end; ii++ )
+        {
+            filter(ii) = T(1.0);
+        }
+
+        if ( width==0 || width>=len ) width = 1;
+
+        hoNDArray<T> w(width);
+
+        switch (filterType)
+        {
+            case ISMRMRD_FILTER_TAPERED_HANNING:
+                 {
+                    for ( ii=1; ii<=width; ii++ )
+                    {
+                        w(ii-1) = T( (value_type)(0.5 * ( 1 - std::cos( 2.0*M_PI*ii/(2*width+1) ) )) );
+                    }
+                }
+            break;
+
+            default:
+                Gadgetron::fill(w, T(1.0));
+            break;
+        }
+
+        if ( densityComp )
+        {
+            size_t startSym(0), endSym(len-1);
+            GADGET_CHECK_RETURN_FALSE(findSymmetricSampledRegion(start, end, len/2, startSym, endSym));
+
+            if ( start==0 && end==len-1 )
+            {
+                for ( ii=1; ii<=width; ii++ )
+                {
+                    filter(ii-1) = w(ii-1);
+                    filter(len-ii) = filter(ii-1);
+                }
+            }
+
+            if ( start==0 && end<len-1 )
+            {
+                for ( ii=0; ii<startSym; ii++ )
+                {
+                    filter(ii) = 2.0;
+                }
+
+                for ( ii=1; ii<=width; ii++ )
+                {
+                    filter(ii-1+startSym) = T(1.0) + w(width-ii);
+                    filter(end-ii+1) = w(ii-1);
+                }
+            }
+
+            if ( start>0 && end==len-1 )
+            {
+                for ( ii=endSym+1; ii<len; ii++ )
+                {
+                    filter(ii) = 2.0;
+                }
+
+                for ( ii=1; ii<=width; ii++ )
+                {
+                    filter(endSym-ii+1) = T(1.0) + w(width-ii);
+                    filter(start+ii-1) = w(ii-1);
+                }
+            }
+
+            if ( start>0 && end<len-1 )
+            {
+                if ( start==startSym && end==endSym )
+                {
+                    for ( ii=1; ii<=width; ii++ )
+                    {
+                        filter(start+ii-1) = w(ii-1);
+                        filter(end-ii+1) = w(ii-1);
+                    }
+                }
+                else if ( start==startSym && end>endSym )
+                {
+                    for ( ii=endSym+1; ii<=end; ii++ )
+                    {
+                        filter(ii) = 2.0;
+                    }
+
+                    for ( ii=1; ii<=width; ii++ )
+                    {
+                        filter(end-ii+1) = T(1.0) + w(ii-1);
+                        filter(endSym-ii+1) = w(width-ii);
+                        filter(start+ii-1) = w(ii-1);
+                    }
+                }
+                else if ( start<startSym && end==endSym )
+                {
+                    for ( ii=start; ii<startSym; ii++ )
+                    {
+                        filter(ii) = 2.0;
+                    }
+
+                    for ( ii=1; ii<=width; ii++ )
+                    {
+                        filter(ii-1+start) = T(1.0) + w(ii-1);
+                        filter(ii-1+startSym) = w(width-ii);
+                        filter(end-ii+1) = w(ii-1);
+                    }
+                }
+                else
+                {
+                    for ( ii=1; ii<=width; ii++ )
+                    {
+                        filter(start+ii-1) = w(ii-1);
+                        filter(end-ii+1) = w(ii-1);
+                    }
+                }
+            }
+        }
+        else
+        {
+            if ( start==0 && end==len-1 )
+            {
+                for ( ii=1; ii<=width; ii++ )
+                {
+                    filter(ii-1) = w(ii-1);
+                    filter(len-ii) = filter(ii-1);
+                }
+            }
+
+            if ( start==0 && end<len-1 )
+            {
+                for ( ii=1; ii<=width; ii++ )
+                {
+                    filter(end-ii+1) = w(ii-1);
+                }
+            }
+
+            if ( start>0 && end==len-1 )
+            {
+                for ( ii=1; ii<=width; ii++ )
+                {
+                    filter(start+ii-1) = w(ii-1);
+                }
+            }
+
+            if ( start>0 && end<len-1 )
+            {
+                for ( ii=1; ii<=width; ii++ )
+                {
+                    filter(start+ii-1) = w(ii-1);
+                    filter(end-ii+1) = w(ii-1);
+                }
+            }
+        }
+
+        T sos = 0.0f;
+        for ( ii=0; ii<len; ii++ )
+        {
+            sos += filter(ii)*filter(ii);
+        }
+
+        // T r = 1.0/std::sqrt( std::abs(sos)/len );
+        T r = (value_type)( 1.0/std::sqrt( std::abs(sos)/(end-start+1) ) ); // SNR unit filter
+        for ( ii=0; ii<len; ii++ )
+        {
+            filter(ii) *= r;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::generateAsymmetricFilter(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
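+// For an asymmetrically sampled dimension (e.g. partial Fourier along E1) the
+// filter above is 1 over [start, end] with a tapered edge of `width` points.
+// With densityComp==true the samples that have no conjugate-symmetric partner
+// (outside [startSym, endSym] around the center) are weighted towards 2.0 so the
+// asymmetric part contributes roughly the energy of a fully sampled dimension;
+// the final scaling uses only the sampled length (end-start+1), which keeps the
+// SNR-unit property. (This reading of the weighting is inferred from the code
+// above, not from separate documentation.)
+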
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+generateSymmetricFilterForRef(size_t len, size_t start, size_t end, 
+        hoNDArray<T>& filter, ISMRMRDKSPACEFILTER filterType, double sigma, size_t width)
+{
+    try
+    {
+        if ( len < 2 ) return true;
+
+        GADGET_CHECK_RETURN_FALSE(start>=0&&end<=len-1&&start<=end);
+
+        if ( start==0 && end==len-1 )
+        {
+            GADGET_CHECK_RETURN_FALSE(generateSymmetricFilter(len, 0, len-1, filter, filterType, sigma, width));
+            return true;
+        }
+
+        size_t centerInd = len/2;
+
+        size_t lenFilter(0); // length of a filter that is symmetric about the k-space center
+        size_t lenFilterEnd = 2*(end-centerInd)+1;
+        size_t lenFilterStart = 2*(centerInd-start)+1;
+
+        if ( start==0 && end<len-1 )
+        {
+            lenFilter = lenFilterEnd;
+        }
+        else if ( start>0 && end==len-1 )
+        {
+            lenFilter = lenFilterStart;
+        }
+        else if ( start>0 && end<len-1 )
+        {
+            lenFilter = ( (lenFilterStart<lenFilterEnd) ? lenFilterStart : lenFilterEnd );
+        }
+        else
+        {
+            GERROR_STREAM("Invalid inputs : start - end - len : " << start << " " << end << " " << len);
+        }
+
+        GADGET_CHECK_RETURN_FALSE(lenFilter>0);
+
+        hoNDArray<T> filterSym(lenFilter);
+        GADGET_CHECK_RETURN_FALSE(generateSymmetricFilter(lenFilter, 0, lenFilter-1, filterSym, filterType, sigma, width));
+
+        filter.create(len);
+        Gadgetron::clear(&filter);
+
+        if ( start==0 && end<len-1 )
+        {
+            memcpy(filter.begin()+end-lenFilter+1, filterSym.begin(), filterSym.get_number_of_bytes());
+            return true;
+        }
+        else if ( start>0 && end==len-1 )
+        {
+            memcpy(filter.begin()+start, filterSym.begin(), filterSym.get_number_of_bytes());
+            return true;
+        }
+        else if ( start>0 && end<len-1 )
+        {
+            if ( lenFilter == lenFilterStart ) 
+            {
+                memcpy(filter.begin()+start, filterSym.begin(), filterSym.get_number_of_bytes());
+            }
+            else
+            {
+                memcpy(filter.begin()+end-lenFilter+1, filterSym.begin(), filterSym.get_number_of_bytes());
+            }
+
+            return true;
+        }
+        else
+        {
+            GERROR_STREAM("Invalid inputs : start - end - len : " << start << " " << end << " " << len);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::generateSymmetricFilterForRef(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
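+// generateSymmetricFilterForRef builds, for a partially sampled reference region
+// [start, end], a symmetric filter of odd length 2*min(center-start, end-center)+1
+// (center = len/2), zero-pads it to the full length and places it so that it is
+// centered on the k-space center; if the reference covers the full range, the
+// plain symmetric filter is used instead.
+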
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+findSymmetricSampledRegion(size_t start, size_t end, size_t center, size_t& startSym, size_t& endSym)
+{
+    GADGET_CHECK_RETURN_FALSE(end>=start);
+    GADGET_CHECK_RETURN_FALSE(center>=start);
+    GADGET_CHECK_RETURN_FALSE(end>=center);
+
+    size_t halfSizeStart = center - start;
+    size_t halfSizeEnd =  end - center;
+
+    if ( halfSizeStart > halfSizeEnd )
+    {
+        startSym = center - halfSizeEnd;
+        endSym = center + halfSizeEnd;
+    }
+    else
+    {
+        startSym = center - halfSizeStart;
+        endSym = center + halfSizeStart;
+    }
+
+    return true;
+}
+
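+// Worked example for findSymmetricSampledRegion: with start=10, end=120 and
+// center=64, halfSizeStart=54 and halfSizeEnd=56; the smaller half-width is used,
+// giving [startSym, endSym] = [10, 118], i.e. the largest interval centered on
+// `center` that is fully contained in [start, end].
+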
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::computeFilterSNRUnitScaleFactor(const hoNDArray<T>& filter, T& scalFactor)
+{
+    size_t ii, len;
+
+    len = filter.get_number_of_elements();
+    if ( len == 0 )
+    {
+        scalFactor = T(1.0);
+        return true;
+    }
+
+    T sos(0.0);
+    for ( ii=0; ii<len; ii++ )
+    {
+        sos += filter(ii)*filter(ii);
+    }
+
+    scalFactor = (value_type)(1.0/std::sqrt( std::abs(sos)/len ));
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+detectSampledRegion2D(const hoNDArray<T>& data, size_t& startRO, size_t& endRO, size_t& startE1, size_t& endE1)
+{
+    try
+    {
+        size_t NDim = data.get_number_of_dimensions();
+
+        hoNDArray<typename realType<T>::Type> mag(data.get_dimensions()), magSum, magSumE1, magSumRO;
+        Gadgetron::abs(data, mag);
+
+        if ( NDim > 2 )
+        {
+            size_t ii;
+            std::vector<size_t> dim;
+            for ( ii=0; ii<NDim-2; ii++ )
+            {
+                mag.get_dimensions(dim);
+
+                // GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(mag, magSum));
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::sum_over_dimension(mag, magSum, mag.get_number_of_dimensions() - 1));
+
+                std::vector<size_t> dimSum(dim.size()-1);
+                memcpy(&dimSum[0], &dim[0], sizeof(size_t)*dimSum.size());
+                magSum.reshape(dimSum);
+
+                mag = magSum;
+            }
+        }
+
+        size_t RO = mag.get_size(0);
+        size_t E1 = mag.get_size(1);
+
+        startRO = RO-1;
+        endRO = 0;
+
+        startE1 = E1-1;
+        endE1 = 0;
+
+        size_t ro, e1;
+
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(mag, magSumE1, 1));
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(mag, magSumRO, 0));
+
+        for ( ro=0; ro<RO; ro++ )
+        {
+            if ( magSumE1(ro) > 0 )
+            {
+                if ( ro < startRO ) startRO = ro;
+                if ( ro > endRO ) endRO = ro;
+            }
+        }
+
+        for ( e1=0; e1<E1; e1++ )
+        {
+            if ( magSumRO(e1) > 0 )
+            {
+                if ( e1 < startE1 ) startE1 = e1;
+                if ( e1 > endE1 ) endE1 = e1;
+            }
+        }
+
+        if ( startRO > endRO )
+        {
+            startRO = 0;
+            endRO = RO-1;
+        }
+
+        if ( startE1 > endE1 )
+        {
+            startE1 = 0;
+            endE1 = E1-1;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::detectSampledRegion2D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
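+// detectSampledRegion2D collapses all dimensions beyond RO/E1 by summation, then
+// finds the smallest [startRO, endRO] x [startE1, endE1] rectangle whose row and
+// column magnitude sums are non-zero; if nothing is sampled, the full range is
+// returned.
+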
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+detectSampledRegion3D(const hoNDArray<T>& data, size_t& startRO, size_t& endRO, size_t& startE1, size_t& endE1, size_t& startE2, size_t& endE2)
+{
+    try
+    {
+        size_t NDim = data.get_number_of_dimensions();
+
+        hoNDArray<typename realType<T>::Type> mag(data.get_dimensions()), magSum, magSum2, magSumRO, magSumE1, magSumE2;
+        Gadgetron::abs(data, mag);
+
+        if ( NDim > 5 )
+        {
+            std::vector<size_t> dim;
+
+            size_t ii;
+            for ( ii=0; ii<NDim-5; ii++ )
+            {
+                mag.get_dimensions(dim);
+
+                // GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(mag, magSum));
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::sum_over_dimension(mag, magSum, mag.get_number_of_dimensions() - 1));
+
+                std::vector<size_t> dimSum(dim.size() - 1);
+                memcpy(&dimSum[0], &dim[0], sizeof(size_t)*dimSum.size());
+                magSum.reshape(dimSum);
+
+                mag = magSum;
+            }
+        }
+
+        size_t RO = mag.get_size(0);
+        size_t E1 = mag.get_size(1);
+        size_t E2 = mag.get_size(4);
+
+        startRO = RO-1;
+        endRO = 0;
+
+        startE1 = E1-1;
+        endE1 = 0;
+
+        startE2 = E2-1;
+        endE2 = 0;
+
+        size_t ro, e1, e2;
+
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(mag, magSum2, 4));
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(magSum2, magSum, 3));
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(magSum, magSum2, 2));
+
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(magSum2, magSumE1, 1));
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(magSum2, magSumRO, 0));
+
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(mag, magSum2, 3));
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(magSum2, magSum, 2));
+
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(magSum, magSum2, 1));
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(magSum2, magSumE2, 0));
+
+        for ( ro=0; ro<RO; ro++ )
+        {
+            if ( magSumE1(ro) > 0 )
+            {
+                if ( ro < startRO ) startRO = ro;
+                if ( ro > endRO ) endRO = ro;
+            }
+        }
+
+        for ( e1=0; e1<E1; e1++ )
+        {
+            if ( magSumRO(e1) > 0 )
+            {
+                if ( e1 < startE1 ) startE1 = e1;
+                if ( e1 > endE1 ) endE1 = e1;
+            }
+        }
+
+        for ( e2=0; e2<E2; e2++ )
+        {
+            if ( magSumE2(e2) > 0 )
+            {
+                if ( e2 < startE2 ) startE2 = e2;
+                if ( e2 > endE2 ) endE2 = e2;
+            }
+        }
+
+        if ( startRO > endRO )
+        {
+            startRO = 0;
+            endRO = RO-1;
+        }
+
+        if ( startE1 > endE1 )
+        {
+            startE1 = 0;
+            endE1 = E1-1;
+        }
+
+        if ( startE2 > endE2 )
+        {
+            startE2 = 0;
+            endE2 = E2-1;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::detectSampledRegion3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// ------------------------------------------------------------------------
+// coil sensitivity
+// ------------------------------------------------------------------------
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+averageKSpace4D(const hoNDArray<T>& data, hoNDArray<T>& ave)
+{
+    try
+    {
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(data, ave, 3));
+        Gadgetron::scal( (typename realType<T>::Type)(1.0/data.get_size(3)), ave);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::averageKSpace4D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+averageKSpace4D(const hoNDArray<T>& data, hoNDArray<T>& ave, std::vector<size_t>& sampledTimes)
+{
+    try
+    {
+        size_t NDim = data.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=2);
+
+        if ( NDim < 4 )
+        {
+            ave = data;
+            GADGET_CHECK_RETURN_FALSE(detectSampledTimesE1(data, sampledTimes));
+            return true;
+        }
+
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+        size_t CHA = data.get_size(2);
+        size_t N = data.get_size(3);
+
+        hoNDArray<T> data4D(RO, E1, CHA, N, const_cast<T*>(data.begin()));
+        GADGET_CHECK_RETURN_FALSE(detectSampledTimesE1(data4D, sampledTimes));
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(data, ave, 3));
+
+        boost::shared_ptr< std::vector<size_t> > dim = ave.get_dimensions();
+
+        if ( dim->size() != NDim )
+        {
+            (*dim).insert((*dim).begin()+3, 1);
+            ave.reshape(dim.get());
+        }
+
+        hoNDArray<T> sampledTimes2D(RO, E1);
+        T* pTimes = sampledTimes2D.begin();
+        size_t ro, e1;
+        for ( e1=0; e1<E1; e1++ )
+        {
+            double t = (double)sampledTimes[e1];
+            if ( t == 0 ) t = 1;
+
+            for ( ro=0; ro<RO; ro++ )
+            {
+                pTimes[e1*RO+ro] = (value_type)(1.0/t);
+            }
+        }
+
+        // GADGET_CHECK_RETURN_FALSE(multipleMultiply(sampledTimes2D, ave, ave));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(multiply(ave, sampledTimes2D, ave));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::averageKSpace4D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+averageKSpace5D(const hoNDArray<T>& data, hoNDArray<T>& ave)
+{
+    try
+    {
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(data, ave, 4));
+        Gadgetron::scal( (typename realType<T>::Type)(1.0/data.get_size(4)), ave);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::averageKSpace5D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+averageKSpace5D(const hoNDArray<T>& data, hoNDArray<T>& ave, hoNDArray<size_t>& sampledTimes)
+{
+    try
+    {
+        size_t NDim = data.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=3);
+
+        if ( NDim < 5 )
+        {
+            ave = data;
+            GADGET_CHECK_RETURN_FALSE(detectSampledTimesE1E2(data, sampledTimes));
+            return true;
+        }
+
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+        size_t E2 = data.get_size(2);
+        size_t CHA = data.get_size(3);
+        size_t N = data.get_size(4);
+
+        hoNDArray<T> data5D(RO, E1, E2, CHA, N, const_cast<T*>(data.begin()));
+        GADGET_CHECK_RETURN_FALSE(detectSampledTimesE1E2(data5D, sampledTimes));
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(data, ave, 4));
+
+        hoNDArray<T> sampledTimes3D(RO, E1, E2);
+        T* pTimes = sampledTimes3D.begin();
+        size_t ro, e1, e2;
+        for ( e2=0; e2<E2; e2++ )
+        {
+            for ( e1=0; e1<E1; e1++ )
+            {
+                double t = (double)sampledTimes(e1+e2*E1);
+                if ( t == 0 ) t = 1;
+
+                for ( ro=0; ro<RO; ro++ )
+                {
+                    pTimes[e2*RO*E1+e1*RO+ro] = (value_type)(1.0/t);
+                }
+            }
+        }
+
+        // GADGET_CHECK_RETURN_FALSE(multipleMultiply(sampledTimes3D, ave, ave));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(multiply(ave, sampledTimes3D, ave));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::averageKSpace5D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
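+// The averaging above sums over the N dimension and then scales every (RO, E1, E2)
+// location by 1/sampledTimes for that location, so undersampled k-space positions
+// are not diluted by the frames in which they were skipped; for example, a line
+// acquired in only 2 of N=8 frames is divided by 2, not by 8.
+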
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+detectSampledTimesE1(const hoNDArray<T>& data4D, std::vector<size_t>& sampledTimes)
+{
+    try
+    {
+        size_t NDim = data4D.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=2);
+
+        size_t RO = data4D.get_size(0);
+        size_t E1 = data4D.get_size(1);
+        size_t CHA = data4D.get_size(2);
+        size_t N = data4D.get_size(3);
+
+        hoNDArray<typename realType<T>::Type> mag(data4D.get_dimensions());
+        Gadgetron::abs(data4D, mag);
+
+        hoNDArray<typename realType<T>::Type> mag3D(RO, E1, 1, N);
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(mag, mag3D, 2));
+
+        hoNDArray<typename realType<T>::Type> mag2D(1, E1, 1, N);
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(mag3D, mag2D, 0));
+        typename realType<T>::Type* pMag2D = mag2D.begin();
+
+        sampledTimes.resize(E1, 0);
+
+        size_t e1, n;
+        for ( e1=0; e1<E1; e1++ )
+        {
+            for ( n=0; n<N; n++ )
+            {
+                if ( pMag2D[e1+n*E1] > 0 )
+                {
+                    sampledTimes[e1]++;
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::detectSampledTimesE1(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+detectSampledRegionE1(const hoNDArray<T>& data, size_t& startE1, size_t& endE1)
+{
+    try
+    {
+        std::vector<size_t> sampledTimes;
+        GADGET_CHECK_RETURN_FALSE(detectSampledTimesE1(data, sampledTimes));
+
+        size_t E1 = sampledTimes.size();
+
+        startE1 = E1-1;
+        endE1 = 0;
+
+        for ( size_t e1=0; e1<E1; e1++ )
+        {
+            if ( sampledTimes[e1] > 0 )
+            {
+                if ( e1 > endE1 ) endE1 = e1;
+                if ( e1 < startE1 ) startE1 = e1;
+            }
+        }
+
+        if ( endE1 < startE1 )
+        {
+            startE1 = 0;
+            endE1 = E1-1;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::detectSampledRegionE1(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+detectSampledTimesE1E2(const hoNDArray<T>& data5D, hoNDArray<size_t>& sampledTimes)
+{
+    try
+    {
+        size_t NDim = data5D.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=3);
+
+        size_t RO = data5D.get_size(0);
+        size_t E1 = data5D.get_size(1);
+        size_t E2 = data5D.get_size(2);
+        size_t CHA = data5D.get_size(3);
+        size_t N = data5D.get_size(4);
+
+        hoNDArray<typename realType<T>::Type> mag(RO, E1, E2);
+
+        hoNDArray<T> dataFirstChannel(RO, E1, E2, const_cast<T*>(data5D.begin()));
+        Gadgetron::abs(dataFirstChannel, mag);
+
+        hoNDArray<typename realType<T>::Type> mag3D(1, E1, E2);
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(mag, mag3D, 0));
+
+        typename realType<T>::Type* pMag3D = mag3D.begin();
+
+        sampledTimes.create(E1, E2);
+        Gadgetron::clear(sampledTimes);
+        size_t* pTimes = sampledTimes.get_data_ptr();
+
+        size_t e1, e2, n;
+        for ( e2=0; e2<E2; e2++ )
+        {
+            for ( e1=0; e1<E1; e1++ )
+            {
+                for ( n=0; n<N; n++ )
+                {
+                    if ( pMag3D[e1+e2*E1+n*E1*E2] > 0 )
+                    {
+                        pTimes[e1+e2*E1]++;
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::detectSampledTimesE1E2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+detectSampledRegionE1E2(const hoNDArray<T>& data, size_t& startE1, size_t& endE1, size_t& startE2, size_t& endE2)
+{
+    try
+    {
+        hoNDArray<size_t> sampledTimes;
+        GADGET_CHECK_RETURN_FALSE(detectSampledTimesE1E2(data, sampledTimes));
+
+        size_t E1 = sampledTimes.get_size(0);
+        size_t E2 = sampledTimes.get_size(1);
+
+        startE1 = E1-1;
+        endE1 = 0;
+
+        startE2 = E2-1;
+        endE2 = 0;
+
+        size_t e1, e2;
+        for ( e2=0; e2<E2; e2++ )
+        {
+            for ( e1=0; e1<E1; e1++ )
+            {
+                if ( sampledTimes(e1+e2*E1) > 0 )
+                {
+                    if ( e1 > endE1 ) endE1 = e1;
+                    if ( e1 < startE1 ) startE1 = e1;
+
+                    if ( e2 > endE2 ) endE2 = e2;
+                    if ( e2 < startE2 ) startE2 = e2;
+                }
+            }
+        }
+
+        if ( endE1 < startE1 )
+        {
+            startE1 = 0;
+            endE1 = E1-1;
+        }
+
+        if ( endE2 < startE2 )
+        {
+            startE2 = 0;
+            endE2 = E2-1;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::detectSampledRegionE1E2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+copyAlongE1(const hoNDArray<T>& src, hoNDArray<T>& dst, size_t startE1, size_t endE1)
+{
+    try
+    {
+        size_t NDim = src.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=2);
+
+        size_t RO = dst.get_size(0);
+        size_t E1 = dst.get_size(1);
+
+        size_t RO_src = src.get_size(0);
+        size_t E1_src = src.get_size(1);
+
+        GADGET_CHECK_RETURN_FALSE(RO==RO_src);
+        GADGET_CHECK_RETURN_FALSE(E1==E1_src);
+        GADGET_CHECK_RETURN_FALSE(src.get_number_of_elements()==dst.get_number_of_elements());
+
+        if ( (startE1>=E1) || (endE1>=E1) || (startE1>endE1) )
+        {
+            dst = src;
+            GWARN_STREAM("copyAlongE1(...) : (startE1>=E1) || (endE1>=E1) || (startE1>endE1) ... ");
+            return true;
+        }
+
+        size_t N = dst.get_number_of_elements()/(RO*E1);
+
+        size_t n, e1;
+        for ( n=0; n<N; n++ )
+        {
+            for ( e1=startE1; e1<=endE1; e1++ )
+            {
+                memcpy(dst.begin()+n*RO*E1+e1*RO, src.begin()+n*RO*E1+e1*RO, sizeof(T)*RO);
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::copyAlongE1(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+copyAlongROE1(const hoNDArray<T>& src, hoNDArray<T>& dst, size_t startRO, size_t endRO, size_t startE1, size_t endE1)
+{
+    try
+    {
+        size_t NDim = src.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=2);
+
+        size_t RO = dst.get_size(0);
+        size_t E1 = dst.get_size(1);
+
+        size_t RO_src = src.get_size(0);
+        size_t E1_src = src.get_size(1);
+
+        GADGET_CHECK_RETURN_FALSE(RO==RO_src);
+        GADGET_CHECK_RETURN_FALSE(E1==E1_src);
+        GADGET_CHECK_RETURN_FALSE(src.get_number_of_elements()==dst.get_number_of_elements());
+
+        if ( (startRO>=RO) || (endRO>=RO) || (startRO>endRO) )
+        {
+            dst = src;
+            GWARN_STREAM("copyAlongROE1(...) : (startRO>=RO) || (endRO>=RO) || (startRO>endRO) ... ");
+            return true;
+        }
+
+        if ( (startE1>=E1) || (endE1>=E1) || (startE1>endE1) )
+        {
+            dst = src;
+            GWARN_STREAM("copyAlongROE1(...) : (startE1>=E1) || (endE1>=E1) || (startE1>endE1) ... ");
+            return true;
+        }
+
+        size_t N = dst.get_number_of_elements()/(RO*E1);
+        const T* pSrc = src.begin();
+        T* pDst = dst.begin();
+
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, pSrc, pDst, RO, E1, startRO, endRO, startE1, endE1)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            for ( size_t e1=startE1; e1<=endE1; e1++ )
+            {
+                size_t offset = n*RO*E1+e1*RO+startRO;
+                memcpy(pDst+offset, pSrc+offset, sizeof(T)*(endRO-startRO+1));
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::copyAlongROE1(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+copyAlongROE1E2(const hoNDArray<T>& src, hoNDArray<T>& dst, size_t startRO, size_t endRO, size_t startE1, size_t endE1, size_t startE2, size_t endE2)
+{
+    try
+    {
+        size_t NDim = src.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=2);
+
+        size_t RO = dst.get_size(0);
+        size_t E1 = dst.get_size(1);
+        size_t E2 = dst.get_size(2);
+
+        size_t RO_src = src.get_size(0);
+        size_t E1_src = src.get_size(1);
+        size_t E2_src = src.get_size(2);
+
+        GADGET_CHECK_RETURN_FALSE(RO==RO_src);
+        GADGET_CHECK_RETURN_FALSE(E1==E1_src);
+        GADGET_CHECK_RETURN_FALSE(E2==E2_src);
+        GADGET_CHECK_RETURN_FALSE(src.get_number_of_elements()==dst.get_number_of_elements());
+
+        if ( (startRO>=RO) || (endRO>=RO) || (startRO>endRO) )
+        {
+            dst = src;
+            GWARN_STREAM("copyAlongROE1E2(...) : (startRO>=RO) || (endRO>=RO) || (startRO>endRO) ... ");
+            return true;
+        }
+
+        if ( (startE1>=E1) || (endE1>=E1) || (startE1>endE1) )
+        {
+            dst = src;
+            GWARN_STREAM("copyAlongROE1E2(...) : (startE1>=E1) || (endE1>=E1) || (startE1>endE1) ... ");
+            return true;
+        }
+
+        if ( (startE2>=E2) || (endE2>=E2) || (startE2>endE2) )
+        {
+            dst = src;
+            GWARN_STREAM("copyAlongROE1E2(...) : (startE2>=E2) || (endE2>=E2) || (startE2>endE2) ... ");
+            return true;
+        }
+
+        size_t N = dst.get_number_of_elements()/(RO*E1*E2);
+        const T* pSrc = src.begin();
+        T* pDst = dst.begin();
+
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(N, pSrc, pDst, RO, E1, E2, startRO, endRO, startE1, endE1, startE2, endE2)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            for ( size_t e2=startE2; e2<=endE2; e2++ )
+            {
+                for ( size_t e1=startE1; e1<=endE1; e1++ )
+                {
+                    size_t offset = n*RO*E1*E2+e2*E1*RO+e1*RO+startRO;
+                    memcpy(pDst+offset, pSrc+offset, sizeof(T)*(endRO-startRO+1));
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::copyAlongROE1E2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+copyAlongROE1TransitionBand(const hoNDArray<T>& src, hoNDArray<T>& dst, size_t startRO, size_t endRO, 
+        size_t startE1, size_t endE1, size_t transBandRO, size_t transBandE1)
+{
+    try
+    {
+        size_t NDim = src.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=2);
+
+        size_t RO = dst.get_size(0);
+        size_t E1 = dst.get_size(1);
+
+        size_t RO_src = src.get_size(0);
+        size_t E1_src = src.get_size(1);
+
+        GADGET_CHECK_RETURN_FALSE(RO==RO_src);
+        GADGET_CHECK_RETURN_FALSE(E1==E1_src);
+        GADGET_CHECK_RETURN_FALSE(src.get_number_of_elements()==dst.get_number_of_elements());
+
+        if ( (startRO>=RO) || (endRO>=RO) || (startRO>endRO) )
+        {
+            dst = src;
+            GWARN_STREAM("copyAlongROE1TransitionBand(...) : (startRO>=RO) || (endRO>=RO) || (startRO>endRO) ... ");
+            return true;
+        }
+
+        if ( (startE1>=E1) || (endE1>=E1) || (startE1>endE1) )
+        {
+            dst = src;
+            GWARN_STREAM("copyAlongROE1TransitionBand(...) : (startE1>=E1) || (endE1>=E1) || (startE1>endE1) ... ");
+            return true;
+        }
+
+        while ( transBandRO>1 && startRO+transBandRO > RO/2 )
+        {
+             transBandRO--;
+        }
+
+        while ( transBandRO>1 && endRO-transBandRO < RO/2 )
+        {
+             transBandRO--;
+        }
+
+        while ( transBandE1>1 && startE1+transBandE1 > E1/2 )
+        {
+             transBandE1--;
+        }
+
+        while ( transBandE1>1 && endE1-transBandE1 < E1/2 )
+        {
+             transBandE1--;
+        }
+
+        ISMRMRDKSPACEFILTER filterType = ISMRMRD_FILTER_TAPERED_HANNING;
+        bool densityComp = false;
+
+        hoNDArray<T> filter_src_RO, filter_src_E1;
+
+        if ( startRO==0 && endRO==RO-1 )
+        {
+            GADGET_CHECK_RETURN_FALSE(generateAsymmetricFilter(RO, startRO, endRO, filter_src_RO, ISMRMRD_FILTER_NONE, transBandRO, densityComp));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(generateAsymmetricFilter(RO, startRO, endRO, filter_src_RO, filterType, transBandRO, densityComp));
+        }
+
+        if ( startE1==0 && endE1==E1-1 )
+        {
+            GADGET_CHECK_RETURN_FALSE(generateAsymmetricFilter(E1, startE1, endE1, filter_src_E1, ISMRMRD_FILTER_NONE, transBandE1, densityComp));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(generateAsymmetricFilter(E1, startE1, endE1, filter_src_E1, filterType, transBandE1, densityComp));
+        }
+
+        // in this way, the SNR unit scale property is preserved
+        T midValue = filter_src_RO(RO/2);
+        T scalFactor = T(1.0)/midValue;
+        Gadgetron::scal(scalFactor, filter_src_RO);
+
+        midValue = filter_src_E1(E1/2);
+        scalFactor = T(1.0)/midValue;
+        Gadgetron::scal(scalFactor, filter_src_E1);
+
+        hoNDArray<T> filter_dst_RO(RO), filter_dst_E1(E1);
+
+        size_t ii;
+        for ( ii=0; ii<RO; ii++ )
+        {
+            filter_dst_RO(ii) = T(1.0) - filter_src_RO(ii);
+        }
+
+        for ( ii=0; ii<E1; ii++ )
+        {
+            filter_dst_E1(ii) = T(1.0) - filter_src_E1(ii);
+        }
+
+        hoNDArray<T> srcFiltered(src), dstFiltered(dst);
+        if ( startRO==0 && endRO==RO-1 )
+        {
+            GADGET_CHECK_RETURN_FALSE(kspacefilterE1(src, filter_src_E1, srcFiltered));
+            GADGET_CHECK_RETURN_FALSE(kspacefilterE1(dst, filter_dst_E1, dstFiltered));
+        }
+        else if ( startE1==0 && endE1==E1-1 )
+        {
+            GADGET_CHECK_RETURN_FALSE(kspacefilterRO(src, filter_src_RO, srcFiltered));
+            GADGET_CHECK_RETURN_FALSE(kspacefilterRO(dst, filter_dst_RO, dstFiltered));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(kspacefilterROE1(src, filter_src_RO, filter_src_E1, srcFiltered));
+
+            hoNDArray<T> fxy;
+            GADGET_CHECK_RETURN_FALSE(compute2DFilterFromTwo1D(filter_src_RO, filter_src_E1, fxy));
+
+            size_t Nxy = RO*E1;
+            for ( ii=0; ii<Nxy; ii++ )
+            {
+                fxy(ii) = T(1.0) - fxy(ii);
+            }
+
+            GADGET_CHECK_RETURN_FALSE(kspacefilterROE1(dst, fxy, dstFiltered));
+        }
+
+        Gadgetron::add(srcFiltered, dstFiltered, dst);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::copyAlongROE1TransitionBand(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
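+// The transition-band copy above blends the two arrays with complementary filters:
+// dst <- F_src .* src + (1 - F_src) .* dst, where F_src is ~1 over the sampled
+// region of src ([startRO, endRO] x [startE1, endE1]) and rolls off over the
+// transition band. Dividing each 1D filter by its center value keeps the blend at
+// unit gain in the fully sampled center of k-space, preserving the SNR-unit scale.
+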
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+copyAlongROE1E2TransitionBand(const hoNDArray<T>& src, hoNDArray<T>& dst, size_t startRO, size_t endRO, 
+                        size_t startE1, size_t endE1, size_t startE2, size_t endE2, 
+                        size_t transBandRO, size_t transBandE1, size_t transBandE2)
+{
+    try
+    {
+        size_t NDim = src.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=3);
+
+        size_t RO = dst.get_size(0);
+        size_t E1 = dst.get_size(1);
+        size_t E2 = dst.get_size(2);
+
+        size_t RO_src = src.get_size(0);
+        size_t E1_src = src.get_size(1);
+        size_t E2_src = src.get_size(2);
+
+        GADGET_CHECK_RETURN_FALSE(RO==RO_src);
+        GADGET_CHECK_RETURN_FALSE(E1==E1_src);
+        GADGET_CHECK_RETURN_FALSE(E2==E2_src);
+        GADGET_CHECK_RETURN_FALSE(src.get_number_of_elements()==dst.get_number_of_elements());
+
+        if ( (startRO>=RO) || (endRO>=RO) || (startRO>endRO) )
+        {
+            dst = src;
+            GWARN_STREAM("copyAlongROE1E2TransitionBand(...) : (startRO>=RO) || (endRO>=RO) || (startRO>endRO) ... ");
+            return true;
+        }
+
+        if ( (startE1>=E1) || (endE1>=E1) || (startE1>endE1) )
+        {
+            dst = src;
+            GWARN_STREAM("copyAlongROE1E2TransitionBand(...) : (startE1>=E1) || (endE1>=E1) || (startE1>endE1) ... ");
+            return true;
+        }
+
+        if ( (startE2>=E2) || (endE2>=E2) || (startE2>endE2) )
+        {
+            dst = src;
+            GWARN_STREAM("copyAlongROE1E2TransitionBand(...) : (startE2>=E2) || (endE2>=E2) || (startE2>endE2) ... ");
+            return true;
+        }
+
+        while ( transBandRO>1 && startRO+transBandRO > RO/2 )
+        {
+             transBandRO--;
+        }
+
+        while ( transBandRO>1 && endRO-transBandRO < RO/2 )
+        {
+             transBandRO--;
+        }
+
+        while ( transBandE1>1 && startE1+transBandE1 > E1/2 )
+        {
+             transBandE1--;
+        }
+
+        while ( transBandE1>1 && endE1-transBandE1 < E1/2 )
+        {
+             transBandE1--;
+        }
+
+        while ( transBandE2>1 && startE2+transBandE2 > E2/2 )
+        {
+             transBandE2--;
+        }
+
+        while ( transBandE2>1 && endE2-transBandE2 < E2/2 )
+        {
+             transBandE2--;
+        }
+
+        ISMRMRDKSPACEFILTER filterType = ISMRMRD_FILTER_TAPERED_HANNING;
+        bool densityComp = false;
+
+        hoNDArray<T> filter_src_RO, filter_src_E1, filter_src_E2;
+
+        if ( startRO==0 && endRO==RO-1 )
+        {
+            GADGET_CHECK_RETURN_FALSE(generateAsymmetricFilter(RO, startRO, endRO, filter_src_RO, ISMRMRD_FILTER_NONE, transBandRO, densityComp));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(generateAsymmetricFilter(RO, startRO, endRO, filter_src_RO, filterType, transBandRO, densityComp));
+        }
+
+        if ( startE1==0 && endE1==E1-1 )
+        {
+            GADGET_CHECK_RETURN_FALSE(generateAsymmetricFilter(E1, startE1, endE1, filter_src_E1, ISMRMRD_FILTER_NONE, transBandE1, densityComp));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(generateAsymmetricFilter(E1, startE1, endE1, filter_src_E1, filterType, transBandE1, densityComp));
+        }
+
+        if ( startE2==0 && endE2==E2-1 )
+        {
+            GADGET_CHECK_RETURN_FALSE(generateAsymmetricFilter(E2, startE2, endE2, filter_src_E2, ISMRMRD_FILTER_NONE, transBandE2, densityComp));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(generateAsymmetricFilter(E2, startE2, endE2, filter_src_E2, filterType, transBandE2, densityComp));
+        }
+
+        // in this way, the SNR unit scale property is preserved
+        T midValue = filter_src_RO(RO/2);
+        T scalFactor = T(1.0)/midValue;
+        Gadgetron::scal(scalFactor, filter_src_RO);
+
+        midValue = filter_src_E1(E1/2);
+        scalFactor = T(1.0)/midValue;
+        Gadgetron::scal(scalFactor, filter_src_E1);
+
+        midValue = filter_src_E2(E2/2);
+        scalFactor = T(1.0)/midValue;
+        Gadgetron::scal(scalFactor, filter_src_E2);
+
+        hoNDArray<T> filter_dst_RO(RO), filter_dst_E1(E1), filter_dst_E2(E2);
+
+        size_t ii;
+        for ( ii=0; ii<RO; ii++ )
+        {
+            filter_dst_RO(ii) = T(1.0) - filter_src_RO(ii);
+        }
+
+        for ( ii=0; ii<E1; ii++ )
+        {
+            filter_dst_E1(ii) = T(1.0) - filter_src_E1(ii);
+        }
+
+        for ( ii=0; ii<E2; ii++ )
+        {
+            filter_dst_E2(ii) = T(1.0) - filter_src_E2(ii);
+        }
+
+        hoNDArray<T> srcFiltered(src), dstFiltered(dst);
+        if ( startRO>=0 && endRO<=RO-1 && startE1==0 && endE1==E1-1 && startE2==0 && endE2==E2-1 )
+        {
+            GADGET_CHECK_RETURN_FALSE(kspacefilterRO(src, filter_src_RO, srcFiltered));
+            GADGET_CHECK_RETURN_FALSE(kspacefilterRO(dst, filter_dst_RO, dstFiltered));
+        }
+        else if ( startRO==0 && endRO==RO-1 && startE1>=0 && endE1<=E1-1 && startE2==0 && endE2==E2-1 )
+        {
+            GADGET_CHECK_RETURN_FALSE(kspacefilterE1(src, filter_src_E1, srcFiltered));
+            GADGET_CHECK_RETURN_FALSE(kspacefilterE1(dst, filter_dst_E1, dstFiltered));
+        }
+        else if ( startRO==0 && endRO==RO-1 && startE1==0 && endE1==E1-1 && startE2>=0 && endE2<=E2-1 )
+        {
+            GADGET_CHECK_RETURN_FALSE(kspace3DfilterE2(src, filter_src_E2, srcFiltered));
+            GADGET_CHECK_RETURN_FALSE(kspace3DfilterE2(dst, filter_dst_E2, dstFiltered));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(kspace3DfilterROE1E2(src, filter_src_RO, filter_src_E1, filter_src_E2, srcFiltered));
+
+            hoNDArray<T> fxyz;
+            GADGET_CHECK_RETURN_FALSE(compute3DFilterFromThree1D(filter_src_RO, filter_src_E1, filter_src_E2, fxyz));
+
+            size_t Nxyz = RO*E1*E2;
+            for ( ii=0; ii<Nxyz; ii++ )
+            {
+                fxyz(ii) = T(1.0) - fxyz(ii);
+            }
+
+            GADGET_CHECK_RETURN_FALSE(kspace3DfilterROE1E2(dst, fxyz, dstFiltered));
+        }
+
+        Gadgetron::add(srcFiltered, dstFiltered, dst);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::copyAlongROE1E2TransitionBand(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+std::string gtPlusISMRMRDReconUtil<T>::getISMRMRDDimName(const ISMRMRDDIM& dim)
+{
+    std::ostringstream os;
+    switch (dim)
+    {
+        case DIM_ReadOut:
+            os << "DIM_ReadOut";
+        break;
+
+        case DIM_Encoding1:
+            os << "DIM_Encoding1";
+        break;
+
+        case DIM_Channel:
+            os << "DIM_Channel";
+        break;
+
+        case DIM_Slice:
+            os << "DIM_Slice";
+        break;
+
+        case DIM_Encoding2:
+            os << "DIM_Encoding2";
+        break;
+
+        case DIM_Contrast:
+            os << "DIM_Contrast";
+        break;
+
+        case DIM_Phase:
+            os << "DIM_Phase";
+        break;
+
+        case DIM_Repetition:
+            os << "DIM_Repetition";
+        break;
+
+        case DIM_Set:
+            os << "DIM_Set";
+        break;
+
+        case DIM_Segment:
+            os << "DIM_Segment";
+        break;
+
+        case DIM_Average:
+            os << "DIM_Average";
+        break;
+
+        case DIM_other1:
+            os << "DIM_other1";
+        break;
+
+        case DIM_other2:
+            os << "DIM_other2";
+        break;
+
+        case DIM_other3:
+            os << "DIM_other3";
+        break;
+
+        default:
+            os << "DIM_NONE";
+    }
+
+    std::string dimStr(os.str());
+    return dimStr;
+}
+
+template <typename T> 
+ISMRMRDDIM gtPlusISMRMRDReconUtil<T>::getISMRMRDDimFromName(const std::string& name)
+{
+    if ( name == "DIM_ReadOut" ) return DIM_ReadOut;
+    if ( name == "DIM_Encoding1" ) return DIM_Encoding1;
+    if ( name == "DIM_Channel" ) return DIM_Channel;
+    if ( name == "DIM_Slice" ) return DIM_Slice;
+    if ( name == "DIM_Encoding2" ) return DIM_Encoding2;
+    if ( name == "DIM_Contrast" ) return DIM_Contrast;
+    if ( name == "DIM_Phase" ) return DIM_Phase;
+    if ( name == "DIM_Repetition" ) return DIM_Repetition;
+    if ( name == "DIM_Set" ) return DIM_Set;
+    if ( name == "DIM_Segment" ) return DIM_Segment;
+    if ( name == "DIM_Average" ) return DIM_Average;
+    if ( name == "DIM_other1" ) return DIM_other1;
+    if ( name == "DIM_other2" ) return DIM_other2;
+    if ( name == "DIM_other3" ) return DIM_other3;
+
+    return DIM_NONE;
+}
+
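+// Map an ISMRMRD dimension to its index in the 11-D working array, ordered as
+// [RO E1 Cha Slice E2 Contrast Phase Rep Set Seg Ave]; ind is set to -1 for unsupported dimensions.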
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::getISMRMRDDimIndex(const ISMRMRDDIM& dim, long long& ind)
+{
+    switch (dim)
+    {
+        case Gadgetron::DIM_ReadOut:
+            ind = 0;
+        break;
+
+        case Gadgetron::DIM_Encoding1:
+            ind = 1;
+        break;
+
+        case Gadgetron::DIM_Channel:
+            ind = 2;
+        break;
+
+        case Gadgetron::DIM_Slice:
+            ind = 3;
+        break;
+
+        case Gadgetron::DIM_Encoding2:
+            ind = 4;
+        break;
+
+        case Gadgetron::DIM_Contrast:
+            ind = 5;
+        break;
+
+        case Gadgetron::DIM_Phase:
+            ind = 6;
+        break;
+
+        case Gadgetron::DIM_Repetition:
+            ind = 7;
+        break;
+
+        case Gadgetron::DIM_Set:
+            ind = 8;
+        break;
+
+        case Gadgetron::DIM_Segment:
+            ind = 9;
+        break;
+
+        case Gadgetron::DIM_Average:
+            ind = 10;
+        break;
+
+        default:
+            ind = -1;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::findDimIndex(const std::vector<DimensionRecordType>& dimStartingIndexes, ISMRMRDDIM dim, size_t& ind)
+{
+    size_t N = dimStartingIndexes.size();
+
+    size_t n;
+    for ( n=0; n<N; n++ )
+    {
+        if ( dimStartingIndexes[n].first == dim )
+        {
+            ind = dimStartingIndexes[n].second;
+            return true;
+        }
+    }
+
+    return false;
+}
+
+template <typename T> 
+ISMRMRDALGO gtPlusISMRMRDReconUtil<T>::getISMRMRDReconAlgoFromName(const std::string& name)
+{
+    if ( name == "ISMRMRD_GRAPPA" ) return ISMRMRD_GRAPPA;
+    if ( name == "ISMRMRD_SENSE" ) return ISMRMRD_SENSE;
+    if ( name == "ISMRMRD_SPIRIT" ) return ISMRMRD_SPIRIT;
+    if ( name == "ISMRMRD_L1SPIRIT" ) return ISMRMRD_L1SPIRIT;
+    if ( name == "ISMRMRD_SOFTSENSE" ) return ISMRMRD_SOFTSENSE;
+    if ( name == "ISMRMRD_L1SOFTSENSE" ) return ISMRMRD_L1SOFTSENSE;
+    if ( name == "ISMRMRD_2DTBINNING" ) return ISMRMRD_2DTBINNING;
+    if ( name == "ISMRMRD_2DTBINNING_FLOW" ) return ISMRMRD_2DTBINNING_FLOW;
+    if ( name == "ISMRMRD_L1SPIRIT_SLEP" ) return ISMRMRD_L1SPIRIT_SLEP;
+    if ( name == "ISMRMRD_L1SPIRIT_SLEP_MOTION_COMP" ) return ISMRMRD_L1SPIRIT_SLEP_MOTION_COMP;
+
+    return ISMRMRD_NONE;
+}
+
+template <typename T> 
+ISMRMRDCOILMAPALGO gtPlusISMRMRDReconUtil<T>::getISMRMRDCoilMapAlgoFromName(const std::string& name)
+{
+    if ( name == "ISMRMRD_SOUHEIL" ) return ISMRMRD_SOUHEIL;
+    if ( name == "ISMRMRD_SOUHEIL_ITER" ) return ISMRMRD_SOUHEIL_ITER;
+
+    return ISMRMRD_SOUHEIL;
+}
+
+template <typename T> 
+ISMRMRDPFALGO gtPlusISMRMRDReconUtil<T>::getISMRMRDPartialFourierReconAlgoFromName(const std::string& name)
+{
+    if ( name == "ISMRMRD_PF_HOMODYNE" ) return ISMRMRD_PF_HOMODYNE;
+    if ( name == "ISMRMRD_PF_FENGHUANG" ) return ISMRMRD_PF_FENGHUANG;
+    if ( name == "ISMRMRD_PF_POCS" ) return ISMRMRD_PF_POCS;
+    if ( name == "ISMRMRD_PF_ZEROFILLING_FILTER" ) return ISMRMRD_PF_ZEROFILLING_FILTER;
+    if ( name == "ISMRMRD_PF_ZEROFILLING" ) return ISMRMRD_PF_ZEROFILLING;
+
+    return ISMRMRD_PF_NONE;
+}
+
+template <typename T> 
+std::string gtPlusISMRMRDReconUtil<T>::getNameFromISMRMRDPartialFourierReconAlgo(ISMRMRDPFALGO algo)
+{
+    if ( algo == ISMRMRD_PF_HOMODYNE ) return std::string("ISMRMRD_PF_HOMODYNE");
+    if ( algo == ISMRMRD_PF_FENGHUANG ) return std::string("ISMRMRD_PF_FENGHUANG");
+    if ( algo == ISMRMRD_PF_ZEROFILLING_FILTER ) return std::string("ISMRMRD_PF_ZEROFILLING_FILTER");
+    if ( algo == ISMRMRD_PF_POCS ) return std::string("ISMRMRD_PF_POCS");
+    if ( algo == ISMRMRD_PF_ZEROFILLING ) return std::string("ISMRMRD_PF_ZEROFILLING");
+
+    return std::string("ISMRMRD_PF_NONE");
+}
+
+template <typename T> 
+ISMRMRDKSPACEFILTER gtPlusISMRMRDReconUtil<T>::
+getISMRMRDKSpaceFilterFromName(const std::string& name)
+{
+    if ( name == "ISMRMRD_FILTER_GAUSSIAN" ) return ISMRMRD_FILTER_GAUSSIAN;
+    if ( name == "ISMRMRD_FILTER_HANNING" ) return ISMRMRD_FILTER_HANNING;
+    if ( name == "ISMRMRD_FILTER_TUKEY" ) return ISMRMRD_FILTER_TUKEY;
+    if ( name == "ISMRMRD_FILTER_TAPERED_HANNING" ) return ISMRMRD_FILTER_TAPERED_HANNING;
+    if ( name == "ISMRMRD_FILTER_NONE" ) return ISMRMRD_FILTER_NONE;
+
+    return ISMRMRD_FILTER_NONE;
+}
+
+template <typename T> 
+ISMRMRDINTERPRETROGATING gtPlusISMRMRDReconUtil<T>::getISMRMRDRetroGatingInterpFromName(const std::string& name)
+{
+    if ( name == "ISMRMRD_INTERP_RETRO_GATING_LINEAR" ) return ISMRMRD_INTERP_RETRO_GATING_LINEAR;
+    if ( name == "ISMRMRD_INTERP_RETRO_GATING_CUBIC" ) return ISMRMRD_INTERP_RETRO_GATING_CUBIC;
+    if ( name == "ISMRMRD_INTERP_RETRO_GATING_BSPLINE" ) return ISMRMRD_INTERP_RETRO_GATING_BSPLINE;
+
+    return ISMRMRD_INTERP_RETRO_GATING_LINEAR;
+}
+
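+// Extract a sub-array along one dimension: when lessEqual is true, all indexes 0..value of dim are kept;
+// otherwise only the single index value is kept (all other dimensions are copied in full).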
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+extractSubArrayForDim(const hoNDArray<T>& x, hoNDArray<T>& r, ISMRMRDDIM& dim, size_t value, bool lessEqual)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dimX = x.get_dimensions();
+
+        long long dimInd;
+        GADGET_CHECK_RETURN_FALSE(getISMRMRDDimIndex(dim, dimInd));
+
+        GADGET_CHECK_RETURN_FALSE(value<(*dimX)[dimInd]);
+
+        std::vector<size_t> crop_offset(11, 0);
+        crop_offset[0] = 0;
+        crop_offset[1] = 0;
+        crop_offset[2] = 0;
+        crop_offset[3] = 0;
+        crop_offset[4] = 0;
+        crop_offset[5] = 0;
+        crop_offset[6] = 0;
+        crop_offset[7] = 0;
+        crop_offset[8] = 0;
+        crop_offset[9] = 0;
+        crop_offset[10] = 0;
+
+        std::vector<size_t> crop_size(11, 0);
+        crop_size[0] = (*dimX)[0];
+        crop_size[1] = (*dimX)[1];
+        crop_size[2] = (*dimX)[2];
+        crop_size[3] = (*dimX)[3];
+        crop_size[4] = (*dimX)[4];
+        crop_size[5] = (*dimX)[5];
+        crop_size[6] = (*dimX)[6];
+        crop_size[7] = (*dimX)[7];
+        crop_size[8] = (*dimX)[8];
+        crop_size[9] = (*dimX)[9];
+        crop_size[10] = (*dimX)[10];
+
+        if ( lessEqual )
+        {
+            crop_size[dimInd] = value+1;
+        }
+        else
+        {
+            crop_offset[dimInd] = value;
+            crop_size[dimInd] = 1;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::cropUpTo11DArray(x, r, crop_offset, crop_size));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::extractSubArrayForDim(dim, value) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+extractSubArrayForDim(const hoNDArray<T>& x, hoNDArray<T>& r, ISMRMRDDIM& dim1, size_t value1, ISMRMRDDIM& dim2, size_t value2, bool lessEqual)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dimX = x.get_dimensions();
+
+        long long dimInd1, dimInd2;
+        GADGET_CHECK_RETURN_FALSE(getISMRMRDDimIndex(dim1, dimInd1));
+        GADGET_CHECK_RETURN_FALSE(getISMRMRDDimIndex(dim2, dimInd2));
+
+        GADGET_CHECK_RETURN_FALSE(value1<(*dimX)[dimInd1]);
+        GADGET_CHECK_RETURN_FALSE(value2<(*dimX)[dimInd2]);
+
+        std::vector<size_t> crop_offset(11, 0);
+        crop_offset[0] = 0;
+        crop_offset[1] = 0;
+        crop_offset[2] = 0;
+        crop_offset[3] = 0;
+        crop_offset[4] = 0;
+        crop_offset[5] = 0;
+        crop_offset[6] = 0;
+        crop_offset[7] = 0;
+        crop_offset[8] = 0;
+        crop_offset[9] = 0;
+        crop_offset[10] = 0;
+
+        std::vector<size_t> crop_size(11, 0);
+        crop_size[0] = (*dimX)[0];
+        crop_size[1] = (*dimX)[1];
+        crop_size[2] = (*dimX)[2];
+        crop_size[3] = (*dimX)[3];
+        crop_size[4] = (*dimX)[4];
+        crop_size[5] = (*dimX)[5];
+        crop_size[6] = (*dimX)[6];
+        crop_size[7] = (*dimX)[7];
+        crop_size[8] = (*dimX)[8];
+        crop_size[9] = (*dimX)[9];
+        crop_size[10] = (*dimX)[10];
+
+        if ( lessEqual )
+        {
+            crop_size[dimInd1] = value1+1;
+            crop_size[dimInd2] = value2+1;
+        }
+        else
+        {
+            crop_offset[dimInd1] = value1;
+            crop_size[dimInd1] = 1;
+
+            crop_offset[dimInd2] = value2;
+            crop_size[dimInd2] = 1;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::cropUpTo11DArray(x, r, crop_offset, crop_size));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::extractSubArrayForDim(dim1, value1, dim2, value2) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+extractSubArrayForDim1LessEqualDim2Equal(const hoNDArray<T>& x, hoNDArray<T>& r, ISMRMRDDIM& dim1, size_t value1, ISMRMRDDIM& dim2, size_t value2)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dimX = x.get_dimensions();
+
+        long long dimInd1, dimInd2;
+        GADGET_CHECK_RETURN_FALSE(getISMRMRDDimIndex(dim1, dimInd1));
+        GADGET_CHECK_RETURN_FALSE(getISMRMRDDimIndex(dim2, dimInd2));
+
+        GADGET_CHECK_RETURN_FALSE(value1<(*dimX)[dimInd1]);
+        GADGET_CHECK_RETURN_FALSE(value2<(*dimX)[dimInd2]);
+
+        std::vector<size_t> crop_offset(11, 0);
+        crop_offset[0] = 0;
+        crop_offset[1] = 0;
+        crop_offset[2] = 0;
+        crop_offset[3] = 0;
+        crop_offset[4] = 0;
+        crop_offset[5] = 0;
+        crop_offset[6] = 0;
+        crop_offset[7] = 0;
+        crop_offset[8] = 0;
+        crop_offset[9] = 0;
+        crop_offset[10] = 0;
+
+        std::vector<size_t> crop_size(11, 0);
+        crop_size[0] = (*dimX)[0];
+        crop_size[1] = (*dimX)[1];
+        crop_size[2] = (*dimX)[2];
+        crop_size[3] = (*dimX)[3];
+        crop_size[4] = (*dimX)[4];
+        crop_size[5] = (*dimX)[5];
+        crop_size[6] = (*dimX)[6];
+        crop_size[7] = (*dimX)[7];
+        crop_size[8] = (*dimX)[8];
+        crop_size[9] = (*dimX)[9];
+        crop_size[10] = (*dimX)[10];
+
+        crop_size[dimInd1] = value1+1;
+
+        crop_offset[dimInd2] = value2;
+        crop_size[dimInd2] = 1;
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::cropUpTo11DArray(x, r, crop_offset, crop_size));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::extractSubArrayForDim1LessEqualDim2Equal(dim1, value1, dim2, value2) ... ");
+        return false;
+    }
+
+    return true;
+}
+
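+// Crop the 11-D data array so that every encoding dimension is limited to the maximum encoding
+// counter actually acquired (readout and channel are always kept in full).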
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+extractSubArrayForMaxEncodingCounters(const hoNDArray<T>& x, hoNDArray<T>& r, const ISMRMRD::EncodingCounters& maxIdx)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dimX = x.get_dimensions();
+
+        std::vector<size_t> crop_offset(11, 0);
+        crop_offset[0] = 0;
+        crop_offset[1] = 0;
+        crop_offset[2] = 0;
+        crop_offset[3] = 0;
+        crop_offset[4] = 0;
+        crop_offset[5] = 0;
+        crop_offset[6] = 0;
+        crop_offset[7] = 0;
+        crop_offset[8] = 0;
+        crop_offset[9] = 0;
+        crop_offset[10] = 0;
+
+        // [RO E1 Cha Slice E2 Contrast Phase Rep Set Seg Ave]
+        std::vector<size_t> crop_size(11, 0);
+        crop_size[0] = (*dimX)[0];
+        crop_size[1] = (*dimX)[1]; if ( maxIdx.kspace_encode_step_1 < crop_size[1]-1 ) crop_size[1] = maxIdx.kspace_encode_step_1+1;
+        crop_size[2] = (*dimX)[2]; 
+        crop_size[3] = (*dimX)[3]; if ( maxIdx.slice                < crop_size[3]-1 ) crop_size[3] = maxIdx.slice+1;
+        crop_size[4] = (*dimX)[4]; if ( maxIdx.kspace_encode_step_2 < crop_size[4]-1 ) crop_size[4] = maxIdx.kspace_encode_step_2+1;
+        crop_size[5] = (*dimX)[5]; if ( maxIdx.contrast             < crop_size[5]-1 ) crop_size[5] = maxIdx.contrast+1;
+        crop_size[6] = (*dimX)[6]; if ( maxIdx.phase                < crop_size[6]-1 ) crop_size[6] = maxIdx.phase+1;
+        crop_size[7] = (*dimX)[7]; if ( maxIdx.repetition           < crop_size[7]-1 ) crop_size[7] = maxIdx.repetition+1;
+        crop_size[8] = (*dimX)[8]; if ( maxIdx.set                  < crop_size[8]-1 ) crop_size[8] = maxIdx.set+1;
+        crop_size[9] = (*dimX)[9]; if ( maxIdx.segment              < crop_size[9]-1 ) crop_size[9] = maxIdx.segment+1;
+        crop_size[10] = (*dimX)[10]; if ( maxIdx.average            < crop_size[10]-1 ) crop_size[10] = maxIdx.average+1;
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::cropUpTo11DArray(x, r, crop_offset, crop_size));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::extractSubArrayForMaxEncodingCounters(const hoNDArray<T>& x, hoNDArray<T>& r, const ISMRMRD::EncodingCounters& maxIdx) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+void gtPlusISMRMRDReconUtil<T>::clearAcquisitionHeaderISMRMRD(ISMRMRD::AcquisitionHeader& acqHeader)
+{
+    memset(&acqHeader, 0, sizeof(ISMRMRD::AcquisitionHeader));
+}
+
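+// Two acquisition headers are considered to have identical geometry if their position, patient table
+// position and read/phase/slice directions all agree within GT_IMAGING_GEOMETRY_DELTA.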
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::hasIdenticalGeometryISMRMRD(const ISMRMRD::AcquisitionHeader& acqHeader1, const ISMRMRD::AcquisitionHeader& acqHeader2)
+{
+    long long ii;
+
+    for ( ii=0; ii<ISMRMRD::ISMRMRD_POSITION_LENGTH; ii++ )
+    {
+        if ( std::abs(acqHeader1.position[ii]-acqHeader2.position[ii]) > GT_IMAGING_GEOMETRY_DELTA ) return false;
+        if ( std::abs(acqHeader1.patient_table_position[ii]-acqHeader2.patient_table_position[ii]) > GT_IMAGING_GEOMETRY_DELTA ) return false;
+    }
+
+    for ( ii=0; ii<ISMRMRD::ISMRMRD_DIRECTION_LENGTH; ii++ )
+    {
+        if ( std::abs(acqHeader1.read_dir[ii]-acqHeader2.read_dir[ii]) > GT_IMAGING_GEOMETRY_DELTA ) return false;
+        if ( std::abs(acqHeader1.phase_dir[ii]-acqHeader2.phase_dir[ii]) > GT_IMAGING_GEOMETRY_DELTA ) return false;
+        if ( std::abs(acqHeader1.slice_dir[ii]-acqHeader2.slice_dir[ii]) > GT_IMAGING_GEOMETRY_DELTA ) return false;
+    }
+
+    return true;
+}
+
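+// Asymmetric (partial) echo handling: decide whether the readout was zero padded before or after the
+// sampled data, based on the acquired centre column and the number of samples.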
+template <typename T> 
+long long gtPlusISMRMRDReconUtil<T>::addPrePostZeros(size_t centre_column, size_t samples)
+{
+    // 1 : pre zeros
+    // 2 : post zeros
+    // 0 : no zeros
+    if ( 2*centre_column == samples )
+    {
+        return 0;
+    }
+
+    if ( 2*centre_column < samples )
+    {
+        return 1;
+    }
+
+    if ( 2*centre_column > samples )
+    {
+        return 2;
+    }
+
+    return 0;
+}
+
+template <typename T> 
+void gtPlusISMRMRDReconUtil<T>::findStartEndRO(size_t centre_column, size_t samples, long long& startRO, long long& endRO)
+{
+    long long zerosFlag = addPrePostZeros(centre_column, samples);
+
+    if ( zerosFlag == 0 )
+    {
+        startRO = 0;
+        endRO = (long long)samples-1;
+    }
+
+    if ( zerosFlag == 1 )
+    {
+        endRO = (long long)2*(samples-centre_column)-1;
+        startRO = (long long)endRO-samples+1;
+    }
+
+    if ( zerosFlag == 2 )
+    {
+        startRO = 0;
+        endRO = (long long)samples-1;
+    }
+
+    return;
+}
+
+template <typename T> 
+void gtPlusISMRMRDReconUtil<T>::findStartEndROAfterZeroFilling(size_t centre_column, size_t samples_zerofilled, int& startRO, int& endRO)
+{
+    size_t num = samples_zerofilled/2;
+
+    if ( centre_column == num )
+    {
+        startRO = 0;
+        endRO = (int)samples_zerofilled-1;
+    }
+
+    if ( centre_column+num < samples_zerofilled ) // pre zeros
+    {
+        endRO = (int)samples_zerofilled-1;
+        startRO = endRO-(int)(centre_column+num)+1;
+    }
+
+    if ( centre_column+num > samples_zerofilled ) // post zeros
+    {
+        startRO = 0;
+        endRO = (int)samples_zerofilled-1;
+    }
+
+    return;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::setMetaAttributesFromImageHeaderISMRMRD(const ISMRMRD::ImageHeader& imgHeader, ISMRMRD::MetaContainer& attrib)
+{
+    try
+    {
+        unsigned int ii;
+
+        attrib.set(ISMRMRD_IMAGE_version,                 (long)imgHeader.version);
+        attrib.set(ISMRMRD_IMAGE_flags,                   (long)imgHeader.flags);
+        attrib.set(ISMRMRD_IMAGE_measurement_uid,         (long)imgHeader.measurement_uid);
+
+        // ----------------------------------------
+
+        attrib.set(ISMRMRD_IMAGE_matrix_size, (long)imgHeader.matrix_size[0]);
+        attrib.append(ISMRMRD_IMAGE_matrix_size, (long)imgHeader.matrix_size[1]);
+        attrib.append(ISMRMRD_IMAGE_matrix_size, (long)imgHeader.matrix_size[2]);
+
+        // ----------------------------------------
+
+        attrib.set(ISMRMRD_IMAGE_field_of_view, (double)imgHeader.field_of_view[0]);
+        attrib.append(ISMRMRD_IMAGE_field_of_view, (double)imgHeader.field_of_view[1]);
+        attrib.append(ISMRMRD_IMAGE_field_of_view, (double)imgHeader.field_of_view[2]);
+
+        // ----------------------------------------
+
+        attrib.set(ISMRMRD_IMAGE_channels, (long)imgHeader.channels);
+
+        // ----------------------------------------
+
+        attrib.set(ISMRMRD_IMAGE_position, (double)imgHeader.position[0]);
+        for ( ii=1; ii<ISMRMRD::ISMRMRD_POSITION_LENGTH; ii++ )
+        {
+            attrib.append(ISMRMRD_IMAGE_position, (double)imgHeader.position[ii]);
+        }
+
+        // ----------------------------------------
+
+        attrib.set(ISMRMRD_IMAGE_read_dir, (double)imgHeader.read_dir[0]);
+        for ( ii=1; ii<ISMRMRD::ISMRMRD_DIRECTION_LENGTH; ii++ )
+        {
+            attrib.append(ISMRMRD_IMAGE_read_dir, (double)imgHeader.read_dir[ii]);
+        }
+
+        // ----------------------------------------
+
+        attrib.set(ISMRMRD_IMAGE_phase_dir, (double)imgHeader.phase_dir[0]);
+        for ( ii=1; ii<ISMRMRD::ISMRMRD_DIRECTION_LENGTH; ii++ )
+        {
+            attrib.append(ISMRMRD_IMAGE_phase_dir, (double)imgHeader.phase_dir[ii]);
+        }
+
+        // ----------------------------------------
+
+        attrib.set(ISMRMRD_IMAGE_slice_dir, (double)imgHeader.slice_dir[0]);
+        for ( ii=1; ii<ISMRMRD::ISMRMRD_DIRECTION_LENGTH; ii++ )
+        {
+            attrib.append(ISMRMRD_IMAGE_slice_dir, (double)imgHeader.slice_dir[ii]);
+        }
+
+        // ----------------------------------------
+
+        attrib.set(ISMRMRD_IMAGE_patient_table_position, (double)imgHeader.patient_table_position[0]);
+        for ( ii=1; ii<ISMRMRD::ISMRMRD_POSITION_LENGTH; ii++ )
+        {
+            attrib.append(ISMRMRD_IMAGE_patient_table_position, (double)imgHeader.patient_table_position[ii]);
+        }
+
+        // ----------------------------------------
+
+        attrib.set(ISMRMRD_IMAGE_average,       (long)imgHeader.average);
+        attrib.set(ISMRMRD_IMAGE_slice,         (long)imgHeader.slice);
+        attrib.set(ISMRMRD_IMAGE_contrast,      (long)imgHeader.contrast);
+        attrib.set(ISMRMRD_IMAGE_phase,         (long)imgHeader.phase);
+        attrib.set(ISMRMRD_IMAGE_repetition,    (long)imgHeader.repetition);
+        attrib.set(ISMRMRD_IMAGE_set,           (long)imgHeader.set);
+
+        // ----------------------------------------
+
+        attrib.set(ISMRMRD_IMAGE_acquisition_time_stamp, (long)imgHeader.acquisition_time_stamp);
+
+        // ----------------------------------------
+
+        attrib.set(ISMRMRD_IMAGE_physiology_time_stamp, (long)imgHeader.physiology_time_stamp[0]);
+        for ( ii=1; ii<ISMRMRD::ISMRMRD_PHYS_STAMPS; ii++ )
+        {
+            attrib.append(ISMRMRD_IMAGE_physiology_time_stamp, (long)imgHeader.physiology_time_stamp[ii]);
+        }
+
+        // ----------------------------------------
+
+        attrib.set(ISMRMRD_IMAGE_image_data_type,       (long)imgHeader.data_type);
+        attrib.set(ISMRMRD_IMAGE_image_type,            (long)imgHeader.image_type);
+        attrib.set(ISMRMRD_IMAGE_image_series_index,    (long)imgHeader.image_series_index);
+
+        // ----------------------------------------
+
+        attrib.set(ISMRMRD_IMAGE_user_int, (long)imgHeader.user_int[0]);
+        for ( ii=1; ii<ISMRMRD::ISMRMRD_USER_INTS; ii++ )
+        {
+            attrib.append(ISMRMRD_IMAGE_user_int, (long)imgHeader.user_int[ii]);
+        }
+
+        // ----------------------------------------
+
+        attrib.set(ISMRMRD_IMAGE_user_float, (double)imgHeader.user_float[0]);
+        for ( ii=1; ii<ISMRMRD::ISMRMRD_USER_FLOATS; ii++ )
+        {
+            attrib.append(ISMRMRD_IMAGE_user_float, (double)imgHeader.user_float[ii]);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::setMetaAttributesFromImageHeaderISMRMRD(const ISMRMRD::ImageHeader& imgHeader, ISMRMRD::MetaContainer& attrib) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::setImageHeaderISMRMRDFromMetaAttributes(const ISMRMRD::MetaContainer& attrib, ISMRMRD::ImageHeader& imgHeader)
+{
+    try
+    {
+        unsigned int ii;
+
+        imgHeader.version = (uint16_t)attrib.as_long(ISMRMRD_IMAGE_version, 0);
+        imgHeader.flags = (uint64_t)attrib.as_long(ISMRMRD_IMAGE_flags, 0);
+        imgHeader.measurement_uid = (uint32_t)attrib.as_long(ISMRMRD_IMAGE_measurement_uid, 0);
+
+        // ----------------------------------------
+
+        imgHeader.matrix_size[0] = (uint16_t)attrib.as_long(ISMRMRD_IMAGE_matrix_size, 0);
+        imgHeader.matrix_size[1] = (uint16_t)attrib.as_long(ISMRMRD_IMAGE_matrix_size, 1);
+        imgHeader.matrix_size[2] = (uint16_t)attrib.as_long(ISMRMRD_IMAGE_matrix_size, 2);
+
+        // ----------------------------------------
+
+        imgHeader.field_of_view[0] = (float)attrib.as_double(ISMRMRD_IMAGE_field_of_view, 0);
+        imgHeader.field_of_view[1] = (float)attrib.as_double(ISMRMRD_IMAGE_field_of_view, 1);
+        imgHeader.field_of_view[2] = (float)attrib.as_double(ISMRMRD_IMAGE_field_of_view, 2);
+
+        // ----------------------------------------
+
+        imgHeader.channels = (uint16_t)attrib.as_long(ISMRMRD_IMAGE_channels, 0);
+
+        // ----------------------------------------
+
+        for ( ii=0; ii<ISMRMRD::ISMRMRD_POSITION_LENGTH; ii++ )
+        {
+            imgHeader.position[ii] = (float)attrib.as_double(ISMRMRD_IMAGE_position, ii);
+        }
+
+        // ----------------------------------------
+
+        for ( ii=0; ii<ISMRMRD::ISMRMRD_DIRECTION_LENGTH; ii++ )
+        {
+            imgHeader.read_dir[ii] = (float)attrib.as_double(ISMRMRD_IMAGE_read_dir, ii);
+        }
+
+        // ----------------------------------------
+
+        for ( ii=0; ii<ISMRMRD::ISMRMRD_DIRECTION_LENGTH; ii++ )
+        {
+            imgHeader.phase_dir[ii] = (float)attrib.as_double(ISMRMRD_IMAGE_phase_dir, ii);
+        }
+
+        // ----------------------------------------
+
+        for ( ii=0; ii<ISMRMRD::ISMRMRD_DIRECTION_LENGTH; ii++ )
+        {
+            imgHeader.slice_dir[ii] = (float)attrib.as_double(ISMRMRD_IMAGE_slice_dir, ii);
+        }
+
+        // ----------------------------------------
+
+        for ( ii=0; ii<ISMRMRD::ISMRMRD_POSITION_LENGTH; ii++ )
+        {
+            imgHeader.patient_table_position[ii] = (float)attrib.as_double(ISMRMRD_IMAGE_patient_table_position, ii);
+        }
+
+        // ----------------------------------------
+
+        imgHeader.average = (uint16_t)attrib.as_long(ISMRMRD_IMAGE_average, 0);
+        imgHeader.slice = (uint16_t)attrib.as_long(ISMRMRD_IMAGE_slice, 0);
+        imgHeader.contrast = (uint16_t)attrib.as_long(ISMRMRD_IMAGE_contrast, 0);
+        imgHeader.phase = (uint16_t)attrib.as_long(ISMRMRD_IMAGE_phase, 0);
+        imgHeader.repetition = (uint16_t)attrib.as_long(ISMRMRD_IMAGE_repetition, 0);
+        imgHeader.set = (uint16_t)attrib.as_long(ISMRMRD_IMAGE_set, 0);
+
+        // ----------------------------------------
+
+        imgHeader.acquisition_time_stamp = (uint32_t)attrib.as_long(ISMRMRD_IMAGE_acquisition_time_stamp, 0);
+
+        // ----------------------------------------
+
+        for ( ii=0; ii<ISMRMRD::ISMRMRD_PHYS_STAMPS; ii++ )
+        {
+            imgHeader.physiology_time_stamp[ii] = (uint32_t)attrib.as_long(ISMRMRD_IMAGE_physiology_time_stamp, ii);
+        }
+
+        // ----------------------------------------
+
+        imgHeader.data_type = (uint16_t)attrib.as_long(ISMRMRD_IMAGE_image_data_type, 0);
+        imgHeader.image_type = (uint16_t)attrib.as_long(ISMRMRD_IMAGE_image_type, 0);
+        imgHeader.image_series_index = (uint16_t)attrib.as_long(ISMRMRD_IMAGE_image_series_index, 0);
+
+        // ----------------------------------------
+
+        for ( ii=0; ii<ISMRMRD::ISMRMRD_USER_INTS; ii++ )
+        {
+            imgHeader.user_int[ii] = (int32_t)attrib.as_long(ISMRMRD_IMAGE_user_int, ii);
+        }
+
+        // ----------------------------------------
+
+        for ( ii=0; ii<ISMRMRD::ISMRMRD_USER_FLOATS; ii++ )
+        {
+            imgHeader.user_float[ii] = (float)attrib.as_double(ISMRMRD_IMAGE_user_float, ii);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::setImageHeaderISMRMRDFromMetaAttributes(const ISMRMRD::MetaContainer& attrib, ISMRMRD::ImageHeader& imgHeader) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+//#ifdef USE_CUDA
+//
+//template <typename T> 
+//bool gtPlusISMRMRDReconUtil<T>::
+//cudaJobSplitter(const std::vector<unsigned int>& jobIDs, size_t jobSize, size_t minimalMemoryForValidDevice, 
+//                std::vector< std::pair<unsigned int, std::vector<std::vector<unsigned int> > > >& jobSchedule)
+//{
+//    try
+//    {
+//        unsigned int numOfJobs = jobIDs.size();
+//        if ( numOfJobs == 0 )
+//        {
+//            GWARN_STREAM("numOfJobs == 0");
+//            return true;
+//        }
+//
+//        // find valid device
+//        int numOfDevices(0);
+//        GADGET_CHECK_RETURN_FALSE(cudaGetDeviceCount( &numOfDevices )==cudaSuccess);
+//
+//        if ( numOfDevices == 0 )
+//        {
+//            GWARN_STREAM("numOfDevices == 0");
+//            return true;
+//        }
+//
+//        std::vector<unsigned int> validDevices;
+//        int d;
+//        for ( d=0; d<numOfDevices; d++ )
+//        {
+//            size_t totalMem = cudaDeviceManager::Instance()->total_global_mem(d);
+//            if ( totalMem >= minimalMemoryForValidDevice )
+//            {
+//                validDevices.push_back(d);
+//            }
+//        }
+//
+//        if ( validDevices.empty() )
+//        {
+//            GERROR_STREAM("No valid device can be found : " << minimalMemoryForValidDevice);
+//            return false;
+//        }
+//
+//        std::vector<unsigned int> maxJobN(validDevices.size());
+//        for ( d=0; d<validDevices.size(); d++ )
+//        {
+//            size_t totalMem = cudaDeviceManager::Instance()->total_global_mem(validDevices[d]);
+//            maxJobN[d] = totalMem/jobSize;
+//        }
+//
+//        jobSchedule.clear();
+//
+//        size_t job = 0;
+//        unsigned int validDevice = 0;
+//        while ( job < numOfJobs )
+//        {
+//            size_t start = job;
+//            size_t end = job + maxJobN[validDevice] - 1;
+//
+//            if ( end >= numOfJobs ) end = numOfJobs - 1;
+//
+//            unsigned int deviceID = validDevices[validDevice];
+//
+//            unsigned int loc;
+//            for ( loc=0; loc<jobSchedule.size(); loc++ )
+//            {
+//                if ( jobSchedule[loc].first == deviceID ) break;
+//            }
+//
+//            if ( loc < jobSchedule.size() )
+//            {
+//                // insert a new job package
+//                std::vector<unsigned int> jobPackage;
+//                for ( unsigned int jj=start; jj<=end; jj++ )
+//                {
+//                    jobPackage.push_back(jobIDs[jj]);
+//                }
+//
+//                jobSchedule[loc].second.push_back(jobPackage);
+//            }
+//            else
+//            {
+//                // create a new entry
+//                std::pair<unsigned int, std::vector<std::vector<unsigned int> > > jobItem;
+//                jobItem.first = deviceID;
+//
+//                std::vector<unsigned int> jobPackage;
+//                for ( unsigned int jj=start; jj<=end; jj++ )
+//                {
+//                    jobPackage.push_back(jobIDs[jj]);
+//                }
+//                jobItem.second.push_back(jobPackage);
+//
+//                jobSchedule.push_back(jobItem);
+//            }
+//
+//            job = end+1;
+//            validDevice++;
+//
+//            if ( validDevice >= validDevices.size() )
+//            {
+//                validDevice = 0;
+//            }
+//        }
+//    }
+//    catch(...)
+//    {
+//        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtil<T>::cudaJobSplitter(...) ... ");
+//        return false;
+//    }
+//
+//    return true;
+//}
+//
+//template <typename T> 
+//bool gtPlusISMRMRDReconUtil<T>::
+//cudaJobSplitter(unsigned int numOfJobs, size_t jobSize, size_t minimalMemoryForValidDevice, 
+//            std::vector< std::pair<unsigned int, std::vector<std::vector<unsigned int> > > >& jobSchedule)
+//{
+//    if ( numOfJobs == 0 )
+//    {
+//        GWARN_STREAM("numOfJobs == 0");
+//        return true;
+//    }
+//
+//    std::vector<unsigned int> jobIDs(numOfJobs, 0);
+//    unsigned int ii;
+//    for ( ii=0; ii<numOfJobs; ii++ ) jobIDs[ii] = ii;
+//    return cudaJobSplitter(jobIDs, jobSize, minimalMemoryForValidDevice, jobSchedule);
+//}
+//
+//#endif // USE_CUDA
+
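+// Compare an array against a ground-truth array (optionally loaded from an Analyze file):
+// normDiff is the l2 norm of the difference and maxNormDiff the largest absolute difference.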
+template <typename T> 
+void gtPlusISMRMRDReconUtil<T>::
+compareAgainstGroundTruthArray(const std::string& gt_filename, const hoNDArray<T>& x, typename realType<T>::Type& normDiff, typename realType<T>::Type& maxNormDiff)
+{
+    hoNDArray<T> gt;
+
+    gtPlusIOAnalyze gt_io;
+    gt_io.importArray(gt, gt_filename);
+
+    compareAgainstGroundTruthArray(gt, x, normDiff, maxNormDiff);
+}
+
+template <typename T> 
+void gtPlusISMRMRDReconUtil<T>::
+compareAgainstGroundTruthArray(const hoNDArray<T>& gt, const hoNDArray<T>& x, typename realType<T>::Type& normDiff, typename realType<T>::Type& maxNormDiff)
+{
+    hoNDArray<T> diff(x);
+    Gadgetron::subtract(gt, x, diff);
+
+    typename realType<T>::Type v;
+    Gadgetron::norm2(diff, v);
+    normDiff = v;
+
+    T maxV;
+    size_t ind;
+    Gadgetron::maxAbsolute(diff, maxV, ind);
+    maxNormDiff = std::abs(maxV);
+}
+
+// ========================================================================================== //
+
+template <typename T> 
+gtPlusISMRMRDReconUtilComplex<T>::gtPlusISMRMRDReconUtilComplex() {}
+
+template <typename T> 
+gtPlusISMRMRDReconUtilComplex<T>::~gtPlusISMRMRDReconUtilComplex() {}
+
+template <typename T> 
+void gtPlusISMRMRDReconUtilComplex<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+    os << "-------------- GTPlus ISMRMRD Recon Util Complex -------------" << endl;
+    os << "Implementation of recon utilities for ISMRMRD complex data type" << endl;
+    os << "--------------------------------------------------------------" << endl;
+}
+
+// ------------------------------------------------------------------------
+// noise prewhitening
+// ------------------------------------------------------------------------
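+// The channel noise covariance is estimated from the noise samples, scaled by the ratio of noise to
+// imaging sampling time, and Hermitian-symmetrised; the prewhitening matrix is then the inverse of its
+// upper Cholesky factor, scaled by sqrt(2). Multiplying the channel data by this matrix decorrelates
+// the receiver channels.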
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+computeNoisePrewhiteningMatrix(const hoNDArray<T>& noise, double noiseBandWidth, double receiverBWRatio, double ADCSamplingTimeinSecond, hoMatrix<T>& prewhiteningMatrix)
+{
+    try
+    {
+        size_t RO = noise.get_size(0);
+        size_t E1 = noise.get_size(1);
+        size_t CHA = noise.get_size(2);
+
+        GADGET_CHECK_RETURN_FALSE(prewhiteningMatrix.createMatrix(CHA, CHA));
+        Gadgetron::clear(prewhiteningMatrix);
+
+        typedef typename realType<T>::Type ValueType;
+
+        // noise sampling time in seconds
+        ValueType noiseSamplingTimeinSecond = (ValueType)(1.0/(noiseBandWidth*RO));
+
+        // scaling factor
+        ValueType scaling = (ValueType)(noiseSamplingTimeinSecond/ADCSamplingTimeinSecond/receiverBWRatio);
+        scaling /= (RO*E1-1);
+
+        // compute the noise covariance matrix
+        hoMatrix<T> R(RO*E1, CHA, const_cast<T*>(noise.begin()));
+
+        // R'*R --> CHA by CHA covariance matrix
+        Gadgetron::gemm(prewhiteningMatrix, R, true, R, false);
+        Gadgetron::scal(scaling, prewhiteningMatrix);
+
+        // 0.5*(R+R')
+        hoMatrix<T> RH(prewhiteningMatrix);
+        conjugatetrans(prewhiteningMatrix, RH);
+        Gadgetron::add(prewhiteningMatrix, RH, prewhiteningMatrix);
+        Gadgetron::scal( (ValueType)0.5, prewhiteningMatrix);
+
+        Gadgetron::potrf(prewhiteningMatrix, 'U');
+        Gadgetron::trtri(prewhiteningMatrix, 'U');
+        Gadgetron::scal( (ValueType)(std::sqrt((double)2.0)), prewhiteningMatrix);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::computeNoisePrewhiteningMatrix(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
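+// Apply noise prewhitening: each [RO*E1 x CHA] data matrix is multiplied on the right by the
+// prewhitening matrix; frames beyond the first three dimensions are processed in parallel.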
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+performNoisePrewhitening(hoNDArray<T>& data, const hoMatrix<T>& prewhiteningMatrix)
+{
+    try
+    {
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+        size_t CHA = data.get_size(2);
+
+        GADGET_CHECK_RETURN_FALSE(prewhiteningMatrix.rows()==CHA);
+        GADGET_CHECK_RETURN_FALSE(prewhiteningMatrix.cols()==CHA);
+
+        size_t N = data.get_number_of_elements()/(RO*E1*CHA);
+
+        long long n;
+        #pragma omp parallel default(none) private(n) shared(RO, E1, CHA, N, data, prewhiteningMatrix)
+        {
+            hoMatrix<T> tmp(RO*E1, CHA);
+            Gadgetron::clear(tmp);
+
+            #pragma omp for
+            for ( n=0; n<(long long)N; n++ )
+            {
+                hoMatrix<T> D(RO*E1, CHA, data.begin()+n*RO*E1*CHA);
+                Gadgetron::gemm(tmp, D, false, prewhiteningMatrix, false);
+                memcpy(data.begin()+n*RO*E1*CHA, tmp.begin(), sizeof(T)*RO*E1*CHA);
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::performNoisePrewhitening(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
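+// Zero-padding resize: the image is transformed to k-space, zero padded to the requested matrix size,
+// transformed back, and rescaled by sqrt((sizeX*sizeY)/(RO*E1)) so the intensity scaling is kept.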
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+zpadResize2D(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, hoNDArray<T>& dataResized)
+{
+    try
+    {
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+
+        GADGET_CHECK_RETURN_FALSE(sizeX>=RO);
+        GADGET_CHECK_RETURN_FALSE(sizeY>=E1);
+
+        if ( RO==sizeX && E1==sizeY )
+        {
+            dataResized = data;
+            return true;
+        }
+
+        if ( dataResized.get_size(0)!=sizeX || dataResized.get_size(1)!=sizeY )
+        {
+            dataResized.create(sizeX, sizeY);
+        }
+
+        Gadgetron::clear(&dataResized);
+
+        hoNDArray<T> kspace(data);
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(data, kspace);
+        GADGET_CHECK_RETURN_FALSE(zpadResize2DOnKSpace(kspace, sizeX, sizeY, dataResized));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::zpadResize2D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+zpadResize2DOnKSpace(const hoNDArray<T>& kspace, size_t sizeX, size_t sizeY, hoNDArray<T>& dataResized)
+{
+    try
+    {
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+
+        GADGET_CHECK_RETURN_FALSE(sizeX>=RO);
+        GADGET_CHECK_RETURN_FALSE(sizeY>=E1);
+
+        if ( RO==sizeX && E1==sizeY )
+        {
+            dataResized = kspace;
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(dataResized);
+            return true;
+        }
+
+        if ( dataResized.get_size(0)!=sizeX || dataResized.get_size(1)!=sizeY )
+        {
+            dataResized.create(sizeX, sizeY);
+        }
+
+        Gadgetron::clear(&dataResized);
+
+        // GADGET_CHECK_RETURN_FALSE(this->zeropad2D(kspace, sizeX, sizeY, dataResized));
+        // GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::zeropad2D(kspace, sizeX, sizeY, dataResized));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::pad(sizeX, sizeY, const_cast<hoNDArray<T>*>(&kspace), &dataResized));
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(dataResized);
+
+        typename realType<T>::Type scaling = (typename realType<T>::Type)(std::sqrt((double)sizeX*sizeY)/std::sqrt((double)RO*E1));
+        Gadgetron::scal(scaling, dataResized);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::zpadResize2DOnKSpace(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+zpadResize3D(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, size_t sizeZ, hoNDArray<T>& dataResized)
+{
+    try
+    {
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+        size_t E2 = data.get_size(2);
+
+        GADGET_CHECK_RETURN_FALSE(sizeX>=RO);
+        GADGET_CHECK_RETURN_FALSE(sizeY>=E1);
+        GADGET_CHECK_RETURN_FALSE(sizeZ>=E2);
+
+        if ( RO==sizeX && E1==sizeY && E2==sizeZ )
+        {
+            dataResized = data;
+            return true;
+        }
+
+        if ( dataResized.get_size(0)!=sizeX || dataResized.get_size(1)!=sizeY || dataResized.get_size(2)!=sizeZ )
+        {
+            dataResized.create(sizeX, sizeY, sizeZ);
+        }
+
+        Gadgetron::clear(&dataResized);
+
+        hoNDArray<T> kspace(data);
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(data, kspace);
+        GADGET_CHECK_RETURN_FALSE(zpadResize3DOnKSpace(kspace, sizeX, sizeY, sizeZ, dataResized));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::zpadResize3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+zpadResize3DOnKSpace(const hoNDArray<T>& kspace, size_t sizeX, size_t sizeY, size_t sizeZ, hoNDArray<T>& dataResized)
+{
+    try
+    {
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t E2 = kspace.get_size(2);
+
+        GADGET_CHECK_RETURN_FALSE(sizeX>=RO);
+        GADGET_CHECK_RETURN_FALSE(sizeY>=E1);
+        GADGET_CHECK_RETURN_FALSE(sizeZ>=E2);
+
+        if ( RO==sizeX && E1==sizeY && E2==sizeZ )
+        {
+            dataResized = kspace;
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(dataResized);
+            return true;
+        }
+
+        if ( dataResized.get_size(0)!=sizeX || dataResized.get_size(1)!=sizeY || dataResized.get_size(2)!=sizeZ )
+        {
+            dataResized.create(sizeX, sizeY, sizeZ);
+        }
+
+        Gadgetron::clear(&dataResized);
+
+        // GADGET_CHECK_RETURN_FALSE(this->zeropad3D(kspace, sizeX, sizeY, sizeZ, dataResized));
+        // GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::zeropad3D(kspace, sizeX, sizeY, sizeZ, dataResized));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::pad(sizeX, sizeY, sizeZ, const_cast<hoNDArray<T>*>(&kspace), &dataResized));
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(dataResized);
+
+        typename realType<T>::Type scaling = (typename realType<T>::Type)(std::sqrt((double)sizeX*sizeY*sizeZ)/std::sqrt((double)RO*E1*E2));
+        Gadgetron::scal(scaling, dataResized);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::zpadResize3DOnKSpace(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+zpadResize2DFilter(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, const hoNDArray<T>& filter2D, hoNDArray<T>& dataResized)
+{
+    try
+    {
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+
+        GADGET_CHECK_RETURN_FALSE(sizeX>=RO);
+        GADGET_CHECK_RETURN_FALSE(sizeY>=E1);
+
+        GADGET_CHECK_RETURN_FALSE(filter2D.get_size(0)==sizeX);
+        GADGET_CHECK_RETURN_FALSE(filter2D.get_size(1)==sizeY);
+
+        if ( RO==sizeX && E1==sizeY )
+        {
+            dataResized = data;
+            return true;
+        }
+
+        if ( dataResized.get_size(0)!=sizeX || dataResized.get_size(1)!=sizeY )
+        {
+            dataResized.create(sizeX, sizeY);
+        }
+
+        Gadgetron::clear(&dataResized);
+
+        hoNDArray<T> kspace(data);
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(data, kspace);
+        // GADGET_CHECK_RETURN_FALSE(this->zeropad2D(kspace, sizeX, sizeY, dataResized));
+        // GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::zeropad2D(kspace, sizeX, sizeY, dataResized));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::pad(sizeX, sizeY, const_cast<hoNDArray<T>*>(&kspace), &dataResized));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(dataResized, filter2D, dataResized));
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(dataResized);
+
+        typename realType<T>::Type scaling = (typename realType<T>::Type)(std::sqrt((double)sizeX*sizeY)/std::sqrt((double)RO*E1));
+        Gadgetron::scal(scaling, dataResized);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::zpadResize2DFilter(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+zpadResize3DFilter(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, size_t sizeZ, const hoNDArray<T>& filter3D, hoNDArray<T>& dataResized)
+{
+    try
+    {
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+        size_t E2 = data.get_size(2);
+
+        GADGET_CHECK_RETURN_FALSE(sizeX>=RO);
+        GADGET_CHECK_RETURN_FALSE(sizeY>=E1);
+        GADGET_CHECK_RETURN_FALSE(sizeZ>=E2);
+
+        GADGET_CHECK_RETURN_FALSE(filter3D.get_size(0)==sizeX);
+        GADGET_CHECK_RETURN_FALSE(filter3D.get_size(1)==sizeY);
+        GADGET_CHECK_RETURN_FALSE(filter3D.get_size(2)==sizeZ);
+
+        if ( RO==sizeX && E1==sizeY && E2==sizeZ )
+        {
+            dataResized = data;
+            return true;
+        }
+
+        if ( dataResized.get_size(0)!=sizeX || dataResized.get_size(1)!=sizeY || dataResized.get_size(2)!=sizeZ )
+        {
+            dataResized.create(sizeX, sizeY, sizeZ);
+        }
+
+        Gadgetron::clear(&dataResized);
+
+        hoNDArray<T> kspace(data);
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(data, kspace);
+        // GADGET_CHECK_RETURN_FALSE(this->zeropad3D(kspace, sizeX, sizeY, sizeZ, dataResized));
+        // GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::zeropad3D(kspace, sizeX, sizeY, sizeZ, dataResized));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::pad(sizeX, sizeY, sizeZ, &kspace, &dataResized));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(dataResized, filter3D, dataResized));
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(dataResized);
+
+        typename realType<T>::Type scaling = (typename realType<T>::Type)(std::sqrt((double)sizeX*sizeY*sizeZ)/std::sqrt((double)RO*E1*E2));
+        Gadgetron::scal(scaling, dataResized);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::zpadResize3DFilter(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
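+// Image-domain filtering wrappers: the data is transformed to k-space along the relevant dimension(s),
+// the 1-D filter(s) are applied, and the result is transformed back to the image domain.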
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+kspacefilterROImage(hoNDArray<T>& data, const hoNDArray<T>& fRO)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==fRO.get_number_of_elements());
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft1c(data);
+        GADGET_CHECK_RETURN_FALSE(this->kspacefilterRO(data, fRO));
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft1c(data);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::kspacefilterROImage(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+kspacefilterROImage(const hoNDArray<T>& data, const hoNDArray<T>& fRO, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==fRO.get_number_of_elements());
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft1c(data, dataFiltered);
+        GADGET_CHECK_RETURN_FALSE(this->kspacefilterRO(dataFiltered, fRO));
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft1c(dataFiltered);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::kspacefilterROImage(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+kspacefilterE1Image(const hoNDArray<T>& data, const hoNDArray<T>& fE1, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(1)==fE1.get_number_of_elements());
+
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(data, dataFiltered);
+        GADGET_CHECK_RETURN_FALSE(this->kspacefilterE1(dataFiltered, fE1, dataFiltered));
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(dataFiltered);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::kspacefilterE1Image(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+kspacefilterE2Image(const hoNDArray<T>& data, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(2)==fE2.get_number_of_elements());
+
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(data, dataFiltered);
+        GADGET_CHECK_RETURN_FALSE(this->kspace3DfilterE2(dataFiltered, fE2, dataFiltered));
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(dataFiltered);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::kspacefilterE2Image(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+kspacefilterE1E2Image(const hoNDArray<T>& data, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(1)==fE1.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(2)==fE2.get_number_of_elements());
+
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(data, dataFiltered);
+        GADGET_CHECK_RETURN_FALSE(this->kspacefilterE1E2(dataFiltered, fE1, fE2, dataFiltered));
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(dataFiltered);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::kspacefilterE1E2Image(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+kspacefilterROE1E2Image(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==fRO.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(1)==fE1.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(2)==fE2.get_number_of_elements());
+
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(data, dataFiltered);
+        GADGET_CHECK_RETURN_FALSE(this->kspacefilterROE1E2(dataFiltered, fRO, fE1, fE2, dataFiltered));
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(dataFiltered);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::kspacefilterROE1E2Image(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
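+// Per-pixel coil map estimation: for every pixel a ks x ks neighbourhood across all channels forms the
+// data matrix D; the dominant eigenvector V1 of D^H D is refined with 'power' iterations, U1 = D*V1
+// provides a reference phase, and conj(V1) rotated by that phase is stored as the coil sensitivity.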
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+coilMap2DNIHInner(const hoNDArray<T>& data, hoNDArray<T>& coilMap, size_t ks, size_t power)
+{
+    try
+    {
+        typedef typename realType<T>::Type value_type;
+
+        long long RO = data.get_size(0);
+        long long E1 = data.get_size(1);
+        long long CHA = data.get_size(2);
+
+        long long N = data.get_number_of_elements()/(RO*E1*CHA);
+        GADGET_CHECK_RETURN_FALSE(N==1);
+
+        const T* pData = data.begin();
+
+        if ( !data.dimensions_equal(&coilMap) )
+        {
+            coilMap = data;
+        }
+        T* pSen = coilMap.begin();
+
+        if ( ks%2 != 1 )
+        {
+            ks++;
+        }
+
+        size_t kss = ks*ks;
+        long long halfKs = (long long)ks/2;
+
+        int e1;
+
+        // #pragma omp parallel default(none) private(e1) shared(ks, RO, E1, CHA, pSen, pData, halfKs, power, kss)
+        #pragma omp parallel private(e1) shared(ks, RO, E1, CHA, pSen, pData, halfKs, power, kss)
+        {
+            hoNDArray<T> D(ks*ks, CHA);
+            T* pD = D.begin();
+
+            hoNDArray<T> DC(ks*ks, CHA);
+            T* pDC = DC.begin();
+
+            hoNDArray<T> DH_D(CHA, CHA);
+            Gadgetron::clear(DH_D);
+
+            hoNDArray<T> U1(ks*ks, 1);
+            T* pU1 = U1.begin();
+
+            hoNDArray<T> V1(CHA, 1);
+            T* pV1 = V1.begin();
+
+            hoNDArray<T> V(CHA, 1);
+
+            Gadgetron::clear(D);
+            Gadgetron::clear(DC);
+            Gadgetron::clear(DH_D);
+            Gadgetron::clear(U1);
+            Gadgetron::clear(V1);
+            Gadgetron::clear(V);
+
+            T phaseU1;
+
+            value_type v1Norm(1), u1Norm(1);
+
+            long long cha, ro, kro, ke1, de1, dro;
+            size_t po;
+
+            #pragma omp for
+            for ( e1=0; e1<(int)E1; e1++ )
+            {
+                for ( ro=0; ro<(long long)RO; ro++ )
+                {
+                    // fill the data matrix D
+                    if ( e1>=halfKs && e1<E1-halfKs && ro>=halfKs && ro<RO-halfKs )
+                    {
+                        for ( cha=0; cha<CHA; cha++ )
+                        {
+                            const T* pDataCurr = pData + cha*RO*E1;
+                            int ind=0;
+                            for ( ke1=-halfKs; ke1<=halfKs; ke1++ )
+                            {
+                                de1 = e1 + ke1;
+                                for ( kro=-halfKs; kro<=halfKs; kro++ )
+                                {
+                                    // D(ind++, cha) = pDataCurr[de1*RO+ro+kro];
+                                    pD[ind+cha*kss] = pDataCurr[de1*RO+ro+kro];
+                                    ind++;
+                                }
+                            }
+                        }
+                    }
+                    else
+                    {
+                        for ( cha=0; cha<CHA; cha++ )
+                        {
+                            const T* pDataCurr = pData + cha*RO*E1;
+                            int ind=0;
+                            for ( ke1=-halfKs; ke1<=halfKs; ke1++ )
+                            {
+                                de1 = e1 + ke1;
+                                if ( de1 < 0 ) de1 += E1;
+                                if ( de1 >= E1 ) de1 -= E1;
+
+                                for ( kro=-halfKs; kro<=halfKs; kro++ )
+                                {
+                                    dro = ro + kro;
+                                    if ( dro < 0 ) dro += RO;
+                                    if ( dro >= RO ) dro -= RO;
+
+                                    // D(ind++, cha) = pDataCurr[de1*RO+dro];
+                                    pD[ind+cha*kss] = pDataCurr[de1*RO+dro];
+                                    ind++;
+                                }
+                            }
+                        }
+                    }
+
+                    // compute V1
+                    // D.sumOverCol(V1);
+                    T* pTmp;
+                    for ( cha=0; cha<CHA; cha++ )
+                    {
+                        pTmp = pD + cha*kss;
+                        pV1[cha] = pTmp[0];
+                        for ( po=1; po<kss; po++ )
+                        {
+                            pV1[cha] += pTmp[po];
+                        }
+                    }
+
+                    // norm2(V1, v1Norm);
+                    // Gadgetron::math::norm2(CHA, V1.begin(), v1Norm);
+                    value_type sum(0);
+                    for ( cha=0; cha<CHA; cha++ )
+                    {
+                        const T& c = pV1[cha];
+                        const value_type re = c.real();
+                        const value_type im = c.imag();
+                        sum += ( (re*re) + (im * im) );
+                    }
+                    v1Norm = std::sqrt(sum);
+
+                    // scal( (value_type)1.0/v1Norm, V1);
+                    value_type v1NormInv = (value_type)1.0/v1Norm;
+                    for ( cha=0; cha<CHA; cha++ )
+                    {
+                        pV1[cha] *= v1NormInv;
+                    }
+
+                    memcpy(pDC, pD, sizeof(T)*ks*ks*CHA);
+                    gemm(DH_D, DC, true, D, false);
+
+                    for ( po=0; po<power; po++ )
+                    {
+                        gemm(V, DH_D, false, V1, false);
+                        // V1 = V;
+                        memcpy(V1.begin(), V.begin(), V.get_number_of_bytes());
+
+                        // norm2(V1, v1Norm);
+
+                        sum = 0;
+                        for ( cha=0; cha<CHA; cha++ )
+                        {
+                            const T& c = pV1[cha];
+                            const value_type re = c.real();
+                            const value_type im = c.imag();
+                            sum += ( (re*re) + (im * im) );
+                        }
+                        v1Norm = std::sqrt(sum);
+
+                        // scal( (value_type)1.0/v1Norm, V1);
+
+                        value_type v1NormInv = (value_type)1.0/v1Norm;
+                        for ( cha=0; cha<CHA; cha++ )
+                        {
+                            pV1[cha] *= v1NormInv;
+                        }
+                    }
+
+                    // compute U1
+                    gemm(U1, D, false, V1, false);
+
+                    //phaseU1 = U1(0, 0);
+                    phaseU1 = pU1[0];
+                    for ( po=1; po<kss; po++ )
+                    {
+                        //phaseU1 += U1(po, 0);
+                        phaseU1 += pU1[po];
+                    }
+                    phaseU1 /= std::abs(phaseU1);
+
+                    // put the mean object phase to coil map
+                    // conjugate(V1, V1);
+                    // scal(phaseU1, V1);
+
+                    const value_type c = phaseU1.real();
+                    const value_type d = phaseU1.imag();
+
+                    for ( cha=0; cha<CHA; cha++ )
+                    {
+                        const T& v = pV1[cha];
+                        const value_type a = v.real();
+                        const value_type b = v.imag();
+
+                        reinterpret_cast< value_type(&)[2] >(pV1[cha])[0] = a*c+b*d;
+                        reinterpret_cast< value_type(&)[2] >(pV1[cha])[1] = a*d-b*c;
+                    }
+
+                    for ( cha=0; cha<CHA; cha++ )
+                    {
+                        // pSen[cha*RO*E1+e1*RO+ro] = pV1[cha];
+                        pSen[cha*RO*E1+e1*RO+ro] = V1(cha, 0);
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::coilMap2DNIHInner(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+coilMap3DNIHInner(const hoNDArray<T>& data, hoNDArray<T>& coilMap, size_t ks, size_t power)
+{
+    try
+    {
+        typedef typename realType<T>::Type value_type;
+
+        long long RO = data.get_size(0);
+        long long E1 = data.get_size(1);
+        long long E2 = data.get_size(2);
+        long long CHA = data.get_size(3);
+
+        long long N = data.get_number_of_elements()/(RO*E1*E2*CHA);
+        GADGET_CHECK_RETURN_FALSE(N==1);
+
+        const T* pData = data.begin();
+
+        if ( !data.dimensions_equal(&coilMap) )
+        {
+            coilMap = data;
+        }
+        T* pSen = coilMap.begin();
+
+        if ( ks%2 != 1 )
+        {
+            ks++;
+        }
+
+        size_t kss = ks*ks*ks;
+        long long halfKs = (long long)ks/2;
+
+        long long e2;
+
+        #pragma omp parallel default(none) private(e2) shared(ks, RO, E1, E2, CHA, pSen, pData, halfKs, power, kss)
+        {
+            hoMatrix<T> D(kss, CHA);
+            hoMatrix<T> DC(kss, CHA);
+            hoMatrix<T> DH_D(CHA, CHA);
+
+            hoMatrix<T> U1(kss, 1);
+            hoMatrix<T> V1(CHA, 1);
+            hoMatrix<T> V(CHA, 1);
+
+            Gadgetron::clear(D);
+            Gadgetron::clear(DC);
+            Gadgetron::clear(DH_D);
+            Gadgetron::clear(U1);
+            Gadgetron::clear(V1);
+            Gadgetron::clear(V);
+
+            T phaseU1;
+
+            value_type v1Norm(1);
+
+            long long cha, ro, e1, kro, dro, ke1, de1, ke2, de2;
+            size_t po;
+
+            #pragma omp for
+            for ( e2=0; e2<(long long)E2; e2++ )
+            {
+                for ( e1=0; e1<(long long)E1; e1++ )
+                {
+                    for ( ro=0; ro<(long long)RO; ro++ )
+                    {
+                        // fill the data matrix D
+                        if ( e2>=halfKs && e2<E2-halfKs && e1>=halfKs && e1<E1-halfKs && ro>=halfKs && ro<RO-halfKs )
+                        {
+                            for ( cha=0; cha<CHA; cha++ )
+                            {
+                                const T* pDataCurr = pData + cha*RO*E1*E2;
+                                long long ind=0;
+                                for ( ke2=-halfKs; ke2<=halfKs; ke2++ )
+                                {
+                                    de2 = e2 + ke2;
+                                    for ( ke1=-halfKs; ke1<=halfKs; ke1++ )
+                                    {
+                                        de1 = e1 + ke1;
+                                        for ( kro=-halfKs; kro<=halfKs; kro++ )
+                                        {
+                                            D(ind++, cha) = pDataCurr[de2*RO*E1+de1*RO+ro+kro];
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                        else
+                        {
+                            for ( cha=0; cha<CHA; cha++ )
+                            {
+                                const T* pDataCurr = pData + cha*RO*E1*E2;
+                                long long ind=0;
+                                for ( ke2=-halfKs; ke2<=halfKs; ke2++ )
+                                {
+                                    de2 = e2 + ke2;
+                                    if ( de2 < 0 ) de2 += E2;
+                                    if ( de2 >= E2 ) de2 -= E2;
+
+                                    for ( ke1=-halfKs; ke1<=halfKs; ke1++ )
+                                    {
+                                        de1 = e1 + ke1;
+                                        if ( de1 < 0 ) de1 += E1;
+                                        if ( de1 >= E1 ) de1 -= E1;
+
+                                        for ( kro=-halfKs; kro<=halfKs; kro++ )
+                                        {
+                                            dro = ro + kro;
+                                            if ( dro < 0 ) dro += RO;
+                                            if ( dro >= RO ) dro -= RO;
+
+                                            D(ind++, cha) = pDataCurr[de2*RO*E1+de1*RO+dro];
+                                        }
+                                    }
+                                }
+                            }
+                        }
+
+                        // compute V1
+                        D.sumOverCol(V1);
+                        norm2(V1, v1Norm);
+                        scal( (value_type)1.0/v1Norm, V1);
+
+                        memcpy(DC.begin(), D.begin(), sizeof(T)*kss*CHA);
+                        gemm(DH_D, DC, true, D, false);
+                        // gemm(DH_D, D, true, D, false);
+
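+                        // power iteration on DH_D = D^H * D: V1 converges to the dominant
+                        // eigenvector, i.e. the leading right singular vector of D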
+                        for ( po=0; po<power; po++ )
+                        {
+                            gemm(V, DH_D, false, V1, false);
+                            V1 = V;
+                            norm2(V1, v1Norm);
+                            scal( (value_type)1.0/v1Norm, V1);
+                        }
+
+                        // compute U1
+                        gemm(U1, D, false, V1, false);
+
+                        phaseU1 = U1(0, 0);
+                        for ( po=1; po<kss; po++ )
+                        {
+                            phaseU1 += U1(po, 0);
+                        }
+                        phaseU1 /= std::abs(phaseU1);
+
+                        // put the mean object phase to coil map
+                        conjugate(V1, V1);
+                        scal(phaseU1, V1);
+
+                        for ( cha=0; cha<CHA; cha++ )
+                        {
+                            pSen[cha*RO*E1*E2+e2*RO*E1+e1*RO+ro] = V1(cha, 0);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::coilMap3DNIHInner(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+coilMap2DNIH2Inner(const hoNDArray<T>& data, hoNDArray<T>& coilMap, size_t ks, size_t iterNum, typename realType<T>::Type thres)
+{
+    try
+    {
+        typedef typename realType<T>::Type value_type;
+
+        long long RO = data.get_size(0);
+        long long E1 = data.get_size(1);
+        long long CHA = data.get_size(2);
+
+        long long N = data.get_number_of_elements()/(RO*E1*CHA);
+        GADGET_CHECK_RETURN_FALSE(N==1);
+
+        const T* pData = data.begin();
+
+        if ( !data.dimensions_equal(&coilMap) )
+        {
+            coilMap = data;
+        }
+
+        // create convolution kernel
+        hoNDArray<T> ker(ks, ks);
+        Gadgetron::fill( ker, T( (value_type)1.0/(ks*ks)) );
+
+        hoNDArray<T> prevR(RO, E1, 1), R(RO, E1, 1), imT(RO, E1, 1), magT(RO, E1, 1), diffR(RO, E1, 1);
+        hoNDArray<T> coilMapConv(RO, E1, CHA);
+        hoNDArray<T> D(RO, E1, CHA);
+        hoNDArray<T> D_sum(1, E1, CHA);
+        hoNDArray<T> D_sum_1st_2nd(1, 1, CHA);
+        typename realType<T>::Type v, vR, vDiffR;
+        T vCha;
+        size_t iter;
+        long long cha;
+
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(data, D_sum, 0));
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(D_sum, D_sum_1st_2nd, 1));
+        Gadgetron::norm2(D_sum_1st_2nd, v);
+        Gadgetron::scal( (value_type)1.0/v, D_sum_1st_2nd);
+
+        Gadgetron::clear(R);
+        for ( cha=0; cha<CHA; cha++ )
+        {
+            hoNDArray<T> dataCHA(RO, E1, const_cast<T*>(data.begin())+cha*RO*E1);
+            vCha = D_sum_1st_2nd(cha);
+            Gadgetron::axpy( std::conj(vCha), dataCHA, R, R);
+        }
+
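+        // each iteration: form coil images as data .* conj(R), smooth them with the box kernel,
+        // normalize by the root-sum-of-squares of the smoothed images to get the coil map,
+        // recombine the data with the map to update R, align the global phase, and stop once
+        // the relative change norm(prevR-R)/norm(R) drops below thres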
+        for ( iter=0; iter<iterNum; iter++ )
+        {
+            prevR = R;
+
+            Gadgetron::conjugate(R, R);
+
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(data, R, coilMap));
+
+            Gadgetron::conv2(coilMap, ker, coilMapConv);
+
+            Gadgetron::multiplyConj(coilMapConv, coilMapConv, D);
+
+            GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(D, R, 2));
+
+            Gadgetron::sqrt(R, R);
+
+            Gadgetron::addEpsilon(R);
+            Gadgetron::inv(R, R);
+
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(coilMapConv, R, coilMap));
+
+            Gadgetron::multiplyConj(data, coilMap, D);
+            GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(D, R, 2));
+
+            //if ( iter < iterNum - 1 )
+            //{
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(coilMap, R, D));
+            //}
+            //else
+            //{
+            //    D = coilMap;
+            //}
+
+            GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(D, D_sum, 0));
+            GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(D_sum, D_sum_1st_2nd, 1));
+
+            Gadgetron::norm2(D_sum_1st_2nd, v);
+            Gadgetron::scal( (value_type)1.0/v, D_sum_1st_2nd);
+
+            Gadgetron::clear(imT);
+            for ( cha=0; cha<CHA; cha++ )
+            {
+                hoNDArray<T> coilMapCHA(RO, E1, coilMap.begin()+cha*RO*E1);
+                vCha = D_sum_1st_2nd(cha);
+                Gadgetron::axpy( std::conj(vCha), coilMapCHA, imT, imT);
+            }
+
+            Gadgetron::abs(imT, magT);
+            Gadgetron::divide(imT, magT, imT);
+
+            Gadgetron::multiply(R, imT, R);
+            Gadgetron::conjugate(imT, imT);
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(coilMap, imT, coilMap));
+
+            Gadgetron::subtract(prevR, R, diffR);
+            Gadgetron::norm2(diffR, vDiffR);
+            Gadgetron::norm2(R, vR);
+
+            // GDEBUG_STREAM("coilMap2DNIH2Inner - iter : " << iter << " - norm(prevR-R)/norm(R) : " << vDiffR/vR);
+
+            if ( vDiffR/vR < thres ) break;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::coilMap2DNIH2Inner(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+coilMap2DNIH(const hoNDArray<T>& data, hoNDArray<T>& coilMap, ISMRMRDCOILMAPALGO algo, size_t ks, size_t power, size_t iterNum, typename realType<T>::Type thres, bool useGPU)
+{
+    try
+    {
+        typedef typename realType<T>::Type value_type;
+
+        long long RO = data.get_size(0);
+        long long E1 = data.get_size(1);
+        long long CHA = data.get_size(2);
+
+        size_t N = data.get_number_of_elements()/(RO*E1*CHA);
+        size_t num = RO*E1*CHA;
+
+        if ( !data.dimensions_equal(&coilMap) )
+        {
+            coilMap = data;
+        }
+
+        if ( ks%2 != 1 )
+        {
+            ks++;
+        }
+
+        long long n;
+
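+        // for many 2D frames (N >= 16), parallelize over N so each thread estimates the coil
+        // map of one [RO E1 CHA] frame; a single frame is processed directly and a small
+        // number of frames is processed sequentially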
+        if ( N >= 16 )
+        {
+            #pragma omp parallel default(none) private(n) shared(ks, RO, E1, CHA, num, algo, N, data, coilMap, power, iterNum, thres)
+            {
+                #pragma omp for
+                for ( n=0; n<(long long)N; n++ )
+                {
+                    hoNDArray<T> dataCurr(RO, E1, CHA, const_cast<T*>(data.begin()+n*num));
+                    hoNDArray<T> coilMapCurr(RO, E1, CHA, coilMap.begin()+n*num);
+
+                    if ( algo == ISMRMRD_SOUHEIL_ITER )
+                    {
+                        Gadgetron::coil_map_2d_Inati_Iter(dataCurr, coilMapCurr, ks, iterNum, thres);
+                    }
+                    else
+                    {
+                        Gadgetron::coil_map_2d_Inati(dataCurr, coilMapCurr, ks, power);
+                    }
+                }
+            }
+        }
+        else if ( N == 1 )
+        {
+            if ( algo == ISMRMRD_SOUHEIL_ITER )
+            {
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::coil_map_2d_Inati_Iter(data, coilMap, ks, iterNum, thres));
+            }
+            else
+            {
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::coil_map_2d_Inati(data, coilMap, ks, power));
+            }
+        }
+        else
+        {
+            for ( n=0; n<(long long)N; n++ )
+            {
+                hoNDArray<T> dataCurr(RO, E1, CHA, const_cast<T*>(data.begin()+n*num));
+                hoNDArray<T> coilMapCurr(RO, E1, CHA, coilMap.begin()+n*num);
+                if ( algo == ISMRMRD_SOUHEIL_ITER )
+                {
+                    GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::coil_map_2d_Inati_Iter(dataCurr, coilMapCurr, ks, iterNum, thres));
+                }
+                else
+                {
+                    GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::coil_map_2d_Inati(dataCurr, coilMapCurr, ks, power));
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::coilMap2DNIH(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+coilMap3DNIH2Inner(const hoNDArray<T>& data, hoNDArray<T>& coilMap, size_t ks, size_t kz, size_t iterNum, typename realType<T>::Type thres)
+{
+    try
+    {
+        typedef typename realType<T>::Type value_type;
+
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+        size_t E2 = data.get_size(2);
+        size_t CHA = data.get_size(3);
+
+        size_t N = data.get_number_of_elements()/(RO*E1*E2*CHA);
+        GADGET_CHECK_RETURN_FALSE(N==1);
+
+        const T* pData = data.begin();
+
+        if ( !data.dimensions_equal(&coilMap) )
+        {
+            coilMap = data;
+        }
+
+        // create convolution kernel
+        hoNDArray<T> ker(ks, ks, kz);
+        Gadgetron::fill( &ker, T( (value_type)1.0/(ks*ks*kz)) );
+
+        hoNDArray<T> R(RO, E1, E2, 1), imT(RO, E1, E2, 1), magT(RO, E1, E2, 1);
+        hoNDArray<T> coilMapConv(RO, E1, E2, CHA);
+        hoNDArray<T> D(RO, E1, E2, CHA);
+        hoNDArray<T> D_sum(1, CHA);
+        typename realType<T>::Type v;
+        T vCha;
+        size_t iter, cha;
+
+        hoNDArray<T> dataByCha(RO*E1*E2, CHA, const_cast<T*>(data.begin()));
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(dataByCha, D_sum, 0));
+        Gadgetron::norm2(D_sum, v);
+        Gadgetron::scal( (value_type)1.0/v, D_sum);
+
+        Gadgetron::clear(R);
+        for ( cha=0; cha<CHA; cha++ )
+        {
+            hoNDArray<T> dataCHA(RO, E1, E2, const_cast<T*>(data.begin())+cha*RO*E1*E2);
+            vCha = D_sum(cha);
+            Gadgetron::axpy( std::conj(vCha), dataCHA, R, R);
+        }
+
+        for ( iter=0; iter<iterNum; iter++ )
+        {
+            Gadgetron::conjugate(R, R);
+
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(data, R, coilMap));
+
+            Gadgetron::conv2(coilMap, ker, coilMapConv);
+
+            Gadgetron::multiplyConj(coilMapConv, coilMapConv, D);
+
+            GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(D, R, 3));
+
+            Gadgetron::sqrt(R, R);
+
+            Gadgetron::addEpsilon(R);
+            Gadgetron::inv(R, R);
+
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(coilMapConv, R, coilMap));
+
+            Gadgetron::multiplyConj(data, coilMap, D);
+            GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(D, R, 3));
+
+            //if ( iter < iterNum - 1 )
+            //{
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(coilMap, R, D));
+            //}
+            //else
+            //{
+            //    D = coilMap;
+            //}
+
+            hoNDArray<T> DByCha(RO*E1*E2, CHA, D.begin());
+            GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(DByCha, D_sum, 0));
+
+            Gadgetron::norm2(D_sum, v);
+            Gadgetron::scal( (value_type)1.0/v, D_sum);
+
+            Gadgetron::clear(imT);
+            for ( cha=0; cha<CHA; cha++ )
+            {
+                hoNDArray<T> coilMapCHA(RO, E1, E2, 1, coilMap.begin()+cha*RO*E1*E2);
+                vCha = D_sum(cha);
+                Gadgetron::axpy( std::conj(vCha), coilMapCHA, imT, imT);
+            }
+
+            Gadgetron::abs(imT, magT);
+            Gadgetron::divide(imT, magT, imT);
+
+            Gadgetron::multiply(R, imT, R);
+            Gadgetron::conjugate(imT, imT);
+            Gadgetron::multiply(coilMap, imT, coilMap);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::coilMap3DNIH2Inner(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+coilMap3DNIH(const hoNDArray<T>& data, hoNDArray<T>& coilMap, ISMRMRDCOILMAPALGO algo, size_t ks, size_t power, size_t iterNum, typename realType<T>::Type thres, bool true3D)
+{
+    try
+    {
+        typedef typename realType<T>::Type value_type;
+
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+        size_t E2 = data.get_size(2);
+        size_t CHA = data.get_size(3);
+
+        size_t N = data.get_number_of_elements()/(RO*E1*E2*CHA);
+
+        if ( !data.dimensions_equal(&coilMap) )
+        {
+            coilMap = data;
+        }
+
+        if ( ks%2 != 1 )
+        {
+            ks++;
+        }
+
+        //std::string debugFolder = "D:/software/Gadgetron/20130114/install/gadgetron/DebugOutput/";
+        //gtPlusIOAnalyze gt_io;
+
+        hoNDArray<T> data2D, coilMap2D;
+
+        if ( algo == ISMRMRD_SOUHEIL )
+        {
+            data2D.create(RO, E1, CHA);
+            coilMap2D.create(RO, E1, CHA);
+        }
+
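+        // dispatch: use the fully 3D estimation when the iterative algorithm is requested, or
+        // when E2 is large enough (E2 > 5*ks) and true3D is set; otherwise estimate the coil
+        // map slice by slice with the 2D routines, parallelizing over E2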
+        int n, e2;
+        for ( n=0; n<(long long)N; n++ )
+        {
+            if ( algo == ISMRMRD_SOUHEIL_ITER )
+            {
+                GDEBUG_STREAM("calling 3D version of Souheil iterative coil map estimation ... ");
+                GADGET_CHECK_RETURN_FALSE(this->coilMap3DNIH2Inner(data, coilMap, ks, ks, iterNum, thres));
+            }
+            else if ( algo==ISMRMRD_SOUHEIL && E2>5*ks && true3D )
+            {
+                GDEBUG_STREAM("calling 3D version of Souheil coil map estimation ... ");
+                GADGET_CHECK_RETURN_FALSE(this->coilMap3DNIHInner(data, coilMap, ks, power));
+            }
+            else
+            {
+                hoNDArray<T> dataCurr(RO, E1, E2, CHA, const_cast<T*>(data.begin()+n*RO*E1*E2*CHA));
+                hoNDArray<T> coilMapCurr(RO, E1, E2, CHA, coilMap.begin()+n*RO*E1*E2*CHA);
+
+                #pragma omp parallel default(none) private(e2) shared(dataCurr, coilMapCurr, RO, E1, E2, CHA, algo, ks, power, iterNum, thres) if (E2>12)
+                {
+                    hoNDArray<T> data2D(RO, E1, CHA);
+                    hoNDArray<T> coilMap2D(RO, E1, CHA);
+
+                    #pragma omp for
+                    for ( e2=0; e2<(int)E2; e2++ )
+                    {
+                        long long cha;
+
+                        for ( cha=0; cha<(long long)CHA; cha++ )
+                        {
+                            memcpy(data2D.begin()+cha*RO*E1, dataCurr.begin()+cha*RO*E1*E2+e2*RO*E1, sizeof(T)*RO*E1);
+                        }
+
+                        if ( algo == ISMRMRD_SOUHEIL_ITER )
+                        {
+                            coilMap2DNIH2Inner(data2D, coilMap2D, ks, iterNum, thres);
+                        }
+                        else
+                        {
+                            coilMap2DNIHInner(data2D, coilMap2D, ks, power);
+                        }
+
+                        for ( cha=0; cha<(long long)CHA; cha++ )
+                        {
+                            memcpy(coilMapCurr.begin()+cha*RO*E1*E2+e2*RO*E1, coilMap2D.begin()+cha*RO*E1, sizeof(T)*RO*E1);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::coilMap3DNIH(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+sumOfSquare(const hoNDArray<T>& data, hoNDArray<T>& sos)
+{
+    try
+    {
+        size_t NDim = data.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=3);
+
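+        // root-sum-of-squares over the channel dimension (dim 2): sos = sqrt( sum_cha |data|^2 )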
+        hoNDArray<T> tmp(data);
+        Gadgetron::multiplyConj(data, data, tmp);
+
+        if ( NDim == 3 )
+        {
+            // GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(tmp, sos));
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::sum_over_dimension(tmp, sos, 2));
+
+            std::vector<size_t> dim(2);
+            dim[0] = sos.get_size(0);
+            dim[1] = sos.get_size(1);
+
+            sos.reshape(dim);
+        }
+        else if ( NDim == 4 )
+        {
+            // GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverSecondLastDimension(tmp, sos));
+
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::sum_over_dimension(tmp, sos, 2));
+
+            std::vector<size_t> dim(3);
+            dim[0] = sos.get_size(0);
+            dim[1] = sos.get_size(1);
+            dim[2] = sos.get_size(3);
+
+            sos.reshape(dim);
+        }
+        else
+        {
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::sum_over_dimension(tmp, sos, 2));
+        }
+
+        Gadgetron::sqrt(sos, sos);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::sumOfSquare(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+coilCombine(const hoNDArray<T>& data, const hoNDArray<T>& coilMap, hoNDArray<T>& combined)
+{
+    try
+    {
+        size_t NDim = data.get_number_of_dimensions();
+        size_t NDimCoil = coilMap.get_number_of_dimensions();
+
+        // GADGET_CHECK_RETURN_FALSE(NDimCoil<=NDim);
+        GADGET_CHECK_RETURN_FALSE(data.get_number_of_elements()>=coilMap.get_number_of_elements());
+
+        size_t n;
+        for ( n=0; n<NDimCoil; n++ )
+        {
+            if ( n<NDim && coilMap.get_size(n)>1 )
+            {
+                GADGET_CHECK_RETURN_FALSE(data.get_size(n)==coilMap.get_size(n));
+            }
+        }
+
+        boost::shared_ptr< std::vector<size_t> > dim = data.get_dimensions();
+        boost::shared_ptr< std::vector<size_t> > dimCoil = coilMap.get_dimensions();
+
+        std::vector<size_t> dimCombined(*dim);
+        dimCombined.erase(dimCombined.begin()+2);
+        combined.create(&dimCombined);
+
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+        size_t CHA = data.get_size(2);
+        size_t N = data.get_size(3);
+
+        size_t coilN = coilMap.get_size(3);
+
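+        // combined = sum over CHA of data .* conj(coilMap); if the coil map has fewer frames
+        // along dim 3 than the data, each [RO E1 CHA] block is combined separately and coil
+        // map indices beyond the map extent are reset to 0; otherwise whole [RO E1 CHA N]
+        // blocks are combined at once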
+        if ( coilN < N )
+        {
+            size_t NCombined = data.get_number_of_elements()/(RO*E1*CHA);
+
+            std::vector<size_t> dataInd, coilMapInd(NDimCoil, 0), coimbinedInd(dimCombined.size(), 0);
+
+            size_t nn;
+            size_t d;
+            hoNDArray<T> dataTmp(RO, E1, CHA);
+            hoNDArray<T> combinedCurr(RO, E1, 1);
+
+            for ( nn=0; nn<NCombined; nn++ )
+            {
+                size_t offsetData = nn*RO*E1*CHA;
+                dataInd = data.calculate_index(offsetData);
+
+                for ( d=0; d<NDimCoil; d++ )
+                {
+                    if ( dataInd[d]<coilMap.get_size(d) )
+                    {
+                        coilMapInd[d] = dataInd[d];
+                    }
+                    else
+                    {
+                        coilMapInd[d] = 0;
+                    }
+                }
+
+                for ( d=3; d<NDim; d++ )
+                {
+                    coimbinedInd[d-1] = dataInd[d];
+                }
+
+                size_t offsetCoilMap = coilMap.calculate_offset(coilMapInd);
+                size_t offsetCombined = combined.calculate_offset(coimbinedInd);
+
+                hoNDArray<T> dataCurr(RO, E1, CHA, const_cast<T*>(data.begin())+offsetData);
+                hoNDArray<T> coilMapCurr(RO, E1, CHA, const_cast<T*>(coilMap.begin())+offsetCoilMap);
+
+                Gadgetron::multiplyConj(dataCurr, coilMapCurr, dataTmp);
+                GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(dataTmp, combinedCurr, 2));
+
+                memcpy(combined.begin()+offsetCombined, combinedCurr.begin(), sizeof(T)*RO*E1);
+            }
+        }
+        else
+        {
+            size_t NCombined = data.get_number_of_elements()/(RO*E1*CHA*N);
+
+            std::vector<size_t> dataInd, coilMapInd(NDimCoil, 0), coimbinedInd(dimCombined.size(), 0);
+
+            size_t nn;
+            size_t d;
+            hoNDArray<T> dataTmp(RO, E1, CHA, N);
+            hoNDArray<T> combinedCurr(RO, E1, 1, N);
+
+            for ( nn=0; nn<NCombined; nn++ )
+            {
+                size_t offsetData = nn*RO*E1*CHA*N;
+                dataInd = data.calculate_index(offsetData);
+
+                for ( d=0; d<NDimCoil; d++ )
+                {
+                    if ( dataInd[d]<coilMap.get_size(d) )
+                    {
+                        coilMapInd[d] = dataInd[d];
+                    }
+                    else
+                    {
+                        coilMapInd[d] = 0;
+                    }
+                }
+
+                for ( d=3; d<NDim; d++ )
+                {
+                    coimbinedInd[d-1] = dataInd[d];
+                }
+
+                size_t offsetCoilMap = coilMap.calculate_offset(coilMapInd);
+                size_t offsetCombined = combined.calculate_offset(coimbinedInd);
+
+                hoNDArray<T> dataCurr(RO, E1, CHA, N, const_cast<T*>(data.begin())+offsetData);
+                hoNDArray<T> coilMapCurr(RO, E1, CHA, N, const_cast<T*>(coilMap.begin())+offsetCoilMap);
+
+                Gadgetron::multiplyConj(dataCurr, coilMapCurr, dataTmp);
+                GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(dataTmp, combinedCurr, 2));
+
+                memcpy(combined.begin()+offsetCombined, combinedCurr.begin(), sizeof(T)*RO*E1*N);
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::coilCombine(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+coilCombine3D(const hoNDArray<T>& data, const hoNDArray<T>& coilMap, hoNDArray<T>& combined)
+{
+    try
+    {
+        size_t NDim = data.get_number_of_dimensions();
+        size_t NDimCoil = coilMap.get_number_of_dimensions();
+
+        // GADGET_CHECK_RETURN_FALSE(NDimCoil<=NDim);
+        GADGET_CHECK_RETURN_FALSE(data.get_number_of_elements()>=coilMap.get_number_of_elements());
+
+        /*size_t n;
+        for ( n=0; n<NDimCoil; n++ )
+        {
+            GADGET_CHECK_RETURN_FALSE(data.get_size(n)==coilMap.get_size(n));
+        }*/
+
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==coilMap.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(data.get_size(1)==coilMap.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(data.get_size(2)==coilMap.get_size(2));
+        GADGET_CHECK_RETURN_FALSE(data.get_size(3)==coilMap.get_size(3));
+
+        boost::shared_ptr< std::vector<size_t> > dim = data.get_dimensions();
+        boost::shared_ptr< std::vector<size_t> > dimCoil = coilMap.get_dimensions();
+
+        size_t N = coilMap.get_number_of_elements();
+        size_t num = data.get_number_of_elements()/coilMap.get_number_of_elements();
+
+        std::vector<size_t> dimCombined(*dim);
+        dimCombined.erase(dimCombined.begin()+3);
+        combined.create(&dimCombined);
+
+        std::vector<size_t> dimCombinedCurr(*dimCoil);
+        dimCombinedCurr[3] = 1;
+
+        size_t NCombined = combined.get_number_of_elements()/num;
+
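+        // combined = sum over CHA (dim 3) of data .* conj(coilMap), applied block by block
+        // over the trailing dimensions; parallelized over blocks when num >= 6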
+        long long nn;
+        #pragma omp parallel default(none) private(nn) shared(data, coilMap, num, dimCoil, dimCombinedCurr, combined, N, NCombined) if (num>=6)
+        {
+            hoNDArray<T> dataTmp(coilMap);
+
+            #pragma omp for
+            for ( nn=0; nn<(long long)num; nn++ )
+            {
+                hoNDArray<T> dataCurr(dimCoil.get(), const_cast<T*>(data.begin()+nn*N));
+                Gadgetron::multiplyConj(dataCurr, coilMap, dataTmp);
+
+                hoNDArray<T> dataCombinedCurr(&dimCombinedCurr, const_cast<T*>(combined.begin()+nn*NCombined));
+                Gadgetron::sum_over_dimension(dataTmp, dataCombinedCurr, 3);
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::coilCombine3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+conjugateSymmetry2D(const hoNDArray<T>& kspace, hoNDArray<T>& kspaceConj)
+{
+    try
+    {
+        if ( !kspaceConj.dimensions_equal(&kspace) )
+        {
+            kspaceConj.create(kspace.get_dimensions());
+        }
+
+        long long RO = kspace.get_size(0);
+        long long E1 = kspace.get_size(1);
+        long long num = kspace.get_number_of_elements()/(RO*E1);
+
+        long long centerRO = RO/2;
+        long long centerE1 = E1/2;
+
+        long long ii;
+
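+        // for every 2D k-space frame, set dst(ro, e1) = conj( src(2*centerRO-ro, 2*centerE1-e1) ),
+        // wrapping indices that fall outside the matrix, i.e. enforce Hermitian symmetry about the k-space center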
+        #pragma omp parallel for default(none) private(ii) shared(RO, E1, num, centerRO, centerE1, kspace, kspaceConj)
+        for ( ii=0; ii<num; ii++ )
+        {
+            ho2DArray<T> src(RO, E1, const_cast<T*>(kspace.begin()+ii*RO*E1));
+            ho2DArray<T> dst(RO, E1, const_cast<T*>(kspaceConj.begin()+ii*RO*E1));
+
+            long long ro, e1;
+            long long cro, ce1;
+
+            for ( e1=0; e1<E1; e1++ )
+            {
+                ce1 = 2*centerE1-e1;
+                if ( ce1 > E1-1 )
+                {
+                    ce1 -= E1;
+                }
+
+                for ( ro=0; ro<RO; ro++ )
+                {
+                    cro = 2*centerRO-ro;
+                    if ( cro > RO-1 )
+                    {
+                        cro -= RO;
+                    }
+
+                    dst(ro, e1) = std::conj(src(cro, ce1));
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::conjugateSymmetry2D(const hoNDArray<T>& kspace, hoNDArray<T>& kspaceConj) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+conjugateSymmetry3D(const hoNDArray<T>& kspace, hoNDArray<T>& kspaceConj)
+{
+    try
+    {
+        if ( !kspaceConj.dimensions_equal(&kspace) )
+        {
+            kspaceConj.create(kspace.get_dimensions());
+        }
+
+        long long RO = kspace.get_size(0);
+        long long E1 = kspace.get_size(1);
+        long long E2 = kspace.get_size(2);
+        long long num = kspace.get_number_of_elements()/(RO*E1*E2);
+
+        long long centerRO = RO/2;
+        long long centerE1 = E1/2;
+        long long centerE2 = E2/2;
+
+        long long ii;
+
+        #pragma omp parallel for default(none) private(ii) shared(RO, E1, E2, num, centerRO, centerE1, centerE2, kspace, kspaceConj)
+        for ( ii=0; ii<num; ii++ )
+        {
+            ho3DArray<T> src(RO, E1, E2, const_cast<T*>(kspace.begin()+ii*RO*E1*E2));
+            ho3DArray<T> dst(RO, E1, E2, const_cast<T*>(kspaceConj.begin()+ii*RO*E1*E2));
+
+            long long ro, e1, e2;
+            long long cro, ce1, ce2;
+
+            for ( e2=0; e2<E2; e2++ )
+            {
+                ce2 = 2*centerE2-e2;
+                if ( ce2 > E2-1 )
+                {
+                    ce2 -= E2;
+                }
+
+                for ( e1=0; e1<E1; e1++ )
+                {
+                    ce1 = 2*centerE1-e1;
+                    if ( ce1 > E1-1 )
+                    {
+                        ce1 -= E1;
+                    }
+
+                    for ( ro=0; ro<RO; ro++ )
+                    {
+                        cro = 2*centerRO-ro;
+                        if ( cro > RO-1 )
+                        {
+                            cro -= RO;
+                        }
+
+                        dst(ro, e1, e2) = std::conj(src(cro, ce1, ce2));
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconUtilComplex<T>::conjugateSymmetry3D(const hoNDArray<T>& kspace, hoNDArray<T>& kspaceConj) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlow.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlow.h
new file mode 100644
index 0000000..55fc91e
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlow.h
@@ -0,0 +1,604 @@
+/** \file   gtPlusISMRMRDReconWorkFlow.h
+    \brief  Define the base class for the GtPlus reconstruction workflow
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "ismrmrd/ismrmrd.h"
+
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorker.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+struct DimensionRecordCompare
+{
+    DimensionRecordCompare() {}
+    ~DimensionRecordCompare() {}
+
+    bool operator()(const std::pair<ISMRMRDDIM, size_t>& a, const std::pair<ISMRMRDDIM, size_t>& b) const
+    {
+        return (a.second > b.second);
+    }
+};
+
+// [RO E1 CHA SLC E2 CON PHS REP SET SEG AVE]
+#define GADGETRON_RECON_KSPACE_DIM_NUM 11
+// [RO E1 CHA SLC E2 CON PHS REP SET AVE]
+#define GADGETRON_RECON_IMAGE_DIM_NUM 10
+
+template <typename T> 
+class gtPlusISMRMRDReconWorkFlow
+{
+public:
+
+    typedef std::pair<ISMRMRDDIM, size_t> DimensionRecordType;
+
+    typedef typename realType<T>::Type real_value_type;
+
+    gtPlusISMRMRDReconWorkFlow();
+    gtPlusISMRMRDReconWorkFlow(gtPlusReconWorker<T>& worker, gtPlusReconWorkOrder<T>& workOrder);
+    virtual ~gtPlusISMRMRDReconWorkFlow();
+
+    void printInfo(std::ostream& os);
+
+    virtual bool preProcessing() = 0;
+
+    virtual bool recon() = 0;
+
+    virtual bool postProcessing() = 0;
+
+    // assemble the ISMRMRD dimension index
+    // ind must have 10 elements
+    bool ismrmrdDimIndex10D(std::vector<size_t>& ind, const ISMRMRDDIM& dim, size_t value);
+
+    // find the permute order for ISMRMRD
+    bool findISMRMRDPermuteOrder(const std::vector<ISMRMRDDIM>& dimsSrc, const std::vector<ISMRMRDDIM>& dimsDst, std::vector<size_t>& order);
+
+    // print the dimension names
+    std::string printISMRMRDDimensions(const std::vector<ISMRMRDDIM>& dims);
+
+    // print the dimension size
+    std::string printISMRMRDDimensionSize(const std::vector<size_t>& sizes);
+
+    bool setDataArray(hoNDArray<T>& data);
+    bool setDataArray(hoNDArray<T>& data, hoNDArray<real_value_type>& time_stamp, hoNDArray<real_value_type>& physio_time_stamp);
+    bool setRefArray(hoNDArray<T>& ref);
+
+    // -------- these member variables are made as public ------------- //
+
+    // recon worker to do the computation
+    gtPlusReconWorker<T>* worker_;
+
+    // recon work order
+    gtPlusReconWorkOrder<T>* workOrder_;
+
+    // ----------------------------------
+    // noise prewhitening
+    // ----------------------------------
+    // noise scan, 3D array [RO E1 CHA]
+    hoNDArray<T>* noise_;
+
+    // noise bandwidth (Hz/pixel)
+    double noiseBW_;
+
+    // noise equivalent bandwidth ratio for receiver
+    double receriverBWRatio_;
+
+    // ADC sampling time in second
+    double ADCSamplingTimeinSecond_;
+
+    // RO oversampling ratio
+    double overSamplingRatioRO_;
+
+    // ----------------------------------
+    // final image sizes for RO/E1/E2
+    // ----------------------------------
+    size_t reconSizeRO_;
+    size_t reconSizeE1_;
+    size_t reconSizeE2_;
+
+    float encodingFOV_RO_;
+    float encodingFOV_E1_;
+    float encodingFOV_E2_;
+
+    float reconFOV_RO_;
+    float reconFOV_E1_;
+    float reconFOV_E2_;
+
+    // ----------------------------------
+    // dimension and starting indexes for this data_
+    // in case this data_ is a portion of a larger dataset
+    // ----------------------------------
+    std::vector< DimensionRecordType > dataDimStartingIndexes_;
+
+    // ----------------------------------
+    // reconstruction results, complex images, 10D array [RO E1 CHA SLC E2 CON PHS REP SET AVE]
+    // ----------------------------------
+    hoNDArray<T> res_;
+    // optional time stamps for the recon results, in seconds, 10D array [1 1 1 SLC E2 CON PHS REP SET AVE]
+    // if not set, the stored image header will be used for time stamps
+    hoNDArray<real_value_type> res_time_stamp_;
+    hoNDArray<real_value_type> res_physio_time_stamp_;
+
+    hoNDArray<T> res_second_;
+    hoNDArray<real_value_type> res_time_stamp_second_;
+    hoNDArray<real_value_type> res_physio_time_stamp_second_;
+
+    // gfactor; not all reconstructions fill the gfactor
+    // 10D array [RO E1 CHA SLC E2 CON PHS REP SET AVE]
+    hoNDArray<T> gfactor_;
+
+    // wrap-around map; not all reconstructions fill the wrap-around map
+    // 10D array [RO E1 2 SLC E2 CON PHS REP SET AVE]
+    hoNDArray<T> wrap_around_map_;
+
+    // ----------------------------------
+    // debug and timing
+    // ----------------------------------
+    // clock for timing
+    Gadgetron::GadgetronTimer gt_timer1_;
+    Gadgetron::GadgetronTimer gt_timer2_;
+    Gadgetron::GadgetronTimer gt_timer3_;
+
+    bool performTiming_;
+
+    // exporter
+    Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+    // debug folder
+    std::string debugFolder_;
+
+    // ----------------------------------
+    // input data array
+    // ----------------------------------
+    // image data, 11D [RO E1 CHA SLC E2 CON PHS REP SET SEG AVE]
+    hoNDArray<T>* data_;
+    // time stamp, 11D [1 E1 1 SLC E2 CON PHS REP SET SEG AVE]
+    // these are set with data array
+    hoNDArray<real_value_type>* time_stamp_;
+    hoNDArray<real_value_type>* physio_time_stamp_;
+
+    // reference calibration, 11D [RO E1 CHA SLC E2 CON PHS REP SET SEG AVE]
+    hoNDArray<T>* ref_;
+
+protected:
+
+    // internal helper memory allocated for computation
+    hoNDArray<T> dataCurr_;
+    hoNDArray<T> refCurr_;
+    hoNDArray<T> gfactorCurr_;
+    hoNDArray<T> wrap_around_mapCurr_;
+
+    // size of dimensions for image data
+    DimensionRecordType RO_;
+    DimensionRecordType E1_;
+    DimensionRecordType CHA_;
+    DimensionRecordType SLC_;
+    DimensionRecordType E2_;
+    DimensionRecordType CON_;
+    DimensionRecordType PHS_;
+    DimensionRecordType REP_;
+    DimensionRecordType SET_;
+    DimensionRecordType SEG_;
+    DimensionRecordType AVE_;
+
+    // size of dimensions for ref data
+    DimensionRecordType RO_ref_;
+    DimensionRecordType E1_ref_;
+    DimensionRecordType CHA_ref_;
+    DimensionRecordType SLC_ref_;
+    DimensionRecordType E2_ref_;
+    DimensionRecordType CON_ref_;
+    DimensionRecordType PHS_ref_;
+    DimensionRecordType REP_ref_;
+    DimensionRecordType SET_ref_;
+    DimensionRecordType SEG_ref_;
+    DimensionRecordType AVE_ref_;
+
+    // expected dimensions for results
+    std::vector<ISMRMRDDIM> dimsRes_;
+
+    // util
+    gtPlusISMRMRDReconUtil<T> gtPlus_util_;
+};
+
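+// A minimal usage sketch (for illustration only; the concrete workflow subclass and the
+// variable names below are placeholders, not part of this header):
+//
+//     SomeConcreteWorkFlow<T> workflow;          // e.g. a Cartesian workflow subclass
+//     workflow.worker_    = &worker;             // gtPlusReconWorker<T>
+//     workflow.workOrder_ = &workOrder;          // gtPlusReconWorkOrder<T>
+//     workflow.setDataArray(kspace);             // 11D [RO E1 CHA SLC E2 CON PHS REP SET SEG AVE]
+//     workflow.setRefArray(ref);
+//     if ( workflow.preProcessing() && workflow.recon() && workflow.postProcessing() )
+//     {
+//         hoNDArray<T>& images = workflow.res_;  // 10D [RO E1 CHA SLC E2 CON PHS REP SET AVE]
+//     }
+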
+template <typename T> 
+gtPlusISMRMRDReconWorkFlow<T>::gtPlusISMRMRDReconWorkFlow() 
+: data_(NULL), time_stamp_(NULL), physio_time_stamp_(NULL), ref_(NULL), worker_(NULL), workOrder_(NULL), noise_(NULL), noiseBW_(1.0), receriverBWRatio_(1.0), overSamplingRatioRO_(1.0), ADCSamplingTimeinSecond_(1.0) , performTiming_(false)
+{
+    RO_.first = DIM_ReadOut;
+    RO_.second = 1;
+
+    E1_.first = DIM_Encoding1;
+    E1_.second = 1;
+
+    CHA_.first = DIM_Channel;
+    CHA_.second = 1;
+
+    SLC_.first = DIM_Slice;
+    SLC_.second = 1;
+
+    E2_.first = DIM_Encoding2;
+    E2_.second = 1;
+
+    CON_.first = DIM_Contrast;
+    CON_.second = 1;
+
+    PHS_.first = DIM_Phase;
+    PHS_.second = 1;
+
+    REP_.first = DIM_Repetition;
+    REP_.second = 1;
+
+    SET_.first = DIM_Set;
+    SET_.second = 1;
+
+    SEG_.first = DIM_Segment;
+    SEG_.second = 1;
+
+    AVE_.first = DIM_Average;
+    AVE_.second = 1;
+
+    RO_ref_.first = DIM_ReadOut;
+    RO_ref_.second = 1;
+
+    E1_ref_.first = DIM_Encoding1;
+    E1_ref_.second = 1;
+
+    CHA_ref_.first = DIM_Channel;
+    CHA_ref_.second = 1;
+
+    SLC_ref_.first = DIM_Slice;
+    SLC_ref_.second = 1;
+
+    E2_ref_.first = DIM_Encoding2;
+    E2_ref_.second = 1;
+
+    CON_ref_.first = DIM_Contrast;
+    CON_ref_.second = 1;
+
+    PHS_ref_.first = DIM_Phase;
+    PHS_ref_.second = 1;
+
+    REP_ref_.first = DIM_Repetition;
+    REP_ref_.second = 1;
+
+    SET_ref_.first = DIM_Set;
+    SET_ref_.second = 1;
+
+    SEG_ref_.first = DIM_Segment;
+    SEG_ref_.second = 1;
+
+    AVE_ref_.first = DIM_Average;
+    AVE_ref_.second = 1;
+
+    dimsRes_.resize(GADGETRON_RECON_IMAGE_DIM_NUM);
+    dimsRes_[0] = DIM_ReadOut;
+    dimsRes_[1] = DIM_Encoding1;
+    dimsRes_[2] = DIM_Channel;
+    dimsRes_[3] = DIM_Slice;
+    dimsRes_[4] = DIM_Encoding2;
+    dimsRes_[5] = DIM_Contrast;
+    dimsRes_[6] = DIM_Phase;
+    dimsRes_[7] = DIM_Repetition;
+    dimsRes_[8] = DIM_Set;
+    dimsRes_[9] = DIM_Average;
+
+    dataDimStartingIndexes_.resize(GADGETRON_RECON_KSPACE_DIM_NUM);
+    dataDimStartingIndexes_[0] = DimensionRecordType(DIM_ReadOut, 0);
+    dataDimStartingIndexes_[1] = DimensionRecordType(DIM_Encoding1, 0);
+    dataDimStartingIndexes_[2] = DimensionRecordType(DIM_Channel, 0);
+    dataDimStartingIndexes_[3] = DimensionRecordType(DIM_Slice, 0);
+    dataDimStartingIndexes_[4] = DimensionRecordType(DIM_Encoding2, 0);
+    dataDimStartingIndexes_[5] = DimensionRecordType(DIM_Contrast, 0);
+    dataDimStartingIndexes_[6] = DimensionRecordType(DIM_Phase, 0);
+    dataDimStartingIndexes_[7] = DimensionRecordType(DIM_Repetition, 0);
+    dataDimStartingIndexes_[8] = DimensionRecordType(DIM_Set, 0);
+    dataDimStartingIndexes_[9] = DimensionRecordType(DIM_Segment, 0);
+    dataDimStartingIndexes_[10] = DimensionRecordType(DIM_Average, 0);
+
+    gt_timer1_.set_timing_in_destruction(false);
+    gt_timer2_.set_timing_in_destruction(false);
+    gt_timer3_.set_timing_in_destruction(false);
+}
+
+template <typename T> 
+gtPlusISMRMRDReconWorkFlow<T>::gtPlusISMRMRDReconWorkFlow(gtPlusReconWorker<T>& worker, gtPlusReconWorkOrder<T>& workOrder) 
+: data_(NULL), time_stamp_(NULL), physio_time_stamp_(NULL), ref_(NULL), worker_(&worker), workOrder_(&workOrder), noise_(NULL),
+  noiseBW_(1.0), receriverBWRatio_(1.0), overSamplingRatioRO_(1.0), ADCSamplingTimeinSecond_(1.0) , performTiming_(false)
+{
+    RO_.second = 1;
+    E1_.second = 1;
+    CHA_.second = 1;
+    SLC_.second = 1;
+    E2_.second = 1;
+    CON_.second = 1;
+    PHS_.second = 1;
+    REP_.second = 1;
+    SET_.second = 1;
+    SEG_.second = 1;
+    AVE_.second = 1;
+
+    RO_ref_.second = 1;
+    E1_ref_.second = 1;
+    CHA_ref_.second = 1;
+    SLC_ref_.second = 1;
+    E2_ref_.second = 1;
+    CON_ref_.second = 1;
+    PHS_ref_.second = 1;
+    REP_ref_.second = 1;
+    SET_ref_.second = 1;
+    SEG_ref_.second = 1;
+    AVE_ref_.second = 1;
+
+    dimsRes_.resize(GADGETRON_RECON_IMAGE_DIM_NUM);
+    dimsRes_[0] = DIM_ReadOut;
+    dimsRes_[1] = DIM_Encoding1;
+    dimsRes_[2] = DIM_Channel;
+    dimsRes_[3] = DIM_Slice;
+    dimsRes_[4] = DIM_Encoding2;
+    dimsRes_[5] = DIM_Contrast;
+    dimsRes_[6] = DIM_Phase;
+    dimsRes_[7] = DIM_Repetition;
+    dimsRes_[8] = DIM_Set;
+    dimsRes_[9] = DIM_Average;
+
+    dataDimStartingIndexes_.resize(GADGETRON_RECON_KSPACE_DIM_NUM);
+    dataDimStartingIndexes_[0] = DimensionRecordType(DIM_ReadOut, 0);
+    dataDimStartingIndexes_[1] = DimensionRecordType(DIM_Encoding1, 0);
+    dataDimStartingIndexes_[2] = DimensionRecordType(DIM_Channel, 0);
+    dataDimStartingIndexes_[3] = DimensionRecordType(DIM_Slice, 0);
+    dataDimStartingIndexes_[4] = DimensionRecordType(DIM_Encoding2, 0);
+    dataDimStartingIndexes_[5] = DimensionRecordType(DIM_Contrast, 0);
+    dataDimStartingIndexes_[6] = DimensionRecordType(DIM_Phase, 0);
+    dataDimStartingIndexes_[7] = DimensionRecordType(DIM_Repetition, 0);
+    dataDimStartingIndexes_[8] = DimensionRecordType(DIM_Set, 0);
+    dataDimStartingIndexes_[9] = DimensionRecordType(DIM_Segment, 0);
+    dataDimStartingIndexes_[10] = DimensionRecordType(DIM_Average, 0);
+
+    gt_timer1_.set_timing_in_destruction(false);
+    gt_timer2_.set_timing_in_destruction(false);
+    gt_timer3_.set_timing_in_destruction(false);
+}
+
+template <typename T> 
+gtPlusISMRMRDReconWorkFlow<T>::~gtPlusISMRMRDReconWorkFlow() 
+{
+}
+
+template <typename T> 
+void gtPlusISMRMRDReconWorkFlow<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD Recon workflow -------------" << endl;
+    os << "Implementation of the general reconstruction workflow for the ISMRMRD convention" << endl;
+    os << "The gtPlusISMRMRDReconWorkFlow class defines and implements the reconstruction workflow for ISMRMRD data" << endl;
+    os << "The reconstruction is split into three stages:" << endl;
+    os << "1) PreProcessing" << endl;
+    os << "2) Reconstruction" << endl;
+    os << "3) PostProcessing" << endl;
+    os << endl;
+    os << "These three steps can have different operations for different sampling patterns or imaging applications" << endl;
+    os << "----------------------------------------------------------" << endl;
+}
+
+template <typename T> 
+inline bool gtPlusISMRMRDReconWorkFlow<T>::
+ismrmrdDimIndex10D(std::vector<size_t>& ind, const ISMRMRDDIM& dim, size_t value)
+{
+    GADGET_CHECK_RETURN_FALSE(ind.size()>(size_t)(dim-DIM_ReadOut));
+    ind[dim-DIM_ReadOut] = value;
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlow<T>::
+findISMRMRDPermuteOrder(const std::vector<ISMRMRDDIM>& dimsSrc, const std::vector<ISMRMRDDIM>& dimsDst, std::vector<size_t>& order)
+{
+    GADGET_CHECK_RETURN_FALSE(dimsSrc.size()==dimsDst.size());
+    order.resize(dimsSrc.size());
+
+    size_t NDim = dimsSrc.size();
+    size_t src, dst;
+
+    for ( dst=0; dst<NDim; dst++ )
+    {
+        for ( src=0; src<NDim; src++ )
+        {
+            if ( dimsSrc[src] == dimsDst[dst] )
+                break;
+        }
+
+        order[dst] = src;
+    }
+
+    return true;
+}
+
+template <typename T> 
+std::string gtPlusISMRMRDReconWorkFlow<T>::
+printISMRMRDDimensions(const std::vector<ISMRMRDDIM>& dims)
+{
+    using namespace std;
+
+    if ( dims.empty() ) return std::string("[ ]");
+
+    size_t NDim = dims.size();
+
+    size_t ii;
+
+    std::ostringstream os;
+
+    os << "[ ";
+    for ( ii=0; ii<NDim; ii++ )
+    {
+        ISMRMRDDIM dim = dims[ii];
+        switch (dim)
+        {
+            case DIM_ReadOut:
+                os << "DIM_ReadOut ";
+            break;
+
+            case DIM_Encoding1:
+                os << "Encoding1 ";
+            break;
+
+            case DIM_Channel:
+                os << "Channel ";
+            break;
+
+            case DIM_Slice:
+                os << "Slice ";
+            break;
+
+            case DIM_Encoding2:
+                os << "Encoding2 ";
+            break;
+
+            case DIM_Contrast:
+                os << "Contrast ";
+            break;
+
+            case DIM_Phase:
+                os << "Phase ";
+            break;
+
+            case DIM_Repetition:
+                os << "Repetition ";
+            break;
+
+            case DIM_Set:
+                os << "Set ";
+            break;
+
+            case DIM_Segment:
+                os << "Segment ";
+            break;
+
+            case DIM_Average:
+                os << "Average ";
+            break;
+
+            default:
+                os << " Other";
+        }
+    }
+    os << "]" << endl;
+
+    std::string dimStr(os.str());
+    return dimStr;
+}
+
+template <typename T> 
+std::string gtPlusISMRMRDReconWorkFlow<T>::
+printISMRMRDDimensionSize(const std::vector<size_t>& sizes)
+{
+    using namespace std;
+
+    if ( sizes.empty() ) return std::string("[ ]");
+
+    size_t NDim = sizes.size();
+
+    size_t ii;
+
+    std::ostringstream os;
+
+    os << "[ ";
+    for ( ii=0; ii<NDim; ii++ )
+    {
+        os << sizes[ii] << " ";
+    }
+    os << "]" << endl;
+
+    std::string sizeStr(os.str());
+    return sizeStr;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlow<T>::setDataArray(hoNDArray<T>& data)
+{
+    try
+    {
+        data_ = &data;
+
+        RO_.second = data.get_size(0);
+        E1_.second = data.get_size(1);
+        CHA_.second = data.get_size(2);
+        SLC_.second = data.get_size(3);
+        E2_.second = data.get_size(4);
+        CON_.second = data.get_size(5);
+        PHS_.second = data.get_size(6);
+        REP_.second = data.get_size(7);
+        SET_.second = data.get_size(8);
+        SEG_.second = data.get_size(9);
+        AVE_.second = data.get_size(10);
+    }
+    catch(...)
+    {
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlow<T>::setDataArray(hoNDArray<T>& data, hoNDArray<real_value_type>& time_stamp, hoNDArray<real_value_type>& physio_time_stamp)
+{
+    try
+    {
+        data_ = &data;
+        time_stamp_ = &time_stamp;
+        physio_time_stamp_ = &physio_time_stamp;
+
+        RO_.second = data.get_size(0);
+        E1_.second = data.get_size(1);
+        CHA_.second = data.get_size(2);
+        SLC_.second = data.get_size(3);
+        E2_.second = data.get_size(4);
+        CON_.second = data.get_size(5);
+        PHS_.second = data.get_size(6);
+        REP_.second = data.get_size(7);
+        SET_.second = data.get_size(8);
+        SEG_.second = data.get_size(9);
+        AVE_.second = data.get_size(10);
+    }
+    catch(...)
+    {
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlow<T>::setRefArray(hoNDArray<T>& ref)
+{
+    try
+    {
+        ref_ = &ref;
+
+        RO_ref_.second     = ref.get_size(0);
+        E1_ref_.second     = ref.get_size(1);
+        CHA_ref_.second    = ref.get_size(2);
+        SLC_ref_.second    = ref.get_size(3);
+        E2_ref_.second     = ref.get_size(4);
+        CON_ref_.second    = ref.get_size(5);
+        PHS_ref_.second    = ref.get_size(6);
+        REP_ref_.second    = ref.get_size(7);
+        SET_ref_.second    = ref.get_size(8);
+        SEG_ref_.second    = ref.get_size(9);
+        AVE_ref_.second    = ref.get_size(10);
+    }
+    catch(...)
+    {
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlowCartesian.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlowCartesian.h
new file mode 100644
index 0000000..b3fc883
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlowCartesian.h
@@ -0,0 +1,1892 @@
+/** \file   gtPlusISMRMRDReconWorkFlowCartesian.h
+    \brief  Define the base class for the GtPlus reconstruction workflow for cartesian sampling
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusISMRMRDReconWorkFlow.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusISMRMRDReconWorkFlowCartesian : public gtPlusISMRMRDReconWorkFlow<T>
+{
+public:
+
+    typedef gtPlusISMRMRDReconWorkFlow<T> BaseClass;
+    typedef typename BaseClass::DimensionRecordType DimensionRecordType;
+    typedef typename BaseClass::real_value_type real_value_type;
+
+    gtPlusISMRMRDReconWorkFlowCartesian();
+    virtual ~gtPlusISMRMRDReconWorkFlowCartesian();
+
+    void printInfo(std::ostream& os);
+
+    virtual bool preProcessing();
+
+    virtual bool postProcessing();
+    virtual bool postProcessing(hoNDArray<T>& res, bool process_gfactor=true, bool process_wrap_around_map=true);
+
+    virtual bool configureWorkOrder(const std::vector<ISMRMRDDIM>& dims);
+
+    // resize or cut the reconstructed images to the recon space
+    // res_ [RO E1 CHA SLC E2 ...]
+    virtual bool convertToReconSpace2D(hoNDArray<T>& input_, hoNDArray<T>& output_, bool isKSpace);
+    // res_ [RO E1 E2 CHA ...]
+    virtual bool convertToReconSpace3D(hoNDArray<T>& input_, hoNDArray<T>& output_, bool isKSpace);
+
+    // predict the workOrder dimensions
+    virtual bool predictDimensions() = 0;
+
+    using BaseClass::data_;
+    using BaseClass::time_stamp_;
+    using BaseClass::physio_time_stamp_;
+    using BaseClass::ref_;
+    using BaseClass::gfactor_;
+    using BaseClass::wrap_around_map_;
+    using BaseClass::noise_;
+    using BaseClass::noiseBW_;
+    using BaseClass::receriverBWRatio_;
+    using BaseClass::ADCSamplingTimeinSecond_;
+    using BaseClass::overSamplingRatioRO_;
+    using BaseClass::reconSizeRO_;
+    using BaseClass::reconSizeE1_;
+    using BaseClass::reconSizeE2_;
+    using BaseClass::encodingFOV_RO_;
+    using BaseClass::encodingFOV_E1_;
+    using BaseClass::encodingFOV_E2_;
+    using BaseClass::reconFOV_RO_;
+    using BaseClass::reconFOV_E1_;
+    using BaseClass::reconFOV_E2_;
+
+    using BaseClass::res_;
+    using BaseClass::res_second_;
+    using BaseClass::res_time_stamp_;
+    using BaseClass::res_physio_time_stamp_;
+    using BaseClass::res_time_stamp_second_;
+    using BaseClass::res_physio_time_stamp_second_;
+
+    using BaseClass::worker_;
+    using BaseClass::workOrder_;
+
+    using BaseClass::dimsRes_;
+
+    using BaseClass::dataDimStartingIndexes_;
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+
+    // the workOrder can share the kernel computation results
+    // if WorkOrderShareDim_ is not DIM_NONE, then
+    // workOrders will share kernel estimation results along this
+    // dimension
+    ISMRMRDDIM WorkOrderShareDim_;
+
+    // the work flow can buffer the kernels computed from a previous work order and apply them to other work orders
+    // the work flow looks at the workFlow_BufferKernel_ and workFlow_use_BufferedKernel_ fields of the work order
+    // buffered kernels
+    boost::shared_ptr< hoNDArray<T> > workFlowBuffer_kernel_;
+    boost::shared_ptr< hoNDArray<T> > workFlowBuffer_kernelIm_;
+    boost::shared_ptr< hoNDArray<T> > workFlowBuffer_unmixingCoeffIm_;
+    boost::shared_ptr< hoNDArray<T> > workFlowBuffer_coilMap_;
+    boost::shared_ptr< std::vector<hoMatrix<T> > > workFlowBuffer_coilCompressionCoef_;
+
+    // whether to perform oversampling removal for ref data
+    bool ref_remove_oversampling_RO_;
+    // whether to apply noise prewhitening on ref data
+    bool ref_apply_noisePreWhitening_;
+
+protected:
+
+    using BaseClass::dataCurr_;
+    using BaseClass::refCurr_;
+    using BaseClass::gfactorCurr_;
+    using BaseClass::wrap_around_mapCurr_;
+
+    using BaseClass::RO_;
+    using BaseClass::E1_;
+    using BaseClass::CHA_;
+    using BaseClass::SLC_;
+    using BaseClass::E2_;
+    using BaseClass::CON_;
+    using BaseClass::PHS_;
+    using BaseClass::REP_;
+    using BaseClass::SET_;
+    using BaseClass::SEG_;
+    using BaseClass::AVE_;
+
+    using BaseClass::RO_ref_;
+    using BaseClass::E1_ref_;
+    using BaseClass::CHA_ref_;
+    using BaseClass::SLC_ref_;
+    using BaseClass::E2_ref_;
+    using BaseClass::CON_ref_;
+    using BaseClass::PHS_ref_;
+    using BaseClass::REP_ref_;
+    using BaseClass::SET_ref_;
+    using BaseClass::SEG_ref_;
+    using BaseClass::AVE_ref_;
+
+    using BaseClass::gtPlus_util_;
+
+    /// permute the array to the fixed order
+    template <typename T2> 
+    bool permuteArrayOrder(hoNDArray<T2>& data, std::vector<size_t>& order)
+    {
+        try
+        {
+            boost::shared_ptr< hoNDArray<T2> > data_permuted = Gadgetron::permute(&data, &order);
+            data.reshape(data_permuted->get_dimensions());
+            memcpy(data.begin(), data_permuted->begin(), data_permuted->get_number_of_bytes());
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors in gtPlusISMRMRDReconWorkFlowCartesian<T>::permuteArrayOrder(hoNDArray<T2>& data, std::vector<size_t>& order) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    /// copy workOrder results to workflow
+    bool copyReconResultsSecond(size_t dim5, size_t dim6, size_t dim7, size_t dim8, size_t dim9);
+    bool copyGFactor(size_t dim5, size_t dim6, size_t dim7, size_t dim8, size_t dim9, bool gfactor_needed);
+    bool copyWrapAroundMap(size_t dim5, size_t dim6, size_t dim7, size_t dim8, size_t dim9, bool wrap_around_map_needed);
+};
+
+template <typename T> 
+gtPlusISMRMRDReconWorkFlowCartesian<T>::
+gtPlusISMRMRDReconWorkFlowCartesian() : BaseClass(), WorkOrderShareDim_(DIM_NONE), ref_remove_oversampling_RO_(true), ref_apply_noisePreWhitening_(true)
+{
+}
+
+template <typename T> 
+gtPlusISMRMRDReconWorkFlowCartesian<T>::~gtPlusISMRMRDReconWorkFlowCartesian() 
+{
+}
+
+template <typename T> 
+void gtPlusISMRMRDReconWorkFlowCartesian<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD Recon workflow Cartesian -------------" << endl;
+    os << "Implementation of general reconstruction workflow for Cartesian sampling" << endl;
+    os << "Typical PreProcessing includes:" << endl;
+    os << "a) Combine SEG dimension" << endl;
+    os << "b) Remove readout oversampling if any" << endl;
+    os << "c) If input noise scan is available, compute and apply the noise prewhitening matrix" << endl;
+    os << "d) Apply the kspace filter along the RO direction if required" << endl;
+    os << endl;
+    os << "Typical PostProcessing includes:" << endl;
+    os << "a) Apply the kspace filter along the E1 and E2 directions if required" << endl;
+    os << "b) Perform the zero-padding resize if required" << endl;
+    os << endl;
+    os << "Data buffers are named to reflect the typical nature of MR acquisition" << endl;
+    os << "data: image kspace data, 10D array [RO E1 CHA SLC E2 CON PHS REP SET SEG AVE]" << endl;
+    os << "ref: calibration data, 10D array [RO E1 CHA SLC E2 CON PHS REP SET SEG AVE]" << endl;
+    os << "----------------------------------------------------------" << endl;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlowCartesian<T>::
+preProcessing()
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data_!=NULL);
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*data_, debugFolder_+"incomingKSpace"); }
+
+        // combine the segment dimension
+        if ( SEG_.second > 1 )
+        {
+            // GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(*data_, dataCurr_));
+
+            std::vector<size_t> dim, dimCurr;
+            data_->get_dimensions(dim);
+
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::sum_over_dimension(*data_, dataCurr_, data_->get_number_of_dimensions()-1));
+
+            dimCurr.resize(dim.size() - 1);
+            memcpy(&dimCurr[0], &dim[0], sizeof(size_t)*dimCurr.size());
+            dataCurr_.reshape(dimCurr);
+
+            *data_ = dataCurr_;
+            SEG_.second = 1;
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*data_, debugFolder_+"incomingKSpace_SEGCombined"); }
+        }
+
+        if ( (ref_ != NULL) && (ref_->get_number_of_elements()>0) )
+        {
+            if ( !debugFolder_.empty() )
+            {
+                gt_exporter_.exportArrayComplex(*ref_, debugFolder_+"incomingRef");
+            }
+        }
+
+        if ( ref_!=NULL && SEG_ref_.second>1 )
+        {
+            // GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(*ref_, refCurr_));
+
+            std::vector<size_t> dim, dimCurr;
+            ref_->get_dimensions(dim);
+
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::sum_over_dimension(*ref_, refCurr_, ref_->get_number_of_dimensions() - 1));
+
+            dimCurr.resize(dim.size() - 1);
+            memcpy(&dimCurr[0], &dim[0], sizeof(size_t)*dimCurr.size());
+            refCurr_.reshape(dimCurr);
+
+            *ref_ = refCurr_;
+            SEG_ref_.second = 1;
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*ref_, debugFolder_+"incomingRef_SEGCombined"); }
+        }
+
+        // if needed, remove the readout oversampling
+        if ( overSamplingRatioRO_ > 1.0 )
+        {
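+            // go to image space along RO, crop away the oversampled readout samples, then transform back to k-space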
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft1c(*data_);
+            // GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().cutpad2D(*data_, (size_t)(data_->get_size(0)/overSamplingRatioRO_), data_->get_size(1), dataCurr_));
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::crop((size_t)(data_->get_size(0) / overSamplingRatioRO_), data_->get_size(1), data_, &dataCurr_));
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft1c(dataCurr_);
+            *data_ = dataCurr_;
+            RO_.second = data_->get_size(0);
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*data_, debugFolder_+"kspace_oversamplingRORemoved"); }
+
+            if ( ref_ != NULL && ref_remove_oversampling_RO_ )
+            {
+                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft1c(*ref_);
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::crop((size_t)(ref_->get_size(0) / overSamplingRatioRO_), ref_->get_size(1), ref_, &refCurr_));
+                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft1c(refCurr_);
+                *ref_ = refCurr_;
+                RO_ref_.second = ref_->get_size(0);
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*ref_, debugFolder_+"ref_oversamplingRORemoved"); }
+            }
+
+            if ( workOrder_->start_RO_>=0 && workOrder_->end_RO_>=0 )
+            {
+                workOrder_->start_RO_ = (int)(workOrder_->start_RO_/overSamplingRatioRO_);
+                workOrder_->end_RO_ = (int)(workOrder_->end_RO_/overSamplingRatioRO_);
+            }
+        }
+
+        // if needed, perform the noise prewhitening
+        if ( noise_ != NULL )
+        {
+            hoMatrix<T> prewhiteningMatrix;
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().computeNoisePrewhiteningMatrix(*noise_, noiseBW_, receriverBWRatio_, ADCSamplingTimeinSecond_, prewhiteningMatrix));
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().performNoisePrewhitening(*data_, prewhiteningMatrix));
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*data_, debugFolder_+"kspace_noiseprewhitenned"); }
+
+            if ( ref_!=NULL && ref_apply_noisePreWhitening_ )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().performNoisePrewhitening(*ref_, prewhiteningMatrix));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*ref_, debugFolder_+"ref_noiseprewhitenned"); }
+            }
+        }
+
+        // if asymmetric echo is used, set the un-acquired RO regions to 0
+        size_t RO = data_->get_size(0);
+        if ( !( workOrder_->start_RO_<0 || workOrder_->end_RO_<0 || (workOrder_->end_RO_-workOrder_->start_RO_+1==RO) ) )
+        {
+            size_t num = data_->get_number_of_elements() / RO;
+            long long n;
+
+            long long startRO = workOrder_->start_RO_;
+            long long endRO = workOrder_->end_RO_;
+            T* pData = data_->begin();
+
+            #pragma omp parallel for default(none) private(n) shared(num, RO, startRO, endRO, pData)
+            for ( n=0; n<(long long)num; n++ )
+            {
+                if ( startRO > 0 )
+                {
+                    // partial echo at the beginning of the readout: zero samples [0, startRO)
+                    memset(pData+n*RO, 0, startRO*sizeof(T) );
+                }
+                else
+                {
+                    // partial echo at the end of the readout: zero samples (endRO, RO)
+                    memset(pData+n*RO+endRO+1, 0, (RO-endRO-1)*sizeof(T) );
+                }
+            }
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*data_, debugFolder_+"incomingKSpace_RO_setzeros"); }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconWorkFlowCartesian<T>::preProcessing() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlowCartesian<T>::
+convertToReconSpace2D(hoNDArray<T>& input_, hoNDArray<T>& output_, bool isKSpace)
+{
+    try
+    {
+        size_t RO = data_->get_size(0);
+        size_t E1 = data_->get_size(1);
+
+        size_t inputRO = input_.get_size(0);
+        size_t inputE1 = input_.get_size(1);
+
+        output_ = input_;
+
+        // if encoded FOV are the same as recon FOV
+        if ( (std::abs(encodingFOV_RO_/2 - reconFOV_RO_)<0.1) && (std::abs(encodingFOV_E1_-reconFOV_E1_)<0.1) )
+        {
+            if ( isKSpace )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize2DOnKSpace(input_, reconSizeRO_, reconSizeE1_, output_));
+            }
+            else
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize2D(input_, reconSizeRO_, reconSizeE1_, output_));
+            }
+        }
+        else if (encodingFOV_E1_>=reconFOV_E1_)
+        {
+            size_t encodingE1 = reconSizeE1_;
+            if ( encodingFOV_E1_ > reconFOV_E1_ )
+            {
+                float spacingE1 = reconFOV_E1_/reconSizeE1_;
+                encodingE1 = (size_t)std::floor(encodingFOV_E1_/spacingE1+0.5);
+            }
+
+            hoNDArray<T>* pSrc = &input_;
+            hoNDArray<T>* pDst = &output_;
+            hoNDArray<T>* pTmp;
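+            // pSrc/pDst act as ping-pong buffers: after each resize/crop step the
+            // pointers are swapped so pSrc always holds the latest intermediate result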
+
+            hoNDArray<T> buffer2D;
+
+            // adjust E1
+            if ( encodingE1>E1 && encodingE1>inputE1 )
+            {
+                if ( isKSpace )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize2DOnKSpace(*pSrc, RO, encodingE1, *pDst));
+                }
+                else
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize2D(*pSrc, RO, encodingE1, *pDst));
+                }
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*pDst, debugFolder_+"complexIm_zpadResize2D_enlarged"); }
+
+                isKSpace = false;
+                pTmp = pSrc; pSrc = pDst; pDst = pTmp;
+            }
+            else if ( encodingE1 < E1 )
+            {
+                if ( isKSpace )
+                {
+                    GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::crop(RO, encodingE1, pSrc, pDst));
+                }
+                else
+                {
+                    Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(*pSrc, buffer2D);
+                    GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::crop(RO, encodingE1, &buffer2D, pDst));
+                }
+
+                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(*pDst);
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*pDst, debugFolder_+"complexIm_zpadResize2D_cut"); }
+
+                isKSpace = false;
+                pTmp = pSrc; pSrc = pDst; pDst = pTmp;
+            }
+
+            //adjust RO
+            if ( RO < reconSizeRO_ )
+            {
+                if ( isKSpace )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize2DOnKSpace(*pSrc, reconSizeRO_, pSrc->get_size(1), *pDst));
+                }
+                else
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize2D(*pSrc, reconSizeRO_, pSrc->get_size(1), *pDst));
+                }
+
+                isKSpace = false;
+                pTmp = pSrc; pSrc = pDst; pDst = pTmp;
+            }
+            else if ( RO > reconSizeRO_ )
+            {
+                if ( isKSpace )
+                {
+                    GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::crop(reconSizeRO_, pSrc->get_size(1), pSrc, pDst));
+                }
+                else
+                {
+                    Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(*pSrc, buffer2D);
+                    GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::crop(reconSizeRO_, pSrc->get_size(1), &buffer2D, pDst));
+                }
+
+                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(*pDst);
+
+                isKSpace = false;
+                pTmp = pSrc; pSrc = pDst; pDst = pTmp;
+            }
+
+            // final cut
+            if ( isKSpace )
+            {
+                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(*pSrc, buffer2D);
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::crop(reconSizeRO_, reconSizeE1_, &buffer2D, pDst));
+            }
+            else
+            {
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::crop(reconSizeRO_, reconSizeE1_, pSrc, pDst));
+            }
+
+            if ( pDst != &output_ )
+            {
+                output_ = *pDst;
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconWorkFlowCartesian<T>::convertToReconSpace2D(hoNDArray<T>& input_, hoNDArray<T>& output_, bool isKSpace) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlowCartesian<T>::
+convertToReconSpace3D(hoNDArray<T>& input_, hoNDArray<T>& output_, bool isKSpace)
+{
+    try
+    {
+        size_t RO = res_.get_size(0);
+        size_t E1 = res_.get_size(1);
+        size_t E2 = res_.get_size(2);
+
+        output_ = input_;
+
+        // if encoded FOV are the same as recon FOV
+        if ( (std::abs(encodingFOV_RO_/2 - reconFOV_RO_)<0.1) && (std::abs(encodingFOV_E1_-reconFOV_E1_)<0.1) && (std::abs(encodingFOV_E2_-reconFOV_E2_)<0.1) )
+        {
+            if ( isKSpace )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize3DOnKSpace(input_, reconSizeRO_, reconSizeE1_, reconSizeE2_, output_));
+            }
+            else
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize3D(input_, reconSizeRO_, reconSizeE1_, reconSizeE2_, output_));
+            }
+        }
+        else if ( (encodingFOV_E1_>=reconFOV_E1_) && (encodingFOV_E2_>=reconFOV_E2_) )
+        {
+            size_t encodingE1 = reconSizeE1_;
+            if ( encodingFOV_E1_ > reconFOV_E1_ )
+            {
+                float spacingE1 = reconFOV_E1_/reconSizeE1_;
+                encodingE1 = (size_t)std::floor(encodingFOV_E1_/spacingE1+0.5);
+            }
+
+            size_t encodingE2 = reconSizeE2_;
+            if ( encodingFOV_E2_ > reconFOV_E2_ )
+            {
+                float spacingE2 = reconFOV_E2_/reconSizeE2_;
+                encodingE2 = (size_t)std::floor(encodingFOV_E2_/spacingE2+0.5);
+            }
+
+            hoNDArray<T>* pSrc = &input_;
+            hoNDArray<T>* pDst = &output_;
+            hoNDArray<T>* pTmp;
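+            // same ping-pong buffering of pSrc/pDst as in convertToReconSpace2D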
+
+            hoNDArray<T> buffer3D;
+
+            // adjust E1
+            if ( encodingE1 >= E1+1 )
+            {
+                if ( isKSpace )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize3DOnKSpace(*pSrc, RO, encodingE1, E2, *pDst));
+                }
+                else
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize3D(*pSrc, RO, encodingE1, E2, *pDst));
+                }
+
+                isKSpace = false;
+                pTmp = pSrc; pSrc = pDst; pDst = pTmp;
+            }
+            else if ( encodingE1 <= E1-1 )
+            {
+                if ( isKSpace )
+                {
+                    GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::crop(RO, encodingE1, E2, pSrc, pDst));
+                }
+                else
+                {
+                    Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(*pSrc, buffer3D);
+                    GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::crop(RO, encodingE1, E2, &buffer3D, pDst));
+                }
+
+                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(*pDst);
+
+                isKSpace = false;
+                pTmp = pSrc; pSrc = pDst; pDst = pTmp;
+            }
+
+            // adjust E2
+            if ( encodingE2 >= E2+1 )
+            {
+                if ( isKSpace )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize3DOnKSpace(*pSrc, RO, pSrc->get_size(1), encodingE2, *pDst));
+                }
+                else
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize3D(*pSrc, RO, pSrc->get_size(1), encodingE2, *pDst));
+                }
+
+                isKSpace = false;
+                pTmp = pSrc; pSrc = pDst; pDst = pTmp;
+            }
+            else if ( encodingE2 <= E2-1 )
+            {
+                if ( isKSpace )
+                {
+                    GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::crop(RO, pSrc->get_size(1), encodingE2, pSrc, pDst));
+                }
+                else
+                {
+                    Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(*pSrc, buffer3D);
+                    GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::crop(RO, pSrc->get_size(1), encodingE2, &buffer3D, pDst));
+                }
+
+                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(*pDst);
+
+                isKSpace = false;
+                pTmp = pSrc; pSrc = pDst; pDst = pTmp;
+            }
+
+            //adjust RO
+            if ( RO < reconSizeRO_ )
+            {
+                if ( isKSpace )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize3DOnKSpace(*pSrc, reconSizeRO_, pSrc->get_size(1), pSrc->get_size(2), *pDst));
+                }
+                else
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize3D(*pSrc, reconSizeRO_, pSrc->get_size(1), pSrc->get_size(2), *pDst));
+                }
+
+                isKSpace = false;
+                pTmp = pSrc; pSrc = pDst; pDst = pTmp;
+            }
+            else if ( RO > reconSizeRO_ )
+            {
+                if ( isKSpace )
+                {
+                    GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::crop(reconSizeRO_, pSrc->get_size(1), pSrc->get_size(2), pSrc, pDst));
+                }
+                else
+                {
+                    Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(*pSrc, buffer3D);
+                    GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::crop(reconSizeRO_, pSrc->get_size(1), pSrc->get_size(2), &buffer3D, pDst));
+                }
+
+                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(*pDst);
+
+                isKSpace = false;
+                pTmp = pSrc; pSrc = pDst; pDst = pTmp;
+            }
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*pSrc, debugFolder_+"res_beforeCut"); }
+
+            // final cut on image
+            if ( isKSpace )
+            {
+                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(*pSrc, buffer3D);
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::crop(reconSizeRO_, reconSizeE1_, reconSizeE2_, &buffer3D, pDst));
+            }
+            else
+            {
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::crop(reconSizeRO_, reconSizeE1_, reconSizeE2_, pSrc, pDst));
+            }
+            // GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(*pDst));
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*pDst, debugFolder_+"res_AfterCut"); }
+
+            if ( pDst != &output_ )
+            {
+                output_ = *pDst;
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconWorkFlowCartesian<T>::convertToReconSpace3D(hoNDArray<T>& input_, hoNDArray<T>& output_, bool isKSpace) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlowCartesian<T>::
+postProcessing(hoNDArray<T>& res, bool process_gfactor, bool process_wrap_around_map)
+{
+    try
+    {
+        size_t RO = res.get_size(0);
+        size_t E1 = res.get_size(1);
+        size_t E2 = res.get_size(4);
+
+        // whether to process gfactor
+        bool has_gfactor = false;
+        if ( (gfactor_.get_size(0)==RO) && (gfactor_.get_size(1)==E1) )
+        {
+            has_gfactor = true;
+        }
+
+        if ( !process_gfactor )
+        {
+            has_gfactor = false;
+        }
+
+        // whether to process wrap_around map
+        bool has_wrap_around = false;
+        if ( (wrap_around_map_.get_size(0)==RO) && (wrap_around_map_.get_size(1)==E1) )
+        {
+            has_wrap_around = true;
+        }
+
+        if ( !process_wrap_around_map )
+        {
+            has_wrap_around = false;
+        }
+
+        if ( E2_.second > 1 )
+        {
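+            // E2 > 1: 3D acquisition; filter and resize in 3D, using a temporary [RO E1 E2 CHA SLC ...] ordering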
+            if ( performTiming_ ) { gt_timer1_.start("postProcessing - permute res array ... "); }
+
+            // permute E2 dimension from [RO E1 CHA SLC E2 ...] to [RO E1 E2 CHA SLC ...]
+
+            std::vector<size_t> dim_order(5);
+            dim_order[0] = 0;
+            dim_order[1] = 1;
+            dim_order[2] = 4;
+            dim_order[3] = 2;
+            dim_order[4] = 3;
+
+            std::vector<size_t> dim, dimPermuted;
+            res.get_dimensions(dim);
+            dimPermuted = dim;
+            dimPermuted[2] = dim[4];
+            dimPermuted[3] = dim[2];
+            dimPermuted[4] = dim[3];
+
+            dataCurr_.create(dimPermuted);
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(&res, &dataCurr_, &dim_order));
+
+            if ( has_gfactor )
+            {
+                gfactor_.get_dimensions(dim);
+                dimPermuted = dim;
+                dimPermuted[2] = dim[4];
+                dimPermuted[3] = dim[2];
+                dimPermuted[4] = dim[3];
+
+                gfactorCurr_.create(dimPermuted);
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(&gfactor_, &gfactorCurr_, &dim_order));
+            }
+
+            if ( has_wrap_around )
+            {
+                wrap_around_map_.get_dimensions(dim);
+                dimPermuted = dim;
+                dimPermuted[2] = dim[4];
+                dimPermuted[3] = dim[2];
+                dimPermuted[4] = dim[3];
+
+                wrap_around_mapCurr_.create(dimPermuted);
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(&wrap_around_map_, &wrap_around_mapCurr_, &dim_order));
+            }
+
+            if ( performTiming_ ) { gt_timer1_.stop(); }
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(dataCurr_, debugFolder_+"data_permuted"); }
+
+            // dataCurr_ = *data_permuted;
+
+            res.reshape(dataCurr_.get_dimensions());
+
+            bool inKSpace = false;
+
+            if ( workOrder_->filterROE1E2_.get_size(0)==RO 
+                    && workOrder_->filterROE1E2_.get_size(1)==E1 
+                    && workOrder_->filterROE1E2_.get_size(2)==E2 )
+            {
+                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(dataCurr_, res);
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspace3DfilterROE1E2(res, workOrder_->filterROE1E2_, dataCurr_));
+                inKSpace = true;
+            }
+            else if ( (workOrder_->filterRO_.get_number_of_elements() == RO) 
+                        && (workOrder_->filterE1_.get_number_of_elements() == E1) 
+                        && (workOrder_->filterE2_.get_number_of_elements() == E2) )
+            {
+                if ( performTiming_ ) { gt_timer1_.start("postProcessing - fft3c ... "); }
+                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(dataCurr_, res);
+                if ( performTiming_ ) { gt_timer1_.stop(); }
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"kspace_beforefiltered"); }
+
+                if ( performTiming_ ) { gt_timer1_.start("postProcessing - 3D kspace filter ... "); }
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspace3DfilterROE1E2(res, workOrder_->filterRO_, workOrder_->filterE1_, workOrder_->filterE2_, dataCurr_));
+                if ( performTiming_ ) { gt_timer1_.stop(); }
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(dataCurr_, debugFolder_+"kspace_afterfiltered"); }
+                inKSpace = true;
+            }
+            else
+            {
+                hoNDArray<T>* pSrc = &res;
+                hoNDArray<T>* pDst = &dataCurr_;
+
+                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(*pDst, *pSrc);
+
+                bool filterPerformed = false;
+
+                if ( workOrder_->filterRO_.get_number_of_elements() == RO )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspacefilterRO(*pSrc, workOrder_->filterRO_, *pDst));
+                    std::swap(pSrc, pDst);
+                    filterPerformed = true;
+                }
+
+                if ( workOrder_->filterE1_.get_number_of_elements() == E1 )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspacefilterE1(*pSrc, workOrder_->filterE1_, *pDst));
+                    std::swap(pSrc, pDst);
+                    filterPerformed = true;
+                }
+
+                if ( workOrder_->filterE2_.get_number_of_elements() == E2 )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspace3DfilterE2(*pSrc, workOrder_->filterE2_, *pDst));
+                    std::swap(pSrc, pDst);
+                    filterPerformed = true;
+                }
+
+                if ( filterPerformed )
+                {
+                    if ( pDst != &dataCurr_ )
+                    {
+                        dataCurr_ = *pDst;
+                    }
+                }
+                else
+                {
+                    dataCurr_ = res;
+                }
+
+                inKSpace = true;
+            }
+
+            if ( inKSpace )
+            {
+                if ( !debugFolder_.empty() )
+                {
+                    hoNDArray<T> Im(dataCurr_);
+                    Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(Im);
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(Im, debugFolder_+"complexIm_filtered"); }
+                }
+            }
+            else
+            {
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"complexIm_filtered"); }
+            }
+
+            GADGET_CHECK_RETURN_FALSE(convertToReconSpace3D(dataCurr_, res, inKSpace));
+
+            {
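+                // permute back from [RO E1 E2 CHA SLC ...] to [RO E1 CHA SLC E2 ...]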
+                std::vector<size_t> dim_order(5);
+                dim_order[0] = 0;
+                dim_order[1] = 1;
+                dim_order[2] = 3;
+                dim_order[3] = 4;
+                dim_order[4] = 2;
+
+                std::vector<size_t> dim, dimPermuted;
+                res.get_dimensions(dim);
+                dimPermuted = dim;
+                dimPermuted[2] = dim[3];
+                dimPermuted[3] = dim[4];
+                dimPermuted[4] = dim[2];
+
+                dataCurr_.create(dimPermuted);
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(&res, &dataCurr_, &dim_order));
+            }
+
+            res.reshape(dataCurr_.get_dimensions());
+            memcpy(res.begin(), dataCurr_.begin(), res.get_number_of_bytes());
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"complexIm_zpadResize3D"); }
+
+            if ( has_gfactor )
+            {
+                GADGET_CHECK_RETURN_FALSE(convertToReconSpace3D(gfactorCurr_, gfactor_, false));
+
+                std::vector<size_t> dim_order(5);
+                dim_order[0] = 0;
+                dim_order[1] = 1;
+                dim_order[2] = 3;
+                dim_order[3] = 4;
+                dim_order[4] = 2;
+
+                std::vector<size_t> dim, dimPermuted;
+                gfactor_.get_dimensions(dim);
+                dimPermuted = dim;
+                dimPermuted[2] = dim[3];
+                dimPermuted[3] = dim[4];
+                dimPermuted[4] = dim[2];
+
+                gfactorCurr_.create(dimPermuted);
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(&gfactor_, &gfactorCurr_, &dim_order));
+
+                gfactor_.reshape(gfactorCurr_.get_dimensions());
+                memcpy(gfactor_.begin(), gfactorCurr_.begin(), gfactor_.get_number_of_bytes());
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(gfactor_, debugFolder_+"gfactor_zpadResize3D"); }
+            }
+
+            if ( has_wrap_around )
+            {
+                GADGET_CHECK_RETURN_FALSE(convertToReconSpace3D(wrap_around_mapCurr_, wrap_around_map_, false));
+
+                std::vector<size_t> dim_order(5);
+                dim_order[0] = 0;
+                dim_order[1] = 1;
+                dim_order[2] = 3;
+                dim_order[3] = 4;
+                dim_order[4] = 2;
+
+                std::vector<size_t> dim, dimPermuted;
+                wrap_around_map_.get_dimensions(dim);
+                dimPermuted = dim;
+                dimPermuted[2] = dim[3];
+                dimPermuted[3] = dim[4];
+                dimPermuted[4] = dim[2];
+
+                wrap_around_mapCurr_.create(dimPermuted);
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(&wrap_around_map_, &wrap_around_mapCurr_, &dim_order));
+
+                wrap_around_map_.reshape(wrap_around_mapCurr_.get_dimensions());
+                memcpy(wrap_around_map_.begin(), wrap_around_mapCurr_.begin(), wrap_around_map_.get_number_of_bytes());
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(wrap_around_map_, debugFolder_+"wrap_around_map_zpadResize3D"); }
+            }
+        }
+        else
+        {
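+            // 2D acquisition: filter along RO/E1 in k-space and resize to the 2D recon space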
+            dataCurr_ = res;
+            bool inKSpace = false;
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(dataCurr_, debugFolder_+"complexIm_before_filtered"); }
+
+            if ( workOrder_->filterROE1_.get_size(0)==RO && workOrder_->filterROE1_.get_size(1)==E1 )
+            {
+                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(dataCurr_, res);
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspacefilterROE1(res, workOrder_->filterROE1_, dataCurr_));
+                inKSpace = true;
+            }
+            else if ( (workOrder_->filterRO_.get_number_of_elements() == RO) && (workOrder_->filterE1_.get_number_of_elements() == E1) )
+            {
+                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(dataCurr_, res);
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspacefilterROE1(res, workOrder_->filterRO_, workOrder_->filterE1_, dataCurr_));
+                inKSpace = true;
+            }
+            else
+            {
+                if ( (workOrder_->filterRO_.get_number_of_elements() == RO) && (workOrder_->filterE1_.get_number_of_elements() != E1) )
+                {
+                    Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(dataCurr_, res);
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspacefilterRO(res, workOrder_->filterRO_, dataCurr_));
+                    inKSpace = true;
+                }
+
+                if ( (workOrder_->filterRO_.get_number_of_elements() != RO) && (workOrder_->filterE1_.get_number_of_elements() == E1) )
+                {
+                    Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(dataCurr_, res);
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspacefilterE1(res, workOrder_->filterE1_, dataCurr_));
+                    inKSpace = true;
+                }
+            }
+
+            if ( inKSpace )
+            {
+                if ( !debugFolder_.empty() )
+                {
+                    hoNDArray<T> Im(dataCurr_);
+                    Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(Im);
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(Im, debugFolder_+"complexIm_after_filtered"); }
+                }
+            }
+            else
+            {
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"complexIm_after_filtered"); }
+            }
+
+            GADGET_CHECK_RETURN_FALSE(convertToReconSpace2D(dataCurr_, res, inKSpace));
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"complexIm_zpadResize2D"); }
+
+            if ( has_gfactor )
+            {
+                gfactorCurr_ = gfactor_;
+                GADGET_CHECK_RETURN_FALSE(convertToReconSpace2D(gfactorCurr_, gfactor_, false));
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(gfactor_, debugFolder_+"gfactor_zpadResize2D"); }
+            }
+
+            if ( has_wrap_around )
+            {
+                wrap_around_mapCurr_ = wrap_around_map_;
+                GADGET_CHECK_RETURN_FALSE(convertToReconSpace2D(wrap_around_mapCurr_, wrap_around_map_, false));
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(wrap_around_map_, debugFolder_+"wrap_around_map_zpadResize2D"); }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconWorkFlowCartesian<T>::postProcessing(res) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlowCartesian<T>::
+postProcessing()
+{
+    try
+    {
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res_, debugFolder_+"complexIm_afterRecon"); }
+        GADGET_CHECK_RETURN_FALSE(this->postProcessing(res_, true, true));
+
+        if ( this->res_second_.get_number_of_elements() > 0 )
+        {
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res_second_, debugFolder_+"complexImSecond_afterRecon"); }
+            GADGET_CHECK_RETURN_FALSE(this->postProcessing(res_second_, false, false));
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconWorkFlowCartesian<T>::postProcessing() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlowCartesian<T>::
+configureWorkOrder(const std::vector<ISMRMRDDIM>& dims)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data_!=NULL);
+        GADGET_CHECK_RETURN_FALSE(worker_!=NULL);
+        GADGET_CHECK_RETURN_FALSE(workOrder_!=NULL);
+
+        if ( ref_ == NULL )
+        {
+            ref_ = data_;
+        }
+
+        size_t dd;
+
+        // find the dimension size for data and ref
+        std::vector<size_t> dimSize(dims.size());
+        std::vector<size_t> dimSizeRef(dims.size(), 1);
+        size_t indChannelDim = 2;
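+        // dims[] holds ISMRMRD dimension identifiers; subtracting DIM_ReadOut maps them to 0-based array indexes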
+        for ( dd=0; dd<dims.size(); dd++ )
+        {
+            dimSize[dd] = data_->get_size(dims[dd]-DIM_ReadOut);
+            if ( ref_ != NULL )
+            {
+                dimSizeRef[dd] = ref_->get_size(dims[dd]-DIM_ReadOut);
+            }
+
+            if ( dims[dd] == DIM_Channel )
+            {
+                indChannelDim = dd;
+            }
+        }
+
+        GDEBUG_CONDITION_STREAM(!debugFolder_.empty(), "Recon dimensions : " << this->printISMRMRDDimensions(dims));
+        GDEBUG_CONDITION_STREAM(!debugFolder_.empty(), "Recon size       : " << this->printISMRMRDDimensionSize(dimSize));
+        GDEBUG_CONDITION_STREAM(!debugFolder_.empty(), "Recon ref size   : " << this->printISMRMRDDimensionSize(dimSizeRef));
+
+        bool gfactor_needed = workOrder_->gfactor_needed_;
+        bool wrap_around_map_needed = workOrder_->wrap_around_map_needed_;
+
+        // recon workOrder size
+        std::vector<size_t> dimReconSize(5);
+        dimReconSize[0] = dimSize[0];
+        dimReconSize[1] = dimSize[1];
+        dimReconSize[2] = dimSize[2];
+        dimReconSize[3] = dimSize[3];
+        dimReconSize[4] = dimSize[4];
+
+        std::vector<size_t> dimReconTimeStampSize(dimReconSize);
+        dimReconTimeStampSize[0] = 1; // RO = 1
+        dimReconTimeStampSize[2] = 1; // CHA = 1
+
+        std::vector<size_t> dimReconSizeRef(5);
+        dimReconSizeRef[0] = dimSizeRef[0];
+        dimReconSizeRef[1] = dimSizeRef[1];
+        dimReconSizeRef[2] = dimSizeRef[2];
+        dimReconSizeRef[3] = dimSizeRef[3];
+        dimReconSizeRef[4] = dimSizeRef[4];
+
+        // first two dimension are always RO and E1
+        size_t N2D = dimReconSize[0]*dimReconSize[1];
+        size_t N2DRef = dimReconSizeRef[0]*dimReconSizeRef[1];
+
+        size_t N3D = N2D*dimReconSize[2];
+        size_t N3DRef = N2DRef*dimReconSizeRef[2];
+
+        // allocate the results
+        size_t num_channels_res = workOrder_->num_channels_res_;
+
+        std::vector<size_t> dimResSize(dimSize);
+
+        if ( gfactor_needed )
+        {
+            dimResSize[indChannelDim] = 1;
+            gfactor_.create(&dimResSize);
+        }
+
+        if ( wrap_around_map_needed )
+        {
+            if ( workOrder_->acceFactorE2_ > 1 ) // 3D acquisition
+            {
+                dimResSize[indChannelDim] = 3;
+            }
+            else
+            {
+                dimResSize[indChannelDim] = 2;
+            }
+
+            wrap_around_map_.create(&dimResSize);
+        }
+
+        dimResSize[indChannelDim] = num_channels_res;
+        res_.create(&dimResSize);
+        Gadgetron::clear(res_);
+
+        res_second_.create(&dimResSize);
+        Gadgetron::clear(res_second_);
+
+        std::vector<size_t> dimReconResTimeStampSize(dimResSize);
+        dimReconResTimeStampSize[0] = 1;
+        dimReconResTimeStampSize[1] = 1;
+        dimReconResTimeStampSize[2] = 1;
+
+        res_time_stamp_.create(dimReconResTimeStampSize);
+        Gadgetron::fill(res_time_stamp_, (real_value_type)(-1) );
+
+        res_physio_time_stamp_.create(dimReconResTimeStampSize);
+        Gadgetron::fill(res_physio_time_stamp_, (real_value_type)(-1) );
+
+        res_time_stamp_second_.create(dimReconResTimeStampSize);
+        Gadgetron::fill(res_time_stamp_second_, (real_value_type)(-1) );
+
+        res_physio_time_stamp_second_.create(dimReconResTimeStampSize);
+        Gadgetron::fill(res_physio_time_stamp_second_, (real_value_type)(-1) );
+
+        std::vector<ISMRMRDDIM> dimsRes(dims);
+
+        GDEBUG_CONDITION_STREAM(!debugFolder_.empty(), "Recon res dimensions : " << this->printISMRMRDDimensions(dimsRes));
+        GDEBUG_CONDITION_STREAM(!debugFolder_.empty(), "Recon res size       : " << this->printISMRMRDDimensionSize(dimResSize));
+
+        bool shareAcrossWorkOrders = (WorkOrderShareDim_!=DIM_NONE);
+
+        if ( !debugFolder_.empty() )
+        {
+            gt_exporter_.exportArrayComplex(*data_, debugFolder_ + "data_");
+            gt_exporter_.exportArrayComplex(*ref_, debugFolder_ + "ref_");
+
+            if ( time_stamp_ != NULL )
+            {
+                gt_exporter_.exportArray(*time_stamp_, debugFolder_ + "time_stamp_");
+            }
+
+            if ( physio_time_stamp_ != NULL )
+            {
+                gt_exporter_.exportArray(*physio_time_stamp_, debugFolder_ + "physio_time_stamp_");
+            }
+        }
+
+        bool workFlow_use_BufferedKernel_ = workOrder_->workFlow_use_BufferedKernel_;
+
+        bool has_second_res = false;
+        bool has_recon_time_stamp = false;
+        bool has_recon_physio_time_stamp = false;
+        bool has_recon_time_stamp_second = false;
+        bool has_recon_physio_time_stamp_second = false;
+
+        // call up the recon
+        size_t dim9, dim8, dim7, dim6, dim5, dim4, dim3, dim2;
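+        // loop over the five outer dimensions; for every combination of them a work order
+        // holding the first five (recon) dimensions is filled and passed to the worker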
+        for ( dim9=0; dim9<dimSize[9]; dim9++ )
+        {
+            for ( dim8=0; dim8<dimSize[8]; dim8++ )
+            {
+                for ( dim7=0; dim7<dimSize[7]; dim7++ )
+                {
+                    for ( dim6=0; dim6<dimSize[6]; dim6++ )
+                    {
+                        for ( dim5=0; dim5<dimSize[5]; dim5++ )
+                        {
+                            std::vector<size_t> ind(11, 0);
+                            this->ismrmrdDimIndex10D(ind, dims[9], dim9);
+                            this->ismrmrdDimIndex10D(ind, dims[8], dim8);
+                            this->ismrmrdDimIndex10D(ind, dims[7], dim7);
+                            this->ismrmrdDimIndex10D(ind, dims[6], dim6);
+                            this->ismrmrdDimIndex10D(ind, dims[5], dim5);
+
+                            // ---------------------------
+                            // prepare the data in workOrder
+                            // ---------------------------
+                            if ( !workOrder_->data_.dimensions_equal(&dimReconSize) )
+                            {
+                                workOrder_->data_.create(&dimReconSize);
+                                workOrder_->time_stamp_.create(&dimReconTimeStampSize);
+                                Gadgetron::clear(workOrder_->time_stamp_);
+
+                                workOrder_->physio_time_stamp_.create(&dimReconTimeStampSize);
+                                Gadgetron::clear(workOrder_->physio_time_stamp_);
+                            }
+
+                            std::vector<size_t> indWorkOrder(5, 0);
+                            for ( dim4=0; dim4<dimSize[4]; dim4++ )
+                            {
+                                this->ismrmrdDimIndex10D(ind, dims[4], dim4);
+                                indWorkOrder[4] = dim4;
+
+                                for ( dim3=0; dim3<dimSize[3]; dim3++ )
+                                {
+                                    this->ismrmrdDimIndex10D(ind, dims[3], dim3);
+                                    indWorkOrder[3] = dim3;
+
+                                    if ( dims[2] == DIM_Channel )
+                                    {
+                                        long long offset = data_->calculate_offset(ind);
+                                        long long offsetWorkOrder = workOrder_->data_.calculate_offset(indWorkOrder);
+                                        memcpy(workOrder_->data_.begin()+offsetWorkOrder, data_->begin()+offset, sizeof(T)*N3D);
+
+                                        if ( time_stamp_ != NULL )
+                                        {
+                                            offset = time_stamp_->calculate_offset(ind);
+                                            offsetWorkOrder = workOrder_->time_stamp_.calculate_offset(indWorkOrder);
+                                            memcpy(workOrder_->time_stamp_.begin()+offsetWorkOrder, time_stamp_->begin()+offset, sizeof(real_value_type)*dimReconSize[1]);
+                                            if ( physio_time_stamp_ != NULL )
+                                            {
+                                                memcpy(workOrder_->physio_time_stamp_.begin()+offsetWorkOrder, physio_time_stamp_->begin()+offset, sizeof(real_value_type)*dimReconSize[1]);
+                                            }
+                                        }
+                                    }
+                                    else
+                                    {
+                                        GWARN_STREAM("dims[2] != DIM_Channel, the time stamps will not be copied ... ");
+
+                                        for ( dim2=0; dim2<dimSize[2]; dim2++ )
+                                        {
+                                            this->ismrmrdDimIndex10D(ind, dims[2], dim2);
+                                            indWorkOrder[2] = dim2;
+
+                                            long long offset = data_->calculate_offset(ind);
+                                            long long offsetWorkOrder = workOrder_->data_.calculate_offset(indWorkOrder);
+                                            memcpy(workOrder_->data_.begin()+offsetWorkOrder, data_->begin()+offset, sizeof(T)*N2D);
+                                        }
+                                    }
+                                }
+                            }
+
+                            // ---------------------------
+                            // prepare the ref in workOrder
+                            // ---------------------------
+                            if ( (ref_ != NULL) && (ref_->get_number_of_elements()>0) )
+                            {
+                                std::vector<size_t> indRef(11, 0);
+
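+                                // if the ref array is smaller than the data along a dimension,
+                                // reuse the last available ref index for the out-of-range positions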
+                                if ( dim9 < dimSizeRef[9] )
+                                {
+                                    this->ismrmrdDimIndex10D(indRef, dims[9], dim9);
+                                }
+                                else
+                                {
+                                    this->ismrmrdDimIndex10D(indRef, dims[9], dimSizeRef[9]-1);
+                                }
+
+                                if ( dim8 < dimSizeRef[8] )
+                                {
+                                    this->ismrmrdDimIndex10D(indRef, dims[8], dim8);
+                                }
+                                else
+                                {
+                                    this->ismrmrdDimIndex10D(indRef, dims[8], dimSizeRef[8]-1);
+                                }
+
+                                if ( dim7 < dimSizeRef[7] )
+                                {
+                                    this->ismrmrdDimIndex10D(indRef, dims[7], dim7);
+                                }
+                                else
+                                {
+                                    this->ismrmrdDimIndex10D(indRef, dims[7], dimSizeRef[7]-1);
+                                }
+
+                                if ( dim6 < dimSizeRef[6] )
+                                {
+                                    this->ismrmrdDimIndex10D(indRef, dims[6], dim6);
+                                }
+                                else
+                                {
+                                    this->ismrmrdDimIndex10D(indRef, dims[6], dimSizeRef[6]-1);
+                                }
+
+                                if ( dim5 < dimSizeRef[5] )
+                                {
+                                    this->ismrmrdDimIndex10D(indRef, dims[5], dim5);
+                                }
+                                else
+                                {
+                                    this->ismrmrdDimIndex10D(indRef, dims[5], dimSizeRef[5]-1);
+                                }
+
+                                if ( !workOrder_->ref_.dimensions_equal(&dimReconSizeRef) )
+                                {
+                                    workOrder_->ref_.create(&dimReconSizeRef);
+                                }
+
+                                std::vector<size_t> indRefWorkOrder(11, 0);
+                                for ( dim4=0; dim4<dimSize[4]; dim4++ )
+                                {
+                                    size_t dim4_ref = dim4;
+                                    if ( dim4 < dimSizeRef[4] )
+                                    {
+                                        this->ismrmrdDimIndex10D(indRef, dims[4], dim4);
+                                    }
+                                    else
+                                    {
+                                        this->ismrmrdDimIndex10D(indRef, dims[4], dimSizeRef[4]-1);
+                                        dim4_ref = dimSizeRef[4]-1;
+                                    }
+                                    indRefWorkOrder[4] = dim4_ref;
+
+                                    for ( dim3=0; dim3<dimSize[3]; dim3++ )
+                                    {
+                                        size_t dim3_ref = dim3;
+                                        if ( dim3 < dimSizeRef[3] )
+                                        {
+                                            this->ismrmrdDimIndex10D(indRef, dims[3], dim3);
+                                        }
+                                        else
+                                        {
+                                            this->ismrmrdDimIndex10D(indRef, dims[3], dimSizeRef[3]-1);
+                                            dim3_ref = dimSizeRef[3]-1;
+                                        }
+                                        indRefWorkOrder[3] = dim3_ref;
+
+                                        if ( dims[2] == DIM_Channel )
+                                        {
+                                            long long offset = ref_->calculate_offset(indRef);
+                                            long long offsetWorkOrder = workOrder_->ref_.calculate_offset(indRefWorkOrder);
+                                            memcpy(workOrder_->ref_.begin()+offsetWorkOrder, ref_->begin()+offset, sizeof(T)*N3DRef);
+                                        }
+                                        else
+                                        {
+                                            for ( dim2=0; dim2<dimSize[2]; dim2++ )
+                                            {
+                                                size_t dim2_ref = dim2;
+                                                if ( dim2 < dimSizeRef[2] )
+                                                {
+                                                    this->ismrmrdDimIndex10D(indRef, dims[2], dim2);
+                                                }
+                                                else
+                                                {
+                                                    this->ismrmrdDimIndex10D(indRef, dims[2], dimSizeRef[2]-1);
+                                                    dim2_ref = dimSizeRef[2]-1;
+                                                }
+                                                indRefWorkOrder[2] = dim2_ref;
+
+                                                long long offset = ref_->calculate_offset(indRef);
+                                                long long offsetWorkOrder = workOrder_->ref_.calculate_offset(indRefWorkOrder);
+                                                memcpy(workOrder_->ref_.begin()+offsetWorkOrder, ref_->begin()+offset, sizeof(T)*N2DRef);
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+
+                            // ---------------------------
+                            // handle shared work order
+                            // ---------------------------
+                            if ( !shareAcrossWorkOrders && workOrder_->workFlow_BufferKernel_ && !workOrder_->workFlow_use_BufferedKernel_ )
+                            {
+                                GADGET_CHECK_RETURN_FALSE(workOrder_->reset());
+                            }
+
+                            if ( shareAcrossWorkOrders && !workOrder_->workFlow_use_BufferedKernel_ )
+                            {
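+                                // when sharing across work orders, the first dim5 pass estimates the kernels;
+                                // subsequent passes reuse them as buffered kernels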
+                                if ( dim5==0 )
+                                {
+                                    workOrder_->workFlow_use_BufferedKernel_ = false;
+                                }
+                                else
+                                {
+                                    workOrder_->workFlow_use_BufferedKernel_ = true;
+                                }
+                            }
+
+                            if ( !debugFolder_.empty() )
+                            {
+                                gt_exporter_.exportArrayComplex(workOrder_->data_, debugFolder_+"workOrder_data");
+                                gt_exporter_.exportArray(workOrder_->time_stamp_, debugFolder_+"workOrder_time_stamp");
+                                gt_exporter_.exportArray(workOrder_->physio_time_stamp_, debugFolder_+"workOrder_physio_time_stamp");
+                                gt_exporter_.exportArrayComplex(workOrder_->ref_, debugFolder_+"workOrder_ref");
+                            }
+
+                            // ---------------------------
+                            // perform the recon
+                            // ---------------------------
+                            GADGET_CHECK_RETURN_FALSE(worker_->performRecon(workOrder_));
+
+                            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder_->complexIm_, debugFolder_+"workOrder_complexIm"); }
+
+                            if ( workOrder_->complexIm_second_.get_number_of_elements()>0 )
+                            {
+                                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder_->complexIm_second_, debugFolder_+"workOrder_complexImSecond"); }
+                            }
+
+                            if ( shareAcrossWorkOrders )
+                            {
+                                workOrder_->workFlow_use_BufferedKernel_ = workFlow_use_BufferedKernel_;
+                            }
+
+                            // ---------------------------
+                            // copy the recon complexIm
+                            // ---------------------------
+                            std::vector<size_t> indRes(ind);
+                            indRes[0] = 0;
+                            indRes[1] = 0;
+                            indRes[2] = 0;
+                            indRes[3] = 0;
+                            indRes[4] = 0;
+                            indRes[5] = dim5;
+                            indRes[6] = dim6;
+                            indRes[7] = dim7;
+                            indRes[8] = dim8;
+                            indRes[9] = dim9;
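+                            // indRes addresses the first element of the recon block (first five dimensions) for this outer-dimension combination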
+
+                            long long offset = res_.calculate_offset(indRes);
+                            memcpy(res_.begin()+offset, workOrder_->complexIm_.begin(), workOrder_->complexIm_.get_number_of_bytes());
+
+                            // ---------------------------
+                            // copy the recon time stamp
+                            // ---------------------------
+                            if ( workOrder_->recon_time_stamp_.get_number_of_elements()>0 )
+                            {
+                                has_recon_time_stamp = true;
+
+                                offset = res_time_stamp_.calculate_offset(indRes);
+                                memcpy(res_time_stamp_.begin()+offset, workOrder_->recon_time_stamp_.begin(), workOrder_->recon_time_stamp_.get_number_of_bytes());
+                            }
+
+                            // ---------------------------
+                            // copy the recon physio time stamp
+                            // ---------------------------
+                            if ( workOrder_->recon_physio_time_stamp_.get_number_of_elements()>0 )
+                            {
+                                has_recon_physio_time_stamp = true;
+
+                                offset = res_physio_time_stamp_.calculate_offset(indRes);
+                                memcpy(res_physio_time_stamp_.begin()+offset, workOrder_->recon_physio_time_stamp_.begin(), workOrder_->recon_physio_time_stamp_.get_number_of_bytes());
+                            }
+
+                            // ---------------------------
+                            // copy the second set of recon complexIm
+                            // ---------------------------
+                            GADGET_CHECK_RETURN_FALSE(this->copyReconResultsSecond(dim5, dim6, dim7, dim8, dim9));
+
+                            if ( workOrder_->complexIm_second_.get_number_of_elements()>0 )
+                            {
+                                has_second_res = true;
+                            }
+
+                            if ( workOrder_->recon_time_stamp_second_.get_number_of_elements()>0 )
+                            {
+                                has_recon_time_stamp_second = true;
+                            }
+
+                            if ( workOrder_->recon_physio_time_stamp_second_.get_number_of_elements()>0 )
+                            {
+                                has_recon_physio_time_stamp_second = true;
+                            }
+
+                            // ---------------------------
+                            // copy the gfactor
+                            // ---------------------------
+                            GADGET_CHECK_RETURN_FALSE(this->copyGFactor(dim5, dim6, dim7, dim8, dim9, gfactor_needed));
+
+                            // ---------------------------
+                            // copy the wrap-around map
+                            // ---------------------------
+                            GADGET_CHECK_RETURN_FALSE(this->copyWrapAroundMap(dim5, dim6, dim7, dim8, dim9, wrap_around_map_needed));
+
+                            // if not sharing across work orders
+                            if ( !shareAcrossWorkOrders && !workOrder_->workFlow_use_BufferedKernel_ && !workOrder_->workFlow_BufferKernel_ )
+                            {
+                                GADGET_CHECK_RETURN_FALSE(workOrder_->reset());
+                            }
+                        }
+
+                        // in the outer dimensions, the work order is always reset
+                        if ( !workOrder_->workFlow_use_BufferedKernel_ && !workOrder_->workFlow_BufferKernel_ )
+                        {
+                            GADGET_CHECK_RETURN_FALSE(workOrder_->reset());
+                        }
+                    }
+                }
+            }
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res_, debugFolder_+"res_afterunwrapping"); }
+
+        if ( has_second_res )
+        {
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res_second_, debugFolder_+"res_second_afterunwrapping"); }
+        }
+
+        if ( has_recon_time_stamp )
+        {
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArray(res_time_stamp_, debugFolder_+"res_time_stamp"); }
+        }
+
+        if ( has_recon_physio_time_stamp )
+        {
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArray(res_physio_time_stamp_, debugFolder_+"res_physio_time_stamp"); }
+        }
+
+        if ( has_recon_time_stamp_second )
+        {
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArray(res_time_stamp_second_, debugFolder_+"res_time_stamp_second"); }
+        }
+
+        if ( has_recon_physio_time_stamp_second )
+        {
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArray(res_physio_time_stamp_second_, debugFolder_+"res_physio_time_stamp_second"); }
+        }
+
+        if ( gfactor_needed )
+        {
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(gfactor_, debugFolder_+"gfactor_afterunwrapping"); }
+        }
+
+        if ( wrap_around_map_needed )
+        {
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(wrap_around_map_, debugFolder_+"wrap_around_map_afterunwrapping"); }
+        }
+
+        // permute the res_ to the correct dimension order
+        if (   ( (res_.get_number_of_elements()>dimResSize[0]*dimResSize[1]) && (dims[2]!=DIM_Channel) ) 
+            || ( (res_.get_number_of_elements()>dimResSize[0]*dimResSize[1]*dimResSize[2])             ) )
+        {
+            std::vector<size_t> order;
+            GADGET_CHECK_RETURN_FALSE(this->findISMRMRDPermuteOrder(dimsRes, dimsRes_, order));
+
+            GADGET_CHECK_RETURN_FALSE(this->permuteArrayOrder(res_, order));
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res_, debugFolder_+"res_afterPermute"); }
+
+            if ( has_recon_time_stamp )
+            {
+                GADGET_CHECK_RETURN_FALSE(this->permuteArrayOrder(res_time_stamp_, order));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArray(res_time_stamp_, debugFolder_+"res_time_stamp_afterPermute"); }
+            }
+
+            if ( has_recon_physio_time_stamp )
+            {
+                GADGET_CHECK_RETURN_FALSE(this->permuteArrayOrder(res_physio_time_stamp_, order));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArray(res_physio_time_stamp_, debugFolder_+"res_physio_time_stamp_afterPermute"); }
+            }
+
+            if ( gfactor_needed )
+            {
+                GADGET_CHECK_RETURN_FALSE(this->permuteArrayOrder(gfactor_, order));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(gfactor_, debugFolder_+"gfactor_afterPermute"); }
+            }
+
+            if ( wrap_around_map_needed )
+            {
+                GADGET_CHECK_RETURN_FALSE(this->permuteArrayOrder(wrap_around_map_, order));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(wrap_around_map_, debugFolder_+"wrap_around_map_afterPermute"); }
+            }
+        }
+
+        if ( has_second_res )
+        {
+            if (   ( (res_second_.get_number_of_elements()>dimResSize[0]*dimResSize[1]) && (dims[2]!=DIM_Channel) ) 
+                || ( (res_second_.get_number_of_elements()>dimResSize[0]*dimResSize[1]*dimResSize[2])             ) )
+            {
+                std::vector<size_t> order;
+                GADGET_CHECK_RETURN_FALSE(this->findISMRMRDPermuteOrder(dimsRes, dimsRes_, order));
+
+                GADGET_CHECK_RETURN_FALSE(this->permuteArrayOrder(res_second_, order));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res_second_, debugFolder_+"res_second_afterPermute"); }
+
+                if ( has_recon_time_stamp_second )
+                {
+                    GADGET_CHECK_RETURN_FALSE(this->permuteArrayOrder(res_time_stamp_second_, order));
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArray(res_time_stamp_second_, debugFolder_+"res_time_stamp_second_afterPermute"); }
+                }
+
+                if ( has_recon_physio_time_stamp_second )
+                {
+                    GADGET_CHECK_RETURN_FALSE(this->permuteArrayOrder(res_physio_time_stamp_second_, order));
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArray(res_physio_time_stamp_second_, debugFolder_+"res_physio_time_stamp_second_afterPermute"); }
+                }
+            }
+        }
+        else
+        {
+            res_second_.clear();
+        }
+
+        if ( !has_recon_time_stamp )
+        {
+            res_time_stamp_.clear();
+        }
+
+        if ( !has_recon_physio_time_stamp )
+        {
+            res_physio_time_stamp_.clear();
+        }
+
+        if ( !has_recon_time_stamp_second )
+        {
+            res_time_stamp_second_.clear();
+        }
+
+        if ( !has_recon_physio_time_stamp_second )
+        {
+            res_physio_time_stamp_second_.clear();
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconWorkFlowCartesian<T>::configureWorkOrder(const std::vector<ISMRMRDDIM>& dims) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlowCartesian<T>::
+copyReconResultsSecond(size_t dim5, size_t dim6, size_t dim7, size_t dim8, size_t dim9)
+{
+    try
+    {
+
+        if ( workOrder_->complexIm_second_.get_number_of_elements()>0 )
+        {
+            std::vector<size_t> indRes(10);
+
+            size_t RO = workOrder_->complexIm_second_.get_size(0);
+            size_t E1 = workOrder_->complexIm_second_.get_size(1);
+            size_t N = workOrder_->complexIm_second_.get_size(2);
+            size_t S = workOrder_->complexIm_second_.get_size(3);
+
+            std::vector<size_t> dims;
+
+            bool hasTimeStamp = false;
+            if ( workOrder_->recon_time_stamp_second_.get_number_of_elements()>0 )
+            {
+                hasTimeStamp = true;
+
+                res_time_stamp_second_.get_dimensions(dims);
+                if ( dims[3] != N ) dims[3] = N;
+                if ( dims[4] != S ) dims[4] = S;
+
+                res_time_stamp_second_.create(dims);
+                Gadgetron::clear(res_time_stamp_second_);
+            }
+
+            bool hasPhysioTimeStamp = false;
+            if ( workOrder_->recon_physio_time_stamp_second_.get_number_of_elements()>0 )
+            {
+                hasPhysioTimeStamp = true;
+
+                res_physio_time_stamp_second_.get_dimensions(dims);
+                if ( dims[3] != N ) dims[3] = N;
+                if ( dims[4] != S ) dims[4] = S;
+
+                res_physio_time_stamp_second_.create(dims);
+                Gadgetron::clear(res_physio_time_stamp_second_);
+            }
+
+            res_second_.get_dimensions(dims);
+            if ( dims[3] != N ) dims[3] = N;
+            if ( dims[4] != S ) dims[4] = S;
+
+            res_second_.create(dims);
+            Gadgetron::clear(res_second_);
+
+            size_t n, s;
+            for ( s=0; s<S; s++ )
+            {
+                for ( n=0; n<N; n++ )
+                {
+                    indRes[0] = 0;
+                    indRes[1] = 0;
+                    indRes[2] = 0;
+                    indRes[3] = n;
+                    indRes[4] = s;
+                    indRes[5] = dim5;
+                    indRes[6] = dim6;
+                    indRes[7] = dim7;
+                    indRes[8] = dim8;
+                    indRes[9] = dim9;
+
+                    size_t offset = res_second_.calculate_offset(indRes);
+                    memcpy(res_second_.begin()+offset, workOrder_->complexIm_second_.begin()+n*RO*E1+s*RO*E1*N, sizeof(T)*RO*E1);
+
+                    if ( hasTimeStamp )
+                    {
+                        offset = res_time_stamp_second_.calculate_offset(indRes);
+                        res_time_stamp_second_(offset) = workOrder_->recon_time_stamp_second_(0, 0, 0, n, s);
+                    }
+
+                    if ( hasPhysioTimeStamp )
+                    {
+                        offset = res_physio_time_stamp_second_.calculate_offset(indRes);
+                        res_physio_time_stamp_second_(offset) = workOrder_->recon_physio_time_stamp_second_(0, 0, 0, n, s);
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconWorkFlowCartesian<T>::copyReconResultsSecond() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlowCartesian<T>::
+copyGFactor(size_t dim5, size_t dim6, size_t dim7, size_t dim8, size_t dim9, bool gfactor_needed)
+{
+    try
+    {
+        if ( gfactor_needed && (workOrder_->gfactor_.get_size(0)==res_.get_size(0)) && (workOrder_->gfactor_.get_size(1) == res_.get_size(1)) )
+        {
+            size_t RO = gfactor_.get_size(0);
+            size_t E1 = gfactor_.get_size(1);
+            size_t N = gfactor_.get_size(3);
+            size_t S = gfactor_.get_size(4);
+
+            size_t gfactor_N = workOrder_->gfactor_.get_size(2);
+            size_t gfactor_S = workOrder_->gfactor_.get_size(3);
+
+            if (!debugFolder_.empty()) { gt_exporter_.exportArrayComplex(workOrder_->gfactor_, debugFolder_ + "workOrder_gfactor_afterunwrapping"); }
+
+            std::vector<size_t> indRes(10);
+            indRes[0] = 0;
+            indRes[1] = 0;
+            indRes[2] = 0;
+            indRes[3] = 0;
+            indRes[4] = 0;
+            indRes[5] = dim5;
+            indRes[6] = dim6;
+            indRes[7] = dim7;
+            indRes[8] = dim8;
+            indRes[9] = dim9;
+
+            if ( (gfactor_N == N) && (gfactor_S == S) )
+            {
+                size_t offset = gfactor_.calculate_offset(indRes);
+                memcpy(gfactor_.begin()+offset, workOrder_->gfactor_.begin(), workOrder_->gfactor_.get_number_of_bytes());
+            }
+            else
+            {
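+                // the work order holds fewer gfactor frames along N/S than the result array;
+                // copy frame by frame, reusing the last available gfactor frame for the remaining n/s indices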
+                std::vector<size_t> indGfactor(9);
+                indGfactor[0] = 0;
+                indGfactor[1] = 0;
+                indGfactor[2] = 0;
+                indGfactor[3] = 0;
+                indGfactor[4] = dim5;
+                indGfactor[5] = dim6;
+                indGfactor[6] = dim7;
+                indGfactor[7] = dim8;
+                indGfactor[8] = dim9;
+
+                size_t n, s;
+                for ( s=0; s<S; s++ )
+                {
+                    for ( n=0; n<N; n++ )
+                    {
+                        indRes[3] = n;
+                        indRes[4] = s;
+                        size_t offset = gfactor_.calculate_offset(indRes);
+
+                        if ( n < gfactor_N )
+                        {
+                            indGfactor[2] = n;
+                        }
+                        else
+                        {
+                            indGfactor[2] = gfactor_N-1;
+                        }
+
+                        if ( s < gfactor_S )
+                        {
+                            indGfactor[3] = s;
+                        }
+                        else
+                        {
+                            indGfactor[3] = gfactor_S-1;
+                        }
+
+                        size_t offset2 = workOrder_->gfactor_.calculate_offset(indGfactor);
+
+                        memcpy(gfactor_.begin()+offset, workOrder_->gfactor_.begin()+offset2, sizeof(T)*RO*E1);
+                    }
+                }
+            }
+
+            if (!debugFolder_.empty()) { gt_exporter_.exportArrayComplex(gfactor_, debugFolder_ + "gfactor_after_copyGFactor"); }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconWorkFlowCartesian<T>::copyGFactor() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlowCartesian<T>::
+copyWrapAroundMap(size_t dim5, size_t dim6, size_t dim7, size_t dim8, size_t dim9, bool wrap_around_map_needed)
+{
+    try
+    {
+        if ( wrap_around_map_needed && (workOrder_->wrap_around_map_.get_size(0)==res_.get_size(0)) && (workOrder_->wrap_around_map_.get_size(1) == res_.get_size(1)) )
+        {
+            size_t RO = wrap_around_map_.get_size(0);
+            size_t E1 = wrap_around_map_.get_size(1);
+            size_t N = wrap_around_map_.get_size(3);
+            size_t S = wrap_around_map_.get_size(4);
+
+            size_t wrap_around_map_CHA = workOrder_->wrap_around_map_.get_size(2);
+            size_t wrap_around_map_N = workOrder_->wrap_around_map_.get_size(3);
+            size_t wrap_around_map_S = workOrder_->wrap_around_map_.get_size(4);
+
+            std::vector<size_t> indRes(10);
+            size_t offset;
+
+            indRes[0] = 0;
+            indRes[1] = 0;
+            indRes[2] = 0;
+            indRes[3] = 0;
+            indRes[4] = 0;
+            indRes[5] = dim5;
+            indRes[6] = dim6;
+            indRes[7] = dim7;
+            indRes[8] = dim8;
+            indRes[9] = dim9;
+
+            if ( (wrap_around_map_N == N) && (wrap_around_map_S == S) )
+            {
+                offset = wrap_around_map_.calculate_offset(indRes);
+                memcpy(wrap_around_map_.begin()+offset, workOrder_->wrap_around_map_.begin(), workOrder_->wrap_around_map_.get_number_of_bytes());
+            }
+            else
+            {
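+                // as for the gfactor, copy frame by frame and reuse the last available wrap-around
+                // map frame when the work order holds fewer N/S frames than the result array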
+                std::vector<size_t> indWrapAroundMap(10);
+                indWrapAroundMap[0] = 0;
+                indWrapAroundMap[1] = 0;
+                indWrapAroundMap[2] = 0;
+                indWrapAroundMap[3] = 0;
+                indWrapAroundMap[4] = 0;
+                indWrapAroundMap[5] = dim5;
+                indWrapAroundMap[6] = dim6;
+                indWrapAroundMap[7] = dim7;
+                indWrapAroundMap[8] = dim8;
+                indWrapAroundMap[9] = dim9;
+
+                size_t n, s;
+                for ( s=0; s<S; s++ )
+                {
+                    for ( n=0; n<N; n++ )
+                    {
+                        indRes[3] = n;
+                        indRes[4] = s;
+                        offset = wrap_around_map_.calculate_offset(indRes);
+
+                        if ( n < wrap_around_map_N )
+                        {
+                            indWrapAroundMap[3] = n;
+                        }
+                        else
+                        {
+                            indWrapAroundMap[3] = wrap_around_map_N-1;
+                        }
+
+                        if ( s < wrap_around_map_S )
+                        {
+                            indWrapAroundMap[4] = s;
+                        }
+                        else
+                        {
+                            indWrapAroundMap[4] = wrap_around_map_S-1;
+                        }
+
+                        size_t offset2 = workOrder_->wrap_around_map_.calculate_offset(indWrapAroundMap);
+
+                        memcpy(wrap_around_map_.begin()+offset, workOrder_->wrap_around_map_.begin()+offset2, sizeof(T)*RO*E1*wrap_around_map_CHA);
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconWorkFlowCartesian<T>::copyWrapAroundMap() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlowCartesian2DT.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlowCartesian2DT.h
new file mode 100644
index 0000000..4fcd11a
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlowCartesian2DT.h
@@ -0,0 +1,292 @@
+/** \file   gtPlusISMRMRDReconWorkFlowCartesian2DT.h
+    \brief  Define the base class for the GtPlus 2DT reconstruction workflow for cartesian sampling
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusISMRMRDReconWorkFlowCartesian.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusISMRMRDReconWorkFlowCartesian2DT : public gtPlusISMRMRDReconWorkFlowCartesian<T>
+{
+public:
+
+    typedef gtPlusISMRMRDReconWorkFlowCartesian<T> BaseClass;
+    typedef typename BaseClass::DimensionRecordType DimensionRecordType;
+    typedef typename BaseClass::real_value_type real_value_type;
+
+    gtPlusISMRMRDReconWorkFlowCartesian2DT();
+    virtual ~gtPlusISMRMRDReconWorkFlowCartesian2DT();
+
+    void printInfo(std::ostream& os);
+
+    virtual bool recon();
+
+    virtual bool predictDimensions();
+
+    using BaseClass::data_;
+    using BaseClass::time_stamp_;
+    using BaseClass::physio_time_stamp_;
+    using BaseClass::ref_;
+    using BaseClass::noise_;
+    using BaseClass::noiseBW_;
+    using BaseClass::receriverBWRatio_;
+    using BaseClass::overSamplingRatioRO_;
+    using BaseClass::reconSizeRO_;
+    using BaseClass::reconSizeE1_;
+    using BaseClass::reconSizeE2_;
+    using BaseClass::encodingFOV_RO_;
+    using BaseClass::encodingFOV_E1_;
+    using BaseClass::encodingFOV_E2_;
+    using BaseClass::reconFOV_RO_;
+    using BaseClass::reconFOV_E1_;
+    using BaseClass::reconFOV_E2_;
+    using BaseClass::res_;
+    using BaseClass::res_second_;
+    using BaseClass::res_time_stamp_;
+    using BaseClass::res_physio_time_stamp_;
+    using BaseClass::res_time_stamp_second_;
+    using BaseClass::res_physio_time_stamp_second_;
+
+    using BaseClass::worker_;
+    using BaseClass::workOrder_;
+
+    using BaseClass::dimsRes_;
+
+    using BaseClass::dataDimStartingIndexes_;
+
+    using BaseClass::WorkOrderShareDim_;
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+
+    using BaseClass::ref_remove_oversampling_RO_;
+    using BaseClass::ref_apply_noisePreWhitening_;
+
+    // the workOrder2D needs 5 dimensions [RO E1 CHA N S]
+    ISMRMRDDIM dim4th_;
+    ISMRMRDDIM dim5th_;
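+    // dim4th_ and dim5th_ name the data dimensions mapped to N and S; they are either set
+    // by the caller or derived in predictDimensions()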
+
+protected:
+
+    using BaseClass::dataCurr_;
+    using BaseClass::refCurr_;
+
+    using BaseClass::RO_;
+    using BaseClass::E1_;
+    using BaseClass::CHA_;
+    using BaseClass::SLC_;
+    using BaseClass::E2_;
+    using BaseClass::CON_;
+    using BaseClass::PHS_;
+    using BaseClass::REP_;
+    using BaseClass::SET_;
+    using BaseClass::SEG_;
+    using BaseClass::AVE_;
+
+    using BaseClass::RO_ref_;
+    using BaseClass::E1_ref_;
+    using BaseClass::CHA_ref_;
+    using BaseClass::SLC_ref_;
+    using BaseClass::E2_ref_;
+    using BaseClass::CON_ref_;
+    using BaseClass::PHS_ref_;
+    using BaseClass::REP_ref_;
+    using BaseClass::SET_ref_;
+    using BaseClass::SEG_ref_;
+    using BaseClass::AVE_ref_;
+
+    using BaseClass::gtPlus_util_;
+};
+
+template <typename T> 
+gtPlusISMRMRDReconWorkFlowCartesian2DT<T>::
+gtPlusISMRMRDReconWorkFlowCartesian2DT() : BaseClass(), dim4th_(DIM_NONE), dim5th_(DIM_NONE)
+{
+}
+
+template <typename T> 
+gtPlusISMRMRDReconWorkFlowCartesian2DT<T>::~gtPlusISMRMRDReconWorkFlowCartesian2DT() 
+{
+}
+
+template <typename T> 
+void gtPlusISMRMRDReconWorkFlowCartesian2DT<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD Recon workflow Cartesian 2D/2DT -------------" << endl;
+    os << "Implementation of general reconstruction workflow for cartesian sampling of 2D and 2D+T use cases" << endl;
+    os << "The workOrder needs 5 dimensions [RO E1 CHA N S]" << endl;
+    os << "---------------------------------------------------------------------------" << endl;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlowCartesian2DT<T>::predictDimensions()
+{
+    // if interleaved mode
+    if ( workOrder_->CalibMode_ == ISMRMRD_interleaved )
+    {
+        if ( workOrder_->InterleaveDim_ == DIM_Phase )
+        {
+            dim4th_ = DIM_Phase;
+        }
+
+        if ( workOrder_->InterleaveDim_ == DIM_Repetition )
+        {
+            dim4th_ = DIM_Repetition;
+        }
+
+        if ( workOrder_->InterleaveDim_ == DIM_Contrast )
+        {
+            dim4th_ = DIM_Contrast;
+        }
+
+        if ( CON_.second==1 && SET_.second==1 )
+        {
+            dim5th_ = DIM_Slice;
+        }
+
+        if ( CON_.second>1 && SET_.second==1 )
+        {
+            dim5th_ = DIM_Contrast;
+        }
+
+        if ( CON_.second==1 && SET_.second>1 )
+        {
+            dim5th_ = DIM_Set;
+        }
+
+        if ( CON_.second>1 && SET_.second>1 )
+        {
+            dim5th_ = DIM_Contrast;
+            WorkOrderShareDim_ = DIM_Set;
+        }
+    }
+    else if ( (workOrder_->CalibMode_ == ISMRMRD_embedded) 
+        || (workOrder_->CalibMode_ == ISMRMRD_separate) 
+        || (workOrder_->CalibMode_ == ISMRMRD_noacceleration) ) 
+    {
+            std::vector<DimensionRecordType> dimSizes(4);
+            dimSizes[0] = CON_;
+            dimSizes[1] = PHS_;
+            dimSizes[2] = REP_;
+            dimSizes[3] = SET_;
+
+            std::sort(dimSizes.begin(), dimSizes.end(), DimensionRecordCompare() );
+
+            dim4th_ = dimSizes[0].first;
+            dim5th_ = dimSizes[1].first;
+
+            if ( dimSizes[2].second > 1 )
+            {
+                WorkOrderShareDim_ = dimSizes[2].first;
+            }
+
+            if ( dimSizes[1].second==1 && dimSizes[2].second==1 && dimSizes[3].second==1 )
+            {
+                dim5th_ = DIM_Slice;
+            }
+    }
+
+    if ( dim4th_==DIM_NONE || dim5th_==DIM_NONE )
+    {
+        GERROR_STREAM("gtPlusISMRMRDReconWorkFlowCartesian2DT<T>::predictDimensions() : cannot find 4th and 5th dimensions ... ");
+        return false;
+    }
+
+    workOrder_->enforceConsistency(dim5th_);
+
+    GDEBUG_CONDITION_STREAM(true, "predictDimensions - dim4th : " << gtPlus_util_.getISMRMRDDimName(dim4th_) );
+    GDEBUG_CONDITION_STREAM(true, "predictDimensions - dim5th : " << gtPlus_util_.getISMRMRDDimName(dim5th_) );
+    GDEBUG_CONDITION_STREAM(true, "predictDimensions - WorkOrderShareDim : " << gtPlus_util_.getISMRMRDDimName(WorkOrderShareDim_) );
+
+    return true;
+}
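+
+// Illustrative trace of predictDimensions() above (not additional logic): for an interleaved
+// cine acquisition with InterleaveDim_ == DIM_Phase and a single contrast and set
+// (CON_.second==1, SET_.second==1), the method picks dim4th_ = DIM_Phase and
+// dim5th_ = DIM_Slice, so the work order is organized as [RO E1 CHA PHS SLC].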
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlowCartesian2DT<T>::recon()
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data_!=NULL);
+        GADGET_CHECK_RETURN_FALSE(worker_!=NULL);
+
+        if ( dim4th_==DIM_NONE || dim5th_==DIM_NONE )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->predictDimensions());
+        }
+
+        GADGET_CHECK_RETURN_FALSE(WorkOrderShareDim_!=DIM_ReadOut);
+        GADGET_CHECK_RETURN_FALSE(WorkOrderShareDim_!=DIM_Encoding1);
+        GADGET_CHECK_RETURN_FALSE(WorkOrderShareDim_!=DIM_Channel);
+        GADGET_CHECK_RETURN_FALSE(WorkOrderShareDim_!=dim4th_);
+        GADGET_CHECK_RETURN_FALSE(WorkOrderShareDim_!=dim5th_);
+
+        // find recon dimensions
+        std::vector<ISMRMRDDIM> dims;
+        dims.push_back(DIM_ReadOut);
+        dims.push_back(DIM_Encoding1);
+        dims.push_back(DIM_Channel);
+        dims.push_back(dim4th_);
+        dims.push_back(dim5th_);
+
+        // ISMRMRDDIM dim;
+        int dim;
+        size_t dd;
+
+        int indWorkOrderSharingDim = -1;
+        for ( dim=DIM_Slice; dim<=DIM_Average; dim++ )
+        {
+            if ( dim == DIM_Segment )
+            {
+                continue;
+            }
+
+            bool exist = false;
+            for ( dd=0; dd<dims.size(); dd++ )
+            {
+                if ( dims[dd] == (ISMRMRDDIM)dim )
+                {
+                    exist = true;
+                    break;
+                }
+            }
+
+            if ( !exist )
+            {
+                dims.push_back((ISMRMRDDIM)dim);
+
+                if ( dim == WorkOrderShareDim_ )
+                {
+                    indWorkOrderSharingDim = (int)(dims.size()-1);
+                }
+            }
+        }
+
+        if ( (indWorkOrderSharingDim!=-1) && (indWorkOrderSharingDim > 5) )
+        {
+            ISMRMRDDIM dim6th = dims[5];
+            dims[5] = WorkOrderShareDim_;
+            dims[indWorkOrderSharingDim] = dim6th;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(this->configureWorkOrder(dims));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconWorkFlowCartesian2DT<T>::recon() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlowCartesian3DT.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlowCartesian3DT.h
new file mode 100644
index 0000000..656f242
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlowCartesian3DT.h
@@ -0,0 +1,262 @@
+/** \file   gtPlusISMRMRDReconWorkFlowCartesian3DT.h
+    \brief  Define the base class for the GtPlus 3DT reconstruction workflow for cartesian sampling
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusISMRMRDReconWorkFlowCartesian.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusISMRMRDReconWorkFlowCartesian3DT : public gtPlusISMRMRDReconWorkFlowCartesian<T>
+{
+public:
+
+    typedef gtPlusISMRMRDReconWorkFlowCartesian<T> BaseClass;
+    typedef typename BaseClass::DimensionRecordType DimensionRecordType;
+    typedef typename BaseClass::real_value_type real_value_type;
+
+    gtPlusISMRMRDReconWorkFlowCartesian3DT();
+    virtual ~gtPlusISMRMRDReconWorkFlowCartesian3DT();
+
+    void printInfo(std::ostream& os);
+
+    virtual bool recon();
+
+    virtual bool predictDimensions();
+
+    using BaseClass::data_;
+    using BaseClass::time_stamp_;
+    using BaseClass::physio_time_stamp_;
+    using BaseClass::ref_;
+    using BaseClass::noise_;
+    using BaseClass::noiseBW_;
+    using BaseClass::receriverBWRatio_;
+    using BaseClass::overSamplingRatioRO_;
+    using BaseClass::reconSizeRO_;
+    using BaseClass::reconSizeE1_;
+    using BaseClass::reconSizeE2_;
+    using BaseClass::encodingFOV_RO_;
+    using BaseClass::encodingFOV_E1_;
+    using BaseClass::encodingFOV_E2_;
+    using BaseClass::reconFOV_RO_;
+    using BaseClass::reconFOV_E1_;
+    using BaseClass::reconFOV_E2_;
+    using BaseClass::res_;
+    using BaseClass::res_second_;
+    using BaseClass::res_time_stamp_;
+    using BaseClass::res_physio_time_stamp_;
+    using BaseClass::res_time_stamp_second_;
+    using BaseClass::res_physio_time_stamp_second_;
+
+    using BaseClass::worker_;
+    using BaseClass::workOrder_;
+
+    using BaseClass::dataDimStartingIndexes_;
+
+    using BaseClass::WorkOrderShareDim_;
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+
+    using BaseClass::ref_remove_oversampling_RO_;
+    using BaseClass::ref_apply_noisePreWhitening_;
+
+    // the workOrder3DT needs 5 dimensions [RO E1 E2 CHA S]
+    ISMRMRDDIM dim5th_;
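+    // dim5th_ names the data dimension mapped to S; it is either set by the caller or
+    // derived in predictDimensions()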
+
+protected:
+
+    using BaseClass::dataCurr_;
+    using BaseClass::refCurr_;
+
+    using BaseClass::RO_;
+    using BaseClass::E1_;
+    using BaseClass::CHA_;
+    using BaseClass::SLC_;
+    using BaseClass::E2_;
+    using BaseClass::CON_;
+    using BaseClass::PHS_;
+    using BaseClass::REP_;
+    using BaseClass::SET_;
+    using BaseClass::SEG_;
+    using BaseClass::AVE_;
+
+    using BaseClass::RO_ref_;
+    using BaseClass::E1_ref_;
+    using BaseClass::CHA_ref_;
+    using BaseClass::SLC_ref_;
+    using BaseClass::E2_ref_;
+    using BaseClass::CON_ref_;
+    using BaseClass::PHS_ref_;
+    using BaseClass::REP_ref_;
+    using BaseClass::SET_ref_;
+    using BaseClass::SEG_ref_;
+    using BaseClass::AVE_ref_;
+
+    using BaseClass::gtPlus_util_;
+};
+
+template <typename T> 
+gtPlusISMRMRDReconWorkFlowCartesian3DT<T>::
+gtPlusISMRMRDReconWorkFlowCartesian3DT() : BaseClass(), dim5th_(DIM_NONE)
+{
+}
+
+template <typename T> 
+gtPlusISMRMRDReconWorkFlowCartesian3DT<T>::~gtPlusISMRMRDReconWorkFlowCartesian3DT() 
+{
+}
+
+template <typename T> 
+void gtPlusISMRMRDReconWorkFlowCartesian3DT<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD Recon workflow Cartesian 3D/3DT -------------" << endl;
+    os << "Implementation of general reconstruction workflow for cartesian sampling of 3D and 3D+T use cases" << endl;
+    os << "The workOrder needs 5 dimensions [RO E1 E2 CHA S]" << endl;
+    os << "---------------------------------------------------------------------------" << endl;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlowCartesian3DT<T>::predictDimensions()
+{
+    // if interleaved mode
+    if ( workOrder_->CalibMode_ == ISMRMRD_interleaved )
+    {
+        if ( workOrder_->InterleaveDim_ == DIM_Phase )
+        {
+            dim5th_ = DIM_Phase;
+        }
+
+        if ( workOrder_->InterleaveDim_ == DIM_Repetition )
+        {
+            dim5th_ = DIM_Repetition;
+        }
+
+        if ( workOrder_->InterleaveDim_ == DIM_Contrast )
+        {
+            dim5th_ = DIM_Contrast;
+        }
+    }
+    else if ( (workOrder_->CalibMode_ == ISMRMRD_embedded) 
+        || (workOrder_->CalibMode_ == ISMRMRD_separate)
+        || (workOrder_->CalibMode_ == ISMRMRD_noacceleration) ) 
+    {
+        if ( SLC_.second == 1 )
+        {
+            std::vector<DimensionRecordType> dimSizes(4);
+            dimSizes[0] = CON_;
+            dimSizes[1] = PHS_;
+            dimSizes[2] = REP_;
+            dimSizes[3] = SET_;
+
+            std::sort(dimSizes.begin(), dimSizes.end(), DimensionRecordCompare() );
+            dim5th_ = dimSizes[0].first;
+        }
+
+        if (SLC_.second > 1 )
+        {
+            dim5th_ = DIM_Slice; // multiple slab acquisition
+        }
+    }
+
+    if ( dim5th_==DIM_NONE )
+    {
+        GERROR_STREAM("gtPlusISMRMRDReconWorkFlowCartesian3DT<T>::predictDimensions() : cannot find 5th dimensions ... ");
+        return false;
+    }
+
+    workOrder_->enforceConsistency(dim5th_);
+
+    GDEBUG_CONDITION_STREAM(true, "predictDimensions - dim5th : " << gtPlus_util_.getISMRMRDDimName(dim5th_) );
+    GDEBUG_CONDITION_STREAM(true, "predictDimensions - WorkOrderShareDim : " << gtPlus_util_.getISMRMRDDimName(WorkOrderShareDim_) );
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlowCartesian3DT<T>::recon()
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data_!=NULL);
+        GADGET_CHECK_RETURN_FALSE(worker_!=NULL);
+
+        if ( dim5th_==DIM_NONE )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->predictDimensions());
+        }
+
+        GADGET_CHECK_RETURN_FALSE(WorkOrderShareDim_!=DIM_ReadOut);
+        GADGET_CHECK_RETURN_FALSE(WorkOrderShareDim_!=DIM_Encoding1);
+        GADGET_CHECK_RETURN_FALSE(WorkOrderShareDim_!=DIM_Encoding2);
+        GADGET_CHECK_RETURN_FALSE(WorkOrderShareDim_!=DIM_Channel);
+        GADGET_CHECK_RETURN_FALSE(WorkOrderShareDim_!=dim5th_);
+
+        // find recon dimensions
+        std::vector<ISMRMRDDIM> dims;
+        dims.push_back(DIM_ReadOut);
+        dims.push_back(DIM_Encoding1);
+        dims.push_back(DIM_Encoding2);
+        dims.push_back(DIM_Channel);
+        dims.push_back(dim5th_);
+
+        int dim;
+        size_t dd;
+
+        int indWorkOrderSharingDim = -1;
+        for ( dim=DIM_Slice; dim<=DIM_Average; dim++ )
+        {
+            if ( dim == DIM_Segment )
+            {
+                continue;
+            }
+
+            bool exist = false;
+            for ( dd=0; dd<dims.size(); dd++ )
+            {
+                if ( dims[dd] == (ISMRMRDDIM)dim )
+                {
+                    exist = true;
+                    break;
+                }
+            }
+
+            if ( !exist )
+            {
+                dims.push_back((ISMRMRDDIM)dim);
+
+                if ( dim == WorkOrderShareDim_ )
+                {
+                    indWorkOrderSharingDim = (int)(dims.size()-1);
+                }
+            }
+        }
+
+        if ( (indWorkOrderSharingDim!=-1) && (indWorkOrderSharingDim > 5) )
+        {
+            ISMRMRDDIM dim6th = dims[5];
+            dims[5] = WorkOrderShareDim_;
+            dims[indWorkOrderSharingDim] = dim6th;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(this->configureWorkOrder(dims));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusISMRMRDReconWorkFlowCartesian3DT<T>::recon() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkOrder.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkOrder.h
new file mode 100644
index 0000000..dcd1e08
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkOrder.h
@@ -0,0 +1,1273 @@
+/** \file   gtPlusISMRMRDReconWorkOrder.h
+    \brief  Define the GtPlus reconstruction workorder and parameters
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "ismrmrd/ismrmrd.h"
+#include "gtPlusISMRMRDReconUtil.h"
+
+// MACROS FOR PRINTING
+#define GADGET_PARA_PRINT(content) { GDEBUG_STREAM(#content << " is " << content); }
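+// e.g. GADGET_PARA_PRINT(acceFactorE1_) logs "acceFactorE1_ is <value>"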
+
+namespace Gadgetron { namespace gtPlus {
+
+#define MAX_MOCO_LEVEL 16
+
+struct gtPlusReconWorkOrderPara
+{
+    ISMRMRDCALIBMODE CalibMode_;
+    ISMRMRDDIM InterleaveDim_;
+
+    // acceleration factor along E1 and E2
+    double acceFactorE1_;
+    double acceFactorE2_;
+
+    // kspace center for RO/E1/E2
+    size_t kSpaceCenterRO_;
+    size_t kSpaceCenterEncode1_;
+    size_t kSpaceCenterEncode2_;
+
+    // kspace max acquired number for RO/E1/E2
+    size_t kSpaceMaxRO_;
+    size_t kSpaceMaxEncode1_;
+    size_t kSpaceMaxEncode2_;
+
+    // for asymmetric echo
+    // the sampled range for RO
+    // if <0, the full RO range is used
+    int start_RO_;
+    int end_RO_;
+
+    // sampled range for E1
+    int start_E1_;
+    int end_E1_;
+
+    // sampled range for E2
+    int start_E2_;
+    int end_E2_;
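+    // (illustrative, hypothetical values: an asymmetric echo sampling only the last 192 of
+    // 256 readout points would use start_RO_=64, end_RO_=255, while the E1/E2 ranges stay
+    // at -1 to keep the full ranges)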
+
+    // the work order interacts with the workflow through the two flags below
+    // if true, the workflow will buffer the kernel computed from this work order
+    bool workFlow_BufferKernel_;
+    // if true, the workflow will use its buffered kernel for this work order
+    bool workFlow_use_BufferedKernel_;
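+    // (the Cartesian workflow skips workOrder_->reset() while either flag is set, so a
+    // buffered kernel survives into the following work orders)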
+
+    // number of channels for the reconstruction results
+    // in most cases, it is 1
+    size_t num_channels_res_;
+
+    // -----------------------------------------------------------------------
+    // parameters
+    // -----------------------------------------------------------------------
+
+    // -------------------------------
+    // coil compression
+    // -------------------------------
+    bool upstream_coil_compression_;
+    double upstream_coil_compression_thres_;
+    int upstream_coil_compression_num_modesKept_;
+
+    bool downstream_coil_compression_;
+    double coil_compression_thres_;
+    int coil_compression_num_modesKept_;
+
+    // -------------------------------
+    // coil sensitivity estimation
+    // -------------------------------
+    Gadgetron::ISMRMRDCOILMAPALGO coil_map_algorithm_;
+
+    // for ISMRMRD_SOUHEIL
+    size_t csm_kSize_;
+    size_t csm_powermethod_num_;
+    // for 3D acquisition, whether to use the true 3D data correlation matrix
+    bool csm_true_3D_;
+
+    // for ISMRMRD_SOUHEIL_ITER
+    size_t csm_iter_num_;
+    double csm_iter_thres_;
+
+    // whether to use gpu for csm estimation
+    bool csm_use_gpu_;
+
+    // -------------------------------
+    // parameters for variant reconstruction algorithms
+    // -------------------------------
+    Gadgetron::ISMRMRDALGO recon_algorithm_;
+    bool recon_auto_parameters_;
+
+    bool gfactor_needed_;
+
+    bool wrap_around_map_needed_;
+
+    /// --------------
+    // grappa
+    /// --------------
+
+    size_t grappa_kSize_RO_;
+    size_t grappa_kSize_E1_;
+    size_t grappa_kSize_E2_;
+    double grappa_reg_lamda_;
+    double grappa_calib_over_determine_ratio_;
+    bool grappa_use_gpu_;
+
+    /// --------------
+    // SPIRiT
+    /// --------------
+    size_t spirit_kSize_RO_;
+    size_t spirit_kSize_E1_;
+    size_t spirit_kSize_E2_;
+
+    size_t spirit_oSize_RO_;
+    size_t spirit_oSize_E1_;
+    size_t spirit_oSize_E2_;
+
+    double spirit_reg_lamda_;
+    double spirit_calib_over_determine_ratio_;
+
+    bool spirit_solve_symmetric_;
+
+    size_t spirit_iter_max_;
+    double spirit_iter_thres_;
+    bool spirit_print_iter_;
+
+    bool spirit_use_gpu_;
+
+    /// --------------
+    // L1 SPIRiT
+    /// --------------
+    bool spirit_perform_linear_;
+    bool spirit_perform_grappa_linear_;
+    bool spirit_perform_nonlinear_;
+
+    double spirit_parallel_imaging_lamda_;
+    double spirit_image_reg_lamda_;
+    double spirit_data_fidelity_lamda_;
+
+    size_t spirit_ncg_iter_max_;
+    double spirit_ncg_iter_thres_;
+    bool spirit_ncg_print_iter_;
+    double spirit_ncg_scale_factor_;
+
+    size_t spirit_slep_iter_max_;
+    double spirit_slep_iter_thres_;
+    bool spirit_slep_print_iter_;
+    bool spirit_slep_keep_third_dimension_coeff_;
+    bool spirit_slep_keep_approx_coeff_;
+    double spirit_slep_scale_factor_;
+
+    bool spirit_use_coil_sen_map_;
+    bool spirit_use_moco_enhancement_;
+    bool spirit_recon_moco_images_;
+
+    bool spirit_2D_scale_per_chunk_;
+    bool spirit_3D_scale_per_chunk_;
+
+    double spirit_RO_enhancement_ratio_;
+    double spirit_E1_enhancement_ratio_;
+    double spirit_E2_enhancement_ratio_;
+    double spirit_temporal_enhancement_ratio_;
+
+    /// --------------
+    /// parameters for retro-gating
+    /// --------------
+    // number of retro-gated phases
+    // if 0, retro-gating is not prescribed
+    size_t retro_gated_images_;
+
+    // how many readout lines in each segment for retro-gating
+    size_t retro_gated_segment_size_;
+
+    // which method used for retro-gating
+    ISMRMRDINTERPRETROGATING retro_gated_interp_method_;
+
+    /// --------------
+    /// parameters for binning
+    /// --------------
+
+    // number of target cardiac phases
+    size_t kspace_binning_number_of_cardiac_phases_;
+
+    // minimal allowed cardiac phase width used for binning, in ms
+    // if the binned temporal window is smaller than this threshold,
+    // the binned window will be increased
+    // if <=0, then this value will not take effect
+    double kspace_binning_minimal_cardiac_phase_width_;
+
+    // whether to perform binning recon with multiple channel complex data
+    bool kspace_binning_multiple_channel_recon_;
+
+    // whether to perform non-linear recon
+    bool kspace_binning_iterative_non_linear_recon_;
+
+    // non-linear recon using slep optimizer
+    bool kspace_binning_iterative_non_linear_recon_slep_;
+
+    // whether to use the coil map when warping multiple-channel images
+    bool kspace_binning_multiple_channel_recon_with_coil_map_;
+
+    // whether to compute navigator signal
+    bool kspace_binning_compute_navigator_signal_;
+
+    // for navigator detection
+    size_t kspace_binning_navigator_moco_level_;
+    size_t kspace_binning_navigator_moco_iter_[MAX_MOCO_LEVEL];
+    double kspace_binning_navigator_hilbert_strength_;
+    double kspace_binning_navigator_dissimilarity_sigma_;
+    bool  kspace_binning_navigator_bidirectional_moco_;
+
+    // parameters for the moco in kspace binning
+    size_t kspace_binning_moco_level_;
+    size_t kspace_binning_moco_iter_[MAX_MOCO_LEVEL];
+    double kspace_binning_moco_hilbert_strength_;
+    double kspace_binning_moco_dissimilarity_sigma_;
+    bool  kspace_binning_bidirectional_moco_;
+
+    // whether to perform soft combination
+    bool kspace_binning_soft_combination_;
+
+    // navigator signal acceptance window
+    double kspace_binning_navigator_window_wide_;
+    double kspace_binning_navigator_window_narrow_;
+
+    // method for warping the complex images ("BSpline", "Linear")
+    ISMRMRDINTERP kspace_binning_method_warpping_;
+
+    // whether to exclude the last cardiac cycle for binning
+    bool kspace_binning_exclude_last_cardiac_cycle_;
+
+    // some blocks around central kspace must be filled
+    size_t kspace_binning_number_of_central_kspace_blocks_;
+
+    // maximal allowed temporal ratio window
+    double kspace_binning_max_temporal_window_;
+
+    // temporal ratio window used for binning
+    double kspace_binning_temporal_window_;
+
+    // interpolation method to generate best cardiac cycle ('Linear', 'Spline')
+    ISMRMRDINTERP kspace_binning_best_cardiac_cycle_interpolator_;
+
+    // recon using a certain length of data (if <=0, use all of the data), in seconds
+    double kspace_binning_data_length_used_for_recon_;
+
+    // fill kspace holes with nearest neighbors
+    bool kspace_binning_fill_kspace_with_neighbors_;
+
+    // for flow binning, whether the flow encoding is performed inside every E1
+    bool kspace_binning_flow_in_e1_;
+
+    // whether to jointly recon all flow encoding directions
+    // if false, every flow encoding direction will be reconstructed separately
+    bool kspace_binning_flow_recon_jointly_;
+
+    /// --------------
+    /// parameters for motion compensated recon
+    /// --------------
+    size_t motion_comp_num_of_PD_images_;
+
+    // -------------------------------
+    // job split
+    // -------------------------------
+    bool job_split_by_S_;
+    size_t job_num_of_N_;
+    size_t job_max_Megabytes_;
+    size_t job_overlap_;
+    // whether to perform computation on the control node
+    bool job_perform_on_control_node_;
+
+    // -------------------------------
+    // partial fourier handling
+    // -------------------------------
+    // partial fourier handling algorithms
+    ISMRMRDPFALGO partialFourier_algo_;
+
+    // homodyne filter
+    // number of iterations
+    size_t partialFourier_homodyne_iters_;
+    // threshold to stop the iteration
+    double partialFourier_homodyne_thres_;
+    // density compensation for homodyne filter results
+    bool partialFourier_homodyne_densityComp_;
+
+    // POCS
+    // number of iterations
+    size_t partialFourier_POCS_iters_;
+    // threshold to stop the iteration
+    double partialFourier_POCS_thres_;
+    // transition band width
+    size_t partialFourier_POCS_transitBand_;
+    // transition band width for E2
+    size_t partialFourier_POCS_transitBand_E2_;
+
+    // Feng Huang method
+    // kernel size
+    size_t partialFourier_FengHuang_kSize_RO_;
+    size_t partialFourier_FengHuang_kSize_E1_;
+    size_t partialFourier_FengHuang_kSize_E2_;
+    // threshold for kernel estimation
+    double partialFourier_FengHuang_thresReg_;
+    // same kernel for all N
+    bool partialFourier_FengHuang_sameKernel_allN_;
+    // transition band width
+    size_t partialFourier_FengHuang_transitBand_;
+    // transition band width for E2
+    size_t partialFourier_FengHuang_transitBand_E2_;
+
+    gtPlusReconWorkOrderPara()
+    {
+        CalibMode_ = ISMRMRD_noacceleration;
+        InterleaveDim_ = DIM_NONE;
+
+        acceFactorE1_ = 1;
+        acceFactorE2_ = 1;
+
+        kSpaceCenterRO_ = 0;
+        kSpaceCenterEncode1_ = 0;
+        kSpaceCenterEncode2_ = 0;
+
+        kSpaceMaxRO_ = 1;
+        kSpaceMaxEncode1_ = 1;
+        kSpaceMaxEncode2_ = 1;
+
+        start_RO_ = -1;
+        end_RO_ = -1;
+
+        start_E1_ = -1;
+        end_E1_ = -1;
+
+        start_E2_ = -1;
+        end_E2_ = -1;
+
+        workFlow_BufferKernel_ = false;
+        workFlow_use_BufferedKernel_ = false;
+
+        num_channels_res_ = 1;
+
+        // ----------------------------------------------
+
+        upstream_coil_compression_ = false;
+        upstream_coil_compression_thres_ = 1e-3;
+        upstream_coil_compression_num_modesKept_ = -1;
+
+        downstream_coil_compression_ = true;
+        coil_compression_thres_ = 1e-3;
+        coil_compression_num_modesKept_ = -1;
+
+        coil_map_algorithm_ = ISMRMRD_SOUHEIL;
+        csm_kSize_ = 7;
+        csm_powermethod_num_ = 3;
+        csm_true_3D_ = false;
+        csm_iter_num_ = 5;
+        csm_iter_thres_ = 1e-3;
+        csm_use_gpu_ = false;
+
+        // ----------------------------------------------
+
+        recon_algorithm_ = ISMRMRD_GRAPPA;
+        recon_auto_parameters_ = true;
+        gfactor_needed_ = false;
+        wrap_around_map_needed_ = false;
+
+        // ----------------------------------------------
+
+        grappa_kSize_RO_ = 5;
+        grappa_kSize_E1_ = 4;
+        grappa_kSize_E2_ = 4;
+        grappa_reg_lamda_ = 0.0005;
+        grappa_calib_over_determine_ratio_ = 0;
+        grappa_use_gpu_ = false;
+
+        // ----------------------------------------------
+
+        spirit_kSize_RO_ = 7;
+        spirit_kSize_E1_ = 7;
+        spirit_kSize_E2_ = 7;
+
+        spirit_oSize_RO_ = 1;
+        spirit_oSize_E1_ = 1;
+        spirit_oSize_E2_ = 1;
+
+        spirit_reg_lamda_ = 0.005;
+        spirit_calib_over_determine_ratio_ = 0;
+
+        spirit_use_gpu_ = false;
+
+        spirit_solve_symmetric_ = false;
+
+        spirit_iter_max_ = 70;
+        spirit_iter_thres_ = 1e-5;
+        spirit_print_iter_ = false;
+
+        // ----------------------------------------------
+
+        spirit_perform_linear_ = true;
+        spirit_perform_grappa_linear_ = false;
+        spirit_perform_nonlinear_ = true;
+
+        spirit_parallel_imaging_lamda_ = 1.0;
+        spirit_image_reg_lamda_ = 1e-3;
+        spirit_data_fidelity_lamda_ = 0;
+
+        spirit_ncg_iter_max_ = 10;
+        spirit_ncg_iter_thres_ = 1e-3;
+        spirit_ncg_print_iter_ = false;
+        spirit_ncg_scale_factor_ = -1.0;
+
+        spirit_slep_iter_max_ = 5;
+        spirit_slep_iter_thres_ = 1e-5;
+        spirit_slep_print_iter_ = false;
+        spirit_slep_keep_third_dimension_coeff_ = false;
+        spirit_slep_keep_approx_coeff_ = true;
+        spirit_slep_scale_factor_ = -1.0;
+
+        spirit_use_coil_sen_map_ = true;
+        spirit_use_moco_enhancement_ = false;
+        spirit_recon_moco_images_ = false;
+
+        spirit_RO_enhancement_ratio_ = 1;
+        spirit_E1_enhancement_ratio_ = 1;
+        spirit_E2_enhancement_ratio_ = 1;
+        spirit_temporal_enhancement_ratio_ = 1;
+
+        spirit_2D_scale_per_chunk_ = false;
+        spirit_3D_scale_per_chunk_ = true;
+
+        // ----------------------------------------------
+
+        retro_gated_images_ = 0;
+        retro_gated_segment_size_ = 0;
+        retro_gated_interp_method_ = ISMRMRD_INTERP_RETRO_GATING_BSPLINE;
+
+        // ----------------------------------------------
+
+        kspace_binning_number_of_cardiac_phases_ = 30;
+        kspace_binning_minimal_cardiac_phase_width_ = 33; // 33 ms, 30 phases for a heart rate of 60 bpm
+
+        kspace_binning_multiple_channel_recon_ = true;
+        kspace_binning_iterative_non_linear_recon_ = true;
+        kspace_binning_iterative_non_linear_recon_slep_ = true;
+        kspace_binning_multiple_channel_recon_with_coil_map_ = false;
+        kspace_binning_compute_navigator_signal_ = true;
+
+        kspace_binning_navigator_moco_level_ = 4;
+
+        size_t ii;
+        for ( ii=0; ii<MAX_MOCO_LEVEL; ii++ ) kspace_binning_navigator_moco_iter_[ii] = 0;
+        kspace_binning_navigator_moco_iter_[0] = 1;
+        kspace_binning_navigator_moco_iter_[1] = 100;
+        kspace_binning_navigator_moco_iter_[2] = 100;
+        kspace_binning_navigator_moco_iter_[3] = 100;
+
+        kspace_binning_navigator_hilbert_strength_ = 6.0;
+        kspace_binning_navigator_dissimilarity_sigma_ = 2.0;
+        kspace_binning_navigator_bidirectional_moco_ = false;
+
+        kspace_binning_moco_level_ = 5;
+        for ( ii=0; ii<MAX_MOCO_LEVEL; ii++ ) kspace_binning_moco_iter_[ii] = 0;
+        kspace_binning_moco_iter_[0] = 100;
+        kspace_binning_moco_iter_[1] = 100;
+        kspace_binning_moco_iter_[2] = 100;
+        kspace_binning_moco_iter_[3] = 100;
+        kspace_binning_moco_iter_[4] = 100;
+
+        kspace_binning_moco_hilbert_strength_ = 12.0;
+        kspace_binning_moco_dissimilarity_sigma_ = 2.0;
+        kspace_binning_bidirectional_moco_ = false;
+        kspace_binning_soft_combination_ = true;
+        kspace_binning_navigator_window_wide_ = 0.75;
+        kspace_binning_navigator_window_narrow_ = 0.5;
+        kspace_binning_method_warpping_ = ISMRMRD_INTERP_BSPLINE;
+        kspace_binning_exclude_last_cardiac_cycle_ = false;
+        kspace_binning_number_of_central_kspace_blocks_ = 0;
+        kspace_binning_max_temporal_window_ = 1.0;
+        kspace_binning_temporal_window_ = 4.0;
+        kspace_binning_best_cardiac_cycle_interpolator_= ISMRMRD_INTERP_SPLINE;
+        kspace_binning_data_length_used_for_recon_ = 0;
+        kspace_binning_fill_kspace_with_neighbors_ = false;
+        kspace_binning_flow_in_e1_ = true;
+        kspace_binning_flow_recon_jointly_ = true;
+
+        // ----------------------------------------------
+
+        motion_comp_num_of_PD_images_ = 0;
+
+        // ----------------------------------------------
+
+        job_split_by_S_ = false;
+        job_num_of_N_ = 0;
+        job_max_Megabytes_ = 20*1024;
+        job_overlap_ = 2;
+        job_perform_on_control_node_ = true;
+
+        // ----------------------------------------------
+
+        partialFourier_algo_ = ISMRMRD_PF_ZEROFILLING_FILTER;
+
+        partialFourier_homodyne_iters_ = 6;
+        partialFourier_homodyne_thres_ = 1e-2;
+        partialFourier_homodyne_densityComp_ = false;
+
+        partialFourier_POCS_iters_ = 6;
+        partialFourier_POCS_thres_ = 1e-2;
+        partialFourier_POCS_transitBand_ = 16;
+        partialFourier_POCS_transitBand_E2_ = 16;
+
+        partialFourier_FengHuang_kSize_RO_ = 5;
+        partialFourier_FengHuang_kSize_E1_ = 5;
+        partialFourier_FengHuang_kSize_E2_ = 5;
+        partialFourier_FengHuang_thresReg_ = 0.005;
+        partialFourier_FengHuang_sameKernel_allN_ = false;
+        partialFourier_FengHuang_transitBand_ = 16;
+        partialFourier_FengHuang_transitBand_E2_ = 16;
+    }
+
+    ~gtPlusReconWorkOrderPara() {}
+};
+
+template <typename T> 
+class gtPlusReconWorkOrder : public gtPlusReconWorkOrderPara
+{
+public:
+
+    typedef typename realType<T>::Type real_value_type;
+
+    gtPlusReconWorkOrder();
+    virtual ~gtPlusReconWorkOrder();
+
+    // reset the status of work order
+    // all computed calibration/coil sensitivity results
+    // are deleted
+    virtual bool reset();
+
+    // check and modify inconsistency in the work order
+    virtual bool enforceConsistency(ISMRMRDDIM& /*lastDim*/);
+
+    typedef std::pair<ISMRMRDDIM, size_t> DimensionRecordType;
+
+    // duplicate a workorder without copying the data arrays
+    virtual void duplicatePara(gtPlusReconWorkOrderPara& worder) const;
+    virtual void duplicate(gtPlusReconWorkOrder<T>& worder) const;
+
+    virtual void copyFromPara(const gtPlusReconWorkOrderPara& worder);
+
+    virtual void printInfo(std::ostream& os) const;
+    virtual void print(std::ostream& os) const;
+
+    // -------------------------------
+    // input
+    // -------------------------------
+    // kspace data
+    hoNDArray<T> data_;
+    // ref data
+    hoNDArray<T> ref_;
+
+    // noise data
+    hoNDArray<T> noise_;
+
+    // phase correction data
+    hoNDArray<T> phaseCorr_;
+
+    // other data
+    hoNDArray<T> other_;
+
+    // sometimes, the initial kspace can be provided
+    hoNDArray<T> kspace_initial_;
+
+    // acquisition time stamp in the unit of seconds for kspace data lines
+    // for the embedded mode, the time stamps of ref lines are also stored
+    hoNDArray<real_value_type> time_stamp_;
+
+    // physio time stamp in the unit of seconds for kspace data lines
+    // for the embedded mode, the physio time stamps of ref lines are also stored
+    hoNDArray<real_value_type> physio_time_stamp_;
+
+    // dimension starting indexes for the data_
+    std::vector< DimensionRecordType > dataDimStartingIndexes_;
+
+    // to support EPI and other trajectories
+    // if 1, the readout line was acquired in the reverse direction; otherwise, 0
+    hoNDArray<unsigned short> reflect_;
+    hoNDArray<unsigned short> reflect_ref_;
+    hoNDArray<unsigned short> reflect_phaseCorr_;
+    hoNDArray<unsigned short> reflect_other_;
+
+    // -------------------------------
+    // output
+    // -------------------------------
+    // reconstructed kspace
+    hoNDArray<T> fullkspace_;
+
+    // reconstructed images
+    hoNDArray<T> complexIm_;
+
+    // time stamp and physio time stamp for reconstructed images, in seconds
+    // if these fields are not set, the buffered image header will be used
+    hoNDArray<real_value_type> recon_time_stamp_;
+    hoNDArray<real_value_type> recon_physio_time_stamp_;
+
+    // extra reconstructed results
+    // some methods can generate more than one set of reconstruction results
+    hoNDArray<T> fullkspace_second_;
+    hoNDArray<T> complexIm_second_;
+    hoNDArray<real_value_type> recon_time_stamp_second_;
+    hoNDArray<real_value_type> recon_physio_time_stamp_second_;
+
+    // gfactor
+    hoNDArray<T> gfactor_;
+
+    // wrap-around eigenvalue map
+    hoNDArray<T> wrap_around_map_;
+
+    // -------------------------------
+    // buffers for computation
+    // -------------------------------
+    // ref for recon
+    hoNDArray<T> ref_recon_;
+    // ref for coil map
+    hoNDArray<T> ref_coil_map_;
+
+    // store the estimated kernel and the kernel in image domain
+    // if these fields are set before recon, they will be used
+    boost::shared_ptr< hoNDArray<T> > kernel_; // [RO E1 srcCHA dstCHA dstE1 1 or N S]
+    boost::shared_ptr< hoNDArray<T> > kernelIm_; // [RO E1 srcCHA dstCHA 1 or N S]
+    boost::shared_ptr< hoNDArray<T> > unmixingCoeffIm_; // [RO E1 srcCHA 1 or N S]
+    boost::shared_ptr< std::vector<hoMatrix<T> > > coilCompressionCoef_; // [dstCHA srcCHA] matrices
+    boost::shared_ptr< hoNDArray<T> > coilMap_; // [RO E1 dstCHA 1 or N S]
+
+    // -------------------------------
+    // kspace filter for RO/E1/E2 dimension, applied to the reconstruction results
+    // -------------------------------
+    // 1D filter for kspace data
+    hoNDArray<T> filterRO_;
+    hoNDArray<T> filterE1_;
+    hoNDArray<T> filterE2_;
+    // 2D and 3D filters; if set, they override the 1D filters
+    hoNDArray<T> filterROE1_;
+    hoNDArray<T> filterROE1E2_;
+
+    // -------------------------------
+    // kspace filter for RO/E1/E2 dimension, applied to the ref data for coil map estimation
+    // -------------------------------
+    // filter for ref data
+    hoNDArray<T> filterRO_ref_;
+    hoNDArray<T> filterE1_ref_;
+    hoNDArray<T> filterE2_ref_;
+
+    hoNDArray<T> filterROE1_ref_;
+    hoNDArray<T> filterROE1E2_ref_;
+
+    // -------------------------------
+    // kspace filter for RO/E1/E2 dimension, applied to the data edge in case of partial Fourier or asymmetric echo
+    // -------------------------------
+    // filter for partial Fourier/asymmetric echo
+    hoNDArray<T> filterRO_partialfourier_;
+    hoNDArray<T> filterE1_partialfourier_;
+    hoNDArray<T> filterE2_partialfourier_;
+
+    hoNDArray<T> filterROE1_partialfourier_;
+    hoNDArray<T> filterROE1E2_partialfourier_;
+
+    // -------------------------------
+    // parameters for cloud computing (a usage sketch follows this class declaration)
+    // -------------------------------
+    bool CloudComputing_;
+    unsigned int CloudSize_;
+
+    typedef boost::tuple<std::string, std::string, std::string, unsigned int> CloudNodeType;
+    typedef std::vector<CloudNodeType> CloudType;
+
+    CloudType gt_cloud_;
+};
+
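+// ---------------------------------------------------------------------------
+// Minimal usage sketch (editor's illustration; the variable names and cloud
+// node values below are hypothetical, only the members and methods declared
+// above are assumed):
+//
+//     typedef std::complex<float> ValueType;
+//     gtPlusReconWorkOrder<ValueType> wo;
+//     wo.data_ = acquired_kspace;      // [RO E1 CHA N S] kspace array
+//     wo.ref_  = calibration_data;     // reference lines for calibration
+//
+//     // cloud configuration; the tuple fields are assumed to be
+//     // (address, port, xml configuration, computing-power index)
+//     wo.CloudComputing_ = true;
+//     wo.CloudSize_      = 1;
+//     wo.gt_cloud_.push_back(gtPlusReconWorkOrder<ValueType>::CloudNodeType("192.168.1.10", "9002", "GT_Cloud.xml", 1));
+//
+//     gtPlusReconWorkOrder<ValueType> wo2;
+//     wo.duplicate(wo2);       // copies parameters, filters and cloud settings, but not the data arrays
+//     wo.duplicatePara(wo2);   // copies only the parameter subset
+// ---------------------------------------------------------------------------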
+template <typename T> 
+gtPlusReconWorkOrder<T>::gtPlusReconWorkOrder() : gtPlusReconWorkOrderPara()
+{
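+    // pre-allocate empty arrays so the shared buffer pointers are always valid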
+    hoNDArray<T>* tmp = new hoNDArray<T>();
+    kernel_ = boost::shared_ptr< hoNDArray<T> >(tmp);
+
+    tmp = new hoNDArray<T>();
+    kernelIm_ = boost::shared_ptr< hoNDArray<T> >(tmp);
+
+    tmp = new hoNDArray<T>();
+    unmixingCoeffIm_ = boost::shared_ptr< hoNDArray<T> >(tmp);
+
+    std::vector<hoMatrix<T> >* tmpCoilCoef = new std::vector<hoMatrix<T> >();
+    coilCompressionCoef_ = boost::shared_ptr< std::vector<hoMatrix<T> > >(tmpCoilCoef);
+
+    tmp = new hoNDArray<T>();
+    coilMap_ = boost::shared_ptr< hoNDArray<T> >(tmp);
+
+    CloudComputing_ = false;
+    CloudSize_ = 0;
+}
+
+template <typename T> 
+gtPlusReconWorkOrder<T>::~gtPlusReconWorkOrder()
+{
+}
+
+template <typename T> 
+bool gtPlusReconWorkOrder<T>::reset()
+{
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorkOrder<T>::enforceConsistency(ISMRMRDDIM& /*lastDim*/)
+{
+    return true;
+}
+
+template <typename T> 
+void gtPlusReconWorkOrder<T>::duplicatePara(gtPlusReconWorkOrderPara& worder) const
+{
+    worder.CalibMode_                                  = CalibMode_;
+    worder.InterleaveDim_                              = InterleaveDim_;
+
+    worder.acceFactorE1_                               = acceFactorE1_;
+    worder.acceFactorE2_                               = acceFactorE2_;
+
+    worder.kSpaceCenterRO_                             = kSpaceCenterRO_;
+    worder.kSpaceCenterEncode1_                        = kSpaceCenterEncode1_;
+    worder.kSpaceCenterEncode2_                        = kSpaceCenterEncode2_;
+
+    worder.kSpaceMaxRO_                                = kSpaceMaxRO_;
+    worder.kSpaceMaxEncode1_                           = kSpaceMaxEncode1_;
+    worder.kSpaceMaxEncode2_                           = kSpaceMaxEncode2_;
+
+    worder.workFlow_BufferKernel_                      = workFlow_BufferKernel_;
+    worder.workFlow_use_BufferedKernel_                = workFlow_use_BufferedKernel_;
+    worder.num_channels_res_                           = num_channels_res_;
+
+    worder.upstream_coil_compression_                  = upstream_coil_compression_;
+    worder.upstream_coil_compression_thres_            = upstream_coil_compression_thres_;
+    worder.upstream_coil_compression_num_modesKept_    = upstream_coil_compression_num_modesKept_;
+
+    worder.downstream_coil_compression_                = downstream_coil_compression_;
+    worder.coil_compression_thres_                     = coil_compression_thres_;
+    worder.coil_compression_num_modesKept_             = coil_compression_num_modesKept_;
+
+    worder.coil_map_algorithm_                         = coil_map_algorithm_;
+    worder.csm_kSize_                                  = csm_kSize_;
+    worder.csm_powermethod_num_                        = csm_powermethod_num_;
+    worder.csm_true_3D_                                = csm_true_3D_;
+    worder.csm_iter_num_                               = csm_iter_num_;
+    worder.csm_iter_thres_                             = csm_iter_thres_;
+    worder.csm_use_gpu_                                = csm_use_gpu_;
+
+    worder.start_RO_                                   = start_RO_;
+    worder.end_RO_                                     = end_RO_;
+
+    worder.start_E1_                                   = start_E1_;
+    worder.end_E1_                                     = end_E1_;
+
+    worder.start_E2_                                   = start_E2_;
+    worder.end_E2_                                     = end_E2_;
+
+    worder.recon_algorithm_                            = recon_algorithm_;
+    worder.recon_auto_parameters_                      = recon_auto_parameters_;
+    worder.gfactor_needed_                             = gfactor_needed_;
+    worder.wrap_around_map_needed_                     = wrap_around_map_needed_;
+
+    worder.grappa_kSize_RO_                            = grappa_kSize_RO_;
+    worder.grappa_kSize_E1_                            = grappa_kSize_E1_;
+    worder.grappa_kSize_E2_                            = grappa_kSize_E2_;
+    worder.grappa_reg_lamda_                           = grappa_reg_lamda_;
+    worder.grappa_calib_over_determine_ratio_          = grappa_calib_over_determine_ratio_;
+    worder.grappa_use_gpu_                             = grappa_use_gpu_;
+
+    worder.spirit_kSize_RO_                            = spirit_kSize_RO_;
+    worder.spirit_kSize_E1_                            = spirit_kSize_E1_;
+    worder.spirit_kSize_E2_                            = spirit_kSize_E2_;
+    worder.spirit_oSize_RO_                            = spirit_oSize_RO_;
+    worder.spirit_oSize_E1_                            = spirit_oSize_E1_;
+    worder.spirit_oSize_E2_                            = spirit_oSize_E2_;
+    worder.spirit_reg_lamda_                           = spirit_reg_lamda_;
+    worder.spirit_use_gpu_                             = spirit_use_gpu_;
+    worder.spirit_calib_over_determine_ratio_          = spirit_calib_over_determine_ratio_;
+    worder.spirit_solve_symmetric_                     = spirit_solve_symmetric_;
+    worder.spirit_iter_max_                            = spirit_iter_max_;
+    worder.spirit_iter_thres_                          = spirit_iter_thres_;
+    worder.spirit_print_iter_                          = spirit_print_iter_;
+
+    worder.spirit_perform_linear_                      = spirit_perform_linear_;
+    worder.spirit_perform_grappa_linear_               = spirit_perform_grappa_linear_;
+    worder.spirit_perform_nonlinear_                   = spirit_perform_nonlinear_;
+    worder.spirit_parallel_imaging_lamda_              = spirit_parallel_imaging_lamda_;
+    worder.spirit_image_reg_lamda_                     = spirit_image_reg_lamda_;
+    worder.spirit_data_fidelity_lamda_                 = spirit_data_fidelity_lamda_;
+    worder.spirit_ncg_iter_max_                        = spirit_ncg_iter_max_;
+    worder.spirit_ncg_iter_thres_                      = spirit_ncg_iter_thres_;
+    worder.spirit_ncg_scale_factor_                    = spirit_ncg_scale_factor_;
+    worder.spirit_ncg_print_iter_                      = spirit_ncg_print_iter_;
+    worder.spirit_slep_iter_max_                       = spirit_slep_iter_max_;
+    worder.spirit_slep_iter_thres_                     = spirit_slep_iter_thres_;
+    worder.spirit_slep_print_iter_                     = spirit_slep_print_iter_;
+    worder.spirit_slep_keep_third_dimension_coeff_     = spirit_slep_keep_third_dimension_coeff_;
+    worder.spirit_slep_keep_approx_coeff_              = spirit_slep_keep_approx_coeff_;
+    worder.spirit_slep_scale_factor_                   = spirit_slep_scale_factor_;
+    worder.spirit_use_coil_sen_map_                    = spirit_use_coil_sen_map_;
+    worder.spirit_use_moco_enhancement_                = spirit_use_moco_enhancement_;
+    worder.spirit_recon_moco_images_                   = spirit_recon_moco_images_;
+    worder.spirit_RO_enhancement_ratio_                = spirit_RO_enhancement_ratio_;
+    worder.spirit_E1_enhancement_ratio_                = spirit_E1_enhancement_ratio_;
+    worder.spirit_E2_enhancement_ratio_                = spirit_E2_enhancement_ratio_;
+    worder.spirit_temporal_enhancement_ratio_          = spirit_temporal_enhancement_ratio_;
+    worder.spirit_2D_scale_per_chunk_                  = spirit_2D_scale_per_chunk_;
+    worder.spirit_3D_scale_per_chunk_                  = spirit_3D_scale_per_chunk_;
+
+    worder.retro_gated_images_                         = retro_gated_images_;
+    worder.retro_gated_segment_size_                   = retro_gated_segment_size_;
+    worder.retro_gated_interp_method_                  = retro_gated_interp_method_;
+
+    worder.kspace_binning_number_of_cardiac_phases_                 = kspace_binning_number_of_cardiac_phases_;
+    worder.kspace_binning_minimal_cardiac_phase_width_              = kspace_binning_minimal_cardiac_phase_width_;
+    worder.kspace_binning_multiple_channel_recon_                   = kspace_binning_multiple_channel_recon_;
+    worder.kspace_binning_iterative_non_linear_recon_               = kspace_binning_iterative_non_linear_recon_;
+    worder.kspace_binning_iterative_non_linear_recon_slep_          = kspace_binning_iterative_non_linear_recon_slep_;
+    worder.kspace_binning_multiple_channel_recon_with_coil_map_     = kspace_binning_multiple_channel_recon_with_coil_map_;
+    worder.kspace_binning_compute_navigator_signal_                 = kspace_binning_compute_navigator_signal_;
+    worder.kspace_binning_navigator_moco_level_                     = kspace_binning_navigator_moco_level_;
+    memcpy(worder.kspace_binning_navigator_moco_iter_, kspace_binning_navigator_moco_iter_, sizeof(size_t)*MAX_MOCO_LEVEL);
+    worder.kspace_binning_navigator_hilbert_strength_               = kspace_binning_navigator_hilbert_strength_;
+    worder.kspace_binning_navigator_dissimilarity_sigma_            = kspace_binning_navigator_dissimilarity_sigma_;
+    worder.kspace_binning_navigator_bidirectional_moco_             = kspace_binning_navigator_bidirectional_moco_;
+    worder.kspace_binning_moco_level_                               = kspace_binning_moco_level_;
+    memcpy(worder.kspace_binning_moco_iter_, kspace_binning_moco_iter_, sizeof(size_t)*MAX_MOCO_LEVEL);
+    worder.kspace_binning_moco_hilbert_strength_                    = kspace_binning_moco_hilbert_strength_;
+    worder.kspace_binning_moco_dissimilarity_sigma_                 = kspace_binning_moco_dissimilarity_sigma_;
+    worder.kspace_binning_bidirectional_moco_                       = kspace_binning_bidirectional_moco_;
+    worder.kspace_binning_soft_combination_                         = kspace_binning_soft_combination_;
+    worder.kspace_binning_navigator_window_wide_                    = kspace_binning_navigator_window_wide_;
+    worder.kspace_binning_navigator_window_narrow_                  = kspace_binning_navigator_window_narrow_;
+    worder.kspace_binning_method_warpping_                          = kspace_binning_method_warpping_;
+    worder.kspace_binning_exclude_last_cardiac_cycle_               = kspace_binning_exclude_last_cardiac_cycle_;
+    worder.kspace_binning_number_of_central_kspace_blocks_          = kspace_binning_number_of_central_kspace_blocks_;
+    worder.kspace_binning_max_temporal_window_                      = kspace_binning_max_temporal_window_;
+    worder.kspace_binning_temporal_window_                          = kspace_binning_temporal_window_;
+    worder.kspace_binning_best_cardiac_cycle_interpolator_          = kspace_binning_best_cardiac_cycle_interpolator_;
+    worder.kspace_binning_data_length_used_for_recon_               = kspace_binning_data_length_used_for_recon_;
+    worder.kspace_binning_fill_kspace_with_neighbors_               = kspace_binning_fill_kspace_with_neighbors_;
+    worder.kspace_binning_flow_in_e1_                               = kspace_binning_flow_in_e1_;
+    worder.kspace_binning_flow_recon_jointly_                       = kspace_binning_flow_recon_jointly_;
+
+    worder.motion_comp_num_of_PD_images_                            = motion_comp_num_of_PD_images_;
+
+    worder.job_split_by_S_                             = job_split_by_S_;
+    worder.job_num_of_N_                               = job_num_of_N_;
+    worder.job_max_Megabytes_                          = job_max_Megabytes_;
+    worder.job_overlap_                                = job_overlap_;
+    worder.job_perform_on_control_node_                = job_perform_on_control_node_;
+
+    worder.partialFourier_algo_                        = partialFourier_algo_;
+
+    worder.partialFourier_homodyne_iters_              = partialFourier_homodyne_iters_;
+    worder.partialFourier_homodyne_thres_              = partialFourier_homodyne_thres_;
+    worder.partialFourier_homodyne_densityComp_        = partialFourier_homodyne_densityComp_;
+
+    worder.partialFourier_POCS_iters_                  = partialFourier_POCS_iters_;
+    worder.partialFourier_POCS_thres_                  = partialFourier_POCS_thres_;
+    worder.partialFourier_POCS_transitBand_            = partialFourier_POCS_transitBand_;
+    worder.partialFourier_POCS_transitBand_E2_         = partialFourier_POCS_transitBand_E2_;
+
+    worder.partialFourier_FengHuang_kSize_RO_          = partialFourier_FengHuang_kSize_RO_;
+    worder.partialFourier_FengHuang_kSize_E1_          = partialFourier_FengHuang_kSize_E1_;
+    worder.partialFourier_FengHuang_kSize_E2_          = partialFourier_FengHuang_kSize_E2_;
+    worder.partialFourier_FengHuang_thresReg_          = partialFourier_FengHuang_thresReg_;
+    worder.partialFourier_FengHuang_sameKernel_allN_   = partialFourier_FengHuang_sameKernel_allN_;
+    worder.partialFourier_FengHuang_transitBand_       = partialFourier_FengHuang_transitBand_;
+    worder.partialFourier_FengHuang_transitBand_E2_    = partialFourier_FengHuang_transitBand_E2_;
+}
+
+template <typename T> 
+void gtPlusReconWorkOrder<T>::duplicate(gtPlusReconWorkOrder<T>& worder) const
+{
+    this->duplicatePara(worder);
+
+    worder.dataDimStartingIndexes_      = dataDimStartingIndexes_;
+
+    worder.filterRO_                    = filterRO_;
+    worder.filterE1_                    = filterE1_;
+    worder.filterE2_                    = filterE2_;
+    worder.filterROE1_                  = filterROE1_;
+    worder.filterROE1E2_                = filterROE1E2_;
+
+    worder.filterRO_ref_                = filterRO_ref_;
+    worder.filterE1_ref_                = filterE1_ref_;
+    worder.filterE2_ref_                = filterE2_ref_;
+    worder.filterROE1_ref_              = filterROE1_ref_;
+    worder.filterROE1E2_ref_            = filterROE1E2_ref_;
+
+    worder.filterRO_partialfourier_     = filterRO_partialfourier_;
+    worder.filterE1_partialfourier_     = filterE1_partialfourier_;
+    worder.filterE2_partialfourier_     = filterE2_partialfourier_;
+    worder.filterROE1_partialfourier_   = filterROE1_partialfourier_;
+    worder.filterROE1E2_partialfourier_ = filterROE1E2_partialfourier_;
+
+    worder.CloudComputing_              = CloudComputing_;
+    worder.CloudSize_                   = CloudSize_;
+    worder.gt_cloud_                    = gt_cloud_;
+}
+
+template <typename T> 
+void gtPlusReconWorkOrder<T>::copyFromPara(const gtPlusReconWorkOrderPara& worder)
+{
+    CalibMode_                                  = worder.CalibMode_;
+    InterleaveDim_                              = worder.InterleaveDim_;
+
+    acceFactorE1_                               = worder.acceFactorE1_;
+    acceFactorE2_                               = worder.acceFactorE2_;
+
+    kSpaceCenterRO_                             = worder.kSpaceCenterRO_;
+    kSpaceCenterEncode1_                        = worder.kSpaceCenterEncode1_;
+    kSpaceCenterEncode2_                        = worder.kSpaceCenterEncode2_;
+
+    kSpaceMaxRO_                                = worder.kSpaceMaxRO_;
+    kSpaceMaxEncode1_                           = worder.kSpaceMaxEncode1_;
+    kSpaceMaxEncode2_                           = worder.kSpaceMaxEncode2_;
+
+    workFlow_BufferKernel_                      = worder.workFlow_BufferKernel_;
+    workFlow_use_BufferedKernel_                = worder.workFlow_use_BufferedKernel_;
+    num_channels_res_                           = worder.num_channels_res_;
+
+    upstream_coil_compression_                  = worder.upstream_coil_compression_;
+    upstream_coil_compression_thres_            = worder.upstream_coil_compression_thres_;
+    upstream_coil_compression_num_modesKept_    = worder.upstream_coil_compression_num_modesKept_;
+
+    downstream_coil_compression_                = worder.downstream_coil_compression_;
+    coil_compression_thres_                     = worder.coil_compression_thres_;
+    coil_compression_num_modesKept_             = worder.coil_compression_num_modesKept_;
+
+    coil_map_algorithm_                         = worder.coil_map_algorithm_;
+    csm_kSize_                                  = worder.csm_kSize_;
+    csm_powermethod_num_                        = worder.csm_powermethod_num_;
+    csm_true_3D_                                = worder.csm_true_3D_;
+    csm_iter_num_                               = worder.csm_iter_num_;
+    csm_iter_thres_                             = worder.csm_iter_thres_;
+    csm_use_gpu_                                = worder.csm_use_gpu_;
+
+    start_RO_                                   = worder.start_RO_;
+    end_RO_                                     = worder.end_RO_;
+
+    start_E1_                                   = worder.start_E1_;
+    end_E1_                                     = worder.end_E1_;
+
+    start_E2_                                   = worder.start_E2_;
+    end_E2_                                     = worder.end_E2_;
+
+    recon_algorithm_                            = worder.recon_algorithm_;
+    recon_auto_parameters_                      = worder.recon_auto_parameters_;
+    gfactor_needed_                             = worder.gfactor_needed_;
+    wrap_around_map_needed_                     = worder.wrap_around_map_needed_;
+
+    grappa_kSize_RO_                            = worder.grappa_kSize_RO_;
+    grappa_kSize_E1_                            = worder.grappa_kSize_E1_;
+    grappa_kSize_E2_                            = worder.grappa_kSize_E2_;
+    grappa_reg_lamda_                           = worder.grappa_reg_lamda_;
+    grappa_calib_over_determine_ratio_          = worder.grappa_calib_over_determine_ratio_;
+    grappa_use_gpu_                             = worder.grappa_use_gpu_;
+
+    spirit_kSize_RO_                            = worder.spirit_kSize_RO_;
+    spirit_kSize_E1_                            = worder.spirit_kSize_E1_;
+    spirit_kSize_E2_                            = worder.spirit_kSize_E2_;
+    spirit_oSize_RO_                            = worder.spirit_oSize_RO_;
+    spirit_oSize_E1_                            = worder.spirit_oSize_E1_;
+    spirit_oSize_E2_                            = worder.spirit_oSize_E2_;
+    spirit_reg_lamda_                           = worder.spirit_reg_lamda_;
+    spirit_use_gpu_                             = worder.spirit_use_gpu_;
+    spirit_calib_over_determine_ratio_          = worder.spirit_calib_over_determine_ratio_;
+    spirit_solve_symmetric_                     = worder.spirit_solve_symmetric_;
+    spirit_iter_max_                            = worder.spirit_iter_max_;
+    spirit_iter_thres_                          = worder.spirit_iter_thres_;
+    spirit_print_iter_                          = worder.spirit_print_iter_;
+
+    spirit_perform_linear_                      = worder.spirit_perform_linear_;
+    spirit_perform_grappa_linear_               = worder.spirit_perform_grappa_linear_;
+    spirit_perform_nonlinear_                   = worder.spirit_perform_nonlinear_;
+    spirit_parallel_imaging_lamda_              = worder.spirit_parallel_imaging_lamda_;
+    spirit_image_reg_lamda_                     = worder.spirit_image_reg_lamda_;
+    spirit_data_fidelity_lamda_                 = worder.spirit_data_fidelity_lamda_;
+    spirit_ncg_iter_max_                        = worder.spirit_ncg_iter_max_;
+    spirit_ncg_iter_thres_                      = worder.spirit_ncg_iter_thres_;
+    spirit_ncg_scale_factor_                    = worder.spirit_ncg_scale_factor_;
+    spirit_ncg_print_iter_                      = worder.spirit_ncg_print_iter_;
+    spirit_slep_iter_max_                       = worder.spirit_slep_iter_max_;
+    spirit_slep_iter_thres_                     = worder.spirit_slep_iter_thres_;
+    spirit_slep_print_iter_                     = worder.spirit_slep_print_iter_;
+    spirit_slep_keep_third_dimension_coeff_     = worder.spirit_slep_keep_third_dimension_coeff_;
+    spirit_slep_keep_approx_coeff_              = worder.spirit_slep_keep_approx_coeff_;
+    spirit_slep_scale_factor_                   = worder.spirit_slep_scale_factor_;
+    spirit_use_coil_sen_map_                    = worder.spirit_use_coil_sen_map_;
+    spirit_use_moco_enhancement_                = worder.spirit_use_moco_enhancement_;
+    spirit_recon_moco_images_                   = worder.spirit_recon_moco_images_;
+    spirit_RO_enhancement_ratio_                = worder.spirit_RO_enhancement_ratio_;
+    spirit_E1_enhancement_ratio_                = worder.spirit_E1_enhancement_ratio_;
+    spirit_E2_enhancement_ratio_                = worder.spirit_E2_enhancement_ratio_;
+    spirit_temporal_enhancement_ratio_          = worder.spirit_temporal_enhancement_ratio_;
+    spirit_2D_scale_per_chunk_                  = worder.spirit_2D_scale_per_chunk_;
+    spirit_3D_scale_per_chunk_                  = worder.spirit_3D_scale_per_chunk_;
+
+    retro_gated_images_                         = worder.retro_gated_images_;
+    retro_gated_segment_size_                   = worder.retro_gated_segment_size_;
+    retro_gated_interp_method_                  = worder.retro_gated_interp_method_;
+
+    kspace_binning_number_of_cardiac_phases_          = worder.kspace_binning_number_of_cardiac_phases_;
+    kspace_binning_minimal_cardiac_phase_width_          = worder.kspace_binning_minimal_cardiac_phase_width_;
+    kspace_binning_multiple_channel_recon_         = worder.kspace_binning_multiple_channel_recon_;
+    kspace_binning_iterative_non_linear_recon_              = worder.kspace_binning_iterative_non_linear_recon_;
+    kspace_binning_iterative_non_linear_recon_slep_              = worder.kspace_binning_iterative_non_linear_recon_slep_;
+    kspace_binning_multiple_channel_recon_with_coil_map_ = worder.kspace_binning_multiple_channel_recon_with_coil_map_;
+    kspace_binning_compute_navigator_signal_       = worder.kspace_binning_compute_navigator_signal_;
+    kspace_binning_navigator_moco_level_                = worder.kspace_binning_navigator_moco_level_;
+    memcpy(kspace_binning_navigator_moco_iter_, worder.kspace_binning_navigator_moco_iter_, sizeof(size_t)*MAX_MOCO_LEVEL);
+    kspace_binning_navigator_hilbert_strength_                    = worder.kspace_binning_navigator_hilbert_strength_;
+    kspace_binning_navigator_dissimilarity_sigma_                 = worder.kspace_binning_navigator_dissimilarity_sigma_;
+    kspace_binning_navigator_bidirectional_moco_         = worder.kspace_binning_navigator_bidirectional_moco_;
+    kspace_binning_moco_level_                   = worder.kspace_binning_moco_level_;
+    memcpy(kspace_binning_moco_iter_, worder.kspace_binning_moco_iter_, sizeof(size_t)*MAX_MOCO_LEVEL);
+    kspace_binning_moco_hilbert_strength_                       = worder.kspace_binning_moco_hilbert_strength_;
+    kspace_binning_moco_dissimilarity_sigma_                    = worder.kspace_binning_moco_dissimilarity_sigma_;
+    kspace_binning_bidirectional_moco_            = worder.kspace_binning_bidirectional_moco_;
+    kspace_binning_soft_combination_            = worder.kspace_binning_soft_combination_;
+    kspace_binning_navigator_window_wide_                  = worder.kspace_binning_navigator_window_wide_;
+    kspace_binning_navigator_window_narrow_                = worder.kspace_binning_navigator_window_narrow_;
+    kspace_binning_method_warpping_              = worder.kspace_binning_method_warpping_;
+    kspace_binning_exclude_last_cardiac_cycle_            = worder.kspace_binning_exclude_last_cardiac_cycle_;
+    kspace_binning_number_of_central_kspace_blocks_         = worder.kspace_binning_number_of_central_kspace_blocks_;
+    kspace_binning_max_temporal_window_    = worder.kspace_binning_max_temporal_window_;
+    kspace_binning_temporal_window_    = worder.kspace_binning_temporal_window_;
+    kspace_binning_best_cardiac_cycle_interpolator_        = worder.kspace_binning_best_cardiac_cycle_interpolator_;
+    kspace_binning_data_length_used_for_recon_            = worder.kspace_binning_data_length_used_for_recon_;
+    kspace_binning_fill_kspace_with_neighbors_ = worder.kspace_binning_fill_kspace_with_neighbors_;
+    kspace_binning_flow_in_e1_ = worder.kspace_binning_flow_in_e1_;
+    kspace_binning_flow_recon_jointly_ = worder.kspace_binning_flow_recon_jointly_;
+
+    motion_comp_num_of_PD_images_ = worder.motion_comp_num_of_PD_images_;
+
+    job_split_by_S_                             = worder.job_split_by_S_;
+    job_num_of_N_                               = worder.job_num_of_N_;
+    job_max_Megabytes_                          = worder.job_max_Megabytes_;
+    job_overlap_                                = worder.job_overlap_;
+    job_perform_on_control_node_                = worder.job_perform_on_control_node_;
+
+    partialFourier_algo_                        = worder.partialFourier_algo_;
+
+    partialFourier_homodyne_iters_              = worder.partialFourier_homodyne_iters_;
+    partialFourier_homodyne_thres_              = worder.partialFourier_homodyne_thres_;
+    partialFourier_homodyne_densityComp_        = worder.partialFourier_homodyne_densityComp_;
+
+    partialFourier_POCS_iters_                  = worder.partialFourier_POCS_iters_;
+    partialFourier_POCS_thres_                  = worder.partialFourier_POCS_thres_;
+    partialFourier_POCS_transitBand_            = worder.partialFourier_POCS_transitBand_;
+    partialFourier_POCS_transitBand_E2_         = worder.partialFourier_POCS_transitBand_E2_;
+
+    partialFourier_FengHuang_kSize_RO_          = worder.partialFourier_FengHuang_kSize_RO_;
+    partialFourier_FengHuang_kSize_E1_          = worder.partialFourier_FengHuang_kSize_E1_;
+    partialFourier_FengHuang_kSize_E2_          = worder.partialFourier_FengHuang_kSize_E2_;
+    partialFourier_FengHuang_thresReg_          = worder.partialFourier_FengHuang_thresReg_;
+    partialFourier_FengHuang_sameKernel_allN_   = worder.partialFourier_FengHuang_sameKernel_allN_;
+    partialFourier_FengHuang_transitBand_       = worder.partialFourier_FengHuang_transitBand_;
+    partialFourier_FengHuang_transitBand_E2_    = worder.partialFourier_FengHuang_transitBand_E2_;
+}
+
+template <typename T> 
+void gtPlusReconWorkOrder<T>::printInfo(std::ostream& os) const
+{
+    using namespace std;
+    GADGET_PARA_PRINT(CalibMode_);
+    GADGET_PARA_PRINT(InterleaveDim_);
+    GADGET_PARA_PRINT(acceFactorE1_);
+    GADGET_PARA_PRINT(acceFactorE2_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(kSpaceCenterRO_);
+    GADGET_PARA_PRINT(kSpaceCenterEncode1_);
+    GADGET_PARA_PRINT(kSpaceCenterEncode2_);
+    GADGET_PARA_PRINT(kSpaceMaxRO_);
+    GADGET_PARA_PRINT(kSpaceMaxEncode1_);
+    GADGET_PARA_PRINT(kSpaceMaxEncode2_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(workFlow_BufferKernel_);
+    GADGET_PARA_PRINT(workFlow_use_BufferedKernel_);
+    GADGET_PARA_PRINT(num_channels_res_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(upstream_coil_compression_);
+    GADGET_PARA_PRINT(upstream_coil_compression_thres_);
+    GADGET_PARA_PRINT(upstream_coil_compression_num_modesKept_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(downstream_coil_compression_);
+    GADGET_PARA_PRINT(coil_compression_thres_);
+    GADGET_PARA_PRINT(coil_compression_num_modesKept_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(coil_map_algorithm_);
+    GADGET_PARA_PRINT(csm_kSize_);
+    GADGET_PARA_PRINT(csm_powermethod_num_);
+    GADGET_PARA_PRINT(csm_true_3D_);
+    GADGET_PARA_PRINT(csm_iter_num_);
+    GADGET_PARA_PRINT(csm_iter_thres_);
+    GADGET_PARA_PRINT(csm_use_gpu_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(start_RO_);
+    GADGET_PARA_PRINT(end_RO_);
+    GADGET_PARA_PRINT(start_E1_);
+    GADGET_PARA_PRINT(end_E1_);
+    GADGET_PARA_PRINT(start_E2_);
+    GADGET_PARA_PRINT(end_E2_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(recon_algorithm_);
+    GADGET_PARA_PRINT(recon_auto_parameters_);
+    GADGET_PARA_PRINT(gfactor_needed_);
+    GADGET_PARA_PRINT(wrap_around_map_needed_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(grappa_kSize_RO_);
+    GADGET_PARA_PRINT(grappa_kSize_E1_);
+    GADGET_PARA_PRINT(grappa_kSize_E2_);
+    GADGET_PARA_PRINT(grappa_reg_lamda_);
+    GADGET_PARA_PRINT(grappa_calib_over_determine_ratio_);
+    GADGET_PARA_PRINT(grappa_use_gpu_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(spirit_kSize_RO_);
+    GADGET_PARA_PRINT(spirit_kSize_E1_);
+    GADGET_PARA_PRINT(spirit_kSize_E2_);
+    GADGET_PARA_PRINT(spirit_oSize_RO_);
+    GADGET_PARA_PRINT(spirit_oSize_E1_);
+    GADGET_PARA_PRINT(spirit_oSize_E2_);
+    GADGET_PARA_PRINT(spirit_reg_lamda_);
+    GADGET_PARA_PRINT(spirit_use_gpu_);
+    GADGET_PARA_PRINT(spirit_calib_over_determine_ratio_);
+    GADGET_PARA_PRINT(spirit_solve_symmetric_);
+    GADGET_PARA_PRINT(spirit_iter_max_);
+    GADGET_PARA_PRINT(spirit_iter_thres_);
+    GADGET_PARA_PRINT(spirit_print_iter_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(spirit_perform_linear_);
+    GADGET_PARA_PRINT(spirit_perform_grappa_linear_);
+    GADGET_PARA_PRINT(spirit_perform_nonlinear_);
+    GADGET_PARA_PRINT(spirit_parallel_imaging_lamda_);
+    GADGET_PARA_PRINT(spirit_image_reg_lamda_);
+    GADGET_PARA_PRINT(spirit_data_fidelity_lamda_);
+    GADGET_PARA_PRINT(spirit_ncg_iter_max_);
+    GADGET_PARA_PRINT(spirit_ncg_iter_thres_);
+    GADGET_PARA_PRINT(spirit_ncg_scale_factor_);
+    GADGET_PARA_PRINT(spirit_ncg_print_iter_);
+    GADGET_PARA_PRINT(spirit_slep_iter_max_);
+    GADGET_PARA_PRINT(spirit_slep_iter_thres_);
+    GADGET_PARA_PRINT(spirit_slep_print_iter_);
+    GADGET_PARA_PRINT(spirit_slep_keep_third_dimension_coeff_);
+    GADGET_PARA_PRINT(spirit_slep_keep_approx_coeff_);
+    GADGET_PARA_PRINT(spirit_slep_scale_factor_);
+    GADGET_PARA_PRINT(spirit_use_coil_sen_map_);
+    GADGET_PARA_PRINT(spirit_use_moco_enhancement_);
+    GADGET_PARA_PRINT(spirit_recon_moco_images_);
+    GADGET_PARA_PRINT(spirit_RO_enhancement_ratio_);
+    GADGET_PARA_PRINT(spirit_E1_enhancement_ratio_);
+    GADGET_PARA_PRINT(spirit_E2_enhancement_ratio_);
+    GADGET_PARA_PRINT(spirit_temporal_enhancement_ratio_);
+    GADGET_PARA_PRINT(spirit_2D_scale_per_chunk_);
+    GADGET_PARA_PRINT(spirit_3D_scale_per_chunk_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(retro_gated_images_);
+    GADGET_PARA_PRINT(retro_gated_segment_size_);
+    GADGET_PARA_PRINT(retro_gated_interp_method_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(kspace_binning_number_of_cardiac_phases_);
+    GADGET_PARA_PRINT(kspace_binning_minimal_cardiac_phase_width_);
+    GADGET_PARA_PRINT(kspace_binning_multiple_channel_recon_);
+    GADGET_PARA_PRINT(kspace_binning_iterative_non_linear_recon_);
+    GADGET_PARA_PRINT(kspace_binning_iterative_non_linear_recon_slep_);
+    GADGET_PARA_PRINT(kspace_binning_multiple_channel_recon_with_coil_map_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(kspace_binning_compute_navigator_signal_);
+
+    GADGET_PARA_PRINT(kspace_binning_navigator_moco_level_);
+    std::stringstream ostr;
+    ostr << " [ ";
+    size_t ii;
+    for ( ii=0; ii<kspace_binning_navigator_moco_level_; ii++ )
+    {
+        ostr << kspace_binning_navigator_moco_iter_[ii] << " ";
+    }
+    ostr << " ] " << std::endl;
+    GDEBUG_STREAM(ostr.str());
+
+    GADGET_PARA_PRINT(kspace_binning_navigator_hilbert_strength_);
+    GADGET_PARA_PRINT(kspace_binning_navigator_dissimilarity_sigma_);
+    GADGET_PARA_PRINT(kspace_binning_navigator_bidirectional_moco_);
+    GDEBUG_STREAM("---------------------");
+
+    GADGET_PARA_PRINT(kspace_binning_moco_level_);
+    std::stringstream ostr_moco_level;
+    ostr_moco_level << " [ ";
+    for ( ii=0; ii<kspace_binning_moco_level_; ii++ )
+    {
+        ostr_moco_level << kspace_binning_moco_iter_[ii] << " ";
+    }
+    ostr_moco_level << " ] " << std::endl;
+    GDEBUG_STREAM(ostr_moco_level.str());
+
+    GADGET_PARA_PRINT(kspace_binning_moco_hilbert_strength_);
+    GADGET_PARA_PRINT(kspace_binning_moco_dissimilarity_sigma_);
+    GADGET_PARA_PRINT(kspace_binning_bidirectional_moco_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(kspace_binning_soft_combination_);
+    GADGET_PARA_PRINT(kspace_binning_navigator_window_wide_);
+    GADGET_PARA_PRINT(kspace_binning_navigator_window_narrow_);
+    GADGET_PARA_PRINT(kspace_binning_method_warpping_);
+    GADGET_PARA_PRINT(kspace_binning_exclude_last_cardiac_cycle_);
+    GADGET_PARA_PRINT(kspace_binning_number_of_central_kspace_blocks_);
+    GADGET_PARA_PRINT(kspace_binning_max_temporal_window_);
+    GADGET_PARA_PRINT(kspace_binning_temporal_window_);
+    GADGET_PARA_PRINT(kspace_binning_best_cardiac_cycle_interpolator_);
+    GADGET_PARA_PRINT(kspace_binning_data_length_used_for_recon_);
+    GADGET_PARA_PRINT(kspace_binning_fill_kspace_with_neighbors_);
+    GADGET_PARA_PRINT(kspace_binning_flow_in_e1_);
+    GADGET_PARA_PRINT(kspace_binning_flow_recon_jointly_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(motion_comp_num_of_PD_images_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(job_split_by_S_);
+    GADGET_PARA_PRINT(job_num_of_N_);
+    GADGET_PARA_PRINT(job_max_Megabytes_);
+    GADGET_PARA_PRINT(job_overlap_);
+    GADGET_PARA_PRINT(job_perform_on_control_node_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(partialFourier_algo_);
+    GADGET_PARA_PRINT(partialFourier_homodyne_iters_);
+    GADGET_PARA_PRINT(partialFourier_homodyne_thres_);
+    GADGET_PARA_PRINT(partialFourier_homodyne_densityComp_);
+    GADGET_PARA_PRINT(partialFourier_POCS_iters_);
+    GADGET_PARA_PRINT(partialFourier_POCS_thres_);
+    GADGET_PARA_PRINT(partialFourier_POCS_transitBand_);
+    GADGET_PARA_PRINT(partialFourier_POCS_transitBand_E2_);
+    GADGET_PARA_PRINT(partialFourier_FengHuang_kSize_RO_);
+    GADGET_PARA_PRINT(partialFourier_FengHuang_kSize_E1_);
+    GADGET_PARA_PRINT(partialFourier_FengHuang_kSize_E2_);
+    GADGET_PARA_PRINT(partialFourier_FengHuang_thresReg_);
+    GADGET_PARA_PRINT(partialFourier_FengHuang_sameKernel_allN_);
+    GADGET_PARA_PRINT(partialFourier_FengHuang_transitBand_);
+    GADGET_PARA_PRINT(partialFourier_FengHuang_transitBand_E2_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(CloudComputing_);
+    GADGET_PARA_PRINT(CloudSize_);
+    for ( unsigned int nn=0; nn<gt_cloud_.size(); nn++ )
+    {
+        GADGET_PARA_PRINT(gt_cloud_[nn]);
+    }
+}
+
+template <typename T> 
+void gtPlusReconWorkOrder<T>::print(std::ostream& os) const
+{
+    using namespace std;
+    os << "-------------- gtPlusReconWorkOrder ---------------" << endl;
+    printInfo(os);
+    os << "---------------------------------------------------" << endl;
+}
+
+}}
+
+#include "gtPlusISMRMRDReconWorkOrder2DT.h"
+#include "gtPlusISMRMRDReconWorkOrder3DT.h"
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkOrder2DT.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkOrder2DT.h
new file mode 100644
index 0000000..0dac730
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkOrder2DT.h
@@ -0,0 +1,409 @@
+/** \file   gtPlusISMRMRDReconWorkOrder2DT.h
+    \brief  Define the GtPlus reconstruction workorder and parameters for 2DT reconstruction
+    \author Hui Xue
+*/
+
+#pragma once
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusReconWorkOrder2DT : public gtPlusReconWorkOrder<T>
+{
+public:
+
+    typedef gtPlusReconWorkOrder<T> BaseClass;
+
+    gtPlusReconWorkOrder2DT();
+    virtual ~gtPlusReconWorkOrder2DT();
+
+    virtual bool reset();
+
+    virtual bool enforceConsistency(ISMRMRDDIM& lastDim);
+    virtual void duplicate(gtPlusReconWorkOrder2DT<T>& worder);
+
+    virtual void printInfo(std::ostream& os) const;
+    virtual void print(std::ostream& os) const;
+
+    // kspace_: [RO E1 CHA N S], for 2D recon, N can be 1
+    // ref_: [RO E1 CHA M S], M can equal N, 1, or another value
+    // fullkspace_: [RO E1 CHA N S]
+    // complexIm_: [RO E1 N S], after coil combination or [RO E1 num_channels_res_ N S] if num_channels_res_ > 1
+    // coilMap_: [RO E1 CHA 1 or N S]
+    // gfactor_: [RO E1 CHA 1 or N S]
+
+    // the fifth dimension can be SLC or SET or others
+
+    // default behavior
+    // a) the coil compression coefficients are computed once across all S
+    // b) the kernel or coil sensitivity is estimated for every S
+
+    // embedded mode
+    // a) perform recon and estimate kernel/coil sensitivity for every 2D kspace [RO E1 CHA]
+    // b) coil combination uses different coil maps for every S
+    // c) if the kspace recon is performed, the coil combination map is re-estimated from the full kspace for every 2D image
+    // d) the ref lines are filled back to fullkspace_
+
+    // separate mode
+    // a) perform recon and estimate kernel/coil sensitivity for every 2D kspace [RO E1 CHA] if M==N
+    // b) if M==1, the kernel is only estimated once for every S
+    // c) coil combination uses different coil maps for every S
+    // d) if the kspace recon is performed, the coil combination map is re-estimated from the full kspace for every 2D image
+
+    // interleave
+    // a) the average-all ref is used
+    // b) kernel/coil sensitivity is estimated once for every S
+
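+    // Illustration (editor's note; the numbers are hypothetical): for a 2DT cine
+    // acquisition with RO=256, E1=120, CHA=32, N=30 cardiac phases and S=10 slices,
+    // data_ would be a [256 120 32 30 10] array; in separate mode with a single
+    // averaged ref per slice, ref_ would be [256 120 32 1 10] (M==1).
+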
+    using BaseClass::data_;
+    using BaseClass::ref_;
+    using BaseClass::ref_recon_;
+    using BaseClass::ref_coil_map_;
+    using BaseClass::kspace_initial_;
+    using BaseClass::CalibMode_;
+    using BaseClass::InterleaveDim_;
+    using BaseClass::acceFactorE1_;
+    using BaseClass::acceFactorE2_;
+    using BaseClass::num_channels_res_;
+    using BaseClass::coilMap_; // [RO E1 dstCHA 1 or N S]
+
+    using BaseClass::fullkspace_; // [RO E1 dstCHA N S]
+    using BaseClass::complexIm_; // [RO E1 N S]
+    using BaseClass::recon_time_stamp_; // [1 1 1 N S]
+    using BaseClass::recon_physio_time_stamp_; // [1 1 1 N S]
+
+    using BaseClass::fullkspace_second_; // [RO E1 dstCHA N S]
+    using BaseClass::complexIm_second_; // [RO E1 N S]
+    using BaseClass::recon_time_stamp_second_; // [1 1 1 N S]
+    using BaseClass::recon_physio_time_stamp_second_; // [1 1 1 N S]
+
+    using BaseClass::gfactor_; // [RO E1 1 or N S]
+    using BaseClass::wrap_around_map_; // [RO E1 2 1 or N S]
+
+    using BaseClass::downstream_coil_compression_;
+    using BaseClass::coil_compression_thres_;
+    using BaseClass::coil_compression_num_modesKept_;
+    using BaseClass::csm_kSize_;
+    using BaseClass::csm_powermethod_num_;
+    using BaseClass::csm_true_3D_;
+    using BaseClass::csm_iter_num_;
+    using BaseClass::csm_iter_thres_;
+    using BaseClass::csm_use_gpu_;
+    using BaseClass::start_RO_;
+    using BaseClass::end_RO_;
+    using BaseClass::start_E1_;
+    using BaseClass::end_E1_;
+    using BaseClass::start_E2_;
+    using BaseClass::end_E2_;
+
+    using BaseClass::filterRO_;
+    using BaseClass::filterE1_;
+    using BaseClass::filterE2_;
+    using BaseClass::filterROE1_;
+    using BaseClass::filterROE1E2_;
+
+    using BaseClass::filterRO_ref_;
+    using BaseClass::filterE1_ref_;
+    using BaseClass::filterE2_ref_;
+    using BaseClass::filterROE1_ref_;
+    using BaseClass::filterROE1E2_ref_;
+
+    using BaseClass::filterRO_partialfourier_;
+    using BaseClass::filterE1_partialfourier_;
+    using BaseClass::filterE2_partialfourier_;
+    using BaseClass::filterROE1_partialfourier_;
+    using BaseClass::filterROE1E2_partialfourier_;
+
+    using BaseClass::recon_algorithm_;
+
+    using BaseClass::grappa_kSize_RO_;
+    using BaseClass::grappa_kSize_E1_;
+    using BaseClass::grappa_kSize_E2_;
+    using BaseClass::grappa_reg_lamda_;
+    using BaseClass::grappa_calib_over_determine_ratio_;
+    using BaseClass::grappa_use_gpu_;
+
+    using BaseClass::spirit_kSize_RO_;
+    using BaseClass::spirit_kSize_E1_;
+    using BaseClass::spirit_kSize_E2_;
+    using BaseClass::spirit_oSize_RO_;
+    using BaseClass::spirit_oSize_E1_;
+    using BaseClass::spirit_oSize_E2_;
+    using BaseClass::spirit_reg_lamda_;
+    using BaseClass::spirit_use_gpu_;
+    using BaseClass::spirit_iter_max_;
+    using BaseClass::spirit_iter_thres_;
+    using BaseClass::spirit_print_iter_;
+
+    using BaseClass::spirit_perform_linear_;
+    using BaseClass::spirit_perform_nonlinear_;
+    using BaseClass::spirit_parallel_imaging_lamda_;
+    using BaseClass::spirit_image_reg_lamda_;
+    using BaseClass::spirit_data_fidelity_lamda_;
+    using BaseClass::spirit_ncg_iter_max_;
+    using BaseClass::spirit_ncg_iter_thres_;
+    using BaseClass::spirit_ncg_scale_factor_;
+    using BaseClass::spirit_ncg_print_iter_;
+    using BaseClass::spirit_slep_iter_max_;
+    using BaseClass::spirit_slep_iter_thres_;
+    using BaseClass::spirit_slep_print_iter_;
+    using BaseClass::spirit_slep_keep_third_dimension_coeff_;
+    using BaseClass::spirit_slep_scale_factor_;
+    using BaseClass::spirit_use_coil_sen_map_;
+    using BaseClass::spirit_use_moco_enhancement_;
+    using BaseClass::spirit_recon_moco_images_;
+    using BaseClass::spirit_RO_enhancement_ratio_;
+    using BaseClass::spirit_E1_enhancement_ratio_;
+    using BaseClass::spirit_E2_enhancement_ratio_;
+    using BaseClass::spirit_temporal_enhancement_ratio_;
+
+    using BaseClass::job_split_by_S_;
+    using BaseClass::job_num_of_N_;
+    using BaseClass::job_max_Megabytes_;
+    using BaseClass::job_overlap_;
+
+    using BaseClass::partialFourier_algo_;
+    using BaseClass::partialFourier_homodyne_iters_;
+    using BaseClass::partialFourier_homodyne_thres_;
+    using BaseClass::partialFourier_homodyne_densityComp_;
+    using BaseClass::partialFourier_POCS_iters_;
+    using BaseClass::partialFourier_POCS_thres_;
+    using BaseClass::partialFourier_POCS_transitBand_;
+    using BaseClass::partialFourier_FengHuang_kSize_RO_;
+    using BaseClass::partialFourier_FengHuang_kSize_E1_;
+    using BaseClass::partialFourier_FengHuang_kSize_E2_;
+    using BaseClass::partialFourier_FengHuang_thresReg_;
+    using BaseClass::partialFourier_FengHuang_sameKernel_allN_;
+    using BaseClass::partialFourier_FengHuang_transitBand_;
+
+    using BaseClass::CloudComputing_;
+    using BaseClass::CloudSize_;
+    using BaseClass::gt_cloud_;
+
+    // for 2DT
+    using BaseClass::kernel_; // [RO E1 srcCHA dstCHA dstE1 1 or N S]
+    using BaseClass::kernelIm_; // [RO E1 srcCHA dstCHA 1 or N S]
+    using BaseClass::unmixingCoeffIm_; // [RO E1 srcCHA 1 or N S]
+    using BaseClass::coilCompressionCoef_; // [dstCHA srcCHA] matrices
+
+    // parameters to change the default behavior
+
+    // if true, the actual full kspace is computed, not only the coil combined complex images
+    bool recon_kspace_needed_;
+
+    // if true, no coil compression will be performed
+    bool coil_compression_;
+    // if true, the same coil compression coefficient is computed for all S
+    bool same_coil_compression_coeff_allS_;
+
+    // no acceleration
+    // if true, the average of all M ref will be used
+    // the coil sensitivity will only be estimated once for all N
+    bool no_acceleration_averageall_ref_;
+    // number of modes kept for ref data
+    int no_acceleration_ref_numOfModes_;
+    // if true, the same coil combination coefficients will be used for all S
+    bool no_acceleration_same_combinationcoeff_allS_;
+    // if no_acceleration_same_combinationcoeff_allS_==true, select the S for coil combination coefficient estimation
+    size_t no_acceleration_whichS_combinationcoeff_;
+
+    // embedded mode
+    // if true, the average of all M ref will be used
+    // the kernel/sensitivity will only be estimated once for all N
+    bool embedded_averageall_ref_;
+    // number of modes kept for ref data
+    int embedded_ref_numOfModes_;
+    // if true, the coil map will be estimated from the fullkspace_
+    bool embedded_fullres_coilmap_;
+    // if embedded_averageall_ref_==true && embedded_fullres_coilmap_==true, whether to select the highest signal frame to compute full res coil map
+    // if false, the averageall image will be used to compute full res coil map
+    bool embedded_fullres_coilmap_useHighestSignal_;
+    // if true, the same coil combination coefficients will be used for all S
+    bool embedded_same_combinationcoeff_allS_;
+    // if embedded_same_combinationcoeff_allS_==true, select the S for coil combination coefficient estimation
+    size_t embedded_whichS_combinationcoeff_;
+    // if true, the ref lines will be filled back to fullkspace
+    bool embedded_ref_fillback_;
+
+    // separate mode
+    // if true, the average of all M ref will be used
+    // the kernel/sensitivity will only be estimated once for every S
+    bool separate_averageall_ref_;
+    // number of modes kept for ref data
+    int separate_ref_numOfModes_;
+    // if true, the coil map will be estimated from the fullkspace_
+    bool separate_fullres_coilmap_;
+    // if true, the same coil combination coefficients will be used for all S
+    bool separate_same_combinationcoeff_allS_;
+    // if separate_same_combinationcoeff_allS_==true, select the S for coil combination coefficient estimation
+    size_t separate_whichS_combinationcoeff_;
+
+    // interleaved mode
+    // if true, the same coil combination coefficients will be used for all S
+    bool interleaved_same_combinationcoeff_allS_;
+    // if interleaved_same_combinationcoeff_allS_==true, select the S for coil combination coefficient estimation
+    size_t interleaved_whichS_combinationcoeff_;
+    // number of modes kept for ref data
+    int interleaved_ref_numOfModes_;
+};
+
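+// Example (editor's illustration; the values are hypothetical, and the CalibMode_
+// enum value shown is assumed rather than confirmed by this file): configuring an
+// embedded-mode 2DT work order that reuses one set of combination coefficients
+// for all S:
+//
+//     gtPlusReconWorkOrder2DT< std::complex<float> > wo2dt;
+//     wo2dt.CalibMode_                            = ISMRMRD_embedded; // assumed enum value
+//     wo2dt.embedded_averageall_ref_              = true;
+//     wo2dt.embedded_fullres_coilmap_             = true;
+//     wo2dt.embedded_same_combinationcoeff_allS_  = true;
+//     wo2dt.embedded_whichS_combinationcoeff_     = 0;
+//     wo2dt.embedded_ref_fillback_                = true;
+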
+template <typename T> 
+gtPlusReconWorkOrder2DT<T>::gtPlusReconWorkOrder2DT() : BaseClass()
+{
+    coil_compression_ = true;
+    same_coil_compression_coeff_allS_ = false;
+
+    no_acceleration_averageall_ref_ = true;
+    no_acceleration_ref_numOfModes_ = 3;
+    no_acceleration_same_combinationcoeff_allS_ = false;
+    no_acceleration_whichS_combinationcoeff_ = 0;
+
+    embedded_averageall_ref_ = false;
+    embedded_ref_numOfModes_ = 3;
+    embedded_fullres_coilmap_ = true;
+    embedded_fullres_coilmap_useHighestSignal_ = false;
+    embedded_same_combinationcoeff_allS_ = false;
+    embedded_whichS_combinationcoeff_ = 0;
+    embedded_ref_fillback_ = true;
+
+    separate_averageall_ref_ = false;
+    separate_ref_numOfModes_ = 3;
+    separate_fullres_coilmap_ = true;
+    separate_same_combinationcoeff_allS_ = false;
+    separate_whichS_combinationcoeff_ = 0;
+
+    interleaved_same_combinationcoeff_allS_ = false;
+    interleaved_whichS_combinationcoeff_ = 0;
+    interleaved_ref_numOfModes_ = 0;
+}
+
+template <typename T> 
+gtPlusReconWorkOrder2DT<T>::~gtPlusReconWorkOrder2DT()
+{
+}
+
+template <typename T> 
+bool gtPlusReconWorkOrder2DT<T>::reset()
+{
+    try
+    {
+        kernel_->clear();
+        kernelIm_->clear();
+        unmixingCoeffIm_->clear();
+        coilCompressionCoef_->clear();
+        coilMap_->clear();
+
+        fullkspace_.clear();
+        complexIm_.clear();
+        recon_time_stamp_.clear();
+        recon_physio_time_stamp_.clear();
+
+        fullkspace_second_.clear();
+        complexIm_second_.clear();
+        recon_time_stamp_second_.clear();
+        recon_physio_time_stamp_second_.clear();
+
+        gfactor_.clear();
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorkOrder2DT<T>::reset() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorkOrder2DT<T>::enforceConsistency(ISMRMRDDIM& lastDim)
+{
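+    // if the last dimension is the slice dimension, coil compression/combination
+    // coefficients should not be shared across S, so disable those options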
+    if ( lastDim == DIM_Slice )
+    {
+        same_coil_compression_coeff_allS_ = false;
+        no_acceleration_same_combinationcoeff_allS_ = false;
+        embedded_same_combinationcoeff_allS_ = false;
+        separate_same_combinationcoeff_allS_ = false;
+        interleaved_same_combinationcoeff_allS_ = false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+void gtPlusReconWorkOrder2DT<T>::duplicate(gtPlusReconWorkOrder2DT<T>& worder)
+{
+    BaseClass::duplicate(worder);
+
+    worder.recon_kspace_needed_ = recon_kspace_needed_;
+
+    worder.coil_compression_ = coil_compression_;
+    worder.same_coil_compression_coeff_allS_ = same_coil_compression_coeff_allS_;
+
+    worder.no_acceleration_averageall_ref_ = no_acceleration_averageall_ref_;
+    worder.no_acceleration_ref_numOfModes_ = no_acceleration_ref_numOfModes_;
+    worder.no_acceleration_same_combinationcoeff_allS_ = no_acceleration_same_combinationcoeff_allS_;
+    worder.no_acceleration_whichS_combinationcoeff_ = no_acceleration_whichS_combinationcoeff_;
+
+    worder.embedded_averageall_ref_ = embedded_averageall_ref_;
+    worder.embedded_ref_numOfModes_ = embedded_ref_numOfModes_;
+    worder.embedded_fullres_coilmap_ = embedded_fullres_coilmap_;
+    worder.embedded_fullres_coilmap_useHighestSignal_ = embedded_fullres_coilmap_useHighestSignal_;
+    worder.embedded_same_combinationcoeff_allS_ = embedded_same_combinationcoeff_allS_;
+    worder.embedded_whichS_combinationcoeff_ = embedded_whichS_combinationcoeff_;
+    worder.embedded_ref_fillback_ = embedded_ref_fillback_;
+
+    worder.separate_averageall_ref_ = separate_averageall_ref_;
+    worder.separate_ref_numOfModes_ = separate_ref_numOfModes_;
+    worder.separate_fullres_coilmap_ = separate_fullres_coilmap_;
+    worder.separate_same_combinationcoeff_allS_ = separate_same_combinationcoeff_allS_;
+    worder.separate_whichS_combinationcoeff_ = separate_whichS_combinationcoeff_;
+
+    worder.interleaved_same_combinationcoeff_allS_ = interleaved_same_combinationcoeff_allS_;
+    worder.interleaved_whichS_combinationcoeff_ = interleaved_whichS_combinationcoeff_;
+    worder.interleaved_ref_numOfModes_ = interleaved_ref_numOfModes_;
+}
+
+template <typename T> 
+void gtPlusReconWorkOrder2DT<T>::printInfo(std::ostream& os) const
+{
+    using namespace std;
+    BaseClass::printInfo(os);
+
+    GADGET_PARA_PRINT(recon_kspace_needed_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(coil_compression_);
+    GADGET_PARA_PRINT(same_coil_compression_coeff_allS_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(no_acceleration_averageall_ref_);
+    GADGET_PARA_PRINT(no_acceleration_ref_numOfModes_);
+    GADGET_PARA_PRINT(no_acceleration_same_combinationcoeff_allS_);
+    GADGET_PARA_PRINT(no_acceleration_whichS_combinationcoeff_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(embedded_averageall_ref_);
+    GADGET_PARA_PRINT(embedded_ref_numOfModes_);
+    GADGET_PARA_PRINT(embedded_fullres_coilmap_);
+    GADGET_PARA_PRINT(embedded_fullres_coilmap_useHighestSignal_);
+    GADGET_PARA_PRINT(embedded_same_combinationcoeff_allS_);
+    GADGET_PARA_PRINT(embedded_whichS_combinationcoeff_);
+    GADGET_PARA_PRINT(embedded_ref_fillback_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(separate_averageall_ref_);
+    GADGET_PARA_PRINT(separate_ref_numOfModes_);
+    GADGET_PARA_PRINT(separate_fullres_coilmap_);
+    GADGET_PARA_PRINT(separate_same_combinationcoeff_allS_);
+    GADGET_PARA_PRINT(separate_whichS_combinationcoeff_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(interleaved_same_combinationcoeff_allS_);
+    GADGET_PARA_PRINT(interleaved_whichS_combinationcoeff_);
+    GADGET_PARA_PRINT(interleaved_ref_numOfModes_);
+}
+
+template <typename T> 
+void gtPlusReconWorkOrder2DT<T>::print(std::ostream& os) const
+{
+    using namespace std;
+    os << "-------------- gtPlusReconWorkOrder2DT ---------------" << endl;
+    printInfo(os);
+    os << "------------------------------------------------------" << endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkOrder3DT.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkOrder3DT.h
new file mode 100644
index 0000000..97d1f6a
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkOrder3DT.h
@@ -0,0 +1,380 @@
+/** \file   gtPlusISMRMRDReconWorkOrder3DT.h
+    \brief  Define the GtPlus reconstruction workorder and parameters for 3DT reconstruction
+    \author Hui Xue
+*/
+
+#pragma once
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusReconWorkOrder3DT : public gtPlusReconWorkOrder<T>
+{
+public:
+
+    typedef gtPlusReconWorkOrder<T> BaseClass;
+
+    gtPlusReconWorkOrder3DT();
+    virtual ~gtPlusReconWorkOrder3DT();
+
+    // reset the status of work order
+    // all computed calibration/coil sensitivity results are deleted
+    virtual bool reset();
+
+    // check for and resolve inconsistencies in the work order
+    virtual bool enforceConsistency(ISMRMRDDIM& lastDim);
+
+    virtual void duplicate(gtPlusReconWorkOrder3DT<T>& worder);
+
+    virtual void printInfo(std::ostream& os) const;
+    virtual void print(std::ostream& os) const;
+
+    // kspace_: [RO E1 E2 CHA N], for 3D recon, N can be 1
+    // ref_: [RO E1 E2 CHA M], M can equal N, 1, or another value
+    // fullkspace_: [RO E1 E2 CHA N]
+    // complexIm_: [RO E1 E2 N], after coil combination
+    // coilMap_: [RO E1 E2 CHA 1 or N]
+    // gfactor_: [RO E1 E2 CHA 1 or N]
+
+    // the fifth dimension can be the temporal dimension or others
+
+    // default behavior
+    // a) the coil compression coefficients are computed once for all N
+
+    // embedded mode
+    // a) perform recon and estimate kernel/coil sensitivity for every 3D kspace [RO E1 E2 CHA]
+    // b) coil combination uses different coil maps for every N
+    // c) if the kspace recon is performed, the coil combination map is re-estimated from the full kspace for every 3D image
+    // d) the ref lines are filled back to fullkspace_
+
+    // separate mode
+    // a) perform recon and estimate kernel/coil sensitivity for every 3D kspace [RO E1 E2 CHA] if M==N
+    // b) if M==1, the kernel is only estimated once for all N
+    // c) coil combination uses different coil maps for every N
+    // d) if the kspace recon is performed, the coil combination map is re-estimated from the full kspace for every 3D image
+
+    // interleave
+    // a) the average-all ref is used
+    // b) kernel/coil sensitivity is estimated once for all N
+
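+    // Illustration (editor's note; the numbers are hypothetical): for a 3DT
+    // acquisition with RO=256, E1=128, E2=64, CHA=32 and N=1, data_ would be a
+    // [256 128 64 32 1] array; with one separate calibration scan, ref_ would be
+    // [256 128 64 32 1] as well (M==1).
+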
+    using BaseClass::data_;
+    using BaseClass::ref_;
+    using BaseClass::ref_recon_;
+    using BaseClass::ref_coil_map_;
+    using BaseClass::CalibMode_;
+    using BaseClass::InterleaveDim_;
+    using BaseClass::acceFactorE1_;
+    using BaseClass::acceFactorE2_;
+    using BaseClass::num_channels_res_;
+
+    using BaseClass::coilMap_;
+    using BaseClass::fullkspace_;
+    using BaseClass::complexIm_;
+    using BaseClass::recon_time_stamp_;
+    using BaseClass::recon_physio_time_stamp_;
+
+    using BaseClass::fullkspace_second_;
+    using BaseClass::complexIm_second_;
+    using BaseClass::recon_time_stamp_second_;
+    using BaseClass::recon_physio_time_stamp_second_;
+
+    using BaseClass::gfactor_;
+    using BaseClass::wrap_around_map_;
+
+    using BaseClass::upstream_coil_compression_;
+    using BaseClass::upstream_coil_compression_thres_;
+    using BaseClass::upstream_coil_compression_num_modesKept_;
+
+    using BaseClass::downstream_coil_compression_;
+    using BaseClass::coil_compression_thres_;
+    using BaseClass::coil_compression_num_modesKept_;
+    using BaseClass::csm_kSize_;
+    using BaseClass::csm_powermethod_num_;
+    using BaseClass::csm_true_3D_;
+    using BaseClass::csm_iter_num_;
+    using BaseClass::csm_iter_thres_;
+    using BaseClass::csm_use_gpu_;
+    using BaseClass::start_RO_;
+    using BaseClass::end_RO_;
+    using BaseClass::start_E1_;
+    using BaseClass::end_E1_;
+    using BaseClass::start_E2_;
+    using BaseClass::end_E2_;
+
+    using BaseClass::filterRO_;
+    using BaseClass::filterE1_;
+    using BaseClass::filterE2_;
+    using BaseClass::filterROE1_;
+    using BaseClass::filterROE1E2_;
+
+    using BaseClass::filterRO_ref_;
+    using BaseClass::filterE1_ref_;
+    using BaseClass::filterE2_ref_;
+    using BaseClass::filterROE1_ref_;
+    using BaseClass::filterROE1E2_ref_;
+
+    using BaseClass::filterRO_partialfourier_;
+    using BaseClass::filterE1_partialfourier_;
+    using BaseClass::filterE2_partialfourier_;
+    using BaseClass::filterROE1_partialfourier_;
+    using BaseClass::filterROE1E2_partialfourier_;
+
+    using BaseClass::recon_algorithm_;
+
+    using BaseClass::grappa_kSize_RO_;
+    using BaseClass::grappa_kSize_E1_;
+    using BaseClass::grappa_kSize_E2_;
+    using BaseClass::grappa_reg_lamda_;
+    using BaseClass::grappa_calib_over_determine_ratio_;
+    using BaseClass::grappa_use_gpu_;
+
+    using BaseClass::spirit_kSize_RO_;
+    using BaseClass::spirit_kSize_E1_;
+    using BaseClass::spirit_kSize_E2_;
+    using BaseClass::spirit_oSize_RO_;
+    using BaseClass::spirit_oSize_E1_;
+    using BaseClass::spirit_oSize_E2_;
+    using BaseClass::spirit_reg_lamda_;
+    using BaseClass::spirit_use_gpu_;
+    using BaseClass::spirit_iter_max_;
+    using BaseClass::spirit_iter_thres_;
+    using BaseClass::spirit_print_iter_;
+
+    using BaseClass::spirit_perform_linear_;
+    using BaseClass::spirit_perform_nonlinear_;
+    using BaseClass::spirit_parallel_imaging_lamda_;
+    using BaseClass::spirit_image_reg_lamda_;
+    using BaseClass::spirit_data_fidelity_lamda_;
+    using BaseClass::spirit_ncg_iter_max_;
+    using BaseClass::spirit_ncg_iter_thres_;
+    using BaseClass::spirit_ncg_scale_factor_;
+    using BaseClass::spirit_ncg_print_iter_;
+    using BaseClass::spirit_slep_iter_max_;
+    using BaseClass::spirit_slep_iter_thres_;
+    using BaseClass::spirit_slep_print_iter_;
+    using BaseClass::spirit_slep_keep_third_dimension_coeff_;
+    using BaseClass::spirit_slep_scale_factor_;
+    using BaseClass::spirit_use_coil_sen_map_;
+    using BaseClass::spirit_use_moco_enhancement_;
+    using BaseClass::spirit_recon_moco_images_;
+    using BaseClass::spirit_RO_enhancement_ratio_;
+    using BaseClass::spirit_E1_enhancement_ratio_;
+    using BaseClass::spirit_E2_enhancement_ratio_;
+    using BaseClass::spirit_temporal_enhancement_ratio_;
+
+    using BaseClass::job_split_by_S_;
+    using BaseClass::job_num_of_N_;
+    using BaseClass::job_max_Megabytes_;
+    using BaseClass::job_overlap_;
+
+    using BaseClass::partialFourier_algo_;
+    using BaseClass::partialFourier_homodyne_iters_;
+    using BaseClass::partialFourier_homodyne_thres_;
+    using BaseClass::partialFourier_homodyne_densityComp_;
+    using BaseClass::partialFourier_POCS_iters_;
+    using BaseClass::partialFourier_POCS_thres_;
+    using BaseClass::partialFourier_POCS_transitBand_;
+    using BaseClass::partialFourier_FengHuang_kSize_RO_;
+    using BaseClass::partialFourier_FengHuang_kSize_E1_;
+    using BaseClass::partialFourier_FengHuang_kSize_E2_;
+    using BaseClass::partialFourier_FengHuang_thresReg_;
+    using BaseClass::partialFourier_FengHuang_sameKernel_allN_;
+    using BaseClass::partialFourier_FengHuang_transitBand_;
+    using BaseClass::partialFourier_FengHuang_transitBand_E2_;
+
+    using BaseClass::CloudComputing_;
+    using BaseClass::CloudSize_;
+    using BaseClass::gt_cloud_;
+
+    using BaseClass::kernel_; // [RO E1 E2 srcCHA dstCHA dstRO dstE1 dstE2]
+    using BaseClass::kernelIm_; // [RO E1 E2 srcCHA dstCHA]
+    using BaseClass::unmixingCoeffIm_; // [RO E1 E2 srcCHA 1 or N]
+    using BaseClass::coilCompressionCoef_;
+
+    // parameters to change the default behavior
+
+    // if true, the actual full kspace is computed, not only the coil combined complex images
+    bool recon_kspace_needed_;
+
+    // if true, no coil compression will be performed
+    bool coil_compression_;
+    // if true, the same coil compression coefficient is computed for all N
+    bool same_coil_compression_coeff_allN_;
+
+    // no acceleration
+    // if true, the average of all M ref will be used
+    // the coil sensitivity will only be estimated once for all N
+    bool no_acceleration_averageall_ref_;
+    // if true, the same coil combination coefficients will be used for all N
+    bool no_acceleration_same_combinationcoeff_allN_;
+    // if no_acceleration_same_combinationcoeff_allN_==true, select the N for coil combination coefficient estimation
+    size_t no_acceleration_whichN_combinationcoeff_;
+
+    // embedded mode
+    // if true, the average of all M ref will be used
+    // the kernel/sensitivity will only be estimated once for all N
+    bool embedded_averageall_ref_;
+    // if true, the coil map will be estimated from the fullkspace_
+    bool embedded_fullres_coilmap_;
+    // if true, the same coil combination coefficients will be used for all N
+    bool embedded_same_combinationcoeff_allN_;
+    // if embedded_same_combinationcoeff_allN_==true, select the N for coil combination coefficient estimation
+    // if -1, the average-all N is used for coil combination
+    int embedded_whichN_combinationcoeff_;
+    // if true, the ref lines will be filled back to fullkspace
+    bool embedded_ref_fillback_;
+
+    // separate mode
+    // if true, the average of all M ref will be used
+    // the kernel/sensitivity will only be estimated once for all N
+    bool separate_averageall_ref_;
+    // if true, the coil map will be estimated from the fullkspace_
+    bool separate_fullres_coilmap_;
+    // if true, the same coil combination coefficients will be used for all N
+    bool separate_same_combinationcoeff_allN_;
+    // if separate_same_combinationcoeff_allN_==true, select the 3D kspace used for coil combination coefficient estimation
+    // if -1, the average-all N is used for coil combination
+    int separate_whichN_combinationcoeff_;
+
+    // interleaved mode
+};
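+
+// Usage sketch (illustrative only; the complex type is a hypothetical instantiation): callers
+// typically fill one work order and clone its parameters into per-job copies via duplicate(), e.g.
+//
+//     gtPlusReconWorkOrder3DT< std::complex<float> > wo, woJob;
+//     wo.recon_kspace_needed_ = true;
+//     // ... set data_, ref_, CalibMode_ and the mode flags above ...
+//     wo.duplicate(woJob);   // woJob now carries the same reconstruction flags/parameters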
+
+template <typename T> 
+gtPlusReconWorkOrder3DT<T>::gtPlusReconWorkOrder3DT() : BaseClass()
+{
+    recon_kspace_needed_ = false;
+    coil_compression_ = true;
+    same_coil_compression_coeff_allN_ = false;
+
+    no_acceleration_averageall_ref_ = false;
+    no_acceleration_same_combinationcoeff_allN_ = false;
+    no_acceleration_whichN_combinationcoeff_ = 0; // default to the first N
+
+    embedded_averageall_ref_ = false;
+    embedded_fullres_coilmap_ = true;
+    embedded_same_combinationcoeff_allN_ = false;
+    embedded_whichN_combinationcoeff_ = 0;
+    embedded_ref_fillback_ = true;
+
+    separate_averageall_ref_ = false;
+    separate_fullres_coilmap_ = true;
+    separate_same_combinationcoeff_allN_ = false;
+    separate_whichN_combinationcoeff_ = 0;
+}
+
+template <typename T> 
+gtPlusReconWorkOrder3DT<T>::~gtPlusReconWorkOrder3DT()
+{
+}
+
+template <typename T> 
+bool gtPlusReconWorkOrder3DT<T>::reset()
+{
+    try
+    {
+        kernel_->clear();
+        kernelIm_->clear();
+        unmixingCoeffIm_->clear();
+        coilCompressionCoef_->clear();
+        coilMap_->clear();
+
+        fullkspace_.clear();
+        complexIm_.clear();
+        recon_time_stamp_.clear();
+        recon_physio_time_stamp_.clear();
+
+        fullkspace_second_.clear();
+        complexIm_second_.clear();
+        recon_time_stamp_second_.clear();
+        recon_physio_time_stamp_second_.clear();
+
+        gfactor_.clear();
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorkOrder3DT<T>::reset() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorkOrder3DT<T>::enforceConsistency(ISMRMRDDIM& lastDim)
+{
+    if ( lastDim == DIM_Slice )
+    {
+        no_acceleration_averageall_ref_ = false;
+        no_acceleration_same_combinationcoeff_allN_ = false;
+
+        embedded_averageall_ref_ = false;
+        embedded_same_combinationcoeff_allN_ = false;
+
+        separate_averageall_ref_ = false;
+        separate_same_combinationcoeff_allN_ = false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+void gtPlusReconWorkOrder3DT<T>::duplicate(gtPlusReconWorkOrder3DT<T>& worder)
+{
+    BaseClass::duplicate(worder);
+
+    worder.recon_kspace_needed_ = recon_kspace_needed_;
+    worder.coil_compression_ = coil_compression_;
+    worder.same_coil_compression_coeff_allN_ = same_coil_compression_coeff_allN_;
+
+    worder.no_acceleration_averageall_ref_ = no_acceleration_averageall_ref_;
+    worder.no_acceleration_same_combinationcoeff_allN_ = no_acceleration_same_combinationcoeff_allN_;
+    worder.no_acceleration_whichN_combinationcoeff_ = no_acceleration_whichN_combinationcoeff_;
+
+    worder.embedded_averageall_ref_ = embedded_averageall_ref_;
+    worder.embedded_fullres_coilmap_ = embedded_fullres_coilmap_;
+    worder.embedded_same_combinationcoeff_allN_ = embedded_same_combinationcoeff_allN_;
+    worder.embedded_whichN_combinationcoeff_ = embedded_whichN_combinationcoeff_;
+    worder.embedded_ref_fillback_ = embedded_ref_fillback_;
+
+    worder.separate_averageall_ref_ = separate_averageall_ref_;
+    worder.separate_fullres_coilmap_ = separate_fullres_coilmap_;
+    worder.separate_same_combinationcoeff_allN_ = separate_same_combinationcoeff_allN_;
+    worder.separate_whichN_combinationcoeff_ = separate_whichN_combinationcoeff_;
+}
+
+template <typename T> 
+void gtPlusReconWorkOrder3DT<T>::printInfo(std::ostream& os) const
+{
+    using namespace std;
+    BaseClass::printInfo(os);
+
+    GADGET_PARA_PRINT(recon_kspace_needed_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(coil_compression_);
+    GADGET_PARA_PRINT(same_coil_compression_coeff_allN_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(no_acceleration_averageall_ref_);
+    GADGET_PARA_PRINT(no_acceleration_same_combinationcoeff_allN_);
+    GADGET_PARA_PRINT(no_acceleration_whichN_combinationcoeff_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(embedded_averageall_ref_);
+    GADGET_PARA_PRINT(embedded_fullres_coilmap_);
+    GADGET_PARA_PRINT(embedded_same_combinationcoeff_allN_);
+    GADGET_PARA_PRINT(embedded_whichN_combinationcoeff_);
+    GADGET_PARA_PRINT(embedded_ref_fillback_);
+    GDEBUG_STREAM("---------------------");
+    GADGET_PARA_PRINT(separate_averageall_ref_);
+    GADGET_PARA_PRINT(separate_fullres_coilmap_);
+    GADGET_PARA_PRINT(separate_same_combinationcoeff_allN_);
+    GADGET_PARA_PRINT(separate_whichN_combinationcoeff_);
+}
+
+template <typename T> 
+void gtPlusReconWorkOrder3DT<T>::print(std::ostream& os) const
+{
+    using namespace std;
+    os << "-------------- gtPlusReconWorkOrder3DT ---------------" << endl;
+    printInfo(os);
+    os << "------------------------------------------------------" << endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker.h
new file mode 100644
index 0000000..ad30281
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker.h
@@ -0,0 +1,615 @@
+/** \file   gtPlusISMRMRDReconWorker.h
+    \brief  Define the base class for the GtPlus worker for reconstruction
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "ismrmrd/ismrmrd.h"
+
+#include <string>
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorkOrder.h"
+#include "gtPlusCloudScheduler.h"
+
+#ifdef USE_OMP
+    #include "omp.h"
+#endif // USE_OMP
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+struct gtPlusReconJob2DT
+{
+    gtPlusReconWorkOrder<T> workOrder2DT;
+    hoNDArray<T> kspace;
+    hoNDArray<T> ker;
+    // hoNDArray<T> coilMap;
+
+    hoNDArray<T> complexIm;
+    hoNDArray<T> res;
+
+    size_t job_index_startN_;
+    size_t job_index_endN_;
+    size_t job_index_S_;
+
+    gtPlusReconJob2DT();
+    gtPlusReconJob2DT(const gtPlusReconJob2DT& job);
+
+    ~gtPlusReconJob2DT();
+
+    virtual bool serialize(char*& buf, size_t& len) const ;
+    virtual bool deserialize(char* buf, size_t& len);
+};
+
+template <typename T> 
+gtPlusReconJob2DT<T>::gtPlusReconJob2DT()
+{
+
+}
+
+template <typename T> 
+gtPlusReconJob2DT<T>::~gtPlusReconJob2DT()
+{
+
+}
+
+template <typename T> 
+gtPlusReconJob2DT<T>::gtPlusReconJob2DT(const gtPlusReconJob2DT& job)
+{
+    job.workOrder2DT.duplicate(workOrder2DT);
+    workOrder2DT.coilMap_ = job.workOrder2DT.coilMap_;
+    kspace = job.kspace;
+    ker = job.ker;
+    // coilMap = job.coilMap;
+    complexIm = job.complexIm;
+    res = job.res;
+    job_index_startN_ = job.job_index_startN_;
+    job_index_endN_ = job.job_index_endN_;
+    job_index_S_ = job.job_index_S_;
+}
+
+template <typename T> 
+bool gtPlusReconJob2DT<T>::serialize(char*& buf, size_t& len) const 
+{
+    char *bufKSpace(NULL), *bufKernel(NULL), *bufCoilMap(NULL), *bufComplexIm(NULL), *bufRes(NULL);
+    try
+    {
+        if ( buf != NULL ) delete[] buf;
+
+        // find the total len
+        gtPlusReconWorkOrderPara para;
+        para = this->workOrder2DT;
+
+        // buffer for kspace, kernel and coil map
+        size_t lenKSpace, lenKernel, lenCoilMap, lenComplexIm, lenRes;
+
+        GADGET_CHECK_THROW(kspace.serialize(bufKSpace, lenKSpace));
+        GADGET_CHECK_THROW(ker.serialize(bufKernel, lenKernel));
+
+        if ( workOrder2DT.coilMap_ )
+        {
+            GADGET_CHECK_THROW(workOrder2DT.coilMap_->serialize(bufCoilMap, lenCoilMap));
+        }
+        else
+        {
+            hoNDArray<T> coilMapDummy;
+            GADGET_CHECK_THROW(coilMapDummy.serialize(bufCoilMap, lenCoilMap));
+        }
+        GADGET_CHECK_THROW(complexIm.serialize(bufComplexIm, lenComplexIm));
+        GADGET_CHECK_THROW(res.serialize(bufRes, lenRes));
+
+        // total length
+        len = sizeof(gtPlusReconWorkOrderPara) + sizeof(size_t)*3 + lenKSpace + lenKernel + lenCoilMap + lenComplexIm + lenRes;
+
+        buf = new char[len];
+        GADGET_CHECK_RETURN_FALSE( buf != NULL );
+
+        size_t offset = 0, currLen=0;
+
+        currLen = sizeof(gtPlusReconWorkOrderPara);
+        memcpy(buf+offset, &para, currLen);
+        offset += currLen;
+
+        currLen = sizeof(size_t);
+        memcpy(buf+offset, &job_index_startN_, currLen);
+        offset += currLen;
+
+        currLen = sizeof(size_t);
+        memcpy(buf+offset, &job_index_endN_, currLen);
+        offset += currLen;
+
+        currLen = sizeof(size_t);
+        memcpy(buf+offset, &job_index_S_, currLen);
+        offset += currLen;
+
+        currLen = lenKSpace;
+        memcpy(buf+offset, bufKSpace, currLen);
+        offset += currLen;
+        delete [] bufKSpace;
+
+        currLen = lenKernel;
+        memcpy(buf+offset, bufKernel, currLen);
+        offset += currLen;
+        delete [] bufKernel;
+
+        currLen = lenCoilMap;
+        memcpy(buf+offset, bufCoilMap, currLen);
+        offset += currLen;
+        delete [] bufCoilMap;
+
+        currLen = lenComplexIm;
+        memcpy(buf+offset, bufComplexIm, currLen);
+        offset += currLen;
+        delete [] bufComplexIm;
+
+        currLen = lenRes;
+        memcpy(buf+offset, bufRes, currLen);
+        offset += currLen;
+        delete [] bufRes;
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors happened in gtPlusReconJob2DT<T>::serialize(...) ... ");
+
+        if ( bufKSpace != NULL ) delete [] bufKSpace;
+        if ( bufKernel != NULL ) delete [] bufKernel;
+        if ( bufCoilMap != NULL ) delete [] bufCoilMap;
+        if ( bufComplexIm != NULL ) delete [] bufComplexIm;
+        if ( bufRes != NULL ) delete [] bufRes;
+
+        return false;
+    }
+
+    return true;
+}
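+
+// Buffer layout produced by serialize() and consumed by deserialize() (summary of the code above):
+//
+//   [ gtPlusReconWorkOrderPara ][ size_t job_index_startN_ ][ size_t job_index_endN_ ][ size_t job_index_S_ ]
+//   [ kspace ][ ker ][ coil map (possibly an empty array) ][ complexIm ][ res ]
+//
+// Each array segment is written with hoNDArray<T>::serialize(), so deserialize() can recover the
+// segment lengths while walking the buffer.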
+
+template <typename T> 
+bool gtPlusReconJob2DT<T>::deserialize(char* buf, size_t& len)
+{
+    try
+    {
+        gtPlusReconWorkOrderPara para;
+        memcpy(&para, buf, sizeof(gtPlusReconWorkOrderPara));
+
+        workOrder2DT.copyFromPara(para);
+
+        size_t offset(sizeof(gtPlusReconWorkOrderPara)), currLen=0;
+
+        currLen = sizeof(size_t);
+        memcpy(&job_index_startN_, buf+offset, currLen);
+        offset += currLen;
+
+        currLen = sizeof(size_t);
+        memcpy(&job_index_endN_, buf+offset, currLen);
+        offset += currLen;
+
+        currLen = sizeof(size_t);
+        memcpy(&job_index_S_, buf+offset, currLen);
+        offset += currLen;
+
+        // kspace, kernel and coil map
+        GADGET_CHECK_RETURN_FALSE(kspace.deserialize(buf+offset, currLen));
+        offset += currLen;
+
+        GADGET_CHECK_RETURN_FALSE(ker.deserialize(buf+offset, currLen));
+        offset += currLen;
+
+        hoNDArray<T> coilMapDummy;
+        GADGET_CHECK_RETURN_FALSE(coilMapDummy.deserialize(buf+offset, currLen));
+        offset += currLen;
+
+        if ( coilMapDummy.get_number_of_elements() > 0 )
+        {
+            if ( workOrder2DT.coilMap_ )
+            {
+                *workOrder2DT.coilMap_ = coilMapDummy;
+            }
+            else
+            {
+                workOrder2DT.coilMap_ = boost::shared_ptr< hoNDArray<T> >( new hoNDArray<T>(coilMapDummy) );
+            }
+        }
+        else
+        {
+            if ( workOrder2DT.coilMap_ ) workOrder2DT.coilMap_->clear();
+        }
+
+        GADGET_CHECK_RETURN_FALSE(complexIm.deserialize(buf+offset, currLen));
+        offset += currLen;
+
+        GADGET_CHECK_RETURN_FALSE(res.deserialize(buf+offset, currLen));
+        offset += currLen;
+
+        // total length
+        len = offset;
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors happened in gtPlusReconJob2DT<T>::deserialize(...) ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+class gtPlusReconWorker
+{
+public:
+
+    typedef typename realType<T>::Type value_type;
+
+    gtPlusReconWorker() : partial_fourier_handling_(true), performTiming_(false), verbose_(false)
+    {
+        gt_timer1_.set_timing_in_destruction(false);
+        gt_timer2_.set_timing_in_destruction(false);
+        gt_timer3_.set_timing_in_destruction(false);
+    }
+
+    virtual ~gtPlusReconWorker() {}
+
+    virtual bool performRecon(gtPlusReconWorkOrder<T>* workOrder) = 0;
+
+    virtual bool performPartialFourierHandling(gtPlusReconWorkOrder<T>* /*workOrder*/) { return true; }
+
+    virtual bool autoReconParameter(gtPlusReconWorkOrder<T>* workOrder)
+    {
+        if ( workOrder == NULL ) return false;
+        return true;
+    }
+
+    // whether to apply partial Fourier processing
+    bool partial_fourier_handling_;
+
+    // clock for timing
+    Gadgetron::GadgetronTimer gt_timer1_;
+    Gadgetron::GadgetronTimer gt_timer2_;
+    Gadgetron::GadgetronTimer gt_timer3_;
+
+    bool performTiming_;
+
+    // exporter
+    Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+    // debug folder
+    std::string debugFolder_;
+
+    // verbose mode
+    bool verbose_;
+
+    // util
+    gtPlusISMRMRDReconUtil<T> gtPlus_util_;
+    gtPlusISMRMRDReconUtilComplex<T> gtPlus_util_cplx_;
+
+    // ----------------------------------------------------
+    // recon job splitter and combiner
+    // ----------------------------------------------------
+    // 2DT array, [RO E1 CHA N S]
+    // if splitByS is true, split jobs by each S
+    // if jobN > 0, every jobN 2D kspaces are assigned into one job
+    // if splitByS=false and jobN<=0, the jobMegaBytes is used to define the maximal size of every job 
+    // overlapN: the overlap along N dimension
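+    //
+    // e.g. (illustrative only): with N=10, splitByS=false, jobN=4 and overlapN=1, the logic below
+    // yields three jobs per S covering n = [0,4], [4,8] and [6,9]; frames covered by more than one
+    // job are summed and renormalized by the fillingTimes counter in combineReconJob.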
+    virtual bool splitReconJob(gtPlusReconWorkOrder<T>* workOrder2DT, hoNDArray<T>& kspace, hoNDArray<T>& ker, 
+                        bool splitByS, size_t jobN, size_t jobMegaBytes, size_t overlapN, 
+                        std::vector<gtPlusReconJob2DT<T> >& jobList);
+
+    virtual bool combineReconJob(gtPlusReconWorkOrder<T>* workOrder2DT, std::vector<gtPlusReconJob2DT<T> >& jobList, size_t N, size_t S);
+
+    virtual bool createAReconJob(gtPlusReconWorkOrder<T>* workOrder2DT, hoNDArray<T>& kspace, hoNDArray<T>& ker, 
+                            size_t startN, size_t endN, size_t indS, gtPlusReconJob2DT<T>& job);
+
+    // from the node computing power indexes, get the effective node number for job splitting
+    virtual bool computeEffectiveNodeNumberBasedOnComputingPowerIndex(gtPlusReconWorkOrder<T>* workOrder, size_t& numOfEffectiveNodes);
+
+    // estimate the job size, given the maximal memory usage for every job
+    virtual bool estimateJobSize(gtPlusReconWorkOrder<T>* workOrder, size_t maxNumOfBytesPerJob, size_t overlapBetweenJobs, size_t numOfNodes, size_t& jobSize) = 0;
+
+    // given the number of nodes in a cloud and corresponding computing power indexes, spread the jobs on the nodes
+    virtual bool scheduleJobForNodes(gtPlusReconWorkOrder<T>* workOrder2DT, size_t numOfJobs, std::vector<int>& nodeIdForJob);
+};
+
+template <typename T> 
+bool gtPlusReconWorker<T>::createAReconJob(gtPlusReconWorkOrder<T>* workOrder2DT, hoNDArray<T>& kspace, hoNDArray<T>& ker, 
+        size_t startN, size_t endN, size_t indS, gtPlusReconJob2DT<T>& job)
+{
+    try
+    {
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t N = kspace.get_size(3);
+        size_t S = kspace.get_size(4);
+
+        size_t kerRO = ker.get_size(0);
+        size_t kerE1 = ker.get_size(1);
+        size_t srcCHA = ker.get_size(2);
+        size_t dstCHA = ker.get_size(3);
+        size_t refN = ker.get_size(4);
+
+        size_t jobN = endN-startN+1;
+
+        job.kspace.create(RO, E1, srcCHA, jobN, 1);
+        memcpy(job.kspace.begin(), kspace.begin()+indS*RO*E1*srcCHA*N+startN*RO*E1*srcCHA, job.kspace.get_number_of_bytes());
+
+        if ( refN < N )
+        {
+            job.ker.create(kerRO, kerE1, srcCHA, dstCHA, refN, 1);
+            memcpy(job.ker.begin(), ker.begin()+indS*kerRO*kerE1*srcCHA*dstCHA*refN, job.ker.get_number_of_bytes());
+        }
+        else
+        {
+            job.ker.create(kerRO, kerE1, srcCHA, dstCHA, jobN, 1, ker.begin()+indS*kerRO*kerE1*srcCHA*dstCHA*refN+startN*kerRO*kerE1*srcCHA*dstCHA);
+        }
+
+        if ( workOrder2DT->coilMap_ && workOrder2DT->coilMap_->get_number_of_elements() > 0 )
+        {
+            if ( refN < N )
+            {
+                job.workOrder2DT.coilMap_ = boost::shared_ptr<hoNDArray<T> >(new hoNDArray<T>(RO, E1, dstCHA, workOrder2DT->coilMap_->begin()+indS*RO*E1*dstCHA*refN));
+            }
+            else
+            {
+                job.workOrder2DT.coilMap_ = boost::shared_ptr<hoNDArray<T> >(new hoNDArray<T>(RO, E1, dstCHA, jobN, workOrder2DT->coilMap_->begin()+indS*RO*E1*dstCHA*refN+startN*RO*E1*dstCHA));
+            }
+        }
+
+        job.job_index_startN_ = startN;
+        job.job_index_endN_ = endN;
+        job.job_index_S_ = indS;
+        workOrder2DT->duplicate(job.workOrder2DT);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker<T>::createAReconJob(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker<T>::splitReconJob(gtPlusReconWorkOrder<T>* workOrder2DT, hoNDArray<T>& kspace, hoNDArray<T>& ker, 
+        bool splitByS, size_t jobN, size_t jobMegaBytes, size_t overlapN, 
+        std::vector<gtPlusReconJob2DT<T> >& jobList)
+{
+    try
+    {
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t N = kspace.get_size(3);
+        size_t S = kspace.get_size(4);
+
+        size_t kerRO = ker.get_size(0);
+        size_t kerE1 = ker.get_size(1);
+        size_t srcCHA = ker.get_size(2);
+        size_t dstCHA = ker.get_size(3);
+        size_t refN = ker.get_size(4);
+
+        size_t s;
+        int startN, endN;
+
+        if ( splitByS )
+        {
+            jobList.resize(S);
+            startN = 0;
+            endN = (int)N-1;
+            for ( s=0; s<S; s++ )
+            {
+                GADGET_CHECK_RETURN_FALSE(createAReconJob(workOrder2DT, kspace, ker, startN, endN, s, jobList[s]));
+            }
+
+            return true;
+        }
+
+        if ( jobN > 0 )
+        {
+            if ( jobN < 2*overlapN ) jobN = 2*overlapN;
+        }
+        else if ( jobMegaBytes > 0 )
+        {
+            jobN = jobMegaBytes/(kerRO*kerE1*srcCHA*dstCHA*sizeof(T)/1024/1024);
+            if ( jobN < 2*overlapN ) jobN = 2*overlapN;
+        }
+
+        jobList.clear();
+
+        // find number of jobs
+        size_t numPerN=0;
+        startN = 0;
+        while ( startN < N )
+        {
+            endN = (int)(startN+jobN+overlapN-1);
+            numPerN++;
+
+            if ( endN >= N )
+            {
+                endN = (int)N-1;
+                break;
+            }
+
+            startN = endN-(int)overlapN+1;
+        }
+
+        jobList.resize(S*numPerN);
+
+        for ( s=0; s<S; s++ )
+        {
+
+            size_t num=0;
+            startN = 0;
+            while ( startN < N )
+            {
+                endN = (int)(startN+jobN+(int)overlapN-1);
+                num++;
+
+                if ( endN >= N )
+                {
+                    endN = (int)N-1;
+
+                    if ( endN-startN+1 < jobN )
+                    {
+                        startN = endN-(int)jobN+1;
+                        if ( startN < 0 ) startN = 0;
+                    }
+
+                    GADGET_CHECK_RETURN_FALSE(createAReconJob(workOrder2DT, kspace, ker, startN, endN, s, jobList[s*numPerN+num-1]));
+                    break;
+                }
+
+                GADGET_CHECK_RETURN_FALSE(createAReconJob(workOrder2DT, kspace, ker, startN, endN, s, jobList[s*numPerN+num-1]));
+
+                startN = endN-(int)overlapN+1;
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker<T>::splitReconJob(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker<T>::
+combineReconJob(gtPlusReconWorkOrder<T>* workOrder2DT, std::vector<gtPlusReconJob2DT<T> >& jobList, size_t N, size_t S)
+{
+    try
+    {
+        size_t RO = jobList[0].kspace.get_size(0);
+        size_t E1 = jobList[0].kspace.get_size(1);
+
+        size_t srcCHA = jobList[0].ker.get_size(2);
+        size_t dstCHA = jobList[0].ker.get_size(3);
+        size_t refN = jobList[0].ker.get_size(4);
+
+        workOrder2DT->complexIm_.create(RO, E1, N, S);
+        Gadgetron::clear(workOrder2DT->complexIm_);
+
+        workOrder2DT->fullkspace_.create(RO, E1, dstCHA, N, S);
+        Gadgetron::clear(workOrder2DT->fullkspace_);
+
+        size_t ii, n, s;
+
+        size_t numOfJobs = jobList.size();
+
+        ho2DArray<T> fillingTimes(N, S);
+        Gadgetron::clear(fillingTimes);
+
+        for ( ii=0; ii<numOfJobs; ii++ )
+        {
+            size_t startN = jobList[ii].job_index_startN_;
+            size_t endN = jobList[ii].job_index_endN_;
+            size_t indS = jobList[ii].job_index_S_;
+
+            if ( jobList[ii].complexIm.get_number_of_elements() > 0 )
+            {
+                hoNDArray<T> complexIm(RO, E1, endN-startN+1, workOrder2DT->complexIm_.begin()+indS*RO*E1*N+startN*RO*E1);
+                Gadgetron::add(jobList[ii].complexIm, complexIm, complexIm);
+            }
+
+            if ( jobList[ii].res.get_number_of_elements() > 0 )
+            {
+                hoNDArray<T> fullkspace(RO, E1, dstCHA, endN-startN+1, workOrder2DT->fullkspace_.begin()+indS*RO*E1*dstCHA*N+startN*RO*E1*dstCHA);
+                Gadgetron::add(jobList[ii].res, fullkspace, fullkspace);
+            }
+
+            for ( n=startN; n<=endN; n++ )
+            {
+                fillingTimes(n, indS) = fillingTimes(n, indS) + T(1.0);
+            }
+        }
+
+        for ( s=0; s<S; s++ )
+        {
+            for ( n=0; n<N; n++ )
+            {
+                if ( fillingTimes(n, s).real() > 1 )
+                {
+                    hoNDArray<T> complexIm(RO, E1, workOrder2DT->complexIm_.begin()+s*RO*E1*N+n*RO*E1);
+                    Gadgetron::scal( (value_type)(1.0)/fillingTimes(n, s).real(), complexIm);
+
+                    hoNDArray<T> fullkspace(RO, E1, dstCHA, workOrder2DT->fullkspace_.begin()+s*RO*E1*dstCHA*N+n*RO*E1*dstCHA);
+                    Gadgetron::scal( (value_type)(1.0)/fillingTimes(n, s).real(), fullkspace);
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker<T>::combineReconJob(gtPlusReconWorkOrder<T>* workOrder2DT, std::vector<gtPlusReconJob2DT<T> >& jobList, size_t N, size_t S) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker<T>::
+computeEffectiveNodeNumberBasedOnComputingPowerIndex(gtPlusReconWorkOrder<T>* workOrder, size_t& numOfEffectiveNodes)
+{
+    try
+    {
+        size_t numOfNodes = workOrder->gt_cloud_.size();
+        numOfEffectiveNodes = 0;
+
+        if ( numOfNodes == 0 )
+        {
+            GWARN_STREAM("numOfNodes == 0");
+            return true;
+        }
+
+        double minPowerIndex = workOrder->gt_cloud_[0].template get<3>();
+        double totalPowerIndex = minPowerIndex;
+
+        size_t ii;
+        for ( ii=1; ii<numOfNodes; ii++ )
+        {
+            totalPowerIndex += workOrder->gt_cloud_[ii].template get<3>();
+            if ( workOrder->gt_cloud_[ii].template get<3>() < minPowerIndex ) minPowerIndex = workOrder->gt_cloud_[ii].template get<3>();
+        }
+
+        numOfEffectiveNodes = (size_t)(std::floor(totalPowerIndex/minPowerIndex));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker<T>::computeEffectiveNodeNumberBasedOnComputingPowerIndex(gtPlusReconWorkOrder<T>* workOrder, size_t& numOfEffectiveNodes) ... ");
+        return false;
+    }
+
+    return true;
+}
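+
+// Worked example (illustrative): for three nodes with computing power indexes {1.0, 2.0, 2.0},
+// totalPowerIndex = 5.0 and minPowerIndex = 1.0, so numOfEffectiveNodes = floor(5.0/1.0) = 5,
+// i.e. the cloud is treated as five "units" of the weakest node for job splitting.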
+
+template <typename T> 
+bool gtPlusReconWorker<T>::
+scheduleJobForNodes(gtPlusReconWorkOrder<T>* workOrder, size_t numOfJobs, std::vector<int>& nodeIdForJob)
+{
+    try
+    {
+        size_t numOfNodes = workOrder->gt_cloud_.size();
+
+        gtPlusCloudScheduler scheduler;
+        scheduler.setNumOfJobs(numOfJobs);
+
+        std::vector<double> powerIndexes(numOfNodes);
+        for ( size_t ii=0; ii<numOfNodes; ii++ )
+        {
+            powerIndexes[ii] = workOrder->gt_cloud_[ii].template get<3>();
+        }
+
+        scheduler.setUpNodes(powerIndexes);
+
+        GADGET_CHECK_RETURN_FALSE(scheduler.schedulerJobs(nodeIdForJob));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker<T>::scheduleJobForNodes(gtPlusReconWorkOrder<T>* workOrder2DT, size_t numOfJobs, std::vector<int>& nodeIdForJob) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DT.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DT.h
new file mode 100644
index 0000000..e600765
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DT.h
@@ -0,0 +1,2825 @@
+/** \file   gtPlusISMRMRDReconWorker2DT.h
+    \brief  Define the base class for the GtPlus worker for 2DT reconstruction cases
+
+            Five different strategies are implemented for partial Fourier or asymmetric echo acquisition:
+
+            ISMRMRD_PF_ZEROFILLING          : only zero filling the unacquired k-space
+
+            ISMRMRD_PF_ZEROFILLING_FILTER   : zero filling the unacquired k-space and applying a transition filter on the edges between
+                                              acquired and unacquired regions
+
+            ISMRMRD_PF_HOMODYNE             : perform the iterative homodyne filter
+                                              Handbook of MRI Pulse Sequences. Page 556.
+                                              Matt A. Bernstein, Kevin F. King, Xiaohong Joe Zhou. 
+                                              Academic Press, ISBN-10: 0120928612.
+
+            ISMRMRD_PF_POCS                 : perform the iterative POCS reconstruction
+                                              Magnetic Resonance Imaging: Physical Principles and Sequence Design. Page 296-297.
+                                              E. Mark Haacke, Robert W. Brown, Michael R. Thompson, Ramesh Venkatesan. 
+                                              Wiley-Liss, ISBN-10: 0471351288.
+
+            ISMRMRD_PF_FENGHUANG            : perform a k-space convolution-based partial Fourier reconstruction. 
+                                              This is our recommendation for 2D and 2DT cases.
+
+                                              Feng Huang, Wei Lin, and Yu Li. 
+                                              Partial Fourier Reconstruction Through Data Fitting and Convolution in k-Space.
+                                              Magnetic Resonance in Medicine, Vol 62, pages 1261-1269, 2009.
+    \author Hui Xue
+*/
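+
+// Illustrative sketch (not a definitive usage): the strategy choice travels with the work order,
+// e.g. something like
+//
+//     workOrder2DT->partialFourier_algo_ = ISMRMRD_PF_FENGHUANG;
+//
+// before performPartialFourierHandling(...) is called; see the performPartialFourier*Recon()
+// routines declared further below.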
+
+#pragma once
+
+#include "gtPlusISMRMRDReconWorker.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusReconWorker2DT : public gtPlusReconWorker<T>
+{
+public:
+
+    typedef gtPlusReconWorker<T> BaseClass;
+    typedef typename realType<T>::Type value_type;
+
+    gtPlusReconWorker2DT() : BaseClass(), startE1_(0), endE1_(1024) {}
+    virtual ~gtPlusReconWorker2DT() {}
+
+    virtual bool performRecon(gtPlusReconWorkOrder<T>* workOrder)
+    {
+        // check whether we have all-zeros input
+        value_type v(1);
+        Gadgetron::norm2(workOrder->data_, v);
+        if ( v <= 0 )
+        {
+            GWARN_STREAM("gtPlusReconWorker2DT, performRecon(workOrder) : incoming data contains all-zeros ... ");
+
+            boost::shared_ptr< std::vector<size_t> > dims = workOrder->data_.get_dimensions();
+            (*dims)[2] = workOrder->num_channels_res_;
+            workOrder->complexIm_.create(dims);
+            Gadgetron::clear(workOrder->complexIm_);
+
+            return true;
+        }
+
+        gtPlusReconWorkOrder2DT<T>* workOrder2DT = dynamic_cast<gtPlusReconWorkOrder2DT<T>*>(workOrder);
+        if ( workOrder2DT == NULL ) return false;
+
+        if ( workOrder2DT->recon_auto_parameters_ )
+        {
+            this->autoReconParameter(workOrder2DT);
+            GDEBUG_STREAM("Gt Plus 2DT -- automatic parameter selection ---");
+            if ( !this->debugFolder_.empty() ) { workOrder2DT->print(std::cout); }
+        }
+
+        return this->performRecon(workOrder2DT);
+    }
+
+    // the common functionalities are performed here for 2DT recon
+    // compute the coil compression coefficients
+    // prepare the ref data array
+    virtual bool performRecon(gtPlusReconWorkOrder2DT<T>* workOrder2DT);
+
+    virtual bool estimateCoilMap(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, const hoNDArray<T>& ref_coil_map_dst);
+    virtual bool performCalib(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, const hoNDArray<T>& ref_coil_map_dst);
+    virtual bool performCalibPrep(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, gtPlusReconWorkOrder2DT<T>* workOrder2DT);
+    virtual bool performCalibImpl(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, gtPlusReconWorkOrder2DT<T>* workOrder2DT, size_t n, size_t usedS);
+
+    virtual bool performUnwrapping(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& data);
+
+    // the partial Fourier handling for the 2DT reconstruction
+    // the computation is performed on the reconstructed full kspace
+    virtual bool performPartialFourierHandling(gtPlusReconWorkOrder2DT<T>* workOrder2DT);
+
+    // perform the kspace filter on ref data for coil map estimation
+    virtual bool performRefFilter(gtPlusReconWorkOrder2DT<T>* workOrder2DT, 
+                                        const hoNDArray<T>& ref, hoNDArray<T>& refFiltered, 
+                                        int startRO, int endRO, int startE1, int endE1);
+
+    // for interleave, compute mean ref
+    // for embedded and separate, squeeze out the zero lines
+    virtual bool prepRef(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& ref, 
+                    hoNDArray<T>& refRecon, hoNDArray<T>& refCoilMap, 
+                    int startRO, int endRO, int startE1, int endE1, size_t dataE1);
+
+    // implement reference data preparation
+    virtual bool prepRefByAveragingCrossN(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& ref, bool averageAllRef, int numOfModes, hoNDArray<T>& refRecon);
+
+    // compute coil compression coefficients
+    virtual bool coilCompression(gtPlusReconWorkOrder2DT<T>* workOrder2DT);
+
+    // after unwrapping, for embedded and separate, the full res coil map may be estimated
+    // for embedded, the ref may be filled back to fullkspace
+    virtual bool afterUnwrapping(gtPlusReconWorkOrder2DT<T>* workOrder2DT);
+
+    // pick the frame with highest signal from the 2DT buffer
+    // data: [RO E1 CHA N S], res: [RO E1 CHA 1 S]
+    bool pickHighestSignalForN(const hoNDArray<T>& data, hoNDArray<T>& res);
+
+    // ----------------------------------------------------
+    // common functions for 2DT reconstruction
+    // ----------------------------------------------------
+    // image domain kernel with coil sensitivity
+    // kerIm: [RO E1 srcCHA dstCHA]
+    // coilMap: [RO E1 dstCHA]
+    // unmixCoeff: [RO E1 srcCHA]
+    // gFactor: [RO E1]
+    bool unmixCoeff(const hoNDArray<T>& kerIm, const hoNDArray<T>& coilMap, hoNDArray<T>& unmixCoeff, hoNDArray<T>& gFactor);
+
+    // apply image domain kernel
+    // kspace: [RO E1 srcCHA ...]
+    // complexIm : [RO E1 dstCHA ...]
+    bool applyImageDomainKernel(const hoNDArray<T>& kspace, const hoNDArray<T>& kerIm, hoNDArray<T>& complexIm);
+    // aliasedIm : [RO E1 srcCHA ...]
+    bool applyImageDomainKernelImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& kerIm, hoNDArray<T>& complexIm);
+    // for speed, a buffer can be provided
+    bool applyImageDomainKernelImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& kerIm, hoNDArray<T>& kerImBuffer, hoNDArray<T>& complexIm);
+
+    // apply unmixCoeff
+    // kspace: [RO E1 srcCHA ...]
+    // unmixCoeff : [RO E1 srcCHA]
+    // complexIm : [RO E1 ...]
+    bool applyUnmixCoeff(const hoNDArray<T>& kspace, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm);
+    // aliasedIm : [RO E1 srcCHA ...]
+    bool applyUnmixCoeffImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm);
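+
+    // Usage sketch (illustrative only, inside a worker method): once kerIm and a coil map are
+    // available, the unmixing coefficients and g-factor can be formed once and then applied to
+    // each frame, e.g.
+    //
+    //     hoNDArray<T> unmix, g, im;
+    //     unmixCoeff(kerIm, coilMap, unmix, g);
+    //     applyUnmixCoeffImage(aliasedIm, unmix, im);   // im: [RO E1 ...]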
+
+    // ----------------------------------------------------
+    // Partial Fourier handling for 2DT reconstruction
+    // ----------------------------------------------------
+    // apply the partial Fourier filter along the edges
+    bool performPartialFourierFilter(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace);
+    // apply the iterative homodyne filter for partial Fourier reconstruction
+    bool performPartialFourierHomodyneRecon(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace);
+    // apply the iterative POCS for partial Fourier reconstruction
+    bool performPartialFourierPOCSRecon(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace);
+    // apply the Feng Huang partial Fourier reconstruction
+    bool performPartialFourierFengHuangRecon(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace);
+
+    // compute Feng Huang kernel and perform recon
+    bool calibFengHuang(gtPlusReconWorkOrder2DT<T>& workOrder2DT, const hoNDArray<T>& src, const hoNDArray<T>& dst, ho6DArray<T>& kernel);
+    bool performReconFangHuang(gtPlusReconWorkOrder2DT<T>& workOrder2DT, const hoNDArray<T>& kspaceConj, hoNDArray<T>& kspace, int startRO, int endRO, int startE1, int endE1, ho6DArray<T>& kernel);
+
+    // estimate the job size, given the maximal memory usage for every job
+    virtual bool estimateJobSize(gtPlusReconWorkOrder<T>* workOrder, size_t maxNumOfBytesPerJob, size_t overlapBetweenJobs, size_t numOfNodes, size_t& jobSize);
+
+    using BaseClass::partial_fourier_handling_;
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::verbose_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_cplx_;
+
+protected:
+
+    // helper memory for computation
+    hoNDArray<T> buffer2DT_;
+    hoNDArray<T> buffer2DT_unwrapping_;
+    hoNDArray<T> buffer2DT_partial_fourier_;
+    hoNDArray<T> buffer2DT_partial_fourier_kspaceIter_;
+    hoNDArray<T> ref_src_;
+    hoNDArray<T> ref_dst_;
+    hoNDArray<T> data_dst_;
+    hoNDArray<T> ref_coil_map_dst_;
+
+    // sampled region along E1
+    size_t startE1_;
+    size_t endE1_;
+};
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::performRefFilter(gtPlusReconWorkOrder2DT<T>* workOrder2DT, 
+                                        const hoNDArray<T>& ref, hoNDArray<T>& refFiltered, 
+                                        int startRO, int endRO, int startE1, int endE1)
+{
+    try
+    {
+        refFiltered = ref;
+
+        size_t RO = ref.get_size(0);
+        size_t E1 = ref.get_size(1);
+
+        if ( workOrder2DT->filterROE1_ref_.get_size(0)==RO && workOrder2DT->filterROE1_ref_.get_size(1)==E1 )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterROE1(ref, workOrder2DT->filterROE1_ref_, refFiltered));
+        }
+        else if ( (workOrder2DT->filterRO_ref_.get_number_of_elements()==RO) && (workOrder2DT->filterE1_ref_.get_number_of_elements()==E1) )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterROE1(ref, workOrder2DT->filterRO_ref_, workOrder2DT->filterE1_ref_, refFiltered));
+        }
+        else
+        {
+            if ( (workOrder2DT->filterRO_ref_.get_number_of_elements()==RO) && (workOrder2DT->filterE1_ref_.get_number_of_elements()!=E1) )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterRO(ref, workOrder2DT->filterRO_ref_, refFiltered));
+            }
+
+            if ( (workOrder2DT->filterRO_ref_.get_number_of_elements()!=RO) && (workOrder2DT->filterE1_ref_.get_number_of_elements()==E1) )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterE1(ref, workOrder2DT->filterE1_ref_, refFiltered));
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DT<T>::performRefFilter(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::prepRefByAveragingCrossN(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& ref, bool averageAllRef, int numOfModes, hoNDArray<T>& refRecon)
+{
+    try
+    {
+        size_t RO = ref.get_size(0);
+        size_t E1 = ref.get_size(1);
+        size_t CHA = ref.get_size(2);
+        size_t N = ref.get_size(3);
+        size_t S = ref.get_size(4);
+
+        std::vector<size_t> sampledTimes;
+
+        if ( !averageAllRef && ( (numOfModes<1) || (numOfModes>N-1) ) )
+        {
+            refRecon = ref;
+        }
+        else if ( averageAllRef && ( (numOfModes<1) || (numOfModes>N-1) ) )
+        {
+            //GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace4D(ref, refRecon));
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace4D(ref, refRecon, sampledTimes));
+        }
+        else if ( averageAllRef && (numOfModes>=1) && (numOfModes<=N-1) )
+        {
+            hoNDArray<T> refKLF(RO, E1, CHA, N, S);
+
+            size_t s;
+            for ( s=0; s<S; s++ )
+            {
+                hoMatrix<T> A(RO*E1*CHA, N, const_cast<T*>(ref.begin()+s*RO*E1*CHA*N));
+                hoMatrix<T> A_KLF(RO*E1*CHA, N, refKLF.begin()+s*RO*E1*CHA*N);
+
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLFilter(A, numOfModes, A_KLF));
+            }
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(refKLF, debugFolder_+"refKLF"); }
+
+            //GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace4D(refKLF, refRecon));
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace4D(refKLF, refRecon, sampledTimes));
+        }
+        else if ( !averageAllRef && (numOfModes>=1) && (numOfModes<=N-1) )
+        {
+            refRecon.create(RO, E1, CHA, N, S);
+
+            size_t s;
+            for ( s=0; s<S; s++ )
+            {
+                hoMatrix<T> A(RO*E1*CHA, N, const_cast<T*>(ref.begin()+s*RO*E1*CHA*N));
+                hoMatrix<T> A_KLF(RO*E1*CHA, N, refRecon.begin()+s*RO*E1*CHA*N);
+
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLFilter(A, numOfModes, A_KLF));
+            }
+        }
+        else
+        {
+            refRecon = ref;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DT<T>::prepRefByAveragingCrossN(...) ... ");
+        return false;
+    }
+
+    return true;
+}
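+
+// Behavior summary of prepRefByAveragingCrossN (derived from the branches above):
+//   averageAllRef=false, numOfModes outside [1, N-1] : refRecon = ref (pass-through)
+//   averageAllRef=true,  numOfModes outside [1, N-1] : average the ref across N
+//   averageAllRef=true,  numOfModes in [1, N-1]      : KL filter across N, then average across N
+//   averageAllRef=false, numOfModes in [1, N-1]      : KL filter across N only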
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::prepRef(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& ref, 
+                    hoNDArray<T>& refRecon, hoNDArray<T>& refCoilMap, 
+                    int startRO, int endRO, int startE1, int endE1, size_t dataE1)
+{
+    try
+    {
+        size_t dataRO = workOrder2DT->data_.get_size(0);
+        size_t dataS = workOrder2DT->data_.get_size(4);
+
+        size_t RO = ref.get_size(0);
+        size_t E1 = ref.get_size(1);
+        size_t srcCHA = ref.get_size(2);
+        size_t N = ref.get_size(3);
+        size_t S = ref.get_size(4);
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(ref, debugFolder_+"ref"); }
+
+        if (workOrder2DT->CalibMode_ == ISMRMRD_noacceleration)
+        {
+            if ( workOrder2DT->no_acceleration_averageall_ref_ )
+            {
+                GADGET_CHECK_RETURN_FALSE(prepRefByAveragingCrossN(workOrder2DT, ref, workOrder2DT->no_acceleration_averageall_ref_, workOrder2DT->no_acceleration_ref_numOfModes_, refRecon));
+            }
+
+            GADGET_CHECK_RETURN_FALSE(performRefFilter(workOrder2DT, refRecon, refCoilMap, startRO, endRO, startE1, endE1));
+        }
+        else if ( workOrder2DT->CalibMode_ == ISMRMRD_interleaved )
+        {
+            GADGET_CHECK_RETURN_FALSE(prepRefByAveragingCrossN(workOrder2DT, ref, true, workOrder2DT->interleaved_ref_numOfModes_, refRecon));
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(refRecon, debugFolder_+"refRecon_interleaved"); }
+
+            GADGET_CHECK_RETURN_FALSE(performRefFilter(workOrder2DT, refRecon, refCoilMap, startRO, endRO, startE1, endE1));
+
+            if ( (startRO>=0 && endRO>0 && endRO>startRO) || (startE1>=0 && endE1>0 && endE1>startE1) )
+            {
+                std::vector<size_t> crop_offset(5), crop_size(5);
+
+                crop_offset[0] = 0;
+                crop_offset[1] = 0;
+                crop_offset[2] = 0;
+                crop_offset[3] = 0;
+                crop_offset[4] = 0;
+
+                crop_size[0] = RO;
+                crop_size[1] = E1;
+                crop_size[2] = refRecon.get_size(2);
+                crop_size[3] = refRecon.get_size(3);
+                crop_size[4] = refRecon.get_size(4);
+
+                if (startRO>=0 && endRO>0 && endRO>startRO)
+                {
+                    crop_offset[0] = startRO;
+                    crop_size[0] = endRO-startRO+1;
+                }
+
+                if (startE1>=0 && endE1>0 && endE1>startE1)
+                {
+                    crop_offset[1] = startE1;
+                    crop_size[1] = endE1-startE1+1;
+                }
+
+                hoNDArray<T> croppedRef;
+                GADGET_CHECK_RETURN_FALSE(cropUpTo11DArray(refRecon, croppedRef, crop_offset, crop_size));
+                refRecon = croppedRef;
+            }
+        }
+        else if ( workOrder2DT->CalibMode_ == ISMRMRD_embedded 
+                || workOrder2DT->CalibMode_ == ISMRMRD_separate 
+                || workOrder2DT->CalibMode_ == ISMRMRD_external )
+        {
+            if ( workOrder2DT->CalibMode_ == ISMRMRD_embedded )
+            {
+                refRecon = ref;
+            }
+
+            if ( workOrder2DT->CalibMode_ == ISMRMRD_separate )
+            {
+                GADGET_CHECK_RETURN_FALSE(prepRefByAveragingCrossN(workOrder2DT, ref, workOrder2DT->separate_averageall_ref_, workOrder2DT->separate_ref_numOfModes_, refRecon));
+            }
+
+            hoNDArray<typename realType<T>::Type> refMag(refRecon.get_dimensions()), refMagSum;
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::abs(refRecon, refMag));
+            /*GADGET_CHECK_RETURN_FALSE(sumOverLastDimension(refMag, refMagSum));
+            GADGET_CHECK_RETURN_FALSE(sumOverLastDimension(refMagSum, refMag));
+            GADGET_CHECK_RETURN_FALSE(sumOverLastDimension(refMag, refMagSum));*/
+
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(sum_over_dimension(refMag, refMagSum, refMag.get_number_of_dimensions()-1));
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(sum_over_dimension(refMagSum, refMag, refMagSum.get_number_of_dimensions() - 2));
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(sum_over_dimension(refMag, refMagSum, refMag.get_number_of_dimensions() - 3));
+
+            refMagSum.squeeze();
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<float>().detectSampledRegionE1(refMagSum, startE1_, endE1_));
+
+            std::vector<size_t> crop_offset(5);
+            crop_offset[0] = 0;
+            crop_offset[1] = startE1_;
+            crop_offset[2] = 0;
+            crop_offset[3] = 0;
+            crop_offset[4] = 0;
+
+            std::vector<size_t> crop_size(5);
+            crop_size[0] = refRecon.get_size(0);
+            crop_size[1] = endE1_-startE1_+1;
+            crop_size[2] = srcCHA;
+            crop_size[3] = refRecon.get_size(3);
+            crop_size[4] = refRecon.get_size(4);
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(refRecon, debugFolder_+"refRecon_beforeCrop"); }
+
+            if ( workOrder2DT->CalibMode_ == ISMRMRD_embedded )
+            {
+                hoNDArray<T> croppedRef;
+                GADGET_CHECK_RETURN_FALSE(cropUpTo11DArray(refRecon, croppedRef, crop_offset, crop_size));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(croppedRef, debugFolder_+"refRecon_afterCrop"); }
+
+                if ( workOrder2DT->recon_algorithm_ == ISMRMRD_SPIRIT 
+                    || workOrder2DT->recon_algorithm_ == ISMRMRD_L1SPIRIT 
+                    || workOrder2DT->recon_algorithm_ == ISMRMRD_L1SPIRIT_SLEP 
+                    || workOrder2DT->recon_algorithm_ == ISMRMRD_L1SPIRIT_SLEP_MOTION_COMP )
+                {
+                    // copy the ref into the data
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.copyAlongE1(refRecon, workOrder2DT->data_, startE1_, endE1_));
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder2DT->data_, debugFolder_+"data_copyAlongE1"); }
+                }
+
+                GADGET_CHECK_RETURN_FALSE(prepRefByAveragingCrossN(workOrder2DT, croppedRef, workOrder2DT->embedded_averageall_ref_, workOrder2DT->embedded_ref_numOfModes_, refRecon));
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(refRecon, debugFolder_+"refRecon_afterCrop_prepCrossN"); }
+
+                crop_size[3] = refRecon.get_size(3);
+
+                refCoilMap.create(RO, E1, srcCHA, refRecon.get_size(3), S);
+                GADGET_CHECK_RETURN_FALSE(setSubArrayUpTo11DArray(refRecon, refCoilMap, crop_offset, crop_size));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(refCoilMap, debugFolder_+"refCoilMap"); }
+
+                hoNDArray<T> refCoilMapTmp(refCoilMap);
+                GADGET_CHECK_RETURN_FALSE(performRefFilter(workOrder2DT, refCoilMapTmp, refCoilMap, startRO, endRO, startE1, endE1));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(refCoilMap, debugFolder_+"refCoilMap_filtered"); }
+
+                if ( refRecon.get_size(0) == RO )
+                {
+                    if ( startRO>=0 && endRO>0 && endRO>startRO && startRO<RO && endRO<RO )
+                    {
+                        crop_offset[0] = startRO;
+                        crop_size[0] = endRO-startRO+1;
+
+                        crop_offset[1] = 0;
+                        crop_size[1] = refRecon.get_size(1);
+                    }
+                }
+
+                GADGET_CHECK_RETURN_FALSE(cropUpTo11DArray(refRecon, croppedRef, crop_offset, crop_size));
+                refRecon = croppedRef;
+            }
+            else
+            {
+                hoNDArray<T> croppedRef;
+                GADGET_CHECK_RETURN_FALSE(cropUpTo11DArray(refRecon, croppedRef, crop_offset, crop_size));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(croppedRef, debugFolder_+"croppedRef"); }
+
+                GADGET_CHECK_RETURN_FALSE(performRefFilter(workOrder2DT, croppedRef, refCoilMap, startRO, endRO, startE1, endE1));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(refCoilMap, debugFolder_+"croppedRef_filtered"); }
+
+                refRecon = croppedRef;
+
+                // GADGET_CHECK_RETURN_FALSE(gtPlus_util_.zeropad2D(refCoilMap, dataRO, dataE1, croppedRef));
+                // GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::zeropad2D(refCoilMap, dataRO, dataE1, croppedRef));
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::pad(dataRO, dataE1, &refCoilMap, &croppedRef));
+                refCoilMap = croppedRef;
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(refCoilMap, debugFolder_+"refCoilMap"); }
+
+                if ( refRecon.get_size(0) == RO )
+                {
+                    if ( startRO>=0 && endRO>0 && endRO>startRO && startRO<RO && endRO<RO )
+                    {
+                        crop_offset[0] = startRO;
+                        crop_size[0] = endRO-startRO+1;
+
+                        crop_offset[1] = 0;
+                        crop_size[1] = refRecon.get_size(1);
+
+                        GADGET_CHECK_RETURN_FALSE(cropUpTo11DArray(refRecon, croppedRef, crop_offset, crop_size));
+                        refRecon = croppedRef;
+                    }
+                }
+            }
+
+            if ( S < dataS )
+            {
+                hoNDArray<T> refReconDataS(refRecon.get_size(0), refRecon.get_size(1), refRecon.get_size(2), refRecon.get_size(3), dataS);
+                hoNDArray<T> refCoilMapDataS(refCoilMap.get_size(0), refCoilMap.get_size(1), refCoilMap.get_size(2), refCoilMap.get_size(3), dataS);
+
+                memcpy(refReconDataS.begin(), refRecon.begin(), refRecon.get_number_of_bytes());
+                memcpy(refCoilMapDataS.begin(), refCoilMap.begin(), refCoilMap.get_number_of_bytes());
+
+                size_t refReconN4D = refRecon.get_size(0)*refRecon.get_size(1)*refRecon.get_size(2)*refRecon.get_size(3);
+                size_t refCoilMapN4D = refCoilMap.get_size(0)*refCoilMap.get_size(1)*refCoilMap.get_size(2)*refCoilMap.get_size(3);
+
+                size_t s;
+                for ( s=S; s<dataS; s++ )
+                {
+                    memcpy(refReconDataS.begin()+s*refReconN4D, refRecon.begin()+(S-1)*refReconN4D, sizeof(T)*refReconN4D);
+                    memcpy(refCoilMapDataS.begin()+s*refCoilMapN4D, refCoilMap.begin()+(S-1)*refCoilMapN4D, sizeof(T)*refCoilMapN4D);
+                }
+
+                refRecon = refReconDataS;
+                refCoilMap = refCoilMapDataS;
+            }
+        }
+        else
+        {
+            GERROR_STREAM("CalibMode is not supported in gtPlusReconWorker2DT<T>::prepRef(...) : " << workOrder2DT->CalibMode_);
+            return false;
+        }
+
+        // if the upstream coil compression is needed
+        if ( workOrder2DT->upstream_coil_compression_ )
+        {
+            if ( !debugFolder_.empty() ) { GDEBUG_STREAM("Upstream coil compression ... "); }
+
+            std::vector<hoMatrix<T> > upstreamCoilCoeffRef(workOrder2DT->ref_.get_size(4)), upstreamCoilCoeffRefRecon(refRecon.get_size(4)), upstreamCoilCoeffData(workOrder2DT->data_.get_size(4));
+
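+            // either compute one KL compression coefficient set from the S-averaged reference and reuse it for every S,
+            // or compute a set per S, keeping the number of modes determined by the first S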
+            if ( workOrder2DT->same_coil_compression_coeff_allS_ )
+            {
+                hoNDArray<T> aveAllS;
+
+                std::vector<size_t> allSDim(4);
+                allSDim[0] = refRecon.get_size(0);
+                allSDim[1] = refRecon.get_size(1);
+                allSDim[2] = refRecon.get_size(2);
+                allSDim[3] = refRecon.get_size(3)*refRecon.get_size(4);
+
+                hoNDArray<T> dataAllS(&allSDim, refRecon.begin(), false);
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace4D(dataAllS, aveAllS));
+
+                hoMatrix<T> coeff, eigenValues;
+                if ( workOrder2DT->coil_compression_num_modesKept_ > 0 )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(aveAllS, 
+                                workOrder2DT->upstream_coil_compression_num_modesKept_, coeff, eigenValues));
+                }
+                else
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(aveAllS, 
+                                workOrder2DT->upstream_coil_compression_thres_, coeff, eigenValues));
+                }
+
+                eigenValues.print(std::cout);
+                GDEBUG_STREAM("Upstream coil compression, number of channels kept is " << coeff.cols());
+
+                size_t n;
+                for ( n=0; n<upstreamCoilCoeffRef.size(); n++ )
+                {
+                    upstreamCoilCoeffRef[n] = coeff;
+                }
+
+                for ( n=0; n<upstreamCoilCoeffRefRecon.size(); n++ )
+                {
+                    upstreamCoilCoeffRefRecon[n] = coeff;
+                }
+
+                for ( n=0; n<upstreamCoilCoeffData.size(); n++ )
+                {
+                    upstreamCoilCoeffData[n] = coeff;
+                }
+            }
+            else
+            {
+                std::vector<size_t> allSDim(4);
+                allSDim[0] = refRecon.get_size(0);
+                allSDim[1] = refRecon.get_size(1);
+                allSDim[2] = refRecon.get_size(2);
+                allSDim[3] = refRecon.get_size(3);
+
+                size_t N_refRecon = allSDim[0]*allSDim[1]*allSDim[2]*allSDim[3];
+
+                size_t num_modesKept = srcCHA;
+
+                size_t s;
+                for ( s=0; s<refRecon.get_size(4); s++ )
+                {
+                    hoNDArray<T> dataCurrS(&allSDim, refRecon.begin()+s*N_refRecon, false);
+
+                    hoMatrix<T> coeff, eigenValues;
+
+                    if ( s == 0 )
+                    {
+                        if ( workOrder2DT->coil_compression_num_modesKept_ > 0 )
+                        {
+                            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(dataCurrS, 
+                                        workOrder2DT->upstream_coil_compression_num_modesKept_, coeff, eigenValues));
+                        }
+                        else
+                        {
+                            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(dataCurrS, 
+                                        workOrder2DT->upstream_coil_compression_thres_, coeff, eigenValues));
+                        }
+
+                        num_modesKept = coeff.get_size(1);
+                    }
+                    else
+                    {
+                        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(dataCurrS, 
+                                        (int)num_modesKept, coeff, eigenValues));
+                    }
+
+                    if ( !debugFolder_.empty() ) {  eigenValues.print(std::cout); }
+                    GDEBUG_STREAM("Upstream coil compression, number of channels kept is " << coeff.cols());
+
+                    if ( s < upstreamCoilCoeffRef.size() )
+                    {
+                        upstreamCoilCoeffRef[s] = coeff;
+                    }
+
+                    upstreamCoilCoeffRefRecon[s] = coeff;
+                    upstreamCoilCoeffData[s] = coeff;
+                }
+            }
+
+            // apply the coil compression
+            #ifdef USE_OMP
+                omp_set_nested(1);
+            #endif // USE_OMP
+
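+            // data_, ref_, refRecon and refCoilMap are compressed concurrently in the parallel sections below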
+            if ( performTiming_ ) { gt_timer2_.start("apply upstream coil compression ... "); }
+            #pragma omp parallel sections default(shared)
+            {
+
+                #pragma omp section
+                {
+                    //if ( performTiming_ ) { gt_timer2_.start("apply the coil compression on data ... "); }
+                    // GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder3DT->data_, upstreamCoilCoeffData, data_dst_, true));
+                    if ( performTiming_ ) { gt_timer3_.start("applyKLCoilCompressionCoeff ... "); }
+                    gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder2DT->data_, upstreamCoilCoeffData, data_dst_);
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    if ( performTiming_ ) { gt_timer3_.start("copy data ... "); }
+                    workOrder2DT->data_ = data_dst_;
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    //if ( performTiming_ ) { gt_timer2_.stop(); }
+                }
+
+                #pragma omp section
+                {
+                    //if ( performTiming_ ) { gt_timer2_.start("apply the coil compression on ref ... "); }
+                    //GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder3DT->ref_, upstreamCoilCoeff, ref_dst_, true));
+                    gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder2DT->ref_, upstreamCoilCoeffRef, ref_dst_);
+                    workOrder2DT->ref_ = ref_dst_;
+                    //if ( performTiming_ ) { gt_timer2_.stop(); }
+                }
+
+                #pragma omp section
+                {
+                    //if ( performTiming_ ) { gt_timer2_.start("apply the coil compression on refRecon ... "); }
+                    hoNDArray<T> refRecon_upstream;
+                    //GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(refRecon, upstreamCoilCoeff, refRecon_upstream, true));
+                    gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(refRecon, upstreamCoilCoeffRefRecon, refRecon_upstream);
+                    refRecon = refRecon_upstream;
+                    refRecon_upstream.clear();
+                    //if ( performTiming_ ) { gt_timer2_.stop(); }
+                }
+
+                #pragma omp section
+                {
+                    //if ( performTiming_ ) { gt_timer2_.start("apply the coil compression on ref for coil map ... "); }
+                    hoNDArray<T> refCoilMap_upstream;
+                    //GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(refCoilMap, upstreamCoilCoeff, refCoilMap_upstream, true));
+                    gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(refCoilMap, upstreamCoilCoeffRefRecon, refCoilMap_upstream);
+                    refCoilMap = refCoilMap_upstream;
+                    refCoilMap_upstream.clear();
+                    //if ( performTiming_ ) { gt_timer2_.stop(); }
+                }
+            }
+
+            if ( performTiming_ ) { gt_timer2_.stop(); }
+
+            #ifdef USE_OMP
+                omp_set_nested(0);
+            #endif // USE_OMP
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(refRecon, debugFolder_+"refRecon"); }
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(refCoilMap, debugFolder_+"refCoilMap"); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DT<T>::prepRef(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::coilCompression(gtPlusReconWorkOrder2DT<T>* workOrder2DT)
+{
+    // the 2DT recon on 5D array [RO E1 CHA N S]
+    try
+    {
+        size_t RO = workOrder2DT->ref_recon_.get_size(0);
+        size_t E1 = workOrder2DT->ref_recon_.get_size(1);
+        size_t srcCHA = workOrder2DT->ref_recon_.get_size(2);
+        size_t N = workOrder2DT->ref_recon_.get_size(3);
+        size_t S = workOrder2DT->ref_recon_.get_size(4);
+
+        size_t dataS = workOrder2DT->data_.get_size(4);
+
+        // if ( workOrder2DT->acceFactorE1_ == 1 ) return true;
+
+        // compute coil compression coeff
+        if ( workOrder2DT->coil_compression_ )
+        {
+            // check whether coil compression coeff has been preset
+            if ( workOrder2DT->coilCompressionCoef_->size()!=S )
+            {
+                if ( workOrder2DT->same_coil_compression_coeff_allS_ )
+                {
+                    hoNDArray<T> aveAllS;
+
+                    std::vector<size_t> allSDim(4);
+                    allSDim[0] = RO;
+                    allSDim[1] = E1;
+                    allSDim[2] = srcCHA;
+                    allSDim[3] = N*S;
+
+                    hoNDArray<T> dataAllS(&allSDim, workOrder2DT->ref_recon_.begin(), false);
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace4D(dataAllS, aveAllS));
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(aveAllS, debugFolder_+"aveAllS"); }
+
+                    hoMatrix<T> coeff, eigenValues;
+                    if ( workOrder2DT->coil_compression_num_modesKept_ > 0 )
+                    {
+                        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(aveAllS, 
+                                    workOrder2DT->coil_compression_num_modesKept_, coeff, eigenValues));
+                    }
+                    else
+                    {
+                        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(aveAllS, 
+                                    workOrder2DT->coil_compression_thres_, coeff, eigenValues));
+                    }
+
+                    workOrder2DT->coilCompressionCoef_->resize(dataS);
+
+                    size_t s;
+                    for ( s=0; s<dataS; s++ )
+                    {
+                        (*workOrder2DT->coilCompressionCoef_)[s] = coeff;
+                    }
+
+                    if ( !debugFolder_.empty() ) {  eigenValues.print(std::cout); }
+                    GDEBUG_STREAM("Coil compression, number of channels kept is " << coeff.cols());
+                }
+                else
+                {
+                    std::vector<size_t> allSDim(4);
+                    allSDim[0] = RO;
+                    allSDim[1] = E1;
+                    allSDim[2] = srcCHA;
+                    allSDim[3] = N;
+
+                    size_t num_modesKept = srcCHA;
+
+                    size_t s;
+                    for ( s=0; s<S; s++ )
+                    {
+                        hoNDArray<T> dataCurrS(&allSDim, workOrder2DT->ref_recon_.begin()+s*RO*E1*srcCHA*N, false);
+
+                        hoMatrix<T> coeff, eigenValues;
+
+                        if ( s == 0 )
+                        {
+                            if ( workOrder2DT->coil_compression_num_modesKept_ > 0 )
+                            {
+                                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(dataCurrS, 
+                                            workOrder2DT->coil_compression_num_modesKept_, coeff, eigenValues));
+                            }
+                            else
+                            {
+                                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(dataCurrS, 
+                                            workOrder2DT->coil_compression_thres_, coeff, eigenValues));
+                            }
+
+                            num_modesKept = coeff.get_size(1);
+                            workOrder2DT->coilCompressionCoef_->push_back(coeff);
+                        }
+                        else
+                        {
+                            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(dataCurrS, 
+                                            (int)num_modesKept, coeff, eigenValues));
+
+                            workOrder2DT->coilCompressionCoef_->push_back(coeff);
+                        }
+
+                        if ( !debugFolder_.empty() ) {  eigenValues.print(std::cout); }
+                        GDEBUG_STREAM("Coil compression, number of channels kept is " << coeff.cols());
+                    }
+
+                    if ( S < dataS )
+                    {
+                        std::vector<hoMatrix<T> > coilCompressionCoef(dataS);
+                        for ( s=0; s<S; s++ )
+                        {
+                            coilCompressionCoef[s] = (*workOrder2DT->coilCompressionCoef_)[s];
+                        }
+
+                        for ( s=S; s<dataS; s++ )
+                        {
+                            coilCompressionCoef[s] = (*workOrder2DT->coilCompressionCoef_)[S-1];
+                        }
+
+                        *(workOrder2DT->coilCompressionCoef_) = coilCompressionCoef;
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DT<T>::coilCompression(gtPlusReconWorkOrder2DT<T>* workOrder2DT) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::performRecon(gtPlusReconWorkOrder2DT<T>* workOrder2DT)
+{
+    // the 2DT recon on 5D array [RO E1 CHA N S]
+    try
+    {
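+        // without a buffered kernel, prepare the reference data and compute the coil compression coefficients first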
+        if ( !workOrder2DT->workFlow_use_BufferedKernel_ )
+        {
+            if ( performTiming_ ) { gt_timer1_.start("prepRef"); }
+            GADGET_CHECK_RETURN_FALSE(prepRef(workOrder2DT, workOrder2DT->ref_, workOrder2DT->ref_recon_, workOrder2DT->ref_coil_map_, 
+                        workOrder2DT->start_RO_, workOrder2DT->end_RO_, workOrder2DT->start_E1_, workOrder2DT->end_E1_, workOrder2DT->data_.get_size(1)));
+            if ( performTiming_ ) { gt_timer1_.stop(); }
+
+            if ( performTiming_ ) { gt_timer1_.start("coilCompression"); }
+            GADGET_CHECK_RETURN_FALSE(coilCompression(workOrder2DT));
+            if ( performTiming_ ) { gt_timer1_.stop(); }
+        }
+
+        // apply coil compression coefficients
+        if ( workOrder2DT->workFlow_use_BufferedKernel_ )
+        {
+            if ( workOrder2DT->coil_compression_ )
+            {
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder2DT->data_, debugFolder_+"data_"); }
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder2DT->data_, *workOrder2DT->coilCompressionCoef_, data_dst_));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(data_dst_, debugFolder_+"data_dst_"); }
+            }
+            else
+            {
+                data_dst_ = workOrder2DT->data_;
+            }
+        }
+        else
+        {
+            if ( workOrder2DT->coil_compression_ )
+            {
+                ref_src_ = workOrder2DT->ref_recon_;
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(ref_src_, debugFolder_+"ref_src_"); }
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(ref_src_, *workOrder2DT->coilCompressionCoef_, ref_dst_));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(ref_dst_, debugFolder_+"ref_dst_"); }
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder2DT->data_, debugFolder_+"data_"); }
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder2DT->data_, *workOrder2DT->coilCompressionCoef_, data_dst_));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(data_dst_, debugFolder_+"data_dst_"); }
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder2DT->ref_coil_map_, debugFolder_+"ref_coil_map_"); }
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder2DT->ref_coil_map_, *workOrder2DT->coilCompressionCoef_, ref_coil_map_dst_));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(ref_coil_map_dst_, debugFolder_+"ref_coil_map_dst_"); }
+
+                if ( !workOrder2DT->downstream_coil_compression_ 
+                    || workOrder2DT->recon_algorithm_==ISMRMRD_SPIRIT 
+                    || workOrder2DT->recon_algorithm_==ISMRMRD_L1SPIRIT 
+                    || workOrder2DT->recon_algorithm_==ISMRMRD_L1SPIRIT_SLEP 
+                    || workOrder2DT->recon_algorithm_==ISMRMRD_L1SPIRIT_SLEP_MOTION_COMP )
+                {
+                    ref_src_ = ref_dst_;
+                }
+            }
+            else
+            {
+                ref_src_ = workOrder2DT->ref_recon_;
+                ref_dst_ = workOrder2DT->ref_recon_;
+                data_dst_ = workOrder2DT->data_;
+                ref_coil_map_dst_ = workOrder2DT->ref_coil_map_;
+            }
+
+            if ( performTiming_ ) { gt_timer1_.start("estimateCoilMap"); }
+            GADGET_CHECK_RETURN_FALSE(this->estimateCoilMap(workOrder2DT, ref_src_, ref_dst_, ref_coil_map_dst_));
+            if ( performTiming_ ) { gt_timer1_.stop(); }
+
+            if ( performTiming_ ) { gt_timer1_.start("performCalib"); }
+            GADGET_CHECK_RETURN_FALSE(this->performCalib(workOrder2DT, ref_src_, ref_dst_, ref_coil_map_dst_));
+            if ( performTiming_ ) { gt_timer1_.stop(); }
+        }
+
+        if ( performTiming_ ) { gt_timer1_.start("performUnwrapping"); }
+        GADGET_CHECK_RETURN_FALSE(this->performUnwrapping(workOrder2DT, data_dst_));
+        if ( performTiming_ ) { gt_timer1_.stop(); }
+
+        if ( performTiming_ ) { gt_timer1_.start("afterUnwrapping"); }
+        GADGET_CHECK_RETURN_FALSE(this->afterUnwrapping(workOrder2DT));
+        if ( performTiming_ ) { gt_timer1_.stop(); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DT<T>::performRecon(gtPlusReconWorkOrder2DT<T>* workOrder2DT) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::
+estimateCoilMap(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, const hoNDArray<T>& ref_coil_map_dst)
+{
+    try
+    {
+        size_t RO = workOrder2DT->data_.get_size(0);
+        size_t E1 = workOrder2DT->data_.get_size(1);
+        size_t N = workOrder2DT->data_.get_size(3);
+        size_t S = workOrder2DT->data_.get_size(4);
+
+        size_t srcCHA = ref_src.get_size(2);
+
+        size_t refRO = ref_dst.get_size(0);
+        size_t refE1 = ref_dst.get_size(1);
+        size_t refN = ref_dst.get_size(3);
+        size_t dstCHA = ref_coil_map_dst.get_size(2);
+
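+        // determine from the calibration mode whether one set of combination coefficients is shared across all S, and which S supplies it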
+        bool same_combinationcoeff_allS = false;
+        size_t whichS_combinationcoeff = 0;
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_interleaved )
+        {
+            same_combinationcoeff_allS = workOrder2DT->interleaved_same_combinationcoeff_allS_;
+            whichS_combinationcoeff = workOrder2DT->interleaved_whichS_combinationcoeff_;
+        }
+
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_embedded )
+        {
+            same_combinationcoeff_allS = workOrder2DT->embedded_same_combinationcoeff_allS_;
+            whichS_combinationcoeff = workOrder2DT->embedded_whichS_combinationcoeff_;
+        }
+
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_separate )
+        {
+            same_combinationcoeff_allS = workOrder2DT->separate_same_combinationcoeff_allS_;
+            whichS_combinationcoeff = workOrder2DT->separate_whichS_combinationcoeff_;
+        }
+
+        if ( whichS_combinationcoeff >= S ) whichS_combinationcoeff=S-1;
+
+        // if the coil map has not been preset
+        if ( (workOrder2DT->coilMap_->get_size(0)!=RO) 
+            || (workOrder2DT->coilMap_->get_size(1)!=E1)
+            || (workOrder2DT->coilMap_->get_size(4)!=S) )
+        {
+            if ( same_combinationcoeff_allS )
+            {
+                size_t usedS = whichS_combinationcoeff;
+
+                hoNDArray<T> refCoilMapS(RO, E1, dstCHA, refN, const_cast<T*>(ref_coil_map_dst.begin()+usedS*RO*E1*dstCHA*refN));
+
+                workOrder2DT->coilMap_->create(RO, E1, dstCHA, refN, S);
+
+                hoNDArray<T> coilMapS(RO, E1, dstCHA, refN, workOrder2DT->coilMap_->begin()+usedS*RO*E1*dstCHA*refN);
+
+                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(refCoilMapS, buffer2DT_);
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIH(buffer2DT_, 
+                        coilMapS, workOrder2DT->coil_map_algorithm_, workOrder2DT->csm_kSize_, 
+                        workOrder2DT->csm_powermethod_num_, workOrder2DT->csm_iter_num_, (value_type)workOrder2DT->csm_iter_thres_, workOrder2DT->csm_use_gpu_));
+
+                GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder2DT->coilMap_, usedS));
+            }
+            else
+            {
+                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(ref_coil_map_dst, buffer2DT_);
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIH(buffer2DT_, 
+                        *workOrder2DT->coilMap_, workOrder2DT->coil_map_algorithm_, workOrder2DT->csm_kSize_, 
+                        workOrder2DT->csm_powermethod_num_, workOrder2DT->csm_iter_num_, (value_type)workOrder2DT->csm_iter_thres_, workOrder2DT->csm_use_gpu_));
+            }
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*workOrder2DT->coilMap_, debugFolder_+"coilMap_"); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DT<T>::estimateCoilMap(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::
+performCalib(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, const hoNDArray<T>& ref_coil_map_dst)
+{
+    try
+    {
+        size_t RO = workOrder2DT->data_.get_size(0);
+        size_t E1 = workOrder2DT->data_.get_size(1);
+        size_t N = workOrder2DT->data_.get_size(3);
+        size_t S = workOrder2DT->data_.get_size(4);
+
+        size_t srcCHA = ref_src.get_size(2);
+
+        size_t refRO = ref_dst.get_size(0);
+        size_t refE1 = ref_dst.get_size(1);
+        size_t refN = ref_dst.get_size(3);
+        size_t dstCHA = ref_coil_map_dst.get_size(2);
+
+        bool same_combinationcoeff_allS = false;
+        size_t whichS_combinationcoeff = 0;
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_interleaved )
+        {
+            same_combinationcoeff_allS = workOrder2DT->interleaved_same_combinationcoeff_allS_;
+            whichS_combinationcoeff = workOrder2DT->interleaved_whichS_combinationcoeff_;
+        }
+
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_embedded )
+        {
+            same_combinationcoeff_allS = workOrder2DT->embedded_same_combinationcoeff_allS_;
+            whichS_combinationcoeff = workOrder2DT->embedded_whichS_combinationcoeff_;
+        }
+
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_separate )
+        {
+            same_combinationcoeff_allS = workOrder2DT->separate_same_combinationcoeff_allS_;
+            whichS_combinationcoeff = workOrder2DT->separate_whichS_combinationcoeff_;
+        }
+
+        if ( whichS_combinationcoeff >= S ) whichS_combinationcoeff=S-1;
+
+        // calibration: recompute the calibration kernels only when the buffered kernel image does not match the current dimensions
+        if ( (workOrder2DT->kernelIm_->get_size(0)!=RO) 
+                || (workOrder2DT->kernelIm_->get_size(1)!=E1)
+                || (workOrder2DT->kernelIm_->get_size(2)!=srcCHA)
+                || (workOrder2DT->kernelIm_->get_size(3)!=dstCHA)
+                || (workOrder2DT->kernelIm_->get_size(5)!=S) )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->performCalibPrep(ref_src, ref_dst, workOrder2DT));
+
+            size_t n;
+
+            // perform calibration
+            if ( same_combinationcoeff_allS )
+            {
+                size_t usedS = whichS_combinationcoeff;
+
+                for ( n=0; n<refN; n++ )
+                {
+                    GADGET_CHECK_RETURN_FALSE(this->performCalibImpl(ref_src, ref_dst, workOrder2DT, n, usedS));
+                }
+
+                if ( S > 1 )
+                {
+                    GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder2DT->kernel_, usedS));
+                    GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder2DT->kernelIm_, usedS));
+                    if (workOrder2DT->unmixingCoeffIm_ && workOrder2DT->unmixingCoeffIm_->get_size(4) == S) GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder2DT->unmixingCoeffIm_, usedS));
+                    if ( workOrder2DT->gfactor_needed_ ) { GADGET_CHECK_RETURN_FALSE(repmatLastDimension(workOrder2DT->gfactor_, usedS)); }
+                    if ( workOrder2DT->wrap_around_map_needed_ ) { GADGET_CHECK_RETURN_FALSE(repmatLastDimension(workOrder2DT->wrap_around_map_, usedS)); }
+                }
+
+                if (!debugFolder_.empty())
+                {
+                    gt_exporter_.exportArrayComplex(workOrder2DT->gfactor_, debugFolder_ + "gfactor_after_calib");
+                }
+            }
+            else
+            {
+                int usedS;
+                #ifdef USE_OMP
+                    if ( S < omp_get_num_procs()/2 )
+                    {
+                        omp_set_nested(1);
+                        GDEBUG_STREAM("performCalib, nested omp is on ... ");
+                    }
+                #endif // USE_OMP
+
+                #pragma omp parallel for default(none) private(usedS) shared(S, refN, ref_src, ref_dst, workOrder2DT) if (S>1)
+                for ( usedS=0; usedS<(int)S; usedS++ )
+                {
+                    for ( size_t n=0; n<refN; n++ )
+                    {
+                        this->performCalibImpl(ref_src, ref_dst, workOrder2DT, n, usedS);
+                    }
+                }
+
+                #ifdef USE_OMP
+                    omp_set_nested(0);
+                #endif // USE_OMP
+
+                if (!debugFolder_.empty())
+                {
+                    gt_exporter_.exportArrayComplex(workOrder2DT->gfactor_, debugFolder_ + "gfactor_after_calib");
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DT<T>::performCalib(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::
+performCalibPrep(const hoNDArray<T>& , const hoNDArray<T>& , gtPlusReconWorkOrder2DT<T>* /*workOrder2DT*/)
+{
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::
+performCalibImpl(const hoNDArray<T>& , const hoNDArray<T>& , gtPlusReconWorkOrder2DT<T>* , size_t , size_t )
+{
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::performUnwrapping(gtPlusReconWorkOrder2DT<T>* , const hoNDArray<T>& )
+{
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::unmixCoeff(const hoNDArray<T>& kerIm, const hoNDArray<T>& coilMap, hoNDArray<T>& unmixCoeff, hoNDArray<T>& gFactor)
+{
+    try
+    {
+        size_t RO = kerIm.get_size(0);
+        size_t E1 = kerIm.get_size(1);
+        size_t srcCHA = kerIm.get_size(2);
+        size_t dstCHA = kerIm.get_size(3);
+
+        GADGET_CHECK_RETURN_FALSE(coilMap.get_size(0)==RO);
+        GADGET_CHECK_RETURN_FALSE(coilMap.get_size(1)==E1);
+        GADGET_CHECK_RETURN_FALSE(coilMap.get_size(2)==dstCHA);
+
+        unmixCoeff.create(RO, E1, srcCHA);
+        Gadgetron::clear(&unmixCoeff);
+
+        gFactor.create(RO, E1);
+        Gadgetron::clear(&gFactor);
+
+        int src;
+
+        T* pKerIm = const_cast<T*>(kerIm.begin());
+        T* pCoilMap = const_cast<T*>(coilMap.begin());
+        T* pCoeff = unmixCoeff.begin();
+
+        std::vector<size_t> dim(2);
+        dim[0] = RO;
+        dim[1] = E1;
+
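+        // per source channel, the unmixing coefficient is the sum over destination channels of the kernel image multiplied by the conjugated coil map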
+        #pragma omp parallel default(none) private(src) shared(RO, E1, srcCHA, dstCHA, pKerIm, pCoilMap, pCoeff, dim)
+        {
+            hoNDArray<T> coeff2D, coeffTmp(&dim);
+            hoNDArray<T> coilMap2D;
+            hoNDArray<T> kerIm2D;
+
+            #pragma omp for
+            for ( src=0; src<(int)srcCHA; src++ )
+            {
+                coeff2D.create(&dim, pCoeff+src*RO*E1);
+
+                for ( size_t dst=0; dst<dstCHA; dst++ )
+                {
+                    kerIm2D.create(&dim, pKerIm+src*RO*E1+dst*RO*E1*srcCHA);
+                    coilMap2D.create(&dim, pCoilMap+dst*RO*E1);
+                    Gadgetron::multiplyConj(kerIm2D, coilMap2D, coeffTmp);
+                    Gadgetron::add(coeff2D, coeffTmp, coeff2D);
+                }
+            }
+        }
+
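+        // g-factor = sqrt( sum over source channels of |unmixing coefficient|^2 )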
+        hoNDArray<T> conjUnmixCoeff(unmixCoeff);
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiplyConj(unmixCoeff, conjUnmixCoeff, conjUnmixCoeff));
+        // GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(conjUnmixCoeff, gFactor));
+
+        hoNDArray<T> gFactorBuf(RO, E1, 1, gFactor.begin());
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::sum_over_dimension(conjUnmixCoeff, gFactorBuf, 2));
+
+        // memcpy(gFactor.begin(), gFactorBuf.begin(), sizeof(T)*RO*E1);
+        Gadgetron::sqrt(gFactor, gFactor);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DT<T>::unmixCoeff(const hoNDArray<T>& kerIm, const hoNDArray<T>& coilMap, hoNDArray<T>& unmixCoeff, hoNDArray<T>& gFactor) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::applyImageDomainKernel(const hoNDArray<T>& kspace, const hoNDArray<T>& kerIm, hoNDArray<T>& complexIm)
+{
+    try
+    {
+        size_t RO = kerIm.get_size(0);
+        size_t E1 = kerIm.get_size(1);
+        size_t srcCHA = kerIm.get_size(2);
+        size_t dstCHA = kerIm.get_size(3);
+
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(0)==RO);
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(1)==E1);
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(2)==srcCHA);
+
+        buffer2DT_unwrapping_ = kspace;
+
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(kspace, buffer2DT_unwrapping_);
+
+        GADGET_CHECK_RETURN_FALSE(applyImageDomainKernelImage(buffer2DT_unwrapping_, kerIm, complexIm));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DT<T>::applyImageDomainKernel(const hoNDArray<T>& kspace, const hoNDArray<T>& kerIm, hoNDArray<T>& complexIm) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::applyImageDomainKernelImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& kerIm, hoNDArray<T>& complexIm)
+{
+    hoNDArray<T> buf4D(kerIm.get_dimensions());
+    return applyImageDomainKernelImage(aliasedIm, kerIm, buf4D, complexIm);
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::applyImageDomainKernelImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& kerIm, hoNDArray<T>& kerImBuffer, hoNDArray<T>& complexIm)
+{
+    try
+    {
+        size_t RO = kerIm.get_size(0);
+        size_t E1 = kerIm.get_size(1);
+        size_t srcCHA = kerIm.get_size(2);
+        size_t dstCHA = kerIm.get_size(3);
+
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(0)==RO);
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(1)==E1);
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(2)==srcCHA);
+
+        boost::shared_ptr< std::vector<size_t> > dim = aliasedIm.get_dimensions();
+
+        std::vector<size_t> dimIm(*dim);
+        dimIm[2] = dstCHA;
+
+        if ( !complexIm.dimensions_equal(&dimIm) )
+        {
+            complexIm.create(&dimIm);
+        }
+        Gadgetron::clear(&complexIm);
+
+        std::vector<size_t> dim3D(3);
+        dim3D[0] = RO;
+        dim3D[1] = E1;
+        dim3D[2] = srcCHA;
+
+        std::vector<size_t> dimIm3D(3);
+        dimIm3D[0] = RO;
+        dimIm3D[1] = E1;
+        dimIm3D[2] = dstCHA;
+
+        size_t num = aliasedIm.get_number_of_elements()/ (RO*E1*srcCHA);
+
+        int n;
+
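+        // for a few 2D frames, process them serially reusing kerImBuffer; otherwise parallelize over frames with per-thread buffers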
+        if ( num <= 8 )
+        {
+            if ( performTiming_ ) { gt_timer3_.start("apply image domain kernel image ... "); }
+            for ( n=0; n<(int)num; n++ )
+            {
+                hoNDArray<T> buf3D(&dim3D, const_cast<T*>(aliasedIm.begin()+n*RO*E1*srcCHA));
+                hoNDArray<T> bufIm3D(RO, E1, 1, dstCHA, complexIm.begin() + n*RO*E1*dstCHA);
+
+                Gadgetron::multiply(kerIm, buf3D, kerImBuffer);
+                Gadgetron::sum_over_dimension(kerImBuffer, bufIm3D, 2);
+            }
+            if ( performTiming_ ) { gt_timer3_.stop(); }
+        }
+        else
+        {
+            #pragma omp parallel default(none) private(n) shared(kerIm, num, dim3D, aliasedIm, RO, E1, srcCHA, dimIm3D, dstCHA, complexIm) 
+            {
+                hoNDArray<T> buf3D;
+                hoNDArray<T> bufIm3D;
+                hoNDArray<T> buf4D(kerIm.get_dimensions());
+
+                #pragma omp for
+                for ( n=0; n<(int)num; n++ )
+                {
+                    buf3D.create(&dim3D, const_cast<T*>(aliasedIm.begin()+n*RO*E1*srcCHA));
+                    bufIm3D.create(RO, E1, 1, dstCHA, complexIm.begin() + n*RO*E1*dstCHA);
+
+                    Gadgetron::multiply(kerIm, buf3D, buf4D);
+                    Gadgetron::sum_over_dimension(buf4D, bufIm3D, 2);
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DT<T>::applyImageDomainKernelImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& kerIm, hoNDArray<T>& kerImBuffer, hoNDArray<T>& complexIm) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::applyUnmixCoeff(const hoNDArray<T>& kspace, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(0)==unmixCoeff.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(1)==unmixCoeff.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(2)==unmixCoeff.get_size(2));
+
+        buffer2DT_unwrapping_ = kspace;
+
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(kspace, buffer2DT_unwrapping_);
+        GADGET_CHECK_RETURN_FALSE(applyUnmixCoeffImage(buffer2DT_unwrapping_, unmixCoeff, complexIm));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DT<T>::applyUnmixCoeff(const hoNDArray<T>& kspace, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::applyUnmixCoeffImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(0)==unmixCoeff.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(1)==unmixCoeff.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(2)==unmixCoeff.get_size(2));
+
+        boost::shared_ptr< std::vector<size_t> > dim = aliasedIm.get_dimensions();
+
+        std::vector<size_t> dimIm(*dim);
+        dimIm[2] = 1;
+
+        if ( !complexIm.dimensions_equal(&dimIm) )
+        {
+            complexIm.create(&dimIm);
+        }
+        Gadgetron::clear(&complexIm);
+
+        buffer2DT_unwrapping_ = aliasedIm;
+
+        Gadgetron::multiply(aliasedIm, unmixCoeff, buffer2DT_unwrapping_);
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(buffer2DT_unwrapping_, complexIm, 2));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DT<T>::applyUnmixCoeffImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::afterUnwrapping(gtPlusReconWorkOrder2DT<T>* workOrder2DT)
+{
+    try
+    {
+        bool fullres_coilmap = false;
+        bool fullres_coilmap_useHighestSignal = false;
+        bool ref_fillback = false;
+        bool averageallN_coilmap = false;
+        int numOfModesKept = 0;
+        bool same_coilmap_allS = false;
+        size_t whichS_coilmap = 0;
+
+        size_t RO = workOrder2DT->kernelIm_->get_size(0);
+        size_t E1 = workOrder2DT->kernelIm_->get_size(1);
+        size_t srcCHA = workOrder2DT->kernelIm_->get_size(2);
+        size_t dstCHA = workOrder2DT->kernelIm_->get_size(3);
+        size_t N = workOrder2DT->data_.get_size(3);
+        size_t S = workOrder2DT->data_.get_size(4);
+
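+        // resolve the post-processing options (full-resolution coil map, reference fill-back, N-averaged coil map) from the calibration mode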
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_noacceleration )
+        {
+            fullres_coilmap = false;
+            ref_fillback = false;
+        }
+
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_embedded )
+        {
+            if ( workOrder2DT->embedded_fullres_coilmap_ )
+            {
+                fullres_coilmap = true;
+                fullres_coilmap_useHighestSignal = workOrder2DT->embedded_fullres_coilmap_useHighestSignal_;
+            }
+
+            if ( workOrder2DT->embedded_ref_fillback_ 
+                && (workOrder2DT->recon_algorithm_!=ISMRMRD_SPIRIT) 
+                && (workOrder2DT->recon_algorithm_!=ISMRMRD_L1SPIRIT)
+                && (workOrder2DT->recon_algorithm_!=ISMRMRD_L1SPIRIT_SLEP)
+                && (workOrder2DT->recon_algorithm_!=ISMRMRD_L1SPIRIT_SLEP_MOTION_COMP) )
+            {
+                ref_fillback = true;
+            }
+
+            if ( workOrder2DT->embedded_averageall_ref_ )
+            {
+                averageallN_coilmap = true;
+            }
+
+            if ( workOrder2DT->embedded_same_combinationcoeff_allS_ )
+            {
+                same_coilmap_allS = true;
+                whichS_coilmap = workOrder2DT->embedded_whichS_combinationcoeff_;
+            }
+
+            numOfModesKept = workOrder2DT->embedded_ref_numOfModes_;
+        }
+
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_separate )
+        {
+            if ( workOrder2DT->separate_fullres_coilmap_ )
+            {
+                fullres_coilmap = true;
+            }
+
+            if ( workOrder2DT->separate_averageall_ref_ )
+            {
+                averageallN_coilmap = true;
+            }
+
+            if ( workOrder2DT->separate_same_combinationcoeff_allS_ )
+            {
+                same_coilmap_allS = true;
+                whichS_coilmap = workOrder2DT->separate_whichS_combinationcoeff_;
+            }
+
+            numOfModesKept = workOrder2DT->separate_ref_numOfModes_;
+        }
+
+        if ( whichS_coilmap >= S ) whichS_coilmap = S-1;
+
+        if ( ref_fillback )
+        {
+            GDEBUG_STREAM("Fill back the reference kspace lines to the reconstruction ");
+
+            hoNDArray<T> ref_dst;
+            if ( workOrder2DT->coil_compression_ )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.applyKLCoilCompressionCoeff(workOrder2DT->ref_, *workOrder2DT->coilCompressionCoef_, ref_dst));
+            }
+            else
+            {
+                ref_dst = workOrder2DT->ref_;
+            }
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(ref_dst, debugFolder_+"ref_dst"); }
+
+            if ( (ref_dst.get_size(2)==dstCHA) && (ref_dst.get_size(3)==N) && (ref_dst.get_size(4)==S) )
+            {
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder2DT->fullkspace_, debugFolder_+"fullkspace_"); }
+
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.copyAlongE1(ref_dst, workOrder2DT->fullkspace_, startE1_, endE1_));
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder2DT->fullkspace_, debugFolder_+"fullkspace_After"); }
+            }
+        }
+
+        // partial Fourier handling
+        if ( partial_fourier_handling_ )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->performPartialFourierHandling(workOrder2DT));
+        }
+
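+        // recompute the coil map from the full-resolution reconstructed kspace and repeat the coil combination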
+        if ( fullres_coilmap )
+        {
+            if ( performTiming_ ) { gt_timer2_.start("full res coil map : allocate buffer 2DT ...  "); }
+            hoNDArray<T> buffer2DT_Two(workOrder2DT->fullkspace_.get_dimensions());
+            if ( performTiming_ ) { gt_timer2_.stop(); }
+
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(workOrder2DT->fullkspace_, buffer2DT_, buffer2DT_Two);
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(buffer2DT_, debugFolder_+"ComplexIm_afterRefFill"); }
+
+            if ( averageallN_coilmap )
+            {
+                if ( workOrder2DT->workFlow_use_BufferedKernel_ && workOrder2DT->coilMap_->get_size(3)==1 && workOrder2DT->coilMap_->get_size(4)==S )
+                {
+                    size_t s;
+                    for ( s=0; s<S; s++ )
+                    {
+                        hoNDArray<T> coilMapS(RO, E1, dstCHA, workOrder2DT->coilMap_->begin()+s*RO*E1*dstCHA);
+                        hoNDArray<T> complexImS(RO, E1, dstCHA, N, buffer2DT_.begin()+s*RO*E1*dstCHA*N);
+                        hoNDArray<T> complexImCombinedS(RO, E1, N, workOrder2DT->complexIm_.begin()+s*RO*E1*N);
+
+                        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilCombine(complexImS, coilMapS, complexImCombinedS));
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(complexImCombinedS, debugFolder_+"complexImCombinedS"); }
+                    }
+                }
+                else
+                {
+                    workOrder2DT->coilMap_->create(RO, E1, dstCHA, 1, S);
+                    //Gadgetron::clear( *(workOrder2DT->coilMap_) );
+
+                    size_t s;
+
+                    if ( same_coilmap_allS )
+                    {
+                        hoNDArray<T> aveComplexImS(RO, E1, dstCHA, 1);
+                        //Gadgetron::clear(aveComplexImS);
+
+                        buffer2DT_unwrapping_.create(RO, E1, dstCHA, N);
+                        //Gadgetron::clear(aveComplexImS);
+
+                        hoMatrix<T> A(RO*E1*dstCHA, N, buffer2DT_.begin()+whichS_coilmap*RO*E1*dstCHA*N);
+                        hoMatrix<T> A_KLF(RO*E1*dstCHA, N, buffer2DT_unwrapping_.begin());
+
+                        if ( numOfModesKept>0 && numOfModesKept<dstCHA )
+                        {
+                            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLFilter(A, numOfModesKept, A_KLF));
+                        }
+                        else
+                        {
+                            memcpy(A_KLF.begin(), A.begin(), A_KLF.get_number_of_bytes());
+                        }
+
+                        if ( fullres_coilmap_useHighestSignal )
+                        {
+                            GADGET_CHECK_RETURN_FALSE(pickHighestSignalForN(buffer2DT_unwrapping_, aveComplexImS));
+                        }
+                        else
+                        {
+                            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace4D(buffer2DT_unwrapping_, aveComplexImS));
+                        }
+
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(aveComplexImS, debugFolder_+"aveComplexImS"); }
+
+                        hoNDArray<T> coilMapS(RO, E1, dstCHA, 1, workOrder2DT->coilMap_->begin()+whichS_coilmap*RO*E1*dstCHA);
+
+                        if ( performTiming_ ) { gt_timer2_.start("coilMap2DNIH ...  "); }
+                        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIH(aveComplexImS, coilMapS, workOrder2DT->coil_map_algorithm_, workOrder2DT->csm_kSize_, workOrder2DT->csm_powermethod_num_, workOrder2DT->csm_iter_num_, (value_type)workOrder2DT->csm_iter_thres_, workOrder2DT->csm_use_gpu_));
+                        if ( performTiming_ ) { gt_timer2_.stop(); }
+
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(coilMapS, debugFolder_+"coilMapS"); }
+
+                        GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder2DT->coilMap_, whichS_coilmap));
+
+                        for ( s=0; s<S; s++ )
+                        {
+                            hoNDArray<T> coilMapS(RO, E1, dstCHA, workOrder2DT->coilMap_->begin()+s*RO*E1*dstCHA);
+                            hoNDArray<T> complexImS(RO, E1, dstCHA, N, buffer2DT_.begin()+s*RO*E1*dstCHA*N);
+                            hoNDArray<T> complexImCombinedS(RO, E1, N, workOrder2DT->complexIm_.begin()+s*RO*E1*N);
+
+                            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilCombine(complexImS, coilMapS, complexImCombinedS));
+                            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(complexImCombinedS, debugFolder_+"complexImCombinedS"); }
+                        }
+                    }
+                    else
+                    {
+                        hoNDArray<T> aveComplexIm(RO, E1, dstCHA, 1, S);
+                        //Gadgetron::clear(aveComplexIm);
+
+                        buffer2DT_unwrapping_ = buffer2DT_;
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(buffer2DT_unwrapping_, debugFolder_+"buffer2DT_unwrapping"); }
+
+                        if ( numOfModesKept>0 && numOfModesKept<dstCHA )
+                        {
+                            for ( s=0; s<S; s++ )
+                            {
+                                hoMatrix<T> A(RO*E1*dstCHA, N, buffer2DT_.begin()+s*RO*E1*dstCHA*N);
+                                hoMatrix<T> A_KLF(RO*E1*dstCHA, N, buffer2DT_unwrapping_.begin()+s*RO*E1*dstCHA*N);
+
+                                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLFilter(A, numOfModesKept, A_KLF));
+                            }
+
+                            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(buffer2DT_unwrapping_, debugFolder_+"ComplexIm_KLF"); }
+                        }
+
+                        if ( fullres_coilmap_useHighestSignal )
+                        {
+                            GADGET_CHECK_RETURN_FALSE(pickHighestSignalForN(buffer2DT_unwrapping_, aveComplexIm));
+                        }
+                        else
+                        {
+                            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace4D(buffer2DT_unwrapping_, aveComplexIm));
+                        }
+
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(aveComplexIm, debugFolder_+"aveComplexIm"); }
+
+                        if ( performTiming_ ) { gt_timer2_.start("coilMap2DNIH ...  "); }
+
+                        gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIH(aveComplexIm, *workOrder2DT->coilMap_, workOrder2DT->coil_map_algorithm_, workOrder2DT->csm_kSize_, workOrder2DT->csm_powermethod_num_, workOrder2DT->csm_iter_num_, (value_type)workOrder2DT->csm_iter_thres_, workOrder2DT->csm_use_gpu_);
+
+                        gtPlusISMRMRDReconUtilComplex<T>().coilCombine(buffer2DT_, *workOrder2DT->coilMap_, workOrder2DT->complexIm_);
+
+                        //long long ss;
+                        //#pragma omp parallel for private(s) if (S>2)
+                        //for ( ss=0; ss<S; ss++ )
+                        //{
+                        //    hoNDArray<T> aveComplexImS(RO, E1, dstCHA, aveComplexIm.begin()+ss*RO*E1*dstCHA);
+                        //    hoNDArray<T> coilMapS(RO, E1, dstCHA, workOrder2DT->coilMap_->begin()+ss*RO*E1*dstCHA);
+
+                        //    //GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIH(aveComplexImS, coilMapS, workOrder2DT->coil_map_algorithm_, workOrder2DT->csm_kSize_, workOrder2DT->csm_powermethod_num_, workOrder2DT->csm_iter_num_, workOrder2DT->csm_iter_thres_));
+                        //    gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIH(aveComplexImS, coilMapS, workOrder2DT->coil_map_algorithm_, workOrder2DT->csm_kSize_, workOrder2DT->csm_powermethod_num_, workOrder2DT->csm_iter_num_, workOrder2DT->csm_iter_thres_);
+
+                        //    hoNDArray<T> complexImS(RO, E1, dstCHA, N, buffer2DT_.begin()+ss*RO*E1*dstCHA*N);
+                        //    hoNDArray<T> complexImCombinedS(RO, E1, N, workOrder2DT->complexIm_.begin()+ss*RO*E1*N);
+
+                        //    //GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilCombine(complexImS, coilMapS, complexImCombinedS));
+                        //    gtPlusISMRMRDReconUtilComplex<T>().coilCombine(complexImS, coilMapS, complexImCombinedS);
+                        //}
+                        if ( performTiming_ ) { gt_timer2_.stop(); }
+
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*workOrder2DT->coilMap_, debugFolder_+"coilMap_fullres"); }
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder2DT->complexIm_, debugFolder_+"complexImCombined"); }
+                    }
+                }
+            }
+            else
+            {
+                if ( workOrder2DT->workFlow_use_BufferedKernel_ && workOrder2DT->coilMap_->get_size(3)==N && workOrder2DT->coilMap_->get_size(4)==S )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilCombine(buffer2DT_, *workOrder2DT->coilMap_, workOrder2DT->complexIm_));
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder2DT->complexIm_, debugFolder_+"complexIm_"); }
+                }
+                else
+                {
+                    workOrder2DT->coilMap_->create(RO, E1, dstCHA, N, S);
+
+                    if ( performTiming_ ) { gt_timer2_.start("coilMap2DNIH ...  "); }
+                    if ( same_coilmap_allS )
+                    {
+                        hoNDArray<T> complexImS(RO, E1, dstCHA, N, buffer2DT_.begin()+whichS_coilmap*RO*E1*dstCHA*N);
+                        hoNDArray<T> coilMapS(RO, E1, dstCHA, N, workOrder2DT->coilMap_->begin()+whichS_coilmap*RO*E1*dstCHA*N);
+
+                        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIH(complexImS, coilMapS, workOrder2DT->coil_map_algorithm_, workOrder2DT->csm_kSize_, workOrder2DT->csm_powermethod_num_, workOrder2DT->csm_iter_num_, (value_type)workOrder2DT->csm_iter_thres_, workOrder2DT->csm_use_gpu_));
+                        GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder2DT->coilMap_, whichS_coilmap));
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*workOrder2DT->coilMap_, debugFolder_+"coilMap_fullres"); }
+                    }
+                    else
+                    {
+                        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIH(buffer2DT_, *workOrder2DT->coilMap_, workOrder2DT->coil_map_algorithm_, workOrder2DT->csm_kSize_, workOrder2DT->csm_powermethod_num_, workOrder2DT->csm_iter_num_, (value_type)workOrder2DT->csm_iter_thres_, workOrder2DT->csm_use_gpu_));
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*workOrder2DT->coilMap_, debugFolder_+"coilMap_fullres"); }
+                    }
+                    if ( performTiming_ ) { gt_timer2_.stop(); }
+
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilCombine(buffer2DT_, *workOrder2DT->coilMap_, workOrder2DT->complexIm_));
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder2DT->complexIm_, debugFolder_+"complexIm_"); }
+                }
+            }
+        }
+        else
+        {
+            if ( partial_fourier_handling_ )
+            {
+                bool partialFourierHandling = true;
+                if ( (workOrder2DT->start_RO_<0 || workOrder2DT->end_RO_<0 || (workOrder2DT->end_RO_-workOrder2DT->start_RO_+1==RO) ) 
+                        && (workOrder2DT->start_E1_<0 || workOrder2DT->end_E1_<0 || (workOrder2DT->end_E1_-workOrder2DT->start_E1_+1==E1) ) )
+                {
+                    partialFourierHandling = false;
+                }
+
+                // if the partial Fourier handling is used to compute an updated full kspace, the coil combination needs to be repeated
+                if ( partialFourierHandling )
+                {
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder2DT->complexIm_, debugFolder_+"complexIm_origin_noFullResCoilMap_"); }
+
+                    // if the partial Fourier handling is performed on the fullkspace, an extra coil combination is needed
+                    if ( workOrder2DT->CalibMode_ == ISMRMRD_noacceleration )
+                    {
+                        hoNDArray<T> buffer2DT_Two(workOrder2DT->data_.get_dimensions());
+                        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(workOrder2DT->data_, buffer2DT_, buffer2DT_Two);
+                        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilCombine(buffer2DT_, *workOrder2DT->coilMap_, workOrder2DT->complexIm_));
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder2DT->complexIm_, debugFolder_+"complexIm_noFullResCoilMap_"); }
+                    }
+                    else if ( workOrder2DT->fullkspace_.get_number_of_elements() > 0 )
+                    {
+                        if ( workOrder2DT->fullkspace_.get_size(2) == workOrder2DT->coilMap_->get_size(2) )
+                        {
+                            hoNDArray<T> buffer2DT_Two(workOrder2DT->fullkspace_.get_dimensions());
+                            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(workOrder2DT->fullkspace_, buffer2DT_, buffer2DT_Two);
+                            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilCombine(buffer2DT_, *workOrder2DT->coilMap_, workOrder2DT->complexIm_));
+                            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder2DT->complexIm_, debugFolder_+"complexIm_noFullResCoilMap_"); }
+                        }
+                        else if (workOrder2DT->fullkspace_.get_size(2) == 1) // if recon kspace is not required
+                        {
+                            hoNDArray<T> buffer2DT_ComplexIm(workOrder2DT->fullkspace_.get_dimensions());
+                            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(workOrder2DT->fullkspace_, buffer2DT_ComplexIm);
+                            memcpy(workOrder2DT->complexIm_.begin(), buffer2DT_ComplexIm.begin(), workOrder2DT->complexIm_.get_number_of_bytes());
+                            if (!debugFolder_.empty()) { gt_exporter_.exportArrayComplex(workOrder2DT->complexIm_, debugFolder_ + "complexIm_noFullResCoilMap_noReconKSpace_"); }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DT<T>::afterUnwrapping(gtPlusReconWorkOrder2DT<T>* workOrder2DT) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::pickHighestSignalForN(const hoNDArray<T>& data, hoNDArray<T>& res)
+{
+    try
+    {
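+        // for every s, keep only the single n whose RO x E1 x CHA volume has the largest
+        // L2 norm; the result therefore has a singleton N dimension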
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+        size_t CHA = data.get_size(2);
+        size_t N = data.get_size(3);
+        size_t S = data.get_size(4);
+
+        res.create(RO, E1, CHA, 1, S);
+
+        size_t s;
+        for ( s=0; s<S; s++ )
+        {
+            size_t maxInd=0;
+            typename realType<T>::Type maxNorm;
+
+            hoNDArray<T> data3D(RO, E1, CHA, const_cast<T*>(data.begin()+s*RO*E1*CHA*N));
+            Gadgetron::norm2(data3D, maxNorm);
+
+            size_t n;
+            for ( n=1; n<N; n++ )
+            {
+                data3D.create(RO, E1, CHA, const_cast<T*>(data.begin()+n*RO*E1*CHA+s*RO*E1*CHA*N));
+
+                typename realType<T>::Type currNorm;
+                Gadgetron::norm2(data3D, currNorm);
+
+                if ( maxNorm < currNorm )
+                {
+                    maxNorm = currNorm;
+                    maxInd = n;
+                }
+            }
+
+            // res has a singleton N dimension, so its per-s stride is RO*E1*CHA (not RO*E1*CHA*N)
+            memcpy(res.begin()+s*RO*E1*CHA, data.begin()+maxInd*RO*E1*CHA+s*RO*E1*CHA*N, sizeof(T)*RO*E1*CHA);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DT<T>::pickHighestSignalForN() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::performPartialFourierHandling(gtPlusReconWorkOrder2DT<T>* workOrder2DT)
+{
+    try
+    {
+        // compensate for partial Fourier sampling so that the SNR unit scaling is preserved
+        value_type partialFourierCompensationFactor = 1;
+
+        size_t RO = workOrder2DT->data_.get_size(0);
+        size_t E1 = workOrder2DT->data_.get_size(1);
+
+        if ( !( workOrder2DT->start_RO_<0 || workOrder2DT->end_RO_<0 || (workOrder2DT->end_RO_-workOrder2DT->start_RO_+1==RO) ) )
+        {
+            partialFourierCompensationFactor *= (value_type)(RO)/(value_type)(workOrder2DT->end_RO_-workOrder2DT->start_RO_+1);
+        }
+
+        if ( !( workOrder2DT->start_E1_<0 || workOrder2DT->end_E1_<0 || (workOrder2DT->end_E1_-workOrder2DT->start_E1_+1==E1) ) )
+        {
+            if ( workOrder2DT->end_E1_-workOrder2DT->start_E1_+1 <= E1 )
+            {
+                partialFourierCompensationFactor *= (value_type)(E1)/(value_type)(workOrder2DT->end_E1_-workOrder2DT->start_E1_+1);
+            }
+        }
+
+        partialFourierCompensationFactor = std::sqrt(partialFourierCompensationFactor);
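+        // illustrative example: a full readout with 192 of 256 E1 lines acquired (75% partial
+        // Fourier) gives a compensation factor of sqrt(256/192) ~= 1.155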
+        if ( performTiming_ ) { GDEBUG_STREAM("Partial fourier scaling factor : " << partialFourierCompensationFactor); }
+
+        if ( performTiming_ ) { GDEBUG_STREAM("Partial fourier algorithm : " << gtPlus_util_.getNameFromISMRMRDPartialFourierReconAlgo(workOrder2DT->partialFourier_algo_)); }
+
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_noacceleration )
+        {
+            if ( (workOrder2DT->partialFourier_algo_ == ISMRMRD_PF_ZEROFILLING || workOrder2DT->partialFourier_algo_ == ISMRMRD_PF_ZEROFILLING_FILTER) && (std::abs(partialFourierCompensationFactor-1)>FLT_EPSILON) )
+            {
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::scal(partialFourierCompensationFactor, workOrder2DT->data_));
+            }
+
+            if ( workOrder2DT->partialFourier_algo_ == ISMRMRD_PF_ZEROFILLING_FILTER )
+            {
+                GADGET_CHECK_RETURN_FALSE(performPartialFourierFilter(*workOrder2DT, workOrder2DT->data_));
+            }
+
+            if ( workOrder2DT->partialFourier_algo_ == ISMRMRD_PF_HOMODYNE )
+            {
+                GADGET_CHECK_RETURN_FALSE(performPartialFourierHomodyneRecon(*workOrder2DT, workOrder2DT->data_));
+            }
+
+            if ( workOrder2DT->partialFourier_algo_ == ISMRMRD_PF_POCS )
+            {
+                GADGET_CHECK_RETURN_FALSE(performPartialFourierPOCSRecon(*workOrder2DT, workOrder2DT->data_));
+            }
+
+            if ( workOrder2DT->partialFourier_algo_ == ISMRMRD_PF_FENGHUANG )
+            {
+                GADGET_CHECK_RETURN_FALSE(performPartialFourierFengHuangRecon(*workOrder2DT, workOrder2DT->data_));
+            }
+        }
+        else if ( workOrder2DT->fullkspace_.get_number_of_elements() > 0 )
+        {
+            if ( (workOrder2DT->partialFourier_algo_ == ISMRMRD_PF_ZEROFILLING || workOrder2DT->partialFourier_algo_ == ISMRMRD_PF_ZEROFILLING_FILTER) && (std::abs(partialFourierCompensationFactor-1)>FLT_EPSILON) )
+            {
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::scal(partialFourierCompensationFactor, workOrder2DT->fullkspace_));
+            }
+
+            if ( workOrder2DT->partialFourier_algo_ == ISMRMRD_PF_ZEROFILLING_FILTER )
+            {
+                GADGET_CHECK_RETURN_FALSE(performPartialFourierFilter(*workOrder2DT, workOrder2DT->fullkspace_));
+            }
+
+            if ( workOrder2DT->partialFourier_algo_ == ISMRMRD_PF_HOMODYNE )
+            {
+                GADGET_CHECK_RETURN_FALSE(performPartialFourierHomodyneRecon(*workOrder2DT, workOrder2DT->fullkspace_));
+            }
+
+            if ( workOrder2DT->partialFourier_algo_ == ISMRMRD_PF_POCS )
+            {
+                GADGET_CHECK_RETURN_FALSE(performPartialFourierPOCSRecon(*workOrder2DT, workOrder2DT->fullkspace_));
+            }
+
+            if ( workOrder2DT->partialFourier_algo_ == ISMRMRD_PF_FENGHUANG )
+            {
+                GADGET_CHECK_RETURN_FALSE(performPartialFourierFengHuangRecon(*workOrder2DT, workOrder2DT->fullkspace_));
+            }
+        }
+        else
+        {
+            // perform partial fourier handling on the complex images after coil combination
+            hoNDArray<T> kspace(workOrder2DT->complexIm_);
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(workOrder2DT->complexIm_, kspace);
+
+            if ( (workOrder2DT->partialFourier_algo_ == ISMRMRD_PF_ZEROFILLING || workOrder2DT->partialFourier_algo_ == ISMRMRD_PF_ZEROFILLING_FILTER) && (std::abs(partialFourierCompensationFactor-1)>FLT_EPSILON) )
+            {
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::scal(partialFourierCompensationFactor, kspace));
+            }
+
+            if ( workOrder2DT->partialFourier_algo_ == ISMRMRD_PF_ZEROFILLING_FILTER )
+            {
+                GADGET_CHECK_RETURN_FALSE(performPartialFourierFilter(*workOrder2DT, kspace));
+            }
+
+            if ( workOrder2DT->partialFourier_algo_ == ISMRMRD_PF_HOMODYNE )
+            {
+                GADGET_CHECK_RETURN_FALSE(performPartialFourierHomodyneRecon(*workOrder2DT, kspace));
+            }
+
+            if ( workOrder2DT->partialFourier_algo_ == ISMRMRD_PF_POCS )
+            {
+                GADGET_CHECK_RETURN_FALSE(performPartialFourierPOCSRecon(*workOrder2DT, kspace));
+            }
+
+            if ( workOrder2DT->partialFourier_algo_ == ISMRMRD_PF_FENGHUANG )
+            {
+                GADGET_CHECK_RETURN_FALSE(performPartialFourierFengHuangRecon(*workOrder2DT, kspace));
+            }
+
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(kspace, workOrder2DT->complexIm_);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DT<T>::performPartialFourierHandling(gtPlusReconWorkOrder2DT<T>* workOrder2DT) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::performPartialFourierFilter(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace)
+{
+    try
+    {
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+
+        // check whether partial fourier is used
+        if ( (workOrder2DT.start_RO_<0 || workOrder2DT.end_RO_<0 || (workOrder2DT.end_RO_-workOrder2DT.start_RO_+1==RO) ) 
+            && (workOrder2DT.start_E1_<0 || workOrder2DT.end_E1_<0 || (workOrder2DT.end_E1_-workOrder2DT.start_E1_+1==E1) ) )
+        {
+            return true;
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace_before_PF_Filter"); }
+
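+        // filter selection: prefer the combined 2D RO-E1 filter when its size matches the data;
+        // otherwise fall back to the separable RO and E1 filters, or to a single-dimension filter
+        // when only one of them matches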
+        if ( workOrder2DT.filterROE1_partialfourier_.get_size(0)==RO 
+                && workOrder2DT.filterROE1_partialfourier_.get_size(1)==E1 )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterROE1(kspace, workOrder2DT.filterROE1_partialfourier_, buffer2DT_partial_fourier_));
+            kspace = buffer2DT_partial_fourier_;
+        }
+
+        else if ( (workOrder2DT.filterRO_partialfourier_.get_number_of_elements() == RO) 
+                && (workOrder2DT.filterE1_partialfourier_.get_number_of_elements() == E1) )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterROE1(kspace, workOrder2DT.filterRO_partialfourier_, workOrder2DT.filterE1_partialfourier_, buffer2DT_partial_fourier_));
+            kspace = buffer2DT_partial_fourier_;
+        }
+
+        else
+        {
+            bool filterPerformed = false;
+
+            if ( (workOrder2DT.filterRO_partialfourier_.get_number_of_elements() == RO) 
+                    && (workOrder2DT.filterE1_partialfourier_.get_number_of_elements() != E1) )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterRO(kspace, workOrder2DT.filterRO_partialfourier_, buffer2DT_partial_fourier_));
+                filterPerformed = true;
+            }
+
+            if ( (workOrder2DT.filterRO_partialfourier_.get_number_of_elements() != RO) 
+                    && (workOrder2DT.filterE1_partialfourier_.get_number_of_elements() == E1) )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterE1(kspace, workOrder2DT.filterE1_partialfourier_, buffer2DT_partial_fourier_));
+                filterPerformed = true;
+            }
+
+            if ( filterPerformed )
+            {
+                kspace = buffer2DT_partial_fourier_;
+            }
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace_after_PF_Filter"); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DT<T>::performPartialFourierFilter(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::performPartialFourierHomodyneRecon(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace)
+{
+    try
+    {
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t CHA = kspace.get_size(2);
+        size_t N = kspace.get_size(3);
+        size_t S = kspace.get_size(4);
+
+        // check whether partial fourier is used
+        if ( (workOrder2DT.start_RO_<0 || workOrder2DT.end_RO_<0 || (workOrder2DT.end_RO_-workOrder2DT.start_RO_+1==RO) ) 
+            && (workOrder2DT.start_E1_<0 || workOrder2DT.end_E1_<0 || (workOrder2DT.end_E1_-workOrder2DT.start_E1_+1==E1) ) )
+        {
+            return true;
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace_before_homodyne"); }
+
+        // create kspace filter for homodyne phase estimation
+        ISMRMRDKSPACEFILTER filter_ref_type_ = ISMRMRD_FILTER_HANNING;
+        double filter_ref_sigma_ = 1.5;
+        double filter_ref_width_ = 0.15;
+
+        size_t startRO(0), endRO(RO-1);
+        hoNDArray<T> filterRO(RO);
+        if ( (workOrder2DT.start_RO_<0 || workOrder2DT.end_RO_<0) )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(RO, 0, RO-1, 
+                filterRO, filter_ref_type_, filter_ref_sigma_, (size_t)std::ceil(filter_ref_width_*RO)));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(RO, workOrder2DT.start_RO_, workOrder2DT.end_RO_, 
+                filterRO, filter_ref_type_, filter_ref_sigma_, (size_t)std::ceil(filter_ref_width_*RO)));
+
+            startRO = workOrder2DT.start_RO_;
+            endRO = workOrder2DT.end_RO_;
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(filterRO, debugFolder_+"filterRO_homodyne"); }
+
+        size_t startE1(0), endE1(E1-1);
+        hoNDArray<T> filterE1(E1);
+        if ( (workOrder2DT.start_E1_<0 || workOrder2DT.end_E1_<0) )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(E1, 0, E1-1, 
+                filterE1, filter_ref_type_, filter_ref_sigma_, (size_t)std::ceil(filter_ref_width_*E1)));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(E1, workOrder2DT.start_E1_, workOrder2DT.end_E1_, 
+                filterE1, filter_ref_type_, filter_ref_sigma_, (size_t)std::ceil(filter_ref_width_*E1)));
+
+            startE1 = workOrder2DT.start_E1_;
+            endE1 = workOrder2DT.end_E1_;
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(filterE1, debugFolder_+"filterE1_homodyne"); }
+
+        hoNDArray<T> kspaceIter(kspace.get_dimensions());
+        kspaceIter = kspace;
+        // store the filtered kspace
+        buffer2DT_partial_fourier_ = kspace;
+        // store the phase images
+        buffer2DT_ = kspace;
+        // magnitude of complex images
+        hoNDArray<typename realType<T>::Type> mag(kspace.get_dimensions());
+        hoNDArray<T> magComplex(kspace.get_dimensions());
+
+        // complex images
+        hoNDArray<T> complexIm(kspace.get_dimensions());
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(kspace, complexIm);
+
+        hoNDArray<T> complexImPrev(complexIm);
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"homodyne_kspace_beforeIteration"); }
+
+        size_t ii;
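+        // iterative homodyne: each pass estimates a low-resolution phase from the symmetrically
+        // filtered kspace, removes it from the current complex image and transforms back to
+        // kspace; the iteration stops once the relative image change falls below
+        // partialFourier_homodyne_thres_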
+        for ( ii=0; ii<workOrder2DT.partialFourier_homodyne_iters_; ii++ )
+        {
+            // kspace filter before phase extraction
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterROE1(kspaceIter, filterRO, filterE1, buffer2DT_partial_fourier_));
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(buffer2DT_partial_fourier_, debugFolder_+"homodyne_kspaceIter_afterFiltered"); }
+
+            // go to image domain
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(buffer2DT_partial_fourier_);
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(buffer2DT_partial_fourier_, debugFolder_+"homodyne_complexIm"); }
+
+            // get the phase
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::abs(buffer2DT_partial_fourier_, mag));
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::addEpsilon(mag));
+            GADGET_CHECK_RETURN_FALSE(magComplex.copyFrom(mag));
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::divide(buffer2DT_partial_fourier_, magComplex, buffer2DT_));
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(buffer2DT_, debugFolder_+"homodyne_phase"); }
+
+            // remove the phase from complex images
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::conjugate(buffer2DT_, buffer2DT_));
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(complexIm, buffer2DT_, complexIm));
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(complexIm, debugFolder_+"homodyne_complexIm_removePhase"); }
+
+            // go back to kspace
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(complexIm, kspaceIter);
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspaceIter, debugFolder_+"homodyne_complexIm_removePhase_kspace"); }
+
+            // compute threshold to stop the iteration
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::subtract(complexImPrev, complexIm, buffer2DT_));
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(buffer2DT_, debugFolder_+"homodyne_diff_complexIm"); }
+
+            typename realType<T>::Type diff, prev;
+            Gadgetron::norm2(complexImPrev, prev);
+            Gadgetron::norm2(buffer2DT_, diff);
+
+            typename realType<T>::Type thres = diff/prev;
+
+            if ( !debugFolder_.empty() )
+            {
+                GDEBUG_STREAM("Homodyne iter : " << ii << " - thres : " << thres << " ... ");
+            }
+
+            if ( thres < workOrder2DT.partialFourier_homodyne_thres_ )
+            {
+                break;
+            }
+
+            complexImPrev = complexIm;
+        }
+
+        // restore the acquired region
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspaceIter, debugFolder_+"kspaceIter_after_homodyne_beforeCopy"); }
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace_after_homodyne_beforeCopy"); }
+
+        if ( workOrder2DT.partialFourier_homodyne_densityComp_ )
+        {
+            size_t width_RO = (size_t)std::floor(0.1*RO);
+            size_t width_E1 = (size_t)std::floor(0.1*E1);
+
+            // compute PF filter for RO and E1
+            hoNDArray<T> filterPF_RO, filterPF_E1;
+
+            if ( workOrder2DT.start_RO_<0 || workOrder2DT.end_RO_<0 || (workOrder2DT.start_RO_==0 && workOrder2DT.end_RO_==RO-1) )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateAsymmetricFilter(RO, workOrder2DT.start_RO_, workOrder2DT.end_RO_, 
+                    filterPF_RO, ISMRMRD_FILTER_NONE, width_RO, true));
+            }
+            else
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateAsymmetricFilter(RO, workOrder2DT.start_RO_, workOrder2DT.end_RO_, 
+                    filterPF_RO, ISMRMRD_FILTER_TAPERED_HANNING, width_RO, true));
+            }
+
+            if ( workOrder2DT.start_E1_<0 || workOrder2DT.end_E1_<0 || (workOrder2DT.start_E1_==0 && workOrder2DT.end_E1_==E1-1) )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateAsymmetricFilter(E1, workOrder2DT.start_E1_, workOrder2DT.end_E1_, 
+                    filterPF_E1, ISMRMRD_FILTER_NONE, width_E1, true));
+            }
+            else
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateAsymmetricFilter(E1, workOrder2DT.start_E1_, workOrder2DT.end_E1_, 
+                    filterPF_E1, ISMRMRD_FILTER_TAPERED_HANNING, width_E1, true));
+            }
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(filterPF_RO, debugFolder_+"filterPF_RO_homodyne"); }
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(filterPF_E1, debugFolder_+"filterPF_E1_homodyne"); }
+
+            // build complementary filters for the homodyne-synthesized kspace so that, added to the
+            // partial Fourier filters applied to the acquired data, the combined kspace weighting is approximately flat
+            hoNDArray<T> filterPF_homodyne_RO(filterPF_RO), filterPF_homodyne_E1(filterPF_E1);
+
+            T midValue = filterPF_RO(RO/2);
+            for ( ii=0; ii<RO; ii++ )
+            {
+                if ( std::abs(filterPF_homodyne_RO(ii)) > std::abs(midValue) )
+                {
+                    filterPF_homodyne_RO(ii) = T(0.0);
+                }
+                else
+                {
+                    filterPF_homodyne_RO(ii) = midValue - filterPF_homodyne_RO(ii);
+                }
+            }
+
+            midValue = filterPF_E1(E1/2);
+            for ( ii=0; ii<E1; ii++ )
+            {
+                if ( std::abs(filterPF_homodyne_E1(ii)) > std::abs(midValue) )
+                {
+                    filterPF_homodyne_E1(ii) = T(0.0);
+                }
+                else
+                {
+                    filterPF_homodyne_E1(ii) = midValue - filterPF_homodyne_E1(ii);
+                }
+            }
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(filterPF_homodyne_RO, debugFolder_+"filterPF_homodyne_RO_homodyne"); }
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(filterPF_homodyne_E1, debugFolder_+"filterPF_homodyne_E1_homodyne"); }
+
+            T scaleFactor(1.0);
+            hoNDArray<T> filterPF;
+
+            if ( workOrder2DT.start_RO_<0 || workOrder2DT.end_RO_<0 || (workOrder2DT.start_RO_==0 && workOrder2DT.end_RO_==RO-1) )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterE1(kspace, filterPF_E1, kspace));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace_after_homodyne_PF_Filter"); }
+
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterE1(kspaceIter, filterPF_homodyne_E1, kspaceIter));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspaceIter, debugFolder_+"kspaceIter_after_homodyne_PF_Filter"); }
+
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::add(filterPF_E1, filterPF_homodyne_E1, filterPF));
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeFilterSNRUnitScaleFactor(filterPF, scaleFactor));
+            }
+            else if ( workOrder2DT.start_E1_<0 || workOrder2DT.end_E1_<0 || (workOrder2DT.start_E1_==0 && workOrder2DT.end_E1_==E1-1) )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterRO(kspace, filterPF_RO, kspace));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace_after_homodyne_PF_Filter"); }
+
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterRO(kspaceIter, filterPF_homodyne_RO, kspaceIter));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspaceIter, debugFolder_+"kspaceIter_after_homodyne_PF_Filter"); }
+
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::add(filterPF_RO, filterPF_homodyne_RO, filterPF));
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeFilterSNRUnitScaleFactor(filterPF, scaleFactor));
+            }
+            else
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterROE1(kspace, filterPF_RO, filterPF_E1, kspace));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace_after_homodyne_PF_Filter"); }
+
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterROE1(kspaceIter, filterPF_homodyne_RO, filterPF_homodyne_E1, kspaceIter));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspaceIter, debugFolder_+"kspaceIter_after_homodyne_PF_Filter"); }
+
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::add(filterPF_RO, filterPF_homodyne_RO, filterPF));
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeFilterSNRUnitScaleFactor(filterPF, scaleFactor));
+
+                T scaleFactorE1(1.0);
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::add(filterPF_E1, filterPF_homodyne_E1, filterPF));
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeFilterSNRUnitScaleFactor(filterPF, scaleFactorE1));
+
+                scaleFactor *= scaleFactorE1;
+            }
+
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::add(kspace, kspaceIter, kspace));
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::scal(scaleFactor, kspace));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.copyAlongROE1(kspace, kspaceIter, startRO, endRO, startE1, endE1));
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspaceIter, debugFolder_+"kspaceIter_after_homodyne_afterCopy"); }
+            kspace = kspaceIter;
+        }
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace_after_homodyne"); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DT<T>::performPartialFourierHomodyneRecon(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::performPartialFourierPOCSRecon(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace)
+{
+    try
+    {
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t CHA = kspace.get_size(2);
+        size_t N = kspace.get_size(3);
+        size_t S = kspace.get_size(4);
+
+        // check whether partial fourier is used
+        if ( (workOrder2DT.start_RO_<0 || workOrder2DT.end_RO_<0 || (workOrder2DT.end_RO_-workOrder2DT.start_RO_+1==RO) ) 
+            && (workOrder2DT.start_E1_<0 || workOrder2DT.end_E1_<0 || (workOrder2DT.end_E1_-workOrder2DT.start_E1_+1==E1) ) )
+        {
+            return true;
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace_before_POCS"); }
+
+        // create kspace filter for the low-resolution phase estimate used by POCS
+        ISMRMRDKSPACEFILTER filter_ref_type_ = ISMRMRD_FILTER_HANNING;
+        double filter_ref_sigma_ = 1.5;
+        double filter_ref_width_ = 0.15;
+
+        size_t startRO(0), endRO(RO-1);
+        hoNDArray<T> filterRO(RO);
+        if ( (workOrder2DT.start_RO_<0 || workOrder2DT.end_RO_<0) )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(RO, 0, RO-1, 
+                filterRO, filter_ref_type_, filter_ref_sigma_, (size_t)std::ceil(filter_ref_width_*RO)));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(RO, workOrder2DT.start_RO_, workOrder2DT.end_RO_, 
+                filterRO, filter_ref_type_, filter_ref_sigma_, (size_t)std::ceil(filter_ref_width_*RO)));
+
+            startRO = workOrder2DT.start_RO_;
+            endRO = workOrder2DT.end_RO_;
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(filterRO, debugFolder_+"filterRO_POCS"); }
+
+        size_t startE1(0), endE1(E1-1);
+        hoNDArray<T> filterE1(E1);
+        if ( (workOrder2DT.start_E1_<0 || workOrder2DT.end_E1_<0) )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(E1, 0, E1-1, 
+                filterE1, filter_ref_type_, filter_ref_sigma_, (size_t)std::ceil(filter_ref_width_*E1)));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(E1, workOrder2DT.start_E1_, workOrder2DT.end_E1_, 
+                filterE1, filter_ref_type_, filter_ref_sigma_, (size_t)std::ceil(filter_ref_width_*E1)));
+
+            startE1 = workOrder2DT.start_E1_;
+            endE1 = workOrder2DT.end_E1_;
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(filterE1, debugFolder_+"filterE1_POCS"); }
+
+        hoNDArray<T> kspaceIter(kspace);
+        // magnitude of complex images
+        hoNDArray<typename realType<T>::Type> mag(kspace.get_dimensions());
+        hoNDArray<T> magComplex(kspace.get_dimensions());
+
+        // kspace filter
+        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterROE1(kspaceIter, filterRO, filterE1, buffer2DT_partial_fourier_));
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(buffer2DT_partial_fourier_, debugFolder_+"POCS_afterFiltered"); }
+
+        // go to image domain
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(buffer2DT_partial_fourier_);
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(buffer2DT_partial_fourier_, debugFolder_+"POCS_afterFiltered_complexIm"); }
+
+        // get the complex image phase for the filtered kspace
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::abs(buffer2DT_partial_fourier_, mag));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::addEpsilon(mag));
+        GADGET_CHECK_RETURN_FALSE(magComplex.copyFrom(mag));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::divide(buffer2DT_partial_fourier_, magComplex, buffer2DT_));
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(buffer2DT_, debugFolder_+"POCS_afterFiltered_complexIm_phase"); }
+
+        // complex images, initialized as not filtered complex image
+        hoNDArray<T> complexIm(kspaceIter);
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(kspaceIter, complexIm);
+        hoNDArray<T> complexImPOCS(complexIm);
+
+        // the kspace during iteration is buffered here
+        buffer2DT_partial_fourier_kspaceIter_ = kspaceIter;
+
+        size_t ii;
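+        // POCS iteration: impose the low-resolution phase estimate on the current magnitude image,
+        // go back to kspace, restore the acquired samples, and repeat until the relative change
+        // falls below partialFourier_POCS_thres_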
+        for ( ii=0; ii<workOrder2DT.partialFourier_POCS_iters_; ii++ )
+        {
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::abs(complexImPOCS, mag));
+            GADGET_CHECK_RETURN_FALSE(magComplex.copyFrom(mag));
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(magComplex, buffer2DT_, complexImPOCS));
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(complexImPOCS, debugFolder_+"POCS_complexImPOCS"); }
+
+            // go back to kspace
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(complexImPOCS, kspaceIter);
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspaceIter, debugFolder_+"POCS_kspaceIter"); }
+
+            // buffer kspace during iteration
+            buffer2DT_partial_fourier_kspaceIter_ = kspaceIter;
+
+            // restore the acquired region
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.copyAlongROE1(kspace, kspaceIter, startRO, endRO, startE1, endE1));
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspaceIter, debugFolder_+"POCS_kspaceIter_copyOri"); }
+
+            // update complex image
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(kspaceIter, complexImPOCS);
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(complexImPOCS, debugFolder_+"POCS_kspaceIter_copyOri_complexImPOCS"); }
+
+            // compute threshold to stop the iteration
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::subtract(complexImPOCS, complexIm, buffer2DT_partial_fourier_));
+            typename realType<T>::Type diff, prev;
+            Gadgetron::norm2(complexIm, prev);
+            Gadgetron::norm2(buffer2DT_partial_fourier_, diff);
+
+            typename realType<T>::Type thres = diff/prev;
+
+            if ( !debugFolder_.empty() )
+            {
+                GDEBUG_STREAM("POCS iter : " << ii << " - thres : " << thres << " ... ");
+            }
+
+            if ( thres < workOrder2DT.partialFourier_POCS_thres_ )
+            {
+                break;
+            }
+
+            complexIm = complexImPOCS;
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(buffer2DT_partial_fourier_kspaceIter_, debugFolder_+"kspaceIter_after_POCS"); }
+
+        if ( workOrder2DT.partialFourier_POCS_transitBand_ == 0 )
+        {
+            kspace = kspaceIter;
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.copyAlongROE1TransitionBand(kspace, buffer2DT_partial_fourier_kspaceIter_, startRO, endRO, startE1, endE1, workOrder2DT.partialFourier_POCS_transitBand_, workOrder2DT.partialFourier_POCS_transitBand_));
+            kspace = buffer2DT_partial_fourier_kspaceIter_;
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace_after_POCS"); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DT<T>::performPartialFourierPOCSRecon(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::performPartialFourierFengHuangRecon(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace)
+{
+    try
+    {
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t CHA = kspace.get_size(2);
+        size_t N = kspace.get_size(3);
+        size_t S = kspace.get_size(4);
+
+        // check whether partial fourier is used
+        if ( (workOrder2DT.start_RO_<0 || workOrder2DT.end_RO_<0 || (workOrder2DT.end_RO_-workOrder2DT.start_RO_+1==RO) ) 
+            && (workOrder2DT.start_E1_<0 || workOrder2DT.end_E1_<0 || (workOrder2DT.end_E1_-workOrder2DT.start_E1_+1==E1) ) )
+        {
+            return true;
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace_before_FengHuang"); }
+
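+        // Feng Huang partial Fourier: estimate small convolution kernels over the symmetrically
+        // sampled center, where both the acquired kspace and its conjugate-symmetric copy exist,
+        // and use them to synthesize the unacquired region from the conjugate-symmetric data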
+        size_t startRO(0), endRO(RO-1);
+        if ( workOrder2DT.start_RO_>=0 && workOrder2DT.end_RO_<RO )
+        {
+            startRO = workOrder2DT.start_RO_;
+            endRO = workOrder2DT.end_RO_;
+        }
+
+        size_t startE1(0), endE1(E1-1);
+        if ( workOrder2DT.start_E1_>=0 && workOrder2DT.end_E1_<E1 )
+        {
+            startE1 = workOrder2DT.start_E1_;
+            endE1 = workOrder2DT.end_E1_;
+        }
+
+        // compute the conjugate symmetric kspace
+        if ( performTiming_ ) { gt_timer1_.start("conjugateSymmetry2D"); }
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().conjugateSymmetry2D(kspace, buffer2DT_));
+        if ( performTiming_ ) { gt_timer1_.stop(); }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(buffer2DT_, debugFolder_+"kspaceConj_FengHuang"); }
+
+        // find the symmetric region in the kspace
+        size_t startSymRO, endSymRO;
+        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.findSymmetricSampledRegion(startRO, endRO, RO/2, startSymRO, endSymRO));
+
+        size_t startSymE1, endSymE1;
+        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.findSymmetricSampledRegion(startE1, endE1, E1/2, startSymE1, endSymE1));
+
+        // the reference kspace for kernel estimation
+        hoNDArray<T> src, dst;
+        std::vector<size_t> start(5), size(5);
+
+        start[0] = startSymRO;
+        start[1] = startSymE1;
+        start[2] = 0;
+        start[3] = 0;
+        start[4] = 0;
+
+        size[0] = endSymRO-startSymRO+1;
+        size[1] = endSymE1-startSymE1+1;
+        size[2] = CHA;
+        size[3] = N;
+        size[4] = S;
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::cropUpTo11DArray(buffer2DT_, src, start, size));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::cropUpTo11DArray(kspace, dst, start, size));
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(src, debugFolder_+"src_FengHuang"); }
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(dst, debugFolder_+"dst_FengHuang"); }
+
+        if ( workOrder2DT.partialFourier_FengHuang_sameKernel_allN_ )
+        {
+            hoNDArray<T> ave4D;
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace4D(src, ave4D));
+            src = ave4D;
+
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace4D(dst, ave4D));
+            dst = ave4D;
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(src, debugFolder_+"src_ave4D_FengHuang"); }
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(dst, debugFolder_+"dst_ave4D_FengHuang"); }
+        }
+
+        // estimate the kernels
+        ho6DArray<T> kernel; // [RO E1 srcCHA dstCHA N S]
+        if ( performTiming_ ) { gt_timer1_.start("calibFengHuang"); }
+        GADGET_CHECK_RETURN_FALSE(this->calibFengHuang(workOrder2DT, src, dst, kernel));
+        if ( performTiming_ ) { gt_timer1_.stop(); }
+
+        // perform the recon
+        if ( workOrder2DT.partialFourier_FengHuang_transitBand_==0 )
+        {
+            if ( performTiming_ ) { gt_timer1_.start("performReconFangHuang"); }
+            GADGET_CHECK_RETURN_FALSE(this->performReconFangHuang(workOrder2DT, buffer2DT_, kspace, (int)startRO, (int)endRO, (int)startE1, (int)endE1, kernel));
+            if ( performTiming_ ) { gt_timer1_.stop(); }
+        }
+        else
+        {
+            if ( performTiming_ ) { gt_timer1_.start("performReconFangHuang with transition band"); }
+
+            size_t tb = (size_t)workOrder2DT.partialFourier_FengHuang_transitBand_;
+
+            size_t sRO(startRO), eRO(endRO), sE1(startE1), eE1(endE1);
+
+            if ( startRO > 0 )
+            {
+                startRO += tb;
+                if ( startRO > RO ) startRO = 0;
+            }
+
+            if ( endRO < RO-1 )
+            {
+                // endRO is unsigned, so test before subtracting instead of checking for a negative result
+                if ( endRO >= tb ) endRO -= tb;
+                else endRO = RO-1;
+            }
+
+            if ( startRO > endRO )
+            {
+                startRO = 0;
+                endRO = RO-1;
+            }
+
+            if ( startE1 > 0 )
+            {
+                startE1 += tb;
+                if ( startE1 > E1 ) startE1 = 0;
+            }
+
+            if ( endE1 < E1-1 )
+            {
+                // endE1 is unsigned, so test before subtracting instead of checking for a negative result
+                if ( endE1 >= tb ) endE1 -= tb;
+                else endE1 = E1-1;
+            }
+
+            if ( startE1 > endE1 )
+            {
+                startE1 = 0;
+                endE1 = E1-1;
+            }
+
+            buffer2DT_partial_fourier_kspaceIter_ = kspace;
+            GADGET_CHECK_RETURN_FALSE(this->performReconFangHuang(workOrder2DT, buffer2DT_, 
+                    buffer2DT_partial_fourier_kspaceIter_, (int)startRO, (int)endRO, (int)startE1, (int)endE1, kernel));
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(buffer2DT_partial_fourier_kspaceIter_, debugFolder_+"kspace_FengHuang_recon"); }
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace_FengHuang_original"); }
+
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.copyAlongROE1TransitionBand(kspace, buffer2DT_partial_fourier_kspaceIter_, 
+                    sRO, eRO, sE1, eE1, workOrder2DT.partialFourier_FengHuang_transitBand_, workOrder2DT.partialFourier_FengHuang_transitBand_));
+
+            kspace = buffer2DT_partial_fourier_kspaceIter_;
+
+            if ( performTiming_ ) { gt_timer1_.stop(); }
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace_after_FengHuang"); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DT<T>::performPartialFourierFengHuangRecon(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::calibFengHuang(gtPlusReconWorkOrder2DT<T>& workOrder2DT, const hoNDArray<T>& src, const hoNDArray<T>& dst, ho6DArray<T>& kernel)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(src.dimensions_equal(&dst));
+
+        long long RO = (long long)src.get_size(0);
+        long long E1 = (long long)src.get_size(1);
+        long long srcCHA = (long long)src.get_size(2);
+        long long N = (long long)src.get_size(3);
+        long long S = (long long)src.get_size(4);
+
+        long long kx = (long long)workOrder2DT.partialFourier_FengHuang_kSize_RO_;
+        long long ky = (long long)workOrder2DT.partialFourier_FengHuang_kSize_E1_;
+
+        if ( kx%2 == 0 ) kx++;
+        if ( ky%2 == 0 ) ky++;
+
+        long long halfKx = (long long)kx/2;
+        long long halfKy = (long long)ky/2;
+
+        // the cross-channel kernel is not estimated
+        kernel.createArray(kx, ky, srcCHA, 1, N, S);
+
+        long long ii=0;
+        long long num = N*S*srcCHA;
+
+        size_t startRO = halfKx;
+        size_t endRO = RO - halfKx - 1;
+
+        size_t startE1 = halfKy;
+        size_t endE1 = E1 - halfKy - 1;
+
+        long long rowA, colA, rowB, colB;
+        rowA = (endE1-startE1+1)*(endRO-startRO+1); 
+        colA = kx*ky;
+
+        rowB = rowA;
+        colB = 1;
+
+        double thresReg = workOrder2DT.partialFourier_FengHuang_thresReg_;
+
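+        // kernel estimation as a regularized least-squares problem per channel (and N, S): each row
+        // of A holds a kx x ky patch of the conjugate-symmetric source data, B holds the corresponding
+        // acquired sample, and K solves A*K = B with Tikhonov regularization (thresReg)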
+        #pragma omp parallel default(none) private(ii) shared(num, RO, E1, srcCHA, N, S, kx, ky, src, dst, kernel, rowA, colA, rowB, colB, startRO, endRO, startE1, endE1, halfKx, halfKy, thresReg)
+        {
+            hoMatrix<T> A(rowA, colA);
+            T* pA = A.begin();
+
+            hoMatrix<T> B(rowB, colB);
+            T* pB = B.begin();
+
+            hoMatrix<T> K(colA, colB);
+
+            #pragma omp for
+            for ( ii=0; ii<num; ii ++ )
+            {
+                T* pSrc2D = const_cast<T*>(src.begin())+ii*RO*E1;
+                T* pDst2D = const_cast<T*>(dst.begin())+ii*RO*E1;
+                //ho2DArray<T> src2D(RO, E1, const_cast<T*>(src.begin())+ii*RO*E1);
+                //ho2DArray<T> dst2D(RO, E1, const_cast<T*>(dst.begin())+ii*RO*E1);
+
+                size_t ro, e1, row(0);
+                long long x, y;
+
+                for ( e1=startE1; e1<=endE1; e1++ )
+                {
+                    for ( ro=startRO; ro<=endRO; ro++ )
+                    {
+
+                        size_t colInd(0);
+                        for ( y=-halfKy; y<=halfKy; y++ )
+                        {
+                            for ( x=-halfKx; x<=halfKx; x++ )
+                            {
+                                // A(row, colInd++) = src2D(ro+x, e1+y);
+                                pA[row + colInd*rowA] = pSrc2D[ro+x + (e1+y)*RO];
+                                colInd++;
+                            }
+                        }
+
+                        // B(row, 0) = dst2D(ro, e1);
+                        pB[row] = pDst2D[ro + e1*RO];
+
+                        row++;
+                    }
+                }
+
+                Gadgetron::SolveLinearSystem_Tikhonov(A, B, K, thresReg);
+
+                memcpy(kernel.begin()+ii*kx*ky, K.begin(), sizeof(T)*kx*ky);
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DT<T>::calibFengHuang(gtPlusReconWorkOrder2DT<T>& workOrder2DT, const hoNDArray<T>& src, const hoNDArray<T>& dst, ho6DArray<T>& kernel) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::performReconFangHuang(gtPlusReconWorkOrder2DT<T>& workOrder2DT, 
+                                                const hoNDArray<T>& kspaceConj, hoNDArray<T>& kspace, 
+                                                int startRO, int endRO, int startE1, int endE1, ho6DArray<T>& kernel)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(kspaceConj.dimensions_equal(&kspace));
+
+        long long RO = (long long)kspace.get_size(0);
+        long long E1 = (long long)kspace.get_size(1);
+        long long CHA = (long long)kspace.get_size(2);
+        long long N = (long long)kspace.get_size(3);
+        long long S = (long long)kspace.get_size(4);
+
+        long long kx = (long long)kernel.get_size(0);
+        long long ky = (long long)kernel.get_size(1);
+
+        long long halfKx = kx/2;
+        long long halfKy = ky/2;
+        long long kerN = (long long)kernel.get_size(4);
+        GADGET_CHECK_RETURN_FALSE( (kerN==1) || (kerN==N) );
+
+        long long num = CHA*N*S;
+
+        long long rowD = RO*E1 - ( (endE1-startE1+1) * (endRO-startRO+1) );
+        long long colD = kx*ky;
+
+        ho2DArray<size_t> coeffX(rowD, colD);
+        ho2DArray<size_t> coeffY(rowD, colD);
+
+        long long ro, e1, row(0);
+        long long x, y, dx, dy;
+
+        for ( e1=0; e1<E1; e1++ )
+        {
+            for ( ro=0; ro<RO; ro++ )
+            {
+                if ( (ro>=startRO) && (ro<=endRO) && (e1>=startE1) && (e1<=endE1) )
+                {
+                    continue;
+                }
+
+                size_t colInd(0);
+                for ( y=-halfKy; y<=halfKy; y++ )
+                {
+                    dy = e1 + y;
+                    if ( dy < 0 ) dy += E1;
+                    if ( dy > E1-1 ) dy -= E1;
+
+                    for ( x=-halfKx; x<=halfKx; x++ )
+                    {
+                        dx = ro + x;
+                        if ( dx < 0 ) dx += RO;
+                        if ( dx > RO-1 ) dx -= RO;
+
+                        coeffX(row, colInd) = dx;
+                        coeffY(row, colInd) = dy;
+                        colInd++;
+                    }
+                }
+
+                row++;
+            }
+        }
+
+        long long ii;
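+        // apply the kernels: for every location outside the acquired region, gather its kx x ky
+        // neighborhood from the conjugate-symmetric kspace into a row of D; the synthesized samples
+        // are then R = D*K and are written back into the destination kspace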
+        #pragma omp parallel default(none) private(ii) shared(num, RO, E1, CHA, N, S, kerN, kspaceConj, kspace, kernel, rowD, colD, coeffX, coeffY)
+        {
+            hoMatrix<T> D(rowD, colD);
+            hoMatrix<T> K(colD, 1);
+            hoMatrix<T> R(rowD, 1);
+
+            Gadgetron::clear(D);
+            Gadgetron::clear(K);
+            Gadgetron::clear(R);
+
+            #pragma omp for
+            for ( ii=0; ii<num; ii ++ )
+            {
+                ho2DArray<T> src2D(RO, E1, const_cast<T*>(kspaceConj.begin())+ii*RO*E1);
+                ho2DArray<T> dst2D(RO, E1, kspace.begin()+ii*RO*E1);
+
+                long long row, col;
+                for ( col=0; col<colD; col++ )
+                {
+                    for ( row=0; row<rowD; row++ )
+                    {
+                        D(row, col) = src2D(coeffX(row, col), coeffY(row, col));
+                    }
+                }
+
+                if ( kerN == 1 )
+                {
+                    long long ind = ii;
+                    long long currS = ind/(CHA*N);
+                    ind %= CHA*N;
+                    long long currN = ind/CHA;
+                    ind %= CHA;
+                    memcpy(K.begin(), kernel.begin()+(ind+currS*CHA)*colD, sizeof(T)*colD);
+                }
+                else
+                {
+                    memcpy(K.begin(), kernel.begin()+ii*colD, sizeof(T)*colD);
+                }
+
+                // R = D*K
+                Gadgetron::gemm(R, D, false, K, false);
+
+                for ( row=0; row<rowD; row++ )
+                {
+                    dst2D( coeffX(row, colD/2), coeffY(row, colD/2) ) = R(row, 0);
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DT<T>::performReconFangHuang(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::
+estimateJobSize(gtPlusReconWorkOrder<T>* workOrder2DT, size_t maxNumOfBytesPerJob, size_t overlapBetweenJobs, size_t numOfNodes, size_t& jobSize)
+{
+    try
+    {
+        size_t nodeN = numOfNodes;
+        GADGET_CHECK_RETURN_FALSE(this->computeEffectiveNodeNumberBasedOnComputingPowerIndex(workOrder2DT, nodeN));
+        if ( workOrder2DT->job_perform_on_control_node_ ) nodeN++;
+
+        GDEBUG_STREAM("GtPlus Cloud 2DT - job_perform_on_control_node is " << workOrder2DT->job_perform_on_control_node_  << " - nodeN is " << nodeN << " - overlapBetweenJobs is " << overlapBetweenJobs << " ... ");
+
+        // adjust jobN according to cloud size
+        size_t RO = workOrder2DT->data_.get_size(0);
+        size_t E1 = workOrder2DT->data_.get_size(1);
+        size_t N = workOrder2DT->data_.get_size(3);
+        size_t S = workOrder2DT->data_.get_size(4);
+
+        size_t srcCHA = workOrder2DT->kernelIm_->get_size(2);
+        size_t dstCHA = workOrder2DT->kernelIm_->get_size(3);
+
+        size_t totalJobNum = N;
+        jobSize = (size_t)std::ceil( (double)(totalJobNum+overlapBetweenJobs*(nodeN-1))/(double)nodeN );
+
+        size_t numOfBytesPerJob = sizeof(T)*( RO*E1*srcCHA*dstCHA*jobSize + 2*RO*E1*srcCHA*jobSize );
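+        // illustrative example: for complex float data with RO=E1=256, srcCHA=dstCHA=32 and
+        // jobSize=4, numOfBytesPerJob = 8*(256*256*32*32*4 + 2*256*256*32*4) bytes, about 2.1 GB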
+
+        // maxNumOfBytesPerJob is interpreted in GB; keep a 64 MB safety margin per job
+        while ( numOfBytesPerJob > maxNumOfBytesPerJob*1024*1024*1024-64.0*1024*1024 )
+        {
+            nodeN *= 2;
+            jobSize = (size_t)std::ceil( (double)(totalJobNum+overlapBetweenJobs*(nodeN-1))/(double)nodeN );
+            numOfBytesPerJob = sizeof(T)*( RO*E1*srcCHA*dstCHA*jobSize + 2*RO*E1*srcCHA*jobSize );
+        }
+
+        GDEBUG_STREAM("GtPlus Cloud 2DT - jobSize is " << jobSize << "; every job has " << numOfBytesPerJob/1024.0/1024 << " MBytes ... ");
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DT<T>::estimateJobSize(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTGRAPPA.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTGRAPPA.h
new file mode 100644
index 0000000..e0a966b
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTGRAPPA.h
@@ -0,0 +1,426 @@
+/** \file   gtPlusISMRMRDReconWorker2DTGRAPPA.h
+    \brief  Implement the 2DT GRAPPA reconstruction
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "ismrmrd/ismrmrd.h"
+#include "GadgetronTimer.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconCoilMapEstimation.h"
+#include "gtPlusISMRMRDReconWorker2DT.h"
+#include "mri_core_grappa.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusReconWorker2DTGRAPPA : public gtPlusReconWorker2DT<T>
+{
+public:
+
+    typedef gtPlusReconWorker2DT<T> BaseClass;
+    typedef typename BaseClass::value_type value_type;
+
+    gtPlusReconWorker2DTGRAPPA() : BaseClass() {}
+    virtual ~gtPlusReconWorker2DTGRAPPA() {}
+
+    virtual bool performCalibPrep(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, gtPlusReconWorkOrder2DT<T>* workOrder2DT);
+    virtual bool performCalibImpl(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, gtPlusReconWorkOrder2DT<T>* workOrder2DT, size_t n, size_t usedS);
+
+    virtual bool performUnwrapping(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& data);
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::verbose_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+
+    using BaseClass::buffer2DT_;
+    using BaseClass::buffer2DT_unwrapping_;
+    using BaseClass::buffer2DT_partial_fourier_;
+    using BaseClass::buffer2DT_partial_fourier_kspaceIter_;
+    using BaseClass::ref_src_;
+    using BaseClass::ref_dst_;
+    using BaseClass::data_dst_;
+    using BaseClass::ref_coil_map_dst_;
+    using BaseClass::startE1_;
+    using BaseClass::endE1_;
+
+    // gtPlusGRAPPA<T> grappa_;
+};
+
+template <typename T> 
+bool gtPlusReconWorker2DTGRAPPA<T>::
+performCalibPrep(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, gtPlusReconWorkOrder2DT<T>* workOrder2DT)
+{
+    try
+    {
+        size_t RO = workOrder2DT->data_.get_size(0);
+        size_t E1 = workOrder2DT->data_.get_size(1);
+        size_t N = workOrder2DT->data_.get_size(3);
+        size_t S = workOrder2DT->data_.get_size(4);
+
+        size_t srcCHA = ref_src.get_size(2);
+
+        size_t refRO = ref_dst.get_size(0);
+        size_t refE1 = ref_dst.get_size(1);
+        size_t refN = ref_dst.get_size(3);
+        size_t dstCHA = ref_dst.get_size(2);
+
+        std::vector<int> kE1, oE1;
+        size_t convkRO, convkE1;
+        bool fitItself = true;
+
+        grappa2d_kerPattern(kE1, oE1, convkRO, convkE1, workOrder2DT->acceFactorE1_, workOrder2DT->grappa_kSize_RO_, workOrder2DT->grappa_kSize_E1_, fitItself);
+
+        size_t kRO = workOrder2DT->grappa_kSize_RO_;
+        size_t kNE1 = workOrder2DT->grappa_kSize_E1_;
+
+        workOrder2DT->kernel_->create(convkRO, convkE1, srcCHA, dstCHA, refN, S);
+        workOrder2DT->kernelIm_->create(RO, E1, srcCHA, dstCHA, refN, S);
+        workOrder2DT->unmixingCoeffIm_->create(RO, E1, srcCHA, refN, S);
+        workOrder2DT->gfactor_.create(RO, E1, refN, S);
+
+        if ( workOrder2DT->wrap_around_map_needed_ )
+        {
+            workOrder2DT->wrap_around_map_.create(RO, E1, 2, refN, S);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DTGRAPPA<T>::performCalibPrep(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DTGRAPPA<T>::
+performCalibImpl(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, gtPlusReconWorkOrder2DT<T>* workOrder2DT, size_t n, size_t usedS)
+{
+    try
+    {
+        size_t RO = workOrder2DT->data_.get_size(0);
+        size_t E1 = workOrder2DT->data_.get_size(1);
+        size_t N = workOrder2DT->data_.get_size(3);
+        size_t S = workOrder2DT->data_.get_size(4);
+
+        size_t srcCHA = ref_src.get_size(2);
+
+        size_t refRO = ref_dst.get_size(0);
+        size_t refE1 = ref_dst.get_size(1);
+        size_t refN = ref_dst.get_size(3);
+        size_t dstCHA = ref_dst.get_size(2);
+
+        bool fitItself = true;
+        size_t kRO = workOrder2DT->grappa_kSize_RO_;
+        size_t kNE1 = workOrder2DT->grappa_kSize_E1_;
+        size_t convkRO = workOrder2DT->kernel_->get_size(0);
+        size_t convkE1 = workOrder2DT->kernel_->get_size(1);
+
+        ho3DArray<T> acsSrc(refRO, refE1, srcCHA, const_cast<T*>(ref_src.begin()+n*refRO*refE1*srcCHA+usedS*refRO*refE1*srcCHA*refN));
+        ho3DArray<T> acsDst(refRO, refE1, dstCHA, const_cast<T*>(ref_dst.begin()+n*refRO*refE1*dstCHA+usedS*refRO*refE1*dstCHA*refN));
+
+        std::ostringstream ostr;
+        ostr << "_n_" << n << "s_" << usedS;
+        std::string suffix = ostr.str();
+
+        std::string filename = "acsSrc";
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(acsSrc, debugFolder_+filename+suffix); }
+
+        filename = "acsDst";
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(acsDst, debugFolder_+filename+suffix); }
+
+        ho4DArray<T> convKer(convkRO, convkE1, srcCHA, dstCHA, workOrder2DT->kernel_->begin() + n*convkRO*convkE1*srcCHA*dstCHA + usedS*refN*convkRO*convkE1*srcCHA*dstCHA);
+
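+        // GRAPPA calibration chain: (1) estimate the convolution kernel from the ACS src/dst pair,
+        // (2) transform it into an image-domain kernel, (3) combine it with the coil map to obtain
+        // the unmixing coefficients and the g-factor map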
+        Gadgetron::GadgetronTimer gt_timer_local;
+        gt_timer_local.set_timing_in_destruction(false);
+
+        if ( performTiming_ ) { gt_timer_local.start("grappa2d_calib_convolution_kernel ... "); }
+        Gadgetron::grappa2d_calib_convolution_kernel(acsSrc, acsDst, (size_t)workOrder2DT->acceFactorE1_, workOrder2DT->grappa_reg_lamda_, kRO, kNE1, convKer);
+        if ( performTiming_ ) { gt_timer_local.stop(); }
+
+        filename = "convKer";
+        if (!debugFolder_.empty()) { gt_exporter_.exportArrayComplex(convKer, debugFolder_ + filename + suffix); }
+
+        hoNDArray<T> kIm(RO, E1, srcCHA, dstCHA, workOrder2DT->kernelIm_->begin()+n*RO*E1*srcCHA*dstCHA+usedS*RO*E1*srcCHA*dstCHA*refN);
+        if ( performTiming_ ) { gt_timer_local.start("grappa2d_image_domain_kernel ... "); }
+        Gadgetron::grappa2d_image_domain_kernel(convKer, RO, E1, kIm);
+        if ( performTiming_ ) { gt_timer_local.stop(); }
+
+        filename = "kIm";
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kIm, debugFolder_+filename+suffix); }
+
+        hoNDArray<T> coilMap(RO, E1, dstCHA, workOrder2DT->coilMap_->begin()+n*RO*E1*dstCHA+usedS*RO*E1*dstCHA*refN);
+        hoNDArray<T> unmixC(RO, E1, srcCHA, workOrder2DT->unmixingCoeffIm_->begin()+n*RO*E1*srcCHA+usedS*RO*E1*srcCHA*refN);
+        hoNDArray<T> gFactor(RO, E1, 1, workOrder2DT->gfactor_.begin()+n*RO*E1+usedS*RO*E1*refN);
+
+        hoNDArray< typename realType<T>::Type > gFactorMap(RO, E1);
+        if ( performTiming_ ) { gt_timer_local.start("grappa2d_unmixing_coeff ... "); }
+        Gadgetron::grappa2d_unmixing_coeff(kIm, coilMap, (size_t)workOrder2DT->acceFactorE1_, unmixC, gFactorMap);
+        if ( performTiming_ ) { gt_timer_local.stop(); }
+
+        Gadgetron::real_to_complex(gFactorMap, gFactor);
+
+        filename = "unmixC";
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(unmixC, debugFolder_+filename+suffix); }
+
+        filename = "gFactor";
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(gFactor, debugFolder_+filename+suffix); }
+
+        if ( workOrder2DT->wrap_around_map_needed_ )
+        {
+            hoNDArray<T> wrapAroundMap(RO, E1, 2, workOrder2DT->wrap_around_map_.begin()+n*RO*E1*2+usedS*RO*E1*2*refN);
+
+            gtPlusISMRMRDReconCoilMapEstimation<T> coil_map_util;
+
+            hoNDArray<T> coilMap(RO, E1, acsDst.get_size(2));
+            hoNDArray<value_type> eigD(RO, E1, 2);
+
+            value_type thres = workOrder2DT->spirit_reg_lamda_;
+
+            GADGET_CHECK_RETURN_FALSE(coil_map_util.coilMap2DSPIRIT(acsDst, coilMap, eigD, workOrder2DT->spirit_kSize_RO_, workOrder2DT->spirit_kSize_E1_, thres));
+            GADGET_CHECK_RETURN_FALSE(wrapAroundMap.copyFrom(eigD));
+
+            filename = "wrapAroundMap";
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArray(eigD, debugFolder_+filename+suffix); }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DTGRAPPA<T>::performCalibImpl(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DTGRAPPA<T>::
+performUnwrapping(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& data_dst)
+{
+    try
+    {
+        int n;
+
+        size_t RO = workOrder2DT->data_.get_size(0);
+        size_t E1 = workOrder2DT->data_.get_size(1);
+        size_t N = workOrder2DT->data_.get_size(3);
+        size_t S = workOrder2DT->data_.get_size(4);
+
+        size_t srcCHA = workOrder2DT->kernelIm_->get_size(2);
+        size_t dstCHA = workOrder2DT->kernelIm_->get_size(3);
+
+        size_t refN = workOrder2DT->kernelIm_->get_size(4);
+
+        workOrder2DT->complexIm_.create(RO, E1, N, S);
+
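+        // transform the aliased k-space into image domain; use workOrder2DT->data_ when downstream coil compression is enabled, otherwise the destination data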
+        if ( workOrder2DT->downstream_coil_compression_ )
+        {
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(workOrder2DT->data_, buffer2DT_);
+        }
+        else
+        {
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(data_dst, buffer2DT_);
+        }
+
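+        // estimate the effective acceleration factor from the sampled E1 range and compensate the image intensity by 1/sqrt(effectiveAcceFactor)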
+        double effectiveAcceFactor = workOrder2DT->acceFactorE1_;
+        if ( workOrder2DT->start_E1_>0 && workOrder2DT->end_E1_>0 )
+        {
+            size_t num = workOrder2DT->end_E1_ - workOrder2DT->start_E1_ + 1;
+            size_t res = (size_t)( num % (size_t)(std::ceil(workOrder2DT->acceFactorE1_)) );
+            double N = std::floor( (double)(num-res)/(double)workOrder2DT->acceFactorE1_);
+            effectiveAcceFactor = (double)num/N;
+        }
+        else
+        {
+            size_t num = E1;
+            size_t res = (size_t)( num % (size_t)(std::ceil(workOrder2DT->acceFactorE1_)) );
+            double N = std::floor( (double)(num-res)/(double)workOrder2DT->acceFactorE1_);
+            effectiveAcceFactor = (double)num/N;
+        }
+
+        typename realType<T>::Type fftCompensationRatio = (typename realType<T>::Type)(1.0/std::sqrt(effectiveAcceFactor));
+
+        Gadgetron::scal( fftCompensationRatio, buffer2DT_);
+
+        // if the image data is scaled and ref lines are going to be filled back to the data, 
+        // the reference lines should be scaled too
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_embedded )
+        {
+            if ( workOrder2DT->embedded_ref_fillback_ )
+            {
+                Gadgetron::scal( fftCompensationRatio, workOrder2DT->ref_);
+            }
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(buffer2DT_, debugFolder_+"buffer2DT_"); }
+
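+        // decide whether the full multi-channel k-space has to be reconstructed (e.g. for embedded reference fill-back or full-resolution coil maps)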
+        bool recon_kspace = false;
+
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_embedded )
+        {
+            if ( workOrder2DT->embedded_fullres_coilmap_ || workOrder2DT->embedded_ref_fillback_ )
+            {
+                recon_kspace = true;
+            }
+        }
+
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_separate )
+        {
+            if ( workOrder2DT->separate_fullres_coilmap_ )
+            {
+                recon_kspace = true;
+            }
+        }
+
+        if ( workOrder2DT->recon_kspace_needed_ )
+        {
+            recon_kspace = true;
+        }
+
+        // if kspace is actually needed
+        if ( recon_kspace )
+        {
+            workOrder2DT->fullkspace_ = data_dst;
+
+            buffer2DT_unwrapping_.create(RO, E1, srcCHA, dstCHA);
+
+            size_t usedS;
+            for ( usedS=0; usedS<S; usedS++ )
+            {
+                if ( (refN<N) || (refN==1) )
+                {
+                    hoNDArray<T> kIm(RO, E1, srcCHA, dstCHA, workOrder2DT->kernelIm_->begin()+usedS*RO*E1*srcCHA*dstCHA*refN);
+                    hoNDArray<T> aliasedIm(RO, E1, srcCHA, N, buffer2DT_.begin()+usedS*RO*E1*srcCHA*N);
+                    hoNDArray<T> unwarppedIm(RO, E1, dstCHA, N, workOrder2DT->fullkspace_.begin()+usedS*RO*E1*dstCHA*N);
+
+                    this->applyImageDomainKernelImage(aliasedIm, kIm, buffer2DT_unwrapping_, unwarppedIm);
+
+                    if ( !debugFolder_.empty() )
+                    {
+                        {
+                            std::ostringstream ostr;
+                            ostr << "kIm_" << usedS;
+                            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kIm, debugFolder_+ostr.str()); }
+                        }
+
+                        {
+                            std::ostringstream ostr;
+                            ostr << "aliasedIm_" << usedS;
+                            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(aliasedIm, debugFolder_+ostr.str()); }
+                        }
+
+                        std::ostringstream ostr;
+                        ostr << "unwarppedIm_" << usedS;
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(unwarppedIm, debugFolder_+ostr.str()); }
+                    }
+                }
+                else
+                {
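+                    // every frame has its own kernel; apply the image-domain kernel frame by frame in parallel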
+                    #pragma omp parallel private(n)
+                    {
+                        hoNDArray<T> complexIm(RO, E1, dstCHA);
+
+                        #pragma omp for
+                        for ( n=0; n<(int)N; n++ )
+                        {
+                            hoNDArray<T> kIm(RO, E1, srcCHA, dstCHA, workOrder2DT->kernelIm_->begin()+n*RO*E1*srcCHA*dstCHA+usedS*RO*E1*srcCHA*dstCHA*refN);
+
+                            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kIm, debugFolder_+"kIm_n"); }
+
+                            T* pIm2D = buffer2DT_.begin()+n*RO*E1*srcCHA+usedS*RO*E1*srcCHA*N;
+                            hoNDArray<T> aliasedIm(RO, E1, srcCHA, pIm2D);
+
+                            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(aliasedIm, debugFolder_+"aliasedIm_n"); }
+
+                            this->applyImageDomainKernelImage(aliasedIm, kIm, complexIm);
+                            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(complexIm, debugFolder_+"complexIm_n"); }
+
+                            memcpy(workOrder2DT->fullkspace_.begin()+n*RO*E1*dstCHA+usedS*RO*E1*dstCHA*N, complexIm.begin(), sizeof(T)*RO*E1*dstCHA);
+                        }
+                    }
+                }
+
+                hoNDArray<T> unwarppedIm(RO, E1, dstCHA, N, workOrder2DT->fullkspace_.begin()+usedS*RO*E1*dstCHA*N);
+                hoNDArray<T> combined(RO, E1, N, workOrder2DT->complexIm_.begin()+usedS*RO*E1*N);
+
+                if ( refN == N )
+                {
+                    hoNDArray<T> coilMap(RO, E1, dstCHA, refN, workOrder2DT->coilMap_->begin()+usedS*RO*E1*dstCHA*refN);
+                    gtPlusISMRMRDReconUtilComplex<T>().coilCombine(unwarppedIm, coilMap, combined);
+                }
+                else
+                {
+                    hoNDArray<T> coilMap(RO, E1, dstCHA, workOrder2DT->coilMap_->begin()+usedS*RO*E1*dstCHA*refN);
+                    gtPlusISMRMRDReconUtilComplex<T>().coilCombine(unwarppedIm, coilMap, combined);
+                }
+
+                if ( !debugFolder_.empty() )
+                {
+                    std::ostringstream ostr;
+                    ostr << "combined_" << usedS;
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(combined, debugFolder_+ostr.str()); }
+                }
+            }
+
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(workOrder2DT->fullkspace_);
+
+            if ( !debugFolder_.empty() )
+            {
+                std::ostringstream ostr;
+                ostr << "fullkspace_" << usedS;
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder2DT->fullkspace_, debugFolder_+ostr.str()); }
+            }
+        }
+        else
+        {
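+            // the full k-space is not needed; apply the unmixing coefficients directly to the aliased images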
+            size_t usedS;
+            for ( usedS=0; usedS<S; usedS++ )
+            {
+                if ( (refN<N) || (refN==1) )
+                {
+                    hoNDArray<T> unmixCoeff(RO, E1, srcCHA, workOrder2DT->unmixingCoeffIm_->begin()+usedS*RO*E1*srcCHA*refN);
+                    hoNDArray<T> aliasedIm(RO, E1, srcCHA, N, buffer2DT_.begin()+usedS*RO*E1*srcCHA*N);
+                    hoNDArray<T> unwarppedIm(RO, E1, 1, N, workOrder2DT->complexIm_.begin()+usedS*RO*E1*N);
+
+                    this->applyUnmixCoeffImage(aliasedIm, unmixCoeff, unwarppedIm);
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(unwarppedIm, debugFolder_+"unwarppedIm"); }
+                }
+                else
+                {
+                    // #pragma omp parallel for private(n)
+                    for ( n=0; n<(int)N; n++ )
+                    {
+                        hoNDArray<T> unmixCoeff(RO, E1, srcCHA, workOrder2DT->unmixingCoeffIm_->begin()+n*RO*E1*srcCHA+usedS*RO*E1*srcCHA*refN);
+                        hoNDArray<T> aliasedIm(RO, E1, srcCHA, buffer2DT_.begin()+n*RO*E1*srcCHA+usedS*RO*E1*srcCHA*N);
+                        hoNDArray<T> unwarppedIm(RO, E1, 1, workOrder2DT->complexIm_.begin()+n*RO*E1+usedS*RO*E1*N);
+
+                        this->applyUnmixCoeffImage(aliasedIm, unmixCoeff, unwarppedIm);
+
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(unwarppedIm, debugFolder_+"unwarppedIm"); }
+                    }
+                }
+            }
+
+            workOrder2DT->fullkspace_.create(RO, E1, 1, N, S);
+            memcpy(workOrder2DT->fullkspace_.begin(), workOrder2DT->complexIm_.begin(), workOrder2DT->complexIm_.get_number_of_bytes());
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(workOrder2DT->fullkspace_);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DTGRAPPA<T>::performUnwrapping(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& data) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTL1SPIRITNCG.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTL1SPIRITNCG.h
new file mode 100644
index 0000000..8f6b5f8
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTL1SPIRITNCG.h
@@ -0,0 +1,355 @@
+/** \file   gtPlusISMRMRDReconWorker2DTL1SPIRITNCG.h
+    \brief  Implement the 2DT non-linear SPIRIT reconstruction using the non-linear CG solver
+    \author Hui Xue
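+
+    A minimal usage sketch (illustrative only; the work order, k-space and image-domain
+    kernel are assumed to be prepared by the surrounding gtPlus 2DT workflow):
+
+    \code
+    gtPlusReconWorker2DTL1SPIRITNCG< std::complex<float> > worker;
+    worker.autoReconParameter(&workOrder2DT);   // fill in default NCG regularization settings
+    hoNDArray< std::complex<float> > res;
+    worker.performUnwarppingImpl(&workOrder2DT, kspace, adj_forward_G_I, res, 0);
+    \endcode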
+*/
+
+#pragma once
+
+#include "gtPlusISMRMRDReconWorker2DTSPIRIT.h"
+#include "gtPlusSPIRIT2DTOperator.h"
+#include "gtPlusSPIRITNoNullSpace2DOperator.h"
+#include "gtPlusSPIRITNoNullSpace2DTOperator.h"
+#include "gtPlusNCGSolver.h"
+#include "gtPlusWavelet2DOperator.h"
+#include "gtPlusWavelet3DOperator.h"
+#include "gtPlusWaveletNoNullSpace2DOperator.h"
+#include "gtPlusWaveletNoNullSpace3DOperator.h"
+#include "gtPlusDataFidelityOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusReconWorker2DTL1SPIRITNCG : public gtPlusReconWorker2DTSPIRIT<T>
+{
+public:
+
+    typedef gtPlusReconWorker2DTSPIRIT<T> BaseClass;
+    typedef typename realType<T>::Type value_type;
+
+    gtPlusReconWorker2DTL1SPIRITNCG() : BaseClass() {}
+    virtual ~gtPlusReconWorker2DTL1SPIRITNCG() {}
+
+    virtual bool performUnwarppingImpl(gtPlusReconWorkOrder<T>* workOrder2DT, hoNDArray<T>& kspace, hoNDArray<T>& adj_forward_G_I, hoNDArray<T>& res, size_t s);
+    virtual bool performUnwarppingImpl(gtPlusReconJob2DT<T>& job);
+    // virtual bool performUnwrapping(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& data);
+
+    virtual bool autoReconParameter(gtPlusReconWorkOrder<T>* workOrder);
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::verbose_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+
+    using BaseClass::buffer2DT_;
+    using BaseClass::buffer2DT_unwrapping_;
+    using BaseClass::buffer2DT_partial_fourier_;
+    using BaseClass::buffer2DT_partial_fourier_kspaceIter_;
+    using BaseClass::ref_src_;
+    using BaseClass::ref_dst_;
+    using BaseClass::data_dst_;
+    using BaseClass::ref_coil_map_dst_;
+    using BaseClass::startE1_;
+    using BaseClass::endE1_;
+
+    using BaseClass::spirit_;
+};
+
+template <typename T> 
+bool gtPlusReconWorker2DTL1SPIRITNCG<T>::autoReconParameter(gtPlusReconWorkOrder<T>* workOrder)
+{
+    BaseClass::autoReconParameter(workOrder);
+
+    gtPlusReconWorkOrder2DT<T>* workOrder2DT = dynamic_cast<gtPlusReconWorkOrder2DT<T>*>(workOrder);
+    if ( workOrder2DT == NULL ) return false;
+
+    if ( workOrder2DT->spirit_perform_linear_ )
+    {
+        if ( workOrder2DT->spirit_solve_symmetric_ )
+        {
+            workOrder2DT->spirit_image_reg_lamda_ = 0.0025;
+            workOrder2DT->spirit_ncg_iter_thres_ = 0.0001;
+        }
+        else
+        {
+            workOrder2DT->spirit_image_reg_lamda_ = 0.0025;
+            workOrder2DT->spirit_ncg_iter_thres_ = 0.0001;
+        }
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DTL1SPIRITNCG<T>::
+performUnwarppingImpl(gtPlusReconWorkOrder<T>* workOrder2DT, hoNDArray<T>& kspace, hoNDArray<T>& adj_forward_G_I, hoNDArray<T>& res, size_t s)
+{
+    try
+    {
+        hoNDArray<T> kspaceLinear(kspace);
+        res = kspace;
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace"); }
+
+        bool performLinear = workOrder2DT->spirit_perform_linear_;
+        if ( !workOrder2DT->spirit_perform_nonlinear_ ) performLinear = true;
+
+        if ( performLinear )
+        {
+            if ( performTiming_ ) { gt_timer3_.start("NCG spirit linear solver for 2DT ... "); }
+            GADGET_CHECK_RETURN_FALSE(BaseClass::performUnwarppingImpl(workOrder2DT, kspace, adj_forward_G_I, kspaceLinear, s));
+            if ( performTiming_ ) { gt_timer3_.stop(); }
+        }
+        else
+        {
+            if ( workOrder2DT->kspace_initial_.get_number_of_elements() == kspace.get_number_of_elements() )
+            {
+                GDEBUG_STREAM("Start the iteration with the input initial kspace ... ");
+                memcpy(kspaceLinear.begin(), workOrder2DT->kspace_initial_.begin(), kspace.get_number_of_bytes());
+            }
+            else
+            {
+                GDEBUG_STREAM("Start the iteration with the input kspace ... ");
+            }
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspaceLinear, debugFolder_+"kspaceLinear"); }
+
+        if ( workOrder2DT->spirit_perform_nonlinear_ )
+        {
+            size_t refN = adj_forward_G_I.get_size(4);
+
+            size_t RO = kspace.get_size(0);
+            size_t E1 = kspace.get_size(1);
+            size_t N = kspace.get_size(3);
+
+            size_t srcCHA = adj_forward_G_I.get_size(2);
+            size_t dstCHA = adj_forward_G_I.get_size(3);
+
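+            // optionally normalize the k-space magnitude before the non-linear solve; the scale is restored on the result afterwards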
+            if ( workOrder2DT->spirit_2D_scale_per_chunk_ )
+            {
+                typename realType<T>::Type scaleFactor = 1.0;
+                Gadgetron::norm2(kspace, scaleFactor);
+                scaleFactor /= (typename realType<T>::Type)( (RO*std::sqrt(double(srcCHA))) );
+
+                workOrder2DT->spirit_ncg_scale_factor_ = scaleFactor;
+            }
+
+            // apply the scale
+            if ( workOrder2DT->spirit_ncg_scale_factor_ > 0 )
+            {
+                Gadgetron::scal( static_cast<value_type>(1.0/workOrder2DT->spirit_ncg_scale_factor_), kspaceLinear);
+                Gadgetron::scal( static_cast<value_type>(1.0/workOrder2DT->spirit_ncg_scale_factor_), kspace);
+            }
+
+            boost::shared_ptr< hoNDArray<T> > coilMapS;
+            
+            if ( workOrder2DT->coilMap_ )
+            {
+                if ( refN < N )
+                {
+                    coilMapS = boost::shared_ptr< hoNDArray<T> >(new hoNDArray<T>(RO, E1, dstCHA, workOrder2DT->coilMap_->begin()));
+                }
+                else
+                {
+                    coilMapS = boost::shared_ptr< hoNDArray<T> >(new hoNDArray<T>(RO, E1, dstCHA, refN, workOrder2DT->coilMap_->begin()+s*RO*E1*dstCHA*refN));
+                }
+            }
+
+            if ( N > 1 )
+            {
+                // 2D+T
+                boost::shared_ptr<hoNDArray<T> > ker(new hoNDArray<T>(RO, E1, srcCHA, dstCHA, refN, adj_forward_G_I.begin()));
+                boost::shared_ptr<hoNDArray<T> > acq(new hoNDArray<T>(RO, E1, srcCHA, N, kspace.begin()));
+
+                gtPlusNCGSolver<hoNDArray<T>, hoNDArray<T>, gtPlusOperator<T> > ncgsolver;
+                ncgsolver.iterMax_ = workOrder2DT->spirit_ncg_iter_max_;
+                ncgsolver.printIter_ = workOrder2DT->spirit_ncg_print_iter_;
+                ncgsolver.secantRatio_ = 1;
+                ncgsolver.x0_ = &kspaceLinear;
+
+                hoNDArray<T> b;
+
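+                // without a data fidelity term the null-space formulation is used; otherwise an explicit data fidelity operator is added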
+                if ( workOrder2DT->spirit_data_fidelity_lamda_ <= 0 )
+                {
+                    // parallel imaging term
+                    gtPlusSPIRIT2DTOperator<T> spirit;
+                    spirit.use_symmetric_spirit_ = false;
+                    spirit.setForwardKernel(ker, true);
+                    spirit.setAcquiredPoints(acq);
+
+                    // L1 term
+                    gtPlusWavelet3DOperator<T> wavNullSpace3DOperator;
+                    wavNullSpace3DOperator.setAcquiredPoints(acq);
+                    wavNullSpace3DOperator.scale_factor_first_dimension_ = (value_type)workOrder2DT->spirit_RO_enhancement_ratio_;
+                    wavNullSpace3DOperator.scale_factor_second_dimension_ = (value_type)workOrder2DT->spirit_E1_enhancement_ratio_;
+                    wavNullSpace3DOperator.scale_factor_third_dimension_ = (value_type)workOrder2DT->spirit_temporal_enhancement_ratio_;
+
+                    if ( workOrder2DT->spirit_use_coil_sen_map_ && workOrder2DT->coilMap_ )
+                    {
+                        wavNullSpace3DOperator.setCoilSenMap(coilMapS);
+                    }
+
+                    // set operators
+                    ncgsolver.add(spirit, T( (value_type)workOrder2DT->spirit_parallel_imaging_lamda_ ) );
+                    ncgsolver.add(wavNullSpace3DOperator, T( (value_type)workOrder2DT->spirit_image_reg_lamda_ ) );
+
+                    if ( performTiming_ ) { gt_timer3_.start("NCG spirit solver for 2DT ... "); }
+                    ncgsolver.solve(b, res);
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"ncg_spirit_2DT_res"); }
+
+                    spirit.restoreAcquiredKSpace(kspace, res);
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"ncg_spirit_2DT_res_restored"); }
+                }
+                else
+                {
+                    gtPlusSPIRITNoNullSpace2DTOperator<T> spirit_noNullSpace;
+                    spirit_noNullSpace.use_symmetric_spirit_ = false;
+                    spirit_noNullSpace.setForwardKernel(ker, true);
+                    spirit_noNullSpace.setAcquiredPoints(acq);
+
+                    gtPlusDataFidelityOperator<T> dataOper;
+                    dataOper.setAcquiredPoints(acq);
+
+                    gtPlusWaveletNoNullSpace3DOperator<T> wavNoNullSpace3DOperator;
+                    wavNoNullSpace3DOperator.setAcquiredPoints(acq);
+                    wavNoNullSpace3DOperator.scale_factor_first_dimension_ = (value_type)workOrder2DT->spirit_RO_enhancement_ratio_;
+                    wavNoNullSpace3DOperator.scale_factor_second_dimension_ = (value_type)workOrder2DT->spirit_E1_enhancement_ratio_;
+                    wavNoNullSpace3DOperator.scale_factor_third_dimension_ = (value_type)workOrder2DT->spirit_temporal_enhancement_ratio_;
+
+                    if ( workOrder2DT->spirit_use_coil_sen_map_ && workOrder2DT->coilMap_ )
+                    {
+                        wavNoNullSpace3DOperator.setCoilSenMap(coilMapS);
+                    }
+
+                    ncgsolver.add(spirit_noNullSpace, T( (value_type)workOrder2DT->spirit_parallel_imaging_lamda_ ) );
+                    ncgsolver.add(wavNoNullSpace3DOperator, T( (value_type)workOrder2DT->spirit_image_reg_lamda_ ) );
+                    ncgsolver.add(dataOper, T( (value_type)workOrder2DT->spirit_data_fidelity_lamda_ ) );
+
+                    if ( performTiming_ ) { gt_timer3_.start("NCG spirit solver for 2DT without null space ... "); }
+                    ncgsolver.solve(b, res);
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"ncg_spirit_2DT_res_noNullSpace"); }
+                }
+            }
+            else
+            {
+                // 2D
+                boost::shared_ptr<hoNDArray<T> > ker(new hoNDArray<T>(RO, E1, srcCHA, dstCHA, adj_forward_G_I.begin()));
+                boost::shared_ptr<hoNDArray<T> > acq(new hoNDArray<T>(RO, E1, srcCHA, kspace.begin()));
+
+                gtPlusNCGSolver<hoNDArray<T>, hoNDArray<T>, gtPlusOperator<T> > ncgsolver;
+                ncgsolver.iterMax_ = workOrder2DT->spirit_ncg_iter_max_;
+                ncgsolver.printIter_ = workOrder2DT->spirit_ncg_print_iter_;
+                ncgsolver.secantRatio_ = 1;
+                ncgsolver.x0_ = &kspaceLinear;
+
+                hoNDArray<T> b;
+
+                if ( workOrder2DT->spirit_data_fidelity_lamda_ <= 0 )
+                {
+                    // parallel imaging term
+                    gtPlusSPIRIT2DOperator<T> spirit;
+                    spirit.use_symmetric_spirit_ = false;
+                    spirit.setForwardKernel(ker, true);
+                    spirit.setAcquiredPoints(acq);
+
+                    // L1 term
+                    gtPlusWavelet2DOperator<T> wavNullSpace2DOperator;
+                    wavNullSpace2DOperator.setAcquiredPoints(acq);
+
+                    if ( workOrder2DT->spirit_use_coil_sen_map_ && workOrder2DT->coilMap_ )
+                    {
+                        wavNullSpace2DOperator.setCoilSenMap(coilMapS);
+                    }
+
+                    // set operators
+                    ncgsolver.add(spirit, T( (value_type)workOrder2DT->spirit_parallel_imaging_lamda_ ) );
+                    ncgsolver.add(wavNullSpace2DOperator, T( (value_type)workOrder2DT->spirit_image_reg_lamda_ ) );
+
+                    if ( performTiming_ ) { gt_timer3_.start("NCG spirit solver for 2D ... "); }
+                    ncgsolver.solve(b, res);
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"ncg_spirit_2D_res"); }
+
+                    spirit.restoreAcquiredKSpace(kspace, res);
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"ncg_spirit_2D_res_restored"); }
+                }
+                else
+                {
+                    gtPlusSPIRITNoNullSpace2DOperator<T> spirit_noNullSpace;
+                    spirit_noNullSpace.use_symmetric_spirit_ = false;
+                    spirit_noNullSpace.setForwardKernel(ker, true);
+                    spirit_noNullSpace.setAcquiredPoints(acq);
+
+                    gtPlusDataFidelityOperator<T> dataOper;
+                    dataOper.setAcquiredPoints(acq);
+
+                    gtPlusWaveletNoNullSpace2DOperator<T> wavNoNullSpace2DOperator;
+                    wavNoNullSpace2DOperator.setAcquiredPoints(acq);
+
+                    if ( workOrder2DT->spirit_use_coil_sen_map_ && workOrder2DT->coilMap_ )
+                    {
+                        wavNoNullSpace2DOperator.setCoilSenMap(coilMapS);
+                    }
+
+                    ncgsolver.add(spirit_noNullSpace, T( (value_type)workOrder2DT->spirit_parallel_imaging_lamda_ ) );
+                    ncgsolver.add(wavNoNullSpace2DOperator, T( (value_type)workOrder2DT->spirit_image_reg_lamda_ ) );
+                    ncgsolver.add(dataOper, T( (value_type)workOrder2DT->spirit_data_fidelity_lamda_ ) );
+
+                    if ( performTiming_ ) { gt_timer3_.start("NCG spirit solver for 2D without null space ... "); }
+                    ncgsolver.solve(b, res);
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"ncg_spirit_2D_res_noNullSpace"); }
+                }
+            }
+
+            Gadgetron::scal(T( (value_type)workOrder2DT->spirit_ncg_scale_factor_ ), res);
+        }
+        else
+        {
+            res = kspaceLinear;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DTL1SPIRITNCG<T>::performUnwarppingImpl(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DTL1SPIRITNCG<T>::
+performUnwarppingImpl(gtPlusReconJob2DT<T>& job)
+{
+    try
+    {
+        hoNDArray<T>& kspace = job.kspace;
+        hoNDArray<T>& ker = job.ker;
+        hoNDArray<T>& res = job.res;
+        gtPlusReconWorkOrder<T>* workOrder2DT = &(job.workOrder2DT);
+
+        GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImpl(workOrder2DT, kspace, ker, res, job.job_index_S_));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DTL1SPIRITNCG<T>::performUnwarppingImpl(job) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTNoAcceleration.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTNoAcceleration.h
new file mode 100644
index 0000000..be6d6d6
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTNoAcceleration.h
@@ -0,0 +1,155 @@
+/** \file   gtPlusISMRMRDReconWorker2DTNoAcceleration.h
+    \brief  Implement the 2DT reconstruction without k-space undersampling
+    \author Hui Xue
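+
+    A minimal usage sketch (illustrative only; the work order is assumed to be filled in by
+    the surrounding gtPlus workflow):
+
+    \code
+    gtPlusReconWorker2DTNoAcceleration< std::complex<float> > worker;
+    worker.performRecon(&workOrder2DT);   // coil map estimation and coil combination
+    \endcode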
+*/
+
+#pragma once
+
+#include "ismrmrd/ismrmrd.h"
+
+#include "GadgetronTimer.h"
+
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorker2DT.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusReconWorker2DTNoAcceleration : public gtPlusReconWorker2DT<T>
+{
+public:
+
+    typedef gtPlusReconWorker2DT<T> BaseClass;
+    typedef typename BaseClass::value_type value_type;
+
+    gtPlusReconWorker2DTNoAcceleration() : BaseClass() {}
+    virtual ~gtPlusReconWorker2DTNoAcceleration() {}
+
+    virtual bool performRecon(gtPlusReconWorkOrder2DT<T>* workOrder2DT);
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::verbose_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+
+    using BaseClass::buffer2DT_;
+    using BaseClass::buffer2DT_unwrapping_;
+    using BaseClass::buffer2DT_partial_fourier_;
+    using BaseClass::buffer2DT_partial_fourier_kspaceIter_;
+    using BaseClass::ref_src_;
+    using BaseClass::ref_dst_;
+    using BaseClass::data_dst_;
+    using BaseClass::ref_coil_map_dst_;
+    using BaseClass::startE1_;
+    using BaseClass::endE1_;
+};
+
+template <typename T> 
+bool gtPlusReconWorker2DTNoAcceleration<T>::performRecon(gtPlusReconWorkOrder2DT<T>* workOrder2DT)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(workOrder2DT!=NULL);
+
+        if ( !workOrder2DT->workFlow_use_BufferedKernel_ )
+        {
+            if ( performTiming_ ) { gt_timer1_.start("prepRef"); }
+            GADGET_CHECK_RETURN_FALSE(this->prepRef(workOrder2DT, workOrder2DT->ref_, workOrder2DT->ref_recon_, workOrder2DT->ref_coil_map_, 
+                                                workOrder2DT->start_RO_, workOrder2DT->end_RO_, workOrder2DT->start_E1_, workOrder2DT->end_E1_, workOrder2DT->data_.get_size(1)));
+            if ( performTiming_ ) { gt_timer1_.stop(); }
+        }
+
+        size_t RO = workOrder2DT->data_.get_size(0);
+        size_t E1 = workOrder2DT->data_.get_size(1);
+        size_t CHA = workOrder2DT->data_.get_size(2);
+        size_t N = workOrder2DT->data_.get_size(3);
+        size_t S = workOrder2DT->data_.get_size(4);
+
+        size_t refN = workOrder2DT->ref_recon_.get_size(3);
+        size_t usedS;
+
+        // estimate the coil sensitivity maps if they are not already buffered or their dimensions do not match
+        if ( !workOrder2DT->workFlow_use_BufferedKernel_ 
+                    || (workOrder2DT->coilMap_->get_size(0)!=RO) 
+                    || (workOrder2DT->coilMap_->get_size(1)!=E1)
+                    || (workOrder2DT->coilMap_->get_size(4)!=S) )
+        {
+            workOrder2DT->coilMap_->create(RO, E1, CHA, refN, S);
+
+            // estimate the coil sensitivity
+            if ( workOrder2DT->no_acceleration_same_combinationcoeff_allS_ )
+            {
+                usedS = workOrder2DT->no_acceleration_whichS_combinationcoeff_;
+                if ( usedS >= S ) usedS = S-1;
+
+                hoNDArray<T> refCoilMapS(RO, E1, CHA, refN, workOrder2DT->ref_coil_map_.begin()+usedS*RO*E1*CHA*refN);
+                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(refCoilMapS, buffer2DT_);
+
+                hoNDArray<T> coilMapS(RO, E1, CHA, refN, workOrder2DT->coilMap_->begin()+usedS*RO*E1*CHA*refN);
+
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIH(buffer2DT_, 
+                        coilMapS, workOrder2DT->coil_map_algorithm_, workOrder2DT->csm_kSize_, workOrder2DT->csm_powermethod_num_, workOrder2DT->csm_iter_num_, (value_type)workOrder2DT->csm_iter_thres_, workOrder2DT->csm_use_gpu_));
+
+                GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder2DT->coilMap_, usedS));
+            }
+            else
+            {
+                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(workOrder2DT->ref_coil_map_, buffer2DT_);
+
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIH(buffer2DT_, 
+                        *workOrder2DT->coilMap_, workOrder2DT->coil_map_algorithm_, workOrder2DT->csm_kSize_, workOrder2DT->csm_powermethod_num_, workOrder2DT->csm_iter_num_, (value_type)workOrder2DT->csm_iter_thres_, workOrder2DT->csm_use_gpu_));
+            }
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*workOrder2DT->coilMap_, debugFolder_+"coilMap_"); }
+        }
+
+        // partial fourier handling
+        GADGET_CHECK_RETURN_FALSE(this->performPartialFourierHandling(workOrder2DT));
+
+        workOrder2DT->complexIm_.create(RO, E1, N, S);
+
+        if ( performTiming_ ) { gt_timer1_.start("perform coil combination"); }
+
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(workOrder2DT->data_, buffer2DT_unwrapping_);
+
+        /*if ( refN == N )
+        {*/
+            gtPlusISMRMRDReconUtilComplex<T>().coilCombine(buffer2DT_unwrapping_, *(workOrder2DT->coilMap_), workOrder2DT->complexIm_ );
+        /*}
+        else
+        {
+            for ( usedS=0; usedS<S; usedS++ )
+            {
+                hoNDArray<T> unwarppedIm(RO, E1, CHA, N, buffer2DT_unwrapping_.begin()+usedS*RO*E1*CHA*N);
+                hoNDArray<T> combined(RO, E1, N, workOrder2DT->complexIm_.begin()+usedS*RO*E1*N);
+
+                if ( refN == N )
+                {
+                    hoNDArray<T> coilMap(RO, E1, CHA, refN, workOrder2DT->coilMap_->begin()+usedS*RO*E1*CHA*refN);
+                    gtPlusISMRMRDReconUtilComplex<T>().coilCombine(unwarppedIm, coilMap, combined);
+                }
+                else
+                {
+                    hoNDArray<T> coilMap(RO, E1, CHA, workOrder2DT->coilMap_->begin()+usedS*RO*E1*CHA*N);
+                    gtPlusISMRMRDReconUtilComplex<T>().coilCombine(unwarppedIm, coilMap, combined);
+                }
+            }
+        }*/
+        if ( performTiming_ ) { gt_timer1_.stop(); }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder2DT->complexIm_, debugFolder_+"combined"); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DTNoAcceleration<T>::performRecon(gtPlusReconWorkOrder2DT<T>* workOrder2DT) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTSPIRIT.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTSPIRIT.h
new file mode 100644
index 0000000..7d8abc4
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTSPIRIT.h
@@ -0,0 +1,734 @@
+/** \file   gtPlusISMRMRDReconWorker2DTSPIRIT.h
+    \brief  Implement the 2DT linear SPIRIT reconstruction
+    \author Hui Xue
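+
+    A minimal usage sketch (illustrative only; the data and reference arrays are assumed to
+    be prepared by the surrounding gtPlus 2DT workflow):
+
+    \code
+    gtPlusReconWorker2DTSPIRIT< std::complex<float> > worker;
+    worker.performCalibPrep(ref_src, ref_dst, &workOrder2DT);        // allocate kernel buffers
+    worker.performCalibImpl(ref_src, ref_dst, &workOrder2DT, 0, 0);  // calibrate for n = 0, s = 0
+    worker.performUnwrapping(&workOrder2DT, data_dst);               // linear SPIRIT unwrapping
+    \endcode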
+*/
+
+#pragma once
+
+#include "ismrmrd/ismrmrd.h"
+#include "GadgetronTimer.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorker2DT.h"
+#include "gtPlusSPIRIT.h"
+#include "gtPlusSPIRIT2DTOperator.h"
+#include "gtPlusLSQRSolver.h"
+
+#include "GadgetCloudController.h"
+#include "GadgetCloudJobMessageReadWrite.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusReconWorker2DTSPIRIT : public gtPlusReconWorker2DT<T>
+{
+public:
+
+    typedef gtPlusReconWorker2DT<T> BaseClass;
+    typedef typename realType<T>::Type value_type;
+
+    gtPlusReconWorker2DTSPIRIT() : BaseClass() {}
+    virtual ~gtPlusReconWorker2DTSPIRIT() {}
+
+    virtual bool performCalibPrep(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, gtPlusReconWorkOrder2DT<T>* workOrder2DT);
+    virtual bool performCalibImpl(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, gtPlusReconWorkOrder2DT<T>* workOrder2DT, size_t n, size_t usedS);
+
+    virtual bool performUnwarppingImpl(gtPlusReconJob2DT<T>& job);
+    virtual bool performUnwarppingImpl(gtPlusReconWorkOrder<T>* workOrder2DT, hoNDArray<T>& kspace, hoNDArray<T>& adj_forward_G_I, hoNDArray<T>& res, size_t s);
+    virtual bool performUnwrapping(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& data);
+
+    virtual bool autoReconParameter(gtPlusReconWorkOrder<T>* workOrder);
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::verbose_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_cplx_;
+
+    using BaseClass::buffer2DT_;
+    using BaseClass::buffer2DT_unwrapping_;
+    using BaseClass::buffer2DT_partial_fourier_;
+    using BaseClass::buffer2DT_partial_fourier_kspaceIter_;
+    using BaseClass::ref_src_;
+    using BaseClass::ref_dst_;
+    using BaseClass::data_dst_;
+    using BaseClass::ref_coil_map_dst_;
+    using BaseClass::startE1_;
+    using BaseClass::endE1_;
+
+    gtPlusSPIRIT<T> spirit_;
+};
+
+template <typename T> 
+bool gtPlusReconWorker2DTSPIRIT<T>::
+performCalibPrep(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, gtPlusReconWorkOrder2DT<T>* workOrder2DT)
+{
+    try
+    {
+        size_t RO = workOrder2DT->data_.get_size(0);
+        size_t E1 = workOrder2DT->data_.get_size(1);
+        size_t N = workOrder2DT->data_.get_size(3);
+        size_t S = workOrder2DT->data_.get_size(4);
+
+        size_t srcCHA = ref_src.get_size(2);
+
+        size_t refRO = ref_dst.get_size(0);
+        size_t refE1 = ref_dst.get_size(1);
+        size_t refN = ref_dst.get_size(3);
+        size_t dstCHA = ref_dst.get_size(2);
+
+        size_t kRO = workOrder2DT->spirit_kSize_RO_;
+        size_t kE1 = workOrder2DT->spirit_kSize_E1_;
+        size_t oRO = workOrder2DT->spirit_oSize_RO_;
+        size_t oE1 = workOrder2DT->spirit_oSize_E1_;
+
+        workOrder2DT->kernel_->create(kRO, kE1, srcCHA, dstCHA, oRO, oE1, refN, S);
+        workOrder2DT->kernelIm_->create(RO, E1, srcCHA, dstCHA, refN, S);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DTSPIRIT<T>::performCalibPrep(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DTSPIRIT<T>::autoReconParameter(gtPlusReconWorkOrder<T>* workOrder)
+{
+    gtPlusReconWorkOrder2DT<T>* workOrder2DT = dynamic_cast<gtPlusReconWorkOrder2DT<T>*>(workOrder);
+    if ( workOrder2DT == NULL ) return false;
+
+    double maxAcceFactor = workOrder2DT->acceFactorE1_;
+
+    if ( maxAcceFactor>=6 )
+    {
+        workOrder2DT->spirit_iter_max_ = 150;
+        workOrder2DT->spirit_iter_thres_ = 0.0015;
+        workOrder2DT->spirit_reg_lamda_ = 0.005;
+    }
+    else if ( maxAcceFactor>=5 )
+    {
+        workOrder2DT->spirit_iter_max_ = 120;
+        workOrder2DT->spirit_iter_thres_ = 0.0015;
+        workOrder2DT->spirit_reg_lamda_ = 0.005;
+    }
+    else if ( maxAcceFactor>=4 )
+    {
+        workOrder2DT->spirit_iter_max_ = 100;
+        workOrder2DT->spirit_iter_thres_ = 0.0015;
+        workOrder2DT->spirit_reg_lamda_ = 0.005;
+    }
+    else if ( maxAcceFactor>=3 )
+    {
+        workOrder2DT->spirit_iter_max_ = 60;
+        workOrder2DT->spirit_iter_thres_ = 0.0015;
+        workOrder2DT->spirit_reg_lamda_ = 0.005;
+    }
+    else
+    {
+        workOrder2DT->spirit_iter_max_ = 50;
+        workOrder2DT->spirit_iter_thres_ = 0.0015;
+        workOrder2DT->spirit_reg_lamda_ = 0.005;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DTSPIRIT<T>::
+performCalibImpl(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, gtPlusReconWorkOrder2DT<T>* workOrder2DT, size_t n, size_t usedS)
+{
+    try
+    {
+        size_t RO = workOrder2DT->data_.get_size(0);
+        size_t E1 = workOrder2DT->data_.get_size(1);
+        size_t N = workOrder2DT->data_.get_size(3);
+        size_t S = workOrder2DT->data_.get_size(4);
+
+        size_t srcCHA = ref_src.get_size(2);
+
+        size_t refRO = ref_dst.get_size(0);
+        size_t refE1 = ref_dst.get_size(1);
+        size_t refN = ref_dst.get_size(3);
+        size_t dstCHA = ref_dst.get_size(2);
+
+        size_t kRO = workOrder2DT->spirit_kSize_RO_;
+        size_t kE1 = workOrder2DT->spirit_kSize_E1_;
+        size_t oRO = workOrder2DT->spirit_oSize_RO_;
+        size_t oE1 = workOrder2DT->spirit_oSize_E1_;
+
+        ho3DArray<T> acsSrc(refRO, refE1, srcCHA, const_cast<T*>(ref_src.begin()+n*refRO*refE1*srcCHA+usedS*refRO*refE1*srcCHA*refN));
+        ho3DArray<T> acsDst(refRO, refE1, dstCHA, const_cast<T*>(ref_dst.begin()+n*refRO*refE1*dstCHA+usedS*refRO*refE1*dstCHA*refN));
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(acsSrc, debugFolder_+"acsSrc"); }
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(acsDst, debugFolder_+"acsDst"); }
+
+        ho6DArray<T> ker(kRO, kE1, srcCHA, dstCHA, oRO, oE1, 
+                            workOrder2DT->kernel_->begin()
+                            +n*kRO*kE1*srcCHA*dstCHA*oRO*oE1
+                            +usedS*kRO*kE1*srcCHA*dstCHA*oRO*oE1*refN);
+
+        gtPlusSPIRIT2DOperator<T> spirit;
+        spirit.calib_use_gpu_ = workOrder2DT->spirit_use_gpu_;
+
+        spirit.calib(acsSrc, acsDst, workOrder2DT->spirit_reg_lamda_, kRO, kE1, oRO, oE1, ker);
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(ker, debugFolder_+"ker"); }
+
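+        // convert the calibrated SPIRIT kernel to an image-domain kernel; with minusI the -I term is included (i.e. G - I)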
+        bool minusI = true;
+
+        hoNDArray<T> kIm(RO, E1, srcCHA, dstCHA, workOrder2DT->kernelIm_->begin()+n*RO*E1*srcCHA*dstCHA+usedS*RO*E1*srcCHA*dstCHA*refN);
+        GADGET_CHECK_RETURN_FALSE(spirit.imageDomainKernel(ker, kRO, kE1, oRO, oE1, RO, E1, kIm, minusI));
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kIm, debugFolder_+"kIm"); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DTSPIRIT<T>::performCalibImpl(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DTSPIRIT<T>::
+performUnwarppingImpl(gtPlusReconWorkOrder<T>* workOrder2DT, hoNDArray<T>& kspace, hoNDArray<T>& adj_forward_G_I, hoNDArray<T>& res, size_t s)
+{
+    try
+    {
+        size_t refN = adj_forward_G_I.get_size(4);
+
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t N = kspace.get_size(3);
+
+        size_t srcCHA = adj_forward_G_I.get_size(2);
+        size_t dstCHA = adj_forward_G_I.get_size(3);
+
+        res.create(kspace.get_dimensions());
+
+        long long n;
+
+        #ifdef USE_OMP
+            int numThreads = (int)( (N<64) ? N : 64 );
+
+            int numOpenMPProcs = omp_get_num_procs();
+            GDEBUG_STREAM("gtPlusReconWorker2DTSPIRIT, numOpenMPProcs : " << numOpenMPProcs);
+
+            if ( numThreads > numOpenMPProcs ) numThreads = numOpenMPProcs;
+
+            int maxOpenMPThreads = omp_get_max_threads();
+
+            GDEBUG_STREAM("gtPlusReconWorker2DTSPIRIT, maxOpenMPThreads : " << maxOpenMPThreads);
+
+            int allowOpenMPNested = omp_get_nested();
+
+            if ( N < numOpenMPProcs-2 )
+            {
+                omp_set_nested(1);
+                allowOpenMPNested = 1;
+            }
+            else
+            {
+                omp_set_nested(0);
+                allowOpenMPNested = 0;
+            }
+
+            GDEBUG_STREAM("gtPlusReconWorker2DTSPIRIT, allowOpenMPNested : " << allowOpenMPNested);
+            GDEBUG_STREAM("gtPlusReconWorker2DTSPIRIT, numThreads : " << numThreads);
+        #endif
+
+        GDEBUG_STREAM("gtPlusReconWorker2DTSPIRIT, processing starts ... ");
+
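+        // the SPIRIT operator runs with non-centered ffts, so the kernel and the k-space are ifft-shifted once up front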
+        hoNDArray<T> ker_Shifted(adj_forward_G_I);
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifftshift2D(adj_forward_G_I, ker_Shifted);
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(ker_Shifted, debugFolder_+"ker_Shifted"); }
+
+        hoNDArray<T> kspace_Shifted;
+        kspace_Shifted = kspace;
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifftshift2D(kspace, kspace_Shifted);
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace_Shifted, debugFolder_+"kspace_Shifted"); }
+
+        hoNDArray<T> kspace_initial_Shifted;
+        bool hasInitial = false;
+        if ( workOrder2DT->kspace_initial_.dimensions_equal(&kspace) )
+        {
+            kspace_initial_Shifted = workOrder2DT->kspace_initial_;
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifftshift2D(workOrder2DT->kspace_initial_, kspace_initial_Shifted);
+            hasInitial = true;
+        }
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace_initial_Shifted, debugFolder_+"kspace_initial_Shifted"); }
+
+        #pragma omp parallel default(none) private(n) shared(RO, E1, srcCHA, dstCHA, kspace, kspace_Shifted, kspace_initial_Shifted, ker_Shifted, workOrder2DT, res, refN, N, hasInitial) num_threads(numThreads)
+        {
+            gtPlusSPIRIT2DOperator<T> spirit;
+            // spirit.setMemoryManager(gtPlus_mem_manager_);
+            spirit.use_symmetric_spirit_ = false;
+            spirit.use_non_centered_fft_ = true;
+
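+            // a single kernel shared by all frames is set once outside the frame loop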
+            if ( refN == 1 )
+            {
+                boost::shared_ptr<hoNDArray<T> > ker(new hoNDArray<T>(RO, E1, srcCHA, dstCHA, ker_Shifted.begin()));
+                spirit.setForwardKernel(ker, false);
+            }
+
+            gtPlusLSQRSolver<hoNDArray<T>, hoNDArray<T>, gtPlusSPIRIT2DOperator<T> > cgSolver;
+
+            cgSolver.iterMax_ = workOrder2DT->spirit_iter_max_;
+            cgSolver.thres_ = (value_type)workOrder2DT->spirit_iter_thres_;
+            cgSolver.printIter_ = workOrder2DT->spirit_print_iter_;
+
+            cgSolver.set(spirit);
+
+            hoNDArray<T> b(RO, E1, srcCHA);
+            hoNDArray<T> unwarppedKSpace(RO, E1, dstCHA);
+
+            #pragma omp for
+            for ( n=0; n<(long long)N; n++ )
+            {
+                // check whether the kspace is undersampled
+                bool undersampled = false;
+                for ( size_t e1=0; e1<E1; e1++ )
+                {
+                    if ( (std::abs( kspace(RO/2, e1, srcCHA-1, n) ) == 0)
+                        && (std::abs( kspace(RO/2, e1, 0, n) ) == 0) )
+                    {
+                        undersampled = true;
+                        break;
+                    }
+                }
+
+                if ( !undersampled )
+                {
+                    memcpy(res.begin()+n*RO*E1*dstCHA, kspace_Shifted.begin()+n*RO*E1*srcCHA, sizeof(T)*RO*E1*dstCHA);
+                    continue;
+                }
+
+                long long kernelN = n;
+                if ( kernelN >= (long long)refN ) kernelN = (long long)refN-1;
+
+                boost::shared_ptr< hoNDArray<T> > acq(new hoNDArray<T>(RO, E1, srcCHA, kspace_Shifted.begin()+n*RO*E1*srcCHA));
+                spirit.setAcquiredPoints(acq);
+
+                boost::shared_ptr< hoNDArray<T> > initialAcq;
+                if ( hasInitial )
+                {
+                    initialAcq = boost::shared_ptr< hoNDArray<T> >(new hoNDArray<T>(RO, E1, srcCHA, kspace_initial_Shifted.begin()+n*RO*E1*srcCHA));
+                    cgSolver.x0_ = initialAcq.get();
+                }
+                else
+                {
+                    cgSolver.x0_ = acq.get();
+                }
+
+                if ( refN > 1 )
+                {
+                    boost::shared_ptr<hoNDArray<T> > ker(new hoNDArray<T>(RO, E1, srcCHA, dstCHA, ker_Shifted.begin()+kernelN*RO*E1*srcCHA*dstCHA));
+                    spirit.setForwardKernel(ker, false);
+
+                    // compute rhs
+                    spirit.computeRighHandSide(*acq, b);
+
+                    // solve
+                    cgSolver.solve(b, unwarppedKSpace);
+                }
+                else
+                {
+                    // compute rhs
+                    spirit.computeRighHandSide(*acq, b);
+
+                    // solve
+                    cgSolver.solve(b, unwarppedKSpace);
+                }
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(unwarppedKSpace, debugFolder_+"unwarppedKSpace_n"); }
+
+                // restore the acquired points
+                spirit.restoreAcquiredKSpace(*acq, unwarppedKSpace);
+
+                memcpy(res.begin()+n*RO*E1*dstCHA, unwarppedKSpace.begin(), unwarppedKSpace.get_number_of_bytes());
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(unwarppedKSpace, debugFolder_+"unwarppedKSpace_n_setAcq"); }
+            }
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"res_Shifted"); }
+
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fftshift2D(res, kspace_Shifted);
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace_Shifted, debugFolder_+"res"); }
+        res = kspace_Shifted;
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DTSPIRIT<T>::performUnwarppingImpl(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DTSPIRIT<T>::
+performUnwrapping(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& data_dst)
+{
+    try
+    {
+        size_t RO = workOrder2DT->data_.get_size(0);
+        size_t E1 = workOrder2DT->data_.get_size(1);
+        size_t N = workOrder2DT->data_.get_size(3);
+        size_t S = workOrder2DT->data_.get_size(4);
+
+        size_t srcCHA = workOrder2DT->kernelIm_->get_size(2);
+        size_t dstCHA = workOrder2DT->kernelIm_->get_size(3);
+
+        size_t refN = workOrder2DT->kernelIm_->get_size(4);
+
+        size_t usedS;
+
+        // compute the scaling factor
+        typename realType<T>::Type scaleFactor = 1.0;
+        int numOfNForScaling = 100;
+        if ( N > numOfNForScaling )
+        {
+            hoNDArray<T> kspaceForScaleFactor(RO, E1, srcCHA, numOfNForScaling, const_cast<T*>(data_dst.begin()));
+            Gadgetron::norm2(kspaceForScaleFactor, scaleFactor);
+            scaleFactor /= (value_type)(numOfNForScaling*std::sqrt(double(srcCHA)));
+        }
+        else
+        {
+            Gadgetron::norm2(data_dst, scaleFactor);
+            scaleFactor /= (value_type)(N*std::sqrt(double(srcCHA)));
+        }
+
+        if ( workOrder2DT->spirit_ncg_scale_factor_ < 0 )
+        {
+            workOrder2DT->spirit_ncg_scale_factor_ = scaleFactor;
+        }
+        else
+        {
+            GDEBUG_STREAM("SPIRIT - 2DT - spirit_ncg_scale_factor_ is preset : " << workOrder2DT->spirit_ncg_scale_factor_ << " ... ");
+        }
+
+        // split the jobs
+        bool splitByS = workOrder2DT->job_split_by_S_;
+        size_t jobN = workOrder2DT->job_num_of_N_;
+        size_t jobMegaBytes = workOrder2DT->job_max_Megabytes_;
+        size_t overlapN = workOrder2DT->job_overlap_;
+        size_t maxNumOfBytesPerJob = jobMegaBytes*1024*1024;
+
+        if ( workOrder2DT->recon_algorithm_==ISMRMRD_SPIRIT )
+        {
+            overlapN = 0;
+        }
+
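+        // decide whether to split the reconstruction into jobs: by S, by a fixed number of frames, or when the per-job memory limit would be exceeded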
+        bool splitJobs = (splitByS==true || jobN>0);
+        if ( !splitJobs )
+        {
+            if ( jobMegaBytes>0 )
+            {
+                size_t jobN = jobMegaBytes/(RO*E1*srcCHA*dstCHA*sizeof(T)/1024/1024);
+                if ( jobN < N ) splitJobs = true;
+                GDEBUG_STREAM("SPIRIT - 2DT - size of largest job : " << jobN);
+            }
+        }
+
+        if ( !workOrder2DT->CloudComputing_ )
+        {
+            if ( jobN >= N ) splitJobs = false;
+        }
+
+        if ( splitJobs )
+        {
+            bool runJobsOnCloud = workOrder2DT->CloudComputing_;
+            unsigned int cloudSize = workOrder2DT->CloudSize_;
+            bool runJobsOnLocalNode = workOrder2DT->job_perform_on_control_node_;
+
+            std::vector<gtPlusReconJob2DT<T> > jobList;
+
+            if ( runJobsOnCloud )
+            {
+                unsigned int j;
+
+                GADGET_CHECK_RETURN_FALSE(this->estimateJobSize(workOrder2DT, maxNumOfBytesPerJob, overlapN, cloudSize, jobN));
+
+                //GDEBUG_STREAM("SPIRIT - 2DT - cloudSize is " << cloudSize << " - N is " << N << " ... ");
+                //unsigned int nodeN = cloudSize;
+                //if ( runJobsOnLocalNode ) nodeN++;
+                //GDEBUG_STREAM("SPIRIT - 2DT - runJobsOnLocalNode is " << runJobsOnLocalNode << " - nodeN is " << nodeN << " - overlapN is " << overlapN << " ... ");
+
+                //// adjust jobN according to cloud size
+                //jobN = std::ceil( (double)(N+overlapN*(nodeN-1))/(double)nodeN );
+
+                //size_t numOfBytesPerJob = sizeof(T)*( RO*E1*srcCHA*dstCHA*jobN + 2*RO*E1*srcCHA*jobN );
+
+                //while ( numOfBytesPerJob > 2.0*1024*1024*1024-64.0*1024*1024 )
+                //{
+                //    nodeN *= 2;
+                //    jobN = std::ceil( (double)N/nodeN + (double)(overlapN*(nodeN-1))/nodeN );
+                //    numOfBytesPerJob = sizeof(T)*( RO*E1*srcCHA*dstCHA*jobN + 2*RO*E1*srcCHA*jobN );
+                //}
+
+                //GDEBUG_STREAM("SPIRIT - 2DT - jobN is " << jobN << "; every job has " << numOfBytesPerJob/1024.0/1024 << " MBytes ... ");
+
+                // split the job
+                GADGET_CHECK_RETURN_FALSE(this->splitReconJob(workOrder2DT, const_cast<hoNDArray<T>&>(data_dst), *(workOrder2DT->kernelIm_), splitByS, jobN, jobMegaBytes, overlapN, jobList));
+
+                if ( runJobsOnLocalNode )
+                {
+                    while ( jobList.size() <= cloudSize )
+                    {
+                        jobN--;
+                        jobList.clear();
+                        GADGET_CHECK_RETURN_FALSE(this->splitReconJob(workOrder2DT, const_cast<hoNDArray<T>&>(data_dst), *(workOrder2DT->kernelIm_), splitByS, jobN, jobMegaBytes, overlapN, jobList));
+                    }
+                }
+
+                std::vector<gtPlusReconJob2DT<T> > completedJobList(jobList.size());
+
+                for ( j=0; j<jobList.size(); j++ )
+                {
+                    jobList[j].workOrder2DT.duplicate(completedJobList[j].workOrder2DT);
+                    completedJobList[j].job_index_startN_ = jobList[j].job_index_startN_;
+                    completedJobList[j].job_index_endN_ = jobList[j].job_index_endN_;
+                    completedJobList[j].job_index_S_ = jobList[j].job_index_S_;
+                }
+
+                GDEBUG_STREAM("SPIRIT - 2DT - total job : " << jobList.size() << " - job N : " << jobN << " - cloud size : " << cloudSize);
+
+                unsigned int numOfJobRunOnCloud = (unsigned int)(jobList.size() - jobList.size()/(cloudSize+1));
+                if ( !runJobsOnLocalNode ) numOfJobRunOnCloud = (unsigned int)jobList.size();
+                GDEBUG_STREAM("SPIRIT - 2DT - numOfJobRunOnCloud : " << numOfJobRunOnCloud << " ... ");
+
+                typedef Gadgetron::GadgetCloudController< gtPlusReconJob2DT<T> > GTCloudControllerType;
+                GTCloudControllerType controller;
+
+                if (controller.open () == -1)
+                {
+                    GERROR_STREAM("Cloud controller cannot open the cloud ...");
+                    controller.handle_close (ACE_INVALID_HANDLE, 0);
+                    runJobsOnCloud = false;
+                }
+                else
+                {
+                    std::vector<gtPlusReconJob2DT<T>* > jobListCloud(numOfJobRunOnCloud);
+                    std::vector<gtPlusReconJob2DT<T>* > completedJobListCloud(numOfJobRunOnCloud);
+                    std::vector<int> node_ids(numOfJobRunOnCloud);
+
+                    GADGET_CHECK_RETURN_FALSE(this->scheduleJobForNodes(workOrder2DT, numOfJobRunOnCloud, node_ids));
+
+                    for ( j=0; j<numOfJobRunOnCloud; j++ )
+                    {
+                        // node_ids[j] = j%cloudSize;
+                        jobListCloud[j] = &jobList[j];
+                        completedJobListCloud[j] = &completedJobList[j];
+                        GDEBUG_STREAM("--> job " << j << " runs on node " << node_ids[j] << " ... ");
+                    }
+
+                    std::vector<GadgetMessageReader*> readers(cloudSize, NULL);
+                    std::vector<GadgetMessageWriter*> writers(cloudSize, NULL);
+
+                    for ( j=0; j<cloudSize; j++ )
+                    {
+                        readers[j] = new GtPlusCloudJobMessageReaderCPFL();
+                        writers[j] = new GtPlusCloudJobMessageWriterCPFL();
+                    }
+
+                    if ( controller.createConnector(workOrder2DT->gt_cloud_, GADGET_MESSAGE_CLOUD_JOB, readers, GADGET_MESSAGE_CLOUD_JOB, writers) != 0 )
+                    {
+                        GERROR_STREAM("Cloud controller creates connectors failed ...");
+                        controller.handle_close (ACE_INVALID_HANDLE, 0);
+                        runJobsOnCloud = false;
+                    }
+                    else if ( controller.connectToCloud(workOrder2DT->gt_cloud_) != 0 )
+                    {
+                        GERROR_STREAM("Cloud controller cannot connect to the cloud ...");
+                        controller.handle_close (ACE_INVALID_HANDLE, 0);
+                        runJobsOnCloud = false;
+                    }
+                    else
+                    {
+                        if ( controller.runJobsOnCloud(jobListCloud, completedJobListCloud, node_ids) != 0 )
+                        {
+                            GERROR_STREAM("Cloud controller runs jobs on the cloud failed ...");
+                            controller.closeCloudNode();
+                            controller.handle_close (ACE_INVALID_HANDLE, 0);
+                            runJobsOnCloud = false;
+                        }
+                        else
+                        {
+                            controller.closeCloudNode();
+
+                            // run the remaining jobs on the local computer
+                            for ( j=numOfJobRunOnCloud; j<jobList.size(); j++ )
+                            {
+                                GDEBUG_STREAM("SPIRIT - 2DT - job : " << j << " - size :" << jobList[j].job_index_endN_-jobList[j].job_index_startN_+1);
+
+                                if ( performTiming_ ) { gt_timer3_.start("SPIRIT 2DT ... "); }
+                                GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImpl(jobList[j]));
+                                if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                                std::ostringstream ostr;
+                                ostr << "job_fullkspace" << "_" << j;
+                                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(jobList[j].res, debugFolder_+ostr.str()); }
+                            }
+
+                            // wait for the cloud jobs to complete
+                            controller.waitForJobToComplete();
+
+                            // combine results from cloud and local run
+                            for ( j=0; j<numOfJobRunOnCloud; j++ )
+                            {
+                                jobList[j].res = controller.completed_job_list_[j]->res;
+                                jobList[j].complexIm = controller.completed_job_list_[j]->complexIm;
+                            }
+
+                            // if some jobs were not actually completed on the cloud, process them locally
+                            for ( j=0; j<numOfJobRunOnCloud; j++ )
+                            {
+                                if ( 
+                                    !jobList[j].res.dimensions_equal(&jobList[j].kspace) 
+                                        && 
+                                    ( jobList[j].complexIm.get_size(0)!= jobList[j].kspace.get_size(0) 
+                                    || jobList[j].complexIm.get_size(1)!= jobList[j].kspace.get_size(1) 
+                                    || jobList[j].complexIm.get_size(2)!= jobList[j].kspace.get_size(2) ) 
+                                   )
+                                {
+                                    GDEBUG_STREAM("SPIRIT - 2DT - uncompleted cloud job : " << j << " - size :" << jobList[j].job_index_endN_-jobList[j].job_index_startN_+1);
+
+                                    if ( performTiming_ ) { gt_timer3_.start("SPIRIT 2DT ... "); }
+                                    GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImpl(jobList[j]));
+                                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                                    std::ostringstream ostr;
+                                    ostr << "job_fullkspace" << "_" << j;
+                                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(jobList[j].res, debugFolder_+ostr.str()); }
+                                }
+                            }
+
+                            // combine the jobs
+                            GADGET_CHECK_RETURN_FALSE(this->combineReconJob(workOrder2DT, jobList, N, S));
+
+                            // clear the memory
+                            jobList.clear();
+                        }
+                    }
+                }
+            }
+
+            if ( !runJobsOnCloud )
+            {
+                GADGET_CHECK_RETURN_FALSE(this->splitReconJob(workOrder2DT, const_cast<hoNDArray<T>&>(data_dst), *(workOrder2DT->kernelIm_), splitByS, jobN, jobMegaBytes, overlapN, jobList));
+
+                GDEBUG_STREAM("SPIRIT - 2DT - total job : " << jobList.size());
+
+                size_t j;
+                for ( j=0; j<jobList.size(); j++ )
+                {
+                    GDEBUG_STREAM("SPIRIT - 2DT - job : " << j << " - size :" << jobList[j].job_index_endN_-jobList[j].job_index_startN_+1);
+
+                    if ( performTiming_ ) { gt_timer3_.start("L1 SPIRIT NCG 2DT ... "); }
+                    GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImpl(jobList[j]));
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(jobList[j].res, debugFolder_+"job_fullkspace"); }
+                }
+
+                // combine the jobs
+                GADGET_CHECK_RETURN_FALSE(this->combineReconJob(workOrder2DT, jobList, N, S));
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder2DT->fullkspace_, debugFolder_+"fullkspace"); }
+
+                // clear the memory
+                jobList.clear();
+            }
+        }
+        else
+        {
+            workOrder2DT->complexIm_.create(RO, E1, N, S);
+
+            // downstream coil compression is not supported here
+            // kspace is always reconstructed
+            bool recon_kspace = true;
+
+            workOrder2DT->fullkspace_ = data_dst;
+
+            for ( usedS=0; usedS<S; usedS++ )
+            {
+                hoNDArray<T> kIm(RO, E1, srcCHA, dstCHA, refN, workOrder2DT->kernelIm_->begin()+usedS*RO*E1*srcCHA*dstCHA*refN);
+
+                hoNDArray<T> aliasedKSpace(RO, E1, srcCHA, N, const_cast<T*>(data_dst.begin())+usedS*RO*E1*srcCHA*N);
+
+                hoNDArray<T> unwarppedKSpace(RO, E1, dstCHA, N, workOrder2DT->fullkspace_.begin()+usedS*RO*E1*dstCHA*N);
+
+                GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImpl(workOrder2DT, aliasedKSpace, kIm, unwarppedKSpace, usedS));
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(unwarppedKSpace, debugFolder_+"unwarppedKSpace"); }
+            }
+        }
+
+        hoNDArray<T> complexImMultiChannel(RO, E1, dstCHA, N);
+
+        // perform coil combination
+        for ( usedS=0; usedS<S; usedS++ )
+        {
+            hoNDArray<T> unwarppedKSpace(RO, E1, dstCHA, N, workOrder2DT->fullkspace_.begin()+usedS*RO*E1*dstCHA*N);
+
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(unwarppedKSpace, complexImMultiChannel);
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(complexImMultiChannel, debugFolder_+"unwarppedComplexIm"); }
+
+            hoNDArray<T> combined(RO, E1, N, workOrder2DT->complexIm_.begin()+usedS*RO*E1*N);
+
+            if ( refN == N )
+            {
+                hoNDArray<T> coilMap(RO, E1, dstCHA, refN, workOrder2DT->coilMap_->begin()+usedS*RO*E1*dstCHA*refN);
+                gtPlusISMRMRDReconUtilComplex<T>().coilCombine(complexImMultiChannel, coilMap, combined);
+            }
+            else
+            {
+                hoNDArray<T> coilMap(RO, E1, dstCHA, workOrder2DT->coilMap_->begin()+usedS*RO*E1*dstCHA*refN);
+                gtPlusISMRMRDReconUtilComplex<T>().coilCombine(complexImMultiChannel, coilMap, combined);
+            }
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(combined, debugFolder_+"combined"); }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DTSPIRIT<T>::performUnwrapping(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& data) ... ");
+        return false;
+    }
+
+    return true;
+}
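+
+// Illustrative sketch only, not part of gtPlus: the coil combination above is done by
+// gtPlusISMRMRDReconUtilComplex<T>::coilCombine on whole arrays; per pixel, a common matched-filter
+// combination with coil maps is
+//     combined = sum_c conj(coilMap_c) * im_c / sum_c |coilMap_c|^2 ,
+// which this helper spells out (the exact normalization inside coilCombine may differ). The helper
+// name is made up; <complex> and <vector> are assumed to be available through the existing includes.
+template <typename real_type>
+std::complex<real_type> coil_combine_pixel_sketch(const std::vector< std::complex<real_type> >& im,
+                                                  const std::vector< std::complex<real_type> >& coilMap)
+{
+    std::complex<real_type> num(0, 0);
+    real_type den = 0;
+    for (size_t c = 0; c < im.size(); c++)
+    {
+        num += std::conj(coilMap[c]) * im[c];   // matched filter: conjugate of the coil sensitivity
+        den += std::norm(coilMap[c]);           // |coilMap_c|^2
+    }
+    return (den > 0) ? (num / den) : std::complex<real_type>(0, 0);
+}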
+
+template <typename T> 
+bool gtPlusReconWorker2DTSPIRIT<T>::
+performUnwarppingImpl(gtPlusReconJob2DT<T>& job)
+{
+    try
+    {
+        hoNDArray<T>& kspace = job.kspace;
+        hoNDArray<T>& ker = job.ker;
+        hoNDArray<T>& res = job.res;
+        gtPlusReconWorkOrder<T>* workOrder2DT = &(job.workOrder2DT);
+
+        GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImpl(workOrder2DT, kspace, ker, res, job.job_index_S_));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker2DTSPIRIT<T>::performUnwarppingImpl(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DT.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DT.h
new file mode 100644
index 0000000..0b07803
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DT.h
@@ -0,0 +1,2716 @@
+/** \file   gtPlusISMRMRDReconWorker3DT.h
+    \brief  Define the base class for the GtPlus worker for 3D or 3DT reconstruction cases
+
+            Four different strategies are implemented for partial Fourier or asymmetric echo acquisitions:
+
+            ISMRMRD_PF_ZEROFILLING          : only zero-fill the unacquired k-space
+
+            ISMRMRD_PF_ZEROFILLING_FILTER   : zero-fill the unacquired k-space and apply a transition filter on the edges between
+                                              acquired and unacquired regions
+
+            ISMRMRD_PF_POCS                 : perform the iterative POCS reconstruction
+                                              Magnetic Resonance Imaging: Physical Principles and Sequence Design. Page 296-297.
+                                              E. Mark Haacke, Robert W. Brown, Michael R. Thompson, Ramesh Venkatesan. 
+                                              Wiley-Liss, ISBN-10: 0471351288.
+
+            ISMRMRD_PF_FENGHUANG            : perform a k-space convolution-based partial Fourier reconstruction.
+
+                                              Feng Huang, Wei Lin, and Yu Li. 
+                                              Partial Fourier Reconstruction Through Data Fitting and Convolution in k-Space.
+                                              Magnetic Resonance in Medicine, Vol 62, pages 1261-1269, 2009.
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusISMRMRDReconWorker.h"
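+
+// ------------------------------------------------------------------------------------------------
+// Illustrative sketch only, not part of the upstream gtPlus implementation: a minimal 1D POCS
+// partial Fourier loop for the ISMRMRD_PF_POCS strategy listed above. It assumes k-space is
+// acquired for indices [0, numAcquired), that the unacquired tail is zero-filled, and that a
+// low-resolution phase estimate (one phase value per image sample) is available. The real worker
+// operates on multi-dimensional hoNDArray data with fast FFTs; the naive O(N^2) DFT here only
+// keeps the example self-contained. All names in this block are made up for illustration.
+// ------------------------------------------------------------------------------------------------
+#include <complex>
+#include <vector>
+#include <cmath>
+
+namespace gtplus_pocs_sketch {
+
+typedef std::complex<double> cplx;
+
+// naive forward/inverse DFT, dependency-free and good enough for a small illustration
+inline std::vector<cplx> dft(const std::vector<cplx>& x, bool inverse)
+{
+    const size_t N = x.size();
+    const double PI = 3.14159265358979323846;
+    const double sign = inverse ? 1.0 : -1.0;
+    std::vector<cplx> y(N, cplx(0, 0));
+    for (size_t k = 0; k < N; k++)
+    {
+        for (size_t n = 0; n < N; n++)
+        {
+            const double ang = sign * 2.0 * PI * double(k) * double(n) / double(N);
+            y[k] += x[n] * cplx(std::cos(ang), std::sin(ang));
+        }
+        if (inverse) y[k] /= double(N);
+    }
+    return y;
+}
+
+// POCS iteration: alternate between enforcing the low-resolution phase in image space and
+// restoring the acquired samples in k-space
+inline std::vector<cplx> pocsPartialFourier1D(const std::vector<cplx>& acquiredKSpace,
+                                              size_t numAcquired, size_t numIter,
+                                              const std::vector<double>& phaseEstimate)
+{
+    std::vector<cplx> kspace = acquiredKSpace; // entries >= numAcquired are assumed zero-filled
+    for (size_t iter = 0; iter < numIter; iter++)
+    {
+        std::vector<cplx> im = dft(kspace, true);
+        for (size_t n = 0; n < im.size(); n++)
+        {
+            // keep the magnitude, impose the low-resolution phase estimate
+            im[n] = std::abs(im[n]) * cplx(std::cos(phaseEstimate[n]), std::sin(phaseEstimate[n]));
+        }
+        kspace = dft(im, false);
+        for (size_t k = 0; k < numAcquired; k++) kspace[k] = acquiredKSpace[k]; // data consistency
+    }
+    return kspace;
+}
+
+} // namespace gtplus_pocs_sketch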
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusReconWorker3DT : public gtPlusReconWorker<T>
+{
+public:
+
+    typedef gtPlusReconWorker<T> BaseClass;
+    typedef gtPlusReconWorkOrder3DT<T> WorkOrderType;
+    typedef typename BaseClass::value_type value_type;
+
+    gtPlusReconWorker3DT() : BaseClass(), startE1_(0), endE1_(0), startE2_(0), endE2_(0) {}
+    virtual ~gtPlusReconWorker3DT() {}
+
+    virtual bool performRecon(gtPlusReconWorkOrder<T>* workOrder)
+    {
+        // check whether we have all-zeros input
+        value_type v(1);
+        Gadgetron::norm2(workOrder->data_, v);
+        if ( v <= 0 )
+        {
+            GWARN_STREAM("gtPlusReconWorker3DT, performRecon(workOrder) : incoming data contains all-zeros ... ");
+
+            boost::shared_ptr< std::vector<size_t> > dims = workOrder->data_.get_dimensions();
+            (*dims)[3] = workOrder->num_channels_res_;
+            workOrder->complexIm_.create(dims);
+            Gadgetron::clear(workOrder->complexIm_);
+
+            return true;
+        }
+
+        gtPlusReconWorkOrder3DT<T>* workOrder3DT = dynamic_cast<gtPlusReconWorkOrder3DT<T>*>(workOrder);
+        if ( workOrder3DT == NULL ) return false;
+
+        if ( workOrder3DT->recon_auto_parameters_ )
+        {
+            this->autoReconParameter(workOrder3DT);
+            GDEBUG_STREAM("Gt Plus 3DT -- automatic parameter selection ---");
+            if ( !this->debugFolder_.empty() ) { workOrder3DT->print(std::cout); }
+        }
+
+        return this->performRecon(workOrder3DT);
+    }
+
+    // the common functionalities are performed here for 3DT recon
+    // compute the coil compression coefficients
+    // prepare the ref data array
+    virtual bool performRecon(gtPlusReconWorkOrder3DT<T>* workOrder3DT);
+
+    virtual bool estimateCoilMap(WorkOrderType* workOrder3DT, const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, const hoNDArray<T>& ref_coil_map_dst);
+    virtual bool performCalib(WorkOrderType* workOrder3DT, const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, const hoNDArray<T>& ref_coil_map_dst);
+    virtual bool performCalibPrep(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, WorkOrderType* workOrder3DT);
+    virtual bool performCalibImpl(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, WorkOrderType* workOrder3DT, size_t usedN);
+
+    virtual bool performUnwrapping(WorkOrderType* workOrder3DT, const hoNDArray<T>& data);
+
+    // the partial Fourier handling for the 3DT reconstruction
+    // the computation is performed on the reconstructed full kspace
+    virtual bool performPartialFourierHandling(WorkOrderType* workOrder3DT);
+
+    // perform the kspace filter on ref data for coil map estimation
+    virtual bool performRefFilter(gtPlusReconWorkOrder3DT<T>* workOrder3DT, 
+                                        const hoNDArray<T>& ref, hoNDArray<T>& refFiltered, 
+                                        int startRO, int endRO, int startE1, int endE1, int startE2, int endE2);
+
+    // for interleave, compute mean ref
+    // for embedded and separate, squeeze out the zero lines
+    virtual bool prepRef(WorkOrderType* workOrder3DT, 
+                        const hoNDArray<T>& ref, 
+                        hoNDArray<T>& refRecon, 
+                        hoNDArray<T>& refCoilMap, 
+                        int startRO, int endRO, 
+                        int startE1, int endE1, 
+                        int startE2, int endE2, 
+                        size_t dataE1, 
+                        size_t dataE2);
+
+    // implement reference data preparation
+    virtual bool prepRefByAveragingCrossN(WorkOrderType* workOrder3DT, const hoNDArray<T>& ref, bool averageAllRef, int numOfModes, hoNDArray<T>& refRecon);
+
+    // compute coil compression coefficients
+    virtual bool coilCompression(WorkOrderType* workOrder3DT);
+
+    // after unwrapping, for embedded and separate, the full res coil map may be estimated
+    // for embedded, the ref may be filled back to fullkspace
+    virtual bool afterUnwrapping(WorkOrderType* workOrder3DT);
+
+    // whether to recon kspace; if true, the coil combination may not be performed and only the fullkspace is computed
+    virtual bool computeKSpace(gtPlusReconWorkOrder3DT<T>* workOrder3DT) = 0;
+
+    // ----------------------------------------------------
+    // common functions for 3DT reconstruction
+    // ----------------------------------------------------
+    // image domain kernel with coil sensitivity
+    // kerIm: [RO E1 E2 srcCHA dstCHA]
+    // coilMap: [RO E1 E2 dstCHA]
+    // unmixCoeff: [RO E1 E2 srcCHA]
+    // gFactor: [RO E1 E2]
+    bool unmixCoeff(const hoNDArray<T>& kerIm, const hoNDArray<T>& coilMap, hoNDArray<T>& unmixCoeff, hoNDArray<T>& gFactor);
+
+    // apply image domain kernel
+    // kspace: [RO E1 E2 srcCHA ...]
+    // complexIm : [RO E1 E2 dstCHA ...]
+    bool applyImageDomainKernel(const hoNDArray<T>& kspace, const hoNDArray<T>& kerIm, hoNDArray<T>& complexIm);
+    // aliasedIm : [RO E1 E2 srcCHA ...]
+    bool applyImageDomainKernelImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& kerIm, hoNDArray<T>& complexIm);
+    // for speed, a buffer can be provided
+    bool applyImageDomainKernelImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& kerIm, hoNDArray<T>& kerImBuffer, hoNDArray<T>& complexIm);
+
+    // apply unmixCoeff
+    // kspace: [RO E1 E2 srcCHA ...]
+    // unmixCoeff : [RO E1 E2 srcCHA]
+    // complexIm : [RO E1 E2 ...]
+    bool applyUnmixCoeff(const hoNDArray<T>& kspace, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm);
+    // aliasedIm : [RO E1 E2 srcCHA ...]
+    bool applyUnmixCoeffImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm);
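+
+    // Illustrative sketch only, not part of the upstream interface (the function name is made up):
+    // applying the unmixing coefficients is, per voxel, a dot product across the source channels,
+    //     complexIm(ro,e1,e2) = sum_cha unmixCoeff(ro,e1,e2,cha) * aliasedIm(ro,e1,e2,cha),
+    // assuming the [RO E1 E2 srcCHA] layout documented above, i.e. each channel occupies one
+    // contiguous block of numVoxels = RO*E1*E2 elements.
+    static void applyUnmixCoeffSketch(const T* aliasedIm, const T* unmixCoeff,
+                                      size_t numVoxels, size_t srcCHA, T* combined)
+    {
+        for (size_t v = 0; v < numVoxels; v++)
+        {
+            T acc(0);
+            for (size_t c = 0; c < srcCHA; c++)
+            {
+                acc += unmixCoeff[c*numVoxels + v] * aliasedIm[c*numVoxels + v];
+            }
+            combined[v] = acc;
+        }
+    }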
+
+    // ----------------------------------------------------
+    // Partial Fourier handling for 3DT reconstruction
+    // ----------------------------------------------------
+    // apply the partial Fourier filter along the edges
+    bool performPartialFourierFilter(WorkOrderType& workOrder3DT, hoNDArray<T>& kspace);
+    // apply the iterative POCS for partial Fourier reconstruction
+    bool performPartialFourierPOCSRecon(WorkOrderType& workOrder3DT, hoNDArray<T>& kspace);
+    // apply the Feng Huang partial Fourier reconstruction
+    bool performPartialFourierFengHuangRecon(WorkOrderType& workOrder3DT, hoNDArray<T>& kspace);
+
+    //// compute Feng Huang kernel and perform recon
+    bool calibFengHuang(WorkOrderType& workOrder3DT, const hoNDArray<T>& src, const hoNDArray<T>& dst, ho6DArray<T>& kernel);
+    bool performReconFangHuang(WorkOrderType& workOrder3DT, const hoNDArray<T>& kspaceConj, hoNDArray<T>& kspace, int startRO, int endRO, int startE1, int endE1, int startE2, int endE2, ho6DArray<T>& kernel);
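+
+    // Illustrative sketch only, not part of the upstream interface (the function name is made up):
+    // the ISMRMRD_PF_ZEROFILLING_FILTER strategy tapers the edge between acquired and unacquired
+    // k-space instead of cutting it hard. A 1D raised-cosine transition ending at the last acquired
+    // sample could look like this; the real filter is built and applied along RO/E1/E2 by the
+    // gtPlus utility functions. Assumes <vector> and <cmath> via the existing includes.
+    static void partialFourierTransitionFilterSketch(std::vector<float>& filter1D,
+                                                     size_t lastAcquired, size_t transitionWidth)
+    {
+        for (size_t k = 0; k < filter1D.size(); k++)
+        {
+            if (k + transitionWidth <= lastAcquired)
+            {
+                filter1D[k] = 1.0f;     // well inside the acquired region
+            }
+            else if (k > lastAcquired)
+            {
+                filter1D[k] = 0.0f;     // unacquired region stays zero
+            }
+            else
+            {
+                // ramps from ~1 at the start of the transition down to 0 at the last acquired sample
+                const double x = double(lastAcquired - k) / double(transitionWidth);
+                filter1D[k] = float(0.5 * (1.0 - std::cos(3.14159265358979 * x)));
+            }
+        }
+    }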
+
+    // estimate job size for 3DT recon
+    virtual bool estimateJobSize(gtPlusReconWorkOrder<T>* workOrder3DT, size_t maxNumOfBytesPerJob, size_t overlapBetweenJobs, size_t numOfNodes, size_t& jobSize);
+
+    using BaseClass::partial_fourier_handling_;
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::verbose_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+
+protected:
+
+    // helper memory for computation
+    hoNDArray<T> ref_src_;
+    hoNDArray<T> ref_dst_;
+    hoNDArray<T> data_dst_;
+    hoNDArray<T> ref_coil_map_dst_;
+
+    // sampled region along E1/E2
+    size_t startE1_;
+    size_t endE1_;
+
+    size_t startE2_;
+    size_t endE2_;
+};
+
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::performRecon(WorkOrderType* workOrder3DT)
+{
+    // the 3DT recon on 5D array [RO E1 E2 CHA N]
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(workOrder3DT!=NULL);
+
+        if ( !workOrder3DT->workFlow_use_BufferedKernel_ )
+        {
+            if ( performTiming_ ) { gt_timer1_.start("prepRef"); }
+            GADGET_CHECK_RETURN_FALSE(prepRef(workOrder3DT, workOrder3DT->ref_, 
+                                            workOrder3DT->ref_recon_, 
+                                            workOrder3DT->ref_coil_map_, 
+                                            workOrder3DT->start_RO_, workOrder3DT->end_RO_, 
+                                            workOrder3DT->start_E1_, workOrder3DT->end_E1_, 
+                                            workOrder3DT->start_E2_, workOrder3DT->end_E2_, 
+                                            workOrder3DT->data_.get_size(1), workOrder3DT->data_.get_size(2)));
+            if ( performTiming_ ) { gt_timer1_.stop(); }
+
+            if ( performTiming_ ) { gt_timer1_.start("coilCompression"); }
+            GADGET_CHECK_RETURN_FALSE(coilCompression(workOrder3DT));
+            if ( performTiming_ ) { gt_timer1_.stop(); }
+        }
+
+        // apply coil compression coefficients
+        if ( workOrder3DT->workFlow_use_BufferedKernel_ )
+        {
+            if ( workOrder3DT->coil_compression_ 
+                && workOrder3DT->recon_algorithm_!=ISMRMRD_SPIRIT 
+                && workOrder3DT->recon_algorithm_!=ISMRMRD_L1SPIRIT 
+                && workOrder3DT->recon_algorithm_!=ISMRMRD_L1SPIRIT_SLEP 
+                && workOrder3DT->recon_algorithm_!=ISMRMRD_L1SPIRIT_SLEP_MOTION_COMP )
+            {
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder3DT->data_, debugFolder_+"data_"); }
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder3DT->data_, *workOrder3DT->coilCompressionCoef_, data_dst_, true));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(data_dst_, debugFolder_+"data_dst_"); }
+            }
+            else
+            {
+                data_dst_ = workOrder3DT->data_;
+            }
+        }
+        else
+        {
+            if ( workOrder3DT->coil_compression_ 
+                && workOrder3DT->recon_algorithm_!=ISMRMRD_SPIRIT 
+                && workOrder3DT->recon_algorithm_!=ISMRMRD_L1SPIRIT 
+                && workOrder3DT->recon_algorithm_!=ISMRMRD_L1SPIRIT_SLEP 
+                && workOrder3DT->recon_algorithm_!=ISMRMRD_L1SPIRIT_SLEP_MOTION_COMP 
+                && (workOrder3DT->acceFactorE1_>1 || workOrder3DT->acceFactorE2_>1) )
+            {
+                ref_src_ = workOrder3DT->ref_recon_;
+
+                if ( performTiming_ ) { gt_timer2_.start("apply coil compression ... "); }
+
+                #pragma omp parallel sections default(shared)
+                {
+                    #pragma omp section
+                    {
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(ref_src_, debugFolder_+"ref_src_"); }
+                        //GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(ref_src_, *workOrder3DT->coilCompressionCoef_, ref_dst_, true));
+                        gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(ref_src_, *workOrder3DT->coilCompressionCoef_, ref_dst_, true);
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(ref_dst_, debugFolder_+"ref_dst_"); }
+                    }
+
+                    #pragma omp section
+                    {
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder3DT->data_, debugFolder_+"data_"); }
+                        //GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder3DT->data_, *workOrder3DT->coilCompressionCoef_, data_dst_, true));
+                        gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder3DT->data_, *workOrder3DT->coilCompressionCoef_, data_dst_, true);
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(data_dst_, debugFolder_+"data_dst_"); }
+                    }
+
+                    #pragma omp section
+                    {
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder3DT->ref_coil_map_, debugFolder_+"ref_coil_map_"); }
+                        //GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder3DT->ref_coil_map_, *workOrder3DT->coilCompressionCoef_, ref_coil_map_dst_, true));
+                        gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder3DT->ref_coil_map_, *workOrder3DT->coilCompressionCoef_, ref_coil_map_dst_, true);
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(ref_coil_map_dst_, debugFolder_+"ref_coil_map_dst_"); }
+                    }
+                }
+
+                if ( performTiming_ ) { gt_timer2_.stop(); }
+
+                if ( !workOrder3DT->downstream_coil_compression_ 
+                    || workOrder3DT->recon_algorithm_==ISMRMRD_SPIRIT 
+                    || workOrder3DT->recon_algorithm_==ISMRMRD_L1SPIRIT 
+                    || workOrder3DT->recon_algorithm_==ISMRMRD_L1SPIRIT_SLEP 
+                    || workOrder3DT->recon_algorithm_==ISMRMRD_L1SPIRIT_SLEP_MOTION_COMP )
+                {
+                    ref_src_ = ref_dst_;
+                }
+            }
+            else
+            {
+                ref_src_ = workOrder3DT->ref_recon_;
+                ref_dst_ = workOrder3DT->ref_recon_;
+                data_dst_ = workOrder3DT->data_;
+                ref_coil_map_dst_ = workOrder3DT->ref_coil_map_;
+            }
+
+            if ( performTiming_ ) { gt_timer1_.start("estimate coil map"); }
+            GADGET_CHECK_RETURN_FALSE(this->estimateCoilMap(workOrder3DT, ref_src_, ref_dst_, ref_coil_map_dst_));
+            if ( performTiming_ ) { gt_timer1_.stop(); }
+
+            if ( workOrder3DT->acceFactorE1_>1 || workOrder3DT->acceFactorE2_>1 )
+            {
+                if ( performTiming_ ) { gt_timer1_.start("performCalib"); }
+                GADGET_CHECK_RETURN_FALSE(this->performCalib(workOrder3DT, ref_src_, ref_dst_, ref_coil_map_dst_));
+                if ( performTiming_ ) { gt_timer1_.stop(); }
+            }
+        }
+
+        if ( performTiming_ ) { gt_timer1_.start("performUnwrapping"); }
+        GADGET_CHECK_RETURN_FALSE(this->performUnwrapping(workOrder3DT, data_dst_));
+        if ( performTiming_ ) { gt_timer1_.stop(); }
+
+        if ( performTiming_ ) { gt_timer1_.start("afterUnwrapping"); }
+        GADGET_CHECK_RETURN_FALSE(this->afterUnwrapping(workOrder3DT));
+        if ( performTiming_ ) { gt_timer1_.stop(); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DT<T>::performRecon(WorkOrderType* workOrder3DT) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::
+estimateCoilMap(gtPlusReconWorkOrder3DT<T>* workOrder3DT, const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, const hoNDArray<T>& ref_coil_map_dst)
+{
+    try
+    {
+        size_t RO = workOrder3DT->data_.get_size(0);
+        size_t E1 = workOrder3DT->data_.get_size(1);
+        size_t E2 = workOrder3DT->data_.get_size(2);
+        size_t N = workOrder3DT->data_.get_size(4);
+        size_t srcCHA = workOrder3DT->data_.get_size(3);
+
+        size_t refRO = ref_dst.get_size(0);
+        size_t refE1 = ref_dst.get_size(1);
+        size_t refE2 = ref_dst.get_size(2);
+        size_t refN = ref_dst.get_size(4);
+        size_t dstCHA = ref_coil_map_dst.get_size(3);
+
+        bool same_combinationcoeff_allN = false;
+        size_t whichN_combinationcoeff = 0;
+        if ( workOrder3DT->CalibMode_ == ISMRMRD_interleaved )
+        {
+            same_combinationcoeff_allN = true;
+            whichN_combinationcoeff = 0;
+        }
+
+        if ( workOrder3DT->CalibMode_ == ISMRMRD_embedded )
+        {
+            same_combinationcoeff_allN = workOrder3DT->embedded_same_combinationcoeff_allN_;
+            whichN_combinationcoeff = workOrder3DT->embedded_whichN_combinationcoeff_;
+        }
+
+        if ( workOrder3DT->CalibMode_ == ISMRMRD_separate )
+        {
+            same_combinationcoeff_allN = workOrder3DT->separate_same_combinationcoeff_allN_;
+            whichN_combinationcoeff = workOrder3DT->separate_whichN_combinationcoeff_;
+        }
+
+        if ( whichN_combinationcoeff >= refN ) whichN_combinationcoeff=refN-1;
+
+        bool reconKSpace = this->computeKSpace(workOrder3DT);
+
+        // if the coil map has not been preset
+        if ( !reconKSpace )
+        {
+            if ( (workOrder3DT->coilMap_->get_size(0)!=RO) 
+                || (workOrder3DT->coilMap_->get_size(1)!=E1)
+                || (workOrder3DT->coilMap_->get_size(2)!=E2) )
+            {
+                if ( same_combinationcoeff_allN )
+                {
+                    size_t usedN = whichN_combinationcoeff;
+
+                    hoNDArray<T> refCoilMapN(RO, E1, E2, dstCHA, const_cast<T*>(ref_coil_map_dst.begin()+usedN*RO*E1*E2*dstCHA));
+
+                    workOrder3DT->coilMap_->create(RO, E1, E2, dstCHA, refN);
+                    //Gadgetron::clear(workOrder3DT->coilMap_.get());
+
+                    // hoNDArray<T> coilMapN(RO, E1, E2, dstCHA, workOrder3DT->coilMap_->begin()+usedN*RO*E1*E2*dstCHA);
+                    // hoNDArray<T> coilMapN(RO, E1, E2, dstCHA);
+                    hoNDArray<T> coilMapN(RO, E1, E2, dstCHA);
+
+                    hoNDArray<T> buffer3DT(RO, E1, E2, dstCHA);
+
+                    Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(refCoilMapN, buffer3DT);
+
+                    if ( performTiming_ ) { gt_timer3_.start("coil map estimation ... "); }
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIH(buffer3DT, 
+                            coilMapN, workOrder3DT->coil_map_algorithm_, workOrder3DT->csm_kSize_, 
+                            workOrder3DT->csm_powermethod_num_, workOrder3DT->csm_iter_num_, (value_type)workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    memcpy(workOrder3DT->coilMap_->begin()+usedN*RO*E1*E2*dstCHA, coilMapN.begin(), coilMapN.get_number_of_bytes());
+                    GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder3DT->coilMap_, usedN));
+                }
+                else
+                {
+                    hoNDArray<T> buffer3DT(ref_coil_map_dst.get_dimensions());
+
+                    Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(ref_coil_map_dst, buffer3DT);
+
+                    if ( performTiming_ ) { gt_timer3_.start("coil map estimation ... "); }
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIH(buffer3DT, 
+                            *workOrder3DT->coilMap_, workOrder3DT->coil_map_algorithm_, 
+                            workOrder3DT->csm_kSize_, workOrder3DT->csm_powermethod_num_, 
+                            workOrder3DT->csm_iter_num_, (value_type)workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+                }
+            }
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*workOrder3DT->coilMap_, debugFolder_+"coilMap_"); }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DT<T>::estimateCoilMap(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::
+performCalib(gtPlusReconWorkOrder3DT<T>* workOrder3DT, const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, const hoNDArray<T>& ref_coil_map_dst)
+{
+    try
+    {
+        size_t RO = workOrder3DT->data_.get_size(0);
+        size_t E1 = workOrder3DT->data_.get_size(1);
+        size_t E2 = workOrder3DT->data_.get_size(2);
+        size_t N = workOrder3DT->data_.get_size(4);
+        size_t srcCHA = workOrder3DT->data_.get_size(3);
+
+        size_t refRO = ref_dst.get_size(0);
+        size_t refE1 = ref_dst.get_size(1);
+        size_t refE2 = ref_dst.get_size(2);
+        size_t refN = ref_dst.get_size(4);
+        size_t dstCHA = ref_coil_map_dst.get_size(3);
+
+        bool same_combinationcoeff_allN = false;
+        size_t whichN_combinationcoeff = 0;
+        if ( workOrder3DT->CalibMode_ == ISMRMRD_interleaved )
+        {
+            same_combinationcoeff_allN = true;
+            whichN_combinationcoeff = 0;
+        }
+
+        if ( workOrder3DT->CalibMode_ == ISMRMRD_embedded )
+        {
+            same_combinationcoeff_allN = workOrder3DT->embedded_same_combinationcoeff_allN_;
+            whichN_combinationcoeff = workOrder3DT->embedded_whichN_combinationcoeff_;
+        }
+
+        if ( workOrder3DT->CalibMode_ == ISMRMRD_separate )
+        {
+            same_combinationcoeff_allN = workOrder3DT->separate_same_combinationcoeff_allN_;
+            whichN_combinationcoeff = workOrder3DT->separate_whichN_combinationcoeff_;
+        }
+
+        if ( whichN_combinationcoeff >= refN ) whichN_combinationcoeff=refN-1;
+
+        bool reconKSpace = this->computeKSpace(workOrder3DT);
+
+        // calibration
+        if ( (workOrder3DT->kernelIm_->get_size(3)!=srcCHA) || (workOrder3DT->kernelIm_->get_size(4)!=dstCHA) )
+        {
+           GADGET_CHECK_RETURN_FALSE(this->performCalibPrep(ref_src, ref_dst, workOrder3DT));
+
+            // perform calibration
+            if ( same_combinationcoeff_allN )
+            {
+                size_t usedN = whichN_combinationcoeff;
+
+                this->performCalibImpl(ref_src, ref_dst, workOrder3DT, usedN);
+
+                GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder3DT->kernel_, usedN));
+                GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder3DT->kernelIm_, usedN));
+
+                if ( !reconKSpace )
+                {
+                    if ( workOrder3DT->unmixingCoeffIm_ && (workOrder3DT->unmixingCoeffIm_->get_number_of_elements()>0) )
+                    {
+                        GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder3DT->unmixingCoeffIm_, usedN));
+                        GADGET_CHECK_RETURN_FALSE(repmatLastDimension(workOrder3DT->gfactor_, usedN));
+
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*workOrder3DT->unmixingCoeffIm_, debugFolder_+"unmixingCoeffIm_"); }
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder3DT->gfactor_, debugFolder_+"gfactor_"); }
+                    }
+                }
+            }
+            else
+            {
+                int usedN;
+                #ifdef USE_OMP
+                    omp_set_nested(1);
+                #endif // USE_OMP
+
+                #pragma omp parallel for default(none) private(usedN) shared(N, ref_src, ref_dst, workOrder3DT, reconKSpace)
+                for ( usedN=0; usedN<(int)N; usedN++ )
+                {
+                    this->performCalibImpl(ref_src, ref_dst, workOrder3DT, usedN);
+                }
+
+                #ifdef USE_OMP
+                    omp_set_nested(0);
+                #endif // USE_OMP
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DT<T>::performCalib(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::
+performCalibPrep(const hoNDArray<T>& , const hoNDArray<T>& , WorkOrderType* /*workOrder3DT*/)
+{
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::
+performCalibImpl(const hoNDArray<T>& , const hoNDArray<T>& , WorkOrderType* , size_t )
+{
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::performUnwrapping(WorkOrderType* , const hoNDArray<T>& )
+{
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::performRefFilter(WorkOrderType* workOrder3DT, 
+                                            const hoNDArray<T>& ref, 
+                                            hoNDArray<T>& refFiltered, 
+                                            int startRO, int endRO, 
+                                            int startE1, int endE1, 
+                                            int startE2, int endE2)
+{
+    try
+    {
+        refFiltered = ref;
+
+        size_t RO = ref.get_size(0);
+        size_t E1 = ref.get_size(1);
+        size_t E2 = ref.get_size(2);
+
+        if ( workOrder3DT->filterROE1E2_ref_.get_size(0)==RO 
+            && workOrder3DT->filterROE1E2_ref_.get_size(1)==E1 
+            && workOrder3DT->filterROE1E2_ref_.get_size(2)==E2 )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspace3DfilterROE1E2(ref, workOrder3DT->filterROE1E2_ref_, refFiltered));
+        }
+        else if ( (workOrder3DT->filterRO_ref_.get_number_of_elements()==RO) 
+            && (workOrder3DT->filterE1_ref_.get_number_of_elements()==E1) 
+            && (workOrder3DT->filterE2_ref_.get_number_of_elements()==E2) )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspace3DfilterROE1E2(ref, workOrder3DT->filterRO_ref_, workOrder3DT->filterE1_ref_, workOrder3DT->filterE2_ref_, refFiltered));
+        }
+        else
+        {
+            if ( (workOrder3DT->filterRO_ref_.get_number_of_elements()==RO) 
+                && (workOrder3DT->filterE1_ref_.get_number_of_elements()!=E1) 
+                && (workOrder3DT->filterE2_ref_.get_number_of_elements()!=E2) )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterRO(ref, workOrder3DT->filterRO_ref_, refFiltered));
+            }
+
+            if ( (workOrder3DT->filterRO_ref_.get_number_of_elements()!=RO) 
+                && (workOrder3DT->filterE1_ref_.get_number_of_elements()==E1) 
+                && (workOrder3DT->filterE2_ref_.get_number_of_elements()!=E2) )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterE1(ref, workOrder3DT->filterE1_ref_, refFiltered));
+            }
+
+            if ( (workOrder3DT->filterRO_ref_.get_number_of_elements()!=RO) 
+                && (workOrder3DT->filterE1_ref_.get_number_of_elements()!=E1) 
+                && (workOrder3DT->filterE2_ref_.get_number_of_elements()==E2) )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspace3DfilterE2(ref, workOrder3DT->filterE2_ref_, refFiltered));
+            }
+
+            if ( (workOrder3DT->filterRO_ref_.get_number_of_elements()==RO) 
+                && (workOrder3DT->filterE1_ref_.get_number_of_elements()==E1) 
+                && (workOrder3DT->filterE2_ref_.get_number_of_elements()!=E2) )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterROE1(ref, workOrder3DT->filterRO_ref_, workOrder3DT->filterE1_ref_, refFiltered));
+            }
+
+            if ( (workOrder3DT->filterRO_ref_.get_number_of_elements()==RO) 
+                && (workOrder3DT->filterE1_ref_.get_number_of_elements()!=E1) 
+                && (workOrder3DT->filterE2_ref_.get_number_of_elements()==E2) )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspace3DfilterROE2(ref, workOrder3DT->filterRO_ref_, workOrder3DT->filterE2_ref_, refFiltered));
+            }
+
+            if ( (workOrder3DT->filterRO_ref_.get_number_of_elements()!=RO) 
+                && (workOrder3DT->filterE1_ref_.get_number_of_elements()==E1) 
+                && (workOrder3DT->filterE2_ref_.get_number_of_elements()==E2) )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspace3DfilterE1E2(ref, workOrder3DT->filterE1_ref_, workOrder3DT->filterE2_ref_, refFiltered));
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DT<T>::performRefFilter(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::prepRefByAveragingCrossN(WorkOrderType* workOrder3DT, const hoNDArray<T>& ref, bool averageAllRef, int numOfModes, hoNDArray<T>& refRecon)
+{
+    try
+    {
+        size_t RO = ref.get_size(0);
+        size_t E1 = ref.get_size(1);
+        size_t E2 = ref.get_size(2);
+        size_t CHA = ref.get_size(3);
+        size_t N = ref.get_size(4);
+
+        if ( !averageAllRef && ( (numOfModes<1) || (numOfModes>N-1) ) )
+        {
+            refRecon = ref;
+        }
+        else if ( averageAllRef && ( (numOfModes<1) || (numOfModes>N-1) ) )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace5D(ref, refRecon));
+        }
+        else if ( averageAllRef && (numOfModes>=1) && (numOfModes<=N-1) )
+        {
+            hoNDArray<T> refKLF(RO, E1, E2, CHA, N);
+            Gadgetron::clear(refKLF);
+
+            hoMatrix<T> A(RO*E1*E2*CHA, N, const_cast<T*>(ref.begin()));
+            hoMatrix<T> A_KLF(RO*E1*E2*CHA, N, refKLF.begin());
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLFilter(A, numOfModes, A_KLF));
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(refKLF, debugFolder_+"refKLF"); }
+
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace5D(refKLF, refRecon));
+        }
+        else if ( !averageAllRef && (numOfModes>=1) && (numOfModes<=N-1) )
+        {
+            refRecon.create(RO, E1, E2, CHA, N);
+            Gadgetron::clear(refRecon);
+
+            hoMatrix<T> A(RO*E1*E2*CHA, N, const_cast<T*>(ref.begin()));
+            hoMatrix<T> A_KLF(RO*E1*E2*CHA, N, refRecon.begin());
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLFilter(A, numOfModes, A_KLF));
+        }
+        else
+        {
+            refRecon = ref;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DT<T>::prepRefByAveragingCrossN(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::prepRef(WorkOrderType* workOrder3DT, const hoNDArray<T>& ref, 
+                                hoNDArray<T>& refRecon, hoNDArray<T>& refCoilMap, 
+                                int startRO, int endRO, 
+                                int startE1, int endE1, 
+                                int startE2, int endE2, 
+                                size_t dataE1, size_t dataE2)
+{
+    try
+    {
+        size_t dataRO = workOrder3DT->data_.get_size(0);
+        size_t dataN = workOrder3DT->data_.get_size(4);
+
+        size_t RO = ref.get_size(0);
+        size_t E1 = ref.get_size(1);
+        size_t E2 = ref.get_size(2);
+        size_t srcCHA = ref.get_size(3);
+        size_t N = ref.get_size(4);
+
+        if ( workOrder3DT->acceFactorE1_ == 1 && workOrder3DT->acceFactorE2_ == 1 )
+        {
+            if ( workOrder3DT->no_acceleration_averageall_ref_ )
+            {
+                GADGET_CHECK_RETURN_FALSE(prepRefByAveragingCrossN(workOrder3DT, ref, workOrder3DT->no_acceleration_averageall_ref_, 0, refRecon));
+            }
+
+            GADGET_CHECK_RETURN_FALSE(performRefFilter(workOrder3DT, refRecon, refCoilMap, startRO, endRO, startE1, endE1, startE2, endE2));
+        }
+        else if ( workOrder3DT->CalibMode_ == ISMRMRD_interleaved )
+        {
+            GADGET_CHECK_RETURN_FALSE(prepRefByAveragingCrossN(workOrder3DT, ref, true, 0, refRecon));
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(refRecon, debugFolder_+"refRecon_interleaved"); }
+
+            GADGET_CHECK_RETURN_FALSE(performRefFilter(workOrder3DT, refRecon, refCoilMap, startRO, endRO, startE1, endE1, startE2, endE2));
+
+            if ( (startRO>=0 && endRO>0 && endRO>startRO) || (startE1>=0 && endE1>0 && endE1>startE1) || (startE2>=0 && endE2>0 && endE2>startE2) )
+            {
+                std::vector<size_t> crop_offset(5), crop_size(5);
+
+                crop_offset[0] = 0;
+                crop_offset[1] = 0;
+                crop_offset[2] = 0;
+                crop_offset[3] = 0;
+                crop_offset[4] = 0;
+
+                crop_size[0] = RO;
+                crop_size[1] = E1;
+                crop_size[2] = refRecon.get_size(2);
+                crop_size[3] = refRecon.get_size(3);
+                crop_size[4] = refRecon.get_size(4);
+
+                if (startRO>=0 && endRO>0 && endRO>startRO)
+                {
+                    crop_offset[0] = startRO;
+                    crop_size[0] = endRO-startRO+1;
+                }
+
+                if (startE1>=0 && endE1>0 && endE1>startE1)
+                {
+                    crop_offset[1] = startE1;
+                    crop_size[1] = endE1-startE1+1;
+                }
+
+                if (startE2>=0 && endE2>0 && endE2>startE2)
+                {
+                    crop_offset[2] = startE2;
+                    crop_size[2] = endE2-startE2+1;
+                }
+
+                hoNDArray<T> croppedRef;
+                GADGET_CHECK_RETURN_FALSE(cropUpTo11DArray(refRecon, croppedRef, crop_offset, crop_size));
+                refRecon = croppedRef;
+            }
+        }
+        else if ( workOrder3DT->CalibMode_ == ISMRMRD_embedded 
+                || workOrder3DT->CalibMode_ == ISMRMRD_separate 
+                || workOrder3DT->CalibMode_ == ISMRMRD_external )
+        {
+            if ( workOrder3DT->CalibMode_ == ISMRMRD_embedded )
+            {
+                refRecon = ref;
+            }
+
+            if ( workOrder3DT->CalibMode_ == ISMRMRD_separate )
+            {
+                GADGET_CHECK_RETURN_FALSE(prepRefByAveragingCrossN(workOrder3DT, ref, workOrder3DT->separate_averageall_ref_, 0, refRecon));
+            }
+
+            if ( performTiming_ ) { gt_timer2_.start("detectSampledRegionE1E2 ... "); }
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.detectSampledRegionE1E2(refRecon, startE1_, endE1_, startE2_, endE2_));
+            if ( performTiming_ ) { gt_timer2_.stop(); }
+
+            std::vector<size_t> crop_offset(5);
+            crop_offset[0] = 0;
+            crop_offset[1] = startE1_;
+            crop_offset[2] = startE2_;
+            crop_offset[3] = 0;
+            crop_offset[4] = 0;
+
+            std::vector<size_t> crop_size(5);
+            crop_size[0] = refRecon.get_size(0);
+            crop_size[1] = endE1_-startE1_+1;
+            crop_size[2] = endE2_-startE2_+1;
+            crop_size[3] = srcCHA;
+            crop_size[4] = refRecon.get_size(4);
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(refRecon, debugFolder_+"refRecon_beforeCrop"); }
+
+            if ( workOrder3DT->CalibMode_ == ISMRMRD_embedded )
+            {
+                if ( performTiming_ ) { gt_timer2_.start("crop sampled region ... "); }
+                hoNDArray<T> croppedRef;
+                GADGET_CHECK_RETURN_FALSE(cropUpTo11DArray(refRecon, croppedRef, crop_offset, crop_size));
+                if ( performTiming_ ) { gt_timer2_.stop(); }
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(croppedRef, debugFolder_+"refRecon_afterCrop"); }
+
+                if ( workOrder3DT->recon_algorithm_ == ISMRMRD_SPIRIT 
+                    || workOrder3DT->recon_algorithm_ == ISMRMRD_L1SPIRIT 
+                    || workOrder3DT->recon_algorithm_ == ISMRMRD_L1SPIRIT_SLEP 
+                    || workOrder3DT->recon_algorithm_ == ISMRMRD_L1SPIRIT_SLEP_MOTION_COMP )
+                {
+                    // copy the ref into the data
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.copyAlongROE1E2(refRecon, workOrder3DT->data_, 0, refRecon.get_size(0)-1, startE1_, endE1_, startE2_, endE2_));
+                }
+
+                GADGET_CHECK_RETURN_FALSE(prepRefByAveragingCrossN(workOrder3DT, croppedRef, workOrder3DT->embedded_averageall_ref_, 0, refRecon));
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(refRecon, debugFolder_+"refRecon_afterCrop_prepCrossN"); }
+
+                crop_size[4] = refRecon.get_size(4);
+
+                if ( performTiming_ ) { gt_timer2_.start("set up ref for coil map ... "); }
+                refCoilMap.create(RO, E1, E2, srcCHA, refRecon.get_size(4));
+                GADGET_CHECK_RETURN_FALSE(setSubArrayUpTo11DArray(refRecon, refCoilMap, crop_offset, crop_size));
+                if ( performTiming_ ) { gt_timer2_.stop(); }
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(refCoilMap, debugFolder_+"refCoilMap"); }
+
+                if ( performTiming_ ) { gt_timer2_.start("perform ref coil map filter ... "); }
+                // hoNDArray<T> refCoilMapTmp(refCoilMap);
+
+                // GADGET_CHECK_RETURN_FALSE(performRefFilter(workOrder3DT, refCoilMapTmp, refCoilMap, startRO, endRO, startE1, endE1, startE2, endE2));
+                GADGET_CHECK_RETURN_FALSE(performRefFilter(workOrder3DT, refCoilMap, refCoilMap, startRO, endRO, startE1, endE1, startE2, endE2));
+                if ( performTiming_ ) { gt_timer2_.stop(); }
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(refCoilMap, debugFolder_+"refCoilMap_filtered"); }
+
+                if ( refRecon.get_size(0) == RO )
+                {
+                    if ( startRO>=0 && endRO>0 && endRO>startRO && startRO<RO && endRO<RO )
+                    {
+                        crop_offset[0] = startRO;
+                        crop_size[0] = endRO-startRO+1;
+
+                        crop_offset[1] = 0;
+                        crop_size[1] = refRecon.get_size(1);
+
+                        crop_offset[2] = 0;
+                        crop_size[2] = refRecon.get_size(2);
+                    }
+                }
+
+                GADGET_CHECK_RETURN_FALSE(cropUpTo11DArray(refRecon, croppedRef, crop_offset, crop_size));
+                refRecon = croppedRef;
+            }
+            else
+            {
+                hoNDArray<T> croppedRef;
+                GADGET_CHECK_RETURN_FALSE(cropUpTo11DArray(refRecon, croppedRef, crop_offset, crop_size));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(croppedRef, debugFolder_+"croppedRef"); }
+
+                GADGET_CHECK_RETURN_FALSE(performRefFilter(workOrder3DT, croppedRef, refCoilMap, startRO, endRO, startE1, endE1, startE2, endE2));
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(refCoilMap, debugFolder_+"croppedRef_filtered"); }
+
+                refRecon = croppedRef;
+
+                // GADGET_CHECK_RETURN_FALSE(gtPlus_util_.zeropad3D(refCoilMap, dataRO, dataE1, dataE2, croppedRef));
+                // GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::zeropad3D(refCoilMap, dataRO, dataE1, dataE2, croppedRef));
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::pad(dataRO, dataE1, dataE2, &refCoilMap, &croppedRef));
+                refCoilMap = croppedRef;
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(refCoilMap, debugFolder_+"refCoilMap"); }
+
+                if ( refRecon.get_size(0) == RO )
+                {
+                    if ( startRO>=0 && endRO>0 && endRO>startRO && startRO<RO && endRO<RO )
+                    {
+                        crop_offset[0] = startRO;
+                        crop_size[0] = endRO-startRO+1;
+
+                        crop_offset[1] = 0;
+                        crop_size[1] = refRecon.get_size(1);
+
+                        crop_offset[2] = 0;
+                        crop_size[2] = refRecon.get_size(2);
+
+                        GADGET_CHECK_RETURN_FALSE(cropUpTo11DArray(refRecon, croppedRef, crop_offset, crop_size));
+                        refRecon = croppedRef;
+                    }
+                }
+            }
+
+            // if the ref N is smaller than the data N, e.g. in some cases with the separate mode
+            // make sure every data N has its ref data
+            if ( N < dataN )
+            {
+                hoNDArray<T> refReconDataN(refRecon.get_size(0), refRecon.get_size(1), refRecon.get_size(2), refRecon.get_size(3), dataN);
+                hoNDArray<T> refCoilMapDataN(refCoilMap.get_size(0), refCoilMap.get_size(1), refCoilMap.get_size(2), refCoilMap.get_size(3), dataN);
+
+                memcpy(refReconDataN.begin(), refRecon.begin(), refRecon.get_number_of_bytes());
+                memcpy(refCoilMapDataN.begin(), refCoilMap.begin(), refCoilMap.get_number_of_bytes());
+
+                size_t refReconN4D = refRecon.get_size(0)*refRecon.get_size(1)*refRecon.get_size(2)*refRecon.get_size(3);
+                size_t refCoilMapN4D = refCoilMap.get_size(0)*refCoilMap.get_size(1)*refCoilMap.get_size(2)*refCoilMap.get_size(3);
+
+                size_t n;
+                for ( n=N; n<dataN; n++ )
+                {
+                    memcpy(refReconDataN.begin()+n*refReconN4D, refRecon.begin()+(N-1)*refReconN4D, sizeof(T)*refReconN4D);
+                    memcpy(refCoilMapDataN.begin()+n*refCoilMapN4D, refCoilMap.begin()+(N-1)*refCoilMapN4D, sizeof(T)*refCoilMapN4D);
+                }
+
+                refRecon = refReconDataN;
+                refCoilMap = refCoilMapDataN;
+            }
+        }
+        else
+        {
+            GERROR_STREAM("CalibMode is not supported in gtPlusReconWorker3DT<T>::prepRef(...) : " << workOrder3DT->CalibMode_);
+            return false;
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(refRecon, debugFolder_+"refRecon"); }
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(refCoilMap, debugFolder_+"refCoilMap"); }
+
+        // if the upstream coil compression is needed
+        if ( workOrder3DT->upstream_coil_compression_ )
+        {
+            if ( !debugFolder_.empty() ) { GDEBUG_STREAM("Upstream coil compression ... "); }
+
+            if ( performTiming_ ) { gt_timer2_.start("average along N ... "); }
+            hoNDArray<T> aveAll;
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace5D(refRecon, aveAll));
+            aveAll.squeeze();
+            if ( performTiming_ ) { gt_timer2_.stop(); }
+
+            if ( performTiming_ ) { gt_timer2_.start("compute coil compression coefficients ... "); }
+            hoMatrix<T> coeff, eigenValues;
+            if ( workOrder3DT->upstream_coil_compression_num_modesKept_ > 0 )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(aveAll, 
+                            workOrder3DT->upstream_coil_compression_num_modesKept_, coeff, eigenValues, true));
+            }
+            else
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(aveAll, 
+                            workOrder3DT->upstream_coil_compression_thres_, coeff, eigenValues, true));
+            }
+            if ( performTiming_ ) { gt_timer2_.stop(); }
+
+            eigenValues.print(std::cout);
+            GDEBUG_STREAM("Upstream coil compression, number of channels kept is " << coeff.cols());
+
+            size_t n;
+            std::vector<hoMatrix<T> > upstreamCoilCoeffRef(workOrder3DT->ref_.get_size(4)), upstreamCoilCoeffRefRecon(refRecon.get_size(4)), upstreamCoilCoeffData(workOrder3DT->data_.get_size(4));
+            for ( n=0; n<upstreamCoilCoeffRef.size(); n++ )
+            {
+                upstreamCoilCoeffRef[n] = coeff;
+            }
+
+            for ( n=0; n<upstreamCoilCoeffRefRecon.size(); n++ )
+            {
+                upstreamCoilCoeffRefRecon[n] = coeff;
+            }
+
+            for ( n=0; n<upstreamCoilCoeffData.size(); n++ )
+            {
+                upstreamCoilCoeffData[n] = coeff;
+            }
+
+            if (coeff.cols()<srcCHA)
+            {
+                // apply the coil compression
+                #ifdef USE_OMP
+                    omp_set_nested(1);
+                #endif // USE_OMP
+
+                if ( performTiming_ ) { gt_timer2_.start("apply upstream coil compression ... "); }
+                #pragma omp parallel sections default(shared)
+                {
+
+                    #pragma omp section
+                    {
+                        //if ( performTiming_ ) { gt_timer2_.start("apply the coil compression on data ... "); }
+                        // GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder3DT->data_, upstreamCoilCoeffData, data_dst_, true));
+                        if ( performTiming_ ) { gt_timer3_.start("applyKLCoilCompressionCoeff ... "); }
+                        gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder3DT->data_, upstreamCoilCoeffData, data_dst_, true);
+                        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                        if ( performTiming_ ) { gt_timer3_.start("copy data ... "); }
+                        workOrder3DT->data_ = data_dst_;
+                        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                        //if ( performTiming_ ) { gt_timer2_.stop(); }
+                    }
+
+                    #pragma omp section
+                    {
+                        //if ( performTiming_ ) { gt_timer2_.start("apply the coil compression on ref ... "); }
+                        //GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder3DT->ref_, upstreamCoilCoeff, ref_dst_, true));
+                        gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder3DT->ref_, upstreamCoilCoeffRef, ref_dst_, true);
+                        workOrder3DT->ref_ = ref_dst_;
+                        //if ( performTiming_ ) { gt_timer2_.stop(); }
+                    }
+
+                    #pragma omp section
+                    {
+                        //if ( performTiming_ ) { gt_timer2_.start("apply the coil compression on refRecon ... "); }
+                        hoNDArray<T> refRecon_upstream;
+                        //GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(refRecon, upstreamCoilCoeff, refRecon_upstream, true));
+                        gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(refRecon, upstreamCoilCoeffRefRecon, refRecon_upstream, true);
+                        refRecon = refRecon_upstream;
+                        refRecon_upstream.clear();
+                        //if ( performTiming_ ) { gt_timer2_.stop(); }
+                    }
+
+                    #pragma omp section
+                    {
+                        //if ( performTiming_ ) { gt_timer2_.start("apply the coil compression on ref for coil map ... "); }
+                        hoNDArray<T> refCoilMap_upstream;
+                        //GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(refCoilMap, upstreamCoilCoeff, refCoilMap_upstream, true));
+                        gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(refCoilMap, upstreamCoilCoeffRefRecon, refCoilMap_upstream, true);
+                        refCoilMap = refCoilMap_upstream;
+                        refCoilMap_upstream.clear();
+                        //if ( performTiming_ ) { gt_timer2_.stop(); }
+                    }
+                }
+
+                if ( performTiming_ ) { gt_timer2_.stop(); }
+
+                #ifdef USE_OMP
+                    omp_set_nested(0);
+                #endif // USE_OMP 
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DT<T>::prepRef(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
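+// Note (annotation): estimate Karhunen-Loeve (KL) coil compression coefficients from the
+// reconstruction reference data (ref_recon_). When the same coefficients are reused for all N,
+// one averaged k-space is used; otherwise coefficients are computed per N with the number of
+// kept modes fixed by the first N. Pre-set coefficients are left untouched, the list is padded
+// to dataN by repeating the last entry, and the step is skipped for no-acceleration mode and
+// for the SPIRIT-family algorithms.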
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::coilCompression(WorkOrderType* workOrder3DT)
+{
+    // the 3DT recon on 5D array [RO E1 E2 CHA N]
+    try
+    {
+        size_t RO = workOrder3DT->ref_recon_.get_size(0);
+        size_t E1 = workOrder3DT->ref_recon_.get_size(1);
+        size_t E2 = workOrder3DT->ref_recon_.get_size(2);
+        size_t srcCHA = workOrder3DT->ref_recon_.get_size(3);
+        size_t N = workOrder3DT->ref_recon_.get_size(4);
+
+        size_t dataN = workOrder3DT->data_.get_size(4);
+
+        size_t n;
+
+        if (workOrder3DT->CalibMode_ == ISMRMRD_noacceleration) return true;
+
+        // compute coil compression coeff
+        if ( workOrder3DT->coil_compression_ 
+            && workOrder3DT->recon_algorithm_!=ISMRMRD_SPIRIT 
+            && workOrder3DT->recon_algorithm_!=ISMRMRD_L1SPIRIT 
+            && workOrder3DT->recon_algorithm_!=ISMRMRD_L1SPIRIT_SLEP 
+            && workOrder3DT->recon_algorithm_!=ISMRMRD_L1SPIRIT_SLEP_MOTION_COMP )
+        {
+            // check whether coil compression coeff has been preset
+            if ( workOrder3DT->coilCompressionCoef_->size()!=dataN )
+            {
+                if ( workOrder3DT->same_coil_compression_coeff_allN_ )
+                {
+                    hoNDArray<T> aveAll;
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace5D(workOrder3DT->ref_recon_, aveAll));
+                    aveAll.squeeze();
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(aveAll, debugFolder_+"aveAll"); }
+
+                    hoMatrix<T> coeff, eigenValues;
+                    if ( workOrder3DT->coil_compression_num_modesKept_ > 0 )
+                    {
+                        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(aveAll, 
+                                    workOrder3DT->coil_compression_num_modesKept_, coeff, eigenValues, true));
+                    }
+                    else
+                    {
+                        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(aveAll, 
+                                    workOrder3DT->coil_compression_thres_, coeff, eigenValues, true));
+                    }
+
+                    workOrder3DT->coilCompressionCoef_->resize(dataN);
+
+                    for ( n=0; n<dataN; n++ )
+                    {
+                        (*workOrder3DT->coilCompressionCoef_)[n] = coeff;
+                    }
+
+                    if ( !debugFolder_.empty() ) {  eigenValues.print(std::cout); }
+                    GDEBUG_STREAM("Coil compression, number of channel kept is " << coeff.cols());
+                }
+                else
+                {
+                    std::vector<size_t> allNDim(4);
+                    allNDim[0] = RO;
+                    allNDim[1] = E1;
+                    allNDim[2] = E2;
+                    allNDim[3] = srcCHA;
+
+                    size_t num_modesKept = srcCHA;
+
+                    for ( n=0; n<N; n++ )
+                    {
+                        hoNDArray<T> dataCurrN(&allNDim, workOrder3DT->ref_recon_.begin()+n*RO*E1*E2*srcCHA, false);
+
+                        hoMatrix<T> coeff, eigenValues;
+
+                        if ( n == 0 )
+                        {
+                            if ( workOrder3DT->coil_compression_num_modesKept_ > 0 )
+                            {
+                                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(dataCurrN, 
+                                            workOrder3DT->coil_compression_num_modesKept_, coeff, eigenValues, true));
+                            }
+                            else
+                            {
+                                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(dataCurrN, 
+                                            workOrder3DT->coil_compression_thres_, coeff, eigenValues, true));
+                            }
+
+                            num_modesKept = coeff.get_size(0);
+                            workOrder3DT->coilCompressionCoef_->push_back(coeff);
+                        }
+                        else
+                        {
+                            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(dataCurrN, 
+                                            (int)num_modesKept, coeff, eigenValues, true));
+
+                            workOrder3DT->coilCompressionCoef_->push_back(coeff);
+                        }
+
+                        if ( !debugFolder_.empty() ) {  eigenValues.print(std::cout); }
+                        GDEBUG_STREAM("Coil compression, number of channel kept is " << coeff.cols());
+                    }
+                }
+            }
+
+            if ( N < dataN )
+            {
+                std::vector<hoMatrix<T> > coilCompressionCoef(dataN);
+                for ( n=0; n<N; n++ )
+                {
+                    coilCompressionCoef[n] = (*workOrder3DT->coilCompressionCoef_)[n];
+                }
+
+                for ( n=N; n<dataN; n++ )
+                {
+                    coilCompressionCoef[n] = (*workOrder3DT->coilCompressionCoef_)[N-1];
+                }
+
+                *(workOrder3DT->coilCompressionCoef_) = coilCompressionCoef;
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DT<T>::coilCompression(WorkOrderType* workOrder3DT) ... ");
+        return false;
+    }
+
+    return true;
+}
+
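+// Note (annotation): compute image-domain unmixing coefficients,
+// unmixCoeff(:,:,:,src) = sum_dst kerIm(:,:,:,src,dst) .* conj(coilMap(:,:,:,dst)),
+// and the g-factor as sqrt( sum_src |unmixCoeff(:,:,:,src)|^2 ).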
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::unmixCoeff(const hoNDArray<T>& kerIm, const hoNDArray<T>& coilMap, hoNDArray<T>& unmixCoeff, hoNDArray<T>& gFactor)
+{
+    try
+    {
+        size_t RO = kerIm.get_size(0);
+        size_t E1 = kerIm.get_size(1);
+        size_t E2 = kerIm.get_size(2);
+        size_t srcCHA = kerIm.get_size(3);
+        size_t dstCHA = kerIm.get_size(4);
+
+        GADGET_CHECK_RETURN_FALSE(coilMap.get_size(0)==RO);
+        GADGET_CHECK_RETURN_FALSE(coilMap.get_size(1)==E1);
+        GADGET_CHECK_RETURN_FALSE(coilMap.get_size(2)==E2);
+        GADGET_CHECK_RETURN_FALSE(coilMap.get_size(3)==dstCHA);
+
+        unmixCoeff.create(RO, E1, E2, srcCHA);
+        Gadgetron::clear(&unmixCoeff);
+
+        int src;
+
+        T* pKerIm = const_cast<T*>(kerIm.begin());
+        T* pCoilMap = const_cast<T*>(coilMap.begin());
+        T* pCoeff = unmixCoeff.begin();
+
+        std::vector<size_t> dim(3);
+        dim[0] = RO;
+        dim[1] = E1;
+        dim[2] = E2;
+
+        #pragma omp parallel default(none) private(src) shared(RO, E1, E2, srcCHA, dstCHA, pKerIm, pCoilMap, pCoeff, dim)
+        {
+            hoNDArray<T> coeff2D, coeffTmp(&dim);
+            hoNDArray<T> coilMap2D;
+            hoNDArray<T> kerIm2D;
+
+            #pragma omp for
+            for ( src=0; src<(int)srcCHA; src++ )
+            {
+                coeff2D.create(&dim, pCoeff+src*RO*E1*E2);
+
+                for ( size_t dst=0; dst<dstCHA; dst++ )
+                {
+                    kerIm2D.create(&dim, pKerIm+src*RO*E1*E2+dst*RO*E1*E2*srcCHA);
+                    coilMap2D.create(&dim, pCoilMap+dst*RO*E1*E2);
+                    Gadgetron::multiplyConj(kerIm2D, coilMap2D, coeffTmp);
+                    Gadgetron::add(coeff2D, coeffTmp, coeff2D);
+                }
+            }
+        }
+
+        hoNDArray<T> conjUnmixCoeff(unmixCoeff);
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiplyConj(unmixCoeff, conjUnmixCoeff, conjUnmixCoeff));
+
+        gFactor.create(RO, E1, E2);
+        Gadgetron::clear(&gFactor);
+
+        hoNDArray<T> gFactorBuf(RO, E1, E2, 1, gFactor.begin());
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::sum_over_dimension(conjUnmixCoeff, gFactorBuf, 3));
+        Gadgetron::sqrt(gFactor, gFactor);
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DT<T>::unmixCoeff(const hoNDArray<T>& kerIm, const hoNDArray<T>& coilMap, hoNDArray<T>& unmixCoeff, hoNDArray<T>& gFactor) ... ");
+        return false;
+    }
+
+    return true;
+}
+
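+// Note (annotation): apply the image-domain unaliasing kernel to undersampled k-space;
+// transform kspace to the image domain with a centered 3D IFFT, then delegate to
+// applyImageDomainKernelImage.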
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::applyImageDomainKernel(const hoNDArray<T>& kspace, const hoNDArray<T>& kerIm, hoNDArray<T>& complexIm)
+{
+    try
+    {
+        size_t RO = kerIm.get_size(0);
+        size_t E1 = kerIm.get_size(1);
+        size_t E2 = kerIm.get_size(2);
+        size_t srcCHA = kerIm.get_size(3);
+        size_t dstCHA = kerIm.get_size(4);
+
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(0)==RO);
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(1)==E1);
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(2)==E2);
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(3)==srcCHA);
+
+        // buffer3DT_unwrapping_ = kspace;
+
+        hoNDArray<T> buffer3DT(kspace.get_dimensions());
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(kspace, buffer3DT));
+
+        GADGET_CHECK_RETURN_FALSE(applyImageDomainKernelImage(buffer3DT, kerIm, complexIm));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DT<T>::applyImageDomainKernel(const hoNDArray<T>& kspace, const hoNDArray<T>& kerIm, hoNDArray<T>& complexIm) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::applyImageDomainKernelImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& kerIm, hoNDArray<T>& complexIm)
+{
+    return applyImageDomainKernelImage(aliasedIm, kerIm, this->buf4D, complexIm);
+}
+
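+// Note (annotation): unalias in the image domain; for every destination channel, multiply the
+// aliased source-channel images by the corresponding kernel images and sum over the
+// source-channel dimension to form that destination channel.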
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::applyImageDomainKernelImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& kerIm, hoNDArray<T>& kerImBuffer, hoNDArray<T>& complexIm)
+{
+    try
+    {
+        size_t RO = kerIm.get_size(0);
+        size_t E1 = kerIm.get_size(1);
+        size_t E2 = kerIm.get_size(2);
+        size_t srcCHA = kerIm.get_size(3);
+        size_t dstCHA = kerIm.get_size(4);
+
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(0)==RO);
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(1)==E1);
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(2)==E2);
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(3)==srcCHA);
+
+        boost::shared_ptr< std::vector<size_t> > dim = aliasedIm.get_dimensions();
+        std::vector<size_t> dimIm(*dim);
+        dimIm[3] = dstCHA;
+
+        if ( !complexIm.dimensions_equal(&dimIm) )
+        {
+            complexIm.create(&dimIm);
+        }
+        Gadgetron::clear(&complexIm);
+
+        std::vector<size_t> dim4D(4);
+        dim4D[0] = RO;
+        dim4D[1] = E1;
+        dim4D[2] = E2;
+        dim4D[3] = srcCHA;
+
+        std::vector<size_t> dimIm4D(4);
+        dimIm4D[0] = RO;
+        dimIm4D[1] = E1;
+        dimIm4D[2] = E2;
+        dimIm4D[3] = dstCHA;
+
+        size_t num = aliasedIm.get_number_of_elements()/ (RO*E1*E2*srcCHA);
+
+        int n;
+
+        #pragma omp parallel default(none) private(n) shared(num, dim4D, aliasedIm, RO, E1, E2, srcCHA, dstCHA, kerIm, complexIm) num_threads( (int)((num<16) ? num : 16) )
+        {
+            hoNDArray<T> unwrapped4D(RO, E1, E2, srcCHA);
+
+            #pragma omp for
+            for ( n=0; n<(int)num; n++ )
+            {
+                hoNDArray<T> buf4D(&dim4D, const_cast<T*>(aliasedIm.begin()+n*RO*E1*E2*srcCHA));
+
+                int dCha;
+
+                for ( dCha=0; dCha<(int)dstCHA; dCha++ )
+                {
+                    hoNDArray<T> kerIm4D(RO, E1, E2, srcCHA, const_cast<T*>(kerIm.begin()+dCha*RO*E1*E2*srcCHA));
+                    hoNDArray<T> complexIm3D(RO, E1, E2, 1, complexIm.begin()+n*RO*E1*E2*dstCHA+dCha*RO*E1*E2);
+                    Gadgetron::multiply(kerIm4D, buf4D, unwrapped4D);
+                    Gadgetron::sum_over_dimension(unwrapped4D, complexIm3D, 3);
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DT<T>::applyImageDomainKernelImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& kerIm, hoNDArray<T>& kerImBuffer, hoNDArray<T>& complexIm) ... ");
+        return false;
+    }
+
+    return true;
+}
+
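+// Note (annotation): apply unmixing coefficients to k-space; centered 3D IFFT into the image
+// domain, then delegate to applyUnmixCoeffImage.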
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::applyUnmixCoeff(const hoNDArray<T>& kspace, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(0)==unmixCoeff.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(1)==unmixCoeff.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(2)==unmixCoeff.get_size(2));
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(3)==unmixCoeff.get_size(3));
+
+        // buffer3DT_unwrapping_ = kspace;
+        hoNDArray<T> buffer3DT(kspace.get_dimensions());
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(kspace, buffer3DT));
+        GADGET_CHECK_RETURN_FALSE(applyUnmixCoeffImage(buffer3DT, unmixCoeff, complexIm));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DT<T>::applyUnmixCoeff(const hoNDArray<T>& kspace, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm) ... ");
+        return false;
+    }
+
+    return true;
+}
+
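+// Note (annotation): combine aliased channel images with the unmixing coefficients;
+// element-wise multiply, then sum over the channel dimension to produce the single-channel
+// complex image.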
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::applyUnmixCoeffImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(0)==unmixCoeff.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(1)==unmixCoeff.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(2)==unmixCoeff.get_size(2));
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(3)==unmixCoeff.get_size(3));
+
+        boost::shared_ptr< std::vector<size_t> > dim = aliasedIm.get_dimensions();
+        std::vector<size_t> dimIm(*dim);
+        dimIm[3] = 1;
+
+        if ( !complexIm.dimensions_equal(&dimIm) )
+        {
+            complexIm.create(&dimIm);
+        }
+        Gadgetron::clear(&complexIm);
+
+        hoNDArray<T> buffer3DT(aliasedIm.get_dimensions());
+
+        Gadgetron::multiply(aliasedIm, unmixCoeff, buffer3DT);
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(buffer3DT, complexIm, 3));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DT<T>::applyUnmixCoeffImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm) ... ");
+        return false;
+    }
+
+    return true;
+}
+
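+// Note (annotation): post-processing after unwrapping; optionally fill the acquired reference
+// lines back into the reconstructed full k-space (embedded mode), run partial Fourier handling,
+// recompute a full-resolution coil map and coil-combine when requested, and finally flip the
+// combined image along E2 about its center.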
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::afterUnwrapping(WorkOrderType* workOrder3DT)
+{
+    try
+    {
+        bool fullres_coilmap = false;
+        bool ref_fillback = false;
+        bool averageallN_coilmap = false;
+        bool same_coilmap_allN = false;
+        size_t whichN_coilmap = 0;
+
+        size_t RO = workOrder3DT->data_.get_size(0);
+        size_t E1 = workOrder3DT->data_.get_size(1);
+        size_t E2 = workOrder3DT->data_.get_size(2);
+        size_t srcCHA = workOrder3DT->kernelIm_->get_size(3);
+        size_t dstCHA = workOrder3DT->kernelIm_->get_size(4);
+        size_t N = workOrder3DT->data_.get_size(4);
+
+        if ( workOrder3DT->CalibMode_ == ISMRMRD_embedded )
+        {
+            if ( workOrder3DT->embedded_fullres_coilmap_ )
+            {
+                fullres_coilmap = true;
+            }
+
+            if ( workOrder3DT->embedded_ref_fillback_ )
+            {
+                ref_fillback = true;
+            }
+
+            if ( workOrder3DT->embedded_averageall_ref_ )
+            {
+                averageallN_coilmap = true;
+            }
+
+            if ( workOrder3DT->embedded_same_combinationcoeff_allN_ )
+            {
+                same_coilmap_allN = true;
+                whichN_coilmap = workOrder3DT->embedded_whichN_combinationcoeff_;
+            }
+        }
+
+        if ( workOrder3DT->CalibMode_ == ISMRMRD_separate )
+        {
+            if ( workOrder3DT->separate_fullres_coilmap_ )
+            {
+                fullres_coilmap = true;
+            }
+
+            if ( workOrder3DT->separate_averageall_ref_ )
+            {
+                averageallN_coilmap = true;
+            }
+
+            if ( workOrder3DT->separate_same_combinationcoeff_allN_ )
+            {
+                same_coilmap_allN = true;
+                whichN_coilmap = workOrder3DT->separate_whichN_combinationcoeff_;
+            }
+        }
+
+        if ( whichN_coilmap >= N ) whichN_coilmap = N-1;
+
+        if ( ref_fillback )
+        {
+            if ( performTiming_ ) { gt_timer2_.start("ref fill back ... "); }
+
+            hoNDArray<T> ref_dst;
+            if ( workOrder3DT->coil_compression_ 
+                && workOrder3DT->recon_algorithm_!=ISMRMRD_SPIRIT 
+                && workOrder3DT->recon_algorithm_!=ISMRMRD_L1SPIRIT 
+                && workOrder3DT->recon_algorithm_!=ISMRMRD_L1SPIRIT_SLEP 
+                && workOrder3DT->recon_algorithm_!=ISMRMRD_L1SPIRIT_SLEP_MOTION_COMP )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.applyKLCoilCompressionCoeff(workOrder3DT->ref_, *workOrder3DT->coilCompressionCoef_, ref_dst, true));
+            }
+            else
+            {
+                ref_dst = workOrder3DT->ref_;
+            }
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(ref_dst, debugFolder_+"ref_dst"); }
+
+            if ( (ref_dst.get_size(3)==dstCHA) && (ref_dst.get_size(4)==N) )
+            {
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder3DT->fullkspace_, debugFolder_+"fullkspace_"); }
+
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.copyAlongROE1E2(ref_dst, workOrder3DT->fullkspace_, 0, RO-1, startE1_, endE1_, startE2_, endE2_));
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder3DT->fullkspace_, debugFolder_+"fullkspace_After"); }
+            }
+
+            if ( performTiming_ ) { gt_timer2_.stop(); }
+        }
+
+        // partial fourier handling
+        if ( partial_fourier_handling_ )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->performPartialFourierHandling(workOrder3DT));
+        }
+
+        if ( this->computeKSpace(workOrder3DT) || fullres_coilmap )
+        {
+            if ( performTiming_ ) { gt_timer2_.start("full res coil map : allocate buffer 3DT ...  "); }
+            hoNDArray<T> buffer3DT(workOrder3DT->fullkspace_.get_dimensions());
+            hoNDArray<T> buffer3DT_Two(workOrder3DT->fullkspace_.get_dimensions());
+            if ( performTiming_ ) { gt_timer2_.stop(); }
+
+            if ( performTiming_ ) { gt_timer2_.start("full res coil map : go to image domain ...  "); }
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(workOrder3DT->fullkspace_, buffer3DT, buffer3DT_Two);
+            if ( performTiming_ ) { gt_timer2_.stop(); }
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(buffer3DT, debugFolder_+"ComplexIm_afterRefFill"); }
+
+            if ( averageallN_coilmap )
+            {
+                if ( workOrder3DT->workFlow_use_BufferedKernel_ )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilCombine3D(buffer3DT, *workOrder3DT->coilMap_, workOrder3DT->complexIm_));
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder3DT->complexIm_, debugFolder_+"complexImCombined"); }
+                }
+                else
+                {
+                    if ( performTiming_ ) { gt_timer2_.start("full res coil map : allocate coil map ...  "); }
+                    workOrder3DT->coilMap_->create(RO, E1, E2, dstCHA, 1);
+                    if ( performTiming_ ) { gt_timer2_.stop(); }
+
+                    if ( N > 1 )
+                    {
+                        hoNDArray<T> aveComplexIm(RO, E1, E2, dstCHA, 1);
+                        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace5D(buffer3DT, aveComplexIm));
+
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(aveComplexIm, debugFolder_+"aveComplexIm"); }
+
+                        if ( performTiming_ ) { gt_timer2_.start("full res coil map : compute 3D coil map ...  "); }
+                        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIH(aveComplexIm, *workOrder3DT->coilMap_, workOrder3DT->coil_map_algorithm_, workOrder3DT->csm_kSize_, workOrder3DT->csm_powermethod_num_, workOrder3DT->csm_iter_num_, (value_type)workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+                        if ( performTiming_ ) { gt_timer2_.stop(); }
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*workOrder3DT->coilMap_, debugFolder_+"coilMap_fullres"); }
+                    }
+                    else
+                    {
+                        if ( performTiming_ ) { gt_timer2_.start("full res coil map : compute 3D coil map ...  "); }
+                        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIH(buffer3DT, *workOrder3DT->coilMap_, workOrder3DT->coil_map_algorithm_, workOrder3DT->csm_kSize_, workOrder3DT->csm_powermethod_num_, workOrder3DT->csm_iter_num_, (value_type)workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+                        if ( performTiming_ ) { gt_timer2_.stop(); }
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*workOrder3DT->coilMap_, debugFolder_+"coilMap_fullres"); }
+                    }
+
+                    if ( performTiming_ ) { gt_timer2_.start("full res coil map : coil combine 3D ...  "); }
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilCombine3D(buffer3DT, *workOrder3DT->coilMap_, workOrder3DT->complexIm_));
+                    if ( performTiming_ ) { gt_timer2_.stop(); }
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder3DT->complexIm_, debugFolder_+"complexImCombined"); }
+                }
+            }
+            else
+            {
+                if ( workOrder3DT->workFlow_use_BufferedKernel_ )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilCombine3D(buffer3DT, *workOrder3DT->coilMap_, workOrder3DT->complexIm_));
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder3DT->complexIm_, debugFolder_+"complexIm_"); }
+                }
+                else
+                {
+                    workOrder3DT->coilMap_->create(RO, E1, E2, dstCHA, N);
+
+                    if ( same_coilmap_allN )
+                    {
+                        hoNDArray<T> complexImN(RO, E1, E2, dstCHA, buffer3DT.begin()+whichN_coilmap*RO*E1*E2*dstCHA);
+                        hoNDArray<T> coilMapN(RO, E1, E2, dstCHA, workOrder3DT->coilMap_->begin()+whichN_coilmap*RO*E1*E2*dstCHA);
+
+                        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIH(complexImN, coilMapN, workOrder3DT->coil_map_algorithm_, workOrder3DT->csm_kSize_, workOrder3DT->csm_powermethod_num_, workOrder3DT->csm_iter_num_, (value_type)workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+                        GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder3DT->coilMap_, whichN_coilmap));
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*workOrder3DT->coilMap_, debugFolder_+"coilMap_fullres"); }
+                    }
+                    else
+                    {
+                        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIH(buffer3DT, *workOrder3DT->coilMap_, workOrder3DT->coil_map_algorithm_, workOrder3DT->csm_kSize_, workOrder3DT->csm_powermethod_num_, workOrder3DT->csm_iter_num_, (value_type)workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*workOrder3DT->coilMap_, debugFolder_+"coilMap_fullres"); }
+                    }
+
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilCombine3D(buffer3DT, *workOrder3DT->coilMap_, workOrder3DT->complexIm_));
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder3DT->complexIm_, debugFolder_+"complexIm_"); }
+                }
+            }
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(workOrder3DT->complexIm_.get_size(0)==RO);
+            GADGET_CHECK_RETURN_FALSE(workOrder3DT->complexIm_.get_size(1)==E1);
+            GADGET_CHECK_RETURN_FALSE(workOrder3DT->complexIm_.get_size(2)==E2);
+
+            if ( partial_fourier_handling_ )
+            {
+                bool partialFourierHandling = true;
+                if ( (workOrder3DT->start_RO_<0 || workOrder3DT->end_RO_<0 || (workOrder3DT->end_RO_-workOrder3DT->start_RO_+1==RO) ) 
+                        && (workOrder3DT->start_E1_<0 || workOrder3DT->end_E1_<0 || (workOrder3DT->end_E1_-workOrder3DT->start_E1_+1==E1) ) 
+                        && (workOrder3DT->start_E2_<0 || workOrder3DT->end_E2_<0 || (workOrder3DT->end_E2_-workOrder3DT->start_E2_+1==E2) ) )
+                {
+                    partialFourierHandling = false;
+                }
+
+                // if the partial fourier handling is used to compute updated full kspace, the coil combination needs to be repeated
+                if ( partialFourierHandling )
+                {
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder3DT->complexIm_, debugFolder_+"complexIm_origin_noFullResCoilMap_"); }
+
+                    if ( performTiming_ ) { gt_timer2_.start("after partial fourier handling, allocate buffer 3DT ...  "); }
+                    hoNDArray<T> buffer3DT(workOrder3DT->fullkspace_.get_dimensions());
+                    hoNDArray<T> buffer3DT_Two(workOrder3DT->fullkspace_.get_dimensions());
+                    if ( performTiming_ ) { gt_timer2_.stop(); }
+
+                    // if the partial fourier handling is performed on the fullkspace, an extra coil combination is needed
+                    if (workOrder3DT->CalibMode_ == ISMRMRD_noacceleration)
+                    {
+                        hoNDArray<T> buffer3DT_Two(workOrder3DT->data_.get_dimensions());
+                        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(workOrder3DT->data_, buffer3DT, buffer3DT_Two);
+                        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilCombine(buffer3DT, *workOrder3DT->coilMap_, workOrder3DT->complexIm_));
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder3DT->complexIm_, debugFolder_+"complexIm_noFullResCoilMap_"); }
+                    }
+                    else if ( workOrder3DT->fullkspace_.get_number_of_elements() > 0 )
+                    {
+                        if (workOrder3DT->fullkspace_.get_size(3) == workOrder3DT->coilMap_->get_size(3))
+                        {
+                            hoNDArray<T> buffer3DT_Two(workOrder3DT->fullkspace_.get_dimensions());
+                            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(workOrder3DT->fullkspace_, buffer3DT, buffer3DT_Two);
+                            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilCombine(buffer3DT, *workOrder3DT->coilMap_, workOrder3DT->complexIm_));
+                            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder3DT->complexIm_, debugFolder_+"complexIm_noFullResCoilMap_"); }
+                        }
+                        else
+                        {
+                            if (workOrder3DT->fullkspace_.get_size(3) == 1)
+                            {
+                                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(workOrder3DT->fullkspace_, buffer3DT);
+                                memcpy(workOrder3DT->complexIm_.begin(), buffer3DT.begin(), workOrder3DT->complexIm_.get_number_of_bytes());
+                                if (!debugFolder_.empty()) { gt_exporter_.exportArrayComplex(workOrder3DT->complexIm_, debugFolder_ + "complexIm_noFullResCoilMap__noReconKSpace_"); }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        // flip along E2
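+        // each output slice e2 is copied from the mirrored slice 2*mid_E2-e2 (wrapped back into
+        // [0, imE2)), i.e. the combined volume is reflected about its E2 center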
+        if ( performTiming_ ) { gt_timer2_.start("flip along E2 ...  "); }
+
+        size_t imRO = workOrder3DT->complexIm_.get_size(0);
+        size_t imE1 = workOrder3DT->complexIm_.get_size(1);
+        size_t imE2 = workOrder3DT->complexIm_.get_size(2);
+        size_t imCHA = workOrder3DT->complexIm_.get_size(3);
+
+        hoNDArray<T> complexIm(workOrder3DT->complexIm_);
+
+        T* pSrc = workOrder3DT->complexIm_.begin();
+        T* pDst = complexIm.begin();
+
+        size_t mid_RO = imRO/2;
+        size_t mid_E1 = imE1/2;
+        size_t mid_E2 = imE2/2;
+
+        size_t n, cha;
+        for ( n=0; n<workOrder3DT->complexIm_.get_size(4); n++ )
+        {
+            for ( cha=0; cha<imCHA; cha++ )
+            {
+                size_t offset = n*imRO*imE1*imE2*imCHA+cha*imRO*imE1*imE2;
+
+                for ( size_t e2=0; e2<imE2; e2++ )
+                {
+                    size_t e2_from = 2*mid_E2-e2;
+                    if ( e2_from >= imE2 ) e2_from -= imE2;
+
+                    memcpy(pDst+offset+e2*imRO*imE1, pSrc+offset+e2_from*imRO*imE1, sizeof(T)*imRO*imE1);
+                }
+            }
+        }
+        if ( performTiming_ ) { gt_timer2_.stop(); }
+
+        workOrder3DT->complexIm_ = complexIm;
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DT<T>::afterUnwrapping(WorkOrderType* workOrder3DT) ... ");
+        return false;
+    }
+
+    return true;
+}
+
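+// Note (annotation): dispatch partial Fourier handling; scale by the asymmetric-sampling
+// compensation factor (zero-filling variants only), then run the selected algorithm
+// (zero-filling filter, POCS, or Feng Huang) on data_, fullkspace_, or the coil-combined image,
+// depending on the calibration mode and which buffers are available.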
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::performPartialFourierHandling(WorkOrderType* workOrder3DT)
+{
+    try
+    {
+        value_type partialFourierCompensationFactor = 1;
+
+        size_t RO = workOrder3DT->data_.get_size(0);
+        size_t E1 = workOrder3DT->data_.get_size(1);
+        size_t E2 = workOrder3DT->data_.get_size(2);
+
+        if ( !( workOrder3DT->start_RO_<0 || workOrder3DT->end_RO_<0 || (workOrder3DT->end_RO_-workOrder3DT->start_RO_+1==RO) ) )
+        {
+            partialFourierCompensationFactor *= (value_type)(RO)/(value_type)(workOrder3DT->end_RO_-workOrder3DT->start_RO_+1);
+        }
+
+        if ( !( workOrder3DT->start_E1_<0 || workOrder3DT->end_E1_<0 || (workOrder3DT->end_E1_-workOrder3DT->start_E1_+1==E1) ) )
+        {
+            if ( workOrder3DT->end_E1_-workOrder3DT->start_E1_+1 <= E1 )
+            {
+                partialFourierCompensationFactor *= (value_type)(E1)/(value_type)(workOrder3DT->end_E1_-workOrder3DT->start_E1_+1);
+            }
+        }
+
+        if ( !( workOrder3DT->start_E2_<0 || workOrder3DT->end_E2_<0 || (workOrder3DT->end_E2_-workOrder3DT->start_E2_+1==E2) ) )
+        {
+            if ( workOrder3DT->end_E2_-workOrder3DT->start_E2_+1 <= E2 )
+            {
+                partialFourierCompensationFactor *= (value_type)(E2)/(value_type)(workOrder3DT->end_E2_-workOrder3DT->start_E2_+1);
+            }
+        }
+
+        partialFourierCompensationFactor = std::sqrt(partialFourierCompensationFactor);
+        if ( performTiming_ ) { GDEBUG_STREAM("Partial Fourier scaling factor : " << partialFourierCompensationFactor); }
+
+        // if ( workOrder3DT->partialFourier_algo_ == ISMRMRD_PF_ZEROFILLING ) return true;
+
+        if (workOrder3DT->CalibMode_ == ISMRMRD_noacceleration)
+        {
+            if ( (workOrder3DT->partialFourier_algo_ == ISMRMRD_PF_ZEROFILLING || workOrder3DT->partialFourier_algo_ == ISMRMRD_PF_ZEROFILLING_FILTER) && (std::abs(partialFourierCompensationFactor-1)>FLT_EPSILON) )
+            {
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::scal(partialFourierCompensationFactor, workOrder3DT->data_));
+            }
+
+            if ( workOrder3DT->partialFourier_algo_ == ISMRMRD_PF_ZEROFILLING_FILTER )
+            {
+                GADGET_CHECK_RETURN_FALSE(performPartialFourierFilter(*workOrder3DT, workOrder3DT->data_));
+            }
+
+            if ( workOrder3DT->partialFourier_algo_ == ISMRMRD_PF_POCS )
+            {
+                GADGET_CHECK_RETURN_FALSE(performPartialFourierPOCSRecon(*workOrder3DT, workOrder3DT->data_));
+            }
+
+            if ( workOrder3DT->partialFourier_algo_ == ISMRMRD_PF_FENGHUANG )
+            {
+                GADGET_CHECK_RETURN_FALSE(performPartialFourierFengHuangRecon(*workOrder3DT, workOrder3DT->data_));
+            }
+        }
+        else if ( workOrder3DT->fullkspace_.get_number_of_elements() > 0 )
+        {
+            if ( (workOrder3DT->partialFourier_algo_ == ISMRMRD_PF_ZEROFILLING || workOrder3DT->partialFourier_algo_ == ISMRMRD_PF_ZEROFILLING_FILTER) && (std::abs(partialFourierCompensationFactor-1)>FLT_EPSILON) )
+            {
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::scal(partialFourierCompensationFactor, workOrder3DT->fullkspace_));
+            }
+
+            if ( workOrder3DT->partialFourier_algo_ == ISMRMRD_PF_ZEROFILLING_FILTER )
+            {
+                GADGET_CHECK_RETURN_FALSE(performPartialFourierFilter(*workOrder3DT, workOrder3DT->fullkspace_));
+            }
+
+            if ( workOrder3DT->partialFourier_algo_ == ISMRMRD_PF_POCS )
+            {
+                GADGET_CHECK_RETURN_FALSE(performPartialFourierPOCSRecon(*workOrder3DT, workOrder3DT->fullkspace_));
+            }
+
+            if ( workOrder3DT->partialFourier_algo_ == ISMRMRD_PF_FENGHUANG )
+            {
+                GADGET_CHECK_RETURN_FALSE(performPartialFourierFengHuangRecon(*workOrder3DT, workOrder3DT->fullkspace_));
+            }
+        }
+        else
+        {
+            // perform partial fourier handling on the complex images after coil combination
+            hoNDArray<T> kspace(workOrder3DT->complexIm_.get_dimensions());
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(workOrder3DT->complexIm_, kspace);
+
+            if ( (workOrder3DT->partialFourier_algo_ == ISMRMRD_PF_ZEROFILLING || workOrder3DT->partialFourier_algo_ == ISMRMRD_PF_ZEROFILLING_FILTER) && (std::abs(partialFourierCompensationFactor-1)>FLT_EPSILON) )
+            {
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::scal(partialFourierCompensationFactor, kspace));
+            }
+
+            if ( workOrder3DT->partialFourier_algo_ == ISMRMRD_PF_ZEROFILLING_FILTER )
+            {
+                GADGET_CHECK_RETURN_FALSE(performPartialFourierFilter(*workOrder3DT, kspace));
+            }
+
+            if ( workOrder3DT->partialFourier_algo_ == ISMRMRD_PF_POCS )
+            {
+                GADGET_CHECK_RETURN_FALSE(performPartialFourierPOCSRecon(*workOrder3DT, kspace));
+            }
+
+            if ( workOrder3DT->partialFourier_algo_ == ISMRMRD_PF_FENGHUANG )
+            {
+                GADGET_CHECK_RETURN_FALSE(performPartialFourierFengHuangRecon(*workOrder3DT, kspace));
+            }
+
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(kspace, workOrder3DT->complexIm_);
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DT<T>::performPartialFourierHandling(gtPlusReconworkOrder3DT<T>* workOrder3DT) ... ");
+        return false;
+    }
+
+    return true;
+}
+
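+// Note (annotation): zero-filling with filtering; apply the partial Fourier k-space filters
+// (either a combined RO/E1/E2 filter or separable per-axis filters) to taper the sampled edge.
+// Returns immediately when no partial Fourier sampling is detected.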
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::performPartialFourierFilter(gtPlusReconWorkOrder3DT<T>& workOrder3DT, hoNDArray<T>& kspace)
+{
+    try
+    {
+        GDEBUG_STREAM("--> Into gt Plus 3DT partial fourier filter ... ");
+
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t E2 = kspace.get_size(2);
+
+        // check whether partial fourier is used
+        if ( (workOrder3DT.start_RO_<0 || workOrder3DT.end_RO_<0 || (workOrder3DT.end_RO_-workOrder3DT.start_RO_+1==RO) ) 
+            && (workOrder3DT.start_E1_<0 || workOrder3DT.end_E1_<0 || (workOrder3DT.end_E1_-workOrder3DT.start_E1_+1==E1) )
+            && (workOrder3DT.start_E2_<0 || workOrder3DT.end_E2_<0 || (workOrder3DT.end_E2_-workOrder3DT.start_E2_+1==E2) ) )
+        {
+            return true;
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace_before_PF_Filter"); }
+
+        hoNDArray<T> buffer3DT_partial_fourier(kspace.get_dimensions());
+
+        if ( workOrder3DT.filterROE1E2_partialfourier_.get_size(0)==RO 
+                && workOrder3DT.filterROE1E2_partialfourier_.get_size(1)==E1
+                && workOrder3DT.filterROE1E2_partialfourier_.get_size(2)==E2 )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspace3DfilterROE1E2(kspace, workOrder3DT.filterROE1E2_partialfourier_, buffer3DT_partial_fourier));
+            kspace = buffer3DT_partial_fourier;
+        }
+
+        else if ( (workOrder3DT.filterRO_partialfourier_.get_number_of_elements() == RO) 
+                && (workOrder3DT.filterE1_partialfourier_.get_number_of_elements() == E1) 
+                && (workOrder3DT.filterE2_partialfourier_.get_number_of_elements() == E2) )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspace3DfilterROE1E2(kspace, workOrder3DT.filterRO_partialfourier_, 
+                    workOrder3DT.filterE1_partialfourier_, workOrder3DT.filterE2_partialfourier_, buffer3DT_partial_fourier));
+
+            kspace = buffer3DT_partial_fourier;
+        }
+
+        else
+        {
+            hoNDArray<T>* pSrc = &kspace;
+            hoNDArray<T>* pDst = &buffer3DT_partial_fourier;
+
+            bool filterPerformed = false;
+
+            if ( workOrder3DT.filterRO_partialfourier_.get_number_of_elements() == RO )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspacefilterRO(*pSrc, workOrder3DT.filterRO_partialfourier_, *pDst));
+                std::swap(pSrc, pDst);
+                filterPerformed = true;
+            }
+
+            if ( workOrder3DT.filterE1_partialfourier_.get_number_of_elements() == E1 )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspacefilterE1(*pSrc, workOrder3DT.filterE1_partialfourier_, *pDst));
+                std::swap(pSrc, pDst);
+                filterPerformed = true;
+            }
+
+            if ( workOrder3DT.filterE2_partialfourier_.get_number_of_elements() == E2 )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspace3DfilterE2(*pSrc, workOrder3DT.filterE2_partialfourier_, *pDst));
+                std::swap(pSrc, pDst);
+                filterPerformed = true;
+            }
+
+            if ( filterPerformed && pDst != &kspace )
+            {
+                kspace = *pDst;
+            }
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace_after_PF_Filter"); }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DT<T>::performPartialFourierFilter(gtPlusReconWorkOrder3DT<T>& workOrder3DT, hoNDArray<T>& kspace) ... ");
+        return false;
+    }
+
+    return true;
+}
+
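+// Note (annotation): POCS partial Fourier reconstruction; estimate a low-resolution phase from
+// symmetrically filtered k-space, then iterate between enforcing that phase on the magnitude
+// image and restoring the acquired k-space region, stopping when the relative image change
+// falls below partialFourier_POCS_thres_ or the iteration limit is reached. A transition band
+// can be used when copying the acquired region back.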
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::performPartialFourierPOCSRecon(WorkOrderType& workOrder3DT, hoNDArray<T>& kspace)
+{
+    try
+    {
+        GDEBUG_STREAM("--> Into gt Plus 3DT partial fourier POCS ... ");
+
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t E2 = kspace.get_size(2);
+        size_t CHA = kspace.get_size(3);
+        size_t N = kspace.get_size(4);
+
+        // check whether partial fourier is used
+        if ( (workOrder3DT.start_RO_<0 || workOrder3DT.end_RO_<0 || (workOrder3DT.end_RO_-workOrder3DT.start_RO_+1==RO) ) 
+            && (workOrder3DT.start_E1_<0 || workOrder3DT.end_E1_<0 || (workOrder3DT.end_E1_-workOrder3DT.start_E1_+1==E1) )
+            && (workOrder3DT.start_E2_<0 || workOrder3DT.end_E2_<0 || (workOrder3DT.end_E2_-workOrder3DT.start_E2_+1==E2) ) )
+        {
+            return true;
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace_before_POCS"); }
+
+        // create kspace filter for homodyne phase estimation
+        ISMRMRDKSPACEFILTER filter_ref_type_ = ISMRMRD_FILTER_HANNING;
+        double filter_ref_sigma_ = 1.5;
+        double filter_ref_width_ = 0.15;
+
+        size_t startRO(0), endRO(RO-1);
+        hoNDArray<T> filterRO(RO);
+        if ( (workOrder3DT.start_RO_<0 || workOrder3DT.end_RO_<0) )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(RO, 0, RO-1, 
+                filterRO, filter_ref_type_, filter_ref_sigma_, (size_t)std::ceil(filter_ref_width_*RO)));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(RO, workOrder3DT.start_RO_, workOrder3DT.end_RO_, 
+                filterRO, filter_ref_type_, filter_ref_sigma_, (size_t)std::ceil(filter_ref_width_*RO)));
+
+            startRO = workOrder3DT.start_RO_;
+            endRO = workOrder3DT.end_RO_;
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(filterRO, debugFolder_+"filterRO_POCS"); }
+
+        size_t startE1(0), endE1(E1-1);
+        hoNDArray<T> filterE1(E1);
+        if ( (workOrder3DT.start_E1_<0 || workOrder3DT.end_E1_<0) )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(E1, 0, E1-1, 
+                filterE1, filter_ref_type_, filter_ref_sigma_, (size_t)std::ceil(filter_ref_width_*E1)));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(E1, workOrder3DT.start_E1_, workOrder3DT.end_E1_, 
+                filterE1, filter_ref_type_, filter_ref_sigma_, (size_t)std::ceil(filter_ref_width_*E1)));
+
+            startE1 = workOrder3DT.start_E1_;
+            endE1 = workOrder3DT.end_E1_;
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(filterE1, debugFolder_+"filterE1_POCS"); }
+
+        size_t startE2(0), endE2(E2-1);
+        hoNDArray<T> filterE2(E2);
+        if ( (workOrder3DT.start_E2_<0 || workOrder3DT.end_E2_<0) )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(E2, 0, E2-1, 
+                filterE2, filter_ref_type_, filter_ref_sigma_, (size_t)std::ceil(filter_ref_width_*E2)));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(E2, workOrder3DT.start_E2_, workOrder3DT.end_E2_, 
+                filterE2, filter_ref_type_, filter_ref_sigma_, (size_t)std::ceil(filter_ref_width_*E2)));
+
+            startE2 = workOrder3DT.start_E2_;
+            endE2 = workOrder3DT.end_E2_;
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(filterE2, debugFolder_+"filterE2_POCS"); }
+
+        hoNDArray<T> kspaceIter(kspace.get_dimensions());
+        kspaceIter = kspace;
+
+        // magnitude of complex images
+        hoNDArray<typename realType<T>::Type> mag(kspace.get_dimensions());
+        hoNDArray<T> magComplex(kspace.get_dimensions());
+
+        hoNDArray<T> buffer3DT(kspace.get_dimensions());
+        hoNDArray<T> buffer3DT_partial_fourier(kspace.get_dimensions());
+
+        // kspace filter
+        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspace3DfilterROE1E2(kspaceIter, filterRO, filterE1, filterE2, buffer3DT_partial_fourier));
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(buffer3DT_partial_fourier, debugFolder_+"POCS_afterFiltered"); }
+
+        // go to image domain
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(buffer3DT_partial_fourier);
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(buffer3DT_partial_fourier, debugFolder_+"POCS_afterFiltered_complexIm"); }
+
+        // get the complex image phase for the filtered kspace
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::abs(buffer3DT_partial_fourier, mag));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::addEpsilon(mag));
+        GADGET_CHECK_RETURN_FALSE(magComplex.copyFrom(mag));
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::divide(buffer3DT_partial_fourier, magComplex, buffer3DT));
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(buffer3DT, debugFolder_+"POCS_afterFiltered_complexIm_phase"); }
+
+        // complex images, initialized as not filtered complex image
+        hoNDArray<T> complexIm(kspaceIter);
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(kspaceIter, complexIm);
+        hoNDArray<T> complexImPOCS(complexIm);
+
+        // the kspace during iteration is buffered here
+        hoNDArray<T> buffer3DT_partial_fourierkspaceIter(kspaceIter);
+
+        size_t ii;
+        for ( ii=0; ii<workOrder3DT.partialFourier_POCS_iters_; ii++ )
+        {
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::abs(complexImPOCS, mag));
+            GADGET_CHECK_RETURN_FALSE(magComplex.copyFrom(mag));
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::multiply(magComplex, buffer3DT, complexImPOCS));
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(complexImPOCS, debugFolder_+"POCS_complexImPOCS"); }
+
+            // go back to kspace
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(complexImPOCS, kspaceIter);
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspaceIter, debugFolder_+"POCS_kspaceIter"); }
+
+            // buffer kspace during iteration
+            buffer3DT_partial_fourierkspaceIter = kspaceIter;
+
+            // restore the acquired region
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.copyAlongROE1E2(kspace, kspaceIter, startRO, endRO, startE1, endE1, startE2, endE2));
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspaceIter, debugFolder_+"POCS_kspaceIter_copyOri"); }
+
+            // update complex image
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(kspaceIter, complexImPOCS);
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(complexImPOCS, debugFolder_+"POCS_kspaceIter_copyOri_complexImPOCS"); }
+
+            // compute threshold to stop the iteration
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::subtract(complexImPOCS, complexIm, buffer3DT_partial_fourier));
+            typename realType<T>::Type diff, prev;
+            Gadgetron::norm2(complexIm, prev);
+            Gadgetron::norm2(buffer3DT_partial_fourier, diff);
+
+            typename realType<T>::Type thres = diff/prev;
+
+            if ( !debugFolder_.empty() )
+            {
+                GDEBUG_STREAM("POCS iter : " << ii << " - thres : " << thres << " ... ");
+            }
+
+            if ( thres < workOrder3DT.partialFourier_POCS_thres_ )
+            {
+                break;
+            }
+
+            complexIm = complexImPOCS;
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(buffer3DT_partial_fourierkspaceIter, debugFolder_+"kspaceIter_after_POCS"); }
+
+        if ( workOrder3DT.partialFourier_POCS_transitBand_ == 0 )
+        {
+            kspace = kspaceIter;
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.copyAlongROE1E2TransitionBand(kspace, buffer3DT_partial_fourierkspaceIter, startRO, endRO, startE1, endE1, startE2, endE2, 
+                workOrder3DT.partialFourier_POCS_transitBand_, workOrder3DT.partialFourier_POCS_transitBand_, workOrder3DT.partialFourier_POCS_transitBand_E2_));
+
+            kspace = buffer3DT_partial_fourierkspaceIter;
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace_after_POCS"); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DT<T>::performPartialFourierPOCSRecon(WorkOrderType& workOrder3DT, hoNDArray<T>& kspace) ... ");
+        return false;
+    }
+
+    return true;
+}
+
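+// Note (annotation): Feng Huang partial Fourier reconstruction; build a conjugate-symmetric
+// copy of k-space, estimate small convolution kernels over the symmetrically sampled region
+// (calibFengHuang), and use them to predict the unacquired k-space (performReconFangHuang),
+// optionally blending the result back with a transition band.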
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::performPartialFourierFengHuangRecon(WorkOrderType& workOrder3DT, hoNDArray<T>& kspace)
+{
+    try
+    {
+        GDEBUG_STREAM("--> Into gt Plus 3DT partial fourier FengHuang ... ");
+
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t E2 = kspace.get_size(2);
+        size_t CHA = kspace.get_size(3);
+        size_t N = kspace.get_size(4);
+
+        // check whether partial fourier is used
+        if ( (workOrder3DT.start_RO_<0 || workOrder3DT.end_RO_<0 || (workOrder3DT.end_RO_-workOrder3DT.start_RO_+1==RO) ) 
+            && (workOrder3DT.start_E1_<0 || workOrder3DT.end_E1_<0 || (workOrder3DT.end_E1_-workOrder3DT.start_E1_+1==E1) )
+            && (workOrder3DT.start_E2_<0 || workOrder3DT.end_E2_<0 || (workOrder3DT.end_E2_-workOrder3DT.start_E2_+1==E2) ) )
+        {
+            return true;
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace_before_FengHuang"); }
+
+        size_t startRO(0), endRO(RO-1);
+        if ( workOrder3DT.start_RO_>=0 && workOrder3DT.end_RO_<RO )
+        {
+            startRO = workOrder3DT.start_RO_;
+            endRO = workOrder3DT.end_RO_;
+        }
+
+        size_t startE1(0), endE1(E1-1);
+        if ( workOrder3DT.start_E1_>=0 && workOrder3DT.end_E1_<E1 )
+        {
+            startE1 = workOrder3DT.start_E1_;
+            endE1 = workOrder3DT.end_E1_;
+        }
+
+        size_t startE2(0), endE2(E2-1);
+        if ( workOrder3DT.start_E2_>=0 && workOrder3DT.end_E2_<E2 )
+        {
+            startE2 = workOrder3DT.start_E2_;
+            endE2 = workOrder3DT.end_E2_;
+        }
+
+        // compute the conjugate symmetric kspace
+        hoNDArray<T> buffer3DT(kspace.get_dimensions());
+
+        if ( performTiming_ ) { gt_timer1_.start("conjugateSymmetry3D"); }
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().conjugateSymmetry3D(kspace, buffer3DT));
+        if ( performTiming_ ) { gt_timer1_.stop(); }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(buffer3DT, debugFolder_+"kspaceConj_FengHuang"); }
+
+        // find the symmetric region in the kspace
+        size_t startSymRO, endSymRO;
+        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.findSymmetricSampledRegion(startRO, endRO, RO/2, startSymRO, endSymRO));
+
+        size_t startSymE1, endSymE1;
+        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.findSymmetricSampledRegion(startE1, endE1, E1/2, startSymE1, endSymE1));
+
+        size_t startSymE2, endSymE2;
+        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.findSymmetricSampledRegion(startE2, endE2, E2/2, startSymE2, endSymE2));
+
+        // the reference kspace for kernel estimation
+        hoNDArray<T> src, dst;
+        std::vector<size_t> start(5), size(5);
+
+        start[0] = startSymRO;
+        start[1] = startSymE1;
+        start[2] = startSymE2;
+        start[3] = 0;
+        start[4] = 0;
+
+        size[0] = endSymRO-startSymRO+1;
+        size[1] = endSymE1-startSymE1+1;
+        size[2] = endSymE2-startSymE2+1;
+        size[3] = CHA;
+        size[4] = N;
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::cropUpTo11DArray(buffer3DT, src, start, size));
+        GADGET_CHECK_RETURN_FALSE(cropUpTo11DArray(kspace, dst, start, size));
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(src, debugFolder_+"src_FengHuang"); }
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(dst, debugFolder_+"dst_FengHuang"); }
+
+        if ( workOrder3DT.partialFourier_FengHuang_sameKernel_allN_ )
+        {
+            hoNDArray<T> ave4D;
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace5D(src, ave4D));
+            src = ave4D;
+
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace5D(dst, ave4D));
+            dst = ave4D;
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(src, debugFolder_+"src_ave4D_FengHuang"); }
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(dst, debugFolder_+"dst_ave4D_FengHuang"); }
+        }
+
+        // estimate the kernels
+        ho6DArray<T> kernel; // [RO E1 E2 srcCHA 1 N]
+        if ( performTiming_ ) { gt_timer1_.start("calibFengHuang"); }
+        GADGET_CHECK_RETURN_FALSE(this->calibFengHuang(workOrder3DT, src, dst, kernel));
+        if ( performTiming_ ) { gt_timer1_.stop(); }
+
+        // perform the recon
+        if ( workOrder3DT.partialFourier_FengHuang_transitBand_==0 )
+        {
+            if ( performTiming_ ) { gt_timer1_.start("performReconFangHuang"); }
+            GADGET_CHECK_RETURN_FALSE(this->performReconFangHuang(workOrder3DT, buffer3DT, kspace, (int)startRO, (int)endRO, (int)startE1, (int)endE1, (int)startE2, (int)endE2, kernel));
+            if ( performTiming_ ) { gt_timer1_.stop(); }
+        }
+        else
+        {
+            if ( performTiming_ ) { gt_timer1_.start("performReconFangHuang with transition band"); }
+
+            long long tb =  (long long)workOrder3DT.partialFourier_FengHuang_transitBand_;
+
+            long long sRO(startRO), eRO(endRO), sE1(startE1), eE1(endE1), sE2(startE2), eE2(endE2);
+
+            if ( startRO > 0 )
+            {
+                startRO += tb;
+                if ( startRO > RO ) startRO = 0;
+            }
+
+            if ( endRO < RO-1 )
+            {
+                endRO -= tb;
+                if ( endRO < 0 ) endRO = RO-1;
+            }
+
+            if ( startRO > endRO )
+            {
+                startRO = 0;
+                endRO = RO-1;
+            }
+
+            if ( startE1 > 0 )
+            {
+                startE1 += tb;
+                if ( startE1 > E1 ) startE1 = 0;
+            }
+
+            if ( endE1 < E1-1 )
+            {
+                endE1 -= tb;
+                if ( endE1 < 0 ) endE1 = E1-1;
+            }
+
+            if ( startE1 > endE1 )
+            {
+                startE1 = 0;
+                endE1 = E1-1;
+            }
+
+            if ( startE2 > 0 )
+            {
+                startE2 += tb;
+                if ( startE2 > E2 ) startE2 = 0;
+            }
+
+            if ( endE2 < E2-1 )
+            {
+                endE2 -= tb;
+                if ( endE2 < 0 ) endE2 = E2-1;
+            }
+
+            if ( startE2 > endE2 )
+            {
+                startE2 = 0;
+                endE2 = E2-1;
+            }
+
+            hoNDArray<T> buffer3DT_partial_fourier_kspaceIter(kspace.get_dimensions());
+            GADGET_CHECK_RETURN_FALSE(this->performReconFangHuang(workOrder3DT, buffer3DT, 
+                    buffer3DT_partial_fourier_kspaceIter, (int)startRO, (int)endRO, (int)startE1, (int)endE1, (int)startE2, (int)endE2, kernel));
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(buffer3DT_partial_fourier_kspaceIter, debugFolder_+"kspace_FengHuang_recon"); }
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace_FengHuang_original"); }
+
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.copyAlongROE1E2TransitionBand(kspace, buffer3DT_partial_fourier_kspaceIter, 
+                    sRO, eRO, sE1, eE1, sE2, eE2, workOrder3DT.partialFourier_FengHuang_transitBand_, 
+                    workOrder3DT.partialFourier_FengHuang_transitBand_, workOrder3DT.partialFourier_FengHuang_transitBand_E2_));
+
+            kspace = buffer3DT_partial_fourier_kspaceIter;
+
+            if ( performTiming_ ) { gt_timer1_.stop(); }
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace_after_FengHuang"); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DT<T>::performPartialFourierFengHuangRecon(WorkOrderType& workOrder3DT, hoNDArray<T>& kspace) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::calibFengHuang(WorkOrderType& workOrder3DT, const hoNDArray<T>& src, const hoNDArray<T>& dst, ho6DArray<T>& kernel)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(src.dimensions_equal(&dst));
+
+        long long RO = (long long)src.get_size(0);
+        long long E1 = (long long)src.get_size(1);
+        long long E2 = (long long)src.get_size(2);
+        long long srcCHA = (long long)src.get_size(3);
+        long long N = (long long)src.get_size(4);
+
+        long long kx = (long long)workOrder3DT.partialFourier_FengHuang_kSize_RO_;
+        long long ky = (long long)workOrder3DT.partialFourier_FengHuang_kSize_E1_;
+        long long kz = (long long)workOrder3DT.partialFourier_FengHuang_kSize_E2_;
+
+        if ( kx%2 == 0 ) kx++;
+        if ( ky%2 == 0 ) ky++;
+        if ( kz%2 == 0 ) kz++;
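+        // force odd kernel sizes so that every target sample has a symmetric neighbourhood around it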
+
+        long long halfKx = (long long)kx/2;
+        long long halfKy = (long long)ky/2;
+        long long halfKz = (long long)kz/2;
+
+        // the cross-channel kernel is not estimated
+        kernel.createArray(kx, ky, kz, srcCHA, 1, N);
+
+        long long ii=0;
+        long long num = N*srcCHA;
+
+        long long startRO = halfKx;
+        long long endRO = RO - halfKx - 1;
+
+        long long startE1 = halfKy;
+        long long endE1 = E1 - halfKy - 1;
+
+        long long startE2 = halfKz;
+        long long endE2 = E2 - halfKz - 1;
+
+        long long rowA, colA, rowB, colB;
+        rowA = (endE2-startE2+1)*(endE1-startE1+1)*(endRO-startRO+1); 
+        colA = kx*ky*kz;
+
+        rowB = rowA;
+        colB = 1;
+
+        double thresReg = workOrder3DT.partialFourier_FengHuang_thresReg_;
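+        // for every (channel, frame) pair, a data matrix A of kx*ky*kz source neighbourhoods and a
+        // vector B of the corresponding target samples are assembled; the kernel K is then the
+        // Tikhonov-regularized least-squares solution of A*K = B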
+
+        #ifdef USE_OMP
+            omp_set_nested(1);
+        #endif // USE_OMP
+
+        #pragma omp parallel default(none) private(ii) shared(num, RO, E1, E2, srcCHA, N, kx, ky, kz, src, dst, kernel, rowA, colA, rowB, colB, startRO, endRO, startE1, endE1, startE2, endE2, halfKx, halfKy, halfKz, thresReg) if ( num > 1 ) num_threads( (int)(num<16 ? num : 16) )
+        {
+            hoNDArray<T> A_mem(rowA, colA);
+            hoNDArray<T> B_mem(rowB, colB);
+            hoNDArray<T> K_mem(colA, colB);
+
+            hoMatrix<T> A(rowA, colA, A_mem.begin());
+            hoMatrix<T> B(rowB, colB, B_mem.begin());
+            hoMatrix<T> K(colA, colB, K_mem.begin());
+
+            #pragma omp for
+            for ( ii=0; ii<num; ii ++ )
+            {
+                ho3DArray<T> src3D(RO, E1, E2, const_cast<T*>(src.begin())+ii*RO*E1*E2);
+                ho3DArray<T> dst3D(RO, E1, E2, const_cast<T*>(dst.begin())+ii*RO*E1*E2);
+
+                long long ro, e1, e2, row(0);
+                long long x, y, z;
+
+                for ( e2=startE2; e2<=endE2; e2++ )
+                {
+                    for ( e1=startE1; e1<=endE1; e1++ )
+                    {
+                        for ( ro=startRO; ro<=endRO; ro++ )
+                        {
+
+                            size_t colInd(0);
+                            for ( z=-halfKz; z<=halfKz; z++ )
+                            {
+                                for ( y=-halfKy; y<=halfKy; y++ )
+                                {
+                                    for ( x=-halfKx; x<=halfKx; x++ )
+                                    {
+                                        A(row, colInd++) = src3D(ro+x, e1+y, e2+z);
+                                    }
+                                }
+                            }
+
+                            B(row, 0) = dst3D(ro, e1, e2);
+
+                            row++;
+                        }
+                    }
+                }
+
+                Gadgetron::SolveLinearSystem_Tikhonov(A, B, K, thresReg);
+
+                memcpy(kernel.begin()+ii*kx*ky*kz, K.begin(), sizeof(T)*kx*ky*kz);
+            }
+        }
+
+        #ifdef USE_OMP
+            omp_set_nested(0);
+        #endif // USE_OMP
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DT<T>::calibFengHuang(WorkOrderType& workOrder3DT, const hoNDArray<T>& src, const hoNDArray<T>& dst, ho6DArray<T>& kernel) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::performReconFangHuang(WorkOrderType& workOrder3DT, 
+                                                const hoNDArray<T>& kspaceConj, hoNDArray<T>& kspace, 
+                                                int startRO, int endRO, int startE1, int endE1, 
+                                                int startE2, int endE2, ho6DArray<T>& kernel)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(kspaceConj.dimensions_equal(&kspace));
+
+        long long RO = (long long)kspace.get_size(0);
+        long long E1 = (long long)kspace.get_size(1);
+        long long E2 = (long long)kspace.get_size(2);
+        long long CHA = (long long)kspace.get_size(3);
+        long long N = (long long)kspace.get_size(4);
+
+        long long kx = (long long)kernel.get_size(0);
+        long long ky = (long long)kernel.get_size(1);
+        long long kz = (long long)kernel.get_size(2);
+
+        long long halfKx = kx/2;
+        long long halfKy = ky/2;
+        long long halfKz = kz/2;
+
+        long long kerN = kernel.get_size(5);
+        GADGET_CHECK_RETURN_FALSE( (kerN==1) || (kerN==N) );
+
+        long long num = CHA*N;
+
+        long long rowD = RO*E1*E2 - ( (endE2-startE2+1) * (endE1-startE1+1) * (endRO-startRO+1) );
+        long long colD = kx*ky*kz;
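+        // rowD counts the k-space points outside the sampled region, i.e. the points to be estimated;
+        // colD is the kernel footprint size kx*ky*kz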
+
+        ho2DArray<long long> coeffX(colD, rowD);
+        long long* pCx = coeffX.begin();
+
+        ho2DArray<long long> coeffY(colD, rowD);
+        long long* pCy = coeffY.begin();
+
+        ho2DArray<long long> coeffZ(colD, rowD);
+        long long* pCz = coeffZ.begin();
+
+        long long ro, e1, e2;
+        long long row(0);
+        long long x, y, z;
+
+        ho2DArray<long long> rowInd(3, rowD);
+        long long* pRowInd = rowInd.begin();
+
+        hoNDArray<long long> offsetX(colD);
+        long long* pOffsetX = offsetX.begin();
+
+        hoNDArray<long long> offsetY(colD);
+        long long* pOffsetY = offsetY.begin();
+
+        hoNDArray<long long> offsetZ(colD);
+        long long* pOffsetZ = offsetZ.begin();
+
+        long long colInd(0);
+        for ( z=-halfKz; z<=halfKz; z++ )
+        {
+            for ( y=-halfKy; y<=halfKy; y++ )
+            {
+                for ( x=-halfKx; x<=halfKx; x++ )
+                {
+                    offsetX(colInd) = x;
+                    offsetY(colInd) = y;
+                    offsetZ(colInd) = z;
+                    colInd++;
+                }
+            }
+        }
+
+        if ( performTiming_ ) { gt_timer3_.start("performReconFangHuang - compute coeff array"); }
+
+        if ( performTiming_ ) { gt_timer2_.start("performReconFangHuang - compute coeff array - internal"); }
+
+        long long* pRowIndCurr;
+        for ( e2=0; e2<E2; e2++ )
+        {
+            for ( e1=0; e1<E1; e1++ )
+            {
+                for ( ro=0; ro<RO; ro++ )
+                {
+                    if ( (ro>=startRO) && (ro<=endRO) && (e1>=startE1) && (e1<=endE1) && (e2>=startE2) && (e2<=endE2) )
+                    {
+                        continue;
+                    }
+
+                    pRowIndCurr = pRowInd + row*3;
+
+                    pRowIndCurr[0] = ro;
+                    pRowIndCurr[1] = e1;
+                    pRowIndCurr[2] = e2;
+
+                    row++;
+                }
+            }
+        }
+
+        long long r;
+        // pRowIndCurr is thread-private here to avoid a data race between threads
+        #pragma omp parallel for default(none) private(r, pRowIndCurr) shared(rowD, colD, pCx, pCy, pCz, pRowInd, pOffsetX, pOffsetY, pOffsetZ)
+        for ( r=0; r<rowD; r++ )
+        {
+            long long offsetC = r*colD;
+            pRowIndCurr = pRowInd + r*3;
+
+            for ( int colInd=0; colInd<colD; colInd++ )
+            {
+                pCx[offsetC+colInd] = pRowIndCurr[0]+pOffsetX[colInd];
+                pCy[offsetC+colInd] = pRowIndCurr[1]+pOffsetY[colInd];
+                pCz[offsetC+colInd] = pRowIndCurr[2]+pOffsetZ[colInd];
+            }
+        }
+
+        if ( performTiming_ ) { gt_timer2_.stop(); }
+
+        #pragma omp parallel for default(none) private(r) shared(rowD, colD, pCx, pCy, pCz, RO, E1, E2)
+        for ( r=0; r<rowD; r++ )
+        {
+            for ( int c=0; c<colD; c++ )
+            {
+                long long offset = c + r*colD;
+
+                //pCx[offset] += pOffsetX[c];
+
+                if ( pCx[offset] < 0 )
+                {
+                    pCx[offset] += RO;
+                }
+                else if ( pCx[offset] > RO-1 )
+                {
+                    pCx[offset] -= RO;
+                }
+
+                //pCy[offset] += pOffsetY[c];
+
+                if ( pCy[offset] < 0 )
+                {
+                    pCy[offset] += E1;
+                }
+                else if ( pCy[offset] > E1-1 )
+                {
+                    pCy[offset] -= E1;
+                }
+
+                //pCz[offset] += pOffsetZ[c];
+
+                if ( pCz[offset] < 0 )
+                {
+                    pCz[offset] += E2;
+                }
+                else if ( pCz[offset] > E2-1 )
+                {
+                    pCz[offset] -= E2;
+                }
+            }
+        }
+
+        /*row = 0;
+        for ( e2=0; e2<E2; e2++ )
+        {
+            for ( e1=0; e1<E1; e1++ )
+            {
+                for ( ro=0; ro<RO; ro++ )
+                {
+                    if ( (ro>=startRO) && (ro<=endRO) && (e1>=startE1) && (e1<=endE1) && (e2>=startE2) && (e2<=endE2) )
+                    {
+                        continue;
+                    }
+
+                    size_t colInd(0);
+
+                    for ( z=-halfKz; z<=halfKz; z++ )
+                    {
+                        dz = e2 + z;
+                        if ( dz < 0 ) dz += E2;
+                        if ( dz > E2-1 ) dz -= E2;
+
+                        for ( y=-halfKy; y<=halfKy; y++ )
+                        {
+                            dy = e1 + y;
+                            if ( dy < 0 ) dy += E1;
+                            if ( dy > E1-1 ) dy -= E1;
+
+                            for ( x=-halfKx; x<=halfKx; x++ )
+                            {
+                                dx = ro + x;
+                                if ( dx < 0 ) dx += RO;
+                                if ( dx > RO-1 ) dx -= RO;
+
+                                coeffX(row, colInd) = dx;
+                                coeffY(row, colInd) = dy;
+                                coeffZ(row, colInd) = dz;
+                                colInd++;
+                            }
+                        }
+                    }
+
+                    row++;
+                }
+            }
+        }*/
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        long long ii;
+        int numOfThreads = (int)((num>4) ? 4 : num);
+        #pragma omp parallel default(none) private(ii) shared(num, RO, E1, E2, CHA, N, kerN, kspaceConj, kspace, kernel, rowD, colD, coeffX, coeffY, coeffZ, pCx, pCy, pCz) if ( num > 1 ) num_threads( numOfThreads )
+        {
+            hoNDArray<T> D_mem(rowD, colD);
+
+            hoMatrix<T> D(rowD, colD, D_mem.begin());
+            T* pD = D.begin();
+
+            hoMatrix<T> K(colD, 1);
+            hoMatrix<T> R(rowD, 1);
+
+            Gadgetron::clear(D);
+            Gadgetron::clear(K);
+            Gadgetron::clear(R);
+
+            #pragma omp for
+            for ( ii=0; ii<num; ii ++ )
+            {
+                ho3DArray<T> src3D(RO, E1, E2, const_cast<T*>(kspaceConj.begin())+ii*RO*E1*E2);
+                ho3DArray<T> dst3D(RO, E1, E2, kspace.begin()+ii*RO*E1*E2);
+
+                long long row;
+
+                if ( performTiming_ ) { gt_timer2_.start("fill data matrix ... "); }
+                #pragma omp parallel for private(row) shared(colD, rowD, D, src3D, pD)
+                for ( row=0; row<rowD; row++ )
+                {
+                    for ( long long col=0; col<colD; col++ )
+                    {
+                        long long offset = col + row*colD;
+                        pD[offset] = src3D(pCx[offset], pCy[offset], pCz[offset]);
+                    }
+                }
+                if ( performTiming_ ) { gt_timer2_.stop(); }
+
+                if ( kerN == 1 )
+                {
+                    long long ind = ii;
+                    long long currS = ind/(CHA*N);
+                    ind %= CHA*N;
+                    long long currN = ind/CHA;
+                    ind %= CHA;
+                    memcpy(K.begin(), kernel.begin()+(ind+currS*CHA)*colD, sizeof(T)*colD);
+                }
+                else
+                {
+                    memcpy(K.begin(), kernel.begin()+ii*colD, sizeof(T)*colD);
+                }
+
+                // R = D*K
+                if ( performTiming_ ) { gt_timer2_.start("matrix multiplication ... "); }
+                Gadgetron::gemm(R, D, false, K, false);
+                if ( performTiming_ ) { gt_timer2_.stop(); }
+
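+                // each row of R holds the synthesized sample for the centre of its neighbourhood;
+                // write it back into the destination k-space at that centre location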
+                size_t colCenter = colD/2;
+
+                if ( performTiming_ ) { gt_timer2_.start("fill the result array ... "); }
+                #pragma omp parallel for private(row) default(none) shared(rowD, dst3D, colCenter, coeffX, coeffY, coeffZ, R)
+                for ( row=0; row<rowD; row++ )
+                {
+                    dst3D( coeffX(colCenter, row), coeffY(colCenter, row), coeffZ(colCenter, row) ) = R(row, 0);
+                }
+                if ( performTiming_ ) { gt_timer2_.stop(); }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DT<T>::performReconFangHuang(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::
+estimateJobSize(gtPlusReconWorkOrder<T>* workOrder3DT, size_t maxNumOfBytesPerJob, size_t overlapBetweenJobs, size_t numOfNodes, size_t& jobSize)
+{
+    try
+    {
+        size_t nodeN = numOfNodes;
+        GADGET_CHECK_RETURN_FALSE(this->computeEffectiveNodeNumberBasedOnComputingPowerIndex(workOrder3DT, nodeN));
+        if ( workOrder3DT->job_perform_on_control_node_ ) nodeN++;
+
+        GDEBUG_STREAM("GtPlus Cloud 3DT - job_perform_on_control_node is " << workOrder3DT->job_perform_on_control_node_  << " - nodeN is " << nodeN << " - overlapBetweenJobs is " << overlapBetweenJobs << " ... ");
+
+        // adjust jobN according to cloud size
+        size_t RO = workOrder3DT->data_.get_size(0);
+        size_t E1 = workOrder3DT->data_.get_size(1);
+        size_t E2 = workOrder3DT->data_.get_size(2);
+        size_t N = workOrder3DT->data_.get_size(4);
+
+        size_t srcCHA = workOrder3DT->kernelIm_->get_size(3);
+        size_t dstCHA = workOrder3DT->kernelIm_->get_size(4);
+
+        size_t totalJobNum = RO;
+        jobSize = (size_t)std::ceil( (double)(totalJobNum+overlapBetweenJobs*(nodeN-1))/(double)nodeN );
+
+        size_t numOfBytesPerJob = sizeof(T)*( E1*E2*srcCHA*dstCHA*jobSize + 2*E1*E2*srcCHA*jobSize );
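+        // rough per-job memory estimate: one E1*E2*srcCHA*dstCHA kernel chunk plus two E1*E2*srcCHA
+        // image-sized buffers per job slice (presumably the aliased and unwrapped images)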
+
+        // reserve a 64MB margin per job
+        while ( numOfBytesPerJob > maxNumOfBytesPerJob-64.0*1024*1024 )
+        {
+            nodeN *= 2;
+            jobSize = (size_t)std::ceil( (double)(totalJobNum+overlapBetweenJobs*(nodeN-1))/(double)nodeN );
+            numOfBytesPerJob = sizeof(T)*( E1*E2*srcCHA*dstCHA*jobSize + 2*E1*E2*srcCHA*jobSize );
+        }
+
+        GDEBUG_STREAM("GtPlus Cloud 3DT - jobSize is " << jobSize << "; every job has " << numOfBytesPerJob/1024.0/1024 << " MBytes ... ");
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DT<T>::estimateJobSize(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTGRAPPA.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTGRAPPA.h
new file mode 100644
index 0000000..b1af968
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTGRAPPA.h
@@ -0,0 +1,642 @@
+/** \file   gtPlusISMRMRDReconWorker3DTGRAPPA.h
+    \brief  Implement the 3DT GRAPPA reconstruction
+    \author Hui Xue
+*/
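+
+// GRAPPA estimates un-acquired k-space samples as linear combinations of neighbouring acquired
+// samples; the combination weights (kernels) are calibrated from the fully sampled reference (ACS)
+// data and are applied here in the image domain, optionally splitting the volume into jobs along RO.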
+
+#pragma once
+
+#include "ismrmrd/ismrmrd.h"
+#include "GadgetronTimer.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorker3DT.h"
+#include "gtPlusGRAPPA.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusReconWorker3DTGRAPPA : public gtPlusReconWorker3DT<T>
+{
+public:
+
+    typedef gtPlusReconWorker3DT<T> BaseClass;
+    typedef typename BaseClass::value_type value_type;
+
+    typedef gtPlusReconWorkOrder3DT<T> WorkOrderType;
+
+    gtPlusReconWorker3DTGRAPPA() : BaseClass() {}
+    virtual ~gtPlusReconWorker3DTGRAPPA() {}
+
+    virtual bool performRecon(gtPlusReconWorkOrder3DT<T>* workOrder3DT);
+
+    virtual bool performCalibPrep(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, WorkOrderType* workOrder3DT);
+    virtual bool performCalibImpl(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, WorkOrderType* workOrder3DT, size_t usedN);
+
+    virtual bool performUnwrapping(gtPlusReconWorkOrder3DT<T>* workOrder3DT, const hoNDArray<T>& data);
+
+    virtual bool computeKSpace(gtPlusReconWorkOrder3DT<T>* workOrder3DT);
+
+    virtual bool splitJob(gtPlusReconWorkOrder3DT<T>* workOrder3DT, size_t& jobN);
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::verbose_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_cplx_;
+
+    using BaseClass::ref_src_;
+    using BaseClass::ref_dst_;
+    using BaseClass::data_dst_;
+    using BaseClass::ref_coil_map_dst_;
+    using BaseClass::startE1_;
+    using BaseClass::endE1_;
+
+    gtPlusGRAPPA<T> grappa_;
+};
+
+template <typename T> 
+bool gtPlusReconWorker3DTGRAPPA<T>::computeKSpace(gtPlusReconWorkOrder3DT<T>* workOrder3DT)
+{
+    bool recon_kspace = false;
+
+    if ( workOrder3DT->CalibMode_ == ISMRMRD_embedded )
+    {
+        if ( workOrder3DT->embedded_fullres_coilmap_ || workOrder3DT->embedded_ref_fillback_ )
+        {
+            recon_kspace = true;
+        }
+    }
+
+    if ( workOrder3DT->CalibMode_ == ISMRMRD_separate )
+    {
+        if ( workOrder3DT->separate_fullres_coilmap_ )
+        {
+            recon_kspace = true;
+        }
+    }
+
+    if ( workOrder3DT->recon_kspace_needed_ )
+    {
+        recon_kspace = true;
+    }
+
+    return recon_kspace;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DTGRAPPA<T>::
+splitJob(gtPlusReconWorkOrder3DT<T>* workOrder3DT, size_t& jobN)
+{
+    size_t RO = workOrder3DT->data_.get_size(0);
+    size_t E1 = workOrder3DT->data_.get_size(1);
+    size_t E2 = workOrder3DT->data_.get_size(2);
+
+    size_t srcCHA = workOrder3DT->kernel_->get_size(3);
+    size_t dstCHA = workOrder3DT->kernel_->get_size(4);
+
+    jobN = workOrder3DT->job_num_of_N_;
+    size_t jobMegaBytes = workOrder3DT->job_max_Megabytes_;
+
+    bool splitJobs = (jobN>0 && RO>jobN);
+    if ( !splitJobs )
+    {
+        if ( jobMegaBytes>0 )
+        {
+            // derive the job size along RO from the per-job memory limit
+            jobN = jobMegaBytes/(E1*E2*srcCHA*dstCHA*sizeof(T)/1024/1024);
+            if ( jobN < RO ) splitJobs = true;
+            GDEBUG_STREAM("grappa - 3DT - size of largest job : " << jobN);
+        }
+    }
+
+    bool reconKSpace = this->computeKSpace(workOrder3DT);
+    if ( !reconKSpace )
+    {
+        splitJobs = false;
+    }
+
+    return splitJobs;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DTGRAPPA<T>::
+performCalibPrep(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, WorkOrderType* workOrder3DT)
+{
+    grappa_.performTiming_ = performTiming_;
+
+    size_t RO = workOrder3DT->data_.get_size(0);
+    size_t E1 = workOrder3DT->data_.get_size(1);
+    size_t E2 = workOrder3DT->data_.get_size(2);
+    size_t N = workOrder3DT->data_.get_size(4);
+    size_t srcCHA = workOrder3DT->data_.get_size(3);
+
+    size_t refRO = ref_dst.get_size(0);
+    size_t refE1 = ref_dst.get_size(1);
+    size_t refE2 = ref_dst.get_size(2);
+    size_t refN = ref_dst.get_size(4);
+    size_t dstCHA = ref_dst.get_size(3);
+
+    bool reconKSpace = this->computeKSpace(workOrder3DT);
+
+    std::vector<int> kE1, oE1;
+    bool fitItself = true;
+    GADGET_CHECK_RETURN_FALSE(grappa_.kerPattern(kE1, oE1, (int)workOrder3DT->acceFactorE1_, workOrder3DT->grappa_kSize_E1_, fitItself));
+
+    std::vector<int> kE2, oE2;
+    GADGET_CHECK_RETURN_FALSE(grappa_.kerPattern(kE2, oE2, (int)workOrder3DT->acceFactorE2_, workOrder3DT->grappa_kSize_E2_, fitItself));
+
+    size_t kRO = workOrder3DT->grappa_kSize_RO_;
+    size_t kNE1 = workOrder3DT->grappa_kSize_E1_;
+    size_t oNE1 = oE1.size();
+
+    size_t kNE2 = workOrder3DT->grappa_kSize_E2_;
+    size_t oNE2 = oE2.size();
+
+    workOrder3DT->kernel_->create(kRO, kNE1, kNE2, srcCHA, dstCHA, oNE1, oNE2, refN);
+    Gadgetron::clear(workOrder3DT->kernel_.get());
+
+    size_t jobN;
+    bool splitJobs = this->splitJob(workOrder3DT, jobN);
+
+    if ( !splitJobs )
+    {
+        if ( performTiming_ ) { gt_timer3_.start("allocate image domain kernel ... "); }
+        workOrder3DT->kernelIm_->create(RO, E1, E2, srcCHA, dstCHA, refN);
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+    }
+    else
+    {
+        int maxKE1 = std::abs(kE1[0]);
+        if ( std::abs(kE1[kNE1-1]) > maxKE1 )
+        {
+            maxKE1 = std::abs(kE1[kNE1-1]);
+        }
+        int convKE1 = 2*maxKE1+1;
+
+        int maxKE2 = std::abs(kE2[0]);
+        if ( std::abs(kE2[kNE2-1]) > maxKE2 )
+        {
+            maxKE2 = std::abs(kE2[kNE2-1]);
+        }
+        int convKE2 = 2*maxKE2+1;
+
+        if ( performTiming_ ) { gt_timer3_.start("allocate image domain kernel only along RO ... "); }
+        workOrder3DT->kernelIm_->create(convKE1, convKE2, RO, srcCHA, dstCHA, refN);
+        // pre-setting to zero is needed here
+        memset(workOrder3DT->kernelIm_->begin(), 0, workOrder3DT->kernelIm_->get_number_of_bytes());
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+    }
+
+    if ( !reconKSpace )
+    {
+        if ( performTiming_ ) { gt_timer3_.start("allocate unmixing coefficient ... "); }
+        workOrder3DT->unmixingCoeffIm_->create(RO, E1, E2, srcCHA, refN);
+        Gadgetron::clear(workOrder3DT->unmixingCoeffIm_.get());
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        workOrder3DT->gfactor_.create(RO, E1, E2, refN);
+        Gadgetron::clear(&(workOrder3DT->gfactor_));
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DTGRAPPA<T>::
+performCalibImpl(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, WorkOrderType* workOrder3DT, size_t usedN)
+{
+    size_t RO = workOrder3DT->data_.get_size(0);
+    size_t E1 = workOrder3DT->data_.get_size(1);
+    size_t E2 = workOrder3DT->data_.get_size(2);
+    size_t N = workOrder3DT->data_.get_size(4);
+    size_t srcCHA = workOrder3DT->data_.get_size(3);
+
+    size_t refRO = ref_dst.get_size(0);
+    size_t refE1 = ref_dst.get_size(1);
+    size_t refE2 = ref_dst.get_size(2);
+    size_t refN = ref_dst.get_size(4);
+    size_t dstCHA = ref_dst.get_size(3);
+
+    bool reconKSpace = this->computeKSpace(workOrder3DT);
+
+    std::vector<int> kE1, oE1;
+    bool fitItself = true;
+    GADGET_CHECK_RETURN_FALSE(grappa_.kerPattern(kE1, oE1, (size_t)workOrder3DT->acceFactorE1_, workOrder3DT->grappa_kSize_E1_, fitItself));
+
+    std::vector<int> kE2, oE2;
+    GADGET_CHECK_RETURN_FALSE(grappa_.kerPattern(kE2, oE2, (size_t)workOrder3DT->acceFactorE2_, workOrder3DT->grappa_kSize_E2_, fitItself));
+
+    size_t kRO = workOrder3DT->grappa_kSize_RO_;
+    size_t kNE1 = workOrder3DT->grappa_kSize_E1_;
+    size_t oNE1 = oE1.size();
+
+    size_t kNE2 = workOrder3DT->grappa_kSize_E2_;
+    size_t oNE2 = oE2.size();
+
+    ho4DArray<T> acsSrc(refRO, refE1, refE2, srcCHA, const_cast<T*>(ref_src.begin()+usedN*refRO*refE1*refE2*srcCHA));
+    ho4DArray<T> acsDst(refRO, refE1, refE2, dstCHA, const_cast<T*>(ref_dst.begin()+usedN*refRO*refE1*refE2*dstCHA));
+
+    std::ostringstream ostr;
+    ostr << "_n_" << usedN;
+    std::string suffix = ostr.str();
+
+    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(acsSrc, debugFolder_+"acsSrc"+suffix); }
+    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(acsDst, debugFolder_+"acsDst"+suffix); }
+
+    grappa_.calib_use_gpu_  = workOrder3DT->grappa_use_gpu_;
+
+    ho7DArray<T> ker(kRO, kNE1, kNE2, srcCHA, dstCHA, oNE1, oNE2, workOrder3DT->kernel_->begin()+usedN*kRO*kNE1*kNE2*srcCHA*dstCHA*oNE1*oNE2);
+    if ( performTiming_ ) { gt_timer3_.start("grappa 3D calibration ... "); }
+    grappa_.calib3D(acsSrc, acsDst, workOrder3DT->grappa_reg_lamda_, workOrder3DT->grappa_calib_over_determine_ratio_, kRO, kE1, kE2, oE1, oE2, ker);
+    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(ker, debugFolder_+"ker"+suffix); }
+
+    size_t jobN;
+    bool splitJobs = this->splitJob(workOrder3DT, jobN);
+
+    if ( !splitJobs )
+    {
+        hoNDArray<T> kIm(RO, E1, E2, srcCHA, dstCHA, workOrder3DT->kernelIm_->begin()+usedN*RO*E1*E2*srcCHA*dstCHA);
+        if ( performTiming_ ) { gt_timer3_.start("grappa 3D image domain kernel ... "); }
+        grappa_.imageDomainKernel3D(ker, kRO, kE1, kE2, oE1, oE2, RO, E1, E2, kIm);
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        if ( !reconKSpace )
+        {
+            hoNDArray<T> coilMap(RO, E1, E2, dstCHA, workOrder3DT->coilMap_->begin()+usedN*RO*E1*E2*dstCHA);
+            hoNDArray<T> unmixC(RO, E1, E2, srcCHA);
+            hoNDArray<T> gFactor(RO, E1, E2, workOrder3DT->gfactor_.begin()+usedN*RO*E1*E2);
+
+            this->unmixCoeff(kIm, coilMap, unmixC, gFactor);
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::scal( (value_type)(1.0/workOrder3DT->acceFactorE1_/workOrder3DT->acceFactorE2_), gFactor));
+
+            memcpy(workOrder3DT->unmixingCoeffIm_->begin()+usedN*RO*E1*E2*srcCHA, unmixC.begin(), unmixC.get_number_of_bytes());
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(unmixC, debugFolder_+"unmixC"+suffix); }
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(gFactor, debugFolder_+"gFactor"+suffix); }
+        }
+    }
+    else
+    {
+        int maxKE1 = std::abs(kE1[0]);
+        if ( std::abs(kE1[kNE1-1]) > maxKE1 )
+        {
+            maxKE1 = std::abs(kE1[kNE1-1]);
+        }
+        int convKE1 = 2*maxKE1+1;
+
+        int maxKE2 = std::abs(kE2[0]);
+        if ( std::abs(kE2[kNE2-1]) > maxKE2 )
+        {
+            maxKE2 = std::abs(kE2[kNE2-1]);
+        }
+        int convKE2 = 2*maxKE2+1;
+
+        hoNDArray<T> kIm(convKE1, convKE2, RO, srcCHA, dstCHA, workOrder3DT->kernelIm_->begin()+usedN*convKE1*convKE2*RO*srcCHA*dstCHA);
+
+        if ( performTiming_ ) { gt_timer3_.start("grappa 3D image domain kernel only along RO ... "); }
+        GADGET_CHECK_RETURN_FALSE(grappa_.imageDomainKernelRO3D(ker, kRO, kE1, kE2, oE1, oE2, RO, kIm));
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        if ( !debugFolder_.empty() )
+        {
+            hoNDArray<T> kImROACha(convKE1, convKE2, RO, srcCHA, kIm.begin());
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kImROACha, debugFolder_+"kImROACha"+suffix); }
+        }
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DTGRAPPA<T>::
+performUnwrapping(gtPlusReconWorkOrder3DT<T>* workOrder3DT, const hoNDArray<T>& data_dst)
+{
+    try
+    {
+        int n;
+
+        size_t RO = workOrder3DT->data_.get_size(0);
+        size_t E1 = workOrder3DT->data_.get_size(1);
+        size_t E2 = workOrder3DT->data_.get_size(2);
+        size_t N = workOrder3DT->data_.get_size(4);
+
+        size_t srcCHA = workOrder3DT->kernelIm_->get_size(3);
+        size_t dstCHA = workOrder3DT->kernelIm_->get_size(4);
+
+        size_t refN = workOrder3DT->kernelIm_->get_size(5);
+
+        workOrder3DT->complexIm_.create(RO, E1, E2, 1, N);
+
+        hoNDArray<T> aliasedIm;
+
+        if ( performTiming_ ) { gt_timer3_.start("grappa 3D compute aliased image ... "); }
+        if ( workOrder3DT->downstream_coil_compression_ )
+        {
+            aliasedIm.create(workOrder3DT->data_.get_dimensions());
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(workOrder3DT->data_, aliasedIm);
+        }
+        else
+        {
+            aliasedIm.create(data_dst.get_dimensions());
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(data_dst, aliasedIm);
+        }
+
+        typename realType<T>::Type fftCompensationRatio = (typename realType<T>::Type)(1.0/std::sqrt( (double)workOrder3DT->acceFactorE1_ * (double)workOrder3DT->acceFactorE2_ ));
+        Gadgetron::scal( fftCompensationRatio, aliasedIm);
+
+        // if the image data is scaled and the reference lines will be filled back into the data,
+        // the reference lines must be scaled too
+        if ( workOrder3DT->CalibMode_ == ISMRMRD_embedded )
+        {
+            if ( workOrder3DT->embedded_ref_fillback_ )
+            {
+                Gadgetron::scal( fftCompensationRatio, workOrder3DT->ref_);
+            }
+        }
+
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(aliasedIm, debugFolder_+"aliasedIm"); }
+
+        bool recon_kspace = this->computeKSpace(workOrder3DT);
+
+        // if kspace is actually needed
+        if ( recon_kspace )
+        {
+            workOrder3DT->fullkspace_ = data_dst;
+
+            size_t jobN;
+            bool splitJobs = this->splitJob(workOrder3DT, jobN);
+
+            if ( splitJobs )
+            {
+                size_t kE1 = workOrder3DT->kernelIm_->get_size(0);
+                size_t kE2 = workOrder3DT->kernelIm_->get_size(1);
+                size_t kRO = workOrder3DT->kernelIm_->get_size(2);
+
+                if ( (refN<N) || (refN==1) )
+                {
+                    hoNDArray<T> kImPermuted(kE1, kE2, RO, srcCHA, dstCHA, workOrder3DT->kernelIm_->begin());
+
+                    hoNDArray<T> kImPermutedJob(kE1, kE2, jobN, srcCHA, dstCHA);
+
+                    if ( performTiming_ ) { gt_timer3_.start("grappa 3D allocate buffer for kImPermutedZeroFilledJob ... "); }
+                    hoNDArray<T> kImPermutedZeroFilledJob(E1, E2, jobN, srcCHA, dstCHA);
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    // aliased images
+                    hoNDArray<T> aliasedImPermutedJob(E1, E2, jobN, srcCHA);
+
+                    if ( performTiming_ ) { gt_timer3_.start("grappa 3D allocate buffer for aliasedIm permuted ... "); }
+                    hoNDArray<T> aliasedImPermuted(E1, E2, RO, srcCHA, N);
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    if ( performTiming_ ) { gt_timer3_.start("permuteROTo3rdDimensionFor3DRecon for aliased images ... "); }
+
+                    std::vector<size_t> dim_order(3);
+                    dim_order[0] = 1;
+                    dim_order[1] = 2;
+                    dim_order[2] = 0;
+
+                    GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(&aliasedIm, &aliasedImPermuted, &dim_order));
+
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    // unwrapped images
+                    hoNDArray<T> unwrappedImPermutedJob(E1, E2, jobN, srcCHA, N);
+
+                    if ( performTiming_ ) { gt_timer3_.start("grappa 3D allocate buffer for unwrapped images permuted ... "); }
+                    hoNDArray<T> unwrappedImPermuted(E1, E2, RO, dstCHA, N);
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    // buffer
+                    if ( performTiming_ ) { gt_timer3_.start("grappa 3D allocate buffer for unwrapping ... "); }
+                    hoNDArray<T> buffer3DT_unwrapping(E1, E2, jobN, srcCHA);
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    size_t ro=0;
+                    while ( ro<RO )
+                    {
+                        size_t start = ro;
+                        size_t end = ro+jobN-1;
+                        if ( end >= RO )
+                        {
+                            end = RO-1;
+                            start = end-jobN+1;
+                        }
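+                        // the final chunk is shifted back so that it remains jobN readouts wide;
+                        // it may therefore overlap the previously processed chunk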
+
+                        GDEBUG_STREAM("grappa 3D - processing " << start << " to " << end << " ... ");
+
+                        if ( (refN<N) || (refN==1) )
+                        {
+                            hoNDArray<T> kImPermuted(kE1, kE2, RO, srcCHA, dstCHA, workOrder3DT->kernelIm_->begin());
+
+                            if ( performTiming_ ) { gt_timer3_.start("cropOver3rdDimension hybrid domain kernel ... "); }
+                            GADGET_CHECK_RETURN_FALSE(cropOver3rdDimension(kImPermuted, kImPermutedJob, start, end));
+                            if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                            if ( performTiming_ ) { gt_timer3_.start("imageDomainKernelE1E2RO ... "); }
+                            GADGET_CHECK_RETURN_FALSE(grappa_.imageDomainKernelE1E2RO(kImPermutedJob, (int)E1, (int)E2, kImPermutedZeroFilledJob));
+                            if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                            if ( performTiming_ ) { gt_timer3_.start("cropOver3rdDimension aliased images ... "); }
+                            GADGET_CHECK_RETURN_FALSE(cropOver3rdDimension(aliasedImPermuted, aliasedImPermutedJob, start, end));
+                            if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                            if ( performTiming_ ) { gt_timer3_.start("grappa 3D apply image domain kernel for every channel and every job ... "); }
+                            this->applyImageDomainKernelImage(aliasedImPermutedJob, kImPermutedZeroFilledJob, buffer3DT_unwrapping, unwrappedImPermutedJob);
+                            if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                            if ( performTiming_ ) { gt_timer3_.start("setSubArrayOver3rdDimension unwrapped images ... "); }
+                            GADGET_CHECK_RETURN_FALSE(setSubArrayOver3rdDimension(unwrappedImPermutedJob, unwrappedImPermuted, start, end));
+                            if ( performTiming_ ) { gt_timer3_.stop(); }
+                        }
+                        else
+                        {
+                            for ( n=0; n<(int)N; n++ )
+                            {
+                                hoNDArray<T> kImPermuted(kE1, kE2, RO, srcCHA, dstCHA, workOrder3DT->kernelIm_->begin()+n*kE1*kE2*RO*srcCHA*dstCHA);
+
+                                if ( performTiming_ ) { gt_timer3_.start("cropOver3rdDimension hybrid domain kernel ... "); }
+                                GADGET_CHECK_RETURN_FALSE(cropOver3rdDimension(kImPermuted, kImPermutedJob, start, end));
+                                if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                                if ( performTiming_ ) { gt_timer3_.start("imageDomainKernelE1E2RO ... "); }
+                                GADGET_CHECK_RETURN_FALSE(grappa_.imageDomainKernelE1E2RO(kImPermutedJob, (int)E1, (int)E2, kImPermutedZeroFilledJob));
+                                if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                                hoNDArray<T> aliasedImPermutedN(E1, E2, RO, srcCHA, aliasedImPermuted.begin()+n*E1*E2*RO*srcCHA);
+
+                                if ( performTiming_ ) { gt_timer3_.start("cropOver3rdDimension aliased images ... "); }
+                                GADGET_CHECK_RETURN_FALSE(cropOver3rdDimension(aliasedImPermutedN, aliasedImPermutedJob, start, end));
+                                if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                                if ( performTiming_ ) { gt_timer3_.start("grappa 3D apply image domain kernel for every channel and every job ... "); }
+                                this->applyImageDomainKernelImage(aliasedImPermutedJob, kImPermutedZeroFilledJob, buffer3DT_unwrapping, unwrappedImPermutedJob);
+                                if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                                if ( performTiming_ ) { gt_timer3_.start("setSubArrayOver3rdDimension unwrapped images ... "); }
+                                GADGET_CHECK_RETURN_FALSE(setSubArrayOver3rdDimension(unwrappedImPermutedJob, unwrappedImPermuted, start, end));
+                                if ( performTiming_ ) { gt_timer3_.stop(); }
+                            }
+                        }
+
+                        ro += jobN;
+                    }
+
+                    if ( performTiming_ ) { gt_timer3_.start("permute RO to 1st dimension for unwrapped images ... "); }
+                    {
+                        size_t N3D = RO*E1*E2;
+                        size_t Num = dstCHA*N;
+
+                        T* pX = unwrappedImPermuted.begin();
+                        T* pR = workOrder3DT->fullkspace_.begin();
+
+                        long long n, e2;
+                        for (n = 0; n < (long long)Num; n++)
+                        {
+                            T* pXn = pX + n*N3D;
+                            T* pRn = pR + n*N3D;
+
+#pragma omp parallel for default(none) private(e2) shared(RO, E1, E2, pXn, pRn)
+                            for (e2 = 0; e2 < (long long)E2; e2++)
+                            {
+                                for (size_t e1 = 0; e1 < E1; e1++)
+                                {
+                                    size_t indXn = e1 + e2*E1;
+                                    size_t indRn = e1*RO + e2*RO*E1;
+                                    for (size_t ro = 0; ro < RO; ro++)
+                                    {
+                                        pRn[ro + indRn] = pXn[ro*E1*E2 + indXn];
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+                }
+                else
+                {
+                    for ( n=0; n<(int)N; n++ )
+                    {
+                        // note: per-frame kernels combined with job splitting are not handled; this branch is left empty
+                    }
+                }
+            }
+            else
+            {
+                if ( (refN<N) || (refN==1) )
+                {
+                    hoNDArray<T> kIm(RO, E1, E2, srcCHA, dstCHA, workOrder3DT->kernelIm_->begin());
+
+                    if ( performTiming_ ) { gt_timer3_.start("grappa 3D allocate buffer for unwarpping ... "); }
+                    hoNDArray<T> buffer3DT_unwrapping(RO, E1, E2, srcCHA);
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    if ( performTiming_ ) { gt_timer3_.start("grappa 3D apply image domain kernel for every channel ... "); }
+                    this->applyImageDomainKernelImage(aliasedIm, kIm, buffer3DT_unwrapping, workOrder3DT->fullkspace_);
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder3DT->fullkspace_, debugFolder_+"unwarppedIm"); }
+                }
+                else
+                {
+                    hoNDArray<T> buffer3DT_unwrapping(RO, E1, E2, srcCHA, dstCHA);
+
+                    hoNDArray<T> complexIm(RO, E1, E2, dstCHA);
+                    for ( n=0; n<(int)N; n++ )
+                    {
+                        hoNDArray<T> kIm(RO, E1, E2, srcCHA, dstCHA, workOrder3DT->kernelIm_->begin()+n*RO*E1*E2*srcCHA*dstCHA);
+
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kIm, debugFolder_+"kIm_n"); }
+
+                        hoNDArray<T> aliasedImN(RO, E1, E2, srcCHA, aliasedIm.begin()+n*RO*E1*E2*srcCHA);
+
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(aliasedImN, debugFolder_+"aliasedIm_n"); }
+
+                        this->applyImageDomainKernelImage(aliasedImN, kIm, buffer3DT_unwrapping, complexIm);
+                        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(complexIm, debugFolder_+"complexIm_n"); }
+
+                        memcpy(workOrder3DT->fullkspace_.begin()+n*RO*E1*E2*dstCHA, complexIm.begin(), sizeof(T)*RO*E1*E2*dstCHA);
+                    }
+                }
+            }
+
+            if ( (workOrder3DT->coilMap_->get_size(0)==RO) 
+                && (workOrder3DT->coilMap_->get_size(1)==E1) 
+                && (workOrder3DT->coilMap_->get_size(2)==E2) 
+                && (workOrder3DT->coilMap_->get_size(3)==dstCHA) )
+            {
+                if ( performTiming_ ) { gt_timer3_.start("grappa 3D coil combination ... "); }
+                gtPlusISMRMRDReconUtilComplex<T>().coilCombine3D(workOrder3DT->fullkspace_, *workOrder3DT->coilMap_, workOrder3DT->complexIm_);
+                if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder3DT->complexIm_, debugFolder_+"combined"); }
+            }
+
+            if ( performTiming_ ) { gt_timer3_.start("grappa 3D go back to kspace ... "); }
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(workOrder3DT->fullkspace_);
+            if ( performTiming_ ) { gt_timer3_.stop(); }
+        }
+        else
+        {
+            if ( (refN<N) || (refN==1) )
+            {
+                if ( performTiming_ ) { gt_timer3_.start("grappa 3D unmixCoeff ... "); }
+                hoNDArray<T> unmixCoeff(RO, E1, E2, srcCHA, workOrder3DT->unmixingCoeffIm_->begin());
+                if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                if ( performTiming_ ) { gt_timer3_.start("grappa 3D apply unmixing coeff ... "); }
+                this->applyUnmixCoeffImage(aliasedIm, unmixCoeff, workOrder3DT->complexIm_);
+                if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder3DT->complexIm_, debugFolder_+"unwarppedIm"); }
+            }
+            else
+            {
+                for ( n=0; n<(int)N; n++ )
+                {
+                    hoNDArray<T> unmixCoeff(RO, E1, E2, srcCHA, workOrder3DT->unmixingCoeffIm_->begin()+n*RO*E1*E2*srcCHA);
+                    hoNDArray<T> aliasedImN(RO, E1, E2, srcCHA, aliasedIm.begin()+n*RO*E1*E2*srcCHA);
+                    hoNDArray<T> unwarppedIm(RO, E1, E2, 1, workOrder3DT->complexIm_.begin()+n*RO*E1*E2);
+
+                    this->applyUnmixCoeffImage(aliasedImN, unmixCoeff, unwarppedIm);
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(unwarppedIm, debugFolder_+"unwarppedIm"); }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DTGRAPPA<T>::performUnwrapping(gtPlusReconWorkOrder3DT<T>* workOrder3DT, const hoNDArray<T>& data) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DTGRAPPA<T>::performRecon(gtPlusReconWorkOrder3DT<T>* workOrder3DT)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(workOrder3DT!=NULL);
+
+        // call the BaseClass
+        GADGET_CHECK_RETURN_FALSE(BaseClass::performRecon(workOrder3DT));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DTGRAPPA<T>::performRecon(gtPlusReconWorkOrder3DT<T>* workOrder3DT) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTL1SPIRITNCG.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTL1SPIRITNCG.h
new file mode 100644
index 0000000..66e22d5
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTL1SPIRITNCG.h
@@ -0,0 +1,787 @@
+/** \file   gtPlusISMRMRDReconWorker3DTL1SPIRITNCG.h
+    \brief  Implement the 3DT non-linear SPIRIT reconstruction using the non-linear CG solver
+    \author Hui Xue
+*/
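+
+// The worker solves the SPIRiT self-consistency problem with an additional L1 wavelet regularization
+// term; the linear SPIRiT solution serves as the starting point of the nonlinear conjugate-gradient
+// (NCG) solver, and the readout direction is decoupled by an inverse FFT so that chunks of readouts
+// can be reconstructed independently.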
+
+#pragma once
+
+#include "gtPlusISMRMRDReconWorker3DTSPIRIT.h"
+#include "gtPlusSPIRIT2DTOperator.h"
+#include "gtPlusSPIRITNoNullSpace2DOperator.h"
+#include "gtPlusSPIRITNoNullSpace2DTOperator.h"
+#include "gtPlusNCGSolver.h"
+#include "gtPlusWavelet2DOperator.h"
+#include "gtPlusWavelet3DOperator.h"
+#include "gtPlusWaveletNoNullSpace2DOperator.h"
+#include "gtPlusWaveletNoNullSpace3DOperator.h"
+#include "gtPlusDataFidelityOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusReconWorker3DTL1SPIRITNCG : public gtPlusReconWorker3DTSPIRIT<T>
+{
+public:
+
+    typedef gtPlusReconWorker3DTSPIRIT<T> BaseClass;
+    typedef gtPlusReconWorkOrder3DT<T> WorkOrderType;
+    typedef typename BaseClass::value_type value_type;
+
+    gtPlusReconWorker3DTL1SPIRITNCG() : BaseClass() {}
+    virtual ~gtPlusReconWorker3DTL1SPIRITNCG() {}
+
+    virtual bool performUnwarppingImpl(gtPlusReconWorkOrder<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& adj_forward_G_I, hoNDArray<T>& res, size_t n);
+    virtual bool performUnwarppingImplROPermuted(gtPlusReconWorkOrder<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& kernel, hoNDArray<T>& coilMap, hoNDArray<T>& res);
+    virtual bool performUnwarppingImplROPermuted(gtPlusReconWorkOrder<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& kernel, hoNDArray<T>& coilMap, hoNDArray<T>& kspaceLinear, hoNDArray<T>& res);
+    virtual bool performUnwarppingImpl(gtPlusReconJob2DT<T>& job);
+
+    virtual bool computeKSpace(gtPlusReconWorkOrder3DT<T>* workOrder3DT);
+
+    virtual bool autoReconParameter(gtPlusReconWorkOrder<T>* workOrder);
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::verbose_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_cplx_;
+
+//protected:
+
+    using BaseClass::ref_src_;
+    using BaseClass::ref_dst_;
+    using BaseClass::data_dst_;
+    using BaseClass::ref_coil_map_dst_;
+    using BaseClass::startE1_;
+    using BaseClass::endE1_;
+
+    gtPlusSPIRIT<T> spirit_;
+};
+
+template <typename T> 
+bool gtPlusReconWorker3DTL1SPIRITNCG<T>::computeKSpace(gtPlusReconWorkOrder3DT<T>* workOrder3DT)
+{
+    bool recon_kspace = true;
+    if ( workOrder3DT->spirit_perform_nonlinear_ && workOrder3DT->spirit_use_coil_sen_map_ ) recon_kspace = false;
+    return recon_kspace;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DTL1SPIRITNCG<T>::autoReconParameter(gtPlusReconWorkOrder<T>* workOrder)
+{
+    BaseClass::autoReconParameter(workOrder);
+
+    gtPlusReconWorkOrder3DT<T>* workOrder3DT = dynamic_cast<gtPlusReconWorkOrder3DT<T>*>(workOrder);
+    if ( workOrder3DT == NULL ) return false;
+
+    double acceFactor = workOrder3DT->acceFactorE1_ * workOrder3DT->acceFactorE2_;
+
+    if ( workOrder3DT->spirit_perform_linear_ )
+    {
+        if ( acceFactor>=16 )
+        {
+            workOrder3DT->spirit_3D_scale_per_chunk_ = true;
+
+            if ( workOrder3DT->spirit_solve_symmetric_ )
+            {
+                workOrder3DT->spirit_image_reg_lamda_ = 0.0025;
+                workOrder3DT->spirit_ncg_iter_thres_ = 0.001;
+            }
+            else
+            {
+                workOrder3DT->spirit_image_reg_lamda_ = 0.0025;
+                workOrder3DT->spirit_ncg_iter_thres_ = 0.001;
+            }
+        }
+        else if ( acceFactor>=12 )
+        {
+            workOrder3DT->spirit_3D_scale_per_chunk_ = true;
+
+            if ( workOrder3DT->spirit_solve_symmetric_ )
+            {
+                workOrder3DT->spirit_image_reg_lamda_ = 0.0025;
+                workOrder3DT->spirit_ncg_iter_thres_ = 0.001;
+            }
+            else
+            {
+                workOrder3DT->spirit_image_reg_lamda_ = 0.0025;
+                workOrder3DT->spirit_ncg_iter_thres_ = 0.001;
+            }
+        }
+        else if ( acceFactor>=9 )
+        {
+            workOrder3DT->spirit_3D_scale_per_chunk_ = true;
+
+            if ( workOrder3DT->spirit_solve_symmetric_ )
+            {
+                workOrder3DT->spirit_image_reg_lamda_ = 0.0025;
+                workOrder3DT->spirit_ncg_iter_thres_ = 0.001;
+            }
+            else
+            {
+                workOrder3DT->spirit_image_reg_lamda_ = 0.0025;
+                workOrder3DT->spirit_ncg_iter_thres_ = 0.001;
+            }
+        }
+        else if ( acceFactor>=6 )
+        {
+            workOrder3DT->spirit_3D_scale_per_chunk_ = true;
+
+            if ( workOrder3DT->spirit_solve_symmetric_ )
+            {
+                workOrder3DT->spirit_image_reg_lamda_ = 0.002;
+                workOrder3DT->spirit_ncg_iter_thres_ = 0.001;
+            }
+            else
+            {
+                workOrder3DT->spirit_image_reg_lamda_ = 0.002;
+                workOrder3DT->spirit_ncg_iter_thres_ = 0.001;
+            }
+        }
+        else if ( acceFactor>=4 )
+        {
+            workOrder3DT->spirit_3D_scale_per_chunk_ = true;
+
+            if ( workOrder3DT->spirit_solve_symmetric_ )
+            {
+                workOrder3DT->spirit_image_reg_lamda_ = 0.0015;
+                workOrder3DT->spirit_ncg_iter_thres_ = 0.001;
+            }
+            else
+            {
+                workOrder3DT->spirit_image_reg_lamda_ = 0.002;
+                workOrder3DT->spirit_ncg_iter_thres_ = 0.001;
+            }
+        }
+        else
+        {
+            workOrder3DT->spirit_3D_scale_per_chunk_ = true;
+
+            if ( workOrder3DT->spirit_solve_symmetric_ )
+            {
+                workOrder3DT->spirit_image_reg_lamda_ = 0.0015;
+                workOrder3DT->spirit_ncg_iter_thres_ = 0.001;
+            }
+            else
+            {
+                workOrder3DT->spirit_image_reg_lamda_ = 0.002;
+                workOrder3DT->spirit_ncg_iter_thres_ = 0.001;
+            }
+        }
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DTL1SPIRITNCG<T>::
+performUnwarppingImpl(gtPlusReconWorkOrder<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& adj_forward_G_I, hoNDArray<T>& res, size_t n)
+{
+    try
+    {
+        // RO, E1, E2, srcCHA, dstCHA
+
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t E2 = kspace.get_size(2);
+
+        size_t srcCHA = adj_forward_G_I.get_size(3);
+        size_t dstCHA = adj_forward_G_I.get_size(4);
+
+        res.create(kspace.get_dimensions());
+
+        // perform the 3D recon by read-out decoupling
+
+        hoNDArray<T> kspaceIfftRO(RO, E1, E2, srcCHA);
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft1c(kspace, kspaceIfftRO);
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspaceIfftRO, debugFolder_+"kspaceIfftRO"); }
+
+        hoNDArray<T> kspaceIfftROPermuted(E1, E2, srcCHA, RO);
+
+        if ( performTiming_ ) { gt_timer3_.start("permtue RO to 4th dimension ... "); }
+
+        std::vector<size_t> dim_order(4);
+        dim_order[0] = 1;
+        dim_order[1] = 2;
+        dim_order[2] = 3;
+        dim_order[3] = 0;
+
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(&kspaceIfftRO, &kspaceIfftROPermuted, &dim_order));
+
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspaceIfftROPermuted, debugFolder_+"kspaceIfftROPermuted"); }
+
+        // permute kernel
+        hoNDArray<T> kerPermuted(E1, E2, srcCHA, dstCHA, RO);
+        if ( performTiming_ ) { gt_timer3_.start("permute kernel RO to 5th dimension ... "); }
+
+        {
+            std::vector<size_t> dim_order(5);
+            dim_order[0] = 0;
+            dim_order[1] = 1;
+            dim_order[2] = 3;
+            dim_order[3] = 4;
+            dim_order[4] = 2;
+
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(&adj_forward_G_I, &kerPermuted, &dim_order));
+        }
+
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        // permute coil map
+        hoNDArray<T> coilMapN(RO, E1, E2, dstCHA, workOrder3DT->coilMap_->begin()+n*RO*E1*E2*dstCHA);
+        hoNDArray<T> coilMapPermuted(E1, E2, dstCHA, RO);
+        if ( performTiming_ ) { gt_timer3_.start("permtue coil map RO to 4th dimension ... "); }
+
+        {
+            std::vector<size_t> dim_order(4);
+            dim_order[0] = 1;
+            dim_order[1] = 2;
+            dim_order[2] = 3;
+            dim_order[3] = 0;
+
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(&coilMapN, &coilMapPermuted, &dim_order));
+        }
+
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(coilMapPermuted, debugFolder_+"coilMapPermuted"); }
+
+        hoNDArray<T> resPermuted(E1, E2, dstCHA, RO);
+        GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImplROPermuted(workOrder3DT, kspaceIfftROPermuted, kerPermuted, coilMapPermuted, resPermuted));
+
+        // permute the unwrapped kspace
+        if ( performTiming_ ) { gt_timer3_.start("permtue RO to 1st dimension ... "); }
+
+        {
+            std::vector<size_t> dim_order(4);
+            dim_order[0] = 3;
+            dim_order[1] = 0;
+            dim_order[2] = 1;
+            dim_order[3] = 2;
+
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(&resPermuted, &kspaceIfftRO, &dim_order));
+        }
+
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        // perform fft along the first dimension
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft1c(kspaceIfftRO, res);
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"res_3DSpirit"); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DTL1SPIRITNCG<T>::performUnwarppingImpl(gtPlusReconWorkOrder3DT<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& adj_forward_G_I, hoNDArray<T>& res) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DTL1SPIRITNCG<T>::
+performUnwarppingImplROPermuted(gtPlusReconWorkOrder<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& kernel, hoNDArray<T>& coilMap, hoNDArray<T>& res)
+{
+    try
+    {
+        size_t E1 = kspace.get_size(0);
+        size_t E2 = kspace.get_size(1);
+        size_t RO = kspace.get_size(3);
+
+        size_t kerE1 = kernel.get_size(0);
+        size_t kerE2 = kernel.get_size(1);
+        size_t srcCHA = kernel.get_size(2);
+        size_t dstCHA = kernel.get_size(3);
+        size_t kerN = kernel.get_size(5);
+
+        hoNDArray<T>* kerIm = &kernel;
+        hoNDArray<T> kerImE1E2RO;
+        if ( kerE1!=E1 || kerE2!=E2 )
+        {
+            GDEBUG_STREAM("gtPlusReconWorker3DTL1SPIRITNCG, kerE1!=E1 || kerE2!=E2, kernel needs to be converted along E1 and E2 ... ");
+
+            kerImE1E2RO.create(E1, E2, srcCHA, dstCHA, RO, kerN);
+            Gadgetron::clear(kerImE1E2RO);
+
+            GADGET_CHECK_RETURN_FALSE(spirit_.imageDomainKernelE1E2RO(kernel, (int)E1, (int)E2, kerImE1E2RO));
+            kerIm = &kerImE1E2RO;
+        }
+
+        hoNDArray<T> kspaceLinear(kspace);
+        res = kspace;
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace"); }
+
+        bool performLinear = workOrder3DT->spirit_perform_linear_;
+        if ( !workOrder3DT->spirit_perform_nonlinear_ ) performLinear = true;
+
+        if ( performLinear )
+        {
+            if ( performTiming_ ) { gt_timer3_.start("NCG spirit linear solver for 3DT ... "); }
+            GADGET_CHECK_RETURN_FALSE(BaseClass::performUnwarppingImplROPermuted(workOrder3DT, kspace, *kerIm, coilMap, kspaceLinear));
+            if ( performTiming_ ) { gt_timer3_.stop(); }
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspaceLinear, debugFolder_+"kspaceLinear"); }
+
+        if ( workOrder3DT->spirit_perform_nonlinear_ )
+        {
+            if ( workOrder3DT->spirit_3D_scale_per_chunk_ )
+            {
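+                // recompute the NCG scale factor from this chunk: ||kspace||_2 / (RO * sqrt(srcCHA))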
+                typename realType<T>::Type scaleFactor = 1.0;
+                Gadgetron::norm2(kspace, scaleFactor);
+                scaleFactor /= (value_type)(RO*std::sqrt(double(srcCHA)));
+
+                workOrder3DT->spirit_ncg_scale_factor_ = scaleFactor;
+            }
+
+            // apply the scale
+            Gadgetron::scal( (value_type)(1.0/workOrder3DT->spirit_ncg_scale_factor_), kspaceLinear);
+            Gadgetron::scal( (value_type)(1.0/workOrder3DT->spirit_ncg_scale_factor_), kspace);
+
+            boost::shared_ptr< hoNDArray<T> > coilMapN;
+            if ( workOrder3DT->coilMap_ 
+                && workOrder3DT->coilMap_->get_size(0)==E1 
+                && workOrder3DT->coilMap_->get_size(1)==E2 
+                && workOrder3DT->coilMap_->get_size(2)==dstCHA 
+                && workOrder3DT->coilMap_->get_size(3)==RO )
+            {
+                coilMapN = boost::shared_ptr< hoNDArray<T> >( new hoNDArray<T>(E1, E2, dstCHA, RO, coilMap.begin()) );
+            }
+
+            if ( RO > 1 )
+            {
+                boost::shared_ptr<hoNDArray<T> > ker(new hoNDArray<T>(E1, E2, srcCHA, dstCHA, RO, kerIm->begin()));
+                boost::shared_ptr<hoNDArray<T> > acq(new hoNDArray<T>(E1, E2, srcCHA, RO, kspace.begin()));
+
+                gtPlusNCGSolver<hoNDArray<T>, hoNDArray<T>, gtPlusOperator<T> > ncgsolver;
+                ncgsolver.iterMax_ = workOrder3DT->spirit_ncg_iter_max_;
+                ncgsolver.printIter_ = workOrder3DT->spirit_ncg_print_iter_;
+                ncgsolver.secantRatio_ = 2;
+                ncgsolver.x0_ = &kspaceLinear;
+
+                hoNDArray<T> b;
+
+                if ( workOrder3DT->spirit_data_fidelity_lamda_ <= 0 )
+                {
+                    // parallel imaging term
+                    gtPlusSPIRIT2DTOperator<T> spirit;
+                    spirit.use_symmetric_spirit_ = false;
+
+                    spirit.setForwardKernel(ker, true);
+                    spirit.setAcquiredPoints(acq);
+
+                    // L1 term
+                    gtPlusWavelet3DOperator<T> wavNullSpace3DOperator;
+                    wavNullSpace3DOperator.setAcquiredPoints(acq);
+
+                    wavNullSpace3DOperator.scale_factor_first_dimension_ = (value_type)workOrder3DT->spirit_E1_enhancement_ratio_;
+                    wavNullSpace3DOperator.scale_factor_second_dimension_ = (value_type)workOrder3DT->spirit_E2_enhancement_ratio_;
+                    wavNullSpace3DOperator.scale_factor_third_dimension_ = (value_type)workOrder3DT->spirit_RO_enhancement_ratio_;
+
+                    if ( workOrder3DT->spirit_use_coil_sen_map_ && coilMapN )
+                    {
+                        wavNullSpace3DOperator.setCoilSenMap(coilMapN);
+                    }
+
+                    // set operators
+                    ncgsolver.add(spirit, (value_type)(workOrder3DT->spirit_parallel_imaging_lamda_) );
+                    ncgsolver.add(wavNullSpace3DOperator, (value_type)(workOrder3DT->spirit_image_reg_lamda_) );
+
+                    if ( performTiming_ ) { gt_timer3_.start("NCG spirit solver for 3DT ... "); }
+                    ncgsolver.solve(b, res);
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"ncg_spirit_3DT_res"); }
+
+                    spirit.restoreAcquiredKSpace(kspace, res);
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"ncg_spirit_3DT_res_restored"); }
+                }
+                else
+                {
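+                    // data fidelity weight > 0: use the no-null-space SPIRIT formulation with an explicit data fidelity operator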
+                    gtPlusSPIRITNoNullSpace2DTOperator<T> spirit_noNullSpace;
+                    spirit_noNullSpace.use_symmetric_spirit_ = false;
+
+                    spirit_noNullSpace.setForwardKernel(ker, true);
+                    spirit_noNullSpace.setAcquiredPoints(acq);
+
+                    gtPlusDataFidelityOperator<T> dataOper;
+                    dataOper.setAcquiredPoints(acq);
+
+                    gtPlusWaveletNoNullSpace3DOperator<T> wavNoNullSpace3DOperator;
+                    wavNoNullSpace3DOperator.setAcquiredPoints(acq);
+
+                    wavNoNullSpace3DOperator.scale_factor_first_dimension_ = (value_type)workOrder3DT->spirit_E1_enhancement_ratio_;
+                    wavNoNullSpace3DOperator.scale_factor_second_dimension_ = (value_type)workOrder3DT->spirit_E2_enhancement_ratio_;
+                    wavNoNullSpace3DOperator.scale_factor_third_dimension_ = (value_type)workOrder3DT->spirit_RO_enhancement_ratio_;
+
+                    if ( workOrder3DT->spirit_use_coil_sen_map_ && coilMapN )
+                    {
+                        wavNoNullSpace3DOperator.setCoilSenMap(coilMapN);
+                    }
+
+                    ncgsolver.add(spirit_noNullSpace, (value_type)(workOrder3DT->spirit_parallel_imaging_lamda_) );
+                    ncgsolver.add(wavNoNullSpace3DOperator, (value_type)(workOrder3DT->spirit_image_reg_lamda_) );
+                    ncgsolver.add(dataOper, (value_type)(workOrder3DT->spirit_data_fidelity_lamda_) );
+
+                    if ( performTiming_ ) { gt_timer3_.start("NCG spirit solver for 3DT without null space ... "); }
+                    ncgsolver.solve(b, res);
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"ncg_spirit_3DT_res_noNullSpace"); }
+                }
+            }
+            else
+            {
+                boost::shared_ptr<hoNDArray<T> > ker(new hoNDArray<T>(E1, E2, srcCHA, dstCHA, kerIm->begin()));
+                boost::shared_ptr<hoNDArray<T> > acq(new hoNDArray<T>(E1, E2, srcCHA, kspace.begin()));
+
+                gtPlusNCGSolver<hoNDArray<T>, hoNDArray<T>, gtPlusOperator<T> > ncgsolver;
+                ncgsolver.iterMax_ = workOrder3DT->spirit_ncg_iter_max_;
+                ncgsolver.printIter_ = workOrder3DT->spirit_ncg_print_iter_;
+                ncgsolver.secantRatio_ = 2;
+                ncgsolver.x0_ = &kspaceLinear;
+
+                hoNDArray<T> b;
+
+                if ( workOrder3DT->spirit_data_fidelity_lamda_ <= 0 )
+                {
+                    // parallel imaging term
+                    gtPlusSPIRIT2DOperator<T> spirit;
+                    spirit.use_symmetric_spirit_ = false;
+                    spirit.setForwardKernel(ker, true);
+                    spirit.setAcquiredPoints(acq);
+
+                    // L1 term
+                    gtPlusWavelet2DOperator<T> wavNullSpace2DOperator;
+                    wavNullSpace2DOperator.setAcquiredPoints(acq);
+
+                    if ( workOrder3DT->spirit_use_coil_sen_map_ && coilMapN )
+                    {
+                        wavNullSpace2DOperator.setCoilSenMap(coilMapN);
+                    }
+
+                    // set operators
+                    ncgsolver.add(spirit, (value_type)(workOrder3DT->spirit_parallel_imaging_lamda_) );
+                    ncgsolver.add(wavNullSpace2DOperator, (value_type)(workOrder3DT->spirit_image_reg_lamda_) );
+
+                    if ( performTiming_ ) { gt_timer3_.start("NCG spirit solver for 3D ... "); }
+                    ncgsolver.solve(b, res);
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"ncg_spirit_3D_res"); }
+
+                    spirit.restoreAcquiredKSpace(kspace, res);
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"ncg_spirit_3D_res_restored"); }
+                }
+                else
+                {
+                    gtPlusSPIRITNoNullSpace2DOperator<T> spirit_noNullSpace;
+                    spirit_noNullSpace.use_symmetric_spirit_ = false;
+                    spirit_noNullSpace.setForwardKernel(ker, true);
+                    spirit_noNullSpace.setAcquiredPoints(acq);
+
+                    gtPlusDataFidelityOperator<T> dataOper;
+                    dataOper.setAcquiredPoints(acq);
+
+                    gtPlusWaveletNoNullSpace2DOperator<T> wavNoNullSpace2DOperator;
+                    wavNoNullSpace2DOperator.setAcquiredPoints(acq);
+
+                    if ( workOrder3DT->spirit_use_coil_sen_map_ && coilMapN )
+                    {
+                        wavNoNullSpace2DOperator.setCoilSenMap(coilMapN);
+                    }
+
+                    ncgsolver.add(spirit_noNullSpace, (value_type)(workOrder3DT->spirit_parallel_imaging_lamda_) );
+                    ncgsolver.add(wavNoNullSpace2DOperator, (value_type)(workOrder3DT->spirit_image_reg_lamda_) );
+                    ncgsolver.add(dataOper, (value_type)(workOrder3DT->spirit_data_fidelity_lamda_) );
+
+                    if ( performTiming_ ) { gt_timer3_.start("NCG spirit solver for 3D without null space ... "); }
+                    ncgsolver.solve(b, res);
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"ncg_spirit_3D_res_noNullSpace"); }
+                }
+            }
+
+            Gadgetron::scal( (value_type)(workOrder3DT->spirit_ncg_scale_factor_), res);
+        }
+        else
+        {
+            res = kspaceLinear;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DTL1SPIRITNCG<T>::performUnwarppingImplROPermuted(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DTL1SPIRITNCG<T>::
+performUnwarppingImplROPermuted(gtPlusReconWorkOrder<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& kernel, hoNDArray<T>& coilMap, hoNDArray<T>& kspaceLinear, hoNDArray<T>& res)
+{
+    try
+    {
+        size_t E1 = kspace.get_size(0);
+        size_t E2 = kspace.get_size(1);
+        size_t RO = kspace.get_size(3);
+
+        size_t kerE1 = kernel.get_size(0);
+        size_t kerE2 = kernel.get_size(1);
+        size_t srcCHA = kernel.get_size(2);
+        size_t dstCHA = kernel.get_size(3);
+        size_t kerN = kernel.get_size(5);
+
+        hoNDArray<T>* kerIm = &kernel;
+        hoNDArray<T> kerImE1E2RO;
+        if ( kerE1!=E1 || kerE2!=E2 )
+        {
+            GDEBUG_STREAM("gtPlusReconWorker3DTL1SPIRITNCG, kerE1!=E1 || kerE2!=E2, kernel needs to be converted along E1 and E2 ... ");
+
+            kerImE1E2RO.create(E1, E2, srcCHA, dstCHA, RO, kerN);
+            Gadgetron::clear(kerImE1E2RO);
+
+            GADGET_CHECK_RETURN_FALSE(spirit_.imageDomainKernelE1E2RO(kernel, (int)E1, (int)E2, kerImE1E2RO));
+            kerIm = &kerImE1E2RO;
+        }
+
+        res = kspace;
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace, debugFolder_+"kspace"); }
+
+        bool performLinear = workOrder3DT->spirit_perform_linear_;
+        if ( !workOrder3DT->spirit_perform_nonlinear_ ) performLinear = true;
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspaceLinear, debugFolder_+"kspaceLinear"); }
+
+        if ( workOrder3DT->spirit_perform_nonlinear_ )
+        {
+            if ( workOrder3DT->spirit_3D_scale_per_chunk_ )
+            {
+                typename realType<T>::Type scaleFactor = 1.0;
+                Gadgetron::norm2(kspace, scaleFactor);
+                scaleFactor /= (value_type)(RO*std::sqrt(double(srcCHA)));
+
+                workOrder3DT->spirit_ncg_scale_factor_ = scaleFactor;
+            }
+
+            // apply the scale
+            Gadgetron::scal((value_type)(1.0/workOrder3DT->spirit_ncg_scale_factor_), kspaceLinear);
+            Gadgetron::scal((value_type)(1.0/workOrder3DT->spirit_ncg_scale_factor_), kspace);
+
+            boost::shared_ptr< hoNDArray<T> > coilMapN;
+            if ( workOrder3DT->coilMap_ 
+                && workOrder3DT->coilMap_->get_size(0)==E1 
+                && workOrder3DT->coilMap_->get_size(1)==E2 
+                && workOrder3DT->coilMap_->get_size(2)==dstCHA 
+                && workOrder3DT->coilMap_->get_size(3)==RO )
+            {
+                coilMapN = boost::shared_ptr< hoNDArray<T> >( new hoNDArray<T>(E1, E2, dstCHA, RO, coilMap.begin()) );
+            }
+
+            if ( RO > 1 )
+            {
+                boost::shared_ptr<hoNDArray<T> > ker(new hoNDArray<T>(E1, E2, srcCHA, dstCHA, RO, kerIm->begin()));
+                boost::shared_ptr<hoNDArray<T> > acq(new hoNDArray<T>(E1, E2, srcCHA, RO, kspace.begin()));
+
+                gtPlusNCGSolver<hoNDArray<T>, hoNDArray<T>, gtPlusOperator<T> > ncgsolver;
+                ncgsolver.iterMax_ = workOrder3DT->spirit_ncg_iter_max_;
+                ncgsolver.printIter_ = workOrder3DT->spirit_ncg_print_iter_;
+                ncgsolver.secantRatio_ = 2;
+                ncgsolver.x0_ = &kspaceLinear;
+
+                hoNDArray<T> b;
+
+                if ( workOrder3DT->spirit_data_fidelity_lamda_ <= 0 )
+                {
+                    // parallel imaging term
+                    gtPlusSPIRIT2DTOperator<T> spirit;
+                    spirit.use_symmetric_spirit_ = false;
+
+                    spirit.setForwardKernel(ker, true);
+                    spirit.setAcquiredPoints(acq);
+
+                    // L1 term
+                    gtPlusWavelet3DOperator<T> wavNullSpace3DOperator;
+                    wavNullSpace3DOperator.setAcquiredPoints(acq);
+
+                    wavNullSpace3DOperator.scale_factor_first_dimension_ = (value_type)workOrder3DT->spirit_E1_enhancement_ratio_;
+                    wavNullSpace3DOperator.scale_factor_second_dimension_ = (value_type)workOrder3DT->spirit_E2_enhancement_ratio_;
+                    wavNullSpace3DOperator.scale_factor_third_dimension_ = (value_type)workOrder3DT->spirit_RO_enhancement_ratio_;
+
+                    if ( workOrder3DT->spirit_use_coil_sen_map_ && coilMapN )
+                    {
+                        wavNullSpace3DOperator.setCoilSenMap(coilMapN);
+                    }
+
+                    // set operators
+                    ncgsolver.add(spirit, (value_type)(workOrder3DT->spirit_parallel_imaging_lamda_) );
+                    ncgsolver.add(wavNullSpace3DOperator, (value_type)(workOrder3DT->spirit_image_reg_lamda_) );
+
+                    if ( performTiming_ ) { gt_timer3_.start("NCG spirit solver for 3DT ... "); }
+                    ncgsolver.solve(b, res);
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"ncg_spirit_3DT_res"); }
+
+                    spirit.restoreAcquiredKSpace(kspace, res);
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"ncg_spirit_3DT_res_restored"); }
+                }
+                else
+                {
+                    gtPlusSPIRITNoNullSpace2DTOperator<T> spirit_noNullSpace;
+                    spirit_noNullSpace.use_symmetric_spirit_ = false;
+
+                    spirit_noNullSpace.setForwardKernel(ker, true);
+                    spirit_noNullSpace.setAcquiredPoints(acq);
+
+                    gtPlusDataFidelityOperator<T> dataOper;
+                    dataOper.setAcquiredPoints(acq);
+
+                    gtPlusWaveletNoNullSpace3DOperator<T> wavNoNullSpace3DOperator;
+                    wavNoNullSpace3DOperator.setAcquiredPoints(acq);
+
+                    wavNoNullSpace3DOperator.scale_factor_first_dimension_ = (value_type)workOrder3DT->spirit_E1_enhancement_ratio_;
+                    wavNoNullSpace3DOperator.scale_factor_second_dimension_ = (value_type)workOrder3DT->spirit_E2_enhancement_ratio_;
+                    wavNoNullSpace3DOperator.scale_factor_third_dimension_ = (value_type)workOrder3DT->spirit_RO_enhancement_ratio_;
+
+                    if ( workOrder3DT->spirit_use_coil_sen_map_ && coilMapN )
+                    {
+                        wavNoNullSpace3DOperator.setCoilSenMap(coilMapN);
+                    }
+
+                    ncgsolver.add(spirit_noNullSpace, (value_type)(workOrder3DT->spirit_parallel_imaging_lamda_) );
+                    ncgsolver.add(wavNoNullSpace3DOperator, (value_type)(workOrder3DT->spirit_image_reg_lamda_) );
+                    ncgsolver.add(dataOper, (value_type)(workOrder3DT->spirit_data_fidelity_lamda_) );
+
+                    if ( performTiming_ ) { gt_timer3_.start("NCG spirit solver for 3DT without null space ... "); }
+                    ncgsolver.solve(b, res);
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"ncg_spirit_3DT_res_noNullSpace"); }
+                }
+            }
+            else
+            {
+                boost::shared_ptr<hoNDArray<T> > ker(new hoNDArray<T>(E1, E2, srcCHA, dstCHA, kerIm->begin()));
+                boost::shared_ptr<hoNDArray<T> > acq(new hoNDArray<T>(E1, E2, srcCHA, kspace.begin()));
+
+                gtPlusNCGSolver<hoNDArray<T>, hoNDArray<T>, gtPlusOperator<T> > ncgsolver;
+                ncgsolver.iterMax_ = workOrder3DT->spirit_ncg_iter_max_;
+                ncgsolver.printIter_ = workOrder3DT->spirit_ncg_print_iter_;
+                ncgsolver.secantRatio_ = 2;
+                ncgsolver.x0_ = &kspaceLinear;
+
+                hoNDArray<T> b;
+
+                if ( workOrder3DT->spirit_data_fidelity_lamda_ <= 0 )
+                {
+                    // parallel imaging term
+                    gtPlusSPIRIT2DOperator<T> spirit;
+                    spirit.use_symmetric_spirit_ = false;
+                    spirit.setForwardKernel(ker, true);
+                    spirit.setAcquiredPoints(acq);
+
+                    // L1 term
+                    gtPlusWavelet2DOperator<T> wavNullSpace2DOperator;
+                    wavNullSpace2DOperator.setAcquiredPoints(acq);
+
+                    if ( workOrder3DT->spirit_use_coil_sen_map_ && coilMapN )
+                    {
+                        wavNullSpace2DOperator.setCoilSenMap(coilMapN);
+                    }
+
+                    // set operators
+                    ncgsolver.add(spirit, (value_type)(workOrder3DT->spirit_parallel_imaging_lamda_) );
+                    ncgsolver.add(wavNullSpace2DOperator, (value_type)(workOrder3DT->spirit_image_reg_lamda_) );
+
+                    if ( performTiming_ ) { gt_timer3_.start("NCG spirit solver for 3D ... "); }
+                    ncgsolver.solve(b, res);
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"ncg_spirit_3D_res"); }
+
+                    spirit.restoreAcquiredKSpace(kspace, res);
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"ncg_spirit_3D_res_restored"); }
+                }
+                else
+                {
+                    gtPlusSPIRITNoNullSpace2DOperator<T> spirit_noNullSpace;
+                    spirit_noNullSpace.use_symmetric_spirit_ = false;
+                    spirit_noNullSpace.setForwardKernel(ker, true);
+                    spirit_noNullSpace.setAcquiredPoints(acq);
+
+                    gtPlusDataFidelityOperator<T> dataOper;
+                    dataOper.setAcquiredPoints(acq);
+
+                    gtPlusWaveletNoNullSpace2DOperator<T> wavNoNullSpace2DOperator;
+                    wavNoNullSpace2DOperator.setAcquiredPoints(acq);
+
+                    if ( workOrder3DT->spirit_use_coil_sen_map_ && coilMapN )
+                    {
+                        wavNoNullSpace2DOperator.setCoilSenMap(coilMapN);
+                    }
+
+                    ncgsolver.add(spirit_noNullSpace, (value_type)(workOrder3DT->spirit_parallel_imaging_lamda_) );
+                    ncgsolver.add(wavNoNullSpace2DOperator, (value_type)(workOrder3DT->spirit_image_reg_lamda_) );
+                    ncgsolver.add(dataOper, (value_type)(workOrder3DT->spirit_data_fidelity_lamda_) );
+
+                    if ( performTiming_ ) { gt_timer3_.start("NCG spirit solver for 3D without null space ... "); }
+                    ncgsolver.solve(b, res);
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"ncg_spirit_3D_res_noNullSpace"); }
+                }
+            }
+
+            Gadgetron::scal( (value_type)(workOrder3DT->spirit_ncg_scale_factor_), res);
+        }
+        else
+        {
+            res = kspaceLinear;
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DTL1SPIRITNCG<T>::performUnwarppingImplROPermuted(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DTL1SPIRITNCG<T>::
+performUnwarppingImpl(gtPlusReconJob2DT<T>& job)
+{
+    try
+    {
+        hoNDArray<T>& kspace = job.kspace; // [E1 E2 srcCHA RO 1]
+        hoNDArray<T>& ker = job.ker; // [E1 E2 srcCHA dstCHA RO 1]
+        hoNDArray<T>& res = job.res; // [E1 E2 dstCHA RO 1]
+        gtPlusReconWorkOrder<T>* workOrder3DT = &(job.workOrder2DT);
+
+        job.res = job.kspace;
+
+        GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImplROPermuted(workOrder3DT, kspace, ker, *job.workOrder2DT.coilMap_, res));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DTL1SPIRITNCG<T>::performUnwarppingImpl(job) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTNoAcceleration.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTNoAcceleration.h
new file mode 100644
index 0000000..abc02f8
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTNoAcceleration.h
@@ -0,0 +1,144 @@
+/** \file   gtPlusISMRMRDReconWorker3DTNoAcceleration.h
+    \brief  Implement the 3DT reconstruction without k-space undersampling
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "ismrmrd/ismrmrd.h"
+
+#include "GadgetronTimer.h"
+
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorker3DT.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusReconWorker3DTNoAcceleration : public gtPlusReconWorker3DT<T>
+{
+public:
+
+    typedef gtPlusReconWorker3DT<T> BaseClass;
+    typedef typename BaseClass::value_type value_type;
+
+    gtPlusReconWorker3DTNoAcceleration() : BaseClass() {}
+    virtual ~gtPlusReconWorker3DTNoAcceleration() {}
+
+    virtual bool performRecon(gtPlusReconWorkOrder3DT<T>* workOrder3DT);
+
+    virtual bool computeKSpace(gtPlusReconWorkOrder3DT<T>* /*workOrder3DT*/) { return false; }
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::verbose_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_cplx_;
+
+    using BaseClass::ref_src_;
+    using BaseClass::ref_dst_;
+    using BaseClass::data_dst_;
+    using BaseClass::ref_coil_map_dst_;
+    using BaseClass::startE1_;
+    using BaseClass::endE1_;
+    using BaseClass::startE2_;
+    using BaseClass::endE2_;
+};
+
+template <typename T> 
+bool gtPlusReconWorker3DTNoAcceleration<T>::performRecon(gtPlusReconWorkOrder3DT<T>* workOrder3DT)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(workOrder3DT!=NULL);
+
+        if ( !workOrder3DT->workFlow_use_BufferedKernel_ )
+        {
+            if ( performTiming_ ) { gt_timer1_.start("prepRef"); }
+            GADGET_CHECK_RETURN_FALSE(this->prepRef(workOrder3DT, workOrder3DT->ref_, 
+                                                workOrder3DT->ref_recon_, 
+                                                workOrder3DT->ref_coil_map_, 
+                                                workOrder3DT->start_RO_, workOrder3DT->end_RO_, 
+                                                workOrder3DT->start_E1_, workOrder3DT->end_E1_, 
+                                                workOrder3DT->start_E2_, workOrder3DT->end_E2_, 
+                                                workOrder3DT->data_.get_size(1), workOrder3DT->data_.get_size(2)));
+            if ( performTiming_ ) { gt_timer1_.stop(); }
+        }
+
+        size_t RO = workOrder3DT->data_.get_size(0);
+        size_t E1 = workOrder3DT->data_.get_size(1);
+        size_t E2 = workOrder3DT->data_.get_size(2);
+        size_t CHA = workOrder3DT->data_.get_size(3);
+        size_t N = workOrder3DT->data_.get_size(4);
+
+        size_t refN = workOrder3DT->ref_recon_.get_size(4);
+        size_t usedN;
+
+        // estimate the coil sensitivity
+        if ( !workOrder3DT->workFlow_use_BufferedKernel_ 
+                    || (workOrder3DT->coilMap_->get_size(0)!=RO) 
+                    || (workOrder3DT->coilMap_->get_size(1)!=E1)
+                    || (workOrder3DT->coilMap_->get_size(2)!=E2) )
+        {
+            workOrder3DT->coilMap_->create(RO, E1, E2, CHA, refN);
+
+            if ( workOrder3DT->no_acceleration_same_combinationcoeff_allN_ )
+            {
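+                // compute the coil map from one selected N (usedN) and replicate it over the N dimension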
+                usedN = workOrder3DT->no_acceleration_whichN_combinationcoeff_;
+                if ( usedN >= refN ) usedN = refN-1;
+
+                hoNDArray<T> refCoilMapN(RO, E1, E2, CHA, workOrder3DT->ref_coil_map_.begin()+usedN*RO*E1*E2*CHA);
+
+                hoNDArray<T> buffer3DT(refCoilMapN.get_dimensions());
+
+                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(refCoilMapN, buffer3DT);
+
+                hoNDArray<T> coilMapN(RO, E1, E2, CHA, workOrder3DT->coilMap_->begin()+usedN*RO*E1*E2*CHA);
+
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIH(buffer3DT, 
+                        coilMapN, workOrder3DT->coil_map_algorithm_, workOrder3DT->csm_kSize_, workOrder3DT->csm_powermethod_num_, workOrder3DT->csm_iter_num_, (value_type)workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+
+                GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder3DT->coilMap_, usedN));
+            }
+            else
+            {
+                hoNDArray<T> buffer3DT(workOrder3DT->ref_coil_map_.get_dimensions());
+
+                Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(workOrder3DT->ref_coil_map_, buffer3DT);
+
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIH(buffer3DT, 
+                        *workOrder3DT->coilMap_, workOrder3DT->coil_map_algorithm_, workOrder3DT->csm_kSize_, workOrder3DT->csm_powermethod_num_, workOrder3DT->csm_iter_num_, (value_type)workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+            }
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*workOrder3DT->coilMap_, debugFolder_+"coilMap_"); }
+        }
+
+        // partial Fourier handling
+        GADGET_CHECK_RETURN_FALSE(this->performPartialFourierHandling(workOrder3DT));
+
+        workOrder3DT->complexIm_.create(RO, E1, E2, N);
+
+        if ( performTiming_ ) { gt_timer1_.start("perform coil combination"); }
+
+        hoNDArray<T> buffer3DT(workOrder3DT->data_.get_dimensions());
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(workOrder3DT->data_, buffer3DT);
+        gtPlusISMRMRDReconUtilComplex<T>().coilCombine3D(buffer3DT, *workOrder3DT->coilMap_, workOrder3DT->complexIm_);
+
+        if ( performTiming_ ) { gt_timer1_.stop(); }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder3DT->complexIm_, debugFolder_+"combined"); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DTNoAcceleration<T>::performRecon(gtPlusReconWorkOrder3DT<T>* workOrder3DT) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTSPIRIT.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTSPIRIT.h
new file mode 100644
index 0000000..57497a2
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTSPIRIT.h
@@ -0,0 +1,1056 @@
+/** \file   gtPlusISMRMRDReconWorker3DTSPIRIT.h
+    \brief  Implement the 3DT linear SPIRIT reconstruction
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "ismrmrd/ismrmrd.h"
+#include "GadgetronTimer.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorker3DT.h"
+#include "gtPlusSPIRIT.h"
+#include "gtPlusSPIRIT2DTOperator.h"
+#include "gtPlusLSQRSolver.h"
+
+#include "GadgetCloudController.h"
+#include "GadgetCloudJobMessageReadWrite.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusReconWorker3DTSPIRIT : public gtPlusReconWorker3DT<T>
+{
+public:
+
+    typedef gtPlusReconWorker3DT<T> BaseClass;
+    typedef gtPlusReconWorkOrder3DT<T> WorkOrderType;
+    typedef typename BaseClass::value_type value_type;
+
+    gtPlusReconWorker3DTSPIRIT() : BaseClass(), spirit_kernelIm_permuted_(false) {}
+    virtual ~gtPlusReconWorker3DTSPIRIT() {}
+
+    virtual bool performRecon(gtPlusReconWorkOrder3DT<T>* workOrder3DT);
+
+    virtual bool performCalibPrep(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, WorkOrderType* workOrder3DT);
+    virtual bool performCalibImpl(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, WorkOrderType* workOrder3DT, size_t usedN);
+
+    virtual bool performUnwrapping(gtPlusReconWorkOrder3DT<T>* workOrder3DT, const hoNDArray<T>& data);
+    virtual bool performUnwarppingImplROPermuted(gtPlusReconWorkOrder<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& ker, hoNDArray<T>& coilMap, hoNDArray<T>& res);
+    virtual bool performUnwarppingImpl(gtPlusReconWorkOrder<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& adj_forward_G_I, hoNDArray<T>& res, size_t n);
+    virtual bool performUnwarppingImpl(gtPlusReconJob2DT<T>& job);
+
+    virtual bool computeKSpace(gtPlusReconWorkOrder3DT<T>* workOrder3DT);
+
+    virtual bool autoReconParameter(gtPlusReconWorkOrder<T>* workOrder);
+
+    virtual bool splitJob(gtPlusReconWorkOrder3DT<T>* workOrder3DT, size_t& jobN);
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::verbose_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+
+//protected:
+
+    using BaseClass::ref_src_;
+    using BaseClass::ref_dst_;
+    using BaseClass::data_dst_;
+    using BaseClass::ref_coil_map_dst_;
+    using BaseClass::startE1_;
+    using BaseClass::endE1_;
+
+    gtPlusSPIRIT<T> spirit_;
+
+    bool spirit_kernelIm_permuted_;
+};
+
+template <typename T> 
+bool gtPlusReconWorker3DTSPIRIT<T>::computeKSpace(gtPlusReconWorkOrder3DT<T>* workOrder3DT)
+{
+    bool recon_kspace = true;
+    return recon_kspace;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DTSPIRIT<T>::autoReconParameter(gtPlusReconWorkOrder<T>* workOrder)
+{
+    gtPlusReconWorkOrder3DT<T>* workOrder3DT = dynamic_cast<gtPlusReconWorkOrder3DT<T>*>(workOrder);
+    if ( workOrder3DT == NULL ) return false;
+
+    double acceFactor = workOrder3DT->acceFactorE1_ * workOrder3DT->acceFactorE2_;
+
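+    // pick iteration limits, stopping thresholds and regularization strength according to the overall acceleration factor (E1 x E2)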
+    if ( acceFactor>=16 )
+    {
+        workOrder3DT->spirit_iter_max_ = 150;
+        workOrder3DT->spirit_iter_thres_ = 0.0025;
+        workOrder3DT->spirit_reg_lamda_ = 0.01;
+    }
+    else if ( acceFactor>=12 )
+    {
+        workOrder3DT->spirit_iter_max_ = 100;
+        workOrder3DT->spirit_iter_thres_ = 0.0025;
+        workOrder3DT->spirit_reg_lamda_ = 0.01;
+    }
+    else if ( acceFactor>=9 )
+    {
+        workOrder3DT->spirit_iter_max_ = 100;
+        workOrder3DT->spirit_iter_thres_ = 0.0025;
+        workOrder3DT->spirit_reg_lamda_ = 0.01;
+    }
+    else if ( acceFactor>=6 )
+    {
+        workOrder3DT->spirit_iter_max_ = 100;
+        workOrder3DT->spirit_iter_thres_ = 0.0025;
+        workOrder3DT->spirit_reg_lamda_ = 0.01;
+    }
+    else if ( acceFactor>=4 )
+    {
+        workOrder3DT->spirit_iter_max_ = 70;
+        workOrder3DT->spirit_iter_thres_ = 0.005;
+        workOrder3DT->spirit_reg_lamda_ = 0.01;
+    }
+    else
+    {
+        workOrder3DT->spirit_iter_max_ = 50;
+        workOrder3DT->spirit_iter_thres_ = 0.005;
+        workOrder3DT->spirit_reg_lamda_ = 0.01;
+
+        if ( workOrder3DT->recon_algorithm_ == ISMRMRD_SPIRIT )
+        {
+            workOrder3DT->spirit_iter_thres_ = 0.005;
+        }
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DTSPIRIT<T>::
+performCalibPrep(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, WorkOrderType* workOrder3DT)
+{
+    spirit_.performTiming_ = performTiming_;
+    spirit_.debugFolder_ = debugFolder_;
+
+    size_t RO = workOrder3DT->data_.get_size(0);
+    size_t E1 = workOrder3DT->data_.get_size(1);
+    size_t E2 = workOrder3DT->data_.get_size(2);
+    size_t N = workOrder3DT->data_.get_size(4);
+    size_t srcCHA = ref_src.get_size(3);
+
+    size_t refRO = ref_dst.get_size(0);
+    size_t refE1 = ref_dst.get_size(1);
+    size_t refE2 = ref_dst.get_size(2);
+    size_t refN = ref_dst.get_size(4);
+    size_t dstCHA = ref_dst.get_size(3);
+
+    bool reconKSpace = this->computeKSpace(workOrder3DT);
+
+    size_t kRO = workOrder3DT->spirit_kSize_RO_;
+    size_t kE1 = workOrder3DT->spirit_kSize_E1_;
+    size_t kE2 = workOrder3DT->spirit_kSize_E2_;
+
+    size_t oRO = workOrder3DT->spirit_oSize_RO_;
+    size_t oE1 = workOrder3DT->spirit_oSize_E1_;
+    size_t oE2 = workOrder3DT->spirit_oSize_E2_;
+
+    workOrder3DT->kernel_->create(kRO, kE1, kE2, srcCHA, dstCHA, oRO, oE1, oE2, refN);
+
+    size_t jobN;
+    bool splitJobs = this->splitJob(workOrder3DT, jobN);
+
+    if ( !splitJobs )
+    {
+        if ( performTiming_ ) { gt_timer3_.start("allocate image domain kernel ... "); }
+        workOrder3DT->kernelIm_->create(E1, E2, RO, srcCHA, dstCHA, refN);
+        // pre-set to zero is needed here
+        memset(workOrder3DT->kernelIm_->begin(), 0, workOrder3DT->kernelIm_->get_number_of_bytes());
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+    }
+    else
+    {
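+        // when jobs are split, store the kernel in image space along RO only; E1/E2 stay at convolution kernel size (2k-1) and are converted per job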
+        size_t convKE1 = 2*kE1-1;
+        size_t convKE2 = 2*kE2-1;
+
+        if ( performTiming_ ) { gt_timer3_.start("allocate image domain kernel only along RO ... "); }
+        workOrder3DT->kernelIm_->create(convKE1, convKE2, RO, srcCHA, dstCHA, refN);
+        // pre-set to zero is needed here
+        memset(workOrder3DT->kernelIm_->begin(), 0, workOrder3DT->kernelIm_->get_number_of_bytes());
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DTSPIRIT<T>::
+performCalibImpl(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, WorkOrderType* workOrder3DT, size_t usedN)
+{
+    size_t RO = workOrder3DT->data_.get_size(0);
+    size_t E1 = workOrder3DT->data_.get_size(1);
+    size_t E2 = workOrder3DT->data_.get_size(2);
+    size_t N = workOrder3DT->data_.get_size(4);
+    size_t srcCHA = ref_src.get_size(3);
+
+    size_t refRO = ref_dst.get_size(0);
+    size_t refE1 = ref_dst.get_size(1);
+    size_t refE2 = ref_dst.get_size(2);
+    size_t refN = ref_dst.get_size(4);
+    size_t dstCHA = ref_dst.get_size(3);
+
+    bool reconKSpace = this->computeKSpace(workOrder3DT);
+
+    size_t kRO = workOrder3DT->spirit_kSize_RO_;
+    size_t kE1 = workOrder3DT->spirit_kSize_E1_;
+    size_t kE2 = workOrder3DT->spirit_kSize_E2_;
+
+    size_t oRO = workOrder3DT->spirit_oSize_RO_;
+    size_t oE1 = workOrder3DT->spirit_oSize_E1_;
+    size_t oE2 = workOrder3DT->spirit_oSize_E2_;
+
+    ho4DArray<T> acsSrc(refRO, refE1, refE2, srcCHA, const_cast<T*>(ref_src.begin()+usedN*refRO*refE1*refE2*srcCHA));
+    ho4DArray<T> acsDst(refRO, refE1, refE2, dstCHA, const_cast<T*>(ref_dst.begin()+usedN*refRO*refE1*refE2*dstCHA));
+
+    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(acsSrc, debugFolder_+"acsSrc"); }
+    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(acsDst, debugFolder_+"acsDst"); }
+
+    hoNDArray<T> ker(kRO, kE1, kE2, srcCHA, dstCHA, oRO, oE1, oE2, workOrder3DT->kernel_->begin()+usedN*kRO*kE1*kE2*srcCHA*dstCHA*oRO*oE1*oE2);
+
+    spirit_.calib_use_gpu_ = workOrder3DT->spirit_use_gpu_;
+
+    if ( performTiming_ ) { gt_timer3_.start("SPIRIT 3D calibration ... "); }
+    GADGET_CHECK_RETURN_FALSE(spirit_.calib3D(acsSrc, acsDst, workOrder3DT->spirit_reg_lamda_, workOrder3DT->spirit_calib_over_determine_ratio_, kRO, kE1, kE2, oRO, oE1, oE2, ker));
+    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(ker, debugFolder_+"ker"); }
+
+    bool minusI = true;
+
+    size_t jobN;
+    bool splitJobs = this->splitJob(workOrder3DT, jobN);
+
+    if ( !splitJobs )
+    {
+        hoNDArray<T> kIm(E1, E2, RO, srcCHA, dstCHA, workOrder3DT->kernelIm_->begin()+usedN*E1*E2*RO*srcCHA*dstCHA);
+
+        if ( performTiming_ ) { gt_timer3_.start("SPIRIT 3D image domain kernel ... "); }
+        GADGET_CHECK_RETURN_FALSE(spirit_.imageDomainKernel3D(ker, kRO, kE1, kE2, oRO, oE1, oE2, RO, E1, E2, kIm, minusI));
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        if ( !debugFolder_.empty() )
+        {
+            hoNDArray<T> kImACha(E1, E2, RO, srcCHA, kIm.begin());
+            gt_exporter_.exportArrayComplex(kImACha, debugFolder_+"kImACha");
+        }
+    }
+    else
+    {
+        size_t convKE1 = 2*kE1-1;
+        size_t convKE2 = 2*kE2-1;
+
+        hoNDArray<T> kIm(convKE1, convKE2, RO, srcCHA, dstCHA, workOrder3DT->kernelIm_->begin()+usedN*convKE1*convKE2*RO*srcCHA*dstCHA);
+
+        if ( performTiming_ ) { gt_timer3_.start("SPIRIT 3D image domain kernel only along RO ... "); }
+        GADGET_CHECK_RETURN_FALSE(spirit_.imageDomainKernelRO3D(ker, kRO, kE1, kE2, oRO, oE1, oE2, RO, kIm, minusI));
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        if ( !debugFolder_.empty() )
+        {
+            hoNDArray<T> kImROACha(convKE1, convKE2, RO, srcCHA, kIm.begin());
+            gt_exporter_.exportArrayComplex(kImROACha, debugFolder_+"kImROACha");
+        }
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DTSPIRIT<T>::
+splitJob(gtPlusReconWorkOrder3DT<T>* workOrder3DT, size_t& jobN)
+{
+    size_t RO = workOrder3DT->data_.get_size(0);
+    size_t E1 = workOrder3DT->data_.get_size(1);
+    size_t E2 = workOrder3DT->data_.get_size(2);
+
+    size_t srcCHA = workOrder3DT->kernelIm_->get_size(3);
+    size_t dstCHA = workOrder3DT->kernelIm_->get_size(4);
+
+    bool splitByS = workOrder3DT->job_split_by_S_;
+    jobN = workOrder3DT->job_num_of_N_;
+    size_t jobMegaBytes = workOrder3DT->job_max_Megabytes_;
+
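+    // split jobs when explicitly requested (split by S or by a fixed job N), or when the image-domain kernel for all RO locations would exceed the per-job memory budget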
+    bool splitJobs = (splitByS==true || jobN>0);
+    if ( !splitJobs )
+    {
+        if ( jobMegaBytes>0 )
+        {
+            size_t jobN = jobMegaBytes/(E1*E2*srcCHA*dstCHA*sizeof(T)/1024/1024);
+            if ( jobN < RO ) splitJobs = true;
+            GDEBUG_STREAM("SPIRIT - 3DT - size of largest job : " << jobN);
+        }
+    }
+    if ( jobN >= RO ) splitJobs = false;
+
+    return splitJobs;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DTSPIRIT<T>::
+performUnwrapping(gtPlusReconWorkOrder3DT<T>* workOrder3DT, const hoNDArray<T>& data_dst)
+{
+    try
+    {
+        int n;
+
+        size_t RO = workOrder3DT->data_.get_size(0);
+        size_t E1 = workOrder3DT->data_.get_size(1);
+        size_t E2 = workOrder3DT->data_.get_size(2);
+        size_t N = workOrder3DT->data_.get_size(4);
+
+        size_t kImE1 = workOrder3DT->kernelIm_->get_size(0);
+        size_t kImE2 = workOrder3DT->kernelIm_->get_size(1);
+        size_t kImRO = workOrder3DT->kernelIm_->get_size(2);
+        size_t srcCHA = workOrder3DT->kernelIm_->get_size(3);
+        size_t dstCHA = workOrder3DT->kernelIm_->get_size(4);
+
+        size_t refN = workOrder3DT->kernelIm_->get_size(5);
+
+        workOrder3DT->complexIm_.create(RO, E1, E2, N);
+
+        // downstream coil compression is not supported here
+        // kspace is always reconstructed
+        workOrder3DT->fullkspace_ = data_dst;
+
+        // compute the scaling factor
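+        // scale factor = ||kspace||_2 / (RO * sqrt(srcCHA)); stored in the work order and used to normalize the data before the NCG solve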
+        typename realType<T>::Type scaleFactor = 1.0;
+        hoNDArray<T> kspaceForScaleFactor(RO, E1, E2, srcCHA, const_cast<T*>(data_dst.begin()));
+        Gadgetron::norm2(kspaceForScaleFactor, scaleFactor);
+        scaleFactor /= (value_type)(RO*std::sqrt(double(srcCHA)));
+
+        workOrder3DT->spirit_ncg_scale_factor_ = scaleFactor;
+
+        size_t indMax;
+        hoNDArray<value_type> mag;
+        Gadgetron::abs(kspaceForScaleFactor, mag);
+        value_type maxMag;
+        Gadgetron::maxAbsolute(mag, maxMag, indMax);
+        workOrder3DT->spirit_slep_scale_factor_ = maxMag;
+
+        // split the jobs
+        size_t jobMegaBytes = workOrder3DT->job_max_Megabytes_;
+        size_t jobN = workOrder3DT->job_num_of_N_;
+        bool splitJobs = this->splitJob(workOrder3DT, jobN);
+        size_t maxNumOfBytesPerJob = jobMegaBytes*1024*1024;
+
+        size_t overlapN = workOrder3DT->job_overlap_;
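+        // plain linear SPIRIT does not use any overlap between neighbouring job chunks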
+        if ( workOrder3DT->recon_algorithm_==ISMRMRD_SPIRIT )
+        {
+            overlapN = 0;
+        }
+
+        if ( splitJobs )
+        {
+            hoNDArray<T> kspaceIfftRO(RO, E1, E2, srcCHA, N);
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft1c(data_dst, kspaceIfftRO);
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspaceIfftRO, debugFolder_+"kspaceIfftRO"); }
+
+            hoNDArray<T> kspaceIfftROPermuted(E1, E2, srcCHA, RO, N);
+            if ( performTiming_ ) { gt_timer3_.start("permute kspace RO to 4th dimension ... "); }
+
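+            // dim_order [1 2 3 0]: [RO E1 E2 srcCHA N] -> [E1 E2 srcCHA RO N]; N stays as the last dimension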
+            std::vector<size_t> dim_order(4);
+            dim_order[0] = 1;
+            dim_order[1] = 2;
+            dim_order[2] = 3;
+            dim_order[3] = 0;
+
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(&kspaceIfftRO, &kspaceIfftROPermuted, &dim_order));
+
+            if ( performTiming_ ) { gt_timer3_.stop(); }
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspaceIfftROPermuted, debugFolder_+"kspaceIfftROPermuted"); }
+
+            hoNDArray<T> kerPermuted;
+            if ( !spirit_kernelIm_permuted_ )
+            {
+                spirit_kernelIm_permuted_ = true;
+
+                size_t kerN = kImE1*kImE2*srcCHA*dstCHA*kImRO*N;
+                size_t kerImSize = sizeof(T)*kerN;
+                GDEBUG_STREAM("SPIRIT - 3DT - image domain kernel size : " << kerImSize/1024.0/1024 << " MBytes ... ");
+
+                if ( performTiming_ ) { gt_timer3_.start("allocate permuted kernel ... "); }
+                kerPermuted.create(kImE1, kImE2, srcCHA, dstCHA, kImRO, N);
+                if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*workOrder3DT->kernelIm_, debugFolder_+"kernelImBeforePermuted"); }
+
+                if ( performTiming_ ) { gt_timer3_.start("permute kernel RO to 5th dimension ... "); }
+
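+                // dim_order [0 1 3 4 2]: [E1 E2 RO srcCHA dstCHA N] -> [E1 E2 srcCHA dstCHA RO N]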
+                std::vector<size_t> dim_order(5);
+                dim_order[0] = 0;
+                dim_order[1] = 1;
+                dim_order[2] = 3;
+                dim_order[3] = 4;
+                dim_order[4] = 2;
+
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(workOrder3DT->kernelIm_.get(), &kerPermuted, &dim_order));
+
+                if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kerPermuted, debugFolder_+"kerPermuted"); }
+
+                workOrder3DT->kernelIm_->reshape(kerPermuted.get_dimensions());
+                *workOrder3DT->kernelIm_ = kerPermuted;
+
+                kerPermuted.clear();
+
+                kerPermuted.create(kImE1, kImE2, srcCHA, dstCHA, kImRO, N, workOrder3DT->kernelIm_->begin());
+            }
+            else
+            {
+                kerPermuted.create(E1, E2, srcCHA, dstCHA, RO, N, workOrder3DT->kernelIm_->begin());
+            }
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kerPermuted, debugFolder_+"kerPermuted_Used"); }
+
+            gtPlusReconWorkOrder3DT<T> workOrder3DTJobSplit;
+            workOrder3DT->duplicate(workOrder3DTJobSplit);
+
+            boost::shared_ptr< hoNDArray<T> > coilMapPermuted = boost::shared_ptr< hoNDArray<T> >(new hoNDArray<T>()) ;
+            if ( workOrder3DT->coilMap_->get_number_of_elements() > 0 )
+            {
+                if ( performTiming_ ) { gt_timer3_.start("permute coil map RO to 4th dimension ... "); }
+
+                coilMapPermuted->create(workOrder3DT->coilMap_->get_size(1), 
+                                        workOrder3DT->coilMap_->get_size(2), 
+                                        workOrder3DT->coilMap_->get_size(3), 
+                                        workOrder3DT->coilMap_->get_size(0), 
+                                        workOrder3DT->coilMap_->get_size(4) );
+
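+                // dim_order [1 2 3 0]: [RO E1 E2 CHA N] -> [E1 E2 CHA RO N]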
+                std::vector<size_t> dim_order(4);
+                dim_order[0] = 1;
+                dim_order[1] = 2;
+                dim_order[2] = 3;
+                dim_order[3] = 0;
+
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(workOrder3DT->coilMap_.get(), coilMapPermuted.get(), &dim_order));
+
+                if ( performTiming_ ) { gt_timer3_.stop(); }
+                if (!debugFolder_.empty()) { gt_exporter_.exportArrayComplex(*coilMapPermuted, debugFolder_ + "coilMapPermuted"); }
+
+                workOrder3DTJobSplit.coilMap_ = coilMapPermuted;
+            }
+
+            bool runJobsOnCloud = workOrder3DT->CloudComputing_;
+            unsigned int cloudSize = workOrder3DT->CloudSize_;
+            bool runJobsOnLocalNode = workOrder3DT->job_perform_on_control_node_;
+
+            std::vector<gtPlusReconJob2DT<T> > jobList;
+
+            if ( runJobsOnCloud )
+            {
+                unsigned int j;
+
+                GADGET_CHECK_RETURN_FALSE(this->estimateJobSize(workOrder3DT, maxNumOfBytesPerJob, overlapN, cloudSize, jobN));
+
+                //GDEBUG_STREAM("SPIRIT - 3DT - cloudSize is " << cloudSize << " - RO is " << RO << " ... ");
+                //unsigned int nodeN = cloudSize;
+                //if ( runJobsOnLocalNode ) nodeN++;
+                //GDEBUG_STREAM("SPIRIT - 3DT - runJobsOnLocalNode is " << runJobsOnLocalNode << " - nodeN is " << nodeN << " - overlapN is " << overlapN << " ... ");
+
+                //// adjust jobN according to cloud size
+                //jobN = std::ceil( (double)(RO+overlapN*(nodeN-1))/(double)nodeN );
+
+                //size_t numOfBytesPerJob = sizeof(T)*( E1*E2*srcCHA*dstCHA*jobN + 2*E1*E2*srcCHA*jobN );
+
+                //while ( numOfBytesPerJob > 2.2*1024*1024*1024-64.0*1024*1024 )
+                //{
+                //    nodeN *= 2;
+                //    jobN = std::ceil( (double)(RO+overlapN*(nodeN-1))/(double)nodeN );
+                //    numOfBytesPerJob = sizeof(T)*( E1*E2*srcCHA*dstCHA*jobN + 2*E1*E2*srcCHA*jobN );
+                //}
+
+                //GDEBUG_STREAM("SPIRIT - 3DT - every job will have " << numOfBytesPerJob/1024.0/1024 << " MBytes ... ");
+
+                // split the job
+                GADGET_CHECK_RETURN_FALSE(this->splitReconJob(&workOrder3DTJobSplit, kspaceIfftROPermuted, kerPermuted, workOrder3DT->job_split_by_S_, jobN, jobMegaBytes, overlapN, jobList));
+
+                std::vector<gtPlusReconJob2DT<T> > completedJobList(jobList.size());
+
+                for ( j=0; j<jobList.size(); j++ )
+                {
+                    jobList[j].workOrder2DT.duplicate(completedJobList[j].workOrder2DT);
+                    completedJobList[j].job_index_startN_ = jobList[j].job_index_startN_;
+                    completedJobList[j].job_index_endN_ = jobList[j].job_index_endN_;
+                    completedJobList[j].job_index_S_ = jobList[j].job_index_S_;
+                }
+
+                GDEBUG_STREAM("SPIRIT - 3DT - total job : " << jobList.size() << " - job N : " << jobN << " - cloud size : " << cloudSize);
+
+                unsigned int numOfJobRunOnCloud = (unsigned int)(jobList.size() - jobList.size()/(cloudSize+1));
+                if ( !runJobsOnLocalNode ) numOfJobRunOnCloud = (unsigned int)jobList.size();
+
+                typedef Gadgetron::GadgetCloudController< gtPlusReconJob2DT<T> > GTCloudControllerType;
+                GTCloudControllerType controller;
+
+                if (controller.open () == -1)
+                {
+                    GERROR_STREAM("Cloud controller cannot open the cloud ...");
+                    controller.handle_close (ACE_INVALID_HANDLE, 0);
+                    runJobsOnCloud = false;
+                }
+                else
+                {
+                    std::vector<gtPlusReconJob2DT<T>* > jobListCloud(numOfJobRunOnCloud);
+                    std::vector<gtPlusReconJob2DT<T>* > completedJobListCloud(numOfJobRunOnCloud);
+                    std::vector<int> node_ids(numOfJobRunOnCloud);
+
+                    GADGET_CHECK_RETURN_FALSE(this->scheduleJobForNodes(workOrder3DT, numOfJobRunOnCloud, node_ids));
+
+                    for ( j=0; j<numOfJobRunOnCloud; j++ )
+                    {
+                        // node_ids[j] = j%cloudSize;
+                        jobListCloud[j] = &jobList[j];
+                        completedJobListCloud[j] = &completedJobList[j];
+                        GDEBUG_STREAM("--> job " << j << " runs on node " << node_ids[j] << " ... ");
+                    }
+
+                    std::vector<GadgetMessageReader*> readers(cloudSize, NULL);
+                    std::vector<GadgetMessageWriter*> writers(cloudSize, NULL);
+
+                    for ( j=0; j<cloudSize; j++ )
+                    {
+                        readers[j] = new GtPlusCloudJobMessageReaderCPFL();
+                        writers[j] = new GtPlusCloudJobMessageWriterCPFL();
+                    }
+
+                    if ( controller.createConnector(workOrder3DT->gt_cloud_, GADGET_MESSAGE_CLOUD_JOB, readers, GADGET_MESSAGE_CLOUD_JOB, writers) != 0 )
+                    {
+                        GERROR_STREAM("Cloud controller creates connectors failed ...");
+                        controller.handle_close (ACE_INVALID_HANDLE, 0);
+                        runJobsOnCloud = false;
+                    }
+                    else if ( controller.connectToCloud(workOrder3DT->gt_cloud_) != 0 )
+                    {
+                        GERROR_STREAM("Cloud controller cannot connect to the cloud ...");
+                        controller.handle_close (ACE_INVALID_HANDLE, 0);
+                        runJobsOnCloud = false;
+                    }
+                    else
+                    {
+                        if ( controller.runJobsOnCloud(jobListCloud, completedJobListCloud, node_ids) != 0 )
+                        {
+                            GERROR_STREAM("Cloud controller runs jobs on the cloud failed ...");
+                            controller.closeCloudNode();
+                            controller.handle_close (ACE_INVALID_HANDLE, 0);
+                            runJobsOnCloud = false;
+                        }
+                        else
+                        {
+                            controller.closeCloudNode();
+
+                            // run the left over jobs on the local computer
+                            for ( j=numOfJobRunOnCloud; j<jobList.size(); j++ )
+                            {
+                                GDEBUG_STREAM("SPIRIT - 3DT - job : " << j << " - size :" << jobList[j].job_index_endN_-jobList[j].job_index_startN_+1);
+
+                                if ( performTiming_ ) { gt_timer3_.start("SPIRIT 3DT ... "); }
+                                GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImpl(jobList[j]));
+                                if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                                std::ostringstream ostr;
+                                ostr << "job_fullkspace" << "_" << j;
+                                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(jobList[j].res, debugFolder_+ostr.str()); }
+                            }
+
+                            // wait for the cloud jobs to complete
+                            controller.waitForJobToComplete();
+
+                            // combine results from cloud and local run
+                            for ( j=0; j<numOfJobRunOnCloud; j++ )
+                            {
+                                jobList[j].res = controller.completed_job_list_[j]->res;
+                                jobList[j].complexIm = controller.completed_job_list_[j]->complexIm;
+                            }
+
+                            // if some jobs are not actually completed, process them
+                            for ( j=0; j<numOfJobRunOnCloud; j++ )
+                            {
+                                if ( 
+                                    !jobList[j].res.dimensions_equal(&jobList[j].kspace) 
+                                        && 
+                                    ( jobList[j].complexIm.get_size(0)!= jobList[j].kspace.get_size(0) 
+                                    || jobList[j].complexIm.get_size(1)!= jobList[j].kspace.get_size(1) 
+                                    || jobList[j].complexIm.get_size(2)!= jobList[j].kspace.get_size(2) ) 
+                                   )
+                                {
+                                    GDEBUG_STREAM("SPIRIT - 3DT - uncompleted cloud job : " << j << " - size :" << jobList[j].job_index_endN_-jobList[j].job_index_startN_+1);
+
+                                    if ( performTiming_ ) { gt_timer3_.start("SPIRIT 3DT ... "); }
+                                    GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImpl(jobList[j]));
+                                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                                    std::ostringstream ostr;
+                                    ostr << "job_fullkspace" << "_" << j;
+                                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(jobList[j].res, debugFolder_+ostr.str()); }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+
+            if ( !runJobsOnCloud )
+            {
+                // split the job
+                GADGET_CHECK_RETURN_FALSE(this->splitReconJob(&workOrder3DTJobSplit, kspaceIfftROPermuted, kerPermuted, workOrder3DT->job_split_by_S_, jobN, jobMegaBytes, overlapN, jobList));
+
+                GDEBUG_STREAM("SPIRIT - 3DT - total job : " << jobList.size());
+
+                size_t j;
+                for ( j=0; j<jobList.size(); j++ )
+                {
+                    GDEBUG_STREAM("SPIRIT - 3DT - job : " << j << " - size :" << jobList[j].job_index_endN_-jobList[j].job_index_startN_+1);
+
+                    if ( performTiming_ ) { gt_timer3_.start("SPIRIT 3DT ... "); }
+                    GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImpl(jobList[j]));
+                    if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                    std::ostringstream ostr;
+                    ostr << "job_fullkspace" << "_" << j;
+                    if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(jobList[j].res, debugFolder_+ostr.str()); }
+                }
+            }
+
+            // combine the job
+            workOrder3DTJobSplit.fullkspace_.create(E1, E2, dstCHA, RO, N);
+            GADGET_CHECK_RETURN_FALSE(this->combineReconJob(&workOrder3DTJobSplit, jobList, RO, N));
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder3DTJobSplit.fullkspace_, debugFolder_+"job_combined_fullkspace"); }
+
+            // clear the memory
+            jobList.clear();
+
+            // permute the unwrapped kspace
+            if ( performTiming_ ) { gt_timer3_.start("permute RO to 1st dimension ... "); }
+
+            {
+                std::vector<size_t> dim_order(4);
+                dim_order[0] = 3;
+                dim_order[1] = 0;
+                dim_order[2] = 1;
+                dim_order[3] = 2;
+
+                GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(&workOrder3DTJobSplit.fullkspace_, &kspaceIfftRO, &dim_order));
+            }
+
+            if ( performTiming_ ) { gt_timer3_.stop(); }
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspaceIfftRO, debugFolder_+"res_fullkspace_ROinIm"); }
+
+            // perform fft along the first dimension
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft1c(kspaceIfftRO, workOrder3DT->fullkspace_);
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder3DT->fullkspace_, debugFolder_+"res_3DSpirit"); }
+        }
+        else
+        {
+            for ( n=0; n<(int)N; n++ )
+            {
+                size_t kernelN = n;
+                if ( kernelN >= refN ) kernelN = refN-1;
+
+                hoNDArray<T> kIm(E1, E2, RO, srcCHA, dstCHA, workOrder3DT->kernelIm_->begin()+kernelN*RO*E1*E2*srcCHA*dstCHA);
+
+                hoNDArray<T> aliasedKSpace(RO, E1, E2, srcCHA, const_cast<T*>(data_dst.begin())+n*RO*E1*E2*srcCHA);
+
+                hoNDArray<T> unwarppedKSpace(RO, E1, E2, dstCHA, workOrder3DT->fullkspace_.begin()+n*RO*E1*E2*dstCHA);
+
+                if ( performTiming_ ) { gt_timer3_.start("spirit 3D unwrapping ... "); }
+                GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImpl(workOrder3DT, aliasedKSpace, kIm, unwarppedKSpace, n));
+                if ( performTiming_ ) { gt_timer3_.stop(); }
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(unwarppedKSpace, debugFolder_+"unwarppedKSpace"); }
+            }
+        }
+
+        if ( (workOrder3DT->coilMap_->get_size(0)==RO) 
+            && (workOrder3DT->coilMap_->get_size(1)==E1) 
+            && (workOrder3DT->coilMap_->get_size(2)==E2) 
+            && (workOrder3DT->coilMap_->get_size(3)==dstCHA) )
+        {
+            hoNDArray<T> complexImMultiChannel(RO, E1, E2, dstCHA, N);
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(workOrder3DT->fullkspace_, complexImMultiChannel);
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(complexImMultiChannel, debugFolder_+"unwarppedComplexIm"); }
+
+            if ( performTiming_ ) { gt_timer3_.start("spirit 3D coil combination ... "); }
+            gtPlusISMRMRDReconUtilComplex<T>().coilCombine3D(complexImMultiChannel, *workOrder3DT->coilMap_, workOrder3DT->complexIm_);
+            if ( performTiming_ ) { gt_timer3_.stop(); }
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(workOrder3DT->complexIm_, debugFolder_+"combined"); }
+        }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DTSPIRIT<T>::performUnwrapping(gtPlusReconWorkOrder3DT<T>* workOrder3DT, const hoNDArray<T>& data) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DTSPIRIT<T>::
+performUnwarppingImplROPermuted(gtPlusReconWorkOrder<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& ker, hoNDArray<T>& /*coilMap*/, hoNDArray<T>& res)
+{
+    try
+    {
+        size_t E1 = kspace.get_size(0);
+        size_t E2 = kspace.get_size(1);
+        size_t RO = kspace.get_size(3);
+
+        size_t kerE1 = ker.get_size(0);
+        size_t kerE2 = ker.get_size(1);
+        size_t srcCHA = ker.get_size(2);
+        size_t dstCHA = ker.get_size(3);
+        size_t kerN = ker.get_size(5);
+
+        hoNDArray<T>* kerIm = &ker;
+        hoNDArray<T> kerImE1E2RO;
+        if ( kerE1!=E1 || kerE2!=E2 )
+        {
+            GDEBUG_STREAM("gtPlusReconWorker3DTSPIRIT, kerE1!=E1 || kerE2!=E2, kernel needs to be converted along E1 and E2 ... ");
+
+            if ( performTiming_ ) { gt_timer3_.start("kernel conversion along E1 and E2 ... "); }
+
+            kerImE1E2RO.create(E1, E2, srcCHA, dstCHA, RO, kerN);
+            Gadgetron::clear(kerImE1E2RO);
+
+            GADGET_CHECK_RETURN_FALSE(spirit_.imageDomainKernelE1E2RO(ker, (int)E1, (int)E2, kerImE1E2RO));
+            kerIm = &kerImE1E2RO;
+
+            if ( performTiming_ ) { gt_timer3_.stop(); }
+        }
+
+        res.create(kspace.get_dimensions());
+
+        long long NUM = (long long)RO;
+
+        #ifdef USE_OMP
+            int numThreads = NUM;
+
+            int numOpenMPProcs = omp_get_num_procs();
+            GDEBUG_STREAM("gtPlusReconWorker3DTSPIRIT, numOpenMPProcs : " << numOpenMPProcs);
+
+            int maxOpenMPThreads = omp_get_max_threads();
+            GDEBUG_STREAM("gtPlusReconWorker3DTSPIRIT, maxOpenMPThreads : " << maxOpenMPThreads);
+
+            int allowOpenMPNested = omp_get_nested();
+
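+            // presumably (an inference, not stated in the original): when there are fewer
+            // read-out positions than processors, nested parallelism is enabled so the
+            // per-position solvers can use the remaining cores; otherwise nesting is
+            // turned off to avoid oversubscribing threads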
+            if ( NUM < numOpenMPProcs-2 )
+            {
+                omp_set_nested(1);
+                allowOpenMPNested = 1;
+            }
+            else
+            {
+                omp_set_nested(0);
+                allowOpenMPNested = 0;
+            }
+
+            GDEBUG_STREAM("gtPlusReconWorker3DTSPIRIT, allowOpenMPNested : " << allowOpenMPNested);
+            GDEBUG_STREAM("gtPlusReconWorker3DTSPIRIT, numThreads : " << numThreads);
+
+            if ( numThreads > numOpenMPProcs ) numThreads = numOpenMPProcs;
+            GDEBUG_STREAM("gtPlusReconWorker3DTSPIRIT, numThreads : " << numThreads);
+
+        #endif
+
+        long long t;
+
+        hoNDArray<T> ker_Shifted(kerIm);
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifftshift2D(*kerIm, ker_Shifted);
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(ker_Shifted, debugFolder_+"ker_Shifted"); }
+
+        hoNDArray<T> kspace_Shifted(kspace);
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifftshift2D(kspace, kspace_Shifted);
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspace_Shifted, debugFolder_+"kspace_Shifted"); }
+
+        #pragma omp parallel default(none) private(t) shared(RO, E1, E2, srcCHA, dstCHA, workOrder3DT, NUM, kspace_Shifted, ker_Shifted, res) if ( NUM > 1 ) num_threads( numThreads )
+        {
+            gtPlusSPIRIT2DOperator<T> spirit;
+            spirit.use_symmetric_spirit_ = false;
+            spirit.use_non_centered_fft_ = true;
+
+            hoNDArray<T> x0(E1, E2, srcCHA);
+            Gadgetron::clear(x0);
+
+            gtPlusLinearSolver<hoNDArray<T>, hoNDArray<T>, gtPlusSPIRIT2DOperator<T> >* pCGSolver;
+
+            pCGSolver = new gtPlusLSQRSolver<hoNDArray<T>, hoNDArray<T>, gtPlusSPIRIT2DOperator<T> >();
+
+            gtPlusLinearSolver<hoNDArray<T>, hoNDArray<T>, gtPlusSPIRIT2DOperator<T> >& cgSolver = *pCGSolver;
+
+            cgSolver.iterMax_ = workOrder3DT->spirit_iter_max_;
+            cgSolver.thres_ = (value_type)workOrder3DT->spirit_iter_thres_;
+            cgSolver.printIter_ = workOrder3DT->spirit_print_iter_;
+
+            cgSolver.set(spirit);
+
+            hoNDArray<T> b(E1, E2, srcCHA);
+
+            #pragma omp for
+            for ( t=0; t<NUM; t++ )
+            {
+                size_t ro = t;
+
+                hoNDArray<T> kspaceCurr(E1, E2, srcCHA, kspace_Shifted.begin()+ro*E1*E2*srcCHA);
+                hoNDArray<T> resCurr(E1, E2, dstCHA, res.begin()+ro*E1*E2*dstCHA);
+
+                // solve the 2D spirit problem
+                Gadgetron::clear(x0);
+
+                boost::shared_ptr<hoNDArray<T> > kerCurr(new hoNDArray<T>(E1, E2, srcCHA, dstCHA, ker_Shifted.begin()+ro*E1*E2*srcCHA*dstCHA));
+
+                spirit.setForwardKernel(kerCurr, false);
+
+                boost::shared_ptr<hoNDArray<T> > acq(new hoNDArray<T>(E1, E2, srcCHA, kspaceCurr.begin()));
+                spirit.setAcquiredPoints(acq);
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*kerCurr, debugFolder_+"spirit3D_ker"); }
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*acq, debugFolder_+"spirit3D_kspace"); }
+
+                cgSolver.x0_ = acq.get();
+
+                // compute rhs
+                spirit.computeRighHandSide(*acq, b);
+
+                // solve
+                cgSolver.solve(b, resCurr);
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(resCurr, debugFolder_+"unwarppedKSpace_t"); }
+
+                // restore the acquired points
+                spirit.restoreAcquiredKSpace(*acq, resCurr);
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(resCurr, debugFolder_+"unwarppedKSpace_t_setAcq"); }
+            }
+
+            delete pCGSolver;
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"res_Shifted"); }
+
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fftshift2D(res, kspace_Shifted);
+        res = kspace_Shifted;
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"resPermuted"); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DTSPIRIT<T>::performUnwarppingImplROPermuted(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DTSPIRIT<T>::
+performUnwarppingImpl(gtPlusReconWorkOrder<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& adj_forward_G_I, hoNDArray<T>& res, size_t n)
+{
+    try
+    {
+        // RO, E1, E2, srcCHA, dstCHA, N
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t E2 = kspace.get_size(2);
+
+        size_t srcCHA = adj_forward_G_I.get_size(3);
+        size_t dstCHA = adj_forward_G_I.get_size(4);
+
+        // perform the 3D recon by read-out decoupling
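+        // i.e. after the 1D ifft along RO below, each read-out position becomes an
+        // independent 2D (E1 x E2) SPIRIT problem; resDecoupled collects the per-RO
+        // solutions with RO as the last dimension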
+        hoNDArray<T> resDecoupled(E1, E2, dstCHA, RO);
+
+        hoNDArray<T> kspaceIfftRO(RO, E1, E2, srcCHA);
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft1c(kspace, kspaceIfftRO);
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspaceIfftRO, debugFolder_+"kspaceIfftRO"); }
+
+        hoNDArray<T> kspaceIfftROPermuted(E1, E2, srcCHA, RO);
+
+        if ( performTiming_ ) { gt_timer3_.start("permute RO to 4th dimension ... "); }
+
+        std::vector<size_t> dim_order(4);
+        dim_order[0] = 1;
+        dim_order[1] = 2;
+        dim_order[2] = 3;
+        dim_order[3] = 0;
+
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(&kspaceIfftRO, &kspaceIfftROPermuted, &dim_order));
+
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(kspaceIfftROPermuted, debugFolder_+"kspaceIfftROPermuted"); }
+
+        T* pKspaceIfftROPermuted = kspaceIfftROPermuted.begin();
+
+        T* pG_I = adj_forward_G_I.begin();
+
+        long long NUM = (long long)RO;
+
+        long long t;
+
+        #pragma omp parallel default(none) private(t) shared(RO, E1, E2, srcCHA, dstCHA, workOrder3DT, NUM, resDecoupled, pKspaceIfftROPermuted, pG_I) if ( NUM > 6 ) num_threads( (int)((NUM<16) ? NUM : 16) )
+        {
+            hoNDArray<T> adjForG_I_Decoupled(E1, E2, srcCHA, dstCHA);
+            T* pDecoupledG_I = adjForG_I_Decoupled.begin();
+
+            gtPlusSPIRIT2DOperator<T> spirit;
+            spirit.use_symmetric_spirit_ = false;
+
+            hoNDArray<T> x0(E1, E2, srcCHA);
+            Gadgetron::clear(x0);
+
+            gtPlusLinearSolver<hoNDArray<T>, hoNDArray<T>, gtPlusSPIRIT2DOperator<T> >* pCGSolver;
+            pCGSolver = new gtPlusLSQRSolver<hoNDArray<T>, hoNDArray<T>, gtPlusSPIRIT2DOperator<T> >();
+            gtPlusLinearSolver<hoNDArray<T>, hoNDArray<T>, gtPlusSPIRIT2DOperator<T> >& cgSolver = *pCGSolver;
+
+            cgSolver.iterMax_ = workOrder3DT->spirit_iter_max_;
+            cgSolver.thres_ = (value_type)workOrder3DT->spirit_iter_thres_;
+            cgSolver.printIter_ = workOrder3DT->spirit_print_iter_;
+
+            cgSolver.set(spirit);
+
+            hoNDArray<T> b(E1, E2, srcCHA);
+
+            #pragma omp for
+            for ( t=0; t<NUM; t++ )
+            {
+                size_t ro = t;
+
+                hoNDArray<T> kspace_DeDecoupled(E1, E2, srcCHA, pKspaceIfftROPermuted+ro*E1*E2*srcCHA);
+                hoNDArray<T> resCurr(E1, E2, dstCHA, resDecoupled.begin()+ro*E1*E2*dstCHA);
+
+                // fill in kernel and kspace
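+                // copy the E1 x E2 kernel plane of this read-out position, for every
+                // (srcCHA, dstCHA) channel pair, into the decoupled 2D kernel buffer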
+                size_t scha, dcha;
+
+                for ( dcha=0; dcha<dstCHA; dcha++)
+                {
+                    for ( scha=0; scha<srcCHA; scha++)
+                    {
+
+                        T* pDst = pDecoupledG_I + scha*E1*E2+dcha*E1*E2*srcCHA;
+                        T* pSrc = pG_I + ro*E1*E2+scha*RO*E1*E2+dcha*RO*E1*E2*srcCHA;
+                        memcpy(pDst, pSrc, sizeof(T)*E1*E2);
+                    }
+                }
+
+                // solve the 2D spirit problem
+                Gadgetron::clear(x0);
+
+                boost::shared_ptr<hoNDArray<T> > ker(new hoNDArray<T>(E1, E2, srcCHA, dstCHA, pDecoupledG_I));
+
+                spirit.setForwardKernel(ker, false);
+
+                boost::shared_ptr<hoNDArray<T> > acq(new hoNDArray<T>(E1, E2, srcCHA, kspace_DeDecoupled.begin()));
+                spirit.setAcquiredPoints(acq);
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*ker, debugFolder_+"spirit3D_ker"); }
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(*acq, debugFolder_+"spirit3D_kspace"); }
+
+                cgSolver.x0_ = acq.get();
+
+                // compute rhs
+                spirit.computeRighHandSide(*acq, b);
+
+                // solve
+                cgSolver.solve(b, resCurr);
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(resCurr, debugFolder_+"unwarppedKSpace_t"); }
+
+                // restore the acquired points
+                spirit.restoreAcquiredKSpace(*acq, resCurr);
+
+                if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(resCurr, debugFolder_+"unwarppedKSpace_t_setAcq"); }
+            }
+
+            delete pCGSolver;
+        }
+
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(resDecoupled, debugFolder_+"resDecoupled"); }
+
+        // permute the unwrapped kspace
+        if ( performTiming_ ) { gt_timer3_.start("permute RO to 1st dimension ... "); }
+
+        {
+            std::vector<size_t> dim_order(4);
+            dim_order[0] = 3;
+            dim_order[1] = 0;
+            dim_order[2] = 1;
+            dim_order[3] = 2;
+
+            GADGET_CHECK_EXCEPTION_RETURN_FALSE(Gadgetron::permute(&resDecoupled, &kspaceIfftRO, &dim_order));
+        }
+
+        if ( performTiming_ ) { gt_timer3_.stop(); }
+
+        // perform fft along the first dimension
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft1c(kspaceIfftRO, res);
+        if ( !debugFolder_.empty() ) { gt_exporter_.exportArrayComplex(res, debugFolder_+"res_3DSpirit"); }
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DTSPIRIT<T>::performUnwarppingImpl(gtPlusReconWorkOrder3DT<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& adj_forward_G_I, hoNDArray<T>& res) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DTSPIRIT<T>::
+performUnwarppingImpl(gtPlusReconJob2DT<T>& job)
+{
+    try
+    {
+        hoNDArray<T>& kspace = job.kspace; // [E1 E2 srcCHA RO 1]
+        hoNDArray<T>& ker = job.ker; // [E1 E2 srcCHA dstCHA RO 1]
+        hoNDArray<T>& res = job.res; // [E1 E2 dstCHA RO 1]
+        gtPlusReconWorkOrder<T>* workOrder3DT = &(job.workOrder2DT);
+
+        job.res = job.kspace;
+
+        GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImplROPermuted(workOrder3DT, kspace, ker, *job.workOrder2DT.coilMap_, res));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DTSPIRIT<T>::performUnwarppingImpl(job) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DTSPIRIT<T>::performRecon(gtPlusReconWorkOrder3DT<T>* workOrder3DT)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(workOrder3DT!=NULL);
+
+        // call the BaseClass
+        GADGET_CHECK_RETURN_FALSE(BaseClass::performRecon(workOrder3DT));
+    }
+    catch(...)
+    {
+        GERROR_STREAM("Errors in gtPlusReconWorker3DTSPIRIT<T>::performRecon(gtPlusReconWorkOrder3DT<T>* workOrder3DT) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/log/CMakeLists.txt b/toolboxes/log/CMakeLists.txt
new file mode 100644
index 0000000..81aa909
--- /dev/null
+++ b/toolboxes/log/CMakeLists.txt
@@ -0,0 +1,10 @@
+if (WIN32)
+    ADD_DEFINITIONS(-D__BUILD_GADGETRON_LOG__)
+endif (WIN32)
+
+add_library(gadgetron_toolbox_log SHARED log.cpp)
+set_target_properties(gadgetron_toolbox_log PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+install(TARGETS gadgetron_toolbox_log DESTINATION lib COMPONENT main)
+install(FILES log.h log_export.h DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
diff --git a/toolboxes/log/log.cpp b/toolboxes/log/log.cpp
new file mode 100644
index 0000000..d14b89e
--- /dev/null
+++ b/toolboxes/log/log.cpp
@@ -0,0 +1,206 @@
+#include "log.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string>
+#include <time.h>
+#include <cstring>
+
+namespace Gadgetron
+{
+  GadgetronLogger* GadgetronLogger::instance()
+  {
+    if (!instance_) instance_ = new GadgetronLogger();
+    return instance_;
+  }
+  
+  GadgetronLogger* GadgetronLogger::instance_ = NULL;
+  
+  GadgetronLogger::GadgetronLogger()
+    : level_mask_(GADGETRON_LOG_LEVEL_MAX,false)
+    , print_mask_(GADGETRON_LOG_PRINT_MAX, false)
+  {
+    char* log_mask = getenv(GADGETRON_LOG_MASK_ENVIRONMENT);
+    if ( log_mask != NULL) {
+      
+      std::string log_mask_str(log_mask);
+
+      //Which log levels are enabled
+      if (log_mask_str.find("ALL") != std::string::npos) {
+	enableAllOutputOptions();
+	enableAllLogLevels();
+	return;
+      }
+      
+      if (log_mask_str.find("LEVEL_DEBUG") != std::string::npos) 
+	enableLogLevel(GADGETRON_LOG_LEVEL_DEBUG);
+
+      if (log_mask_str.find("LEVEL_INFO") != std::string::npos) 
+	enableLogLevel(GADGETRON_LOG_LEVEL_INFO);
+
+      if (log_mask_str.find("LEVEL_WARNING") != std::string::npos) 
+	enableLogLevel(GADGETRON_LOG_LEVEL_WARNING);
+
+      if (log_mask_str.find("LEVEL_ERROR") != std::string::npos) 
+	enableLogLevel(GADGETRON_LOG_LEVEL_ERROR);
+
+      if (log_mask_str.find("PRINT_FILELOC") != std::string::npos) 
+	enableOutputOption(GADGETRON_LOG_PRINT_FILELOC);
+
+      if (log_mask_str.find("PRINT_LEVEL") != std::string::npos) 
+	enableOutputOption(GADGETRON_LOG_PRINT_LEVEL);
+      
+      if (log_mask_str.find("PRINT_DATETIME") != std::string::npos) 
+	enableOutputOption(GADGETRON_LOG_PRINT_DATETIME);
+    } else {
+      enableLogLevel(GADGETRON_LOG_LEVEL_DEBUG);
+      enableLogLevel(GADGETRON_LOG_LEVEL_INFO);
+      enableLogLevel(GADGETRON_LOG_LEVEL_WARNING);
+      enableLogLevel(GADGETRON_LOG_LEVEL_ERROR);
+      enableOutputOption(GADGETRON_LOG_PRINT_FILELOC);
+      enableOutputOption(GADGETRON_LOG_PRINT_LEVEL);
+      enableOutputOption(GADGETRON_LOG_PRINT_DATETIME);
+    }
+  }
+
+
+  void GadgetronLogger::log(GadgetronLogLevel LEVEL, const char* filename, int lineno, const char* cformatting, ...)
+  {
+    //Check if we should log this message
+    if (!isLevelEnabled(LEVEL)) return;
+
+    const char* fmt = cformatting;
+    std::string fmt_str;
+    bool append_cformatting_needed = false; //Will be set to true if we add any additional labels
+
+    if (isOutputOptionEnabled(GADGETRON_LOG_PRINT_DATETIME)) {
+      time_t rawtime;
+      struct tm * timeinfo;
+
+      time ( &rawtime );
+      timeinfo = localtime ( &rawtime );
+      
+      //Time in the format YYYY-MM-DD HH:MM:SS
+      char timestr[22];sprintf(timestr, "%d-%02d-%02d %02d:%02d:%02d ",
+			       timeinfo->tm_year+1900, timeinfo->tm_mon+1, timeinfo->tm_mday,
+			       timeinfo->tm_hour, timeinfo->tm_min, timeinfo->tm_sec);
+
+      fmt_str += std::string(timestr);
+      append_cformatting_needed = true;
+    }
+
+    if (isOutputOptionEnabled(GADGETRON_LOG_PRINT_LEVEL)) {
+      switch (LEVEL) {
+      case GADGETRON_LOG_LEVEL_DEBUG:
+	fmt_str += "DEBUG ";
+	break;
+      case GADGETRON_LOG_LEVEL_INFO:
+	fmt_str += "INFO ";
+	break;
+      case GADGETRON_LOG_LEVEL_WARNING:
+	fmt_str += "WARNING ";
+	break;
+      case GADGETRON_LOG_LEVEL_ERROR:
+	fmt_str += "ERROR ";
+	break;
+      default:
+	;
+      }
+      append_cformatting_needed = true;
+    }
+
+    if (isOutputOptionEnabled(GADGETRON_LOG_PRINT_FILELOC)) {
+      if (!isOutputOptionEnabled(GADGETRON_LOG_PRINT_FOLDER)) {
+	const char* base_start = strrchr(filename,'/');
+	if (!base_start) {
+	  base_start = strrchr(filename,'\\'); //Maybe using backslashes
+	}
+	if (base_start) {
+	  base_start++;
+	  fmt_str += std::string("[") + std::string(base_start);
+	} else {
+	  fmt_str += std::string("[") + std::string(filename);
+	}
+      } else {
+	fmt_str += std::string("[") + std::string(filename);
+      }
+      char linenostr[8];sprintf(linenostr, "%d", lineno);
+      fmt_str += std::string(":") + std::string(linenostr);
+      fmt_str += std::string("] ");
+      append_cformatting_needed = true;
+    }
+
+    if (append_cformatting_needed) {
+      fmt_str += std::string(cformatting); 
+      fmt = fmt_str.c_str();      
+    }
+
+    va_list args;
+    va_start (args, cformatting);
+    vprintf(fmt, args);
+    va_end (args);
+    fflush(stdout);
+  }
+
+  void GadgetronLogger::enableLogLevel(GadgetronLogLevel LEVEL)
+  {
+    if (LEVEL < level_mask_.size()) {
+      level_mask_[LEVEL] = true;
+    }
+  }
+
+  void GadgetronLogger::disableLogLevel(GadgetronLogLevel LEVEL)
+  {
+    if (LEVEL < level_mask_.size()) {
+      level_mask_[LEVEL] = false;
+    }
+  }
+  
+  bool GadgetronLogger::isLevelEnabled(GadgetronLogLevel LEVEL)
+  {
+    if (LEVEL >= level_mask_.size()) return false;
+    return level_mask_[LEVEL];
+  }
+  
+  void GadgetronLogger::enableAllLogLevels()
+  {
+    level_mask_.assign(GADGETRON_LOG_LEVEL_MAX,true);
+  }
+
+  void GadgetronLogger::disableAllLogLevels()
+  {
+    level_mask_.assign(GADGETRON_LOG_LEVEL_MAX,false);
+  }
+
+  void GadgetronLogger::enableOutputOption(GadgetronLogOutput OUTPUT) 
+  {
+    if (OUTPUT < print_mask_.size()) {
+      print_mask_[OUTPUT] = true;
+    }
+  }
+
+  void GadgetronLogger::disableOutputOption(GadgetronLogOutput OUTPUT) 
+  {
+    if (OUTPUT < print_mask_.size()) {
+      print_mask_[OUTPUT] = false;
+    }
+  }
+ 
+  bool GadgetronLogger::isOutputOptionEnabled(GadgetronLogOutput OUTPUT)
+  {
+    if (OUTPUT < print_mask_.size()) {
+      return print_mask_[OUTPUT];
+    }
+    return false;
+  }
+  
+  void GadgetronLogger::enableAllOutputOptions() 
+  {
+    print_mask_.assign(GADGETRON_LOG_PRINT_MAX, true);
+  }
+
+  void GadgetronLogger::disableAllOutputOptions() 
+  {
+    print_mask_.assign(GADGETRON_LOG_PRINT_MAX, false);
+  }
+}
diff --git a/toolboxes/log/log.h b/toolboxes/log/log.h
new file mode 100644
index 0000000..47311f1
--- /dev/null
+++ b/toolboxes/log/log.h
@@ -0,0 +1,195 @@
+#ifndef GADGETRON_LOG_H
+#define GADGETRON_LOG_H
+
+#include "log_export.h"
+
+#include <vector> //For mask fields
+
+#include <sstream> //For deprecated macros
+
+#define GADGETRON_LOG_MASK_ENVIRONMENT "GADGETRON_LOG_MASK"
+
+namespace Gadgetron
+{
+  /**
+     Gadgetron log levels
+   */
+  enum GadgetronLogLevel
+  {
+    GADGETRON_LOG_LEVEL_DEBUG = 0,     //!< Debug information
+    GADGETRON_LOG_LEVEL_INFO,          //!< Regular application information
+    GADGETRON_LOG_LEVEL_WARNING,       //!< Warnings about events that could lead to failures
+    GADGETRON_LOG_LEVEL_ERROR,         //!< Errors after which the application will be unable to continue
+    GADGETRON_LOG_LEVEL_VERBOSE,       //!< Verbose information about algorithm parameters, etc. 
+    GADGETRON_LOG_LEVEL_MAX            //!< All log levels must have values lower than this
+  };
+
+  /**
+     Gadgetron output options. These options control what context information
+     will be printed with the log statements.
+   */
+  enum GadgetronLogOutput
+  {
+    GADGETRON_LOG_PRINT_FILELOC = 0,  //!< Print filename and line in file
+    GADGETRON_LOG_PRINT_FOLDER,       //!< Print the folder name too (full filename)
+    GADGETRON_LOG_PRINT_LEVEL,        //!< Print Log Level
+    GADGETRON_LOG_PRINT_DATETIME,     //!< Print date and time
+    GADGETRON_LOG_PRINT_MAX           //!< All print options must have lower values than this
+  };
+
+  /**
+     Main logging utility class for the Gadgetron and associated toolboxes. 
+
+     This is a process wide singleton. 
+
+     Logging/Debug messages should be done with the convenience macros:
+
+     GDEBUG
+     GDINFO
+     GWARN
+     GERROR
+     GVERBOSE
+
+     These macros use a printf style syntax:
+
+     GDEBUG("Here we are logging some values %f, %d\n", myFloat, myInt);
+
+     The C++ std::cout is not recommended for logging as it does not add
+     filename, log level, timing or other context information to the logging
+     statements. Use of std::cout can also cause log lines from different threads
+     to be interleaved. For people more comfortable with the std::cout style 
+     syntax, we provide the macros:
+
+     GDEBUG_STREAM
+     GINFO_STREAM
+     GWARN_STREAM
+     GERROR_STREAM
+     GVERBOSE_STREAM
+
+     To use them:
+     
+     GDEBUG_STREAM("Here we are logging some values " << myFloat << ", " << myInt);
+
+     It is possible to control which log levels are output using the @enableLogLevel, @disableLogLevel, 
+     @enableOutputOption, and @disableOutputOption functions. 
+
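+     For example, to restrict output to warnings and errors at runtime (a minimal
+     sketch using only the functions declared below in this class):
+
+       Gadgetron::GadgetronLogger* logger = Gadgetron::GadgetronLogger::instance();
+       logger->disableAllLogLevels();
+       logger->enableLogLevel(Gadgetron::GADGETRON_LOG_LEVEL_WARNING);
+       logger->enableLogLevel(Gadgetron::GADGETRON_LOG_LEVEL_ERROR);
+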
+     Log levels are defined in @GadgetronLogLevel
+     Output options are defined in @GadgetronLogOutput
+ 
+     The logger checks the environment variable GADGETRON_LOG_MASK. If it is set,
+     it disables all log levels and outputs and only enables the ones in the mask.
+     It can be specified with (on Unix systems):
+
+     export GADGETRON_LOG_MASK="LEVEL_INFO,LEVEL_DEBUG,PRINT_FILELOC,PRINT_DATETIME"
+
+     Any (or no) separator is allowed between the levels and output options.
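+
+     If GADGETRON_LOG_MASK is not set, the DEBUG, INFO, WARNING and ERROR levels
+     and the PRINT_FILELOC, PRINT_LEVEL and PRINT_DATETIME output options are
+     enabled by default (see the GadgetronLogger constructor in log.cpp).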
+
+   */
+  class EXPORTGADGETRONLOG GadgetronLogger
+  {
+  public:
+    ///Function for accessing the process wide singleton
+    static GadgetronLogger* instance();
+
+    ///Generic log function. Use the logging macros for easy access to this function
+    void log(GadgetronLogLevel LEVEL, const char* filename, int lineno, const char* cformatting, ...);
+
+    void enableLogLevel(GadgetronLogLevel LEVEL);
+    void disableLogLevel(GadgetronLogLevel LEVEL);
+    bool isLevelEnabled(GadgetronLogLevel LEVEL);
+    void enableAllLogLevels();
+    void disableAllLogLevels();
+
+    void enableOutputOption(GadgetronLogOutput OUTPUT);
+    void disableOutputOption(GadgetronLogOutput OUTPUT);
+    bool isOutputOptionEnabled(GadgetronLogOutput OUTPUT);
+    void enableAllOutputOptions();
+    void disableAllOutputOptions();
+
+  protected:
+    GadgetronLogger();
+    static GadgetronLogger* instance_;
+    std::vector<bool> level_mask_;
+    std::vector<bool> print_mask_;
+  };
+}
+
+#define GDEBUG(...)   Gadgetron::GadgetronLogger::instance()->log(Gadgetron::GADGETRON_LOG_LEVEL_DEBUG,   __FILE__, __LINE__, __VA_ARGS__)
+#define GINFO(...)    Gadgetron::GadgetronLogger::instance()->log(Gadgetron::GADGETRON_LOG_LEVEL_INFO,    __FILE__, __LINE__, __VA_ARGS__)
+#define GWARN(...)    Gadgetron::GadgetronLogger::instance()->log(Gadgetron::GADGETRON_LOG_LEVEL_WARNING, __FILE__, __LINE__, __VA_ARGS__)
+#define GERROR(...)   Gadgetron::GadgetronLogger::instance()->log(Gadgetron::GADGETRON_LOG_LEVEL_ERROR,   __FILE__, __LINE__, __VA_ARGS__)
+#define GVERBOSE(...) Gadgetron::GadgetronLogger::instance()->log(Gadgetron::GADGETRON_LOG_LEVEL_VERBOSE,   __FILE__, __LINE__, __VA_ARGS__)
+
+#define GEXCEPTION(err, message);	  \
+  {					  \
+    std::string gdb(message);		  \
+    gdb += std::string(" --> ");	  \
+    gdb += err.what();			  \
+    GDEBUG(gdb.c_str());		  \
+ }
+
+//Stream syntax log level functions
+#define GDEBUG_STREAM(message)				\
+  {							\
+    std::stringstream gadget_msg_dep_str;		\
+    gadget_msg_dep_str  << message << std::endl;	\
+    GDEBUG(gadget_msg_dep_str.str().c_str());		\
+  }
+
+#define GINFO_STREAM(message)				\
+  {							\
+    std::stringstream gadget_msg_dep_str;		\
+    gadget_msg_dep_str  << message << std::endl;	\
+    GINFO(gadget_msg_dep_str.str().c_str());		\
+  }
+
+#define GWARN_STREAM(message)					\
+  {								\
+    std::stringstream gadget_msg_dep_str;			\
+    gadget_msg_dep_str  << message << std::endl;		\
+    GWARN(gadget_msg_dep_str.str().c_str());			\
+  }
+
+#define GERROR_STREAM(message)					\
+  {								\
+    std::stringstream gadget_msg_dep_str;			\
+    gadget_msg_dep_str  << message << std::endl;		\
+    GERROR(gadget_msg_dep_str.str().c_str());			\
+  }
+
+#define GVERBOSE_STREAM(message)					\
+  {								\
+    std::stringstream gadget_msg_dep_str;			\
+    gadget_msg_dep_str  << message << std::endl;		\
+    GVERBOSE(gadget_msg_dep_str.str().c_str());			\
+  }
+     
+
+//Older debugging macros
+//TODO: Review and check that they are up to date
+#define GDEBUG_CONDITION_STREAM(con, message) { if ( con ) GDEBUG_STREAM(message) }
+#define GWARN_CONDITION_STREAM(con, message) { if ( con ) GWARN_STREAM(message) }
+     
+#define GADGET_THROW(msg) { GERROR_STREAM(msg); throw std::runtime_error(msg); }
+#define GADGET_CHECK_THROW(con) { if ( !(con) ) { GERROR_STREAM(#con); throw std::runtime_error(#con); } }
+
+#define GADGET_CATCH_THROW(con) { try { con; } catch(...) { GERROR_STREAM(#con); throw std::runtime_error(#con); } }
+
+#define GADGET_CHECK_RETURN(con, value) { if ( ! (con) ) { GERROR_STREAM("Returning '" << value << "' due to failed check: '" << #con << "'"); return (value); } }
+#define GADGET_CHECK_RETURN_FALSE(con) { if ( ! (con) ) { GERROR_STREAM("Returning false due to failed check: '" << #con << "'"); return false; } }
+
+#define GADGET_CHECK_EXCEPTION_RETURN(con, value) { try { con; } catch(...) { GERROR_STREAM("Returning '" << value << "' due to failed check: '" << #con << "'"); return (value); } }
+#define GADGET_CHECK_EXCEPTION_RETURN_FALSE(con) { try { con; } catch(...) { GERROR_STREAM("Returning false due to failed check: '" << #con << "'"); return false; } }
+
+#ifdef GADGET_DEBUG_MODE
+#define GADGET_DEBUG_CHECK_THROW(con) GADGET_CHECK_THROW(con)
+#define GADGET_DEBUG_CHECK_RETURN(con, value) GADGET_CHECK_RETURN(con, value)
+#define GADGET_DEBUG_CHECK_RETURN_FALSE(con) GADGET_CHECK_RETURN_FALSE(con)
+#else
+#define GADGET_DEBUG_CHECK_THROW(con)
+#define GADGET_DEBUG_CHECK_RETURN(con, value)
+#define GADGET_DEBUG_CHECK_RETURN_FALSE(con)
+#endif // GADGET_DEBUG_MODE
+
+
+#endif //GADGETRON_LOG_H
diff --git a/toolboxes/log/log_export.h b/toolboxes/log/log_export.h
new file mode 100644
index 0000000..89934f1
--- /dev/null
+++ b/toolboxes/log/log_export.h
@@ -0,0 +1,14 @@
+#ifndef LOG_EXPORT_H_
+#define LOG_EXPORT_H_
+
+#if defined (WIN32)
+   #if defined (__BUILD_GADGETRON_LOG__) || defined (gadgetron_toolbox_log__EXPORTS)
+      #define EXPORTGADGETRONLOG __declspec(dllexport)
+   #else
+      #define EXPORTGADGETRONLOG __declspec(dllimport)
+   #endif
+#else
+   #define EXPORTGADGETRONLOG
+#endif
+
+#endif /* LOG_EXPORT_H_ */
diff --git a/toolboxes/mri/CMakeLists.txt b/toolboxes/mri/CMakeLists.txt
new file mode 100644
index 0000000..d586d8d
--- /dev/null
+++ b/toolboxes/mri/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_subdirectory(pmri)
+
+if (ARMADILLO_FOUND)
+   add_subdirectory(epi)
+   if (CUDA_FOUND)
+      add_subdirectory(hyper)
+   endif(CUDA_FOUND)
+endif (ARMADILLO_FOUND)
+
diff --git a/toolboxes/mri/epi/CMakeLists.txt b/toolboxes/mri/epi/CMakeLists.txt
new file mode 100644
index 0000000..be67c6b
--- /dev/null
+++ b/toolboxes/mri/epi/CMakeLists.txt
@@ -0,0 +1,29 @@
+    if (WIN32)
+        ADD_DEFINITIONS(-D__BUILD_GADGETRON_EPI__)
+        link_directories(${Boost_LIBRARY_DIRS})
+    endif (WIN32)
+
+    add_library(gadgetron_toolbox_epi SHARED
+     EPIExport.h
+     EPIReconXObject.h
+     EPIReconXObjectFlat.h
+     EPIReconXObjectTrapezoid.h)
+
+    set_target_properties(gadgetron_toolbox_epi PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+    set_target_properties(gadgetron_toolbox_epi PROPERTIES LINKER_LANGUAGE CXX)
+
+    target_link_libraries(gadgetron_toolbox_epi gadgetron_toolbox_cpucore gadgetron_toolbox_cpucore_math gadgetron_toolbox_log ${ARMADILLO_LIBRARIES})
+
+    install (FILES 
+        EPIExport.h 
+        EPIReconXObject.h
+        EPIReconXObjectFlat.h
+        EPIReconXObjectTrapezoid.h
+        DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
+    # install(TARGETS epi DESTINATION lib)
+
+    #if (GTEST_FOUND)
+    #    add_subdirectory(ut)
+    #endif (GTEST_FOUND)
+
diff --git a/toolboxes/mri/epi/EPIExport.h b/toolboxes/mri/epi/EPIExport.h
new file mode 100644
index 0000000..d5127db
--- /dev/null
+++ b/toolboxes/mri/epi/EPIExport.h
@@ -0,0 +1,16 @@
+/** \file       EPIExport.h
+    \brief      Implement windows export/import for EPI toolbox
+    \author     Souheil Inati
+*/
+
+#pragma once
+
+#if defined (WIN32)
+    #if defined (__BUILD_GADGETRON_EPI__)
+        #define EXPORTEPI __declspec(dllexport)
+    #else
+        #define EXPORTEPI __declspec(dllimport)
+    #endif
+#else
+    #define EXPORTEPI
+#endif
diff --git a/toolboxes/mri/epi/EPIReconXObject.h b/toolboxes/mri/epi/EPIReconXObject.h
new file mode 100644
index 0000000..c2b4d8e
--- /dev/null
+++ b/toolboxes/mri/epi/EPIReconXObject.h
@@ -0,0 +1,73 @@
+/** \file   EPIReconXObject.h
+    \brief  Define the symbols and implement functionality for EPI X reconstruction operator
+    \author Souheil Inati
+*/
+
+#pragma once
+
+#include "EPIExport.h"
+
+#include "ismrmrd/ismrmrd.h"
+#include "hoNDArray.h"
+
+namespace Gadgetron { namespace EPI {
+
+enum EPIType
+{
+  FLAT,
+  TRAPEZOID,
+  SINUSOID,
+  ARBITRARY
+};
+
+enum EPIReceiverPhaseType
+{
+  NONE,
+  EVEN,
+  FULL
+};
+
+template <typename T> class EPIReconXObject 
+{
+ public:
+  EPIReconXObject();
+  virtual ~EPIReconXObject();
+
+  hoNDArray <float> getTrajectoryPos();
+  hoNDArray <float> getTrajectoryNeg();
+
+  hoNDArray <float> filterPos_;
+  hoNDArray <float> filterNeg_;
+  float slicePosition[3];
+
+  virtual int computeTrajectory()=0;
+
+  virtual int apply(ISMRMRD::AcquisitionHeader &hdr_in,  hoNDArray <T> &data_in, 
+		    ISMRMRD::AcquisitionHeader &hdr_out, hoNDArray <T> &data_out)=0;
+  EPIReceiverPhaseType rcvType_;
+
+ protected:
+  hoNDArray <float> trajectoryPos_;
+  hoNDArray <float> trajectoryNeg_;
+
+};
+
+template <typename T> EPIReconXObject<T>::EPIReconXObject()
+{
+}
+
+template <typename T> EPIReconXObject<T>::~EPIReconXObject()
+{
+}
+
+template <typename T> hoNDArray<float> EPIReconXObject<T>::getTrajectoryPos()
+{
+  return trajectoryPos_;
+}
+
+template <typename T> hoNDArray<float> EPIReconXObject<T>::getTrajectoryNeg()
+{
+  return trajectoryNeg_;
+}
+
+}}
diff --git a/toolboxes/mri/epi/EPIReconXObjectFlat.h b/toolboxes/mri/epi/EPIReconXObjectFlat.h
new file mode 100644
index 0000000..5723979
--- /dev/null
+++ b/toolboxes/mri/epi/EPIReconXObjectFlat.h
@@ -0,0 +1,189 @@
+/** \file   EPIReconXObjectFlat.h
+    \brief  Implement functionality for EPI X reconstruction operator for Flat type (non-rampsampled)
+    \author Souheil Inati
+*/
+
+#pragma once
+
+#include "EPIExport.h"
+#include "EPIReconXObject.h"
+#include "hoArmadillo.h"
+#include "hoNDArray_elemwise.h"
+#include "gadgetronmath.h"
+#include <complex>
+
+namespace Gadgetron { namespace EPI {
+
+template <typename T> class EPIReconXObjectFlat : public EPIReconXObject<T>
+{
+ public:
+  EPIReconXObjectFlat();
+  virtual ~EPIReconXObjectFlat();
+
+  virtual int computeTrajectory();
+
+  virtual int apply(ISMRMRD::AcquisitionHeader &hdr_in, hoNDArray <T> &data_in, 
+		    ISMRMRD::AcquisitionHeader &hdr_out, hoNDArray <T> &data_out);
+
+  using EPIReconXObject<T>::filterPos_;
+  using EPIReconXObject<T>::filterNeg_;
+  using EPIReconXObject<T>::slicePosition;
+  using EPIReconXObject<T>::rcvType_;
+
+  int   numSamples_;
+  float dwellTime_;
+  int   encodeNx_;
+  float encodeFOV_;
+  int   reconNx_;
+  float reconFOV_;
+
+ protected:
+  using EPIReconXObject<T>::trajectoryPos_;
+  using EPIReconXObject<T>::trajectoryNeg_;
+
+  hoNDArray <T> Mpos_;
+  hoNDArray <T> Mneg_;
+  bool operatorComputed_;
+
+};
+
+template <typename T> EPIReconXObjectFlat<T>::EPIReconXObjectFlat()
+{
+  rcvType_ = EVEN;
+  numSamples_ = 0;
+  dwellTime_ = 0.0;
+  encodeNx_ = 0;
+  reconNx_ = 0;
+  encodeFOV_ = 0.0;
+  reconFOV_ = 0.0;
+  operatorComputed_ = false;
+}
+
+template <typename T> EPIReconXObjectFlat<T>::~EPIReconXObjectFlat()
+{
+}
+
+template <typename T> int EPIReconXObjectFlat<T>::computeTrajectory()
+{
+
+  // Initialize the k-space trajectory arrays
+  trajectoryPos_.create(numSamples_);
+  Gadgetron::clear(trajectoryPos_);
+  trajectoryNeg_.create(numSamples_);
+  Gadgetron::clear(trajectoryNeg_);
+
+  // Temporary trajectory for a symmetric readout
+  // first calculate the integral with G = 1;
+  int nK = numSamples_;
+  hoNDArray <float> k(nK);
+  float t;
+  int n;
+
+  // Some timings
+  float readTime = dwellTime_ * numSamples_;
+  float readArea = readTime;
+
+  // Prephase is set so that k=0 is halfway through the readout time
+  float prePhaseArea = 0.5 * readArea;
+
+  // The scale is set so that the read out area corresponds to the number of encoded points
+  float scale = encodeNx_ /readArea;
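+  // equivalently (a restatement of the two loops below, not an extra computation):
+  //   trajectoryPos_[n] = encodeNx_/readTime * ((n+1)*dwellTime_ - 0.5*readTime),
+  // which sweeps roughly from -encodeNx_/2 to +encodeNx_/2 in units of k-space samples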
+
+  for (n=0; n<nK; n++)
+  {
+    t = (n+1.0)*dwellTime_;  // end of the dwell time
+    k[n] = t;
+  }
+
+  // Fill the positive and negative trajectories
+  for (n=0; n<numSamples_; n++)
+  {
+    trajectoryPos_[n] = scale * (k[n] - prePhaseArea);
+    trajectoryNeg_[n] = scale * (-1.0*k[n] + readArea - prePhaseArea);
+  }
+
+  // reset the operatorComputed_ flag
+  operatorComputed_ = false;
+
+  return(0);
+}
+
+
+template <typename T> int EPIReconXObjectFlat<T>::apply(ISMRMRD::AcquisitionHeader &hdr_in, hoNDArray <T> &data_in, 
+		    ISMRMRD::AcquisitionHeader &hdr_out, hoNDArray <T> &data_out)
+{
+  if (!operatorComputed_) {
+    // Compute the reconstruction operator
+    int Km = floor(encodeNx_ / 2.0);
+    int Ne = 2*Km + 1;
+    int p,q; // counters
+
+    // resize the reconstruction operator
+    Mpos_.create(reconNx_,numSamples_);
+    Mneg_.create(reconNx_,numSamples_);
+
+    // evenly spaced k-space locations
+    arma::vec keven = arma::linspace<arma::vec>(-Km, Km, Ne);
+
+    // image domain locations [-0.5,...,0.5)
+    arma::vec x = arma::linspace<arma::vec>(-0.5,(reconNx_-1.)/(2.*reconNx_),reconNx_);
+
+    // DFT operator
+    // Going from k space to image space, we use the IFFT sign convention
+    arma::cx_mat F(reconNx_, Ne);
+    double fftscale = 1.0 / std::sqrt((double)Ne);
+    for (p=0; p<reconNx_; p++) {
+      for (q=0; q<Ne; q++) {
+	F(p,q) = fftscale * std::exp(std::complex<double>(0.0,1.0*2*M_PI*keven(q)*x(p)));
+      }
+    }
+
+    // forward operators
+    arma::mat Qp(numSamples_, Ne);
+    arma::mat Qn(numSamples_, Ne);
+    for (p=0; p<numSamples_; p++) {
+      //GDEBUG_STREAM(trajectoryPos_(p) << "    " << trajectoryNeg_(p) << std::endl);
+      for (q=0; q<Ne; q++) {
+	Qp(p,q) = sinc(trajectoryPos_(p)-keven(q));
+	Qn(p,q) = sinc(trajectoryNeg_(p)-keven(q));
+      }
+    }
+
+    // recon operators
+    arma::cx_mat Mp(reconNx_,numSamples_);
+    arma::cx_mat Mn(reconNx_,numSamples_);
+    Mp = F * arma::pinv(Qp);
+    Mn = F * arma::pinv(Qn);
+    for (p=0; p<reconNx_; p++) {
+      for (q=0; q<numSamples_; q++) {
+        Mpos_(p,q) = Mp(p,q);
+        Mneg_(p,q) = Mn(p,q);
+      }
+    }
+
+    // set the operator computed flag
+    operatorComputed_ = true;
+  }
+
+  // convert to armadillo representation of matrices and vectors
+  arma::Mat<typename stdType<T>::Type> adata_in = as_arma_matrix(&data_in);
+  arma::Mat<typename stdType<T>::Type> adata_out = as_arma_matrix(&data_out);
+
+  // Apply it
+  if (hdr_in.isFlagSet(ISMRMRD::ISMRMRD_ACQ_IS_REVERSE)) {
+    // Negative readout
+    adata_out = as_arma_matrix(&Mneg_) * adata_in;
+  } else {
+    // Forward readout
+    adata_out = as_arma_matrix(&Mpos_) * adata_in;
+  }
+
+  // Copy the input header to the output header and set the size and the center sample
+  hdr_out = hdr_in;
+  hdr_out.number_of_samples = reconNx_;
+  hdr_out.center_sample = reconNx_/2;
+  
+  return 0;
+}
+
+}}
diff --git a/toolboxes/mri/epi/EPIReconXObjectTrapezoid.h b/toolboxes/mri/epi/EPIReconXObjectTrapezoid.h
new file mode 100644
index 0000000..5c6f117
--- /dev/null
+++ b/toolboxes/mri/epi/EPIReconXObjectTrapezoid.h
@@ -0,0 +1,240 @@
+/** \file   EPIReconXObjectTrapezoid.h
+    \brief  Implement functionality for EPI X reconstruction operator for Trapezoidal type
+    \author Souheil Inati
+*/
+
+#pragma once
+
+#include "EPIExport.h"
+#include "EPIReconXObject.h"
+#include "hoArmadillo.h"
+#include "hoNDArray_elemwise.h"
+#include "gadgetronmath.h"
+#include <complex>
+
+namespace Gadgetron { namespace EPI {
+
+template <typename T> class EPIReconXObjectTrapezoid : public EPIReconXObject<T>
+{
+ public:
+  EPIReconXObjectTrapezoid();
+  virtual ~EPIReconXObjectTrapezoid();
+
+  virtual int computeTrajectory();
+
+  virtual int apply(ISMRMRD::AcquisitionHeader &hdr_in, hoNDArray <T> &data_in, 
+		    ISMRMRD::AcquisitionHeader &hdr_out, hoNDArray <T> &data_out);
+
+  using EPIReconXObject<T>::filterPos_;
+  using EPIReconXObject<T>::filterNeg_;
+  using EPIReconXObject<T>::slicePosition;
+  using EPIReconXObject<T>::rcvType_;
+
+  bool  balanced_;
+  float rampUpTime_;
+  float rampDownTime_;
+  float flatTopTime_;
+  float acqDelayTime_;
+  int   numSamples_;
+  float dwellTime_;
+  int   encodeNx_;
+  float encodeFOV_;
+  int   reconNx_;
+  float reconFOV_;
+
+ protected:
+  using EPIReconXObject<T>::trajectoryPos_;
+  using EPIReconXObject<T>::trajectoryNeg_;
+
+  hoNDArray <T> Mpos_;
+  hoNDArray <T> Mneg_;
+  bool operatorComputed_;
+
+};
+
+template <typename T> EPIReconXObjectTrapezoid<T>::EPIReconXObjectTrapezoid()
+{
+  rcvType_ = EVEN;
+  balanced_ = true;
+  rampUpTime_ = 0.0;
+  rampDownTime_ = 0.0;
+  flatTopTime_ = 0.0;
+  acqDelayTime_ = 0.0;
+  numSamples_ = 0;
+  dwellTime_ = 0.0;
+  encodeNx_ = 0;
+  reconNx_ = 0;
+  encodeFOV_ = 0.0;
+  reconFOV_ = 0.0;
+  operatorComputed_ = false;
+}
+
+template <typename T> EPIReconXObjectTrapezoid<T>::~EPIReconXObjectTrapezoid()
+{
+}
+
+template <typename T> int EPIReconXObjectTrapezoid<T>::computeTrajectory()
+{
+
+  // Initialize the k-space trajectory arrays
+  trajectoryPos_.create(numSamples_);
+  Gadgetron::clear(trajectoryPos_);
+  trajectoryNeg_.create(numSamples_);
+  Gadgetron::clear(trajectoryNeg_);
+
+  // Temporary trajectory for a symmetric readout
+  // first calculate the integral with G = 1;
+  int nK = numSamples_;
+  hoNDArray <float> k(nK);
+  float t;
+  int n;
+
+  //GDEBUG_STREAM("Dwell = " << dwellTime_ << "    acqDelayTime = " << acqDelayTime_ << std::endl);
+  //GDEBUG_STREAM("rampUpTime = " << rampUpTime_ << "    flatTopTime = " << flatTopTime_ << "    rampDownTime = " << rampDownTime_ << std::endl);
+  
+  // Some timings
+  float totTime = rampUpTime_ + flatTopTime_ + rampDownTime_;
+  float readTime = dwellTime_ * numSamples_;
+
+  // Fix the acqDelayTime for balanced acquisitions
+  if (balanced_) {
+    acqDelayTime_ = 0.5 * (totTime - readTime);
+  }
+
+  // Some Areas
+  float totArea = 0.5*rampUpTime_ + flatTopTime_ + 0.5*rampDownTime_;
+  float readArea =  0.5*rampUpTime_ + flatTopTime_ + 0.5*rampDownTime_;
+  if (rampUpTime_ > 0.0) {
+      readArea -= 0.5*(acqDelayTime_)*(acqDelayTime_)/rampUpTime_;
+  }
+  if (rampDownTime_ > 0.0) {
+      readArea -= 0.5*(totTime - (acqDelayTime_+readTime))*(totTime - (acqDelayTime_+readTime))/rampDownTime_;
+  }
+  
+  // Prephase is set so that k=0 is halfway through the readout time
+  float prePhaseArea = 0.5 * totArea;
+
+  // The scale is set so that the read out area corresponds to the number of encoded points
+  float scale = encodeNx_ /readArea;
+
+  for (n=0; n<nK; n++)
+  {
+    t = (n+1.0)*dwellTime_ + acqDelayTime_;  // end of the dwell time
+    if (t <= rampUpTime_) {
+      // on the ramp up
+      k[n] = 0.5 / rampUpTime_ * t*t;
+    }
+    else if ((t > rampUpTime_) && (t <= (rampUpTime_+flatTopTime_))) {
+      // on the flat top
+      k[n] = 0.5*rampUpTime_ + (t - rampUpTime_);
+    }
+    else {
+      // on the ramp down
+      float v = (rampUpTime_+flatTopTime_+rampDownTime_-t);
+      k[n] = 0.5*rampUpTime_ + flatTopTime_ + 0.5*rampDownTime_ - 0.5/rampDownTime_*v*v;
+    }
+    //GDEBUG_STREAM(n << ":  " << t << "  " << k[n] << " " << std::endl);
+  }
+
+  // Fill the positive and negative trajectories
+  for (n=0; n<numSamples_; n++)
+  {
+    trajectoryPos_[n] = scale * (k[n] - prePhaseArea);
+    trajectoryNeg_[n] = scale * (-1.0*k[n] + totArea - prePhaseArea);
+    //GDEBUG_STREAM(n << ":  " << trajectoryPos_[n] << "  " << trajectoryNeg_[n] << std::endl);
+  }
+
+  // reset the operatorComputed_ flag
+  operatorComputed_ = false;
+
+  return(0);
+}
+
+
+template <typename T> int EPIReconXObjectTrapezoid<T>::apply(ISMRMRD::AcquisitionHeader &hdr_in, hoNDArray <T> &data_in, 
+		    ISMRMRD::AcquisitionHeader &hdr_out, hoNDArray <T> &data_out)
+{
+  if (!operatorComputed_) {
+    // Compute the reconstruction operator
+    int Km = floor(encodeNx_ / 2.0);
+    int Ne = 2*Km + 1;
+    int p,q; // counters
+
+    // resize the reconstruction operator
+    Mpos_.create(reconNx_,numSamples_);
+    Mneg_.create(reconNx_,numSamples_);
+
+    // evenly spaced k-space locations
+    arma::vec keven = arma::linspace<arma::vec>(-Km, Km, Ne);
+    //keven.print("keven =");
+
+    // image domain locations [-0.5,...,0.5)
+    arma::vec x = arma::linspace<arma::vec>(-0.5,(reconNx_-1.)/(2.*reconNx_),reconNx_);
+    //x.print("x =");
+
+    // DFT operator
+    // Going from k space to image space, we use the IFFT sign convention
+    arma::cx_mat F(reconNx_, Ne);
+    double fftscale = 1.0 / std::sqrt((double)Ne);
+    for (p=0; p<reconNx_; p++) {
+      for (q=0; q<Ne; q++) {
+	F(p,q) = fftscale * std::exp(std::complex<double>(0.0,1.0*2*M_PI*keven(q)*x(p)));
+      }
+    }
+    //F.print("F =");
+
+    // forward operators
+    arma::mat Qp(numSamples_, Ne);
+    arma::mat Qn(numSamples_, Ne);
+    for (p=0; p<numSamples_; p++) {
+      //GDEBUG_STREAM(trajectoryPos_(p) << "    " << trajectoryNeg_(p) << std::endl);
+      for (q=0; q<Ne; q++) {
+	Qp(p,q) = sinc(trajectoryPos_(p)-keven(q));
+	Qn(p,q) = sinc(trajectoryNeg_(p)-keven(q));
+      }
+    }
+
+    //Qp.print("Qp =");
+    //Qn.print("Qn =");
+
+    // recon operators
+    arma::cx_mat Mp(reconNx_,numSamples_);
+    arma::cx_mat Mn(reconNx_,numSamples_);
+    Mp = F * arma::pinv(Qp);
+    Mn = F * arma::pinv(Qn);
+    for (p=0; p<reconNx_; p++) {
+      for (q=0; q<numSamples_; q++) {
+        Mpos_(p,q) = Mp(p,q);
+        Mneg_(p,q) = Mn(p,q);
+      }
+    }
+    
+    //Mp.print("Mp =");
+    //Mn.print("Mn =");
+
+    // set the operator computed flag
+    operatorComputed_ = true;
+  }
+
+  // convert to armadillo representation of matrices and vectors
+  arma::Mat<typename stdType<T>::Type> adata_in = as_arma_matrix(&data_in);
+  arma::Mat<typename stdType<T>::Type> adata_out = as_arma_matrix(&data_out);
+
+  // Apply it
+  if (hdr_in.isFlagSet(ISMRMRD::ISMRMRD_ACQ_IS_REVERSE)) {
+    // Negative readout
+    adata_out = as_arma_matrix(&Mneg_) * adata_in;
+  } else {
+    // Forward readout
+    adata_out = as_arma_matrix(&Mpos_) * adata_in;
+  }
+
+  // Copy the input header to the output header and set the size and the center sample
+  hdr_out = hdr_in;
+  hdr_out.number_of_samples = reconNx_;
+  hdr_out.center_sample = reconNx_/2;
+  
+  return 0;
+}
+
+}}
diff --git a/toolboxes/mri/hyper/CMRTOperator.cpp b/toolboxes/mri/hyper/CMRTOperator.cpp
new file mode 100644
index 0000000..f261212
--- /dev/null
+++ b/toolboxes/mri/hyper/CMRTOperator.cpp
@@ -0,0 +1,21 @@
+/*
+ * CMRTOperator.cpp
+ *
+ *  Created on: Apr 15, 2014
+ *      Author: u051747
+ */
+
+#include "CMRTOperator.h"
+
+namespace Gadgetron {
+
+CMRTOperator::CMRTOperator() {
+	// TODO Auto-generated constructor stub
+
+}
+
+CMRTOperator::~CMRTOperator() {
+	// TODO Auto-generated destructor stub
+}
+
+} /* namespace Gadgetron */
diff --git a/toolboxes/mri/hyper/CMRTOperator.h b/toolboxes/mri/hyper/CMRTOperator.h
new file mode 100644
index 0000000..a1cfffc
--- /dev/null
+++ b/toolboxes/mri/hyper/CMRTOperator.h
@@ -0,0 +1,147 @@
+/*
+ * CMRTOperator.h
+ *
+ *  Created on: Apr 15, 2014
+ *      Author: u051747
+ */
+#pragma once
+
+#include "linearOperator.h"
+#include "cuNDArray.h"
+#include "radial_utilities.h"
+#include "cuNFFTOperator.h"
+#include "cuNDFFT.h"
+#include "vector_td_operators.h"
+
+#include "hoNDArray_fileio.h"
+#include "cudaDeviceManager.h"
+
+#include <boost/make_shared.hpp>
+
+namespace Gadgetron {
+
+template<class REAL> class CMRTOperator: public linearOperator<cuNDArray< complext<REAL > > > {
+	typedef complext<REAL> COMPLEX;
+public:
+	CMRTOperator(): W_(5.5), alpha_(2),readout_oversampling_factor_(1){};
+	virtual ~CMRTOperator(){};
+
+	virtual void mult_MH(cuNDArray<COMPLEX>* in, cuNDArray<COMPLEX>* out, bool accumulate = false){
+		cuNDArray<COMPLEX> projections(&projection_dims);
+		E_.mult_MH(in,&projections);
+		std::vector<size_t> permute_dims;
+		permute_dims.push_back(0);
+		permute_dims.push_back(2);
+		permute_dims.push_back(1);
+		permute_dims.push_back(3);
+		projections = *permute(&projections,&permute_dims);
+		cuNDFFT<REAL>::instance()->fft(&projections,0u);
+
+		COMPLEX* proj_ptr = projections.get_data_ptr();
+		std::vector<size_t> proj_dim3d(projection_dims.begin(),projection_dims.end()-1);
+		std::vector<size_t> out_dim3d({out->get_size(0),out->get_size(1),out->get_size(2)});
+		COMPLEX* out_ptr = out->get_data_ptr();
+		for (size_t t = 0; t< out->get_size(3); t++){
+			cuNDArray<COMPLEX> proj_view(proj_dim3d,proj_ptr);
+			cuNDArray<COMPLEX> out_view(out_dim3d,out_ptr);
+			backprojections[t]->mult_MH(&proj_view,&out_view,accumulate);
+			proj_ptr += proj_view.get_number_of_elements();
+			out_ptr += out_view.get_number_of_elements();
+		}
+	}
+
+
+	virtual void mult_M(cuNDArray<COMPLEX>* in, cuNDArray<COMPLEX>* out, bool accumulate = false){
+
+		cuNDArray<COMPLEX> projections(&projection_dims_permuted);
+
+		COMPLEX* proj_ptr = projections.get_data_ptr();
+		std::vector<size_t> proj_dim3d(projection_dims.begin(),projection_dims.end()-1);
+		std::vector<size_t> in_dim3d({in->get_size(0),in->get_size(1),in->get_size(2)});
+		COMPLEX* in_ptr = in->get_data_ptr();
+		for (size_t t = 0; t< in->get_size(3); t++){
+			cuNDArray<COMPLEX> proj_view(proj_dim3d,proj_ptr);
+			cuNDArray<COMPLEX> in_view(in_dim3d,in_ptr);
+			backprojections[t]->mult_M(&in_view,&proj_view,accumulate);
+			proj_ptr += proj_view.get_number_of_elements();
+			in_ptr += in_view.get_number_of_elements();
+		}
+
+
+		cuNDFFT<REAL>::instance()->ifft(&projections,0u);
+		std::vector<size_t> permute_dims;
+		permute_dims.push_back(0);
+		permute_dims.push_back(2);
+		permute_dims.push_back(1);
+		permute_dims.push_back(3);
+		projections = *permute(&projections,&permute_dims);
+
+
+		E_.mult_M(&projections,out,accumulate);
+	}
+
+
+	void setup( boost::shared_ptr<cuNDArray<vector_td<REAL,2> > > traj, std::vector<size_t>& dims,std::vector<size_t>& projection_dims, unsigned int offset, bool golden_ratio ){
+
+		E_.setup( uint64d2(projection_dims[0], projection_dims[1]),
+				uint64d2(projection_dims[0], projection_dims[1])*size_t(2), // !! <-- alpha_
+				W_ );
+		E_.preprocess(traj.get());
+
+		this->projection_dims = projection_dims;
+		projection_dims_permuted = projection_dims;
+		projection_dims_permuted[1] = projection_dims[2];
+		projection_dims_permuted[2] = projection_dims[1];
+
+		size_t ntimeframes = projection_dims.size() > 3 ? projection_dims[3] : 1;
+		/*
+		boost::shared_ptr< cuNDArray<REAL> > b_dcw = compute_radial_dcw_fixed_angle_2d
+						( dims[0], projection_dims[2], alpha_, 1.0f/readout_oversampling_factor_ );
+		sqrt_inplace(b_dcw.get());
+
+		//backprojection.set_dcw(b_dcw);
+		 */
+
+		size_t time_offset =offset;
+		backprojections.clear();
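+		// one 2D NFFT backprojection operator is built per time frame; with golden-ratio
+		// sampling, time_offset keeps advancing so each frame continues the angle sequence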
+		for (size_t t = 0; t < ntimeframes; t++){
+			auto backprojection = boost::make_shared<cuNFFTOperator<REAL,2>>();
+			backprojection->setup( uint64d2(dims[0], dims[1]),
+					uint64d2(dims[0], dims[1])*size_t(2), // !! <-- alpha_
+					W_ );
+
+			boost::shared_ptr< cuNDArray<floatd2> > traj2;
+
+			if (golden_ratio){
+				traj2= compute_radial_trajectory_golden_ratio_2d<REAL>
+				( projection_dims[0], projection_dims[2],1, time_offset, GR_ORIGINAL );
+
+			}
+			else{
+				traj2= compute_radial_trajectory_fixed_angle_2d<REAL>
+				( projection_dims[0], projection_dims[2], 1/*number of frames*/ );
+			}
+
+
+			backprojection->preprocess(traj2.get());
+			backprojections.push_back(backprojection);
+			time_offset += projection_dims[2];
+		}
+
+	}
+
+
+protected:
+
+	cuNFFTOperator<REAL,2> E_; //cuNFFTOperator reconstructing the 2d projections
+	std::vector< boost::shared_ptr< cuNFFTOperator<REAL,2>>> backprojections; //cuNFFTOperator doing the equivalent of backprojection
+
+	std::vector<size_t> projection_dims;
+	std::vector<size_t> projection_dims_permuted;
+	REAL W_;
+	REAL readout_oversampling_factor_;
+	REAL alpha_;
+};
+
+} /* namespace Gadgetron */
+
diff --git a/toolboxes/mri/hyper/CMakeLists.txt b/toolboxes/mri/hyper/CMakeLists.txt
new file mode 100644
index 0000000..e010b5d
--- /dev/null
+++ b/toolboxes/mri/hyper/CMakeLists.txt
@@ -0,0 +1,44 @@
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_TOOLBOX_HYPER__)
+endif (WIN32)
+
+if(WIN32)
+link_directories(${Boost_LIBRARY_DIRS})
+endif(WIN32)
+
+include_directories(
+  ${CUDA_INCLUDE_DIRS}
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/fft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  ${CUDA_INCLUDE_DIRS}
+  ${Boost_INCLUDE_DIR}
+  )
+
+cuda_add_library(gadgetron_toolbox_hyper SHARED 
+    CSIOperator.cpp
+    CSI_utils.cu
+	CSIOperator.h
+	CSI_utils.h
+	gadgetron_toolbox_hyper_export.h
+  )
+
+set_target_properties(gadgetron_toolbox_hyper PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(gadgetron_toolbox_hyper 
+  gadgetron_toolbox_gpucore 
+  gadgetron_toolbox_gpunfft
+  ${Boost_LIBRARIES}
+  ${CUDA_LIBRARIES}
+  ${CUDA_CUBLAS_LIBRARIES} 
+  )
+
+install(TARGETS gadgetron_toolbox_hyper DESTINATION lib COMPONENT main)
+
+install(FILES 
+    CSIOperator.h
+    CSI_utils.h
+  DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
diff --git a/toolboxes/mri/hyper/CSIOperator.cpp b/toolboxes/mri/hyper/CSIOperator.cpp
new file mode 100644
index 0000000..4796d1d
--- /dev/null
+++ b/toolboxes/mri/hyper/CSIOperator.cpp
@@ -0,0 +1,60 @@
+/*
+ * CSIOperator.cpp
+ *
+ *  Created on: Nov 10, 2014
+ *      Author: dch
+ */
+
+#include "CSIOperator.h"
+#include "cuNDFFT.h"
+#include "cuNDArray_math.h"
+#include "CSI_utils.h"
+
+namespace Gadgetron {
+
+
+template<class T> CSIOperator<T>::CSIOperator() {
+	// TODO Auto-generated constructor stub
+}
+
+template<class T> CSIOperator<T>::CSIOperator(T dtt, T dte ) : dtt_(dtt), dte_(dte) {
+	// TODO Auto-generated constructor stub
+
+}
+
+template<class T> CSIOperator<T>::~CSIOperator() {
+	// TODO Auto-generated destructor stub
+}
+
+
+template<class T> void CSIOperator<T>::mult_MH(cuNDArray<complext<T>> *in , cuNDArray<complext<T>> * out, bool accumulate){
+
+	std::vector<size_t> kdim = *in->get_dimensions();
+	kdim[1] = frequencies.size(); // set dimension 1 to the number of frequencies rather than the number of echoes
+	cuNDArray<complext<T>> tmp(kdim);
+	//senseOp->mult_MH(in,out,accumulate);
+
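+	// CSI_dft sums the acquired echoes into the requested frequency bins (a DFT over
+	// the echo dimension, with a per-sample readout-time phase) before the adjoint
+	// coil/SENSE operator maps the result back to image space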
+	CSI_dft(&tmp,in,&frequencies,dtt_,dte_);
+	senseOp->mult_MH(&tmp,out,accumulate);
+	//cuNDFFT<float>::instance()->fft(&tmp,0u); //FFT along the TE dimension
+}
+
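+// Forward model (sketch of the data flow): the wrapped SENSE operator maps the
+// image to per-frequency k-space, which CSI_dftH then collapses onto the
+// echo/time-domain samples; with accumulate set, the result is added to out.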
+template<class T> void CSIOperator<T>::mult_M(cuNDArray<complext<T>> *in , cuNDArray<complext<T>> * out, bool accumulate){
+	cuNDArray<complext<T>>* out_tmp = out;
+	if (accumulate) out_tmp = new cuNDArray<complext<T>>(out->get_dimensions());
+	std::vector<size_t> kdim = *out->get_dimensions();
+	kdim[1] =frequencies.size();
+	cuNDArray<complext<T>> tmp(kdim);
+	senseOp->mult_M(in,&tmp,accumulate);
+
+	CSI_dftH(&tmp,out_tmp,&frequencies,dtt_,dte_);
+	if (accumulate){
+		*out += *out_tmp;
+		delete out_tmp;
+	}
+}
+
+
+template class EXPORTHYPER CSIOperator<float>;
+
+} /* namespace Gadgetron */
diff --git a/toolboxes/mri/hyper/CSIOperator.h b/toolboxes/mri/hyper/CSIOperator.h
new file mode 100644
index 0000000..5b4c3da
--- /dev/null
+++ b/toolboxes/mri/hyper/CSIOperator.h
@@ -0,0 +1,42 @@
+/*
+ * CSIOperator.h
+ *
+ *  Created on: Nov 10, 2014
+ *      Author: dch
+ */
+
+#ifndef CSIOPERATOR_H_
+#define CSIOPERATOR_H_
+
+#include "linearOperator.h"
+#include "cuNDArray.h"
+#include <thrust/device_vector.h>
+
+namespace Gadgetron {
+
+template <class T> class CSIOperator: public Gadgetron::linearOperator<cuNDArray<complext<T>>> {
+public:
+	CSIOperator();
+	CSIOperator(T dtt, T dte);
+	virtual ~CSIOperator();
+	virtual void mult_M(cuNDArray<complext<T>>* in, cuNDArray<complext<T>>* out,bool accumulate );
+	virtual void mult_MH(cuNDArray<complext<T>>* in, cuNDArray<complext<T>>* out,bool accumulate );
+
+	void set_senseOp(boost::shared_ptr<linearOperator<cuNDArray<complext<T>>>> op){ senseOp = op;}
+	void set_frequencies(std::vector<T> & freq) { frequencies=thrust::device_vector<T>(freq.begin(),freq.end());
+	}
+
+
+	T get_echotime(){ return dte_;}
+	T get_pointtime(){return dtt_;}
+
+protected:
+	boost::shared_ptr<linearOperator<cuNDArray<complext<T>>>> senseOp;
+	T dte_; //Time between echoes
+	T dtt_; //Time between k-space points
+	thrust::device_vector<T> frequencies;
+};
+
+} /* namespace Gadgetron */
+
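+/*
+ * Minimal usage sketch (illustration only; `sense_op`, `image_dims`, `data_dims`
+ * and the frequency values are assumptions supplied by the caller):
+ *
+ *   using namespace Gadgetron;
+ *
+ *   CSIOperator<float> csi(dtt, dte);               // sample spacing and echo spacing
+ *   csi.set_senseOp(sense_op);                      // boost::shared_ptr<linearOperator<cuNDArray<complext<float>>>>
+ *   std::vector<float> freqs = {0.0f, 130.0f, 260.0f};
+ *   csi.set_frequencies(freqs);
+ *
+ *   cuNDArray<complext<float>> image(image_dims);   // image-space input
+ *   cuNDArray<complext<float>> data(data_dims);     // acquired-data-shaped output
+ *   csi.mult_M(&image, &data, false);               // forward: image -> acquired samples
+ *   csi.mult_MH(&data, &image, false);              // adjoint
+ */
+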
+#endif /* CSIOPERATOR_H_ */
diff --git a/toolboxes/mri/hyper/CSI_utils.cu b/toolboxes/mri/hyper/CSI_utils.cu
new file mode 100644
index 0000000..e4b1ea9
--- /dev/null
+++ b/toolboxes/mri/hyper/CSI_utils.cu
@@ -0,0 +1,97 @@
+#include "CSI_utils.h"
+#include <algorithm>
+#include "cudaDeviceManager.h"
+#include "complext.h"
+#include <math_constants.h>
+#include <stdio.h>
+using namespace Gadgetron;
+
+
+template<class T> static __global__ void dft_kernel(complext<T>* __restrict__ kspace, const complext<T>* __restrict__ tspace, T* __restrict__ frequencies, unsigned int spiral_length, unsigned int echoes, unsigned int nfreqs,T dte, T dtt){
+	const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+	if (idx < spiral_length*nfreqs ){
+		complext<T> result = 0;
+		T frequency = frequencies[idx/spiral_length];
+		T time_offset = dtt*(idx%spiral_length);
+		unsigned int kpoint = idx%spiral_length;
+		for (unsigned int i =0; i < echoes; i++){
+			result += exp(complext<T>(0,-frequency*2*CUDART_PI_F*(dte*i+time_offset)))*tspace[kpoint+i*spiral_length];
+		}
+		kspace[idx] = result;
+	}
+}
+
+template<class T> static __global__ void dftH_kernel(const complext<T>* __restrict__ kspace, complext<T>* __restrict__ tspace, T* __restrict__ frequencies, unsigned int spiral_length, unsigned int echoes, unsigned int nfreqs,T dte, T dtt){
+	const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+	if (idx < spiral_length*echoes ){
+		complext<T> result = 0;
+		unsigned int kpoint = idx%spiral_length;
+		T timeshift = dte*(idx/spiral_length)+dtt*kpoint;
+		for (unsigned int i =0; i < nfreqs; i++){
+			result += exp(complext<T>(0,frequencies[i]*2*CUDART_PI_F*timeshift))*kspace[kpoint+i*spiral_length];
+		}
+		tspace[idx] = result;
+	}
+}
+
+
+
+template<class T>
+void Gadgetron::CSI_dft(cuNDArray<complext<T> >* kspace,
+		cuNDArray<complext<T> >* tspace, thrust::device_vector<T>* frequencies, T dtt, T dte) {
+
+	size_t elements = kspace->get_size(0)*kspace->get_size(1);
+	size_t batches = kspace->get_number_of_elements()/elements;
+	size_t t_elements = tspace->get_size(0)*tspace->get_size(1);
+	for (int i = 0; i< batches; i++){
+		int threadsPerBlock = std::min<int>(elements,cudaDeviceManager::Instance()->max_blockdim());
+		dim3 dimBlock(threadsPerBlock);
+		int totalBlocksPerGrid = (elements+threadsPerBlock-1)/threadsPerBlock;
+		dim3 dimGrid(totalBlocksPerGrid);
+
+		if (totalBlocksPerGrid > cudaDeviceManager::Instance()->max_griddim())
+			throw std::runtime_error("CSIOperator: Input dimensions too large");
+
+		//size_t batchSize = dimGrid.x*dimBlock.x;
+		cudaFuncSetCacheConfig(dft_kernel<T>,cudaFuncCachePreferL1);
+
+		std::vector<size_t> dims = *tspace->get_dimensions();
+		// Invoke kernel
+		dft_kernel<T><<<dimGrid, dimBlock>>>(kspace->get_data_ptr()+i*elements,tspace->get_data_ptr()+i*t_elements,thrust::raw_pointer_cast(frequencies->data()),dims[0],dims[1], frequencies->size(),dte,dtt);
+		cudaThreadSynchronize();
+	CHECK_FOR_CUDA_ERROR();
+
+	}
+
+}
+
+template<class T>
+void Gadgetron::CSI_dftH(cuNDArray<complext<T> >* kspace,
+		cuNDArray<complext<T> >* tspace, thrust::device_vector<T>* frequencies, T dtt, T dte) {
+	size_t k_elements = kspace->get_size(0)*kspace->get_size(1);
+	size_t elements = tspace->get_size(0)*tspace->get_size(1);
+
+	size_t batches = tspace->get_number_of_elements()/elements;
+	for (int i =0; i< batches; i++){
+		int threadsPerBlock = std::min<int>(elements,cudaDeviceManager::Instance()->max_blockdim());
+		dim3 dimBlock(threadsPerBlock);
+		int totalBlocksPerGrid = (elements+threadsPerBlock-1)/threadsPerBlock;
+		dim3 dimGrid(totalBlocksPerGrid);
+
+		if (totalBlocksPerGrid > cudaDeviceManager::Instance()->max_griddim())
+			throw std::runtime_error("CSIOperator: Input dimensions too large");
+
+		//size_t batchSize = dimGrid.x*dimBlock.x;
+		cudaFuncSetCacheConfig(dftH_kernel<T>,cudaFuncCachePreferL1);
+
+		std::vector<size_t> dims = *tspace->get_dimensions();
+
+		// Invoke kernel
+		dftH_kernel<T><<<dimGrid, dimBlock>>>(kspace->get_data_ptr()+i*k_elements,tspace->get_data_ptr()+i*elements,thrust::raw_pointer_cast(frequencies->data()),dims[0],dims[1], frequencies->size(),dte,dtt);
+		CHECK_FOR_CUDA_ERROR();
+	}
+}
+
+template EXPORTHYPER void Gadgetron::CSI_dft<float>(cuNDArray<float_complext>* kspace,cuNDArray<float_complext>* tspace, thrust::device_vector<float>* frequencies, float dtt, float dte);
+template EXPORTHYPER void Gadgetron::CSI_dftH<float>(cuNDArray<float_complext>* kspace,cuNDArray<float_complext>* tspace, thrust::device_vector<float>* frequencies, float dtt, float dte);
+
diff --git a/toolboxes/mri/hyper/CSI_utils.h b/toolboxes/mri/hyper/CSI_utils.h
new file mode 100644
index 0000000..bf71190
--- /dev/null
+++ b/toolboxes/mri/hyper/CSI_utils.h
@@ -0,0 +1,35 @@
+/*
+ * CSI_utils.h
+ *
+ *  Created on: Nov 20, 2014
+ *      Author: dch
+ */
+
+#pragma once
+
+#include "cuNDArray.h"
+#include <thrust/device_vector.h>
+#include "gadgetron_toolbox_hyper_export.h"
+namespace Gadgetron {
+
+	/**
+	 * Performs a non-Cartesian discrete Fourier transform (DFT) along the specified frequencies, at the specified time intervals.
+	 * Note that this should not be used as a general-purpose DFT, as it will be mind-numbingly slow. A usage sketch is given at the end of this header.
+	 * @param kspace The output kspace
+	 * @param tspace The input time space
+	 * @param frequencies The frequencies at which to evaluate the DFT.
+	 * @param dtt Time step between points in the first dimension of the tspace
+	 * @param dte Time step between points in the second dimension of the tspace
+	 */
+	template<class T> EXPORTHYPER void CSI_dft(cuNDArray<complext<T> >* kspace, cuNDArray<complext<T> >* tspace, thrust::device_vector<T>* frequencies, T dtt, T dte);
+	/**
+	 * Performs the adjoint of the non-Cartesian discrete Fourier transform.
+	 * @param kspace The input kspace
+	 * @param tspace The output time space
+	 * @param frequencies The frequencies at which to evaluate the DFT.
+	 * @param dte Time step between points in the first dimension of the tspace
+	 * @param dtt Time step between points in the second dimension of the tspace
+	 */
+	template<class T> EXPORTHYPER void CSI_dftH(cuNDArray<complext<T> >* kspace, cuNDArray<complext<T> >* tspace, thrust::device_vector<T>* frequencies, T dte, T dtt);
+
+}
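+
+/*
+ * Usage sketch for the two transforms above (illustration only; the sizes and
+ * frequency values are made-up examples, and dtt/dte are assumed to be known):
+ *
+ *   std::vector<size_t> tdims = {1024, 8};   // [points per readout, echoes]
+ *   std::vector<size_t> kdims = {1024, 3};   // [points per readout, frequencies]
+ *   Gadgetron::cuNDArray<Gadgetron::complext<float>> tspace(tdims);
+ *   Gadgetron::cuNDArray<Gadgetron::complext<float>> kspace(kdims);
+ *
+ *   std::vector<float> f = {0.0f, 130.0f, 260.0f};
+ *   thrust::device_vector<float> frequencies(f.begin(), f.end());
+ *
+ *   Gadgetron::CSI_dft(&kspace, &tspace, &frequencies, dtt, dte); // time space -> per-frequency k-space
+ *
+ *   // CSI_dftH applies the adjoint transform on the same arrays, with kspace as
+ *   // the input and tspace as the output.
+ */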
diff --git a/toolboxes/mri/hyper/CSfreqOperator.h b/toolboxes/mri/hyper/CSfreqOperator.h
new file mode 100644
index 0000000..37f277b
--- /dev/null
+++ b/toolboxes/mri/hyper/CSfreqOperator.h
@@ -0,0 +1,52 @@
+/*
+ * CSfreqOperator.h
+ *
+ *  Created on: Dec 2, 2014
+ *      Author: dch
+ */
+
+#ifndef CSFREQOPERATOR_H_
+#define CSFREQOPERATOR_H_
+
+#include "linearOperator.h"
+#include "cuNDArray.h"
+#include "CSI_utils.h"
+namespace Gadgetron{
+class CSfreqOperator : public linearOperator<cuNDArray<float_complext> > {
+
+public:
+
+	CSfreqOperator(){};
+	CSfreqOperator(float dtt_, float dte_) : dtt(dtt_), dte(dte_){
+
+}
+
+	virtual  void mult_M(cuNDArray<float_complext> * in, cuNDArray<float_complext>* out,bool accumulate){
+		auto tmp_out  = out;
+		if (accumulate) tmp_out = new cuNDArray<float_complext>(*out);
+		CSI_dftH(in,tmp_out,&freqs,dte,dtt);
+		if (accumulate){
+			*out += *tmp_out;
+			delete tmp_out;
+		}
+	}
+	virtual  void mult_MH(cuNDArray<float_complext> * in, cuNDArray<float_complext>* out,bool accumulate){
+		auto tmp_out  = out;
+		if (accumulate) tmp_out = new cuNDArray<float_complext>(*out);
+		CSI_dft(tmp_out,in,&freqs,dte,dtt);
+		if (accumulate){
+			*out += *tmp_out;
+			delete tmp_out;
+		}
+	}
+	void set_frequencies(std::vector<float> & freq) { freqs=thrust::device_vector<float>(freq.begin(),freq.end());
+	}
+
+
+	thrust::device_vector<float> freqs;
+	float dtt,dte;
+};
+}
+
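+/*
+ * Sketch of intended use (illustration only; `freq_image` and `echo_data` are
+ * assumed cuNDArray<float_complext> buffers of matching sizes):
+ *
+ *   Gadgetron::CSfreqOperator freq_op(dtt, dte);
+ *   std::vector<float> f = {0.0f, 130.0f, 260.0f};
+ *   freq_op.set_frequencies(f);
+ *
+ *   freq_op.mult_M(&freq_image, &echo_data, false);  // per-frequency data -> echo/time data (CSI_dftH)
+ *   freq_op.mult_MH(&echo_data, &freq_image, false); // adjoint (CSI_dft)
+ */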
+
+#endif /* CSFREQOPERATOR_H_ */
diff --git a/toolboxes/mri/hyper/gadgetron_toolbox_hyper_export.h b/toolboxes/mri/hyper/gadgetron_toolbox_hyper_export.h
new file mode 100644
index 0000000..2fb545f
--- /dev/null
+++ b/toolboxes/mri/hyper/gadgetron_toolbox_hyper_export.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_TOOLBOX_HYPER__)
+#define EXPORTHYPER __declspec(dllexport)
+#else
+#define EXPORTHYPER __declspec(dllimport)
+#endif
+#else
+#define EXPORTHYPER
+#endif
\ No newline at end of file
diff --git a/toolboxes/mri/pmri/CMakeLists.txt b/toolboxes/mri/pmri/CMakeLists.txt
new file mode 100644
index 0000000..863c5fb
--- /dev/null
+++ b/toolboxes/mri/pmri/CMakeLists.txt
@@ -0,0 +1,3 @@
+if(CUDA_FOUND)
+  add_subdirectory(gpu)
+endif()
diff --git a/toolboxes/mri/pmri/gpu/CMakeLists.txt b/toolboxes/mri/pmri/gpu/CMakeLists.txt
new file mode 100644
index 0000000..5916ea7
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/CMakeLists.txt
@@ -0,0 +1,90 @@
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_GPUPMRI__)
+  ADD_DEFINITIONS(-DWIN32_LEAN_AND_MEAN)
+endif (WIN32)
+
+if(WIN32)
+  link_directories(${Boost_LIBRARY_DIRS})
+endif(WIN32)
+
+include_directories(  
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  ${CMAKE_SOURCE_DIR}/toolboxes/fft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+  ${CUDA_INCLUDE_DIRS}
+  ${Boost_INCLUDE_DIR}
+  ${ARMADILLO_INCLUDE_DIRS} 
+  )
+
+cuda_add_library(gadgetron_toolbox_gpuparallelmri SHARED 
+    b1_map.h
+    spirit_calibration.h
+    cuCartesianSenseOperator.h
+    cuNonCartesianKtSenseOperator.h
+    cuNonCartesianSenseOperator.h
+    cuSpiritOperator.h
+    cuBuffer.h
+    cuSenseBuffer.h
+    cuSenseBufferCg.h
+    cuSenseOperator.h
+    gpupmri_export.h
+    htgrappa.h
+    senseOperator.h
+    sense_utilities.h
+    b1_map.cu
+    b1_map_NIH_Souheil.cu
+    spirit_calibration.cu
+    sense_utilities.cu
+    cuSenseOperator.cu
+    cuCartesianSenseOperator.cu
+    cuNonCartesianSenseOperator.cu
+    cuNonCartesianKtSenseOperator.cu
+    cuBuffer.cpp
+    cuSenseBuffer.cpp
+    cuSenseBufferCg.cpp
+    cuSpiritBuffer.cpp
+    htgrappa.cpp
+    htgrappa.cu
+    trajectory_utils.h
+    trajectory_utils.cu
+  )
+
+set_target_properties(gadgetron_toolbox_gpuparallelmri PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(gadgetron_toolbox_gpuparallelmri 
+  gadgetron_toolbox_gpucore
+  gadgetron_toolbox_log
+  gadgetron_toolbox_gpunfft 
+  gadgetron_toolbox_cpucore
+  gadgetron_toolbox_cpucore_math
+  ${Boost_LIBRARIES}
+  ${FFTW3_LIBRARIES} 
+  ${CUDA_LIBRARIES} 
+  ${CUDA_CUFFT_LIBRARIES} 
+  ${CUDA_CUBLAS_LIBRARIES} 
+  )
+
+install(TARGETS gadgetron_toolbox_gpuparallelmri DESTINATION lib COMPONENT main)
+
+install(FILES 
+	b1_map.h
+	sense_utilities.h
+	htgrappa.h
+	senseOperator.h
+	cuSenseOperator.h
+	cuCartesianSenseOperator.h
+	cuNonCartesianSenseOperator.h
+	cuNonCartesianKtSenseOperator.h
+        cuSpiritOperator.h
+        cuBuffer.h
+	cuSenseBuffer.h
+	cuSenseBufferCg.h
+	cuSpiritBuffer.h
+	gpupmri_export.h
+DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
diff --git a/toolboxes/mri/pmri/gpu/b1_map.cu b/toolboxes/mri/pmri/gpu/b1_map.cu
new file mode 100644
index 0000000..5a87e0b
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/b1_map.cu
@@ -0,0 +1,733 @@
+#include "b1_map.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "vector_td_utilities.h"
+#include "real_utilities.h"
+#include "real_utilities_device.h"
+#include "complext.h"
+#include "check_CUDA.h"
+#include "cudaDeviceManager.h"
+#include "setup_grid.h"
+
+#include <iostream>
+#include <cmath>
+
+using namespace std;
+
+namespace Gadgetron{
+
+  const int kernel_width = 7;
+
+  template<class REAL, unsigned int D> static void smooth_correlation_matrices( cuNDArray<complext<REAL> >*, cuNDArray<complext<REAL> >*);
+  template<class REAL> static boost::shared_ptr< cuNDArray<complext<REAL> > > extract_csm( cuNDArray<complext<REAL> >*, unsigned int, unsigned int);
+  template<class REAL> static void set_phase_reference( cuNDArray<complext<REAL> >*, unsigned int, unsigned int);
+  template<class T> static void find_stride( cuNDArray<T> *in, unsigned int dim, unsigned int *stride, std::vector<size_t> *dims );
+  template<class T> static boost::shared_ptr< cuNDArray<T> > correlation( cuNDArray<T> *in );
+  template<class T> static void rss_normalize( cuNDArray<T> *in_out, unsigned int dim );
+
+  //
+  // Main method
+  //
+
+  template<class REAL, unsigned int D> boost::shared_ptr< cuNDArray<complext<REAL> > >
+  estimate_b1_map( cuNDArray<complext<REAL> > *data_in, int target_coils)
+  {
+
+    if( data_in->get_number_of_dimensions() < 2 ){
+      cout << endl << "estimate_b1_map:: dimensionality mismatch." << endl; 
+      return boost::shared_ptr< cuNDArray<complext<REAL> > >();
+    }
+
+    if( data_in->get_number_of_dimensions()-1 != D ){
+      cout << endl << "estimate_b1_map:: dimensionality mismatch." << endl; 
+      return boost::shared_ptr< cuNDArray<complext<REAL> > >();
+    }
+
+    int target_coils_int = 0;
+    if ((target_coils <= 0) || (target_coils > data_in->get_size(D))) {
+      target_coils_int = data_in->get_size(D);
+    } else {
+      target_coils_int = target_coils;
+    }
+
+    vector<unsigned int> image_dims, dims_to_xform;
+    unsigned int pixels_per_coil = 1;
+  
+    for( unsigned int i=0; i<D; i++ ){
+      image_dims.push_back(data_in->get_size(i));
+      dims_to_xform.push_back(i);
+      pixels_per_coil *= data_in->get_size(i);
+    }
+  
+    unsigned int ncoils = data_in->get_size(D);
+
+    // Make a copy of input data, but only the target coils
+    boost::shared_ptr< cuNDArray<complext<REAL> > > data_out;
+    if (0 && target_coils_int == ncoils) {
+      cuNDArray<complext<REAL> > *_data_out = new cuNDArray<complext<REAL> >(*data_in);
+      data_out = boost::shared_ptr< cuNDArray<complext<REAL> > >(_data_out);
+    } else {
+      std::vector<size_t> odims = *(data_in->get_dimensions().get());
+      odims[D] = target_coils_int;
+      cuNDArray<complext<REAL> > *_data_out = new cuNDArray<complext<REAL> >(&odims);
+      data_out = boost::shared_ptr< cuNDArray<complext<REAL> > >(_data_out);
+
+      //Now copy one coil at a time
+      unsigned int elements_per_coil = data_in->get_number_of_elements()/ncoils;
+      for (unsigned int i = 0; i < target_coils_int; i++) {
+	cudaMemcpy(data_out.get()->get_data_ptr()+i*elements_per_coil,
+		   data_in->get_data_ptr()+i*elements_per_coil,
+		   elements_per_coil*sizeof(complext<REAL>),
+		   cudaMemcpyDeviceToDevice);
+      }
+      ncoils = target_coils_int;
+    }
+  
+    // Normalize by the RSS of the coils
+    rss_normalize( data_out.get(), D );
+  
+    // Now calculate the correlation matrices
+    boost::shared_ptr<cuNDArray<complext<REAL> > > corrm = correlation( data_out.get() );
+    data_out.reset();
+  
+    // Smooth (onto copy of corrm)
+    cuNDArray<complext<REAL> > *_corrm_smooth = new cuNDArray<complext<REAL> >();
+    _corrm_smooth->create(corrm->get_dimensions().get());
+    boost::shared_ptr<cuNDArray<complext<REAL> > > corrm_smooth(_corrm_smooth);
+
+    smooth_correlation_matrices<REAL,D>( corrm.get(), corrm_smooth.get() );
+    corrm.reset();
+
+    // Get the dominant eigenvector for each correlation matrix.
+    boost::shared_ptr<cuNDArray<complext<REAL> > > csm = extract_csm<REAL>( corrm_smooth.get(), ncoils, pixels_per_coil );
+    corrm_smooth.reset();
+  
+    // Set phase according to reference (coil 0)
+    set_phase_reference<REAL>( csm.get(), ncoils, pixels_per_coil );
+  
+    return csm;
+  }
+
+  template<class T> static void find_stride( cuNDArray<T> *in, unsigned int dim,
+					     unsigned int *stride, std::vector<size_t> *dims )
+  {
+    *stride = 1;
+    for( unsigned int i=0; i<in->get_number_of_dimensions(); i++ ){
+      if( i != dim )
+	dims->push_back(in->get_size(i));
+      if( i < dim )
+	*stride *= in->get_size(i);
+    }
+  }
+  
+  template<class REAL, class T> __inline__  __device__ static REAL
+  _rss( unsigned int idx, T *in, unsigned int stride, unsigned int number_of_batches )
+  {
+    unsigned int in_idx = (idx/stride)*stride*number_of_batches+(idx%stride);
+    REAL rss = REAL(0);
+    
+    for( unsigned int i=0; i<number_of_batches; i++ ) 
+      rss += norm(in[i*stride+in_idx]);
+    
+    rss = std::sqrt(rss); 
+    
+    return rss;
+  }
+  
+  template<class T> __global__ static void
+  rss_normalize_kernel( T *in_out, unsigned int stride, unsigned int number_of_batches, unsigned int number_of_elements )
+  {
+    typedef typename realType<T>::Type REAL;
+
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+    
+    if( idx < number_of_elements ){
+      
+      REAL reciprocal_rss = 1/(_rss<REAL,T>(idx, in_out, stride, number_of_batches));
+      
+      unsigned int in_idx = (idx/stride)*stride*number_of_batches+(idx%stride);
+      
+      for( unsigned int i=0; i<number_of_batches; i++ ) {
+	T out = in_out[i*stride+in_idx];
+	out *= reciprocal_rss; // complex-scalar multiplication (element-wise operator)
+	in_out[i*stride+in_idx] = out; 
+      } 
+    }
+  }
+  
+  // Normalized RSS
+  template<class T> static
+  void rss_normalize( cuNDArray<T> *in_out, unsigned int dim )
+  {
+    unsigned int number_of_batches = in_out->get_size(dim);
+    unsigned int number_of_elements = in_out->get_number_of_elements()/number_of_batches;
+    
+    // Setup block/grid dimensions
+    dim3 blockDim; dim3 gridDim;
+    setup_grid( number_of_elements, &blockDim, &gridDim );
+
+    // Find element stride
+    unsigned int stride; std::vector<size_t> dims;
+    find_stride<T>( in_out, dim, &stride, &dims );
+
+    // Invoke kernel
+    rss_normalize_kernel<T><<< gridDim, blockDim >>>( in_out->get_data_ptr(), stride, number_of_batches, number_of_elements );
+ 
+    CHECK_FOR_CUDA_ERROR();    
+  }
+
+  template<class REAL, class T> __global__ static void
+  correlation_kernel( const T * __restrict__ in, T * __restrict__ corrm, unsigned int num_batches, unsigned int num_elements )
+  {
+    const unsigned int p = blockIdx.x*blockDim.x + threadIdx.x;
+    const unsigned int i = threadIdx.y;
+    
+    if( p < num_elements ){
+      for( unsigned int j=0; j<i; j++){
+	T tmp = in[i*num_elements+p]*conj(in[j*num_elements+p]);
+	corrm[(j*num_batches+i)*num_elements+p] = tmp;
+	corrm[(i*num_batches+j)*num_elements+p] = conj(tmp);
+      }
+      T tmp = in[i*num_elements+p];
+      corrm[(i*num_batches+i)*num_elements+p] = tmp*conj(tmp);
+    }
+  }
+  
+  // Build correlation matrix
+  template<class T> static boost::shared_ptr< cuNDArray<T> > correlation( cuNDArray<T> *in )
+  {
+    typedef typename realType<T>::Type REAL;
+    // Prepare internal array
+    int cur_device = cudaDeviceManager::Instance()->getCurrentDevice();
+
+    unsigned int number_of_batches = in->get_size(in->get_number_of_dimensions()-1);
+    unsigned int number_of_elements = in->get_number_of_elements()/number_of_batches;
+
+    int warp_size = cudaDeviceManager::Instance()->warp_size(cur_device);
+    int max_blockdim = cudaDeviceManager::Instance()->max_blockdim(cur_device);
+    dim3 blockDim(((max_blockdim/number_of_batches)/warp_size)*warp_size, number_of_batches);
+
+    if( blockDim.x == 0 ){
+      throw std::runtime_error("correlation: correlation dimension exceeds device capacity.");
+    }
+  
+    dim3 gridDim((number_of_elements+blockDim.x-1)/blockDim.x);
+
+    // Invoke kernel
+    std::vector<size_t> dims = *in->get_dimensions(); dims.push_back(number_of_batches);
+    boost::shared_ptr< cuNDArray<T> > out( new cuNDArray<T> );
+    out->create(&dims);
+
+    correlation_kernel<REAL,T><<< gridDim, blockDim >>>( in->get_data_ptr(), out->get_data_ptr(), number_of_batches, number_of_elements );
+    
+    CHECK_FOR_CUDA_ERROR();
+    
+    return out;
+  }
+
+  // Smooth correlation matrices by box filter (1D)
+  template<class REAL> __global__ static void
+  smooth_correlation_matrices_kernel( const complext<REAL> * __restrict__ corrm, complext<REAL> * __restrict__ corrm_smooth, intd<1>::Type image_dims )
+  {
+    const int idx = blockIdx.x*blockDim.x + threadIdx.x;
+    const int batch = blockIdx.y;
+
+    const int num_image_elements = prod(image_dims);
+
+    if( idx < num_image_elements ){
+    
+      const int co = idx;    
+      const int x = co;
+    
+      const int size_x = image_dims.vec[0];
+    
+      const REAL scale = REAL(1)/((REAL)kernel_width);
+    
+      complext<REAL> result = complext<REAL>(0);
+    
+      for (int kx = 0; kx < kernel_width; kx++) {
+      
+	if ((x-(kernel_width>>1)+kx) >= 0 &&
+	    (x-(kernel_width>>1)+kx) < size_x)
+	  {	    
+	    int source_offset = 
+	      batch*num_image_elements +
+	      (x-(kernel_width>>1)+kx);
+	  
+	    result += corrm[source_offset];
+	  }
+      }
+      corrm_smooth[batch*num_image_elements+idx] = scale*result;
+    }
+  }
+
+  // Smooth correlation matrices by box filter (2D)
+  template<class REAL> __global__ static  void
+  smooth_correlation_matrices_kernel( const complext<REAL> * __restrict__ corrm, complext<REAL> * __restrict__ corrm_smooth, intd<2>::Type image_dims )
+  {
+    const int idx = blockIdx.x*blockDim.x + threadIdx.x;
+    const int batch = blockIdx.y;
+
+    const int num_image_elements = prod(image_dims);
+
+    if( idx < num_image_elements ){
+    
+      const intd2 co = idx_to_co<2>(idx, image_dims);
+    
+      const int x = co.vec[0];
+      const int y = co.vec[1];
+    
+      const int size_x = image_dims.vec[0];
+      const int size_y = image_dims.vec[1];
+    
+      const int half_width = kernel_width>>1;
+
+      const int yminus = y-half_width;
+      const int xminus = x-half_width;
+      const int yplus = y+half_width;
+      const int xplus = x+half_width;
+
+      const REAL scale = REAL(1)/((REAL)(kernel_width*kernel_width));
+    
+      complext<REAL> result = complext<REAL>(0);
+   
+      if( (yminus >=0) ){
+	if( yplus < size_y ){
+	  if( xminus >= 0 ){
+	    if( xplus < size_x ){
+
+#pragma unroll
+	      for (int ky = 0; ky < kernel_width; ky++){
+#pragma unroll
+		for (int kx = 0; kx < kernel_width; kx++) {
+		
+		  int cy = yminus+ky;
+		  int cx = xminus+kx;
+		
+		  int source_offset = batch*num_image_elements + cy*size_x + cx;
+		  result += corrm[source_offset];
+		}
+	      }
+	    }
+	  }
+	}
+      }
+      corrm_smooth[batch*num_image_elements+idx] = scale*result;
+    }
+  }
+
+  // Smooth correlation matrices by box filter (3D)
+  template<class REAL> __global__ static  void
+  smooth_correlation_matrices_kernel( const  complext<REAL> * __restrict__ corrm, complext<REAL> * __restrict__ corrm_smooth, intd<3>::Type image_dims )
+  {
+    const int idx = blockIdx.x*blockDim.x + threadIdx.x;
+    const int batch = blockIdx.y;
+
+    const int num_image_elements = prod(image_dims);
+
+    if( idx < num_image_elements ){
+    
+      const intd3 co = idx_to_co<3>(idx, image_dims);
+    
+      const int x = co.vec[0];
+      const int y = co.vec[1];
+      const int z = co.vec[2];
+    
+      const int size_x = image_dims.vec[0];
+      const int size_y = image_dims.vec[1];
+      const int size_z = image_dims.vec[2];
+    
+      const REAL scale = REAL(1)/((REAL)(kernel_width*kernel_width*kernel_width));
+    
+      complext<REAL> result = complext<REAL>(0);
+    
+      for (int kz = 0; kz < kernel_width; kz++) {
+	for (int ky = 0; ky < kernel_width; ky++) {
+	  for (int kx = 0; kx < kernel_width; kx++) {
+	
+	    if ((z-(kernel_width>>1)+kz) >= 0 &&
+		(z-(kernel_width>>1)+kz) < size_z &&
+		(y-(kernel_width>>1)+ky) >= 0 &&
+		(y-(kernel_width>>1)+ky) < size_y &&
+		(x-(kernel_width>>1)+kx) >= 0 &&
+		(x-(kernel_width>>1)+kx) < size_x) 
+	      {	    
+		int source_offset = 
+		  batch*num_image_elements +
+		  (z-(kernel_width>>1)+kz)*size_x*size_y +
+		  (y-(kernel_width>>1)+ky)*size_x +
+		  (x-(kernel_width>>1)+kx);
+	    
+		result += corrm[source_offset];
+	      }
+	  }
+	}
+      }
+      corrm_smooth[batch*num_image_elements+idx] = scale*result;
+    }
+  }
+
+  // Smooth correlation matrices by box filter (4D)
+  template<class REAL> __global__ static void
+  smooth_correlation_matrices_kernel( const complext<REAL> * __restrict__ corrm, complext<REAL> * __restrict__ corrm_smooth, intd<4>::Type image_dims )
+  {
+    const int idx = blockIdx.x*blockDim.x + threadIdx.x;
+    const int batch = blockIdx.y;
+
+    const int num_image_elements = prod(image_dims);
+
+    if( idx < num_image_elements ){
+    
+      const intd4 co = idx_to_co<4>(idx, image_dims);
+    
+      const int x = co.vec[0];
+      const int y = co.vec[1];
+      const int z = co.vec[2];
+      const int w = co.vec[3];
+    
+      const int size_x = image_dims.vec[0];
+      const int size_y = image_dims.vec[1];
+      const int size_z = image_dims.vec[2];    
+      const int size_w = image_dims.vec[3];
+    
+      const REAL scale = REAL(1)/((REAL)(kernel_width*kernel_width*kernel_width*kernel_width));
+    
+      complext<REAL> result = complext<REAL>(0);
+    
+      for (int kw = 0; kw < kernel_width; kw++) {
+	for (int kz = 0; kz < kernel_width; kz++) {
+	  for (int ky = 0; ky < kernel_width; ky++) {
+	    for (int kx = 0; kx < kernel_width; kx++) {
+	
+	      if ((w-(kernel_width>>1)+kw) >= 0 &&
+		  (w-(kernel_width>>1)+kw) < size_w &&
+		  (z-(kernel_width>>1)+kz) >= 0 &&
+		  (z-(kernel_width>>1)+kz) < size_z &&
+		  (y-(kernel_width>>1)+ky) >= 0 &&
+		  (y-(kernel_width>>1)+ky) < size_y &&
+		  (x-(kernel_width>>1)+kx) >= 0 &&
+		  (x-(kernel_width>>1)+kx) < size_x) 
+		{	    
+		  int source_offset = 
+		    batch*num_image_elements +
+		    (w-(kernel_width>>1)+kw)*size_x*size_y*size_z +
+		    (z-(kernel_width>>1)+kz)*size_x*size_y +
+		    (y-(kernel_width>>1)+ky)*size_x +
+		    (x-(kernel_width>>1)+kx);
+	    
+		  result += corrm[source_offset];
+		}
+	    }
+	  }
+	}
+      }
+      corrm_smooth[batch*num_image_elements+idx] = scale*result;
+    }
+  }
+
+  __device__ int _min( int A, int B ){
+    return (A<B) ? A : B;
+  }
+
+  // Smooth correlation matrices border by box filter (2D)
+  template<class REAL> __global__ static void
+  smooth_correlation_matrices_border_kernel( const complext<REAL> * __restrict__ corrm, complext<REAL> * __restrict__ corrm_smooth, intd<2>::Type image_dims, unsigned int number_of_border_threads )
+  {
+    const int idx = blockIdx.x*blockDim.x + threadIdx.x;
+    const int batch = blockIdx.y;
+
+    const int num_image_elements = prod(image_dims);
+
+    if( idx < number_of_border_threads ){
+    
+      intd2 co;
+      const int half_width = kernel_width>>1;
+
+      co.vec[1] = idx/image_dims.vec[0];
+      co.vec[1] = _min(co.vec[1], half_width );
+    
+      if( co.vec[1] == half_width ){
+	int new_idx = idx-half_width*image_dims.vec[0];
+	int num_skips = new_idx/half_width;
+	int rows_offset = _min(num_skips>>1, image_dims.vec[1]-(half_width<<1) );
+	co.vec[1] += rows_offset;
+
+	if( co.vec[1] == (half_width + image_dims.vec[1]-(half_width<<1)) ){
+	  new_idx -= ((image_dims.vec[1]-(half_width<<1))*(half_width<<1));
+	  co.vec[1] += (new_idx / image_dims.vec[0]);
+	  co.vec[0] = (new_idx % image_dims.vec[0]);
+	}
+	else{
+	  co.vec[0] = (num_skips%2)*(image_dims.vec[0]-half_width) + (new_idx%half_width);
+	}
+      }
+      else{
+	co.vec[0] = idx%image_dims.vec[0];
+      }
+    
+      const int x = co.vec[0];
+      const int y = co.vec[1];
+    
+      const int size_x = image_dims.vec[0];
+      const int size_y = image_dims.vec[1];
+    
+      const int yminus = y-half_width;
+      const int xminus = x-half_width;
+
+      const REAL scale = REAL(1)/((REAL)(kernel_width*kernel_width));
+    
+      complext<REAL> result = complext<REAL>(0);
+ 
+#pragma unroll
+      for (int ky = 0; ky < kernel_width; ky++) {
+#pragma unroll
+	for (int kx = 0; kx < kernel_width; kx++) {
+	
+	  if( (yminus+ky >=0) ){
+	    if( yminus+ky < size_y ){
+	      if( xminus+kx >= 0 ){
+		if( xminus+kx < size_x ){
+		
+		  int source_offset = 
+		    batch*num_image_elements +
+		    (yminus+ky)*size_x +
+		    (xminus+kx);
+		
+		  result += corrm[source_offset];
+		}
+	      }
+	    }
+	  }
+	}
+      }
+      corrm_smooth[batch*num_image_elements+co_to_idx<2>(co,image_dims)] = scale*result;  
+    }
+  }
+
+  template<class REAL, unsigned int D> static void
+  smooth_correlation_matrices( cuNDArray<complext<REAL> > * corrm, cuNDArray<complext<REAL> > * corrm_smooth )
+  {
+    typename intd<D>::Type image_dims;
+
+    for( unsigned int i=0; i<D; i++ ){
+      image_dims.vec[i] = corrm->get_size(i);
+    }
+  
+    unsigned int number_of_batches = 1;
+  
+    for( unsigned int i=D; i<corrm->get_number_of_dimensions(); i++ ){
+      number_of_batches *= corrm->get_size(i);
+    }
+  
+    int device; cudaGetDevice( &device );
+    cudaDeviceProp deviceProp; cudaGetDeviceProperties( &deviceProp, device );
+
+    dim3 blockDim(deviceProp.maxThreadsPerBlock);
+    dim3 gridDim((unsigned int) std::ceil((double)prod(image_dims)/blockDim.x), number_of_batches);
+
+    smooth_correlation_matrices_kernel<REAL><<<gridDim, blockDim>>>
+      ( corrm->get_data_ptr(), corrm_smooth->get_data_ptr(), image_dims );
+  
+    CHECK_FOR_CUDA_ERROR();
+
+    unsigned int number_of_border_threads = ((kernel_width>>1)<<1)*(sum(image_dims)-((kernel_width>>1)<<1));
+    blockDim = dim3(128);
+    gridDim = dim3((unsigned int) std::ceil((double)number_of_border_threads/blockDim.x), number_of_batches);
+  
+    smooth_correlation_matrices_border_kernel<REAL><<<gridDim, blockDim>>>
+      ( corrm->get_data_ptr(), corrm_smooth->get_data_ptr(), image_dims, number_of_border_threads );
+
+    CHECK_FOR_CUDA_ERROR();
+  }
+
+  extern __shared__ char shared_mem[];
+
+  // Extract CSM
+  template<class REAL> __global__ static void
+  extract_csm_kernel( const complext<REAL> * __restrict__ corrm, complext<REAL> * __restrict__ csm, unsigned int num_batches, unsigned int num_elements )
+  {
+    const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
+    const unsigned int i = threadIdx.x;
+  
+    if( idx < num_elements ){    
+    
+      // Get the dominant eigenvector for each correlation matrix.
+      // Copying Peter Kellman's approach we use the power method:
+      //  b_k+1 = A*b_k / ||A*b_k||
+    
+      complext<REAL> *data_out = (complext<REAL>*) shared_mem;
+      complext<REAL> *tmp_v = &(((complext<REAL>*) shared_mem)[num_batches*blockDim.x]);
+    
+      const unsigned int iterations = 2;
+    
+      for( unsigned int c=0; c<num_batches; c++){
+	data_out[c*blockDim.x+i] = complext<REAL>(1);
+      }
+    
+      for( unsigned int it=0; it<iterations; it++ ){
+      
+	for( unsigned int c=0; c<num_batches; c++){
+	  tmp_v[c*blockDim.x+i] = complext<REAL>(0);
+	}
+      
+	for( unsigned j=0; j<num_batches; j++){
+	  for( unsigned int k=0; k<num_batches; k++){
+	    tmp_v[j*blockDim.x+i] += corrm[(k*num_batches+j)*num_elements+idx]*data_out[k*blockDim.x+i];
+	  }
+	}
+      
+	REAL tmp = REAL(0);
+      
+	for (unsigned int c=0; c<num_batches; c++){
+	  tmp += norm(tmp_v[c*blockDim.x+i]);
+	}
+      
+	tmp = 1/std::sqrt(tmp);
+
+      
+	for (unsigned int c=0; c<num_batches; c++){
+	  complext<REAL> res = tmp*tmp_v[c*blockDim.x+i];
+	  data_out[c*blockDim.x+i] = res;
+	}
+      }
+    
+      for (unsigned int c=0; c<num_batches; c++){
+	csm[c*num_elements+idx] = data_out[c*blockDim.x+i];
+      }
+    }
+  }
+
+  // Extract CSM
+  template<class REAL> __global__ static void
+  extract_csm_kernel( const complext<REAL> * __restrict__ corrm, complext<REAL> * __restrict__ csm, unsigned int num_batches, unsigned int num_elements, complext<REAL> * __restrict__ tmp_v )
+  {
+    const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if( idx < num_elements ){    
+    
+      // Get the dominant eigenvector for each correlation matrix.
+      // Copying Peter Kellman's approach we use the power method:
+      //  b_k+1 = A*b_k / ||A*b_k||
+    
+      const unsigned int iterations = 2;
+
+      for( unsigned int c=0; c<num_batches; c++){
+	csm[c*num_elements+idx] = complext<REAL>(1);
+      }
+    
+      for( unsigned int it=0; it<iterations; it++ ){
+
+	for( unsigned int c=0; c<num_batches; c++){
+	  tmp_v[c*num_elements+idx] = complext<REAL>(0);
+	}
+      
+	for( unsigned j=0; j<num_batches; j++){
+	  for( unsigned int k=0; k<num_batches; k++){
+	    typedef complext<REAL> T;
+	    tmp_v[j*num_elements+idx] += corrm[(k*num_batches+j)*num_elements+idx]*csm[k*num_elements+idx];
+	  }
+	}
+
+	REAL tmp = REAL(0);
+      
+	for (unsigned int c=0; c<num_batches; c++){
+	  tmp += norm(tmp_v[c*num_elements+idx]);
+	}
+      
+	tmp = 1/std::sqrt(tmp);
+
+      
+	for (unsigned int c=0; c<num_batches; c++){
+	  complext<REAL> res = tmp*tmp_v[c*num_elements+idx];
+	  csm[c*num_elements+idx] = res;
+	}
+      }
+    }
+  }
+
+  // Extract CSM
+  template<class REAL> __host__ static
+  boost::shared_ptr<cuNDArray<complext<REAL> > > extract_csm(cuNDArray<complext<REAL> > *corrm_in, unsigned int number_of_batches, unsigned int number_of_elements )
+  {
+    vector<size_t> image_dims;
+
+    for( unsigned int i=0; i<corrm_in->get_number_of_dimensions()-1; i++ ){
+      image_dims.push_back(corrm_in->get_size(i));
+    }
+  
+    // Allocate output
+    cuNDArray<complext<REAL> > *out = new cuNDArray<complext<REAL> >; out->create(&image_dims);
+
+    dim3 blockDim(256);
+    dim3 gridDim((unsigned int) std::ceil((double)number_of_elements/blockDim.x));
+
+    /*  
+	if( out != 0x0 )
+	extract_csm_kernel<REAL><<< gridDim, blockDim, number_of_batches*blockDim.x*2*sizeof(complext<REAL>) >>>
+	( corrm_in->get_data_ptr(), out->get_data_ptr(), number_of_batches, number_of_elements );
+    */
+
+    // Temporary buffer. TODO: use shared memory
+    cuNDArray<complext<REAL> > *tmp_v = new cuNDArray<complext<REAL> >; tmp_v->create(&image_dims);
+
+    if( out != 0x0 && tmp_v != 0x0 )
+      extract_csm_kernel<REAL><<< gridDim, blockDim >>>
+	( corrm_in->get_data_ptr(), out->get_data_ptr(), number_of_batches, number_of_elements, tmp_v->get_data_ptr() );
+
+    CHECK_FOR_CUDA_ERROR();
+  
+    delete tmp_v;
+    return boost::shared_ptr<cuNDArray<complext<REAL> > >(out);
+  }
+
+  // Set reference phase
+  template<class REAL> __global__ static void
+  set_phase_reference_kernel( complext<REAL> *csm, unsigned int num_batches, unsigned int num_elements )
+  {
+    const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if( idx < num_elements ){
+      REAL angle = arg<REAL>(csm[idx]); //Phase of the first coil
+      REAL sin_a, cos_a; gad_sincos( angle, &sin_a, &cos_a );
+
+      complext<REAL> tmp;
+      tmp.vec[0] = cos_a; tmp.vec[1] = sin_a;
+      tmp = conj(tmp);
+
+      for( unsigned int c=0; c<num_batches; c++ ){
+	complext<REAL> val = csm[c*num_elements+idx];
+	typedef complext<REAL> T;
+	val = val*tmp;
+	csm[c*num_elements+idx] = val;
+      }
+    }
+  }
+  
+  // Set reference phase
+  template<class REAL> __host__ static
+  void set_phase_reference(cuNDArray<complext<REAL> > *csm, unsigned int number_of_batches, unsigned int number_of_elements )
+  {
+    dim3 blockDim(128);
+    dim3 gridDim((unsigned int) std::ceil((double)number_of_elements/blockDim.x));
+  
+    set_phase_reference_kernel<REAL><<< gridDim, blockDim >>>( csm->get_data_ptr(), number_of_batches, number_of_elements );
+  
+    CHECK_FOR_CUDA_ERROR();
+  }
+
+
+
+  //
+  // Template instantiation
+  //
+
+  //template EXPORTGPUPMRI boost::shared_ptr< cuNDArray<complext<float> > > estimate_b1_map<float,1>(cuNDArray<complext<float> >*, int);
+  template EXPORTGPUPMRI boost::shared_ptr< cuNDArray<complext<float> > > estimate_b1_map<float,2>(cuNDArray<complext<float> >*, int);
+  //template boost::shared_ptr< cuNDArray<complext<float> > > estimate_b1_map<float,3>(cuNDArray<complext<float> >*, int);
+  //template boost::shared_ptr< cuNDArray<complext<float> > > estimate_b1_map<float,4>(cuNDArray<complext<float> >*, int);
+
+  //template EXPORTGPUPMRI boost::shared_ptr< cuNDArray<complext<double> > > estimate_b1_map<double,1>(cuNDArray<complext<double> >*, int);
+  template EXPORTGPUPMRI boost::shared_ptr< cuNDArray<complext<double> > > estimate_b1_map<double,2>(cuNDArray<complext<double> >*, int);
+  //template EXPORTGPUPMRI boost::shared_ptr< cuNDArray<complext<double> > > estimate_b1_map<double,3>(cuNDArray<complext<double> >*, int);
+  //template EXPORTGPUPMRI boost::shared_ptr< cuNDArray<complext<double> > > estimate_b1_map<double,4>(cuNDArray<complext<double> >*, int);
+}
diff --git a/toolboxes/mri/pmri/gpu/b1_map.h b/toolboxes/mri/pmri/gpu/b1_map.h
new file mode 100644
index 0000000..1ef8517
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/b1_map.h
@@ -0,0 +1,32 @@
+/** \file b1_map.h
+    \brief Utility to estimate b1 maps (MRI coil sensitivities), GPU based. 
+*/
+
+#pragma once
+
+#include "gpupmri_export.h"
+#include "cuNDArray.h"
+#include "vector_td.h"
+#include "complext.h"
+
+#include <boost/shared_ptr.hpp>
+
+namespace Gadgetron{
+
+  /** 
+   * \brief Estimate b1 map (coil sensitivities) of single or double precision according to REAL and of dimensionality D.
+   * \param data Reconstructed reference images from the individual coils. Dimensionality is D+1, where the last dimension denotes the coil images.
+   * \param target_coils Denotes the number of target coils. Cannot exceed the size of dimension D of the data. A negative value indicates that sensitivity maps are computed for the full coil image dimension.
+   */
+  template<class REAL, unsigned int D> EXPORTGPUPMRI boost::shared_ptr< cuNDArray<complext<REAL> > >
+  estimate_b1_map( cuNDArray<complext<REAL> > *data, int target_coils = -1 );
+
+    /** 
+   * \brief Estimate b1 map (coil sensitivities) of single or double precision using the NIH Souheil method
+   * \param data [RO E1 CHA] for single 2D or [RO E1 N CHA] for multiple 2D reconstructed reference images from the individual coils. 
+   */
+  template<class REAL> EXPORTGPUPMRI bool
+  estimate_b1_map_2D_NIH_Souheil( cuNDArray<complext<REAL> >* data, cuNDArray<complext<REAL> >* csm, size_t ks, size_t power,
+                                  cuNDArray<complext<REAL> >& D, cuNDArray<complext<REAL> >& DH_D, 
+                                  cuNDArray<complext<REAL> >& V1, cuNDArray<complext<REAL> >& U1 );
+}
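+
+/*
+ * Usage sketch (illustration only; `coil_images` is an assumed
+ * cuNDArray<float_complext> of reconstructed coil images, e.g. [X Y CHA] for D=2):
+ *
+ *   using namespace Gadgetron;
+ *
+ *   boost::shared_ptr<cuNDArray<float_complext> > csm =
+ *       estimate_b1_map<float, 2>(&coil_images);   // default target_coils (-1): use all coils
+ *
+ *   // Restricting the estimate to e.g. the first 8 coils:
+ *   //   estimate_b1_map<float, 2>(&coil_images, 8);
+ */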
diff --git a/toolboxes/mri/pmri/gpu/b1_map_NIH_Souheil.cu b/toolboxes/mri/pmri/gpu/b1_map_NIH_Souheil.cu
new file mode 100644
index 0000000..995e81e
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/b1_map_NIH_Souheil.cu
@@ -0,0 +1,647 @@
+#include "b1_map.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "vector_td_utilities.h"
+#include "real_utilities.h"
+#include "real_utilities_device.h"
+#include "complext.h"
+#include "check_CUDA.h"
+#include "cudaDeviceManager.h"
+#include "setup_grid.h"
+#include "hoNDArray_fileio.h"
+#include "GPUTimer.h"
+
+#include "CUBLASContextProvider.h"
+#include <cublas_v2.h>
+
+#include <iostream>
+#include <cmath>
+
+using namespace std;
+
+namespace Gadgetron{
+
+    template <class T> int write_cuNDArray_to_disk(cuNDArray<T>* a, const char* filename)
+    {
+        boost::shared_ptr< hoNDArray<T> > host = a->to_host();
+        write_nd_array<T>(host.get(), filename);
+        return 0;
+    }
+
+    extern __shared__ char _shared_mem[];
+
+    //
+    // Main method
+    //
+
+    template<class REAL> EXPORTGPUPMRI bool
+    estimate_b1_map_2D_NIH_Souheil( cuNDArray<complext<REAL> >* data, cuNDArray<complext<REAL> >* csm, size_t ks, size_t power, 
+                                    cuNDArray<complext<REAL> >& D, cuNDArray<complext<REAL> >& DH_D, 
+                                    cuNDArray<complext<REAL> >& V1, cuNDArray<complext<REAL> >& U1)
+    {
+        if( data->get_number_of_dimensions() < 2 )
+        {
+            cout << endl << "estimate_b1_map_2D_NIH_Souheil:: dimensionality mismatch." << endl; 
+            return false;
+        }
+
+        if ( !csm->dimensions_equal(data) )
+        {
+            csm->create(data->get_dimensions());
+        }
+
+        size_t kss = ks*ks;
+
+        {
+            assemble_D( data, &D, ks );
+        }
+
+        //{
+        //    std::string dstDir = "D:/software/Gadgetron/20130114/gadgetron/toolboxes/gtplus/ut/result/";
+        //    std::string filename = dstDir + "D.cplx";
+        //    write_cuNDArray_to_disk(&D, filename.c_str());
+        //}
+
+        {
+            computeDH_D( data, &D, &DH_D, kss );
+        }
+
+        //{
+        //    std::string dstDir = "D:/software/Gadgetron/20130114/gadgetron/toolboxes/gtplus/ut/result/";
+        //    std::string filename = dstDir + "DH_D.cplx";
+        //    write_cuNDArray_to_disk(&DH_D, filename.c_str());
+        //}
+
+        {
+            computeV1( data, &D, &DH_D, &V1, csm, power, kss);
+        }
+
+        //{
+        //    std::string dstDir = "D:/software/Gadgetron/20130114/gadgetron/toolboxes/gtplus/ut/result/";
+        //    std::string filename = dstDir + "V1.cplx";
+        //    write_cuNDArray_to_disk(&V1, filename.c_str());
+        //}
+
+        {
+            computeU1( data, &D, &V1, &U1, kss);
+        }
+
+        //{
+        //    std::string dstDir = "D:/software/Gadgetron/20130114/gadgetron/toolboxes/gtplus/ut/result/";
+        //    std::string filename = dstDir + "U1.cplx";
+        //    write_cuNDArray_to_disk(&U1, filename.c_str());
+        //}
+
+        {
+            extract_csm( data, &V1, &U1, csm, kss);
+        }
+
+        //{
+        //    std::string dstDir = "D:/software/Gadgetron/20130114/gadgetron/toolboxes/gtplus/ut/result/";
+        //    std::string filename = dstDir + "csm.cplx";
+        //    write_cuNDArray_to_disk(csm, filename.c_str());
+        //}
+
+        return true;
+    }
+
+    // assemble_D
+    template<class T> __global__ void
+    assemble_D_kernel( const T* __restrict__ pData, T* __restrict__ pD, int RO, int E1, int N, int CHA, int kss, int halfKs )
+    {
+        typedef typename realType<T>::Type REAL;
+
+        const unsigned int cha = threadIdx.y;
+
+        unsigned int n = (blockIdx.x*blockDim.x + threadIdx.x)/(RO*E1);
+        unsigned int e1 = (blockIdx.x*blockDim.x + threadIdx.x - n*RO*E1)/RO;
+        unsigned int ro = (blockIdx.x*blockDim.x + threadIdx.x - n*RO*E1)%RO;
+
+        // printf("ro=%d, e1=%d, cha=%d, n=%d\n", ro, e1, cha, n);
+
+        if( ro<RO && e1<E1 && n<N )
+        {
+            // printf("ro=%d, e1=%d\n", ro, e1);
+
+            unsigned int idx2D = cha*RO*E1*kss*N + n*RO*E1 + ro + e1*RO;
+
+            int kro, ke1, de1, dro;
+
+            if ( e1>=halfKs && e1<E1-halfKs && ro>=halfKs && ro<RO-halfKs )
+            {
+                // printf("e1>=halfKs && e1<E1-halfKs && ro>=halfKs && ro<RO-halfKs\n");
+
+                const T* pDataCurr = pData + n*RO*E1 + cha*RO*E1*N;
+
+                int ind=0;
+                for ( ke1=-halfKs; ke1<=halfKs; ke1++ )
+                {
+                    de1 = e1 + ke1;
+                    for ( kro=-halfKs; kro<=halfKs; kro++ )
+                    {
+                        pD[ind*RO*E1*N + idx2D] = pDataCurr[de1*RO+ro+kro];
+                        //printf("pD[idxD]=%f\n", pD[idxD].real());
+                        ind++;
+                    }
+                }
+            }
+            else
+            {
+                // printf("boundary\n");
+                const T* pDataCurr = pData + n*RO*E1 + cha*RO*E1*N;
+                int ind=0;
+                for ( ke1=-halfKs; ke1<=halfKs; ke1++ )
+                {
+                    de1 = e1 + ke1;
+                    if ( de1 < 0 ) de1 += E1;
+                    if ( de1 >= E1 ) de1 -= E1;
+
+                    for ( kro=-halfKs; kro<=halfKs; kro++ )
+                    {
+                        dro = ro + kro;
+                        if ( dro < 0 ) dro += RO;
+                        if ( dro >= RO ) dro -= RO;
+
+                        pD[ind*RO*E1*N+ idx2D] = pDataCurr[de1*RO+dro];
+                        ind++;
+                    }
+                }
+            }
+        }
+    }
+
+    template<class T>
+    void assemble_D( cuNDArray<T>* data, cuNDArray<T>* D, size_t ks )
+    {
+        size_t RO = data->get_size(0);
+        size_t E1 = data->get_size(1);
+        size_t N(1), CHA;
+
+        size_t NDim = data->get_number_of_dimensions();
+
+        if ( NDim == 3 )
+        {
+            CHA = data->get_size(2);
+        }
+
+        if ( NDim == 4 )
+        {
+            N = data->get_size(2);
+            CHA = data->get_size(3);
+        }
+
+        if ( ks%2 != 1 )
+        {
+            ks++;
+        }
+
+        size_t halfKs = ks/2;
+
+        // Setup block/grid dimensions
+        int cur_device = cudaDeviceManager::Instance()->getCurrentDevice();
+        int warp_size = cudaDeviceManager::Instance()->warp_size(cur_device);
+        int max_blockdim = cudaDeviceManager::Instance()->max_blockdim(cur_device);
+        dim3 blockDim(((max_blockdim/CHA)/warp_size)*warp_size, CHA);
+
+        if( blockDim.x == 0 )
+        {
+            blockDim.x = warp_size;
+            while ( blockDim.x*CHA*CHA > max_blockdim && blockDim.x>1 )
+            {
+                blockDim.x /= 2;
+            }
+
+            if ( blockDim.x <= 1 )
+            {
+                blockDim.x = 1;
+            }
+        }
+
+        dim3 gridDim((RO*E1*N+blockDim.x-1)/blockDim.x);
+
+        // Invoke kernel
+        assemble_D_kernel<T><<< gridDim, blockDim >>>( data->get_data_ptr(), D->get_data_ptr(), RO, E1, N, CHA, ks*ks, halfKs );
+
+        CHECK_FOR_CUDA_ERROR();
+    }
+
+    // compute DH_D
+    template<class T> __global__ void
+    computeDH_D_kernel( const T* __restrict__ pD, T* __restrict__ pDH_D, int RO, int E1, int N, int CHA, int kss )
+    {
+        typedef typename realType<T>::Type REAL;
+
+        // DH_D, [RO E1 CHA CHA_Prime]
+        const unsigned int cha = threadIdx.y;
+        const unsigned int cha_prime = threadIdx.z;
+
+        unsigned int n = (blockIdx.x*blockDim.x + threadIdx.x)/(RO*E1);
+        unsigned int e1 = (blockIdx.x*blockDim.x + threadIdx.x - n*RO*E1)/RO;
+        unsigned int ro = (blockIdx.x*blockDim.x + threadIdx.x - n*RO*E1)%RO;
+
+        if( ro<RO && e1<E1 && n<N )
+        {
+            unsigned int idx = ro + e1*RO + n*RO*E1;
+
+            // every thread compute an element of DH_D for a pixel
+            int k;
+            T v;
+            v = 0;
+            for ( k=0; k<kss; k++ )
+            {
+                v += conj(pD[cha*RO*E1*N*kss + k*RO*E1*N + idx])*pD[cha_prime*RO*E1*N*kss + k*RO*E1*N + idx];
+            }
+
+            pDH_D[cha_prime*RO*E1*N*CHA + cha*RO*E1*N + idx] = v;
+        }
+    }
+
+    // use the shared memory
+    template<class T> __global__ void
+    computeDH_D_kernel3( const T*  __restrict__ pD, T* __restrict__ pDH_D, int RO, int E1, int N, int CHA, int kss, int ks, int num )
+    {
+        typedef typename realType<T>::Type REAL;
+
+        // DH_D, [RO E1 CHA CHA_Prime]
+        const unsigned int cha = threadIdx.y;
+        const unsigned int cha_prime = threadIdx.z;
+
+        unsigned int n = (blockIdx.x*blockDim.x + threadIdx.x)/(RO*E1);
+        unsigned int e1 = (blockIdx.x*blockDim.x + threadIdx.x - n*RO*E1)/RO;
+        unsigned int ro = (blockIdx.x*blockDim.x + threadIdx.x - n*RO*E1)%RO;
+
+        if( ro<RO && e1<E1 && n<N )
+        {
+            unsigned int idx = ro + e1*RO + n*RO*E1;
+            unsigned int idxD = idx + cha*RO*E1*N*kss;
+            unsigned int idxShared = threadIdx.x*kss*CHA;
+
+            T *shared_mem = (T*) _shared_mem;
+
+            int k;
+
+            if ( cha_prime == 0 )
+            {
+                for ( k=0; k<kss; k++ )
+                {
+                    shared_mem[idxShared + k + cha*kss ] = pD[idxD + k*RO*E1*N ];
+                }
+            }
+
+            __syncthreads();
+
+            T v = conj(shared_mem[idxShared + cha*kss])*shared_mem[idxShared + cha_prime*kss];
+            for ( k=1; k<kss; k++ )
+            {
+                v += conj(shared_mem[idxShared + cha*kss + k])*shared_mem[idxShared + cha_prime*kss + k];
+            }
+
+            pDH_D[cha_prime*RO*E1*N*CHA + cha*RO*E1*N + idx] = v;
+        }
+    }
+
+    template<class T>
+    void computeDH_D( cuNDArray<T>* data, cuNDArray<T>* D, cuNDArray<T>* DH_D, size_t kss )
+    {
+        size_t RO = data->get_size(0);
+        size_t E1 = data->get_size(1);
+        size_t N(1), CHA;
+
+        size_t NDim = data->get_number_of_dimensions();
+
+        if ( NDim == 3 )
+        {
+            CHA = data->get_size(2);
+        }
+
+        if ( NDim == 4 )
+        {
+            N = data->get_size(2);
+            CHA = data->get_size(3);
+        }
+
+        // Setup block/grid dimensions
+        int cur_device = cudaDeviceManager::Instance()->getCurrentDevice();
+        int warp_size = cudaDeviceManager::Instance()->warp_size(cur_device);
+        int max_blockdim = cudaDeviceManager::Instance()->max_blockdim(cur_device);
+        size_t shared_mem_per_block = cudaDeviceManager::Instance()->shared_mem_per_block(cur_device);
+
+        // estimate how many pixels a block can process
+        size_t ks = (size_t)std::sqrt((double)kss);
+
+        // size_t numOfPixels = shared_mem_per_block/4/(sizeof(T)*(kss+ks)*CHA);
+        size_t numOfPixels = shared_mem_per_block/4/(sizeof(T)*kss*CHA);
+
+        while ( numOfPixels*ks*CHA>max_blockdim && numOfPixels>0 )
+        {
+            numOfPixels--;
+        }
+
+        if ( numOfPixels > 0 )
+        {
+            dim3 blockDim(numOfPixels, CHA, CHA);
+
+            dim3 gridDim((RO*E1*N+blockDim.x-1)/blockDim.x);
+
+            computeDH_D_kernel3<T><<< gridDim, blockDim, numOfPixels*sizeof(T)*kss*CHA >>>( D->get_data_ptr(), DH_D->get_data_ptr(), RO, E1, N, CHA, kss, ks, numOfPixels );
+        }
+        else
+        {
+            dim3 blockDim(((max_blockdim/(CHA*CHA))/warp_size)*warp_size, CHA, CHA);
+
+            if( blockDim.x == 0 )
+            {
+                blockDim.x = warp_size;
+                while ( blockDim.x*CHA*CHA > max_blockdim && blockDim.x>1 )
+                {
+                    blockDim.x /= 2;
+                }
+
+                if ( blockDim.x <= 1 )
+                {
+                    blockDim.x = 1;
+                }
+            }
+
+            dim3 gridDim((RO*E1*N+blockDim.x-1)/blockDim.x);
+
+            // Invoke kernel
+            computeDH_D_kernel<T><<< gridDim, blockDim >>>( D->get_data_ptr(), DH_D->get_data_ptr(), RO, E1, N, CHA, kss );
+        }
+
+        CHECK_FOR_CUDA_ERROR();
+    }
+
+    // compute V1
+    template<class T> __global__ void
+    computeV1_kernel( const T* __restrict__ pD, T* __restrict__ pV1, int RO, int E1, int N, int CHA, int kss )
+    {
+        typedef typename realType<T>::Type REAL;
+
+        const unsigned int cha = threadIdx.y;
+        unsigned int n = (blockIdx.x*blockDim.x + threadIdx.x)/(RO*E1);
+        unsigned int e1 = (blockIdx.x*blockDim.x + threadIdx.x - n*RO*E1)/RO;
+        unsigned int ro = (blockIdx.x*blockDim.x + threadIdx.x - n*RO*E1)%RO;
+
+        if( ro<RO && e1<E1 && n<N )
+        {
+            unsigned int idx = ro + e1*RO + n*RO*E1;
+            unsigned int idxD = cha*RO*E1*N*kss + idx;
+
+            T v = 0;
+            for ( int ii=0; ii<kss; ii++ )
+            {
+                v += pD[idxD + ii*RO*E1*N];
+            }
+            pV1[cha*RO*E1*N + idx] = v;
+        }
+    }
+
+    template<class T> __global__ void
+    power_method_kernel( const T* __restrict__ pDH_D, T* __restrict__ pV1,  T* __restrict__ pV, unsigned int RO, unsigned int E1, unsigned int N, unsigned int CHA, unsigned int kss, unsigned int power )
+    {
+        typedef typename realType<T>::Type REAL;
+
+        const unsigned int ro = blockIdx.x*blockDim.x+threadIdx.x;
+        const unsigned int e1 = blockIdx.y*blockDim.y+threadIdx.y;
+        unsigned int n = blockIdx.z;
+
+        if( ro<RO && e1<E1 && n<N )
+        {
+            unsigned int cha;
+
+            unsigned int idx2D = ro + e1*RO + n*RO*E1;
+
+            unsigned int N3D = RO*E1*N;
+
+            REAL v1Norm(0);
+            for ( cha=0; cha<CHA; cha++ )
+            {
+                v1Norm += norm(pV1[cha*N3D + idx2D]);
+            }
+            v1Norm = ::sqrt(v1Norm);
+
+            for ( cha=0; cha<CHA; cha++ )
+            {
+                pV1[cha*N3D + idx2D] /= v1Norm;
+            }
+
+            unsigned int po;
+            for ( po=0; po<power; po++ )
+            {
+                for( unsigned j=0; j<CHA; j++)
+                {
+                    T v = 0;
+                    for( unsigned int k=0; k<CHA; k++)
+                    {
+                        v += pDH_D[k*CHA*N3D+j*N3D+idx2D]*pV1[k*N3D+idx2D];
+                    }
+                    pV[j*N3D+idx2D] = v;
+                }
+
+                for ( cha=0; cha<CHA; cha++ )
+                {
+                    pV1[cha*N3D + idx2D] = pV[cha*N3D + idx2D];
+                }
+
+                v1Norm = 0;
+                for ( cha=0; cha<CHA; cha++ )
+                {
+                    v1Norm += norm(pV1[cha*N3D + idx2D]);
+                }
+                v1Norm = 1/std::sqrt(v1Norm);
+
+                for ( cha=0; cha<CHA; cha++ )
+                {
+                    pV1[cha*N3D + idx2D] *= v1Norm;
+                }
+            }
+        }
+    }
+
+    template<class T>
+    void computeV1( cuNDArray<T>* data, cuNDArray<T>* D, cuNDArray<T>* DH_D, cuNDArray<T>* V1, cuNDArray<T>* V, int power, int kss)
+    {
+        size_t RO = data->get_size(0);
+        size_t E1 = data->get_size(1);
+        size_t N(1), CHA;
+
+        size_t NDim = data->get_number_of_dimensions();
+
+        if ( NDim == 3 )
+        {
+            CHA = data->get_size(2);
+        }
+
+        if ( NDim == 4 )
+        {
+            N = data->get_size(2);
+            CHA = data->get_size(3);
+        }
+
+        // Setup block/grid dimensions
+        int cur_device = cudaDeviceManager::Instance()->getCurrentDevice();
+        int warp_size = cudaDeviceManager::Instance()->warp_size(cur_device);
+        int max_blockdim = cudaDeviceManager::Instance()->max_blockdim(cur_device);
+        dim3 blockDim(((max_blockdim/CHA)/warp_size)*warp_size, CHA);
+
+        if( blockDim.x == 0 )
+        {
+            GERROR_STREAM("blockDim.x == 0");
+            throw std::runtime_error("computeV1: dimension exceeds device capacity.");
+        }
+
+        dim3 gridDim((RO*E1*N+blockDim.x-1)/blockDim.x);
+
+        // Invoke kernel
+        computeV1_kernel<T><<< gridDim, blockDim >>>( D->get_data_ptr(), V1->get_data_ptr(), RO, E1, N, CHA, kss );
+
+        // power method
+        dim3 blockDim2(16, 16);
+        dim3 gridDim2((RO+blockDim2.x-1)/blockDim2.x, (E1+blockDim2.y-1)/blockDim2.y, N);
+
+        power_method_kernel<T><<< gridDim2, blockDim2 >>>( DH_D->get_data_ptr(), V1->get_data_ptr(), V->get_data_ptr(), RO, E1, N, CHA, kss, power );
+
+        CHECK_FOR_CUDA_ERROR();
+    }
+
+    // compute U1
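+    // Per pixel, U1 = D * V1: the local kss x CHA data matrix is applied to the dominant
+    // eigenvector, giving the corresponding left singular direction whose phase is used below.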
+    template<class T> __global__ void
+    computeU1_kernel(const  T* __restrict__ pD, const T* __restrict__ pV1, T* __restrict__ pU1, int RO, int E1, int N, int CHA, int kss )
+    {
+        typedef typename realType<T>::Type REAL;
+
+        const unsigned int k = threadIdx.y;
+        unsigned int n = (blockIdx.x*blockDim.x + threadIdx.x)/(RO*E1);
+        unsigned int e1 = (blockIdx.x*blockDim.x + threadIdx.x - n*RO*E1)/RO;
+        unsigned int ro = (blockIdx.x*blockDim.x + threadIdx.x - n*RO*E1)%RO;
+
+        if( ro<RO && e1<E1 && n<N )
+        {
+            unsigned int idx = ro + e1*RO + n*RO*E1;
+            unsigned int idxD = k*RO*E1*N + idx;
+
+            T v = 0;
+            for ( int ii=0; ii<CHA; ii++ )
+            {
+                v += pD[idxD + ii*kss*RO*E1*N] * pV1[ii*RO*E1*N+idx];
+            }
+            pU1[k*RO*E1*N + idx] = v;
+        }
+    }
+
+    template<class T>
+    void computeU1( cuNDArray<T>* data, cuNDArray<T>* D, cuNDArray<T>* V1, cuNDArray<T>* U1, int kss)
+    {
+        size_t RO = data->get_size(0);
+        size_t E1 = data->get_size(1);
+        size_t N(1), CHA;
+
+        size_t NDim = data->get_number_of_dimensions();
+
+        if ( NDim == 3 )
+        {
+            CHA = data->get_size(2);
+        }
+
+        if ( NDim == 4 )
+        {
+            N = data->get_size(2);
+            CHA = data->get_size(3);
+        }
+
+        // Setup block/grid dimensions
+        int cur_device = cudaDeviceManager::Instance()->getCurrentDevice();
+        int warp_size = cudaDeviceManager::Instance()->warp_size(cur_device);
+        int max_blockdim = cudaDeviceManager::Instance()->max_blockdim(cur_device);
+        dim3 blockDim(((max_blockdim/kss)/warp_size)*warp_size, kss);
+
+        if( blockDim.x == 0 )
+        {
+            // GERROR_STREAM("blockDim.x == 0");
+            blockDim.x = warp_size;
+            while ( blockDim.x*kss > max_blockdim && blockDim.x>1 )
+            {
+                blockDim.x /= 2;
+            }
+
+            if ( blockDim.x <= 1 )
+            {
+                blockDim.x = 1;
+            }
+        }
+
+        dim3 gridDim((RO*E1*N+blockDim.x-1)/blockDim.x);
+
+        // Invoke kernel
+        computeU1_kernel<T><<< gridDim, blockDim >>>( D->get_data_ptr(), V1->get_data_ptr(), U1->get_data_ptr(), RO, E1, N, CHA, kss );
+
+        CHECK_FOR_CUDA_ERROR();
+    }
+
+    // extract the csm
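+    // The coil map is conj(V1) scaled by the unit-magnitude mean phase of U1, i.e. the average
+    // object phase over the kernel is folded into the coil maps rather than the combined image.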
+    template<class T> __global__ void
+    extract_csm_kernel( const T* __restrict__ pV1, const T* __restrict__ pU1, T* __restrict__ pCSM, unsigned int RO, unsigned int E1, unsigned int N, unsigned int CHA, unsigned int kss )
+    {
+        typedef typename realType<T>::Type REAL;
+
+        const unsigned int ro = blockIdx.x*blockDim.x+threadIdx.x;
+        const unsigned int e1 = blockIdx.y*blockDim.y+threadIdx.y;
+        unsigned int n = blockIdx.z;
+
+        if( ro<RO && e1<E1 && n<N )
+        {
+            unsigned int cha;
+            unsigned int idx = ro + e1*RO + n*RO*E1;
+
+            T phaseU1 = pU1[idx];
+            for ( int po=1; po<kss; po++ )
+            {
+                phaseU1 += pU1[idx + po*RO*E1*N];
+            }
+            phaseU1 /= abs(phaseU1);
+
+            // put the mean object phase to coil map
+            for ( cha=0; cha<CHA; cha++ )
+            {
+                pCSM[cha*RO*E1*N+idx] = phaseU1 * conj(pV1[cha*RO*E1*N+idx]);
+            }
+        }
+    }
+
+    template<class T>
+    void extract_csm( cuNDArray<T>* data, cuNDArray<T>* V1, cuNDArray<T>* U1, cuNDArray<T>* csm, int kss)
+    {
+        size_t RO = data->get_size(0);
+        size_t E1 = data->get_size(1);
+        size_t N(1), CHA;
+
+        size_t NDim = data->get_number_of_dimensions();
+
+        if ( NDim == 3 )
+        {
+            CHA = data->get_size(2);
+        }
+
+        if ( NDim == 4 )
+        {
+            N = data->get_size(2);
+            CHA = data->get_size(3);
+        }
+
+        // Setup block/grid dimensions
+        dim3 blockDim(16, 16);
+        dim3 gridDim((RO+blockDim.x-1)/blockDim.x, (E1+blockDim.y-1)/blockDim.y, N);
+
+        extract_csm_kernel<T><<< gridDim, blockDim >>>( V1->get_data_ptr(), U1->get_data_ptr(), csm->get_data_ptr(), RO, E1, N, CHA, kss );
+
+        CHECK_FOR_CUDA_ERROR();
+    }
+
+    //
+    // Template instantiation
+    //
+    template EXPORTGPUPMRI bool estimate_b1_map_2D_NIH_Souheil<float>( cuNDArray<complext<float> >* data, cuNDArray<complext<float> >* csm, size_t ks, size_t power,
+                                    cuNDArray<complext<float> >& D, cuNDArray<complext<float> >& DH_D, 
+                                    cuNDArray<complext<float> >& V1, cuNDArray<complext<float> >& U1 );
+}
diff --git a/toolboxes/mri/pmri/gpu/b1map_test.cu b/toolboxes/mri/pmri/gpu/b1map_test.cu
new file mode 100644
index 0000000..0a77192
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/b1map_test.cu
@@ -0,0 +1,48 @@
+#include "b1_map.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDArray.h"
+#include "ndarray_vector_td_utilities.hcu"
+#include "NFFT.h"
+#include "check_CUDA.h"
+
+#include <cutil.h>
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+int main( int argc, char** argv) 
+{
+  hoNDArray<float_complext::Type> host_data = 
+    read_nd_array<float_complext::Type>("b1_mapping_data/coil_images.cplx");
+  
+  //hoNDArray<float_complext::Type> host_data = 
+  //read_nd_array<float_complext::Type>("b1_mapping_data/5ch.cplx");
+  
+  if( host_data.get_number_of_dimensions() != 3 ){
+    printf("\nInput data is not three-dimensional (a series of images). Quitting!\n");
+    exit(1);
+  }
+  
+  // Copy the image data to the device
+  cuNDArray<float_complext::Type> device_data(host_data);
+  
+  unsigned int timer; cutCreateTimer(&timer); double time;
+  printf("\nComputing CSM..."); fflush(stdout);
+  cutResetTimer( timer ); cutStartTimer( timer );
+  
+  // Compute CSM
+  boost::shared_ptr< cuNDArray<float_complext::Type> > csm = estimate_b1_map<float,2>( &device_data );
+  
+  cudaThreadSynchronize(); cutStopTimer( timer );
+  time = cutGetTimerValue( timer ); printf("done: %.1f ms.", time ); fflush(stdout);
+
+  // Output result
+
+  hoNDArray<float_complext::Type> host_csm = csm->to_host();
+  write_nd_array<float_complext::Type>( host_csm, "csm.cplx" );
+
+  printf("\n"); fflush(stdout);
+
+  CHECK_FOR_CUDA_ERROR();
+  return 0;
+}
diff --git a/toolboxes/mri/pmri/gpu/cuBuffer.cpp b/toolboxes/mri/pmri/gpu/cuBuffer.cpp
new file mode 100644
index 0000000..260af2a
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuBuffer.cpp
@@ -0,0 +1,197 @@
+#include "cuBuffer.h"
+#include "vector_td_utilities.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+
+namespace Gadgetron{
+
+  template<class REAL, unsigned int D, bool ATOMICS>
+  cuBuffer<REAL,D,ATOMICS>::cuBuffer() 
+  {
+    acc_buffer_ = boost::shared_ptr< cuNDArray<_complext> >(new cuNDArray<_complext>);
+    cyc_buffer_ = boost::shared_ptr< cuNDArray<_complext> >(new cuNDArray<_complext>);
+    nfft_plan_  = boost::shared_ptr< cuNFFT_plan<REAL,D,ATOMICS> >(new cuNFFT_plan<REAL,D,ATOMICS>);
+    num_coils_ = 0;
+    cur_idx_ = cur_sub_idx_ = 0;
+    cycle_length_ = 0; sub_cycle_length_ = 0;
+    acc_buffer_empty_ = true;
+    Gadgetron::clear(matrix_size_);
+    Gadgetron::clear(matrix_size_os_);
+    W_ = REAL(0);
+  }
+  
+  template<class REAL, unsigned int D, bool ATOMICS>
+  void cuBuffer<REAL,D,ATOMICS>::clear()
+  {
+    Gadgetron::clear(acc_buffer_.get());
+    Gadgetron::clear(cyc_buffer_.get());
+    cur_idx_ = cur_sub_idx_ = 0;
+    acc_buffer_empty_ = true;
+  }
+
+  template<class REAL, unsigned int D, bool ATOMICS>
+  void cuBuffer<REAL,D,ATOMICS>
+  ::setup( _uint64d matrix_size, _uint64d matrix_size_os, REAL W, 
+           unsigned int num_coils, unsigned int num_cycles, unsigned int num_sub_cycles )
+  {      
+    bool matrix_size_changed = (matrix_size_ != matrix_size);
+    bool matrix_size_os_changed = (matrix_size_os_ != matrix_size_os);
+    bool kernel_changed = (W_ != W);
+    bool num_coils_changed = (num_coils_ != num_coils);
+    bool num_cycles_changed = (cycle_length_ != num_cycles+1);
+
+    matrix_size_ = matrix_size;
+    matrix_size_os_ = matrix_size_os;
+    W_ = W;
+    num_coils_ = num_coils;
+    cycle_length_ = num_cycles+1; // +1 as we need a "working buffer" in addition to 'cycle_length' full ones
+    sub_cycle_length_ = num_sub_cycles;
+
+    if( !nfft_plan_->is_setup() || matrix_size_changed || matrix_size_os_changed || kernel_changed ){
+      nfft_plan_->setup( matrix_size_, matrix_size_os_, W );
+    }
+    
+    std::vector<size_t> dims = to_std_vector(matrix_size_os_);    
+    dims.push_back(num_coils_);
+
+    if( acc_buffer_->get_number_of_elements() == 0 || matrix_size_os_changed || num_coils_changed ){
+      acc_buffer_->create(&dims);
+      Gadgetron::clear( acc_buffer_.get() );
+    }
+
+    dims.push_back(cycle_length_);
+    if( cyc_buffer_->get_number_of_elements() == 0 || matrix_size_os_changed || num_coils_changed ){
+      cyc_buffer_->create(&dims);      
+      Gadgetron::clear( cyc_buffer_.get() );
+    }
+    else if( num_cycles_changed ){
+      // Reuse the old buffer content in this case...
+      // This happens automatically (in all cases?) with the current design?
+    }
+  }
+  
+  template<class REAL, unsigned int D, bool ATOMICS> 
+  bool cuBuffer<REAL,D,ATOMICS>::add_frame_data( cuNDArray<_complext> *samples, cuNDArray<_reald> *trajectory )
+  {
+    if( !samples || !trajectory ){
+      throw std::runtime_error("cuBuffer::add_frame_data: illegal input pointer");
+    }
+
+    if( num_coils_ != samples->get_size(samples->get_number_of_dimensions()-1) ){
+      throw std::runtime_error("cuBuffer::add_frame_data: unexpected number of coils according to setup");
+    }
+
+    //if( dcw_.get() == 0x0 ){
+    //throw std::runtime_error("cuBuffer::density compensation weights not set");
+    //}
+    
+    // Make array containing the "current" buffer from the cyclic buffer
+    //
+
+    cuNDArray<_complext> cur_buffer(acc_buffer_->get_dimensions().get(),
+				    cyc_buffer_->get_data_ptr()+cur_idx_*acc_buffer_->get_number_of_elements());
+
+    // Preprocess frame
+    //
+
+    nfft_plan_->preprocess( trajectory, cuNFFT_plan<REAL,D,ATOMICS>::NFFT_PREP_NC2C );
+    
+    // Convolve to form k-space frame (accumulation mode)
+    //
+    
+    nfft_plan_->convolve( samples, &cur_buffer, dcw_.get(), cuNFFT_plan<REAL,D,ATOMICS>::NFFT_CONV_NC2C, true );
+
+    // Update the accumulation buffer (if it is time...)
+    //
+
+    bool cycle_completed = false;
+
+    if( cur_sub_idx_ == sub_cycle_length_-1 ){
+
+      cycle_completed = true;
+      
+      // Buffer complete, add to accumulation buffer
+      //
+
+      *acc_buffer_ += cur_buffer;
+      acc_buffer_empty_ = false;
+
+      // Start filling the next buffer in the cycle ...
+      //
+
+      cur_idx_++; 
+      if( cur_idx_ == cycle_length_ ) cur_idx_ = 0;
+
+      // ... but first subtract this next buffer from the accumulation buffer
+      //
+
+      cur_buffer.create( acc_buffer_->get_dimensions().get(), cyc_buffer_->get_data_ptr()+cur_idx_*acc_buffer_->get_number_of_elements() );
+      *acc_buffer_ -= cur_buffer;
+
+      // Clear new buffer before refilling
+      //
+
+      Gadgetron::clear(&cur_buffer);
+    }
+
+    cur_sub_idx_++;
+    if( cur_sub_idx_ == sub_cycle_length_ ) cur_sub_idx_ = 0;
+
+    return cycle_completed;
+  }
+
+  template<class REAL, unsigned int D, bool ATOMICS>
+  boost::shared_ptr< cuNDArray<complext<REAL> > > cuBuffer<REAL,D,ATOMICS>::get_accumulated_coil_images()
+  {
+    std::vector<size_t> dims = to_std_vector(matrix_size_);
+    dims.push_back(num_coils_);
+
+    acc_image_ = boost::shared_ptr< cuNDArray<_complext> >( new cuNDArray<_complext>(&dims) );
+				    
+    // Check if we are ready to reconstruct. If not return an image of ones...
+    if( acc_buffer_empty_ ){
+      fill(acc_image_.get(),_complext(1));
+      return acc_image_;
+    }
+
+    // Finalize gridding of k-space CSM image (convolution has been done already)
+    //
+
+    // Copy accumulation buffer before in-place FFT
+    cuNDArray<_complext> acc_copy = *acc_buffer_;
+
+    // FFT
+    nfft_plan_->fft( &acc_copy, cuNFFT_plan<REAL,D,ATOMICS>::NFFT_BACKWARDS );
+    
+    // Deapodize
+    nfft_plan_->deapodize( &acc_copy );
+    
+    // Remove oversampling
+    crop<_complext,D>( (matrix_size_os_-matrix_size_)>>1, &acc_copy, acc_image_.get() );
+    
+    //if( normalize ){
+    //REAL scale = REAL(1)/(((REAL)cycle_length_-REAL(1))*(REAL)sub_cycle_length_);
+    //*acc_image_ *= scale;
+    //}
+    
+    return acc_image_;
+  }
+
+  //
+  // Instantiations
+  //
+  
+  template class EXPORTGPUPMRI cuBuffer<float,2,true>;
+  template class EXPORTGPUPMRI cuBuffer<float,2,false>;
+  
+  template class EXPORTGPUPMRI cuBuffer<float,3,true>;
+  template class EXPORTGPUPMRI cuBuffer<float,3,false>;
+  
+  template class EXPORTGPUPMRI cuBuffer<float,4,true>;
+  template class EXPORTGPUPMRI cuBuffer<float,4,false>;
+  
+  template class EXPORTGPUPMRI cuBuffer<double,2,false>;
+  template class EXPORTGPUPMRI cuBuffer<double,3,false>;
+  template class EXPORTGPUPMRI cuBuffer<double,4,false>;
+}
diff --git a/toolboxes/mri/pmri/gpu/cuBuffer.h b/toolboxes/mri/pmri/gpu/cuBuffer.h
new file mode 100644
index 0000000..9fdd316
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuBuffer.h
@@ -0,0 +1,61 @@
+#pragma once
+
+#include "vector_td_utilities.h"
+#include "complext.h"
+#include "cuNDArray.h"
+#include "cuNFFT.h"
+#include "gpupmri_export.h"
+
+#include <boost/shared_ptr.hpp>
+
+namespace Gadgetron{
+  
+  template<class REAL, unsigned int D, bool ATOMICS = false> class EXPORTGPUPMRI cuBuffer
+  {
+  public:
+    
+    typedef complext<REAL> _complext;
+    typedef typename uint64d<D>::Type _uint64d;
+    typedef typename reald<REAL,D>::Type _reald;
+
+    cuBuffer();
+    virtual ~cuBuffer() {}
+    
+    virtual void set_dcw( boost::shared_ptr< cuNDArray<REAL> > dcw ){
+      dcw_ = dcw;
+    }
+    
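+    // The accumulation buffer holds (cycle_length_-1) completed buffers of sub_cycle_length_ frames
+    // each; scaling by this factor turns the accumulated sum into an average over those frames.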
+    inline REAL get_normalization_factor(){
+      return REAL(1)/(((REAL)cycle_length_-REAL(1))*(REAL)sub_cycle_length_);
+    }
+    
+    virtual void clear();
+
+    virtual void setup( _uint64d matrix_size, _uint64d matrix_size_os, REAL W, 
+                        unsigned int num_coils, unsigned int num_cycles, unsigned int num_sub_cycles );
+
+    // Boolean return value indicates whether the accumulation buffer has changed (i.e. a cycle has been completed)
+    virtual bool add_frame_data( cuNDArray<_complext> *samples, cuNDArray<_reald> *trajectory ); 
+
+    virtual boost::shared_ptr< cuNDArray< complext<REAL> > > get_accumulated_coil_images();
+
+    // Workaround for weird boost/g++ error
+    virtual boost::shared_ptr< cuNDArray< complext<REAL> > > get_combined_coil_image() = 0;
+    
+  protected:
+    _uint64d matrix_size_, matrix_size_os_;
+    REAL W_;
+    unsigned int num_coils_;
+    unsigned int cycle_length_, sub_cycle_length_;
+    unsigned int cur_idx_, cur_sub_idx_;
+    bool acc_buffer_empty_;
+    boost::shared_ptr< cuNDArray<_complext> > acc_buffer_;
+    boost::shared_ptr< cuNDArray<_complext> > cyc_buffer_;
+    boost::shared_ptr< cuNDArray<_complext> > acc_image_;
+    boost::shared_ptr< cuNDArray<REAL> > dcw_;
+    boost::shared_ptr< cuNFFT_plan<REAL,D,ATOMICS> > nfft_plan_;
+  };
+
+  // To prevent the use of atomics with doubles.
+  template<unsigned int D> class EXPORTGPUPMRI cuBuffer<double,D,true>{};  
+}
diff --git a/toolboxes/mri/pmri/gpu/cuCartesianSenseOperator.cu b/toolboxes/mri/pmri/gpu/cuCartesianSenseOperator.cu
new file mode 100644
index 0000000..8b18fc2
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuCartesianSenseOperator.cu
@@ -0,0 +1,133 @@
+#include "cuCartesianSenseOperator.h"
+#include "cuNDFFT.h"
+
+#include <sstream>
+
+using namespace Gadgetron;
+
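+// Gather kernel: for each sampled k-space index, add the value at the corresponding Cartesian
+// grid location (given by idx) from every coil image into the output sample vector.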
+template<class REAL> __global__ void 
+sample_array_kernel( const complext<REAL> * __restrict__ in, complext<REAL> * __restrict__ out,
+		     unsigned int *idx, 
+		     unsigned int image_elements,
+		     unsigned int samples,
+		     unsigned int coils )
+{
+  unsigned int idx_in = blockIdx.x*blockDim.x+threadIdx.x;
+  if (idx_in < samples) {
+    for (unsigned int i = 0; i < coils; i++) {
+      out[idx_in + i*samples].vec[0] += in[idx[idx_in] + i*image_elements].vec[0];
+      out[idx_in + i*samples].vec[1] += in[idx[idx_in] + i*image_elements].vec[1];
+    }
+  }
+}
+
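+// Scatter kernel (adjoint of the gather above): each sample is added back onto its Cartesian
+// grid location for every coil.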
+template<class REAL> __global__ void 
+insert_samples_kernel( const complext<REAL> * __restrict__ in, complext<REAL> * __restrict__ out,
+		       unsigned int *idx, 
+		       unsigned int image_elements,
+		       unsigned int samples,
+		       unsigned int coils )
+{
+  unsigned int idx_in = blockIdx.x*blockDim.x+threadIdx.x;
+  if (idx_in < samples) {
+    for (unsigned int i = 0; i < coils; i++) {
+      out[idx[idx_in] + i*image_elements].vec[0] += in[idx_in + i*samples].vec[0];
+      out[idx[idx_in] + i*image_elements].vec[1] += in[idx_in + i*samples].vec[1];
+    }
+  }
+}
+
+template<class REAL, unsigned int D> void
+cuCartesianSenseOperator<REAL,D>::mult_M( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate )
+{
+  if (!(in->dimensions_equal(this->get_domain_dimensions().get())) || !(out->dimensions_equal(this->get_codomain_dimensions().get())) ) {
+    throw std::runtime_error("cuCartesianSenseOperator::mult_M dimensions mismatch");
+  }
+  
+  std::vector<size_t> full_dimensions = *this->get_domain_dimensions();
+  full_dimensions.push_back(this->ncoils_);
+  cuNDArray< complext<REAL> > tmp(&full_dimensions);
+
+  this->mult_csm(in,&tmp);
+
+
+  std::vector<size_t> ft_dims;
+  for (unsigned int i = 0; i < this->get_domain_dimensions()->size(); i++) {
+    ft_dims.push_back(i);
+  }
+
+  cuNDFFT<REAL>::instance()->fft(&tmp, &ft_dims);
+
+  if (!accumulate) 
+    clear(out);
+
+  dim3 blockDim(512,1,1);
+  dim3 gridDim((unsigned int) std::ceil((double)idx_->get_number_of_elements()/blockDim.x), 1, 1 );
+  sample_array_kernel<REAL><<< gridDim, blockDim >>>( tmp.get_data_ptr(), out->get_data_ptr(), idx_->get_data_ptr(),
+						      in->get_number_of_elements(), idx_->get_number_of_elements(), this->ncoils_);
+  cudaError_t err = cudaGetLastError();
+  if( err != cudaSuccess ){
+    std::stringstream ss;
+    ss <<"cuCartesianSenseOperator::mult_M : Unable to sample data: " <<
+      cudaGetErrorString(err);
+    throw cuda_error(ss.str());
+  }
+}
+
+template<class REAL, unsigned int D> void
+cuCartesianSenseOperator<REAL,D>::mult_MH(cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate)
+{
+  if (!(out->dimensions_equal(this->get_domain_dimensions().get())) || 
+      !(in->dimensions_equal(this->get_codomain_dimensions().get())) ) {
+    throw std::runtime_error( "cuCartesianSenseOperator::mult_MH dimensions mismatch");
+
+  }
+
+  std::vector<size_t> tmp_dimensions = *this->get_domain_dimensions();
+  tmp_dimensions.push_back(this->ncoils_);
+
+  cuNDArray< complext<REAL> > tmp(&tmp_dimensions);
+  clear(&tmp);
+
+  dim3 blockDim(512,1,1);
+  dim3 gridDim((unsigned int) std::ceil((double)idx_->get_number_of_elements()/blockDim.x), 1, 1 );
+  insert_samples_kernel<REAL><<< gridDim, blockDim >>>( in->get_data_ptr(), tmp.get_data_ptr(),
+							idx_->get_data_ptr(),out->get_number_of_elements(),
+							idx_->get_number_of_elements(), this->ncoils_);
+  
+  cudaError_t err = cudaGetLastError();
+  if( err != cudaSuccess ){
+    std::stringstream ss;
+    ss << "cuCartesianSenseOperator::mult_MH : Unable to insert samples into array: " <<
+      cudaGetErrorString(err);
+    throw cuda_error(ss.str());
+  }
+
+
+  std::vector<size_t> ft_dims;
+  for (unsigned int i = 0; i < this->get_domain_dimensions()->size(); i++) {
+    ft_dims.push_back(i);
+  }
+
+  cuNDFFT<REAL>::instance()->ifft(&tmp, &ft_dims);
+
+  if (!accumulate) 
+    clear(out);
+  
+  this->mult_csm_conj_sum(&tmp,out);
+}
+
+//
+// Instantiations
+//
+
+template class EXPORTGPUPMRI cuCartesianSenseOperator<float,1>;
+template class EXPORTGPUPMRI cuCartesianSenseOperator<float,2>;
+template class EXPORTGPUPMRI cuCartesianSenseOperator<float,3>;
+template class EXPORTGPUPMRI cuCartesianSenseOperator<float,4>;
+
+template class EXPORTGPUPMRI cuCartesianSenseOperator<double,1>;
+template class EXPORTGPUPMRI cuCartesianSenseOperator<double,2>;
+template class EXPORTGPUPMRI cuCartesianSenseOperator<double,3>;
+template class EXPORTGPUPMRI cuCartesianSenseOperator<double,4>;
+
diff --git a/toolboxes/mri/pmri/gpu/cuCartesianSenseOperator.h b/toolboxes/mri/pmri/gpu/cuCartesianSenseOperator.h
new file mode 100644
index 0000000..2b292fa
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuCartesianSenseOperator.h
@@ -0,0 +1,35 @@
+/** \file cuCartesianSenseOperator.h
+    \brief Cartesian Sense operator, GPU based.
+*/
+
+#pragma once
+
+#include "cuSenseOperator.h"
+
+namespace Gadgetron{
+  
+  template<class REAL, unsigned int D> class EXPORTGPUPMRI cuCartesianSenseOperator : public cuSenseOperator<REAL,D>
+  {
+  public:
+    
+    cuCartesianSenseOperator() : cuSenseOperator<REAL,D>() {}
+    virtual ~cuCartesianSenseOperator() {}
+    
+    virtual void mult_M( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate = false);
+    virtual void mult_MH( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate = false);
+    
+    virtual void set_sampling_indices( boost::shared_ptr< cuNDArray<unsigned int> > idx) 
+    {
+      if (idx.get()) {
+	idx_ = idx;
+	std::vector<size_t> tmp_dims;
+	tmp_dims.push_back(idx_->get_number_of_elements());
+	tmp_dims.push_back(this->ncoils_);
+	this->set_codomain_dimensions(&tmp_dims);
+      }
+    }
+    
+  protected:
+    boost::shared_ptr< cuNDArray<unsigned int> > idx_;
+  };
+}
diff --git a/toolboxes/mri/pmri/gpu/cuNonCartesianKtSenseOperator.cu b/toolboxes/mri/pmri/gpu/cuNonCartesianKtSenseOperator.cu
new file mode 100644
index 0000000..72a98b9
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuNonCartesianKtSenseOperator.cu
@@ -0,0 +1,41 @@
+#include "cuNonCartesianKtSenseOperator.h"
+#include "cuNDFFT.h"
+
+using namespace Gadgetron;
+
+template<class REAL, unsigned int D> void
+cuNonCartesianKtSenseOperator<REAL,D>::mult_M( cuNDArray< complext<REAL> >* in, cuNDArray< complext<REAL> >* out, bool accumulate )
+{
+  if( accumulate ){
+    throw std::runtime_error( "cuNonCartesianKtSenseOperator::mult_M: accumulation not supported");
+  }
+  
+  // Make a copy of the input array since the FFT is performed in-place and we do not want to alter the input
+  cuNDArray< complext<REAL> > tmp(*in); 
+  cuNDFFT<REAL>::instance()->fft( &tmp, D );
+  
+  cuNonCartesianSenseOperator<REAL,D>::mult_M( &tmp, out, accumulate );
+}
+
+template<class REAL, unsigned int D> void
+cuNonCartesianKtSenseOperator<REAL,D>::mult_MH( cuNDArray< complext<REAL> >* in, cuNDArray< complext<REAL> >* out, bool accumulate )
+{  
+  if( accumulate ){
+    throw std::runtime_error( "cuNonCartesianKtSenseOperator::mult_MH: accumulation not supported");
+  }
+
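+  // Adjoint of mult_M: apply the non-Cartesian SENSE adjoint, then an inverse FFT along dimension D (time).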
+  cuNonCartesianSenseOperator<REAL,D>::mult_MH( in, out, accumulate );
+  cuNDFFT<REAL>::instance()->ifft( out, D );
+}
+
+//
+// Instantiations
+//
+
+template class EXPORTGPUPMRI cuNonCartesianKtSenseOperator<float,2>;
+template class EXPORTGPUPMRI cuNonCartesianKtSenseOperator<float,3>;
+template class EXPORTGPUPMRI cuNonCartesianKtSenseOperator<float,4>;
+
+template class EXPORTGPUPMRI cuNonCartesianKtSenseOperator<double,2>;
+template class EXPORTGPUPMRI cuNonCartesianKtSenseOperator<double,3>;
+template class EXPORTGPUPMRI cuNonCartesianKtSenseOperator<double,4>;
diff --git a/toolboxes/mri/pmri/gpu/cuNonCartesianKtSenseOperator.h b/toolboxes/mri/pmri/gpu/cuNonCartesianKtSenseOperator.h
new file mode 100644
index 0000000..e1085d4
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuNonCartesianKtSenseOperator.h
@@ -0,0 +1,27 @@
+/** \file cuNonCartesianKtSenseOperator.h
+    \brief Non-Cartesian kt-Sense operator, GPU based.
+*/
+
+#pragma once
+
+#include "cuNonCartesianSenseOperator.h"
+
+namespace Gadgetron{
+
+  template<class REAL, unsigned int D>
+  class EXPORTGPUPMRI cuNonCartesianKtSenseOperator : public cuNonCartesianSenseOperator<REAL,D>
+  {
+    
+  public:
+    
+    typedef typename uint64d<D>::Type _uint64d;
+    typedef typename reald<REAL,D>::Type _reald;
+    
+    cuNonCartesianKtSenseOperator() : cuNonCartesianSenseOperator<REAL,D>() {}
+    virtual ~cuNonCartesianKtSenseOperator() {}
+    
+    virtual void mult_M( cuNDArray< complext<REAL> >* in, cuNDArray< complext<REAL> >* out, bool accumulate = false );
+    virtual void mult_MH( cuNDArray< complext<REAL> >* in, cuNDArray< complext<REAL> >* out, bool accumulate = false );
+    
+  };
+}
diff --git a/toolboxes/mri/pmri/gpu/cuNonCartesianSenseOperator.cu b/toolboxes/mri/pmri/gpu/cuNonCartesianSenseOperator.cu
new file mode 100644
index 0000000..ecb4367
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuNonCartesianSenseOperator.cu
@@ -0,0 +1,102 @@
+#include "cuNonCartesianSenseOperator.h"
+#include "vector_td_utilities.h"
+
+using namespace Gadgetron;
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+cuNonCartesianSenseOperator<REAL,D,ATOMICS>::mult_M( cuNDArray< complext<REAL> >* in, cuNDArray< complext<REAL> >* out, bool accumulate )
+{
+  if( !in || !out ){
+    throw std::runtime_error("cuNonCartesianSenseOperator::mult_M : 0x0 input/output not accepted");
+  }
+  if ( !in->dimensions_equal(&this->domain_dims_) || !out->dimensions_equal(&this->codomain_dims_)){
+    throw std::runtime_error("cuNonCartesianSenseOperator::mult_M: input/output arrays do not match specified domain/codomains");
+  }
+
+  std::vector<size_t> full_dimensions = *this->get_domain_dimensions();
+  full_dimensions.push_back(this->ncoils_);
+  cuNDArray< complext<REAL> > tmp(&full_dimensions);  
+  this->mult_csm( in, &tmp );
+  
+  // Forwards NFFT
+
+  if( accumulate ){
+    cuNDArray< complext<REAL> > tmp_out(out->get_dimensions());
+    plan_->compute( &tmp, &tmp_out, dcw_.get(), cuNFFT_plan<REAL,D,ATOMICS>::NFFT_FORWARDS_C2NC );
+    *out += tmp_out;
+  }
+  else
+    plan_->compute( &tmp, out, dcw_.get(), cuNFFT_plan<REAL,D,ATOMICS>::NFFT_FORWARDS_C2NC );
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+cuNonCartesianSenseOperator<REAL,D,ATOMICS>::mult_MH( cuNDArray< complext<REAL> >* in, cuNDArray< complext<REAL> >* out, bool accumulate )
+{
+  if( !in || !out ){
+    throw std::runtime_error("cuNonCartesianSenseOperator::mult_MH : 0x0 input/output not accepted");
+  }
+
+  if ( !in->dimensions_equal(&this->codomain_dims_) || !out->dimensions_equal(&this->domain_dims_)){
+	  throw std::runtime_error("cuNonCartesianSenseOperator::mult_MH: input/output arrays do not match specified domain/codomains");
+  }
+  std::vector<size_t> tmp_dimensions = *this->get_domain_dimensions();
+  tmp_dimensions.push_back(this->ncoils_);
+  cuNDArray< complext<REAL> > tmp(&tmp_dimensions);
+
+ // Do the NFFT
+  plan_->compute( in, &tmp, dcw_.get(), cuNFFT_plan<REAL,D,ATOMICS>::NFFT_BACKWARDS_NC2C );
+
+  if( !accumulate ){
+    clear(out);    
+  }
+  
+  this->mult_csm_conj_sum( &tmp, out );
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+cuNonCartesianSenseOperator<REAL,D,ATOMICS>::setup( _uint64d matrix_size, _uint64d matrix_size_os, REAL W )
+{  
+  plan_->setup( matrix_size, matrix_size_os, W );
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+cuNonCartesianSenseOperator<REAL,D,ATOMICS>::preprocess( cuNDArray<_reald> *trajectory ) 
+{
+  if( trajectory == 0x0 ){
+    throw std::runtime_error( "cuNonCartesianSenseOperator: cannot preprocess 0x0 trajectory.");
+  }
+  
+  boost::shared_ptr< std::vector<size_t> > domain_dims = this->get_domain_dimensions();
+  if( domain_dims.get() == 0x0 || domain_dims->size() == 0 ){
+    throw std::runtime_error("cuNonCartesianSenseOperator::preprocess : operator domain dimensions not set");
+  }
+  plan_->preprocess( trajectory, cuNFFT_plan<REAL,D,ATOMICS>::NFFT_PREP_ALL );
+  is_preprocessed_ = true;
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+cuNonCartesianSenseOperator<REAL,D,ATOMICS>::set_dcw( boost::shared_ptr< cuNDArray<REAL> > dcw ) 
+{
+  dcw_ = dcw;  
+}
+
+//
+// Instantiations
+//
+
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<float,1,true>;
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<float,1,false>;
+
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<float,2,true>;
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<float,2,false>;
+
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<float,3,true>;
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<float,3,false>;
+
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<float,4,true>;
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<float,4,false>;
+
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<double,1,false>;
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<double,2,false>;
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<double,3,false>;
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<double,4,false>;
diff --git a/toolboxes/mri/pmri/gpu/cuNonCartesianSenseOperator.h b/toolboxes/mri/pmri/gpu/cuNonCartesianSenseOperator.h
new file mode 100644
index 0000000..fb56333
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuNonCartesianSenseOperator.h
@@ -0,0 +1,48 @@
+/** \file cuNonCartesianSenseOperator.h
+    \brief Non-Cartesian Sense operator, GPU based.
+*/
+
+#pragma once
+
+#include "cuSenseOperator.h"
+#include "cuNFFT.h"
+
+namespace Gadgetron{
+
+  template<class REAL, unsigned int D, bool ATOMICS = false> class EXPORTGPUPMRI cuNonCartesianSenseOperator : public cuSenseOperator<REAL,D>
+  {
+  
+  public:
+  
+    typedef typename uint64d<D>::Type _uint64d;
+    typedef typename reald<REAL,D>::Type _reald;
+
+    cuNonCartesianSenseOperator() : cuSenseOperator<REAL,D>() { 
+      plan_ = boost::shared_ptr< cuNFFT_plan<REAL, D, ATOMICS> >( new cuNFFT_plan<REAL, D, ATOMICS>() );
+      is_preprocessed_ = false;
+    }
+    
+    virtual ~cuNonCartesianSenseOperator() {}
+    
+    inline boost::shared_ptr< cuNFFT_plan<REAL, D, ATOMICS> > get_plan() { return plan_; }
+    inline boost::shared_ptr< cuNDArray<REAL> > get_dcw() { return dcw_; }
+    inline bool is_preprocessed() { return is_preprocessed_; } 
+
+    virtual void mult_M( cuNDArray< complext<REAL> >* in, cuNDArray< complext<REAL> >* out, bool accumulate = false );
+    virtual void mult_MH( cuNDArray< complext<REAL> >* in, cuNDArray< complext<REAL> >* out, bool accumulate = false );
+
+    virtual void setup( _uint64d matrix_size, _uint64d matrix_size_os, REAL W );
+    virtual void preprocess( cuNDArray<_reald> *trajectory );
+    virtual void set_dcw( boost::shared_ptr< cuNDArray<REAL> > dcw );
+
+
+  
+  protected:
+    boost::shared_ptr< cuNFFT_plan<REAL, D, ATOMICS> > plan_;
+    boost::shared_ptr< cuNDArray<REAL> > dcw_;
+    bool is_preprocessed_;
+  };
+  
+  //Atomics can't be used with doubles
+  template<unsigned int D> class EXPORTGPUPMRI cuNonCartesianSenseOperator<double,D,true>{};
+}
diff --git a/toolboxes/mri/pmri/gpu/cuSenseBuffer.cpp b/toolboxes/mri/pmri/gpu/cuSenseBuffer.cpp
new file mode 100644
index 0000000..2b085d6
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuSenseBuffer.cpp
@@ -0,0 +1,58 @@
+#include "cuSenseBuffer.h"
+
+namespace Gadgetron {
+
+  template<class REAL, unsigned int D, bool ATOMICS>
+  void cuSenseBuffer<REAL,D,ATOMICS>
+  ::setup( _uint64d matrix_size, _uint64d matrix_size_os, REAL W, 
+           unsigned int num_coils, unsigned int num_cycles, unsigned int num_sub_cycles )
+  {      
+    cuBuffer<REAL,D,ATOMICS>::setup(matrix_size, matrix_size_os, W, num_coils, num_cycles, num_sub_cycles );
+    
+    if( E_.get() == 0x0 ){   
+      std::vector<size_t> dims = to_std_vector(this->matrix_size_);    
+      E_ = boost::shared_ptr< cuNonCartesianSenseOperator<REAL,D,ATOMICS> >(new cuNonCartesianSenseOperator<REAL,D,ATOMICS>);      
+      E_->set_domain_dimensions(&dims);
+      E_->setup( this->matrix_size_, this->matrix_size_os_, W );
+    }    
+  }
+  
+  template<class REAL, unsigned int D, bool ATOMICS>
+  boost::shared_ptr< cuNDArray<complext<REAL> > > cuSenseBuffer<REAL,D,ATOMICS>::get_combined_coil_image()
+  {
+    if( this->csm_.get() == 0x0 ){
+      throw std::runtime_error("cuSenseBuffer::get_combined_coil_image: csm not set");
+    }
+    
+    if( this->acc_image_.get() == 0x0 ){
+      if( this->get_accumulated_coil_images().get() == 0x0 ){ // This updates acc_image_
+        throw std::runtime_error("cuSenseBuffer::get_combined_coil_image: unable to acquire accumulated coil images");
+      }
+    }
+    
+    std::vector<size_t> dims = to_std_vector(this->matrix_size_);
+    boost::shared_ptr< cuNDArray<_complext> > image( new cuNDArray<_complext>(&dims) );
+
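+    // Coil-combine the accumulated images as the sum over coils of conj(csm) times each coil image.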
+    E_->set_csm(this->csm_);
+    E_->mult_csm_conj_sum( this->acc_image_.get(), image.get() );
+
+    return image;
+  }
+  
+  //
+  // Instantiations
+  //
+
+  template class EXPORTGPUPMRI cuSenseBuffer<float,2,true>;
+  template class EXPORTGPUPMRI cuSenseBuffer<float,2,false>;
+
+  template class EXPORTGPUPMRI cuSenseBuffer<float,3,true>;
+  template class EXPORTGPUPMRI cuSenseBuffer<float,3,false>;
+
+  template class EXPORTGPUPMRI cuSenseBuffer<float,4,true>;
+  template class EXPORTGPUPMRI cuSenseBuffer<float,4,false>;
+
+  template class EXPORTGPUPMRI cuSenseBuffer<double,2,false>;
+  template class EXPORTGPUPMRI cuSenseBuffer<double,3,false>;
+  template class EXPORTGPUPMRI cuSenseBuffer<double,4,false>;
+}
diff --git a/toolboxes/mri/pmri/gpu/cuSenseBuffer.h b/toolboxes/mri/pmri/gpu/cuSenseBuffer.h
new file mode 100644
index 0000000..755e849
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuSenseBuffer.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include "cuBuffer.h"
+#include "cuNonCartesianSenseOperator.h"
+
+#include <stdio.h>
+
+namespace Gadgetron{
+
+  template<class REAL, unsigned int D, bool ATOMICS = false> 
+  class EXPORTGPUPMRI cuSenseBuffer : public cuBuffer<REAL,D,ATOMICS>
+  {
+  public:
+    
+    typedef typename cuBuffer<REAL,D,ATOMICS>::_complext _complext;
+    typedef typename cuBuffer<REAL,D,ATOMICS>::_uint64d  _uint64d;
+    typedef typename cuBuffer<REAL,D,ATOMICS>::_reald    _reald;
+
+    cuSenseBuffer() : cuBuffer<REAL,D,ATOMICS>() {}
+    virtual ~cuSenseBuffer() {}
+
+    virtual void setup( _uint64d matrix_size, _uint64d matrix_size_os, REAL W, 
+                        unsigned int num_coils, unsigned int num_cycles, unsigned int num_sub_cycles );
+
+    virtual void set_csm( boost::shared_ptr< cuNDArray<_complext> > csm ){
+      csm_ = csm;
+    }
+    
+    virtual boost::shared_ptr< cuNDArray< complext<REAL> > > get_combined_coil_image();
+
+  protected:
+    boost::shared_ptr< cuNDArray<_complext> > csm_;
+    boost::shared_ptr< cuNonCartesianSenseOperator<REAL,D,ATOMICS> > E_;    
+  };
+  
+  // To prevent the use of atomics with doubles.
+  template<unsigned int D> class EXPORTGPUPMRI cuSenseBuffer<double,D,true>{};  
+}
diff --git a/toolboxes/mri/pmri/gpu/cuSenseBufferCg.cpp b/toolboxes/mri/pmri/gpu/cuSenseBufferCg.cpp
new file mode 100644
index 0000000..46688c8
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuSenseBufferCg.cpp
@@ -0,0 +1,89 @@
+#include "cuSenseBufferCg.h"
+#include "vector_td_utilities.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "cuNDArray_elemwise.h"
+
+namespace Gadgetron{
+
+  template<class REAL, unsigned int D, bool ATOMICS>
+  void cuSenseBufferCg<REAL,D,ATOMICS>::
+  setup( _uint64d matrix_size, _uint64d matrix_size_os, REAL W, 
+	 unsigned int num_coils, unsigned int num_cycles, unsigned int num_sub_cycles )
+  {      
+    cuSenseBuffer<REAL,D,ATOMICS>::setup( matrix_size, matrix_size_os, W, num_coils, num_cycles, num_sub_cycles );
+    
+    D_ = boost::shared_ptr< cuCgPreconditioner<_complext> >( new cuCgPreconditioner<_complext>() );
+    
+    cg_.set_encoding_operator( this->E_ );
+    cg_.set_preconditioner( D_ );    
+    cg_.set_max_iterations( 2 );
+    cg_.set_tc_tolerance( 1e-6 );
+    cg_.set_output_mode( cuCgSolver<_complext>::OUTPUT_SILENT);    
+  }
+  
+  template<class REAL, unsigned int D, bool ATOMICS>
+  void cuSenseBufferCg<REAL,D,ATOMICS>::preprocess( cuNDArray<_reald> *traj ) {
+    this->E_->preprocess(traj);
+    std::vector<size_t> dims = *traj->get_dimensions();
+    dims.push_back(this->num_coils_);
+    this->E_->set_codomain_dimensions(&dims);
+  }
+
+  template<class REAL, unsigned int D, bool ATOMICS>
+  boost::shared_ptr< cuNDArray<complext<REAL> > > cuSenseBufferCg<REAL,D,ATOMICS>::get_combined_coil_image()
+  {
+    // Some validity checks
+    //
+
+    if( this->csm_.get() == 0x0 ){
+      throw std::runtime_error("cuSenseBufferCg::get_combined_coil_image: csm not set");
+    }
+
+    if( !this->E_->is_preprocessed() ){
+      throw std::runtime_error("cuSenseBufferCg::get_combined_coil_image: preprocessing not performed");
+    }
+    
+    // Compute (and scale) rhs
+    //
+
+    boost::shared_ptr< cuNDArray<_complext> > rhs = cuSenseBuffer<REAL,D,ATOMICS>::get_combined_coil_image();
+
+    if( rhs.get() == 0x0 ){
+      throw std::runtime_error("cuSenseBufferCg::get_combined_coil_image: failed to compute rhs");
+    }
+    
+    *rhs *= this->get_normalization_factor();
+
+    // Define preconditioning weights
+    //
+
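+    // Diagonal preconditioner weights: 1/sqrt(sum over coils of |csm|^2), an intensity-normalizing choice.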
+    boost::shared_ptr< cuNDArray<REAL> > _precon_weights = sum(abs_square(this->csm_.get()).get(), D);
+    reciprocal_sqrt_inplace(_precon_weights.get());	
+    boost::shared_ptr< cuNDArray<_complext> > precon_weights = real_to_complex<_complext>( _precon_weights.get() );
+    _precon_weights.reset();
+    D_->set_weights( precon_weights ); 
+
+    // Solve
+    //
+
+    return cg_.solve_from_rhs(rhs.get());
+  }
+  
+  //
+  // Instantiations
+  //
+
+  template class EXPORTGPUPMRI cuSenseBufferCg<float,2,true>;
+  template class EXPORTGPUPMRI cuSenseBufferCg<float,2,false>;
+
+  template class EXPORTGPUPMRI cuSenseBufferCg<float,3,true>;
+  template class EXPORTGPUPMRI cuSenseBufferCg<float,3,false>;
+
+  template class EXPORTGPUPMRI cuSenseBufferCg<float,4,true>;
+  template class EXPORTGPUPMRI cuSenseBufferCg<float,4,false>;
+
+  template class EXPORTGPUPMRI cuSenseBufferCg<double,2,false>;
+  template class EXPORTGPUPMRI cuSenseBufferCg<double,3,false>;
+  template class EXPORTGPUPMRI cuSenseBufferCg<double,4,false>;
+}
diff --git a/toolboxes/mri/pmri/gpu/cuSenseBufferCg.h b/toolboxes/mri/pmri/gpu/cuSenseBufferCg.h
new file mode 100644
index 0000000..c24385f
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuSenseBufferCg.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include "cuSenseBuffer.h"
+#include "cuCgSolver.h"
+#include "cuCgPreconditioner.h"
+
+namespace Gadgetron{
+  
+  template<class REAL, unsigned int D, bool ATOMICS = false> 
+  class EXPORTGPUPMRI cuSenseBufferCg : public cuSenseBuffer<REAL,D,ATOMICS> 
+  {
+  public:
+    
+    typedef complext<REAL> _complext;
+    typedef typename uint64d<D>::Type _uint64d;
+    typedef typename reald<REAL,D>::Type _reald;
+
+    cuSenseBufferCg() : cuSenseBuffer<REAL,D,ATOMICS>() {}
+    virtual ~cuSenseBufferCg() {}
+
+    inline void set_dcw_for_rhs( boost::shared_ptr< cuNDArray<REAL> > dcw ){
+      this->E_->set_dcw(sqrt(dcw.get()));
+    }
+    
+    virtual void preprocess( cuNDArray<_reald> *traj );
+
+    virtual void setup( _uint64d matrix_size, _uint64d matrix_size_os, REAL W, 
+			unsigned int num_coils, unsigned int num_cycles, unsigned int num_sub_cycles );
+    
+    virtual boost::shared_ptr< cuNDArray<_complext> > get_combined_coil_image();
+    
+  protected:    
+    cuCgSolver<_complext> cg_;
+    boost::shared_ptr< cuCgPreconditioner<_complext> > D_;
+  };
+  
+  // To prevent the use of atomics with doubles.
+  template<unsigned int D> class EXPORTGPUPMRI cuSenseBufferCg<double,D,true>{};
+}
diff --git a/toolboxes/mri/pmri/gpu/cuSenseOperator.cu b/toolboxes/mri/pmri/gpu/cuSenseOperator.cu
new file mode 100644
index 0000000..27a989d
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuSenseOperator.cu
@@ -0,0 +1,32 @@
+#include "cuSenseOperator.h"
+#include "sense_utilities.h"
+#include "vector_td_utilities.h"
+
+namespace Gadgetron{
+
+  template<class REAL, unsigned int D> void
+  cuSenseOperator<REAL,D>::mult_csm( cuNDArray<complext<REAL> >* in, cuNDArray<complext<REAL> >* out )
+  {  
+    csm_mult_M<REAL,D>( in, out, this->csm_.get() );
+  }
+  
+  template<class REAL, unsigned int D> void
+  cuSenseOperator<REAL,D>::mult_csm_conj_sum( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out )
+  {
+    csm_mult_MH<REAL,D>( in, out, this->csm_.get() );
+  }
+  
+  //
+  // Instantiations
+  //
+  
+  template class EXPORTGPUPMRI cuSenseOperator<float,1>;
+  template class EXPORTGPUPMRI cuSenseOperator<float,2>;
+  template class EXPORTGPUPMRI cuSenseOperator<float,3>;
+  template class EXPORTGPUPMRI cuSenseOperator<float,4>;
+
+  template class EXPORTGPUPMRI cuSenseOperator<double,1>;
+  template class EXPORTGPUPMRI cuSenseOperator<double,2>;
+  template class EXPORTGPUPMRI cuSenseOperator<double,3>;
+  template class EXPORTGPUPMRI cuSenseOperator<double,4>;
+}
diff --git a/toolboxes/mri/pmri/gpu/cuSenseOperator.h b/toolboxes/mri/pmri/gpu/cuSenseOperator.h
new file mode 100644
index 0000000..34b13b5
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuSenseOperator.h
@@ -0,0 +1,31 @@
+/** \file cuSenseOperator.h
+    \brief Base class for the GPU based Sense operators
+*/
+
+#pragma once
+
+#include "senseOperator.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "vector_td.h"
+#include "complext.h"
+#include "gpupmri_export.h"
+
+namespace Gadgetron{
+
+  template<class REAL, unsigned int D> class EXPORTGPUPMRI cuSenseOperator : public senseOperator< cuNDArray< complext<REAL> >, D >
+  {
+    
+  public:
+    
+    cuSenseOperator() : senseOperator<cuNDArray< complext<REAL> >,D >() {}
+    virtual ~cuSenseOperator() {}
+        
+    virtual void mult_M( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate = false ) = 0;
+    virtual void mult_MH( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate = false ) = 0;
+    
+    virtual void mult_csm( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out );
+    virtual void mult_csm_conj_sum( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out );    
+  };
+}
diff --git a/toolboxes/mri/pmri/gpu/cuSpiritBuffer.cpp b/toolboxes/mri/pmri/gpu/cuSpiritBuffer.cpp
new file mode 100644
index 0000000..12bd026
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuSpiritBuffer.cpp
@@ -0,0 +1,89 @@
+#include "cuSpiritBuffer.h"
+#include "cuCgSolver.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_reductions.h"
+#include "hoNDArray_fileio.h"
+
+namespace Gadgetron {
+
+  template<class REAL, unsigned int D, bool ATOMICS>
+  void cuSpiritBuffer<REAL,D,ATOMICS>::
+  setup( _uint64d matrix_size, _uint64d matrix_size_os, REAL W, 
+	 unsigned int num_coils, unsigned int num_cycles, unsigned int num_sub_cycles )
+  {      
+    cuBuffer<REAL,D,ATOMICS>::setup( matrix_size, matrix_size_os, W, num_coils, num_cycles, num_sub_cycles );
+    
+    E_->setup( this->matrix_size_, this->matrix_size_os_, W );
+
+    cg_.set_encoding_operator( this->E_ );
+    cg_.set_max_iterations( 5 );
+    cg_.set_tc_tolerance( 1e-8 );
+    cg_.set_output_mode( cuCgSolver<_complext>::OUTPUT_VERBOSE);
+  }
+
+  template<class REAL, unsigned int D, bool ATOMICS>
+  void cuSpiritBuffer<REAL,D,ATOMICS>::preprocess( cuNDArray<_reald> *traj ) {
+    E_->preprocess(traj);
+    std::vector<size_t> dims = *traj->get_dimensions();
+    dims.push_back(this->num_coils_);
+    E_->set_codomain_dimensions(&dims);
+  }
+
+  template<class REAL, unsigned int D, bool ATOMICS>
+  boost::shared_ptr< cuNDArray<complext<REAL> > > cuSpiritBuffer<REAL,D,ATOMICS>::get_accumulated_coil_images()
+  {
+    // Apply adjoint operator to get the rhs
+    //
+
+    boost::shared_ptr< cuNDArray<_complext> > rhs = cuBuffer<REAL,D,ATOMICS>::get_accumulated_coil_images();
+
+    // Invert by cg solver
+    //
+
+    *rhs *= this->get_normalization_factor();
+    this->acc_image_ = cg_.solve_from_rhs(rhs.get());
+
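+    // Note: debug output - the magnitude of the solved coil images is written to disk on every call.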
+    static int counter = 0;
+    char filename[256];
+    sprintf((char*)filename, "_coil_images_%d.real", counter);
+    write_nd_array<REAL>( abs(this->acc_image_.get())->to_host().get(), filename );
+    counter++;
+
+    return this->acc_image_;
+  }
+
+  template<class REAL, unsigned int D, bool ATOMICS>
+  boost::shared_ptr< cuNDArray<complext<REAL> > > cuSpiritBuffer<REAL,D,ATOMICS>::get_combined_coil_image()
+  {
+    // Get the individual coil images
+    //
+
+    if( this->acc_image_.get() == 0x0 ){
+      if( this->get_accumulated_coil_images().get() == 0x0 ){ // This updates acc_image_
+        throw std::runtime_error("cuSpiritBuffer::get_combined_coil_image: unable to acquire accumulated coil images");
+      }
+    }
+    
+    // Compute RSS
+    //
+
+    return real_to_complex< complext<REAL> >(sqrt(sum(abs_square(this->acc_image_.get()).get(), 2).get()).get());
+  }
+  
+  //
+  // Instantiations
+  //
+
+  template class EXPORTGPUPMRI cuSpiritBuffer<float,2,true>;
+  template class EXPORTGPUPMRI cuSpiritBuffer<float,2,false>;
+
+  template class EXPORTGPUPMRI cuSpiritBuffer<float,3,true>;
+  template class EXPORTGPUPMRI cuSpiritBuffer<float,3,false>;
+
+  template class EXPORTGPUPMRI cuSpiritBuffer<float,4,true>;
+  template class EXPORTGPUPMRI cuSpiritBuffer<float,4,false>;
+
+  template class EXPORTGPUPMRI cuSpiritBuffer<double,2,false>;
+  template class EXPORTGPUPMRI cuSpiritBuffer<double,3,false>;
+  template class EXPORTGPUPMRI cuSpiritBuffer<double,4,false>;
+}
diff --git a/toolboxes/mri/pmri/gpu/cuSpiritBuffer.h b/toolboxes/mri/pmri/gpu/cuSpiritBuffer.h
new file mode 100644
index 0000000..a333691
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuSpiritBuffer.h
@@ -0,0 +1,43 @@
+#pragma once
+
+#include "cuBuffer.h"
+#include "cuCgSolver.h"
+#include "cuNFFTOperator.h"
+
+namespace Gadgetron{
+
+  template<class REAL, unsigned int D, bool ATOMICS = false> 
+  class EXPORTGPUPMRI cuSpiritBuffer : public cuBuffer<REAL,D,ATOMICS>
+  {
+  public:
+    
+    typedef typename cuBuffer<REAL,D,ATOMICS>::_complext _complext;
+    typedef typename cuBuffer<REAL,D,ATOMICS>::_uint64d  _uint64d;
+    typedef typename cuBuffer<REAL,D,ATOMICS>::_reald    _reald;
+
+    cuSpiritBuffer() : cuBuffer<REAL,D,ATOMICS>() {
+      E_ = boost::shared_ptr< cuNFFTOperator<REAL,D> >(new cuNFFTOperator<REAL,D>() );
+    }
+    
+    virtual ~cuSpiritBuffer() {}
+    
+    inline void set_dcw_for_rhs( boost::shared_ptr< cuNDArray<REAL> > dcw ){
+      this->E_->set_dcw(dcw);
+    }
+
+    virtual void setup( _uint64d matrix_size, _uint64d matrix_size_os, REAL W, 
+                        unsigned int num_coils, unsigned int num_cycles, unsigned int num_sub_cycles );
+    
+    virtual void preprocess( cuNDArray<_reald> *traj );
+
+    virtual boost::shared_ptr< cuNDArray< complext<REAL> > > get_accumulated_coil_images();
+    virtual boost::shared_ptr< cuNDArray< complext<REAL> > > get_combined_coil_image();
+    
+  protected:
+    cuCgSolver<_complext> cg_;
+    boost::shared_ptr< cuNFFTOperator<REAL,D> > E_;
+  };
+  
+  // To prevent the use of atomics with doubles.
+  template<unsigned int D> class EXPORTGPUPMRI cuSpiritBuffer<double,D,true>{};  
+}
diff --git a/toolboxes/mri/pmri/gpu/cuSpiritOperator.h b/toolboxes/mri/pmri/gpu/cuSpiritOperator.h
new file mode 100644
index 0000000..a1c638f
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuSpiritOperator.h
@@ -0,0 +1,141 @@
+/** \file cuSpiritOperator.h
+    \brief Spirit regularization operator.
+
+    The operator domain and codomain sizes are the image size times the number of coils. 
+ */
+
+#pragma once
+
+#include "cuDiagonalSumOperator.h"
+#include <numeric>
+
+namespace Gadgetron {
+
+template<class REAL> class cuSpirit2DOperator : public linearOperator< cuNDArray< complext<REAL> > >
+{
+public:
+
+	cuSpirit2DOperator() : linearOperator< cuNDArray< complext<REAL> > >() {
+		D_ = boost::shared_ptr< cuDiagonalSumOperator< complext<REAL> > >(new cuDiagonalSumOperator< complext<REAL> >());
+	}
+
+	virtual ~cuSpirit2DOperator() {}
+
+	virtual void set_calibration_kernels( boost::shared_ptr< cuNDArray< complext<REAL> > > kernels )
+	{
+		if( kernels->get_number_of_dimensions() !=3 ){
+			throw std::runtime_error("cuSpirit2DOperator::set_calibration_kernels: kernels array must be three-dimensional (x, y, squared number of coils)");
+		}
+		kernels_ = kernels;
+	}
+
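+	// mult_M applies the image-space SPIRiT consistency operator: the coil images are mixed through
+	// the calibration kernels via the diagonal-sum operator D_, and the identity is subtracted at
+	// the end, i.e. out = G*in - in.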
+	virtual void mult_M( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate = false )
+	{
+		if( !kernels_.get() ){
+			throw std::runtime_error("cuSpirit2DOperator::mult_M failed: calibration kernels not set");
+		}
+
+		if( in->get_number_of_dimensions() < 3 || out->get_number_of_dimensions() < 3 ){
+			throw std::runtime_error("cuSpirit2DOperator::mult_M failed: expected at least 3 dimensions in input and output images");
+		}
+
+		const unsigned int num_coils_squared = kernels_->get_size(2);
+		const unsigned int num_phases_in = in->get_size(in->get_number_of_dimensions()-1);
+		const unsigned int num_phases_out = out->get_size(out->get_number_of_dimensions()-1);
+		const unsigned int num_frames = in->get_number_of_dimensions() == 3 ? 1 : in->get_size(2); //Number of frames 3rd dimension
+
+		if( num_phases_out != num_phases_in ){
+			throw std::runtime_error("cuSpirit2DOperator::mult_M failed: array size mismatch between input/output images");
+		}
+
+		if( num_phases_in*num_phases_out != num_coils_squared ){
+			throw std::runtime_error("cuSpirit2DOperator::mult_M failed: the calibration kernels do not correspond to the squared number of coils");
+		}
+
+		std::vector<size_t> dim_image = {in->get_size(0),in->get_size(1)};
+		std::vector<size_t> dim_coils = dim_image;
+		dim_coils.push_back(num_phases_in);
+
+
+		size_t num_elements_image = std::accumulate(dim_image.begin(),dim_image.end(),1,std::multiplies<size_t>());
+		size_t num_elements_coils = num_elements_image*dim_coils.back();
+
+		// Iterate over the coils
+		//
+		for( unsigned int i=0; i<num_phases_out; i++ ){
+			boost::shared_ptr< cuNDArray< complext<REAL> > > tmp_kernel( new cuNDArray< complext<REAL> >(&dim_coils, kernels_->get_data_ptr()+i*num_elements_coils ));
+			D_->set_diagonal(tmp_kernel);
+			for (unsigned int k=0; k < num_frames; k++){
+				cuNDArray<complext<REAL>> tmp_in(&dim_image,in->get_data_ptr()+k*num_elements_image);
+				cuNDArray< complext<REAL> > tmp_out( &dim_image, out->get_data_ptr()+i*num_elements_image*num_frames+k*num_elements_image );
+				D_->mult_M( &tmp_in, &tmp_out, accumulate );
+			}
+		}
+
+		// Subtract identity
+		//
+
+		*out -= *in;
+	}
+
+	virtual void mult_MH( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate = false )
+	{
+		if( !kernels_.get() ){
+			throw std::runtime_error("cuSpirit2DOperator::mult_MH failed: calibration kernels not set");
+		}
+
+		if( in->get_number_of_dimensions() != 3 || out->get_number_of_dimensions() != 3 ){
+			throw std::runtime_error("cuSpirit2DOperator::mult_MH failed: expected exactly 3 dimensions in input and output images");
+		}
+		const unsigned int num_coils_squared = kernels_->get_size(2);
+		const unsigned int num_phases_in = in->get_size(in->get_number_of_dimensions()-1);
+		const unsigned int num_phases_out = out->get_size(out->get_number_of_dimensions()-1);
+		const unsigned int num_frames = in->get_number_of_dimensions() == 3 ? 1 : in->get_size(2); //Number of frames 3rd dimension
+
+		if( num_phases_out != num_phases_in ){
+			throw std::runtime_error("cuSpirit2DOperator::mult_MH failed: array size mismatch between input/output images");
+		}
+
+		if( num_phases_in*num_phases_out != num_coils_squared ){
+			throw std::runtime_error("cuSpirit2DOperator::mult_MH failed: the calibration kernels do not correspond to the squared number of coils");
+		}
+
+		std::vector<size_t> dim_image = {in->get_size(0),in->get_size(1)};
+		std::vector<size_t> dim_coils = dim_image;
+		dim_coils.push_back(num_phases_in);
+
+
+		size_t num_elements_image = std::accumulate(dim_image.begin(),dim_image.end(),1,std::multiplies<size_t>());
+		size_t num_elements_coils = num_elements_image*dim_coils.back();
+
+
+		// Iterate over the coils
+		//
+
+		for( unsigned int i=0; i<num_phases_in; i++ ){
+			boost::shared_ptr< cuNDArray< complext<REAL> > > tmp_kernel( new cuNDArray< complext<REAL> >(&dim_coils, kernels_->get_data_ptr()+i*num_elements_coils ));
+
+			D_->set_diagonal(tmp_kernel);
+			for (unsigned int k=0; k<num_frames; k++){
+				cuNDArray< complext<REAL> > tmp_in( &dim_image, in->get_data_ptr()+i*num_elements_image*num_frames+k*num_elements_image );
+				cuNDArray< complext<REAL> > tmp_out( &dim_image, out->get_data_ptr()+k*num_elements_image );
+				if( i==0 && !accumulate )
+					D_->mult_MH( &tmp_in, &tmp_out, false );
+				else
+					D_->mult_MH( &tmp_in, &tmp_out, true );
+			}
+		}
+
+		// Subtract identity
+		//
+
+		*out -= *in;
+	}
+
+
+
+protected:
+	boost::shared_ptr< cuNDArray< complext<REAL> > > kernels_;
+	boost::shared_ptr< cuDiagonalSumOperator< complext<REAL> > > D_;
+};
+}
diff --git a/toolboxes/mri/pmri/gpu/gpupmri_export.h b/toolboxes/mri/pmri/gpu/gpupmri_export.h
new file mode 100644
index 0000000..a66fa32
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/gpupmri_export.h
@@ -0,0 +1,19 @@
+/** \file gpupmri_export.h
+    \brief Required definitions for Windows, importing/exporting dll symbols 
+*/
+
+#ifndef GPUPMRI_EXPORT_H_
+#define GPUPMRI_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_GPUPMRI__) || defined (gpuparallelmri_EXPORTS)
+#define EXPORTGPUPMRI __declspec(dllexport)
+#else
+#define EXPORTGPUPMRI __declspec(dllimport)
+#endif
+#else
+#define EXPORTGPUPMRI
+#endif
+
+
+#endif /* GPUPMRI_EXPORT_H_ */
diff --git a/toolboxes/mri/pmri/gpu/htgrappa.cpp b/toolboxes/mri/pmri/gpu/htgrappa.cpp
new file mode 100644
index 0000000..032128e
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/htgrappa.cpp
@@ -0,0 +1,65 @@
+#include "htgrappa.h"
+#include "hoNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "hoNDArray_utils.h"
+#include "hoNDArray_linalg.h"
+
+#ifdef USE_OMP
+#include <omp.h>
+#endif
+
+
+/*
+  This file is used to hide certain Armadillo calls from the nvcc compiler. If Armadillo functions need to
+  be called in a *.cu file, it is preferable to wrap the calls in a function and place that function in
+  a *.cpp file so that Armadillo code will not be compiled by nvcc.
+
+  Some error handling may be needed in these functions, but eventually SymmetricHermitianPositiveDefiniteLinearSystem_posv
+  will be renamed and made to throw exceptions and then it should be handled.
+
+ */
+
+
+
+namespace Gadgetron
+{
+  template <class T> void ht_grappa_solve_spd_system(hoNDArray<T> *A, hoNDArray<T> *B) {
+    /*
+      We are switching off OpenMP threading before this call to posv. There seems to be a bad interaction between OpenMP, CUDA, and BLAS.
+      So far this problem has only been observed from *.cu files (or in functions called from *.cu files) but the problem may be more general.
+
+      This is a temporary fix that we should keep an eye on.
+     */
+
+    hoNDArray<T> A_ori;
+    A_ori = *A;
+
+#ifdef USE_OMP
+    int num_threads = omp_get_num_threads();
+    omp_set_num_threads(1);
+#endif //USE_OMP
+
+    try
+    {
+        posv(*A, *B);
+    }
+    catch(...)
+    {
+        // It has been observed that for very high signal levels posv can throw due to an ill-conditioned matrix A.
+        // hesv does not require A to be positive-definite, only an n-by-n symmetric matrix.
+
+        GERROR_STREAM("ht_grappa_solve_spd_system : posv(*A, *B) throws exceptions ... ");
+        *A = A_ori;
+        hesv(*A, *B);
+        GERROR_STREAM("ht_grappa_solve_spd_system : hesv(*A, *B) is called ");
+    }
+
+#ifdef USE_OMP
+    omp_set_num_threads(num_threads);
+#endif //USE_OMP
+
+  }
+
+  template void ht_grappa_solve_spd_system< float_complext >(hoNDArray< float_complext > *A, hoNDArray< float_complext > *B);
+
+}
diff --git a/toolboxes/mri/pmri/gpu/htgrappa.cu b/toolboxes/mri/pmri/gpu/htgrappa.cu
new file mode 100644
index 0000000..5cfc72a
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/htgrappa.cu
@@ -0,0 +1,838 @@
+#include "htgrappa.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDFFT.h"
+#include "GadgetronTimer.h"
+#include "GPUTimer.h"
+#include "cuNDArray_elemwise.h"
+#include "CUBLASContextProvider.h"
+#include "hoNDArray_fileio.h"
+#include "hoNDArray_utils.h"
+
+#include <cublas_v2.h>
+//#include <cula_lapack_device.h>
+#include <iostream>
+
+namespace Gadgetron {
+
+  static int2 vec_to_int2(std::vector<unsigned int> vec)
+  {
+    int2 ret; ret.x = 0; ret.y = 0;
+    if (vec.size() < 2) {
+      GDEBUG_STREAM("vec_to_int2: dimensions of vector too small" << std::endl);
+      return ret;
+    }
+
+    ret.x = vec[0]; ret.y = vec[1];
+    return ret;
+  }
+
+  template <class T> static int write_cuNDArray_to_disk(cuNDArray<T>* a, const char* filename)
+  {
+    boost::shared_ptr< hoNDArray<T> > host = a->to_host();
+    write_nd_array<complext<float> >(host.get(), filename);
+    return 0;
+  }
+
+  template <class T> __global__ void form_grappa_system_matrix_kernel_2d(const T* __restrict__ ref_data,
+                                                                         int2 dims,
+                                                                         int source_coils,
+                                                                         int target_coils,
+                                                                         int2 ros,
+                                                                         int2 ros_offset,
+                                                                         int2 kernel_size,
+                                                                         int acceleration_factor,
+                                                                         int set_number,
+                                                                         T* __restrict__ out_matrix,
+                                                                         T* __restrict__ b)
+  {
+    long idx_in = blockIdx.x*blockDim.x+threadIdx.x;
+    int klocations = ros.x*ros.y;
+    int image_elements = dims.x*dims.y;
+    //int coefficients = kernel_size.x*kernel_size.y*coils;
+    if (idx_in < klocations) {
+      //unsigned int y = idx_in/ros.x;
+      //unsigned int x = idx_in - y*ros.x;
+      unsigned int x = idx_in/ros.y;
+      unsigned int y = idx_in - x*ros.y;
+      unsigned int idx_ref = 0;
+      unsigned int coeff_counter = 0;
+
+      int kernel_size_x = kernel_size.x;
+      int kernel_size_y = kernel_size.y;
+
+      for (int c = 0; c < source_coils; c++) {
+        for (int ky = -((kernel_size_y*acceleration_factor)>>1)+set_number+1;
+             ky < ((kernel_size_y*acceleration_factor+1)>>1); ky+=acceleration_factor) {
+          for (int kx = -(kernel_size_x>>1); kx < ((kernel_size_x+1)>>1); kx++) {
+            idx_ref = c*image_elements + x+kx+ros_offset.x + (y+ky+ros_offset.y)*dims.x;
+            //out_matrix[idx_in*coefficients+coeff_counter++] = ref_data[idx_ref];
+            out_matrix[idx_in+(coeff_counter++)*klocations] = ref_data[idx_ref];
+
+          }
+        }
+      }
+
+      //Loop over target coils here
+      for (unsigned int c = 0; c < target_coils; c++) {
+        //b[idx_in*coils + c] = ref_data[c*image_elements + y*dims.x+x];
+        b[idx_in + c*klocations] = ref_data[c*image_elements + (y+ros_offset.y)*dims.x+(x+ros_offset.x)];
+      }
+    }
+  }
+
+  //TODO: This should take source and target coils into consideration
+  template <class T> __global__ void copy_grappa_coefficients_to_kernel_2d(const T* __restrict__ coeffs,
+                                                                           T* __restrict__ kernel,
+                                                                           int source_coils,
+                                                                           int target_coils,
+                                                                           int2 kernel_size,
+                                                                           int acceleration_factor,
+                                                                           int set)
+  {
+    unsigned long idx_in = blockIdx.x*blockDim.x+threadIdx.x;
+
+    unsigned int coefficients_in_set = source_coils*kernel_size.x*kernel_size.y*target_coils;
+
+    if (idx_in < coefficients_in_set) {
+      int idx_in_tmp = idx_in;
+      int kx = idx_in%kernel_size.x;
+      idx_in = (idx_in-kx)/kernel_size.x;
+      int ky = idx_in%kernel_size.y;
+      idx_in = (idx_in-ky)/kernel_size.y;
+      int coil = idx_in%source_coils;
+      idx_in = (idx_in-coil)/source_coils;
+      int coilg = idx_in;
+
+      kernel[coilg*source_coils*(kernel_size.y*acceleration_factor)*kernel_size.x +
+             coil*(kernel_size.y*acceleration_factor)*kernel_size.x +
+             (ky*acceleration_factor + set + 1)*kernel_size.x + kx] = coeffs[idx_in_tmp];
+
+      if ((coil == coilg) && (kx == 0) && (ky == 0) && (set == 0)) {
+        kernel[coilg*source_coils*(kernel_size.y*acceleration_factor)*kernel_size.x +
+               coil*(kernel_size.y*acceleration_factor)*kernel_size.x +
+               ((kernel_size.y>>1)*acceleration_factor)*kernel_size.x + (kernel_size.x>>1) ].vec[0] = 1;
+
+      }
+    }
+  }
+
+  template <class T> __global__ void copy_grappa_kernel_to_kspace_2d(const T* __restrict__ kernel,
+                                                                     T* __restrict__ out,
+                                                                     int2 dims,
+                                                                     int2 kernel_size,
+                                                                     int coils)
+  {
+
+    unsigned long idx_in = blockIdx.x*blockDim.x+threadIdx.x;
+
+    if (idx_in < kernel_size.x*kernel_size.y*coils) {
+      int idx_in_tmp = idx_in;
+      int kx = idx_in%kernel_size.x;
+      idx_in = (idx_in-kx)/kernel_size.x;
+      int ky = idx_in%kernel_size.y;
+      idx_in = (idx_in-ky)/kernel_size.y;
+      int coil = idx_in;
+
+      int outx = -(kx- (kernel_size.x>>1)) + (dims.x>>1); //Flipping the kernel for conv
+      int outy = -(ky- (kernel_size.y>>1)) + (dims.y>>1);
+
+      out[coil*dims.x*dims.y + outy*dims.x + outx] = kernel[idx_in_tmp];
+    }
+  }
+
+  __global__ void scale_and_add_unmixing_coeffs(const complext<float> * __restrict__ unmixing,
+                                                const complext<float> * __restrict__ csm,
+                                                complext<float> * __restrict__ out,
+                                                int elements,
+                                                int coils,
+                                                float scale_factor)
+  {
+    unsigned long idx_in = blockIdx.x*blockDim.x+threadIdx.x;
+
+    complext<float>  tmp;
+    if (idx_in < elements) {
+      for (int c = 0; c < coils; c++) {
+        tmp = unmixing[c*elements + idx_in]*conj(csm[idx_in]);
+        out[c*elements + idx_in] += scale_factor*tmp;
+
+      }
+    }
+  }
+
+  __global__ void scale_and_copy_unmixing_coeffs(const complext<float> * __restrict__ unmixing,
+                                                 complext<float> * __restrict__ out,
+                                                 int elements,
+                                                 int coils,
+                                                 float scale_factor)
+  {
+    unsigned long idx_in = blockIdx.x*blockDim.x+threadIdx.x;
+
+    if (idx_in < elements) {
+      for (int c = 0; c < coils; c++) {
+        out[c*elements + idx_in] = scale_factor*unmixing[c*elements + idx_in];
+
+      }
+    }
+  }
+
+  __global__ void conj_csm_coeffs(const complext<float> * __restrict__ csm,
+                                  complext<float> * __restrict__ out,
+                                  int source_elements,
+                                  int target_elements)
+  {
+    //TODO: Here we need to have both src_elements and target_elements and we use conj(csm) for all target_elements and 0.0 when element > target_elements
+
+    unsigned long idx_in = blockIdx.x*blockDim.x+threadIdx.x;
+
+    if (idx_in < source_elements) {
+      if (idx_in >= target_elements) {
+        out[idx_in] = complext<float> (0.0,0.0);
+      } else {
+        out[idx_in] = conj(csm[idx_in]);
+      }
+    }
+  }
+
+  __global__ void single_channel_coeffs(complext<float> * out,
+                                        int channel_no,
+                                        int elements_per_channel)
+  {
+    unsigned long idx_in = blockIdx.x*blockDim.x+threadIdx.x;
+
+    if (idx_in < elements_per_channel) {
+      out[idx_in + channel_no*elements_per_channel] = complext<float>(1.0,0.0);
+    }
+  }
+
+
+  template <class T> int htgrappa_calculate_grappa_unmixing(cuNDArray<T>* ref_data,
+                                                            cuNDArray<T>* b1,
+                                                            unsigned int acceleration_factor,
+                                                            std::vector<unsigned int>* kernel_size,
+                                                            cuNDArray<T>* out_mixing_coeff,
+                                                            std::vector< std::pair<unsigned int, unsigned int> >* sampled_region,
+                                                            std::list< unsigned int >* uncombined_channels)
+  {
+
+    if (ref_data->get_number_of_dimensions() != b1->get_number_of_dimensions()) {
+      std::cerr << "htgrappa_calculate_grappa_unmixing: Dimensions mismatch" << std::endl;
+      return -1;
+    }
+
+    for (unsigned int i = 0; i < (ref_data->get_number_of_dimensions()-1); i++) {
+      if (ref_data->get_size(i) != b1->get_size(i)) {
+        std::cerr << "htgrappa_calculate_grappa_unmixing: Dimensions mismatch" << std::endl;
+        return -1;
+      }
+    }
+
+    unsigned int RO = ref_data->get_size(0);
+    unsigned int E1 = ref_data->get_size(1);
+    unsigned int source_coils = ref_data->get_size(ref_data->get_number_of_dimensions()-1);
+    unsigned int target_coils = b1->get_size(b1->get_number_of_dimensions()-1);
+    unsigned int elements_per_coil = b1->get_number_of_elements()/target_coils;
+
+    if (target_coils > source_coils) {
+      std::cerr << "target_coils > source_coils" << std::endl;
+      return -1;
+    }
+
+    if (acceleration_factor == 1) {
+      dim3 blockDim(512,1,1);
+      dim3 gridDim((unsigned int) std::ceil((1.0f*elements_per_coil*source_coils)/blockDim.x), 1, 1 );
+
+      conj_csm_coeffs<<< gridDim, blockDim >>>( b1->get_data_ptr(),
+                                                out_mixing_coeff->get_data_ptr(),
+                                                out_mixing_coeff->get_number_of_elements(),
+                                                b1->get_number_of_elements());
+
+      if (uncombined_channels) {
+        std::list<unsigned int>::iterator it;
+        gridDim = dim3((unsigned int) std::ceil((1.0f*(elements_per_coil))/blockDim.x), 1, 1 );
+        int uncombined_channel_no = 0;
+        for ( it = uncombined_channels->begin(); it != uncombined_channels->end(); it++ ) {
+          uncombined_channel_no++;
+          //TODO: Adjust pointers to reflect that the number of target/source coils may not be equal
+          single_channel_coeffs<<< gridDim, blockDim >>>( out_mixing_coeff->get_data_ptr() + uncombined_channel_no*source_coils*elements_per_coil,
+                                                          *it,
+                                                          (elements_per_coil));
+        }
+      }
+      return 0;
+    }
+
+    if (kernel_size->size() != (ref_data->get_number_of_dimensions()-1)) {
+      std::cerr << "htgrappa_calculate_grappa_unmixing: Kernel size does not match the data dimensions" << std::endl;
+      return -1;
+    }
+
+    if (ref_data->get_number_of_dimensions() > 3) {
+      std::cerr << "htgrappa_calculate_grappa_unmixing: Not yet implemented for 3D" << std::endl;
+      return -1;
+    }
+
+    //Calculate region of support + offsets
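+    // (the region of support is the calibration area shrunk by the kernel footprint, scaled by the
+    //  acceleration factor along the undersampled dimension, so every kernel placement stays inside the reference data)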
+    std::vector<size_t> rosTmp = *ref_data->get_dimensions();
+
+    std::vector<unsigned int> ros(rosTmp.size());
+    for ( unsigned int ii=0; ii<rosTmp.size(); ii++ ){
+      ros[ii] = rosTmp[ii];
+    }
+
+    ros.pop_back(); //Remove the number of coils
+    std::vector<unsigned int> ros_offset(ref_data->get_number_of_dimensions(),0);
+    unsigned long int kspace_locations = 1;
+
+    if (sampled_region) {
+      for (unsigned int i = 0; i < ros.size(); i++) {
+        if (i > 0) {
+          ros[i] = (*sampled_region)[i].second-(*sampled_region)[i].first-((*kernel_size)[i]*acceleration_factor);
+        } else {
+          ros[i] = (*sampled_region)[i].second-(*sampled_region)[i].first-(*kernel_size)[i];
+        }
+        ros_offset[i] = (*sampled_region)[i].first+(((*sampled_region)[i].second-(*sampled_region)[i].first-ros[i])>>1);
+        kspace_locations *= ros[i];
+      }
+    } else {
+      for (unsigned int i = 0; i < ros.size(); i++) {
+        if (i > 0) {
+          ros[i] -= ((*kernel_size)[i]*acceleration_factor);
+        } else {
+          ros[i] -= (*kernel_size)[i];
+        }
+        ros_offset[i] = (ref_data->get_size(i)-ros[i])>>1;
+        kspace_locations *= ros[i];
+      }
+    }
+
+    /*
+      for (unsigned int i = 0; i < ros.size(); i++) {
+      GDEBUG_STREAM("ROS[" << i << "] = " << ros[i] << " + " << ros_offset[i] << std::endl);
+      }
+    */
+
+    std::vector<size_t> sys_matrix_size;
+    sys_matrix_size.push_back(kspace_locations);
+    sys_matrix_size.push_back(source_coils*(*kernel_size)[0]*(*kernel_size)[1]);
+
+    std::vector<size_t> b_size;
+    b_size.push_back(kspace_locations);
+    b_size.push_back(target_coils);
+
+    cuNDArray<T> system_matrix = cuNDArray<T>(&sys_matrix_size);
+
+    clear(&system_matrix);
+
+    cuNDArray<T> b = cuNDArray<T>(&b_size);
+
+    boost::shared_ptr< std::vector<size_t> > dimTmp = ref_data->get_dimensions();
+    std::vector<unsigned int> dimInt(2, 0);
+    dimInt[0] = (*dimTmp)[0];
+    dimInt[1] = (*dimTmp)[1];
+
+    int2 dims = vec_to_int2(dimInt);
+    int2 dros = vec_to_int2(ros);
+    int2 dros_offset = vec_to_int2(ros_offset);
+    int2 dkernel_size = vec_to_int2(*kernel_size);
+
+    //TODO: Use source coils here
+    int n = source_coils*(*kernel_size)[0]*(*kernel_size)[1];
+    int m = kspace_locations;
+
+    std::vector<size_t> AHA_dims(2,n);
+    cuNDArray<T> AHA = cuNDArray<T>(&AHA_dims);
+    cuNDArray<T> AHA_set0 = cuNDArray<T>(&AHA_dims);
+
+    hoNDArray<T> AHA_host(n, n);
+    float2* pAHA = (float2*) AHA_host.get_data_ptr();
+
+    //TODO: Use target coils here
+    std::vector<size_t> AHrhs_dims;
+    AHrhs_dims.push_back(n);
+    AHrhs_dims.push_back(target_coils);
+
+    cuNDArray<T> AHrhs = cuNDArray<T>(&AHrhs_dims);
+
+    cublasHandle_t handle = *CUBLASContextProvider::instance()->getCublasHandle();
+
+    std::vector<size_t> gkernel_dims;
+    gkernel_dims.push_back((*kernel_size)[0]);
+    gkernel_dims.push_back((*kernel_size)[1]*acceleration_factor);
+    gkernel_dims.push_back(source_coils);
+    gkernel_dims.push_back(target_coils);
+    cuNDArray<T> gkernel = cuNDArray<T>(&gkernel_dims);
+    clear(&gkernel);
+
+    //GadgetronTimer timer;
+
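+    // One set of weights is estimated per missing line offset within an acceleration block,
+    // i.e. acceleration_factor-1 sets in total.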
+    for (unsigned int set = 0; set < acceleration_factor-1; set++)
+      {
+        //GDEBUG_STREAM("Calculating coefficients for set " << set << std::endl);
+
+        //GDEBUG_STREAM("dros.x = " << dros.x << ", dros.y = " << dros.y << std::endl);
+
+        std::ostringstream ostr;
+        ostr << "Set_" << set << "_";
+        std::string appendix = ostr.str();
+
+        dim3 blockDim(512,1,1);
+        dim3 gridDim((unsigned int) std::ceil((1.0f*kspace_locations)/blockDim.x), 1, 1 );
+
+        form_grappa_system_matrix_kernel_2d<<< gridDim, blockDim >>>( ref_data->get_data_ptr(), dims,
+                                                                      source_coils, target_coils, dros, dros_offset,
+                                                                      dkernel_size, acceleration_factor, set,
+                                                                      system_matrix.get_data_ptr(),
+                                                                      b.get_data_ptr());
+
+        cudaError_t err = cudaGetLastError();
+        if( err != cudaSuccess ){
+          std::cerr << "htgrappa_calculate_grappa_unmixing: Unable to form system matrix: " <<
+            cudaGetErrorString(err) << std::endl;
+          return -1;
+        }
+
+        //  {
+        //      std::string filename = debugFolder+appendix+"A.cplx";
+            //write_cuNDArray_to_disk(&system_matrix, filename.c_str());
+        //  }
+
+        //  {
+        //      std::string filename = debugFolder+appendix+"b.cplx";
+            //write_cuNDArray_to_disk(&b, filename.c_str());
+        //  }
+
+        complext<float>  alpha = complext<float>(1);
+        complext<float>  beta = complext<float>(0);
+
+        cublasStatus_t stat;
+
+        if ( set == 0 )
+          {
+            {
+              //GPUTimer t2("Cgemm call");
+              stat = cublasCgemm(handle, CUBLAS_OP_C, CUBLAS_OP_N,
+                                 n,n,m,(float2*) &alpha,
+                                 (float2*) system_matrix.get_data_ptr(), m,
+                                 (float2*) system_matrix.get_data_ptr(), m,
+                                 (float2*) &beta, (float2*) AHA.get_data_ptr(), n);
+
+              if (stat != CUBLAS_STATUS_SUCCESS) {
+                std::cerr << "htgrappa_calculate_grappa_unmixing: Failed to form AHA product using cublas gemm" << std::endl;
+                std::cerr << "---- cublas error code " << stat << std::endl;
+                return -1;
+              }
+            }
+
+            {
+              //timer.start("copy AHA to host");
+              if (cudaMemcpy(pAHA, AHA.get_data_ptr(), AHA_host.get_number_of_bytes(), cudaMemcpyDeviceToHost) != cudaSuccess)
+                {
+                  std::cerr << "htgrappa_calculate_grappa_unmixing: Failed to copy AHA to host" << std::endl;
+                  return -1;
+                }
+              //timer.stop();
+
+              //timer.start("apply the regularization");
+              // apply the regularization
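+              // (each diagonal entry of AHA is replaced by its magnitude plus lamda*trace(|diag(AHA)|)/n,
+              //  a Tikhonov-style loading of the normal equations)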
+              double lamda = 0.0005;
+
+              double trA = std::sqrt(pAHA[0].x*pAHA[0].x + pAHA[0].y*pAHA[0].y);
+              size_t c;
+              for ( c=1; c<n; c++ )
+                {
+                  float x = pAHA[c+c*n].x;
+                  float y = pAHA[c+c*n].y;
+                  trA += std::sqrt(x*x+y*y);
+                }
+
+              double value = trA*lamda/n;
+              for ( c=0; c<n; c++ )
+                {
+                  float x = pAHA[c+c*n].x;
+                  float y = pAHA[c+c*n].y;
+                  pAHA[c+c*n].x = std::sqrt(x*x+y*y) + value;
+                  pAHA[c+c*n].y = 0;
+                }
+              //timer.stop();
+
+              //timer.start("copy the AHA to device");
+              if (cudaMemcpy(AHA.get_data_ptr(), pAHA, AHA_host.get_number_of_bytes(), cudaMemcpyHostToDevice) != cudaSuccess)
+                {
+                  std::cerr << "htgrappa_calculate_grappa_unmixing: Failed to copy regularized AHA to device" << std::endl;
+                  return -1;
+                }
+              //timer.stop();
+            }
+
+            AHA_set0 = AHA;
+          }
+        else
+          {
+            AHA = AHA_set0;
+          }
+
+        //  {
+        //      std::string filename = debugFolder+appendix+"AHA.cplx";
+        //write_cuNDArray_to_disk(&AHA, filename.c_str());
+        //  }
+
+        {
+
+          //GPUTimer timer("GRAPPA cublas gemm");
+          //TODO: Sort out arguments for source and target coils here.
+          stat = cublasCgemm(handle, CUBLAS_OP_C, CUBLAS_OP_N,
+                             n,target_coils,m,(float2*) &alpha,
+                             (float2*) system_matrix.get_data_ptr(), m,
+                             (float2*) b.get_data_ptr(), m,
+                             (float2*) &beta, (float2*)AHrhs.get_data_ptr(), n);
+
+        }
+
+        //  {
+        //      std::string filename = debugFolder+appendix+"AHrhs.cplx";
+        //write_cuNDArray_to_disk(&AHrhs, filename.c_str());
+        //  }
+
+        if (stat != CUBLAS_STATUS_SUCCESS) {
+          std::cerr << "htgrappa_calculate_grappa_unmixing: Failed to form AHrhs product using cublas gemm" << std::endl;
+          std::cerr << "---- cublas error code " << stat << std::endl;
+          return -1;
+        }
+
+
+    /*
+    {
+      //This is the OLD GPU code using CULA
+      GPUTimer gpu_invert_time("GPU Inversion time");
+      culaStatus s;
+      s = culaDeviceCgels( 'N', n, n, target_coils,
+                             (culaDeviceFloatComplex*)AHA.get_data_ptr(), n,
+                             (culaDeviceFloatComplex*)AHrhs.get_data_ptr(), n);
+
+
+      if (s != culaNoError) {
+        GDEBUG_STREAM("htgrappa_calculate_grappa_unmixing: linear solve failed" << std::endl);
+        return -1;
+      }
+    }
+    */
+
+
+    {
+      //It actually turns out to be faster to do this inversion on the CPU. The problem is probably too small for the GPU to pay off.
+      //GPUTimer cpu_invert_time("CPU Inversion time");
+      boost::shared_ptr< hoNDArray<T> > AHA_h = AHA.to_host();
+      boost::shared_ptr< hoNDArray<T> > AHrhs_h = AHrhs.to_host();
+
+      std::vector<size_t> perm_dim;
+      perm_dim.push_back(1);
+      perm_dim.push_back(0);
+
+      permute(AHA_h.get(),&perm_dim);
+      permute(AHrhs_h.get(),&perm_dim);
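+      // (the permutes transpose the matrices; presumably to match the storage layout expected by the host-side solver)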
+
+      ht_grappa_solve_spd_system(AHA_h.get(), AHrhs_h.get());
+
+      permute(AHrhs_h.get(),&perm_dim);
+      AHrhs = cuNDArray<T>(*AHrhs_h);
+    }
+
+#if 0
+        size_t free = 0, total = 0;
+        cudaMemGetInfo(&free, &total);
+        GDEBUG_STREAM("CUDA Memory: " << free << " (" << total << ")" << std::endl);
+#endif
+        //culaShutdown();
+
+        /*
+          if (cposv_wrapper(&AHA, &AHrhs) < 0) {
+          std::cerr << "htgrappa_calculate_grappa_unmixing: Error calling cgels" << std::endl;
+          return -1;
+          }
+        */
+
+        //  {
+        //      std::string filename = debugFolder+appendix+"AHrhs_solution.cplx";
+            //write_cuNDArray_to_disk(&AHrhs, filename.c_str());
+        //  }
+
+        gridDim = dim3((unsigned int) std::ceil((1.0f*n*source_coils)/blockDim.x), 1, 1 );
+
+        //TODO: This should be target coils used as argument here.
+        copy_grappa_coefficients_to_kernel_2d<<< gridDim, blockDim >>>( AHrhs.get_data_ptr(),
+                                                                        gkernel.get_data_ptr(),
+                                                                        source_coils,
+                                                                        target_coils,
+                                                                        dkernel_size,
+                                                                        acceleration_factor,
+                                                                        set);
+
+        //  {
+        //      std::string filename = debugFolder+appendix+"kernel.cplx";
+            //write_cuNDArray_to_disk(&gkernel, filename.c_str());
+        //  }
+
+        err = cudaGetLastError();
+        if( err != cudaSuccess ){
+          std::cerr << "htgrappa_calculate_grappa_unmixing: Failed to copy calculated coefficients to kernel: " <<
+            cudaGetErrorString(err) << std::endl;
+          return -1;
+        }
+
+      }
+
+    //{
+    //    std::string filename = debugFolder+"kernel_all.cplx";
+    //    write_cuNDArray_to_disk(&gkernel, filename.c_str());
+    //}
+
+    //TODO: This should be source coils
+    cuNDArray<T> tmp_mixing = cuNDArray<T>(ref_data->get_dimensions());
+
+    int kernel_elements = gkernel.get_number_of_elements()/target_coils;
+    int total_elements = tmp_mixing.get_number_of_elements()/source_coils;
+    dkernel_size.y *= acceleration_factor;
+
+    std::vector<size_t> ft_dims(2,0);ft_dims[1] = 1;
+    clear(out_mixing_coeff);
+    unsigned int current_uncombined_index = 0;
+
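+    // For each target coil the padded kernel is inverse FFT'ed into image space and accumulated into the
+    // coil-combined unmixing coefficients, weighted by conj(b1) (see scale_and_add_unmixing_coeffs above).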
+    //TODO: Loop over target coils.
+    for (unsigned int c = 0; c < target_coils; c++)
+      {
+        clear(&tmp_mixing);
+
+        dim3 blockDim(512,1,1);
+        dim3 gridDim((unsigned int) std::ceil((1.0f*kernel_elements)/blockDim.x), 1, 1 );
+
+        //TODO: Take source and target into consideration
+        copy_grappa_kernel_to_kspace_2d<<< gridDim, blockDim >>>((gkernel.get_data_ptr()+(c*kernel_elements)),
+                                                                 tmp_mixing.get_data_ptr(),
+                                                                 dims,
+                                                                 dkernel_size,
+                                                                 source_coils);
+
+        cudaError_t err = cudaGetLastError();
+        if( err != cudaSuccess ){
+          std::cerr << "htgrappa_calculate_grappa_unmixing: Unable to pad GRAPPA kernel: " <<
+            cudaGetErrorString(err) << std::endl;
+          return -1;
+        }
+
+        cuNDFFT<typename realType<T>::Type>::instance()->ifft(&tmp_mixing, &ft_dims);
+
+        float scale_factor = (float)std::sqrt((double)(RO*E1));
+
+        gridDim = dim3((unsigned int) std::ceil(1.0f*total_elements/blockDim.x), 1, 1 );
+        scale_and_add_unmixing_coeffs<<< gridDim, blockDim >>>(tmp_mixing.get_data_ptr(),
+                                                               (b1->get_data_ptr()+ c*total_elements),
+                                                               out_mixing_coeff->get_data_ptr(),
+                                                               total_elements,
+                                                               source_coils,
+                                                               scale_factor);
+        err = cudaGetLastError();
+        if( err != cudaSuccess ){
+          std::cerr << "htgrappa_calculate_grappa_unmixing: scale and add mixing coeffs: " <<
+            cudaGetErrorString(err) << std::endl;
+          return -1;
+        }
+
+        if (uncombined_channels) {
+          std::list<unsigned int>::iterator it = std::find((*uncombined_channels).begin(),(*uncombined_channels).end(),c);
+          if (it != (*uncombined_channels).end()) {
+            current_uncombined_index++;
+            scale_and_copy_unmixing_coeffs<<< gridDim, blockDim >>>(tmp_mixing.get_data_ptr(),
+                                                                    (out_mixing_coeff->get_data_ptr()+current_uncombined_index*total_elements*source_coils),
+                                                                    total_elements,
+                                                                    source_coils,
+                                                                    scale_factor);
+          }
+        }
+
+      }
+
+    //GDEBUG_STREAM("**********cublasDestroy()**************" << std::endl);
+    //cublasDestroy_v2(handle);
+
+    return 0;
+  }
+
+  template <class T> int inverse_clib_matrix(cuNDArray<T>* A,
+                                             cuNDArray<T>* b,
+                                             cuNDArray<T>* coeff,
+                                             double lamda)
+  {
+    // A: M*N
+    // b: M*K
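+    // coeff: N*K, the least-squares solution of A*coeff ~= b via the normal equations, with optional diagonal loading controlled by lamda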
+    size_t M = A->get_size(0);
+    size_t N = A->get_size(1);
+
+    size_t K = b->get_size(1);
+
+    std::vector<size_t> AHA_dims(2,N);
+    cuNDArray<T> AHA = cuNDArray<T>(&AHA_dims);
+
+    std::vector<size_t> AHrhs_dims;
+    AHrhs_dims.push_back(N);
+    AHrhs_dims.push_back(K);
+
+    coeff->create(&AHrhs_dims);
+
+    cublasHandle_t handle = *CUBLASContextProvider::instance()->getCublasHandle();
+
+    complext<float>  alpha = complext<float>(1);
+    complext<float>  beta = complext<float>(0);
+
+    //{
+    //    std::string filename = debugFolder+"A.cplx";
+    //    write_cuNDArray_to_disk(A, filename.c_str());
+    //}
+
+    //{
+    //    std::string filename = debugFolder+"b.cplx";
+    //    write_cuNDArray_to_disk(b, filename.c_str());
+    //}
+
+    {
+      //GPUTimer t2("compute AHA ...");
+      cublasStatus_t stat = cublasCgemm(handle, CUBLAS_OP_C, CUBLAS_OP_N,
+                                        N,N,M,(float2*) &alpha,
+                                        (float2*) A->get_data_ptr(), M,
+                                        (float2*) A->get_data_ptr(), M,
+                                        (float2*) &beta, (float2*) AHA.get_data_ptr(), N);
+
+      if (stat != CUBLAS_STATUS_SUCCESS)
+        {
+          std::cerr << "inverse_clib_matrix: Failed to form AHA product using cublas gemm" << std::endl;
+          std::cerr << "---- cublas error code " << stat << std::endl;
+          return -1;
+        }
+    }
+
+    //{
+    //    std::string filename = debugFolder+"AHA.cplx";
+    //    write_cuNDArray_to_disk(&AHA, filename.c_str());
+    //}
+
+    {
+      //GPUTimer t2("compute AHrhs ...");
+      cublasStatus_t stat = cublasCgemm(handle, CUBLAS_OP_C, CUBLAS_OP_N,
+                                        N,K,M,(float2*) &alpha,
+                                        (float2*) A->get_data_ptr(), M,
+                                        (float2*) b->get_data_ptr(), M,
+                                        (float2*) &beta, (float2*)coeff->get_data_ptr(), N);
+
+      if (stat != CUBLAS_STATUS_SUCCESS)
+        {
+          std::cerr << "inverse_clib_matrix: Failed to form AHrhs product using cublas gemm" << std::endl;
+          std::cerr << "---- cublas error code " << stat << std::endl;
+          return -1;
+        }
+    }
+
+    //{
+    //    std::string filename = debugFolder+"AHrhs.cplx";
+    //    write_cuNDArray_to_disk(coeff, filename.c_str());
+    //}
+
+    // apply the regularization
+    if ( lamda > 0 )
+      {
+        hoNDArray<T> AHA_host(N, N);
+        float2* pAHA = (float2*) AHA_host.get_data_ptr();
+
+        //GadgetronTimer timer;
+
+        //timer.start("copy AHA to host");
+        if (cudaMemcpy(pAHA, AHA.get_data_ptr(), AHA_host.get_number_of_bytes(), cudaMemcpyDeviceToHost) != cudaSuccess)
+          {
+            std::cerr << "inverse_clib_matrix: Failed to copy AHA to host" << std::endl;
+            return -1;
+          }
+        //timer.stop();
+
+        //timer.start("apply the regularization");
+        // apply the regularization
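+        // (same diagonal loading as in htgrappa_calculate_grappa_unmixing above: |diag| + lamda*trace(|diag(AHA)|)/N)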
+        double trA = std::sqrt(pAHA[0].x*pAHA[0].x + pAHA[0].y*pAHA[0].y);
+        size_t c;
+        for ( c=1; c<N; c++ )
+          {
+            float x = pAHA[c+c*N].x;
+            float y = pAHA[c+c*N].y;
+            trA += std::sqrt(x*x+y*y);
+          }
+
+        double value = trA*lamda/N;
+        for ( c=0; c<N; c++ )
+          {
+            float x = pAHA[c+c*N].x;
+            float y = pAHA[c+c*N].y;
+            pAHA[c+c*N].x = std::sqrt(x*x+y*y) + value;
+            pAHA[c+c*N].y = 0;
+          }
+        //timer.stop();
+
+        //timer.start("copy the AHA to device");
+        if (cudaMemcpy(AHA.get_data_ptr(), pAHA, AHA_host.get_number_of_bytes(), cudaMemcpyHostToDevice) != cudaSuccess)
+          {
+            std::cerr << "inverse_clib_matrix: Failed to copy regularized AHA to device" << std::endl;
+            return -1;
+          }
+        //timer.stop();
+      }
+
+    /*
+      culaStatus s;
+      s = culaDeviceCgels( 'N', N, N, K,
+      (culaDeviceFloatComplex*)AHA.get_data_ptr(), N,
+      (culaDeviceFloatComplex*)coeff->get_data_ptr(), N);
+    */
+    {
+      //It actually turns out to be faster to do this inversion on the CPU. The problem is probably too small for the GPU to pay off.
+      //GPUTimer cpu_invert_time("CPU Inversion time");
+      boost::shared_ptr< hoNDArray<T> > AHA_h = AHA.to_host();
+      boost::shared_ptr< hoNDArray<T> > AHrhs_h = coeff->to_host();
+
+      std::vector<size_t> perm_dim;
+      perm_dim.push_back(1);
+      perm_dim.push_back(0);
+
+      permute(AHA_h.get(),&perm_dim);
+      permute(AHrhs_h.get(),&perm_dim);
+
+      ht_grappa_solve_spd_system(AHA_h.get(), AHrhs_h.get());
+
+      permute(AHrhs_h.get(),&perm_dim);
+      *coeff = cuNDArray<T>(*AHrhs_h);
+    }
+
+
+    //{
+    //    std::string filename = debugFolder+"coeff.cplx";
+    //    write_cuNDArray_to_disk(coeff, filename.c_str());
+    //}
+
+    /*
+    if (s != culaNoError)
+      {
+        GDEBUG_STREAM("inverse_clib_matrix: linear solve failed" << std::endl);
+        return -1;
+      }
+    */
+    return 0;
+  }
+
+  //Template instantiation
+  template EXPORTGPUPMRI int htgrappa_calculate_grappa_unmixing(cuNDArray<complext<float> >* ref_data,
+                                                                cuNDArray<complext<float> >* b1,
+                                                                unsigned int acceleration_factor,
+                                                                std::vector<unsigned int> *kernel_size,
+                                                                cuNDArray<complext<float> >* out_mixing_coeff,
+                                                                std::vector< std::pair<unsigned int, unsigned int> >* sampled_region,
+                                                                std::list< unsigned int >* uncombined_channels);
+
+  template EXPORTGPUPMRI int inverse_clib_matrix(cuNDArray<complext<float> >* A,
+                                                 cuNDArray<complext<float> >* b,
+                                                 cuNDArray<complext<float> >* coeff,
+                                                 double lamda);
+}
diff --git a/toolboxes/mri/pmri/gpu/htgrappa.h b/toolboxes/mri/pmri/gpu/htgrappa.h
new file mode 100644
index 0000000..06ae26d
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/htgrappa.h
@@ -0,0 +1,36 @@
+/** \file htgrappa.h
+    \brief Utilities to calibrate grappa weights and corresponding unmixing coefficients - GPU-based.
+*/
+
+#pragma once
+#ifndef HTGRAPPA_H
+#define HTGRAPPA_H
+
+#include "gpupmri_export.h"
+#include "cuNDArray.h"
+
+#include <list>
+
+namespace Gadgetron
+{
+
+  template <class T> EXPORTGPUPMRI 
+  int htgrappa_calculate_grappa_unmixing(cuNDArray<T>* ref_data, 
+                                         cuNDArray<T>* b1,
+                                         unsigned int acceleration_factor,
+                                         std::vector<unsigned int>* kernel_size,
+                                         cuNDArray<T>* out_mixing_coeff,
+                                         std::vector< std::pair<unsigned int, unsigned int> >* sampled_region = 0, 
+                                         std::list< unsigned int >* uncombined_channels = 0);
+  
+  template <class T> EXPORTGPUPMRI 
+  int inverse_clib_matrix(cuNDArray<T>* A, 
+                          cuNDArray<T>* b,
+                          cuNDArray<T>* out_mixing_coeff, 
+                          double lamda);  
+
+  template <class T> void ht_grappa_solve_spd_system(hoNDArray<T> *A, hoNDArray<T> *B);
+
+}
+
+#endif //HTGRAPPA_H
diff --git a/toolboxes/mri/pmri/gpu/htgrappa_test.cpp b/toolboxes/mri/pmri/gpu/htgrappa_test.cpp
new file mode 100644
index 0000000..db0428d
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/htgrappa_test.cpp
@@ -0,0 +1,64 @@
+#include <iostream>
+#include <memory>
+
+#include "cuNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDFFT.h"
+#include "GPUTimer.h"
+#include "htgrappa.h"
+
+using namespace Gadgetron;
+int main(int argc, char** argv)
+{
+  GDEBUG_STREAM("Simple HTGRAPPA program" << std::endl);
+  {
+    GPUTimer init_time("CUDA Initialization");
+  }
+  GPUTimer total_time("Total time elapsed");
+  
+
+  GPUTimer* timer_ptr = new GPUTimer("Loading data");
+  hoNDArray<cuFloatComplex> time_average_host = 
+    read_nd_array<cuFloatComplex>("time_average.cplx");
+
+  hoNDArray<cuFloatComplex> b1_host = 
+    read_nd_array<cuFloatComplex>("b1.cplx");
+
+  cuNDArray<cuFloatComplex> time_average_dev(time_average_host);
+  cuNDArray<cuFloatComplex> b1_dev(b1_host);
+  delete timer_ptr;
+
+  cuNDArray<cuFloatComplex> unmixing_dev;
+  if (!unmixing_dev.create(b1_dev.get_dimensions())) {
+    GDEBUG_STREAM("Unable to allocate memory for GRAPPA unmixing coefficients" << std::endl);
+    return 0;
+  }
+
+  {
+    GPUTimer unmix_timer("GRAPPA Unmixing");
+    std::vector<unsigned int> kernel_size;
+    kernel_size.push_back(5);
+    kernel_size.push_back(4);
+    if ( htgrappa_calculate_grappa_unmixing(&time_average_dev, 
+					    &b1_dev,
+					    4,
+					    &kernel_size,
+					    &unmixing_dev) < 0) {
+      GDEBUG_STREAM("Error calculating unmixing coefficients" << std::endl);
+    }
+  }
+
+  /*
+  std::auto_ptr< cuNDArray<float2> > b1 = 
+    estimate_b1_map<uint2, float, float2>(&time_average_dev);
+  */
+
+  timer_ptr = new GPUTimer("Saving data");
+  hoNDArray<cuFloatComplex> average_image = time_average_dev.to_host();
+  write_nd_array<cuFloatComplex>(average_image, "average_image.cplx");
+  delete timer_ptr;
+
+  GDEBUG_STREAM("Reconstruction done" << std::endl);
+
+  return 0;
+}
diff --git a/toolboxes/mri/pmri/gpu/osSenseOperator.h b/toolboxes/mri/pmri/gpu/osSenseOperator.h
new file mode 100644
index 0000000..3d324eb
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/osSenseOperator.h
@@ -0,0 +1,97 @@
+/*
+ * osSenseOperator.h
+ *
+ *  Created on: Mar 31, 2015
+ *      Author: dch
+ */
+
+#pragma once
+#include "subsetOperator.h"
+#include <boost/shared_ptr.hpp>
+#include <boost/make_shared.hpp>
+#include "sense_utilities.h"
+namespace Gadgetron {
+
+template<class ARRAY, unsigned int D, class FFTOperator> class osSenseOperator: public virtual Gadgetron::subsetOperator<ARRAY>, public virtual FFTOperator {
+public:
+	osSenseOperator(): coils_per_subset(1){};
+	virtual ~osSenseOperator(){};
+	typedef typename ARRAY::element_type ELEMENT_TYPE;
+	typedef typename realType<ELEMENT_TYPE>::Type REAL;
+
+	virtual void set_csm(boost::shared_ptr<ARRAY> csm){
+		if (csm->get_size(csm->get_number_of_dimensions()-1)%coils_per_subset != 0)
+			throw std::runtime_error("osSenseOperator: number of coils in coil sensitivity map must be divisible by coils per subset");
+		this->csm = csm;
+	}
+	virtual boost::shared_ptr<ARRAY> get_csm(){
+		return csm;
+	}
+	virtual int get_number_of_subsets() override {
+		return csm->get_size(csm->get_number_of_dimensions()-1)/coils_per_subset;
+	}
+
+	virtual void set_coils_per_subset(unsigned int coils_per_subset){
+		this->coils_per_subset = coils_per_subset;
+	}
+
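+	// Each subset corresponds to a contiguous block of coils_per_subset coils in the csm array;
+	// mult_M applies that block's coil sensitivities and then the encoding provided by FFTOperator.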
+	virtual void mult_M(ARRAY* in, ARRAY* out, int subset, bool accumulate) override {
+		auto subsize = *csm->get_dimensions();
+		subsize.pop_back();
+		subsize.push_back(coils_per_subset);
+		auto num_elements = std::accumulate(subsize.begin(),subsize.end(),size_t(1),std::multiplies<size_t>());
+		ARRAY sub_csm(subsize,csm->get_data_ptr()+num_elements*subset);
+		auto in_dims = *in->get_dimensions();
+		in_dims.push_back(coils_per_subset);
+		ARRAY tmp(in_dims);
+		csm_mult_M<REAL,D>(in,&tmp,&sub_csm);
+		FFTOperator::mult_M(&tmp,out,accumulate);
+	}
+
+	virtual void mult_M(ARRAY* in, ARRAY* out, bool accumulate) final {
+		subsetOperator<ARRAY>::mult_M(in,out,accumulate);
+	}
+
+	virtual void mult_MH(ARRAY* in, ARRAY* out, bool accumulate) final {
+		subsetOperator<ARRAY>::mult_MH(in,out,accumulate);
+	}
+
+	virtual void mult_MH_M(ARRAY* in, ARRAY* out, bool accumulate) final {
+
+		ARRAY tmp(this->codomain_dims_);
+		mult_M(in,&tmp,false);
+		mult_MH(&tmp,out,accumulate);
+	}
+
+	virtual void mult_MH(ARRAY* in, ARRAY* out, int subset, bool accumulate) override {
+		auto subsize = *csm->get_dimensions();
+		subsize.pop_back();
+		subsize.push_back(coils_per_subset);
+		auto num_elements = std::accumulate(subsize.begin(),subsize.end(),size_t(1),std::multiplies<size_t>());
+		ARRAY sub_csm(subsize,csm->get_data_ptr()+num_elements*subset);
+
+		auto out_dims = *out->get_dimensions();
+		out_dims.push_back(coils_per_subset);
+		ARRAY tmp(out_dims);
+
+		FFTOperator::mult_MH(in,&tmp,accumulate);
+
+		csm_mult_MH<REAL,D>(&tmp,out,&sub_csm);
+
+	}
+
+	virtual boost::shared_ptr<std::vector<size_t> > get_codomain_dimensions(int subset) override{
+		auto codom_dims = boost::make_shared<std::vector<size_t>>(this->codomain_dims_);
+		codom_dims->back() = coils_per_subset;
+		return codom_dims;
+	}
+
+
+
+protected:
+	boost::shared_ptr<ARRAY> csm;
+	unsigned int coils_per_subset;
+};
+
+} /* namespace Gadgetron */
+
diff --git a/toolboxes/mri/pmri/gpu/senseOperator.h b/toolboxes/mri/pmri/gpu/senseOperator.h
new file mode 100644
index 0000000..8047497
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/senseOperator.h
@@ -0,0 +1,48 @@
+/** \file senseOperator.h
+    \brief Base class for all Sense operators
+*/
+
+#pragma once
+
+#include "linearOperator.h"
+#include "gpupmri_export.h"
+
+#include <boost/shared_ptr.hpp>
+#include <iostream>
+
+namespace Gadgetron{
+
+  template<class ARRAY_TYPE, unsigned int D> class EXPORTGPUPMRI senseOperator : public linearOperator<ARRAY_TYPE>
+  {
+
+  public:
+
+    senseOperator() : linearOperator<ARRAY_TYPE>(), ncoils_(0) {}
+    virtual ~senseOperator() {}
+
+    inline unsigned int get_number_of_coils() { return ncoils_; }
+    inline boost::shared_ptr<ARRAY_TYPE> get_csm() { return csm_; }
+    
+    virtual void set_csm( boost::shared_ptr<ARRAY_TYPE> csm )
+    {
+      if( csm.get() && csm->get_number_of_dimensions() == D+1 ) {
+	csm_ = csm;      
+	ncoils_ = csm_->get_size(D);
+      }
+      else{
+	throw std::runtime_error("Error: senseOperator::set_csm : unexpected csm dimensionality");
+      }    
+    }
+
+    virtual void mult_M( ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate = false ) = 0;
+    virtual void mult_MH( ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate = false ) = 0;
+
+    virtual void mult_csm( ARRAY_TYPE* in, ARRAY_TYPE* out ) = 0;
+    virtual void mult_csm_conj_sum( ARRAY_TYPE* in, ARRAY_TYPE* out) = 0;
+
+  protected:
+
+    unsigned int ncoils_;
+    boost::shared_ptr< ARRAY_TYPE > csm_;
+  };
+}
diff --git a/toolboxes/mri/pmri/gpu/sense_utilities.cu b/toolboxes/mri/pmri/gpu/sense_utilities.cu
new file mode 100644
index 0000000..0c486bd
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/sense_utilities.cu
@@ -0,0 +1,146 @@
+#include "sense_utilities.h"
+#include "vector_td_utilities.h"
+#include <sstream>
+
+namespace Gadgetron{
+
+  template<class REAL> __global__ void 
+  mult_csm_kernel( const complext<REAL> * __restrict__ in, complext<REAL> * __restrict__ out, complext<REAL> *csm,
+		   size_t image_elements, unsigned int nframes, unsigned int ncoils )
+  {
+    unsigned int idx = blockIdx.x*blockDim.x+threadIdx.x;
+    if( idx < image_elements) {
+      complext<REAL> _in = in[idx+blockIdx.y*image_elements];
+      for( unsigned int i=0; i<ncoils; i++) {
+	out[idx + blockIdx.y*image_elements + i*image_elements*nframes] =  _in * csm[idx+i*image_elements];
+      }
+    }
+  }
+
+  template<class REAL, unsigned int D> void
+  csm_mult_M( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, cuNDArray< complext<REAL> > *csm )
+  {  
+    int device;
+    if( cudaGetDevice( &device ) != cudaSuccess ){
+      throw cuda_error( "mult_csm: unable to query current device");
+    }
+  
+    if( !in || in->get_device() != device || !out || out->get_device() != device || !csm || csm->get_device() != device ){
+      throw cuda_error("mult_csm: array not residing current device");
+    }
+  
+    if( in->get_number_of_dimensions() < D  || in->get_number_of_dimensions() > D+1 ){
+      throw std::runtime_error("mult_csm: unexpected input dimensionality");
+    }
+  
+    if( in->get_number_of_dimensions() > out->get_number_of_dimensions() ){
+      throw std::runtime_error("mult_csm: input dimensionality cannot exceed output dimensionality");
+    }
+
+    if( csm->get_number_of_dimensions() != D+1 ) {
+      throw std::runtime_error("mult_csm: input dimensionality of csm not as expected");
+    }
+
+    unsigned int num_image_elements = 1;
+    for( unsigned int d=0; d<D; d++ )
+      num_image_elements *= in->get_size(d);
+  
+    unsigned int num_frames = in->get_number_of_elements() / num_image_elements;
+  
+    dim3 blockDim(256);
+    dim3 gridDim((num_image_elements+blockDim.x-1)/blockDim.x, num_frames);
+
+    mult_csm_kernel<REAL><<< gridDim, blockDim >>>
+      ( in->get_data_ptr(), out->get_data_ptr(), csm->get_data_ptr(), num_image_elements, num_frames, csm->get_size(D) );
+
+    cudaError_t err = cudaGetLastError();
+    if( err != cudaSuccess ){
+      std::stringstream ss;
+      ss << "mult_csm: unable to multiply with coil sensitivities: " <<
+	cudaGetErrorString(err);
+      throw cuda_error(ss.str());
+
+    }
+  }
+
+  template <class REAL> __global__ void 
+  mult_csm_conj_sum_kernel(const  complext<REAL> * __restrict__ in, complext<REAL> * __restrict__ out, const complext<REAL> * __restrict__ csm,
+			    size_t image_elements, unsigned int nframes, unsigned int ncoils )
+  {
+    unsigned int idx = blockIdx.x*blockDim.x+threadIdx.x;
+    if( idx < image_elements ) {
+      complext<REAL> _out =complext<REAL>(0);
+      for( unsigned int i = 0; i < ncoils; i++ ) {
+	_out += in[idx+blockIdx.y*image_elements+i*nframes*image_elements] * conj(csm[idx+i*image_elements]);
+      }
+      out[idx+blockIdx.y*image_elements] = _out;
+    }
+  }
+
+  template<class REAL, unsigned int D> void
+  csm_mult_MH( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out, cuNDArray<complext<REAL> > *csm )
+  {
+    int device;
+    if( cudaGetDevice( &device ) != cudaSuccess ){
+      throw cuda_error("mult_csm_conj_sum: unable to query current device");
+    }
+  
+    if( !in || in->get_device() != device || !out || out->get_device() != device || !csm || csm->get_device() != device ){
+      throw std::runtime_error("mult_csm_conj_sum: array not residing current device");
+    }
+  
+    if( out->get_number_of_dimensions() < D  || out->get_number_of_dimensions() > D+1 ){
+      throw std::runtime_error("mult_csm_conj_sum: unexpected output dimensionality");
+    }
+
+    if( out->get_number_of_dimensions() > in->get_number_of_dimensions() ){
+      throw std::runtime_error("mult_csm_conj_sum: output dimensionality cannot exceed input dimensionality");
+    }
+
+    if( csm->get_number_of_dimensions() != D+1 ) {
+      throw std::runtime_error("mult_csm_conj_sum: input dimensionality of csm not as expected");
+    }
+
+    unsigned int num_image_elements = 1;
+    for( unsigned int d=0; d<D; d++ )
+      num_image_elements *= out->get_size(d);
+  
+    unsigned int num_frames = out->get_number_of_elements() / num_image_elements;
+
+    dim3 blockDim(256);
+    dim3 gridDim((num_image_elements+blockDim.x-1)/blockDim.x, num_frames);
+
+    mult_csm_conj_sum_kernel<REAL><<< gridDim, blockDim >>>
+      ( in->get_data_ptr(), out->get_data_ptr(), csm->get_data_ptr(), num_image_elements, num_frames, csm->get_size(D) );
+
+    cudaError_t err = cudaGetLastError();
+    if( err != cudaSuccess ){
+      std::stringstream ss;
+      ss << "mult_csm_conj_sum: unable to combine coils " <<
+	cudaGetErrorString(err);
+      throw cuda_error(ss.str());
+    }
+  }
+
+  // Instantiation
+
+  template EXPORTGPUPMRI void csm_mult_M<float,1>( cuNDArray< complext<float> >*, cuNDArray< complext<float> >*, cuNDArray< complext<float> >*);
+  template EXPORTGPUPMRI void csm_mult_M<float,2>( cuNDArray< complext<float> >*, cuNDArray< complext<float> >*, cuNDArray< complext<float> >*);
+  template EXPORTGPUPMRI void csm_mult_M<float,3>( cuNDArray< complext<float> >*, cuNDArray< complext<float> >*, cuNDArray< complext<float> >*);
+  template EXPORTGPUPMRI void csm_mult_M<float,4>( cuNDArray< complext<float> >*, cuNDArray< complext<float> >*, cuNDArray< complext<float> >*);
+
+  template EXPORTGPUPMRI void csm_mult_M<double,1>( cuNDArray< complext<double> >*, cuNDArray< complext<double> >*, cuNDArray< complext<double> >*);
+  template EXPORTGPUPMRI void csm_mult_M<double,2>( cuNDArray< complext<double> >*, cuNDArray< complext<double> >*, cuNDArray< complext<double> >*);
+  template EXPORTGPUPMRI void csm_mult_M<double,3>( cuNDArray< complext<double> >*, cuNDArray< complext<double> >*, cuNDArray< complext<double> >*);
+  template EXPORTGPUPMRI void csm_mult_M<double,4>( cuNDArray< complext<double> >*, cuNDArray< complext<double> >*, cuNDArray< complext<double> >*);
+
+  template EXPORTGPUPMRI void csm_mult_MH<float,1>( cuNDArray< complext<float> >*, cuNDArray< complext<float> >*, cuNDArray< complext<float> >*);
+  template EXPORTGPUPMRI void csm_mult_MH<float,2>( cuNDArray< complext<float> >*, cuNDArray< complext<float> >*, cuNDArray< complext<float> >*);
+  template EXPORTGPUPMRI void csm_mult_MH<float,3>( cuNDArray< complext<float> >*, cuNDArray< complext<float> >*, cuNDArray< complext<float> >*);
+  template EXPORTGPUPMRI void csm_mult_MH<float,4>( cuNDArray< complext<float> >*, cuNDArray< complext<float> >*, cuNDArray< complext<float> >*);
+
+  template EXPORTGPUPMRI void csm_mult_MH<double,1>( cuNDArray< complext<double> >*, cuNDArray< complext<double> >*, cuNDArray< complext<double> >*);
+  template EXPORTGPUPMRI void csm_mult_MH<double,2>( cuNDArray< complext<double> >*, cuNDArray< complext<double> >*, cuNDArray< complext<double> >*);
+  template EXPORTGPUPMRI void csm_mult_MH<double,3>( cuNDArray< complext<double> >*, cuNDArray< complext<double> >*, cuNDArray< complext<double> >*);
+  template EXPORTGPUPMRI void csm_mult_MH<double,4>( cuNDArray< complext<double> >*, cuNDArray< complext<double> >*, cuNDArray< complext<double> >*);
+}
diff --git a/toolboxes/mri/pmri/gpu/sense_utilities.h b/toolboxes/mri/pmri/gpu/sense_utilities.h
new file mode 100644
index 0000000..56e26f7
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/sense_utilities.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "cuNDArray.h"
+#include "complext.h"
+#include "gpupmri_export.h"
+
+namespace Gadgetron{
+
+// Multiply with coil sensitivities
+//
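+// (per image element x and coil c: out(x,c) = csm(x,c) * in(x), repeated for every frame)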
+
+template< class REAL, unsigned int D> EXPORTGPUPMRI void
+csm_mult_M( cuNDArray< complext<REAL> > *in, 
+	    cuNDArray< complext<REAL> > *out, 
+	    cuNDArray< complext<REAL> > *csm );
+
+
+// Multiply with adjoint of coil sensitivities
+//
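+// (per image element x: out(x) = sum over coils c of conj(csm(x,c)) * in(x,c))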
+
+template< class REAL, unsigned int D> EXPORTGPUPMRI void
+csm_mult_MH( cuNDArray< complext<REAL> > *in, 
+	     cuNDArray< complext<REAL> > *out, 
+	     cuNDArray< complext<REAL> > *csm );
+}
diff --git a/toolboxes/mri/pmri/gpu/spirit_calibration.cu b/toolboxes/mri/pmri/gpu/spirit_calibration.cu
new file mode 100644
index 0000000..f2499a8
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/spirit_calibration.cu
@@ -0,0 +1,363 @@
+#include "spirit_calibration.h"
+#include "vector_td_operators.h"
+#include "vector_td_utilities.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_reductions.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_blas.h"
+#include "cuNDFFT.h"
+#include "cudaDeviceManager.h"
+#include "setup_grid.h"
+#include "complext.h"
+#include "CUBLASContextProvider.h"
+#include "GPUTimer.h"
+#include "hoNDArray_fileio.h"
+#include "hoNDArray_utils.h"
+#include "htgrappa.h"
+
+#include <cublas_v2.h>
+//#include <cula_lapack_device.h>
+
+namespace Gadgetron {
+
+  static __global__ void 
+  compute_system_matrix_kernel( intd2 dims,
+                                int num_coils,
+                                int kernel_size,
+                                float_complext *kspace,
+                                float_complext *A )
+  {
+    // The grid contains one thread per coil element.
+    // Each thread reads its corresponding data element and is responsible
+    // for filling it into the corresponding kernel_size*kernel_size-1 entries in the matrix.
+    //
+    // The storage format is column major due to BLAS/LAPACK conventions.
+    // This increases the overhead of writes in this kernel (they are non-coalesced and there are many of them).
+    // TODO: optimize for performance.
+    //
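+    // (A has elements_per_coil rows, one per k-space location, and num_coils*(kernel_size*kernel_size-1)
+    //  columns, one per (coil, in-kernel offset) pair with the central offset excluded)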
+    
+    const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+    const int elements_per_coil = prod(dims);
+
+    if( idx < elements_per_coil*num_coils ){
+
+      // Get the k-space value for this thread
+      //
+
+      float_complext val = kspace[idx];
+
+      const int num_kernel_elements = kernel_size*kernel_size-1;
+      const int coil = idx/elements_per_coil;
+      const int idx_in_coil = idx-coil*elements_per_coil;
+
+      // Loop over the number of outputs produced per thread
+      //
+
+      const int half_kernel_size = kernel_size>>1;
+
+      for( int j = -half_kernel_size; j<half_kernel_size+1; j++ ){ // row iterator
+        for( int i = -half_kernel_size; i<half_kernel_size+1; i++ ){ // column iterator
+
+          if( j==0 && i==0 ) continue; // The weight of the central points is set to 0
+
+          int kernel_idx = co_to_idx( intd2(i+half_kernel_size,j+half_kernel_size), intd2(kernel_size,kernel_size) );
+          if( (j==0 && i>0) || j>0 ) kernel_idx--;
+
+          const int m = 
+            (idx_in_coil+j*dims[0]+i+elements_per_coil)%elements_per_coil; // row idx
+
+          const int n = 
+            coil*num_kernel_elements + kernel_idx;
+          
+          const int A_idx = 
+            n*elements_per_coil + m; // Column major storage
+
+          A[A_idx] = val;
+        }
+      }      
+    }
+  }
+  
+
+  static __global__ void 
+  write_convolution_masks_kernel( intd2 dims,
+                                  int num_coils,
+                                  int kernel_size,
+                                  float_complext *kernels,
+                                  float_complext *kspace )
+  {
+    // Write out convolution masks in the center of kspace
+    // - thus prepare for FFT into image space
+    //
+
+    const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+    const int elements_per_coil = prod(dims);
+
+    if( idx < elements_per_coil*num_coils*num_coils ){
+
+      const int half_kernel_size = kernel_size>>1;
+      const int num_kernel_elements = kernel_size*kernel_size-1;
+      const int batch = idx/(elements_per_coil*num_coils);
+      const int idx_in_batch = idx-batch*elements_per_coil*num_coils;
+      const int coil = idx_in_batch/elements_per_coil;
+      const int idx_in_coil = idx_in_batch-coil*elements_per_coil;
+      const intd2 co = idx_to_co( idx_in_coil, dims ) - (dims>>1);
+
+      if( co[1] >= -half_kernel_size && co[1] <= half_kernel_size && 
+          co[0] >= -half_kernel_size && co[0] <= half_kernel_size ){
+
+        // Compute kernel index 
+        // - keeping in mind the central elements are missing (forced to 0)
+        //
+        
+        int kernel_idx = co_to_idx( co+intd2(half_kernel_size, half_kernel_size), intd2(kernel_size, kernel_size) );
+        
+        if( co[1] == 0 && co[0] == 0 ) {
+          kspace[idx] = float_complext(0.0f);
+        }
+        else {
+          if( (co[1]==0 && co[0]>0) || co[1]>0 ) kernel_idx--;
+          kspace[idx] = kernels[batch*num_kernel_elements*num_coils + coil*num_kernel_elements + kernel_idx];
+        }
+      }
+      else{
+        kspace[idx] = float_complext(0.0f);
+      }          
+    }
+  }
+
+  boost::shared_ptr< cuNDArray<float_complext> > 
+  estimate_spirit_kernels( cuNDArray<float_complext> *_kspace, unsigned int kernel_size )
+  {
+    // Calibration is performed in k-space. 
+    // The result is Fourier transformed and returned as image space kernels.
+    // The convolution is expressed explicitly as a matrix equation and solved using BLAS/LAPACK.
+    //
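+    // (each k-space sample is expressed as a linear combination of its neighbours across all coils,
+    //  the central point excluded; that relation is written as the dense system formed below and
+    //  solved via its normal equations)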
+
+    if( _kspace == 0x0 ){
+      throw std::runtime_error("estimate_spirit_kernels: 0x0 input array");
+    }
+    
+    if( _kspace->get_number_of_dimensions() != 3 ) {
+      throw std::runtime_error("estimate_spirit_kernels: Only 2D spirit is supported currently");
+    }
+
+    if( (kernel_size%2) == 0 ) {
+      throw std::runtime_error("estimate_spirit_kernels: The kernel size should be odd");
+    }
+
+
+    // Normalize input array to an average intensity of one per element
+    //
+    std::vector<size_t> old_dims = *_kspace->get_dimensions();
+    std::vector<size_t> dims= old_dims;
+    /*dims[0] /= 2;
+    dims[1] /= 2;*/
+    //dims[0]=36;
+    //dims[1]=36;
+    //cuNDArray<float_complext> kspace(_kspace);
+    cuNDArray<float_complext> kspace(dims);
+
+    vector_td<size_t,2> offset((old_dims[0]-dims[0])/2,(old_dims[1]-dims[1])/2);
+    crop<float_complext,2>(offset,_kspace,&kspace);
+    float sum = nrm2(&kspace);    
+    float_complext in_max = kspace[amax(&kspace)];
+    kspace /= (float(kspace.get_number_of_elements())/sum);
+    unsigned int num_coils = kspace.get_size(kspace.get_number_of_dimensions()-1);
+    unsigned int elements_per_coil = kspace.get_number_of_elements()/num_coils;
+    
+    std::vector<size_t> out_dims;
+    out_dims.push_back(_kspace->get_size(0)); out_dims.push_back(_kspace->get_size(1));
+    out_dims.push_back(num_coils*num_coils);
+    
+    boost::shared_ptr< cuNDArray<float_complext> > kernel_images
+      ( new cuNDArray<float_complext>(&out_dims) );
+
+    // Initialize to 1/num_coils in case we terminate early
+    //
+
+    fill(kernel_images.get(), float_complext(1.0f/num_coils));
+
+    // Form m x n system matrix A
+    //
+
+    unsigned int m = elements_per_coil;
+    unsigned int n = num_coils*(kernel_size*kernel_size-1);
+
+    std::vector<size_t> A_dims; A_dims.push_back(m); A_dims.push_back(n);    
+    cuNDArray<float_complext> A(&A_dims); clear(&A);
+
+    // Fill system matrix
+    //
+
+    dim3 blockDim; dim3 gridDim;
+    setup_grid( kspace.get_number_of_elements(), &blockDim, &gridDim );
+    
+    compute_system_matrix_kernel<<< gridDim, blockDim >>>
+      ( intd2(kspace.get_size(0), kspace.get_size(1)), num_coils, kernel_size,
+        kspace.get_data_ptr(), A.get_data_ptr() );
+
+    CHECK_FOR_CUDA_ERROR();    
+
+    /*
+    static int counter = 0;
+    char filename[256];
+    sprintf((char*)filename, "_A_%d.cplx", counter);
+    write_nd_array<float_complext>( A.to_host().get(), filename );
+    counter++;
+    */
+
+    // Compute A^H A
+    //
+
+    cublasStatus_t stat;
+    cublasHandle_t handle = *CUBLASContextProvider::instance()->getCublasHandle();
+
+    std::vector<size_t> AHA_dims(2,n);
+    cuNDArray<float_complext> AHA(&AHA_dims);
+
+    // Initialize AHA to identity (Tikhonov regularization)
+    //
+
+    float_complext one(1.0f);
+    clear(&AHA);
+    for( unsigned int i=0; i<n; i++ ){
+      cudaMemcpy( AHA.get_data_ptr()+i*n+i, &one, sizeof(float_complext), cudaMemcpyHostToDevice );
+    }
+    CHECK_FOR_CUDA_ERROR();
+
+    float_complext alpha(1.0f);
+    //float_complext beta(0.1f*in_max); // Tikhonov regularization weight
+    float_complext beta(0.0f); // Tikhonov regularization weight
+    
+    stat = cublasCgemm( handle, CUBLAS_OP_C, CUBLAS_OP_N,
+                        n,n,m,
+                        (cuFloatComplex*) &alpha,
+                        (cuFloatComplex*) A.get_data_ptr(), m,
+                        (cuFloatComplex*) A.get_data_ptr(), m,
+                        (cuFloatComplex*) &beta, 
+                        (cuFloatComplex*) AHA.get_data_ptr(), n );
+    
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+      std::cerr << "CUBLAS error code " << stat << std::endl;
+      throw std::runtime_error("estimate_spirit_kernels: CUBLAS error computing A^HA");
+    }
+
+    /*
+    static int counter = 0;
+    char filename[256];
+    sprintf((char*)filename, "_AHA_%d.cplx", counter);
+    write_nd_array<float_complext>( AHA.to_host().get(), filename );
+    counter++;
+    */
+
+    // Multiply A^H with each coil image (to form the rhs)
+    //
+
+    std::vector<size_t> rhs_dims; rhs_dims.push_back(n); rhs_dims.push_back(num_coils);    
+    cuNDArray<float_complext> rhs(&rhs_dims); clear(&rhs);
+
+    beta = float_complext(0.0f);
+
+    stat = cublasCgemm( handle, CUBLAS_OP_C, CUBLAS_OP_N,
+                        n, num_coils, m,
+                        (cuFloatComplex*) &alpha,
+                        (cuFloatComplex*) A.get_data_ptr(), m,
+                        (cuFloatComplex*) kspace.get_data_ptr(), m,
+                        (cuFloatComplex*) &beta, 
+                        (cuFloatComplex*) rhs.get_data_ptr(), n );
+    
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+      std::cerr << "CUBLAS error code " << stat << std::endl;
+      throw std::runtime_error("estimate_spirit_kernels: CUBLAS error computing rhs");
+    }
+    
+    /*
+    static int counter = 0;
+    char filename[256];
+    sprintf((char*)filename, "_rhs_%d.cplx", counter);
+    write_nd_array<float_complext>( rhs.to_host().get(), filename );
+    counter++;
+    */
+
+
+
+    // The (now disabled) CULA path below used CGELS rather than a more conventional solver, as CGELS is part of CULA free:
+    /*
+    culaStatus s = culaDeviceCgels( 'N', n, n, num_coils,
+                                 (culaDeviceFloatComplex*)AHA.get_data_ptr(), n,
+                                 (culaDeviceFloatComplex*)rhs.get_data_ptr(), n);
+    */
+    {
+      // It actually turns out to be faster to do this inversion on the CPU; the problem is probably too small for the GPU to pay off.
+      //GPUTimer cpu_invert_time("CPU Inversion time");
+      boost::shared_ptr< hoNDArray<float_complext> > AHA_h = AHA.to_host();
+      boost::shared_ptr< hoNDArray<float_complext> > AHrhs_h = rhs.to_host();
+      
+      std::vector<size_t> perm_dim;
+      perm_dim.push_back(1);
+      perm_dim.push_back(0);
+      
+      permute(AHA_h.get(),&perm_dim);
+      permute(AHrhs_h.get(),&perm_dim);
+      
+      ht_grappa_solve_spd_system(AHA_h.get(), AHrhs_h.get());	  
+
+      permute(AHrhs_h.get(),&perm_dim);
+      rhs = cuNDArray<float_complext>(*AHrhs_h);
+    }
+
+
+    /*
+    if( s != culaNoError ) {
+      if( s == 8 ){
+        std::cerr << "CULA error code " << s << ": " << culaGetStatusString(s) << std::endl;
+        std::cerr << "Assuming that the buffer is not yet filled and return ones" << std::endl;
+        return kernel_images;
+      }
+      std::cerr << "CULA error code " << s << ": " << culaGetStatusString(s) << std::endl;
+      culaInfo i = culaGetErrorInfo();
+      char buf[2048];
+      culaGetErrorInfoString(s, i, buf, sizeof(buf));
+      printf("Error %d: %s\n", (int)i, buf);      
+      throw std::runtime_error("estimate_spirit_kernels: CULA error computing 'getrs'");
+    }
+    */
+
+    // CULA will sometimes return NaN without an explicit error. This code tests for NaNs and returns early if any are found.
+    float nan_test = nrm2(&rhs);
+    if (nan_test != nan_test) return kernel_images;
+
+    // Fill k-spaces with the computed kernels at the center
+    //
+
+    setup_grid( kernel_images->get_number_of_elements(), &blockDim, &gridDim );
+    
+    write_convolution_masks_kernel<<< gridDim, blockDim >>>
+      ( intd2(kernel_images->get_size(0), kernel_images->get_size(1)), num_coils, kernel_size,
+        rhs.get_data_ptr(), kernel_images->get_data_ptr() );
+    
+    CHECK_FOR_CUDA_ERROR();
+
+    // Batch FFT into image space
+    //
+    A.clear();
+    AHA.clear();
+    rhs.clear();
+
+    std::vector<size_t> dims_to_xform;
+    dims_to_xform.push_back(0); dims_to_xform.push_back(1);    
+    cuNDFFT<float>::instance()->ifft( kernel_images.get(), &dims_to_xform, false );
+    
+    /*
+    static int counter = 0;
+    char filename[256];
+    sprintf((char*)filename, "_kernels_%d.cplx", counter);
+    write_nd_array<float_complext>( kernel_images->to_host().get(), filename );
+    counter++;
+    */
+
+    return kernel_images;
+  }
+}
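For reference, the CPU solve above amounts, per target coil c, to the (optionally Tikhonov-regularized) normal equations of the convolution fit; a sketch of the relation, using the m and n defined in the code (the regularization weight beta is currently set to zero):

    \min_{x_c} \| A x_c - b_c \|_2^2 + \beta \| x_c \|_2^2
    \;\Longleftrightarrow\;
    (A^H A + \beta I)\, x_c = A^H b_c,
    \qquad A \in \mathbb{C}^{m \times n},\;
    m = \text{elements\_per\_coil},\;
    n = \text{num\_coils}\,(\text{kernel\_size}^2 - 1)

where b_c is the k-space of coil c, so that the two cublasCgemm calls above form A^H A and the stacked right-hand sides A^H [b_1 ... b_{num_coils}].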
diff --git a/toolboxes/mri/pmri/gpu/spirit_calibration.h b/toolboxes/mri/pmri/gpu/spirit_calibration.h
new file mode 100644
index 0000000..d2622d9
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/spirit_calibration.h
@@ -0,0 +1,22 @@
+/** \file spirit_calibration.h
+    \brief Utility to calibrate spirit convolution kernels, GPU-based.
+*/
+
+#pragma once
+
+#include "gpupmri_export.h"
+#include "cuNDArray.h"
+
+namespace Gadgetron
+{
+  
+  /**
+     @brief Utility to estimate spirit convolution kernels, GPU-based.
+     @param[in] cartesian_kspace_data Array with fully sampled kspace data (Cartesian). E.g. as a result of accumulation of multiple frames.
+     @param[in] kernel_size Size of the convolution kernel to use for k-space calibration. Must be an odd number.
+     @return A set of convolution kernels Fourier transformed into image space. For 'n' coils, n^2 calibration images are estimated, i.e. 'n' kernels for each coil.
+     Currently only 2D Spirit is supported in this function (higher-dimensional Spirit is supported in the gt-plus toolbox).
+  */
+  EXPORTGPUPMRI boost::shared_ptr< cuNDArray<float_complext> > 
+  estimate_spirit_kernels( cuNDArray<float_complext> *cartesian_kspace_data, unsigned int kernel_size );
+}
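A minimal usage sketch for the declaration above (the wrapper function and the kernel size of 5 are illustrative only; the input must be a 3D [RO E1 CHA] array of fully sampled Cartesian k-space and the kernel size must be odd):

    #include <boost/shared_ptr.hpp>
    #include "spirit_calibration.h"

    boost::shared_ptr< Gadgetron::cuNDArray<Gadgetron::float_complext> >
    calibrate_spirit(Gadgetron::cuNDArray<Gadgetron::float_complext>& acs_kspace)
    {
      // Returns an array of size [RO E1 CHA*CHA]: for every target coil,
      // one image-space kernel per source coil.
      return Gadgetron::estimate_spirit_kernels(&acs_kspace, 5);
    }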
diff --git a/toolboxes/mri/pmri/gpu/trajectory_utils.cu b/toolboxes/mri/pmri/gpu/trajectory_utils.cu
new file mode 100644
index 0000000..8cab154
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/trajectory_utils.cu
@@ -0,0 +1,38 @@
+#include "cuNDArray.h"
+#include <thrust/iterator/zip_iterator.h>
+#include "vector_td.h"
+#include "vector_td_utilities.h"
+#include "trajectory_utils.h"
+using namespace Gadgetron;
+
+struct traj_filter_functor{
+
+	traj_filter_functor(float limit){
+		limit_ = limit;
+	}
+	__device__  thrust::tuple<floatd2,float> operator()(thrust::tuple<floatd2,float> tup){
+
+		floatd2 traj = thrust::get<0>(tup);
+		float dcw = thrust::get<1>(tup);
+		if ( abs(traj[0]) > limit_ || abs(traj[1]) > limit_)
+			dcw = 0;
+
+		return thrust::tuple<floatd2,float>(traj,dcw);
+
+	}
+
+	float limit_;
+};
+
+boost::shared_ptr< cuNDArray<float> > Gadgetron::filter_dcw(cuNDArray<floatd2>* traj, cuNDArray<float>* dcw,float limit){
+	cuNDArray<float>* dcw_new = new cuNDArray<float>(*dcw);
+
+	thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(traj->begin(),dcw_new->begin())),
+			thrust::make_zip_iterator(thrust::make_tuple(traj->end(),dcw_new->end())),
+			thrust::make_zip_iterator(thrust::make_tuple(traj->begin(),dcw_new->begin())),
+			traj_filter_functor(limit));
+
+	return boost::shared_ptr<cuNDArray<float> >(dcw_new);
+
+
+}
diff --git a/toolboxes/mri/pmri/gpu/trajectory_utils.h b/toolboxes/mri/pmri/gpu/trajectory_utils.h
new file mode 100644
index 0000000..473ba2c
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/trajectory_utils.h
@@ -0,0 +1,16 @@
+#pragma once
+#include "cuNDArray.h"
+#include <boost/shared_ptr.hpp>
+#include "vector_td.h"
+
+namespace Gadgetron {
+/**
+ * Creates a new set of density compensation weights (dcw), where the weights of all trajectory points outside the
+ * limit are set to 0. Potentially useful for estimating coil sensitivity maps from the k-space center.
+ * @param traj
+ * @param dcw
+ * @param limit
+ * @return
+ */
+boost::shared_ptr< cuNDArray<float> > filter_dcw(cuNDArray<floatd2>* traj, cuNDArray<float>* dcw,float limit);
+}
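A minimal usage sketch for filter_dcw (the limit of 0.1 and the wrapper name are illustrative only):

    #include <boost/shared_ptr.hpp>
    #include "trajectory_utils.h"

    boost::shared_ptr< Gadgetron::cuNDArray<float> >
    center_only_dcw(Gadgetron::cuNDArray<Gadgetron::floatd2>* traj,
                    Gadgetron::cuNDArray<float>* dcw)
    {
      // Zero the density compensation weights of all points with
      // |kx| > 0.1 or |ky| > 0.1, leaving only the k-space center weighted.
      return Gadgetron::filter_dcw(traj, dcw, 0.1f);
    }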
diff --git a/toolboxes/mri_core/CMakeLists.txt b/toolboxes/mri_core/CMakeLists.txt
new file mode 100644
index 0000000..1a982ad
--- /dev/null
+++ b/toolboxes/mri_core/CMakeLists.txt
@@ -0,0 +1,57 @@
+if (WIN32)
+    ADD_DEFINITIONS(-D__BUILD_GADGETRON_MRI_CORE__)
+endif (WIN32)
+
+include_directories(
+    ${Boost_INCLUDE_DIR} 
+    ${ARMADILLO_INCLUDE_DIRS} 
+    ${ACE_INCLUDE_DIR} 
+    ${ISMRMRD_INCLUDE_DIR}
+    ${FFTW3_INCLUDE_DIR}
+    ${MKL_INCLUDE_DIR}
+    ${CMAKE_SOURCE_DIR}/toolboxes/core
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+    ${CMAKE_SOURCE_DIR}/toolboxes/fft/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/algorithm
+    ${CMAKE_SOURCE_DIR}/toolboxes/operators
+    ${CMAKE_SOURCE_DIR}/toolboxes/operators/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+    ${CMAKE_SOURCE_DIR}/toolboxes/solvers/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math 
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/util
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/workflow
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/algorithm
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/solver
+)
+
+set( mri_core_header_files
+        mri_core_export.h
+        mri_core_def.h
+        mri_core_data.h
+        mri_core_utility.h
+        mri_core_grappa.h 
+        mri_core_coil_map_estimation.h )
+
+set( mri_core_source_files
+        mri_core_utility.cpp 
+        mri_core_grappa.cpp 
+        mri_core_coil_map_estimation.cpp )
+
+add_library(gadgetron_toolbox_mri_core SHARED 
+     ${mri_core_header_files} ${mri_core_source_files} )
+
+set_target_properties(gadgetron_toolbox_mri_core PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+set_target_properties(gadgetron_toolbox_mri_core PROPERTIES LINKER_LANGUAGE CXX)
+
+target_link_libraries(gadgetron_toolbox_mri_core gadgetron_toolbox_cpucore gadgetron_toolbox_cpucore_math ${ARMADILLO_LIBRARIES} gadgetron_toolbox_cpufft )
+
+install(TARGETS gadgetron_toolbox_mri_core DESTINATION lib COMPONENT main)
+
+install(FILES
+  ${mri_core_header_files}
+  DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
diff --git a/toolboxes/mri_core/mri_core_coil_map_estimation.cpp b/toolboxes/mri_core/mri_core_coil_map_estimation.cpp
new file mode 100644
index 0000000..926e3f7
--- /dev/null
+++ b/toolboxes/mri_core/mri_core_coil_map_estimation.cpp
@@ -0,0 +1,610 @@
+
+/** \file   mri_core_coil_map_estimation.cpp
+    \brief  Implementation of MRI coil sensitivity map estimation.
+    \author Hui Xue
+*/
+
+#include "mri_core_coil_map_estimation.h"
+#include "hoMatrix.h"
+#include "hoNDArray_linalg.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDArray_reductions.h"
+
+namespace Gadgetron
+{
+
+template<typename T> 
+void coil_map_2d_Inati(const hoNDArray<T>& data, hoNDArray<T>& coilMap, size_t ks, size_t power)
+{
+    try
+    {
+        typedef typename realType<T>::Type value_type;
+
+        long long RO = data.get_size(0);
+        long long E1 = data.get_size(1);
+        long long CHA = data.get_size(2);
+
+        long long N = data.get_number_of_elements() / (RO*E1*CHA);
+        GADGET_CHECK_THROW(N == 1);
+
+        const T* pData = data.begin();
+
+        if (!data.dimensions_equal(&coilMap))
+        {
+            coilMap = data;
+        }
+        T* pSen = coilMap.begin();
+
+        if (ks % 2 != 1)
+        {
+            ks++;
+        }
+
+        size_t kss = ks*ks;
+        long long halfKs = (long long)ks / 2;
+
+        int e1;
+
+        #pragma omp parallel private(e1) shared(ks, RO, E1, CHA, pSen, pData, halfKs, power, kss)
+        {
+            hoNDArray<T> D(ks*ks, CHA);
+            T* pD = D.begin();
+
+            hoNDArray<T> DC(ks*ks, CHA);
+            T* pDC = DC.begin();
+
+            hoNDArray<T> DH_D(CHA, CHA);
+            Gadgetron::clear(DH_D);
+
+            hoNDArray<T> U1(ks*ks, 1);
+            T* pU1 = U1.begin();
+
+            hoNDArray<T> V1(CHA, 1);
+            T* pV1 = V1.begin();
+
+            hoNDArray<T> V(CHA, 1);
+
+            Gadgetron::clear(D);
+            Gadgetron::clear(DC);
+            Gadgetron::clear(DH_D);
+            Gadgetron::clear(U1);
+            Gadgetron::clear(V1);
+            Gadgetron::clear(V);
+
+            T phaseU1;
+
+            value_type v1Norm(1), u1Norm(1);
+
+            long long cha, ro, kro, ke1, de1, dro;
+            size_t po;
+
+            #pragma omp for
+            for (e1 = 0; e1<(int)E1; e1++)
+            {
+                for (ro = 0; ro<(long long)RO; ro++)
+                {
+                    // fill the data matrix D
+                    if (e1 >= halfKs && e1<E1 - halfKs && ro >= halfKs && ro<RO - halfKs)
+                    {
+                        for (cha = 0; cha<CHA; cha++)
+                        {
+                            const T* pDataCurr = pData + cha*RO*E1;
+                            int ind = 0;
+                            for (ke1 = -halfKs; ke1 <= halfKs; ke1++)
+                            {
+                                de1 = e1 + ke1;
+                                for (kro = -halfKs; kro <= halfKs; kro++)
+                                {
+                                    pD[ind + cha*kss] = pDataCurr[de1*RO + ro + kro];
+                                    ind++;
+                                }
+                            }
+                        }
+                    }
+                    else
+                    {
+                        for (cha = 0; cha<CHA; cha++)
+                        {
+                            const T* pDataCurr = pData + cha*RO*E1;
+                            int ind = 0;
+                            for (ke1 = -halfKs; ke1 <= halfKs; ke1++)
+                            {
+                                de1 = e1 + ke1;
+                                if (de1 < 0) de1 += E1;
+                                if (de1 >= E1) de1 -= E1;
+
+                                for (kro = -halfKs; kro <= halfKs; kro++)
+                                {
+                                    dro = ro + kro;
+                                    if (dro < 0) dro += RO;
+                                    if (dro >= RO) dro -= RO;
+
+                                    pD[ind + cha*kss] = pDataCurr[de1*RO + dro];
+                                    ind++;
+                                }
+                            }
+                        }
+                    }
+
+                    T* pTmp;
+                    for (cha = 0; cha<CHA; cha++)
+                    {
+                        pTmp = pD + cha*kss;
+                        pV1[cha] = pTmp[0];
+                        for (po = 1; po<kss; po++)
+                        {
+                            pV1[cha] += pTmp[po];
+                        }
+                    }
+
+                    value_type sum(0);
+                    for (cha = 0; cha<CHA; cha++)
+                    {
+                        const T& c = pV1[cha];
+                        const value_type re = c.real();
+                        const value_type im = c.imag();
+                        sum += ((re*re) + (im * im));
+                    }
+                    v1Norm = std::sqrt(sum);
+
+                    value_type v1NormInv = (value_type)1.0 / v1Norm;
+                    for (cha = 0; cha<CHA; cha++)
+                    {
+                        pV1[cha] *= v1NormInv;
+                    }
+
+                    memcpy(pDC, pD, sizeof(T)*ks*ks*CHA);
+                    gemm(DH_D, DC, true, D, false);
+
+                    for (po = 0; po<power; po++)
+                    {
+                        gemm(V, DH_D, false, V1, false);
+                        memcpy(V1.begin(), V.begin(), V.get_number_of_bytes());
+
+                        sum = 0;
+                        for (cha = 0; cha<CHA; cha++)
+                        {
+                            const T& c = pV1[cha];
+                            const value_type re = c.real();
+                            const value_type im = c.imag();
+                            sum += ((re*re) + (im * im));
+                        }
+                        v1Norm = std::sqrt(sum);
+
+                        value_type v1NormInv = (value_type)1.0 / v1Norm;
+                        for (cha = 0; cha<CHA; cha++)
+                        {
+                            pV1[cha] *= v1NormInv;
+                        }
+                    }
+
+                    gemm(U1, D, false, V1, false);
+
+                    phaseU1 = pU1[0];
+                    for (po = 1; po<kss; po++)
+                    {
+                        phaseU1 += pU1[po];
+                    }
+                    phaseU1 /= std::abs(phaseU1);
+
+                    const value_type c = phaseU1.real();
+                    const value_type d = phaseU1.imag();
+
+                    for (cha = 0; cha<CHA; cha++)
+                    {
+                        const T& v = pV1[cha];
+                        const value_type a = v.real();
+                        const value_type b = v.imag();
+
+                        reinterpret_cast< value_type(&)[2] >(pV1[cha])[0] = a*c + b*d;
+                        reinterpret_cast< value_type(&)[2] >(pV1[cha])[1] = a*d - b*c;
+                    }
+
+                    for (cha = 0; cha<CHA; cha++)
+                    {
+                        pSen[cha*RO*E1 + e1*RO + ro] = V1(cha, 0);
+                    }
+                }
+            }
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in coil_map_2d_Inati(...) ... ");
+        throw;
+    }
+}
+
+template EXPORTMRICORE void coil_map_2d_Inati(const hoNDArray< std::complex<float> >& data, hoNDArray< std::complex<float> >& coilMap, size_t ks, size_t power);
+template EXPORTMRICORE void coil_map_2d_Inati(const hoNDArray< std::complex<double> >& data, hoNDArray< std::complex<double> >& coilMap, size_t ks, size_t power);
+
+// ------------------------------------------------------------------------
+
+template<typename T> 
+void coil_map_3d_Inati(const hoNDArray<T>& data, hoNDArray<T>& coilMap, size_t ks, size_t power)
+{
+    try
+    {
+        typedef typename realType<T>::Type value_type;
+
+        long long RO = data.get_size(0);
+        long long E1 = data.get_size(1);
+        long long E2 = data.get_size(2);
+        long long CHA = data.get_size(3);
+
+        long long N = data.get_number_of_elements() / (RO*E1*E2*CHA);
+        GADGET_CHECK_THROW(N == 1);
+
+        const T* pData = data.begin();
+
+        if (!data.dimensions_equal(&coilMap))
+        {
+            coilMap = data;
+        }
+        T* pSen = coilMap.begin();
+
+        if (ks % 2 != 1)
+        {
+            ks++;
+        }
+
+        size_t kss = ks*ks*ks;
+        long long halfKs = (long long)ks / 2;
+
+        long long e2;
+
+        #pragma omp parallel default(none) private(e2) shared(ks, RO, E1, E2, CHA, pSen, pData, halfKs, power, kss)
+        {
+            hoMatrix<T> D(kss, CHA);
+            hoMatrix<T> DC(kss, CHA);
+            hoMatrix<T> DH_D(CHA, CHA);
+
+            hoMatrix<T> U1(kss, 1);
+            hoMatrix<T> V1(CHA, 1);
+            hoMatrix<T> V(CHA, 1);
+
+            Gadgetron::clear(D);
+            Gadgetron::clear(DC);
+            Gadgetron::clear(DH_D);
+            Gadgetron::clear(U1);
+            Gadgetron::clear(V1);
+            Gadgetron::clear(V);
+
+            T phaseU1;
+
+            value_type v1Norm(1);
+
+            long long cha, ro, e1, kro, dro, ke1, de1, ke2, de2;
+            size_t po;
+
+            #pragma omp for
+            for (e2 = 0; e2<(long long)E2; e2++)
+            {
+                for (e1 = 0; e1<(long long)E1; e1++)
+                {
+                    for (ro = 0; ro<(long long)RO; ro++)
+                    {
+                        // fill the data matrix D
+                        if (e2 >= halfKs && e2<E2 - halfKs && e1 >= halfKs && e1<E1 - halfKs && ro >= halfKs && ro<RO - halfKs)
+                        {
+                            for (cha = 0; cha<CHA; cha++)
+                            {
+                                const T* pDataCurr = pData + cha*RO*E1*E2;
+                                long long ind = 0;
+                                for (ke2 = -halfKs; ke2 <= halfKs; ke2++)
+                                {
+                                    de2 = e2 + ke2;
+                                    for (ke1 = -halfKs; ke1 <= halfKs; ke1++)
+                                    {
+                                        de1 = e1 + ke1;
+                                        for (kro = -halfKs; kro <= halfKs; kro++)
+                                        {
+                                            D(ind++, cha) = pDataCurr[de2*RO*E1 + de1*RO + ro + kro];
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                        else
+                        {
+                            for (cha = 0; cha<CHA; cha++)
+                            {
+                                const T* pDataCurr = pData + cha*RO*E1*E2;
+                                long long ind = 0;
+                                for (ke2 = -halfKs; ke2 <= halfKs; ke2++)
+                                {
+                                    de2 = e2 + ke2;
+                                    if (de2 < 0) de2 += E2;
+                                    if (de2 >= E2) de2 -= E2;
+
+                                    for (ke1 = -halfKs; ke1 <= halfKs; ke1++)
+                                    {
+                                        de1 = e1 + ke1;
+                                        if (de1 < 0) de1 += E1;
+                                        if (de1 >= E1) de1 -= E1;
+
+                                        for (kro = -halfKs; kro <= halfKs; kro++)
+                                        {
+                                            dro = ro + kro;
+                                            if (dro < 0) dro += RO;
+                                            if (dro >= RO) dro -= RO;
+
+                                            D(ind++, cha) = pDataCurr[de2*RO*E1 + de1*RO + dro];
+                                        }
+                                    }
+                                }
+                            }
+                        }
+
+                        // compute V1
+                        D.sumOverCol(V1);
+                        norm2(V1, v1Norm);
+                        scal((value_type)1.0 / v1Norm, V1);
+
+                        memcpy(DC.begin(), D.begin(), sizeof(T)*kss*CHA);
+                        gemm(DH_D, DC, true, D, false);
+                        // gemm(DH_D, D, true, D, false);
+
+                        for (po = 0; po<power; po++)
+                        {
+                            gemm(V, DH_D, false, V1, false);
+                            V1 = V;
+                            norm2(V1, v1Norm);
+                            scal((value_type)1.0 / v1Norm, V1);
+                        }
+
+                        // compute U1
+                        gemm(U1, D, false, V1, false);
+
+                        phaseU1 = U1(0, 0);
+                        for (po = 1; po<kss; po++)
+                        {
+                            phaseU1 += U1(po, 0);
+                        }
+                        phaseU1 /= std::abs(phaseU1);
+
+                        // put the mean object phase to coil map
+                        conjugate(V1, V1);
+                        scal(phaseU1, V1);
+
+                        for (cha = 0; cha<CHA; cha++)
+                        {
+                            pSen[cha*RO*E1*E2 + e2*RO*E1 + e1*RO + ro] = V1(cha, 0);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in coil_map_3d_Inati(...) ... ");
+        throw;
+    }
+}
+
+template EXPORTMRICORE void coil_map_3d_Inati(const hoNDArray< std::complex<float> >& data, hoNDArray< std::complex<float> >& coilMap, size_t ks, size_t power);
+template EXPORTMRICORE void coil_map_3d_Inati(const hoNDArray< std::complex<double> >& data, hoNDArray< std::complex<double> >& coilMap, size_t ks, size_t power);
+
+// ------------------------------------------------------------------------
+
+template<typename T> 
+void coil_map_2d_Inati_Iter(const hoNDArray<T>& data, hoNDArray<T>& coilMap, size_t ks, size_t iterNum, typename realType<T>::Type thres)
+{
+    try
+    {
+        typedef typename realType<T>::Type value_type;
+
+        long long RO = data.get_size(0);
+        long long E1 = data.get_size(1);
+        long long CHA = data.get_size(2);
+
+        long long N = data.get_number_of_elements() / (RO*E1*CHA);
+        GADGET_CHECK_THROW(N == 1);
+
+        const T* pData = data.begin();
+
+        if (!data.dimensions_equal(&coilMap))
+        {
+            coilMap = data;
+        }
+
+        // create convolution kernel
+        hoNDArray<T> ker(ks, ks);
+        Gadgetron::fill(ker, T((value_type)1.0 / (ks*ks)));
+
+        hoNDArray<T> prevR(RO, E1, 1), R(RO, E1, 1), imT(RO, E1, 1), magT(RO, E1, 1), diffR(RO, E1, 1);
+        hoNDArray<T> coilMapConv(RO, E1, CHA);
+        hoNDArray<T> D(RO, E1, CHA);
+        hoNDArray<T> D_sum(1, E1, CHA);
+        hoNDArray<T> D_sum_1st_2nd(1, 1, CHA);
+        typename realType<T>::Type v, vR, vDiffR;
+        T vCha;
+        size_t iter;
+        long long cha;
+
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(data, D_sum, 0));
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(D_sum, D_sum_1st_2nd, 1));
+        Gadgetron::norm2(D_sum_1st_2nd, v);
+        Gadgetron::scal((value_type)1.0 / v, D_sum_1st_2nd);
+
+        Gadgetron::clear(R);
+        for (cha = 0; cha<CHA; cha++)
+        {
+            hoNDArray<T> dataCHA(RO, E1, const_cast<T*>(data.begin()) + cha*RO*E1);
+            vCha = D_sum_1st_2nd(cha);
+            Gadgetron::axpy(std::conj(vCha), dataCHA, R, R);
+        }
+
+        for (iter = 0; iter<iterNum; iter++)
+        {
+            prevR = R;
+
+            Gadgetron::conjugate(R, R);
+
+            GADGET_CATCH_THROW(Gadgetron::multiply(data, R, coilMap));
+
+            Gadgetron::conv2(coilMap, ker, coilMapConv);
+
+            Gadgetron::multiplyConj(coilMapConv, coilMapConv, D);
+
+            GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(D, R, 2));
+
+            Gadgetron::sqrt(R, R);
+
+            Gadgetron::addEpsilon(R);
+            Gadgetron::inv(R, R);
+
+            GADGET_CATCH_THROW(Gadgetron::multiply(coilMapConv, R, coilMap));
+
+            Gadgetron::multiplyConj(data, coilMap, D);
+            GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(D, R, 2));
+
+            GADGET_CATCH_THROW(Gadgetron::multiply(coilMap, R, D));
+
+            GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(D, D_sum, 0));
+            GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(D_sum, D_sum_1st_2nd, 1));
+
+            Gadgetron::norm2(D_sum_1st_2nd, v);
+            Gadgetron::scal((value_type)1.0 / v, D_sum_1st_2nd);
+
+            Gadgetron::clear(imT);
+            for (cha = 0; cha<CHA; cha++)
+            {
+                hoNDArray<T> coilMapCHA(RO, E1, coilMap.begin() + cha*RO*E1);
+                vCha = D_sum_1st_2nd(cha);
+                Gadgetron::axpy(std::conj(vCha), coilMapCHA, imT, imT);
+            }
+
+            Gadgetron::abs(imT, magT);
+            Gadgetron::divide(imT, magT, imT);
+
+            Gadgetron::multiply(R, imT, R);
+            Gadgetron::conjugate(imT, imT);
+            GADGET_CATCH_THROW(Gadgetron::multiply(coilMap, imT, coilMap));
+
+            Gadgetron::subtract(prevR, R, diffR);
+            Gadgetron::norm2(diffR, vDiffR);
+            Gadgetron::norm2(R, vR);
+
+            if (vDiffR / vR < thres) break;
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in coil_map_2d_Inati_Iter(...) ... ");
+        throw;
+    }
+}
+
+template EXPORTMRICORE void coil_map_2d_Inati_Iter(const hoNDArray< std::complex<float> >& data, hoNDArray< std::complex<float> >& coilMap, size_t ks, size_t iterNum, float thres);
+template EXPORTMRICORE void coil_map_2d_Inati_Iter(const hoNDArray< std::complex<double> >& data, hoNDArray< std::complex<double> >& coilMap, size_t ks, size_t iterNum, double thres);
+
+// ------------------------------------------------------------------------
+
+template<typename T> 
+void coil_map_3d_Inati_Iter(const hoNDArray<T>& data, hoNDArray<T>& coilMap, size_t ks, size_t kz, size_t iterNum, typename realType<T>::Type thres)
+{
+    try
+    {
+        typedef typename realType<T>::Type value_type;
+
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+        size_t E2 = data.get_size(2);
+        size_t CHA = data.get_size(3);
+
+        size_t N = data.get_number_of_elements() / (RO*E1*E2*CHA);
+        GADGET_CHECK_THROW(N == 1);
+
+        const T* pData = data.begin();
+
+        if (!data.dimensions_equal(&coilMap))
+        {
+            coilMap = data;
+        }
+
+        // create convolution kernel
+        hoNDArray<T> ker(ks, ks, kz);
+        Gadgetron::fill(&ker, T((value_type)1.0 / (ks*ks*kz)));
+
+        hoNDArray<T> R(RO, E1, E2, 1), imT(RO, E1, E2, 1), magT(RO, E1, E2, 1);
+        hoNDArray<T> coilMapConv(RO, E1, E2, CHA);
+        hoNDArray<T> D(RO, E1, E2, CHA);
+        hoNDArray<T> D_sum(1, CHA);
+        typename realType<T>::Type v;
+        T vCha;
+        size_t iter, cha;
+
+        hoNDArray<T> dataByCha(RO*E1*E2, CHA, const_cast<T*>(data.begin()));
+        GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(data, D_sum, 0));
+        Gadgetron::norm2(D_sum, v);
+        Gadgetron::scal((value_type)1.0 / v, D_sum);
+
+        Gadgetron::clear(R);
+        for (cha = 0; cha<CHA; cha++)
+        {
+            hoNDArray<T> dataCHA(RO, E1, E2, const_cast<T*>(data.begin()) + cha*RO*E1*E2);
+            vCha = D_sum(cha);
+            Gadgetron::axpy(std::conj(vCha), dataCHA, R, R);
+        }
+
+        for (iter = 0; iter<iterNum; iter++)
+        {
+            Gadgetron::conjugate(R, R);
+
+            Gadgetron::multiply(data, R, coilMap);
+
+            Gadgetron::conv2(coilMap, ker, coilMapConv);
+
+            Gadgetron::multiplyConj(coilMapConv, coilMapConv, D);
+
+            GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(D, R, 3));
+
+            Gadgetron::sqrt(R, R);
+
+            Gadgetron::addEpsilon(R);
+            Gadgetron::inv(R, R);
+
+            Gadgetron::multiply(coilMapConv, R, coilMap);
+
+            Gadgetron::multiplyConj(data, coilMap, D);
+            GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(D, R, 3));
+
+            Gadgetron::multiply(coilMap, R, D);
+
+            hoNDArray<T> DByCha(RO*E1*E2, CHA, D.begin());
+            GADGET_CATCH_THROW(Gadgetron::sum_over_dimension(DByCha, D_sum, 0));
+
+            Gadgetron::norm2(D_sum, v);
+            Gadgetron::scal((value_type)1.0 / v, D_sum);
+
+            Gadgetron::clear(imT);
+            for (cha = 0; cha<CHA; cha++)
+            {
+                hoNDArray<T> coilMapCHA(RO, E1, E2, 1, coilMap.begin() + cha*RO*E1*E2);
+                vCha = D_sum(cha);
+                Gadgetron::axpy(std::conj(vCha), coilMapCHA, imT, imT);
+            }
+
+            Gadgetron::abs(imT, magT);
+            Gadgetron::divide(imT, magT, imT);
+
+            Gadgetron::multiply(R, imT, R);
+            Gadgetron::conjugate(imT, imT);
+            Gadgetron::multiply(coilMap, imT, coilMap);
+        }
+    }
+    catch (...)
+    {
+        GERROR_STREAM("Errors in coil_map_3d_Inati_Iter(...) ... ");
+        throw;
+    }
+}
+
+template EXPORTMRICORE void coil_map_3d_Inati_Iter(const hoNDArray< std::complex<float> >& data, hoNDArray< std::complex<float> >& coilMap, size_t ks, size_t kz, size_t iterNum, float thres);
+template EXPORTMRICORE void coil_map_3d_Inati_Iter(const hoNDArray< std::complex<double> >& data, hoNDArray< std::complex<double> >& coilMap, size_t ks, size_t kz, size_t iterNum, double thres);
+
+// ------------------------------------------------------------------------
+
+}
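In coil_map_2d_Inati and coil_map_3d_Inati above, the coil map at each voxel is the dominant right singular vector of the local data matrix D (patch samples by CHA channels), obtained by power iteration on D^H D; a sketch of the per-voxel computation that the loops above implement (K = power):

    v^{(0)} = \frac{D^T \mathbf{1}}{\| D^T \mathbf{1} \|_2}, \qquad
    v^{(k+1)} = \frac{(D^H D)\, v^{(k)}}{\| (D^H D)\, v^{(k)} \|_2}, \qquad
    u = D\, v^{(K)}, \qquad
    \text{coil map at the voxel} = \overline{v^{(K)}} \cdot \frac{\sum_j u_j}{\bigl| \sum_j u_j \bigr|}

The conjugation and the phase factor derived from the sum of u pin down the otherwise arbitrary global phase, which is the point of the cited method.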
diff --git a/toolboxes/mri_core/mri_core_coil_map_estimation.h b/toolboxes/mri_core/mri_core_coil_map_estimation.h
new file mode 100644
index 0000000..b621ea0
--- /dev/null
+++ b/toolboxes/mri_core/mri_core_coil_map_estimation.h
@@ -0,0 +1,50 @@
+
+/** \file   mri_core_coil_map_estimation.h
+    \brief  Implementation of MRI coil sensitivity map estimation functions.
+
+    ISMRMRD_SOUHEIL coil map estimation is based on:
+
+        Inati SJ, Hansen MS, Kellman P.
+        A solution to the phase problem in adaptive coil combination.
+        In: ISMRM proceeding; April; Salt Lake City, Utah, USA; 2013. 2672.
+
+        Kellman P, McVeigh ER.
+        Image reconstruction in SNR units: A general method for SNR measurement.
+        Magnetic Resonance in Medicine 2005;54(6):1439-1447.
+
+    ISMRMRD_SOUHEIL_ITER coil map estimation is based on:
+
+        Inati SJ, Hansen MS, Kellman P. 
+        A Fast Optimal Method for Coil Sensitivity Estimation and Adaptive Coil Combination for Complex Images.
+        In: ISMRM proceeding; May; Milan, Italy; 2014. 4407.
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "mri_core_export.h"
+#include "hoNDArray.h"
+
+namespace Gadgetron
+{
+    // the Souheil method
+    // data: [RO E1 CHA], only 3D array
+    // this function uses a 2D data correlation matrix
+    // ks: the kernel size for local covariance estimation
+    // power: number of iterations to apply power method
+    template<typename T> EXPORTMRICORE void coil_map_2d_Inati(const hoNDArray<T>& data, hoNDArray<T>& coilMap, size_t ks, size_t power);
+
+    // data: [RO E1 E2 CHA], this function uses a true 3D data correlation matrix
+    template<typename T> EXPORTMRICORE void coil_map_3d_Inati(const hoNDArray<T>& data, hoNDArray<T>& coilMap, size_t ks, size_t power);
+
+    // the Souheil iteration method
+    // data: [RO E1 CHA], only 3D array
+    // ks: the kernel size for local covariance estimation
+    // iterNum: number of iterations to refine coil map
+    // thres: threshold to stop the iterations
+    template<typename T> EXPORTMRICORE void coil_map_2d_Inati_Iter(const hoNDArray<T>& data, hoNDArray<T>& coilMap, size_t ks, size_t iterNum, typename realType<T>::Type thres);
+
+    // data: [RO E1 E2 CHA], true 3D coil map estimation
+    template<typename T> EXPORTMRICORE void coil_map_3d_Inati_Iter(const hoNDArray<T>& data, hoNDArray<T>& coilMap, size_t ks, size_t kz, size_t iterNum, typename realType<T>::Type thres);
+}
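A minimal usage sketch for the 2D declarations above (the kernel size 7, power 3, iteration count 5 and threshold 1e-3 are illustrative only):

    #include <complex>
    #include "mri_core_coil_map_estimation.h"

    void estimate_coil_maps(const Gadgetron::hoNDArray< std::complex<float> >& im,  // [RO E1 CHA] complex images
                            Gadgetron::hoNDArray< std::complex<float> >& coil_map)
    {
      // Non-iterative variant: kernel size 7, 3 power iterations
      Gadgetron::coil_map_2d_Inati(im, coil_map, 7, 3);

      // Iterative variant: kernel size 7, at most 5 iterations, stop threshold 1e-3
      Gadgetron::coil_map_2d_Inati_Iter(im, coil_map, 7, 5, 1e-3f);
    }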
diff --git a/toolboxes/mri_core/mri_core_data.h b/toolboxes/mri_core/mri_core_data.h
new file mode 100644
index 0000000..81620fb
--- /dev/null
+++ b/toolboxes/mri_core/mri_core_data.h
@@ -0,0 +1,282 @@
+#ifndef MRI_CORE_DATA_H
+#define MRI_CORE_DATA_H
+
+#include "GadgetContainerMessage.h"
+#include "ismrmrd/ismrmrd.h"
+#include "ismrmrd/meta.h"
+#include <vector>
+#include <set>
+#include "hoNDArray.h"
+
+namespace Gadgetron 
+{
+
+    /** 
+      This is a list of labels of the coordinates described in the ISMRMRD acquisition header.
+
+      It is useful for accumulators and triggers and for labeling the storage used in
+      the @IsmrmrdAcquisitionBucket and @IsmrmrdDataBuffered structures. 
+
+   */
+    enum IsmrmrdCONDITION {
+	KSPACE_ENCODE_STEP_1,
+	KSPACE_ENCODE_STEP_2,
+	AVERAGE,
+	SLICE,
+	CONTRAST,
+	PHASE,
+	REPETITION,
+	SET,
+	SEGMENT,
+	USER_0,
+	USER_1,
+	USER_2,
+	USER_3,
+	USER_4,
+	USER_5,
+	USER_6,
+	USER_7,
+	NONE
+      };
+    
+  /** 
+      This class functions as a storage unit for statistics related to
+      the @IsmrmrdAcquisitionData objects.
+
+   */
+  class IsmrmrdAcquisitionBucketStats
+  {
+    public:
+      // Set of labels found in the data or ref part of a bucket
+      //11D, fixed order [RO, E1, E2, CHA, SLC, PHS, CON, REP, SET, SEG, AVE]
+      std::set<uint16_t> kspace_encode_step_1;
+      std::set<uint16_t> kspace_encode_step_2;
+      std::set<uint16_t> slice;
+      std::set<uint16_t> phase;
+      std::set<uint16_t> contrast;
+      std::set<uint16_t> repetition;
+      std::set<uint16_t> set;
+      std::set<uint16_t> segment;
+      std::set<uint16_t> average;
+  };
+
+  /** 
+      This class functions as a storage unit for GadgetContainerMessage pointers
+      that point to acquisition headers, data and trajectories.
+
+      It is the storage used in the @IsmrmrdAcquisitionBucket structure. 
+
+   */
+  class IsmrmrdAcquisitionData
+  {
+  public:
+    /**
+       Default Constructor
+    */
+    IsmrmrdAcquisitionData()
+      : head_(0)
+      , data_(0)
+      , traj_(0)
+      {
+
+      }
+    
+    /**
+       Constructor
+    */
+    IsmrmrdAcquisitionData(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* head,
+                           GadgetContainerMessage< hoNDArray< std::complex<float> > >* data,
+                           GadgetContainerMessage< hoNDArray< float > >* traj = 0)
+    {
+      if (head) {
+	head_ = head->duplicate();
+      } else {
+	head_ = 0;
+      }
+
+      if (data) {
+	data_ = data->duplicate();
+      } else {
+	data_ = 0;
+      }
+
+      if (traj) {
+	traj_ = traj->duplicate();
+      } else {
+	traj_ = 0;
+      }
+    }
+
+    /** 
+	Assignment operator
+     */
+    IsmrmrdAcquisitionData& operator=(const IsmrmrdAcquisitionData& d)
+      {
+	if (this != &d) { 
+	  if (d.head_) {
+	    if (head_) head_->release();
+	    head_ = d.head_->duplicate();
+	  } else {
+	    head_ = 0;
+	  }
+	  
+	  if (d.data_) {
+	    if (data_) data_->release();
+	    data_ = d.data_->duplicate();
+	  } else {
+	    data_ = 0;
+	  }
+	  
+	  if (d.traj_) {
+	    if (traj_) traj_->release();
+	    traj_ = d.traj_->duplicate();
+	  } else {
+	    traj_ = 0;
+	  }
+	}
+	return *this;
+      }
+
+    /**
+       Copy constructor
+     */
+    IsmrmrdAcquisitionData(const IsmrmrdAcquisitionData& d)
+      : head_(0)
+      , data_(0)
+      , traj_(0)
+      {
+	*this = d;
+      }
+
+
+    /**
+       Destructor. The memory in the GadgetContainerMessages will be deleted
+       when the object is destroyed. 
+     */
+    ~IsmrmrdAcquisitionData() {
+      if (head_) {
+	head_->release();
+	head_ = 0;
+      }
+
+      if (data_) {
+	data_->release();
+	data_ = 0;
+      }
+
+      if (traj_) {
+	traj_->release();
+	traj_ = 0;
+      }
+    }
+
+
+    GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* head_;
+    GadgetContainerMessage< hoNDArray< std::complex<float> > >* data_;
+    GadgetContainerMessage< hoNDArray< float > > * traj_;
+  };
+
+
+  /**
+
+     This class serves as the storage unit for buffered data. 
+     The @IsmrmrdAcquisitionData structure contains pointers 
+     to the GadgetContainerMessages with the data. 
+
+     Data stored in these buckets will automatically get deleted when the object is
+     destroyed. 
+
+   */ 
+  class IsmrmrdAcquisitionBucket
+  {
+  public:
+    std::vector< IsmrmrdAcquisitionData > data_;
+    std::vector< IsmrmrdAcquisitionData > ref_;
+    std::vector< IsmrmrdAcquisitionBucketStats > datastats_;
+    std::vector< IsmrmrdAcquisitionBucketStats > refstats_;
+  };
+  
+  
+  class SamplingLimit
+  {
+  public:
+    uint16_t min_;
+    uint16_t center_;
+    uint16_t max_;
+  };
+  
+  class SamplingDescription
+  {
+  public:
+    // encoding FOV
+    float encoded_FOV_[3];
+    // recon FOV
+    float recon_FOV_[3];
+    
+    uint16_t encoded_matrix_[3];
+    uint16_t recon_matrix_[3];
+    
+    // sampled range along RO, E1, E2 (for asymmetric echo and partial fourier)
+    // min, max and center
+    SamplingLimit sampling_limits_[3];
+  };
+  
+  class IsmrmrdDataBuffered
+  {
+  public:
+    //7D, fixed order [E0, E1, E2, CHA, N, S, LOC]
+    hoNDArray< std::complex<float> > data_;
+    
+    //7D, fixed order [TRAJ, E0, E1, E2, N, S, LOC]
+    //This element is optional (length is 0 if not present)
+    hoNDArray< float > trajectory_;
+    
+    //5D, fixed order [E1, E2, N, S, LOC]
+    hoNDArray< ISMRMRD::AcquisitionHeader > headers_;
+    
+    SamplingDescription sampling_;
+
+    // function to check if it's empty
+  };
+  
+
+  /**
+     This class is used to group a sub-unit of the data that would feed into a reconstruction. 
+   */
+  class IsmrmrdReconBit
+  {
+  public:
+    IsmrmrdDataBuffered data_;
+    IsmrmrdDataBuffered ref_;
+  };
+
+  /**
+     This class is used to store a unit of data that would feed into a reconstruction. 
+   */
+  class IsmrmrdReconData
+  {
+  public:
+    std::vector<IsmrmrdReconBit> rbit_;
+  };
+
+  
+  /**
+     This class is used to store an array of reconstructed data. 
+   */
+  class IsmrmrdImageArray
+  {
+  public:
+    //7D, fixed order [X, Y, Z, CHA, N, S, LOC]
+    hoNDArray< std::complex<float> > data_;
+    
+    //3D, fixed order [N, S, LOC]
+    hoNDArray< ISMRMRD::ImageHeader > headers_;
+    
+    //3D, fixed order [N, S, LOC]
+    //This element is optional (length is 0 if not present)
+    std::vector< ISMRMRD::MetaContainer > meta_;
+    
+  };
+
+}
+#endif //MRI_CORE_DATA_H
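A minimal sketch of how a downstream consumer might traverse the structures declared above (variable names are illustrative; the dimension order follows the comments in IsmrmrdDataBuffered):

    #include "mri_core_data.h"

    void inspect_recon_data(const Gadgetron::IsmrmrdReconData& recon_data)
    {
      for (size_t b = 0; b < recon_data.rbit_.size(); b++)
      {
        const Gadgetron::IsmrmrdDataBuffered& buf = recon_data.rbit_[b].data_;

        // data_ is 7D, fixed order [E0, E1, E2, CHA, N, S, LOC]
        size_t E0  = buf.data_.get_size(0);
        size_t CHA = buf.data_.get_size(3);

        // trajectory_ is optional: length 0 when not present
        bool has_trajectory = (buf.trajectory_.get_number_of_elements() > 0);

        (void)E0; (void)CHA; (void)has_trajectory;
      }
    }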
diff --git a/toolboxes/mri_core/mri_core_def.h b/toolboxes/mri_core/mri_core_def.h
new file mode 100644
index 0000000..91b7795
--- /dev/null
+++ b/toolboxes/mri_core/mri_core_def.h
@@ -0,0 +1,79 @@
+/** \file   mri_core_def.h
+    \brief  Define the symbols for mri_core toolbox
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "ismrmrd/ismrmrd.h"
+
+namespace Gadgetron
+{
+    /// definition of image meta attributes
+    /// users can set these attributes to record properties of the generated imaging results
+    /// how these attributes are interpreted is up to the client side
+    #define GADGETRON_IMAGENUMBER                          "GADGETRON_ImageNumber"
+    #define GADGETRON_IMAGECOMMENT                         "GADGETRON_ImageComment"
+    #define GADGETRON_IMAGEPROCESSINGHISTORY               "GADGETRON_ImageProcessingHistory"
+    #define GADGETRON_IMAGE_CATEGORY                       "GADGETRON_ImageCategory"
+    #define GADGETRON_SEQUENCEDESCRIPTION                  "GADGETRON_SeqDescription"
+    #define GADGETRON_IMAGE_WINDOWCENTER                   "GADGETRON_WindowCenter"
+    #define GADGETRON_IMAGE_WINDOWWIDTH                    "GADGETRON_WindowWidth"
+    #define GADGETRON_IMAGE_SCALE_RATIO                    "GADGETRON_ScaleRatio"
+    #define GADGETRON_IMAGE_SCALE_OFFSET                   "GADGETRON_ScaleOffset"
+    #define GADGETRON_IMAGE_COLORMAP                       "GADGETRON_ColorMap"
+    #define GADGETRON_IMAGE_ECHOTIME                       "GADGETRON_TE"
+    #define GADGETRON_IMAGE_INVERSIONTIME                  "GADGETRON_TI"
+
+    /// role of image data
+    #define GADGETRON_DATA_ROLE                            "GADGETRON_DataRole"
+    #define GADGETRON_IMAGE_REGULAR                        "GADGETRON_Image"
+    #define GADGETRON_IMAGE_RETRO                          "GADGETRON_ImageRetro"
+    #define GADGETRON_IMAGE_MOCORECON                      "GADGETRON_ImageMoCo"
+    #define GADGETRON_IMAGE_GFACTOR                        "GADGETRON_Gfactor"
+    #define GADGETRON_IMAGE_SNR_MAP                        "GADGETRON_SNR_MAP"
+    #define GADGETRON_IMAGE_STD_MAP                        "GADGETRON_STD_MAP"
+    #define GADGETRON_IMAGE_WRAPAROUNDMAP                  "GADGETRON_WrapAround_MAP"
+    #define GADGETRON_IMAGE_PHASE                          "GADGETRON_Phase"
+    #define GADGETRON_IMAGE_INTENSITY_UNCHANGED            "GADGETRON_Image_Intensity_Unchanged"
+    #define GADGETRON_IMAGE_AIF                            "GADGETRON_AIF"
+    #define GADGETRON_IMAGE_AIF_LV_MASK                    "GADGETRON_AIFLVMASK"
+    #define GADGETRON_IMAGE_PERF_FLOW_MAP                  "GADGETRON_Perf_Flow_Map"
+
+    // other images than the regular reconstruction results
+    #define GADGETRON_IMAGE_OTHER                          "GADGETRON_Image_Other"
+    // other data roles
+    #define GADGETRON_IMAGE_T2W                            "T2W"
+    #define GADGETRON_IMAGE_PD                             "PD"
+    #define GADGETRON_IMAGE_MAGIR                          "MAGIR"
+    #define GADGETRON_IMAGE_PSIR                           "PSIR"
+
+    #define GADGETRON_IMAGE_T1MAP                          "T1"
+    #define GADGETRON_IMAGE_T1SDMAP                        "T1SD"
+    #define GADGETRON_IMAGE_T2MAP                          "T2"
+    #define GADGETRON_IMAGE_T2SDMAP                        "T2SD"
+    #define GADGETRON_IMAGE_T2STARMAP                      "T2STAR"
+    #define GADGETRON_IMAGE_T2STARMASKMAP                  "T2SMASKMAP"
+    #define GADGETRON_IMAGE_T2STARSDMAP                    "T2STARSD"
+    #define GADGETRON_IMAGE_T2STARAMAP                     "T2STARAMAP"
+    #define GADGETRON_IMAGE_T2STARTRUNCMAP                 "T2STARTRUNCMAP"
+
+    #define GADGETRON_IMAGE_FAT                            "FAT"
+    #define GADGETRON_IMAGE_WATER                          "WATER"
+    #define GADGETRON_IMAGE_FREQMAP                        "FREQMAP"
+    #define GADGETRON_IMAGE_B1MAP                          "B1MAP"
+    #define GADGETRON_IMAGE_FLIPANGLEMAP                   "FLIPANGLEMAP"
+    #define GADGETRON_IMAGE_FLOWMAP                        "FLOWMAP"
+
+    //MSH: Interventional MRI (Interactive Real Time, IRT)
+    #define GADGETRON_IMAGE_IRT_IMAGE                      "IRT_IMAGE"
+    #define GADGETRON_IMAGE_IRT_DEVICE                     "IRT_DEVICE"
+    #define GADGETRON_IMAGE_NUM_DEVICE_CHA                 "IRT_NUM_DEVICE_CHA"
+    #define GADGETRON_IMAGE_CUR_DEVICE_CHA                 "IRT_CUR_DEVICE_CHA"
+
+    /// data processing tag, used with ImageProcessingHistory
+    #define GADGETRON_IMAGE_SURFACECOILCORRECTION           "NORM"
+    #define GADGETRON_IMAGE_FILTER                          "FIL"
+    #define GADGETRON_IMAGE_MOCO                            "MOCO"
+    #define GADGETRON_IMAGE_AVE                             "AVE"
+}
diff --git a/toolboxes/mri_core/mri_core_export.h b/toolboxes/mri_core/mri_core_export.h
new file mode 100644
index 0000000..6f16bc8
--- /dev/null
+++ b/toolboxes/mri_core/mri_core_export.h
@@ -0,0 +1,18 @@
+#ifndef _MRI_CORE_EXPORT_H_
+#define _MRI_CORE_EXPORT_H_
+
+#if defined (WIN32)
+    #ifdef BUILD_TOOLBOX_STATIC
+        #define EXPORTMRICORE
+    #else
+        #if defined (__BUILD_GADGETRON_MRI_CORE__) || defined (mri_core_EXPORTS)
+            #define EXPORTMRICORE __declspec(dllexport)
+        #else
+            #define EXPORTMRICORE __declspec(dllimport)
+        #endif
+    #endif
+#else
+    #define EXPORTMRICORE
+#endif
+
+#endif /* _MRI_CORE_EXPORT_H_ */
diff --git a/toolboxes/mri_core/mri_core_grappa.cpp b/toolboxes/mri_core/mri_core_grappa.cpp
new file mode 100644
index 0000000..392543c
--- /dev/null
+++ b/toolboxes/mri_core/mri_core_grappa.cpp
@@ -0,0 +1,553 @@
+
+/** \file   mri_core_grappa.cpp
+    \brief  GRAPPA implementation for 2D and 3D MRI parallel imaging
+    \author Hui Xue
+
+    References to the implementation can be found in:
+
+    Griswold MA, Jakob PM, Heidemann RM, Nittka M, Jellus V, Wang J, Kiefer B, Haase A. 
+    Generalized autocalibrating partially parallel acquisitions (GRAPPA). 
+    Magnetic Resonance in Medicine 2002;47(6):1202-1210.
+
+    Kellman P, Epstein FH, McVeigh ER. 
+    Adaptive sensitivity encoding incorporating temporal filtering (TSENSE). 
+    Magnetic Resonance in Medicine 2001;45(5):846-852.
+
+    Breuer FA, Kellman P, Griswold MA, Jakob PM.
+    Dynamic autocalibrated parallel imaging using temporal GRAPPA (TGRAPPA). 
+    Magnetic Resonance in Medicine 2005;53(4):981-985.
+
+    Saybasili H, Kellman P, Griswold MA, Derbyshire JA, Guttman MA.
+    HTGRAPPA: Real-time B1-weighted image domain TGRAPPA reconstruction.
+    Magnetic Resonance in Medicine 2009;61(6):1425-1433.
+*/
+
+#include "mri_core_grappa.h"
+#include "mri_core_utility.h"
+#include "hoMatrix.h"
+#include "hoNDArray_linalg.h"
+#include "hoNDFFT.h"
+#include "hoNDArray_utils.h"
+#include "hoNDArray_elemwise.h"
+
+#ifdef USE_OMP
+    #include "omp.h"
+#endif // USE_OMP
+
+namespace Gadgetron
+{
+
+void grappa2d_kerPattern(std::vector<int>& kE1, std::vector<int>& oE1, size_t& convKRO, size_t& convKE1, size_t accelFactor, size_t kRO, size_t kNE1, bool fitItself)
+{
+    kE1.resize(kNE1, 0);
+    if ( kNE1%2 == 0 )
+    {
+        long long k;
+        for ( k=-((long long)kNE1/2-1); k<=(long long)kNE1/2; k++ )
+        {
+            kE1[k+kNE1/2-1] = (int)(k*accelFactor);
+        }
+    }
+    else
+    {
+        long long k;
+        for ( k=-(long long)kNE1/2; k<=(long long)kNE1/2; k++ )
+        {
+            kE1[k+kNE1/2] = (int)(k*accelFactor);
+        }
+    }
+
+    if ( fitItself )
+    {
+        oE1.resize(accelFactor);
+        for ( long long a=0; a<(long long)accelFactor; a++ )
+        {
+            oE1[a] = (int)a;
+        }
+    }
+    else
+    {
+        oE1.resize(accelFactor-1);
+        for ( long long a=1; a<(long long)accelFactor; a++ )
+        {
+            oE1[a-1] = (int)a;
+        }
+    }
+
+    convKRO = 2 * kRO + 3;
+
+    long long maxKE1 = std::abs(kE1[0]);
+    if (std::abs(kE1[kNE1 - 1]) > maxKE1)
+    {
+        maxKE1 = std::abs(kE1[kNE1 - 1]);
+    }
+    convKE1 = 2 * maxKE1 + 1;
+
+    return;
+}
+
+template <typename T> 
+void grappa2d_calib(const hoNDArray<T>& acsSrc, const hoNDArray<T>& acsDst, double thres, size_t kRO, const std::vector<int>& kE1, const std::vector<int>& oE1, size_t startRO, size_t endRO, size_t startE1, size_t endE1, hoNDArray<T>& ker)
+{
+    try
+    {
+        GADGET_CHECK_THROW(acsSrc.get_size(0)==acsDst.get_size(0));
+        GADGET_CHECK_THROW(acsSrc.get_size(1)==acsDst.get_size(1));
+        GADGET_CHECK_THROW(acsSrc.get_size(2)>=acsDst.get_size(2));
+
+        size_t RO = acsSrc.get_size(0);
+        size_t E1 = acsSrc.get_size(1);
+        size_t srcCHA = acsSrc.get_size(2);
+        size_t dstCHA = acsDst.get_size(2);
+
+        const T* pSrc = acsSrc.begin();
+        const T* pDst = acsDst.begin();
+
+        long long kROhalf = kRO/2;
+        if ( 2*kROhalf == kRO )
+        {
+            GWARN_STREAM("grappa2d_calib(...) - 2*kROhalf == kRO " << kRO);
+        }
+        kRO = 2*kROhalf + 1;
+
+        size_t kNE1 = kE1.size();
+        size_t oNE1 = oE1.size();
+
+        /// allocate kernel
+        ker.create(kRO, kNE1, srcCHA, dstCHA, oNE1);
+
+        /// loop over the calibration region and assemble the equation
+        /// Ax = b
+
+        size_t sRO = startRO + kROhalf;
+        size_t eRO = endRO - kROhalf;
+        size_t sE1 = std::abs(kE1[0]) + startE1;
+        size_t eE1 = endE1 - kE1[kNE1-1];
+
+        size_t lenRO = eRO - sRO + 1;
+
+        size_t rowA = (eE1-sE1+1)*lenRO;
+        size_t colA = kRO*kNE1*srcCHA;
+        size_t colB = dstCHA*oNE1;
+
+        hoMatrix<T> A;
+        hoMatrix<T> B;
+        hoMatrix<T> x( colA, colB );
+
+        hoNDArray<T> A_mem(rowA, colA);
+        A.createMatrix( rowA, colA, A_mem.begin() );
+        T* pA = A.begin();
+
+        hoNDArray<T> B_mem(rowA, colB);
+        B.createMatrix( A.rows(), colB, B_mem.begin() );
+        T* pB = B.begin();
+
+        long long e1;
+        for ( e1=(long long)sE1; e1<=(long long)eE1; e1++ )
+        {
+            for ( long long ro=sRO; ro<=(long long)eRO; ro++ )
+            {
+                long long rInd = (e1-sE1)*lenRO+ro-kROhalf;
+
+                size_t src, dst, ke1, oe1;
+                long long kro;
+
+                /// fill matrix A
+                size_t col = 0;
+                size_t offset = 0;
+                for ( src=0; src<srcCHA; src++ )
+                {
+                    for ( ke1=0; ke1<kNE1; ke1++ )
+                    {
+                        offset = src*RO*E1 + (e1+kE1[ke1])*RO;
+                        for ( kro=-kROhalf; kro<=kROhalf; kro++ )
+                        {
+                            /// A(rInd, col++) = acsSrc(ro+kro, e1+kE1[ke1], src);
+                            pA[rInd + col*rowA] = pSrc[ro+kro+offset];
+                            col++;
+                        }
+                    }
+                }
+
+                /// fill matrix B
+                col = 0;
+                for ( oe1=0; oe1<oNE1; oe1++ )
+                {
+                    for ( dst=0; dst<dstCHA; dst++ )
+                    {
+                        B(rInd, col++) = acsDst(ro, e1+oE1[oe1], dst);
+                    }
+                }
+            }
+        }
+
+        SolveLinearSystem_Tikhonov(A, B, x, thres);
+        memcpy(ker.begin(), x.begin(), ker.get_number_of_bytes());
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in grappa2d_calib(...) ... ");
+    }
+
+    return;
+}
+
+template EXPORTMRICORE void grappa2d_calib(const hoNDArray< std::complex<float> >& acsSrc, const hoNDArray< std::complex<float> >& acsDst, double thres, size_t kRO, const std::vector<int>& kE1, const std::vector<int>& oE1, size_t startRO, size_t endRO, size_t startE1, size_t endE1, hoNDArray< std::complex<float> >& ker);
+template EXPORTMRICORE void grappa2d_calib(const hoNDArray< std::complex<double> >& acsSrc, const hoNDArray< std::complex<double> >& acsDst, double thres, size_t kRO, const std::vector<int>& kE1, const std::vector<int>& oE1, size_t startRO, size_t endRO, size_t startE1, size_t endE1, hoNDArray< std::complex<double> >& ker);
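For reference, a sketch of the single least-squares system (with multiple right-hand sides) that grappa2d_calib assembles from the ACS region and solves via SolveLinearSystem_Tikhonov, using the names defined in the code:

    A \in \mathbb{C}^{\text{rowA} \times \text{colA}}, \quad
    B \in \mathbb{C}^{\text{rowA} \times \text{colB}}, \quad
    A\, x \approx B, \qquad
    \text{rowA} = (\text{eE1}-\text{sE1}+1)\,\text{lenRO}, \;
    \text{colA} = \text{kRO}\cdot\text{kNE1}\cdot\text{srcCHA}, \;
    \text{colB} = \text{dstCHA}\cdot\text{oNE1}

with the solution x copied into ker(kRO, kNE1, srcCHA, dstCHA, oNE1).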
+
+// ------------------------------------------------------------------------
+
+template <typename T>
+void grappa2d_convert_to_convolution_kernel(const hoNDArray<T>& ker, size_t kRO, const std::vector<int>& kE1, const std::vector<int>& oE1, hoNDArray<T>& convKer)
+{
+    try
+    {
+        long long srcCHA = (long long)(ker.get_size(2));
+        long long dstCHA = (long long)(ker.get_size(3));
+        long long kNE1 = (long long)(kE1.size());
+        long long oNE1 = (long long)(oE1.size());
+
+        long long kROhalf = kRO / 2;
+        if (2 * kROhalf == kRO)
+        {
+            GWARN_STREAM("grappa2d_convert_to_convolution_kernel - 2*kROhalf == kRO " << kRO);
+        }
+        kRO = 2 * kROhalf + 1;
+
+        //// fill the convolution kernels
+        long long convKRO = 2 * kRO + 3;
+
+        long long maxKE1 = std::abs(kE1[0]);
+        if (std::abs(kE1[kNE1 - 1]) > maxKE1)
+        {
+            maxKE1 = std::abs(kE1[kNE1 - 1]);
+        }
+        long long convKE1 = 2 * maxKE1 + 1;
+
+        //// allocate the convolution kernel
+        convKer.create(convKRO, convKE1, srcCHA, dstCHA);
+        Gadgetron::clear(&convKer);
+
+        //// index
+        long long oe1, kro, ke1, src, dst;
+
+        //// fill the convolution kernel and sum up multiple kernels
+        for (oe1 = 0; oe1<oNE1; oe1++)
+        {
+            for (ke1 = 0; ke1<kNE1; ke1++)
+            {
+                for (kro = -kROhalf; kro <= kROhalf; kro++)
+                {
+                    for (dst = 0; dst<dstCHA; dst++)
+                    {
+                        for (src = 0; src<srcCHA; src++)
+                        {
+                            convKer(-kro + kRO + 1, oE1[oe1] - kE1[ke1] + maxKE1, src, dst) = ker(kro + kROhalf, ke1, src, dst, oe1);
+                        }
+                    }
+
+                }
+            }
+        }
+
+        // if the output pattern does not include offset 0, the kernel does not reproduce the acquired lines;
+        // place an identity tap at the kernel center so acquired k-space samples pass through unchanged
+        if (oE1[0] != 0)
+        {
+            for (dst = 0; dst<dstCHA; dst++)
+            {
+                convKer(kRO + 1, maxKE1, dst, dst) = 1.0;
+            }
+        }
+    }
+    catch (...)
+    {
+        GADGET_THROW("Errors in grappa2d_convert_to_convolution_kernel(...) ... ");
+    }
+
+    return;
+}
+
+template EXPORTMRICORE void grappa2d_convert_to_convolution_kernel(const hoNDArray< std::complex<float> >& ker, size_t kRO, const std::vector<int>& kE1, const std::vector<int>& oE1, hoNDArray< std::complex<float> >& convKer);
+template EXPORTMRICORE void grappa2d_convert_to_convolution_kernel(const hoNDArray< std::complex<double> >& ker, size_t kRO, const std::vector<int>& kE1, const std::vector<int>& oE1, hoNDArray< std::complex<double> >& convKer);
+
+// ------------------------------------------------------------------------
+
+template <typename T>
+void grappa2d_calib_convolution_kernel(const hoNDArray<T>& acsSrc, const hoNDArray<T>& acsDst, size_t accelFactor, double thres, size_t kRO, size_t kNE1, size_t startRO, size_t endRO, size_t startE1, size_t endE1, hoNDArray<T>& convKer)
+{
+    try
+    {
+        std::vector<int> kE1, oE1;
+
+        bool fitItself = false;
+        if (&acsSrc != &acsDst) fitItself = true;
+
+        size_t convkRO, convkE1;
+
+        grappa2d_kerPattern(kE1, oE1, convkRO, convkE1, accelFactor, kRO, kNE1, fitItself);
+
+        hoNDArray<T> ker;
+        grappa2d_calib(acsSrc, acsDst, thres, kRO, kE1, oE1, startRO, endRO, startE1, endE1, ker);
+
+        grappa2d_convert_to_convolution_kernel(ker, kRO, kE1, oE1, convKer);
+
+    }
+    catch (...)
+    {
+        GADGET_THROW("Errors in grappa2d_calib_convolution_kernel(...) ... ");
+    }
+
+    return;
+}
+
+template EXPORTMRICORE void grappa2d_calib_convolution_kernel(const hoNDArray< std::complex<float> >& acsSrc, const hoNDArray< std::complex<float> >& acsDst, size_t accelFactor, double thres, size_t kRO, size_t kNE1, size_t startRO, size_t endRO, size_t startE1, size_t endE1, hoNDArray< std::complex<float> >& convKer);
+template EXPORTMRICORE void grappa2d_calib_convolution_kernel(const hoNDArray< std::complex<double> >& acsSrc, const hoNDArray< std::complex<double> >& acsDst, size_t accelFactor, double thres, size_t kRO, size_t kNE1, size_t startRO, size_t endRO, size_t startE1, size_t endE1, hoNDArray< std::complex<double> >& convKer);
+
+
+// ------------------------------------------------------------------------
+
+template <typename T>
+void grappa2d_calib_convolution_kernel(const hoNDArray<T>& acsSrc, const hoNDArray<T>& acsDst, size_t accelFactor, double thres, size_t kRO, size_t kNE1, hoNDArray<T>& convKer)
+{
+    size_t startRO = 0;
+    size_t endRO = acsSrc.get_size(0) - 1;
+    size_t startE1 = 0;
+    size_t endE1 = acsSrc.get_size(1) - 1;
+
+    grappa2d_calib_convolution_kernel(acsSrc, acsDst, accelFactor, thres, kRO, kNE1, startRO, endRO, startE1, endE1, convKer);
+}
+
+template EXPORTMRICORE void grappa2d_calib_convolution_kernel(const hoNDArray< std::complex<float> >& acsSrc, const hoNDArray< std::complex<float> >& acsDst, size_t accelFactor, double thres, size_t kRO, size_t kNE1, hoNDArray< std::complex<float> >& convKer);
+template EXPORTMRICORE void grappa2d_calib_convolution_kernel(const hoNDArray< std::complex<double> >& acsSrc, const hoNDArray< std::complex<double> >& acsDst, size_t accelFactor, double thres, size_t kRO, size_t kNE1, hoNDArray< std::complex<double> >& convKer);
+
+// ------------------------------------------------------------------------
+
+template <typename T> 
+void grappa2d_calib_convolution_kernel(const hoNDArray<T>& dataSrc, const hoNDArray<T>& dataDst, hoNDArray<unsigned short>& dataMask, size_t accelFactor, double thres, size_t kRO, size_t kNE1, hoNDArray<T>& convKer)
+{
+    try
+    {
+        bool fitItself = false;
+        if (&dataSrc != &dataDst) fitItself = true;
+
+        GADGET_CHECK_THROW(dataSrc.dimensions_equal(&dataMask));
+        GADGET_CHECK_THROW(dataDst.dimensions_equal(&dataMask));
+
+        // find the fully sampled region
+        size_t RO = dataMask.get_size(0);
+        size_t E1 = dataMask.get_size(1);
+        size_t srcCHA = dataSrc.get_size(2);
+        size_t dstCHA = dataDst.get_size(2);
+
+        size_t startRO(RO - 1), endRO(0), startE1(E1 - 1), endE1(0); // start indices begin at the maximum so the first masked sample can shrink them
+
+        size_t ro, e1;
+
+        for (e1 = 0; e1 < E1; e1++)
+        {
+            for (ro = 0; ro < RO; ro++)
+            {
+                if (dataMask(ro, e1)>0)
+                {
+                    if (ro < startRO) startRO = ro;
+                    if (ro > endRO) endRO = ro;
+
+                    if (e1 < startE1) startE1 = e1;
+                    if (e1 > endE1) endE1 = e1;
+                }
+            }
+        }
+
+        GADGET_CHECK_THROW(endRO>startRO);
+        GADGET_CHECK_THROW(endE1>startE1 + accelFactor);
+
+        GADGET_CATCH_THROW(grappa2d_calib_convolution_kernel(dataSrc, dataDst, accelFactor, thres, kRO, kNE1, startRO, endRO, startE1, endE1, convKer));
+    }
+    catch (...)
+    {
+        GADGET_THROW("Errors in grappa2d_calib_convolution_kernel(dataMask) ... ");
+    }
+}
+
+template EXPORTMRICORE void grappa2d_calib_convolution_kernel(const hoNDArray< std::complex<float> >& dataSrc, const hoNDArray< std::complex<float> >& dataDst, hoNDArray<unsigned short>& dataMask, size_t accelFactor, double thres, size_t kRO, size_t kNE1, hoNDArray< std::complex<float> >& convKer);
+template EXPORTMRICORE void grappa2d_calib_convolution_kernel(const hoNDArray< std::complex<double> >& dataSrc, const hoNDArray< std::complex<double> >& dataDst, hoNDArray<unsigned short>& dataMask, size_t accelFactor, double thres, size_t kRO, size_t kNE1, hoNDArray< std::complex<double> >& convKer);
+
+// ------------------------------------------------------------------------
+
+template <typename T> 
+void grappa2d_image_domain_kernel(const hoNDArray<T>& convKer, size_t RO, size_t E1, hoNDArray<T>& kIm)
+{
+    try
+    {
+        hoNDArray<T> convKerScaled(convKer);
+        Gadgetron::scal((typename realType<T>::Type)(std::sqrt((double)(RO*E1))), convKerScaled);
+        Gadgetron::pad(RO, E1, &convKerScaled, &kIm);
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(kIm);
+    }
+    catch(...)
+    {
+        GADGET_THROW("Errors in grappa2d_image_domain_kernel(...) ... ");
+    }
+
+    return;
+}
+
+template EXPORTMRICORE void grappa2d_image_domain_kernel(const hoNDArray< std::complex<float> >& convKer, size_t RO, size_t E1, hoNDArray< std::complex<float> >& kIm);
+template EXPORTMRICORE void grappa2d_image_domain_kernel(const hoNDArray< std::complex<double> >& convKer, size_t RO, size_t E1, hoNDArray< std::complex<double> >& kIm);
+
+// ------------------------------------------------------------------------
+
+template <typename T>
+void grappa2d_unmixing_coeff(const hoNDArray<T>& kerIm, const hoNDArray<T>& coilMap, size_t acceFactorE1, hoNDArray<T>& unmixCoeff, hoNDArray< typename realType<T>::Type >& gFactor)
+{
+    try
+    {
+        typedef typename realType<T>::Type value_type;
+
+        size_t RO = kerIm.get_size(0);
+        size_t E1 = kerIm.get_size(1);
+        size_t srcCHA = kerIm.get_size(2);
+        size_t dstCHA = kerIm.get_size(3);
+
+        GADGET_CHECK_THROW(acceFactorE1 >= 1);
+
+        GADGET_CHECK_THROW(coilMap.get_size(0) == RO);
+        GADGET_CHECK_THROW(coilMap.get_size(1) == E1);
+        GADGET_CHECK_THROW(coilMap.get_size(2) == dstCHA);
+
+        std::vector<size_t> dimUnmixing(3);
+        dimUnmixing[0] = RO; dimUnmixing[1] = E1; dimUnmixing[2] = srcCHA;
+        if (!unmixCoeff.dimensions_equal(&dimUnmixing))
+        {
+            unmixCoeff.create(RO, E1, srcCHA);
+        }
+        Gadgetron::clear(&unmixCoeff);
+
+        std::vector<size_t> dimGFactor(2);
+        dimGFactor[0] = RO; dimGFactor[1] = E1;
+        if (!gFactor.dimensions_equal(&dimGFactor))
+        {
+            gFactor.create(RO, E1);
+        }
+        Gadgetron::clear(&gFactor);
+
+        int src;
+
+        T* pKerIm = const_cast<T*>(kerIm.begin());
+        T* pCoilMap = const_cast<T*>(coilMap.begin());
+        T* pCoeff = unmixCoeff.begin();
+
+        std::vector<size_t> dim(2);
+        dim[0] = RO;
+        dim[1] = E1;
+
+#pragma omp parallel default(none) private(src) shared(RO, E1, srcCHA, dstCHA, pKerIm, pCoilMap, pCoeff, dim)
+        {
+            hoNDArray<T> coeff2D, coeffTmp(&dim);
+            hoNDArray<T> coilMap2D;
+            hoNDArray<T> kerIm2D;
+
+#pragma omp for
+            for (src = 0; src<(int)srcCHA; src++)
+            {
+                coeff2D.create(&dim, pCoeff + src*RO*E1);
+
+                for (size_t dst = 0; dst<dstCHA; dst++)
+                {
+                    kerIm2D.create(&dim, pKerIm + src*RO*E1 + dst*RO*E1*srcCHA);
+                    coilMap2D.create(&dim, pCoilMap + dst*RO*E1);
+                    Gadgetron::multiplyConj(kerIm2D, coilMap2D, coeffTmp);
+                    Gadgetron::add(coeff2D, coeffTmp, coeff2D);
+                }
+            }
+        }
+
+        hoNDArray<T> conjUnmixCoeff(unmixCoeff);
+        Gadgetron::multiplyConj(unmixCoeff, conjUnmixCoeff, conjUnmixCoeff);
+        // Gadgetron::sumOverLastDimension(conjUnmixCoeff, gFactor);
+
+        hoNDArray<T> gFactorBuf(RO, E1, 1);
+        Gadgetron::sum_over_dimension(conjUnmixCoeff, gFactorBuf, 2);
+        Gadgetron::sqrt(gFactorBuf, gFactorBuf);
+        Gadgetron::scal((value_type)(1.0 / acceFactorE1), gFactorBuf);
+
+        Gadgetron::complex_to_real(gFactorBuf, gFactor);
+    }
+    catch (...)
+    {
+        GADGET_THROW("Errors in grappa2d_unmixing_coeff(const hoNDArray<T>& kerIm, const hoNDArray<T>& coilMap, hoNDArray<T>& unmixCoeff, hoNDArray<T>& gFactor) ... ");
+    }
+}
+
+template EXPORTMRICORE void grappa2d_unmixing_coeff(const hoNDArray< std::complex<float> >& kerIm, const hoNDArray< std::complex<float> >& coilMap, size_t acceFactorE1, hoNDArray< std::complex<float> >& unmixCoeff, hoNDArray<float>& gFactor);
+template EXPORTMRICORE void grappa2d_unmixing_coeff(const hoNDArray< std::complex<double> >& kerIm, const hoNDArray< std::complex<double> >& coilMap, size_t acceFactorE1, hoNDArray< std::complex<double> >& unmixCoeff, hoNDArray<double>& gFactor);
+
+// ------------------------------------------------------------------------
+
+template <typename T>
+void apply_unmix_coeff_kspace(const hoNDArray<T>& kspace, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm)
+{
+    try
+    {
+        GADGET_CHECK_THROW(kspace.get_size(0) == unmixCoeff.get_size(0));
+        GADGET_CHECK_THROW(kspace.get_size(1) == unmixCoeff.get_size(1));
+        GADGET_CHECK_THROW(kspace.get_size(2) == unmixCoeff.get_size(2));
+
+        hoNDArray<T> buffer2DT(kspace);
+        GADGET_CATCH_THROW(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(kspace, buffer2DT));
+
+        std::vector<size_t> dim;
+        kspace.get_dimensions(dim);
+        dim[2] = 1;
+
+        if (!complexIm.dimensions_equal(&dim))
+        {
+            complexIm.create(&dim);
+        }
+
+        Gadgetron::multiply(buffer2DT, unmixCoeff, buffer2DT);
+        Gadgetron::sum_over_dimension(buffer2DT, complexIm, 2);
+    }
+    catch (...)
+    {
+        GADGET_THROW("Errors in apply_unmix_coeff_kspace(const hoNDArray<T>& kspace, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm) ... ");
+    }
+}
+
+template EXPORTMRICORE void apply_unmix_coeff_kspace(const hoNDArray< std::complex<float> >& kspace, const hoNDArray< std::complex<float> >& unmixCoeff, hoNDArray< std::complex<float> >& complexIm);
+template EXPORTMRICORE void apply_unmix_coeff_kspace(const hoNDArray< std::complex<double> >& kspace, const hoNDArray< std::complex<double> >& unmixCoeff, hoNDArray< std::complex<double> >& complexIm);
+
+// ------------------------------------------------------------------------
+
+template <typename T>
+void apply_unmix_coeff_aliased_image(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm)
+{
+    try
+    {
+        GADGET_CHECK_THROW(aliasedIm.get_size(0) == unmixCoeff.get_size(0));
+        GADGET_CHECK_THROW(aliasedIm.get_size(1) == unmixCoeff.get_size(1));
+        GADGET_CHECK_THROW(aliasedIm.get_size(2) == unmixCoeff.get_size(2));
+
+        std::vector<size_t> dim;
+        aliasedIm.get_dimensions(dim);
+        dim[2] = 1;
+
+        if (!complexIm.dimensions_equal(&dim))
+        {
+            complexIm.create(&dim);
+        }
+
+        hoNDArray<T> buffer2DT(aliasedIm);
+
+        Gadgetron::multiply(aliasedIm, unmixCoeff, buffer2DT);
+        Gadgetron::sum_over_dimension(buffer2DT, complexIm, 2);
+    }
+    catch (...)
+    {
+        GADGET_THROW("Errors in apply_unmix_coeff_aliased_image(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm) ... ");
+    }
+}
+
+template EXPORTMRICORE void apply_unmix_coeff_aliased_image(const hoNDArray< std::complex<float> >& aliasedIm, const hoNDArray< std::complex<float> >& unmixCoeff, hoNDArray< std::complex<float> >& complexIm);
+template EXPORTMRICORE void apply_unmix_coeff_aliased_image(const hoNDArray< std::complex<double> >& aliasedIm, const hoNDArray< std::complex<double> >& unmixCoeff, hoNDArray< std::complex<double> >& complexIm);
+
+}
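The exported functions above chain into a complete 2D GRAPPA reconstruction. The following is a minimal usage sketch, not part of the imported sources: the arrays acs, aliasedIm and coilMap are hypothetical inputs assumed to already hold data in the layouts documented in mri_core_grappa.h, and the kernel size (5x4) and Tikhonov threshold (5e-4) are illustrative values only.

    #include <complex>
    #include "mri_core_grappa.h"
    #include "hoNDArray.h"

    using namespace Gadgetron;

    // Sketch: reconstruct unaliased images from aliased coil images, ACS data and
    // coil sensitivity maps for an acceleration factor accelFactorE1 along E1.
    void grappa2d_recon_sketch(const hoNDArray< std::complex<float> >& acs,        // [RO E1 CHA] fully sampled calibration data
                               const hoNDArray< std::complex<float> >& aliasedIm,  // [RO E1 CHA] aliased coil images
                               const hoNDArray< std::complex<float> >& coilMap,    // [RO E1 CHA] coil sensitivity maps
                               size_t accelFactorE1,
                               hoNDArray< std::complex<float> >& complexIm,        // [RO E1 1] unaliased image
                               hoNDArray<float>& gFactor)                          // [RO E1] g-factor map
    {
        size_t RO = aliasedIm.get_size(0);
        size_t E1 = aliasedIm.get_size(1);

        // 1) calibrate the GRAPPA convolution kernel on the ACS data
        hoNDArray< std::complex<float> > convKer;
        grappa2d_calib_convolution_kernel(acs, acs, accelFactorE1, 0.0005, 5, 4, convKer);

        // 2) transform the convolution kernel to the image domain
        hoNDArray< std::complex<float> > kIm;
        grappa2d_image_domain_kernel(convKer, RO, E1, kIm);

        // 3) combine with the coil maps into unmixing coefficients and a g-factor map
        hoNDArray< std::complex<float> > unmixCoeff;
        grappa2d_unmixing_coeff(kIm, coilMap, accelFactorE1, unmixCoeff, gFactor);

        // 4) apply the unmixing coefficients to the aliased coil images
        apply_unmix_coeff_aliased_image(aliasedIm, unmixCoeff, complexIm);
    }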
diff --git a/toolboxes/mri_core/mri_core_grappa.h b/toolboxes/mri_core/mri_core_grappa.h
new file mode 100644
index 0000000..66fd756
--- /dev/null
+++ b/toolboxes/mri_core/mri_core_grappa.h
@@ -0,0 +1,80 @@
+
+/** \file   mri_core_grappa.h
+    \brief  GRAPPA implementation for 2D and 3D MRI parallel imaging
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "mri_core_export.h"
+#include "hoNDArray.h"
+
+namespace Gadgetron {
+    /// ---------------------------------------------------------------------
+    /// 2D grappa
+    /// ---------------------------------------------------------------------
+
+    /// grappa 2d calibration function to compute convolution kernel
+    /// acsSrc : calibration data for source channel [RO E1 srcCHA], full kspace
+    /// acsDst : calibration data for destination channel [RO E1 dstCHA], full kspace
+    /// startRO, endRO, startE1, endE1: define the data region [startRO endRO], [startE1 endE1] which is used for calibration
+    /// accelFactor: acceleration factor
+    /// thres: the threshold for regularization during kernel estimation
+    /// kRO: kernel size along RO
+    /// kNE1: kernel size along E1
+    /// convKer: computed grappa convolution kernel
+    template <typename T> EXPORTMRICORE void grappa2d_calib_convolution_kernel(const hoNDArray<T>& acsSrc, const hoNDArray<T>& acsDst, size_t accelFactor, double thres, size_t kRO, size_t kNE1, size_t startRO, size_t endRO, size_t startE1, size_t endE1, hoNDArray<T>& convKer);
+    /// the entire acsSrc and acsDst arrays are used for calibration
+    template <typename T> EXPORTMRICORE void grappa2d_calib_convolution_kernel(const hoNDArray<T>& acsSrc, const hoNDArray<T>& acsDst, size_t accelFactor, double thres, size_t kRO, size_t kNE1, hoNDArray<T>& convKer);
+    /// dataMask : [RO E1] array marking the fully sampled rectangular region with 1
+    template <typename T> EXPORTMRICORE void grappa2d_calib_convolution_kernel(const hoNDArray<T>& dataSrc, const hoNDArray<T>& dataDst, hoNDArray<unsigned short>& dataMask, size_t accelFactor, double thres, size_t kRO, size_t kNE1, hoNDArray<T>& convKer);
+
+    /// compute the image domain kernel from the 2D grappa convolution kernel
+    /// RO, E1: the size of image domain kernel
+    /// kIm: image domain kernel [RO E1 srcCHA dstCHA]
+    template <typename T> EXPORTMRICORE void grappa2d_image_domain_kernel(const hoNDArray<T>& convKer, size_t RO, size_t E1, hoNDArray<T>& kIm);
+
+    /// compute unmixing coefficient from image domain kernel and coil sensitivity
+    /// kerIm: [RO E1 srcCHA dstCHA], image domain kernel
+    /// coilMap: [RO E1 dstCHA] coil sensitivity map
+    /// unmixCoeff: [RO E1 srcCHA] unmixing coefficient
+    /// gFactor: [RO E1], gfactor
+    template <typename T> EXPORTMRICORE void grappa2d_unmixing_coeff(const hoNDArray<T>& kerIm, const hoNDArray<T>& coilMap, size_t acceFactorE1, hoNDArray<T>& unmixCoeff, hoNDArray< typename realType<T>::Type >& gFactor);
+
+    /// apply unmixing coefficient on undersampled kspace
+    /// kspace: [RO E1 srcCHA ...]
+    /// unmixCoeff : [RO E1 srcCHA]
+    /// complexIm : [RO E1 ...] wrapped complex images
+    template <typename T> EXPORTMRICORE void apply_unmix_coeff_kspace(const hoNDArray<T>& kspace, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm);
+
+    /// aliasedIm : [RO E1 srcCHA ...]
+    template <typename T> EXPORTMRICORE void apply_unmix_coeff_aliased_image(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm);
+
+    /// ------------------------
+    /// grappa 2d low level functions
+    /// ------------------------
+    /// get the kernel pattern, given the acceleration factor and kernel size
+    /// kE1: kernel pattern along E1
+    /// oE1: output pattern along E1
+    /// convKRO: convolution kernel size along RO
+    /// convKE1: convolution kernel size along E1
+    /// e.g. for R=4 and kNE1=4, the kernel pattern kE1 will be [-4 0 4 8] and the output pattern oE1 will be [0 1 2 3] if fitItself==true
+    /// if fitItself==false, the output pattern oE1 will be [1 2 3]
+    /// if acsSrc and acsDst are generated in different ways, fitItself often needs to be true; e.g. acsSrc is in the originally acquired channels
+    /// and acsDst is in the eigen channels
+    /// accelFactor: acceleration factor
+    /// kRO: kernel size along RO
+    /// kNE1: kernel size along E1
+    EXPORTMRICORE void grappa2d_kerPattern(std::vector<int>& kE1, std::vector<int>& oE1, size_t& convKRO, size_t& convKE1, size_t accelFactor, size_t kRO, size_t kNE1, bool fitItself);
+
+    /// grappa calibration for 2D case
+    /// kE1: the kernel pattern along E1
+    /// oE1: the output kernel pattern along E1
+    /// ker : kernel array [kRO kE1 srcCHA dstCHA oE1]
+    template <typename T> EXPORTMRICORE void grappa2d_calib(const hoNDArray<T>& acsSrc, const hoNDArray<T>& acsDst, double thres, size_t kRO, const std::vector<int>& kE1, const std::vector<int>& oE1, hoNDArray<T>& ker);
+
+    /// convert the grappa multiplication kernel computed from grappa2d_calib to convolution kernel
+    /// convKer : [convRO convE1 srcCHA dstCHA]
+    template <typename T> EXPORTMRICORE void grappa2d_convert_to_convolution_kernel(const hoNDArray<T>& ker, size_t kRO, const std::vector<int>& kE1, const std::vector<int>& oE1, hoNDArray<T>& convKer);
+
+}
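For the low-level path, grappa2d_kerPattern supplies the source and target offsets that grappa2d_calib and grappa2d_convert_to_convolution_kernel consume. A small sketch, not from the imported tree, that reproduces the R=4, kNE1=4 example documented above:

    #include <cstdio>
    #include <vector>
    #include "mri_core_grappa.h"

    int main()
    {
        std::vector<int> kE1, oE1;
        size_t convKRO = 0, convKE1 = 0;

        // acceleration factor 4, kernel size 5x4, fitItself == true
        Gadgetron::grappa2d_kerPattern(kE1, oE1, convKRO, convKE1, 4, 5, 4, true);

        for (size_t i = 0; i < kE1.size(); i++) printf("kE1[%zu] = %d\n", i, kE1[i]); // documented example: -4 0 4 8
        for (size_t i = 0; i < oE1.size(); i++) printf("oE1[%zu] = %d\n", i, oE1[i]); // documented example: 0 1 2 3
        printf("convolution kernel size: %zu x %zu\n", convKRO, convKE1);
        return 0;
    }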
diff --git a/toolboxes/mri_core/mri_core_utility.cpp b/toolboxes/mri_core/mri_core_utility.cpp
new file mode 100644
index 0000000..65e825f
--- /dev/null
+++ b/toolboxes/mri_core/mri_core_utility.cpp
@@ -0,0 +1,13 @@
+
+/** \file   mri_core_utility.cpp
+    \brief  Implementation of utility functions for 2D and 3D MRI parallel imaging
+    \author Hui Xue
+*/
+
+#include "mri_core_utility.h"
+#include "hoNDArray_elemwise.h"
+
+namespace Gadgetron
+{
+
+}
diff --git a/toolboxes/mri_core/mri_core_utility.h b/toolboxes/mri_core/mri_core_utility.h
new file mode 100644
index 0000000..7a7dabd
--- /dev/null
+++ b/toolboxes/mri_core/mri_core_utility.h
@@ -0,0 +1,14 @@
+
+/** \file   mri_core_utility.h
+    \brief  Utility functions for 2D and 3D MRI parallel imaging
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "mri_core_export.h"
+#include "hoNDArray.h"
+
+namespace Gadgetron
+{
+}
diff --git a/toolboxes/nfft/CMakeLists.txt b/toolboxes/nfft/CMakeLists.txt
new file mode 100644
index 0000000..2244056
--- /dev/null
+++ b/toolboxes/nfft/CMakeLists.txt
@@ -0,0 +1,3 @@
+IF (CUDA_FOUND)
+  add_subdirectory(gpu)
+ENDIF (CUDA_FOUND)
diff --git a/toolboxes/nfft/gpu/CMakeLists.txt b/toolboxes/nfft/gpu/CMakeLists.txt
new file mode 100644
index 0000000..db23d5a
--- /dev/null
+++ b/toolboxes/nfft/gpu/CMakeLists.txt
@@ -0,0 +1,47 @@
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_GPUNFFT__)
+  ADD_DEFINITIONS(-D_USE_MATH_DEFINES)
+endif (WIN32)
+
+if(WIN32)
+  link_directories(${Boost_LIBRARY_DIRS})
+endif(WIN32)
+
+include_directories( 
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/fft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  ${CUDA_INCLUDE_DIRS}
+  ${Boost_INCLUDE_DIR}
+  )
+
+cuda_add_library(gadgetron_toolbox_gpunfft SHARED 
+    cuNFFT.h
+    cuNFFTOperator.h
+    gpunfft_export.h
+    cuNFFT.cu 
+    cuNFFTOperator.cu
+  )
+
+set_target_properties(gadgetron_toolbox_gpunfft PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(gadgetron_toolbox_gpunfft 
+  gadgetron_toolbox_gpufft
+  gadgetron_toolbox_gpucore
+  gadgetron_toolbox_log
+  ${Boost_LIBRARIES}
+  ${CUDA_LIBRARIES} 
+  ${CUDA_CUFFT_LIBRARIES} 
+  ${CUDA_CUBLAS_LIBRARIES}
+  ${CUDA_CUSPARSE_LIBRARIES}
+  )
+
+install(TARGETS gadgetron_toolbox_gpunfft DESTINATION lib COMPONENT main)
+
+install(FILES 
+  cuNFFT.h 
+  cuNFFTOperator.h 
+  gpunfft_export.h 
+  DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
diff --git a/toolboxes/nfft/gpu/KaiserBessel_kernel.cu b/toolboxes/nfft/gpu/KaiserBessel_kernel.cu
new file mode 100644
index 0000000..09c3889
--- /dev/null
+++ b/toolboxes/nfft/gpu/KaiserBessel_kernel.cu
@@ -0,0 +1,127 @@
+//
+// Kaiser-Bessel convolution kernels
+//
+
+__inline__ __device__ double 
+bessi0(double x)
+{
+   double denominator;
+   double numerator;
+   double z;
+
+   if (x == 0.0) {
+      return 1.0;
+   } else {
+      z = x * x;
+      numerator = (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* 
+                     (z* 0.210580722890567e-22  + 0.380715242345326e-19 ) +
+                         0.479440257548300e-16) + 0.435125971262668e-13 ) +
+                         0.300931127112960e-10) + 0.160224679395361e-7  ) +
+                         0.654858370096785e-5)  + 0.202591084143397e-2  ) +
+                         0.463076284721000e0)   + 0.754337328948189e2   ) +
+                         0.830792541809429e4)   + 0.571661130563785e6   ) +
+                         0.216415572361227e8)   + 0.356644482244025e9   ) +
+                         0.144048298227235e10);
+
+      denominator = (z*(z*(z-0.307646912682801e4)+
+                       0.347626332405882e7)-0.144048298227235e10);
+   }
+
+   return -numerator/denominator;
+}
+
+__inline__ __device__ float 
+bessi0(float x)
+{
+   float denominator;
+   float numerator;
+   float z;
+
+   if (x == 0.0f) {
+      return 1.0f;
+   } else {
+      z = x * x;
+      numerator = (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* 
+                     (z* 0.210580722890567e-22f  + 0.380715242345326e-19f ) +
+                         0.479440257548300e-16f) + 0.435125971262668e-13f ) +
+                         0.300931127112960e-10f) + 0.160224679395361e-7f  ) +
+                         0.654858370096785e-5f)  + 0.202591084143397e-2f  ) +
+                         0.463076284721000e0f)   + 0.754337328948189e2f   ) +
+                         0.830792541809429e4f)   + 0.571661130563785e6f   ) +
+                         0.216415572361227e8f)   + 0.356644482244025e9f   ) +
+                         0.144048298227235e10f);
+
+      denominator = (z*(z*(z-0.307646912682801e4f)+
+                       0.347626332405882e7f)-0.144048298227235e10f);
+   }
+
+   return -numerator/denominator;
+}
+
+
+// Kaiser-Bessel according to Beatty et al., IEEE TMI 2005;24(6):799-808.
+// There is a slight difference with respect to Jackson's formulation, IEEE TMI 1991;10(3):473-478.
+
+__inline__ __device__ double
+KaiserBessel( double u, double matrix_size_os, double one_over_W, double beta )
+{
+  double _tmp = 2.0*u*one_over_W;
+  double tmp = _tmp*_tmp;
+  double arg = beta*std::sqrt(1.0-tmp);
+  double bessi = bessi0(arg);
+  double ret = matrix_size_os*bessi*one_over_W;
+  return ret;
+}
+
+__inline__ __device__ float
+KaiserBessel( float u, float matrix_size_os, float one_over_W, float beta )
+{
+  float _tmp = 2.0f*u*one_over_W;
+  float tmp = _tmp*_tmp;
+  float arg = beta*std::sqrt(1.0f-tmp);
+  float bessi = bessi0(arg);
+  float ret = matrix_size_os*bessi*one_over_W;
+  return ret;
+}
+
+//
+// Below is the intended interface
+//
+
+template<class REAL> __inline__ __device__ REAL
+KaiserBessel( const Gadgetron::vector_td<REAL,1> &u, const Gadgetron::vector_td<REAL,1> &matrix_size_os, 
+	      REAL one_over_W, const vector_td<REAL,1> &beta )
+{
+  REAL phi_x = KaiserBessel( u.vec[0], matrix_size_os.vec[0], one_over_W, beta[0] );
+  return phi_x;
+}
+
+template<class REAL> __inline__ __device__ REAL
+KaiserBessel( const Gadgetron::vector_td<REAL,2> &u, const Gadgetron::vector_td<REAL,2> &matrix_size_os, 
+	      REAL one_over_W, const vector_td<REAL,2> &beta )
+{
+  REAL phi_x = KaiserBessel( u.vec[0], matrix_size_os.vec[0], one_over_W, beta[0] );
+  REAL phi_y = KaiserBessel( u.vec[1], matrix_size_os.vec[1], one_over_W, beta[1] );
+  return phi_x*phi_y;
+}
+
+template<class REAL> __inline__ __device__ REAL
+KaiserBessel( const Gadgetron::vector_td<REAL,3> &u, const Gadgetron::vector_td<REAL,3> &matrix_size_os, 
+	      REAL one_over_W, const vector_td<REAL,3> &beta )
+{
+  REAL phi_x = KaiserBessel( u.vec[0], matrix_size_os.vec[0], one_over_W, beta[0] );
+  REAL phi_y = KaiserBessel( u.vec[1], matrix_size_os.vec[1], one_over_W, beta[1] );
+  REAL phi_z = KaiserBessel( u.vec[2], matrix_size_os.vec[2], one_over_W, beta[2] );
+  return phi_x*phi_y*phi_z;
+}
+
+template<class REAL> __inline__ __device__ REAL
+KaiserBessel( const Gadgetron::vector_td<REAL,4> &u, const Gadgetron::vector_td<REAL,4> &matrix_size_os,
+	      REAL one_over_W, const vector_td<REAL,4> &beta )
+{
+  REAL phi_x = KaiserBessel( u.vec[0], matrix_size_os.vec[0], one_over_W, beta[0] );
+  REAL phi_y = KaiserBessel( u.vec[1], matrix_size_os.vec[1], one_over_W, beta[1] );
+  REAL phi_z = KaiserBessel( u.vec[2], matrix_size_os.vec[2], one_over_W, beta[2] );
+  REAL phi_w = KaiserBessel( u.vec[3], matrix_size_os.vec[3], one_over_W, beta[3] );
+  return phi_x*phi_y*phi_z*phi_w;
+}
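The device functions above evaluate the 1D Kaiser-Bessel window of Beatty et al. and multiply the per-dimension values into a separable D-dimensional weight. A host-side reference of the 1D window can be useful for cross-checking the GPU's rational bessi0 approximation; this is a sketch, not part of the imported sources, assuming a C++17 standard library that provides std::cyl_bessel_i.

    #include <cmath>

    // Host-side reference of the 1D Kaiser-Bessel window; the caller must ensure
    // |u| <= W/2, otherwise the square root argument becomes negative (the device
    // code guards this via the half_W distance check before calling KaiserBessel).
    double kaiser_bessel_host(double u,              // distance from sample to grid cell
                              double matrix_size_os, // oversampled matrix size along this dimension
                              double one_over_W,     // reciprocal of the kernel width W
                              double beta)           // shape parameter
    {
        double t = 2.0 * u * one_over_W;
        double arg = beta * std::sqrt(1.0 - t * t);
        return matrix_size_os * std::cyl_bessel_i(0.0, arg) * one_over_W;
    }

    // In D dimensions the device code simply multiplies the 1D windows,
    // e.g. phi(u) = phi_x(u.x) * phi_y(u.y) * phi_z(u.z) in 3D.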
diff --git a/toolboxes/nfft/gpu/NFFT_C2NC_conv_kernel.cu b/toolboxes/nfft/gpu/NFFT_C2NC_conv_kernel.cu
new file mode 100644
index 0000000..85c284e
--- /dev/null
+++ b/toolboxes/nfft/gpu/NFFT_C2NC_conv_kernel.cu
@@ -0,0 +1,249 @@
+/*
+  CUDA implementation of the NFFT.
+
+  -----------
+
+  Accelerating the Non-equispaced Fast Fourier Transform on Commodity Graphics Hardware.
+  T.S. Sørensen, T. Schaeffter, K.Ø. Noe, M.S. Hansen. 
+  IEEE Transactions on Medical Imaging 2008; 27(4):538-547.
+
+  Real-time Reconstruction of Sensitivity Encoded Radial Magnetic Resonance Imaging Using a Graphics Processing Unit.
+  T.S. Sørensen, D. Atkinson, T. Schaeffter, M.S. Hansen.
+  IEEE Transactions on Medical Imaging 2009; 28(12): 1974-1985. 
+*/
+
+//
+// There is no header file accompanying this kernel, so it makes most sense to read the code/file from the end and upwards
+//
+
+//
+// Transfer result from shared memory to global memory.
+//
+
+template<class REAL> __inline__ __device__ void 
+NFFT_output( unsigned int number_of_samples, unsigned int number_of_batches, complext<REAL> * __restrict__ samples,
+	     unsigned int double_warp_size_power, unsigned int globalThreadId, unsigned int sharedMemFirstSampleIdx, bool accumulate )
+{
+  
+  REAL *shared_mem = (REAL*) _shared_mem;
+  
+  for( unsigned int batch=0; batch<number_of_batches; batch++ ){
+    complext<REAL>sample_value;
+    sample_value.vec[0] = shared_mem[sharedMemFirstSampleIdx+(batch<<double_warp_size_power)];
+    sample_value.vec[1] = shared_mem[sharedMemFirstSampleIdx+(batch<<double_warp_size_power)+warpSize];
+
+    unsigned int out_idx = (batch*gridDim.y+blockIdx.y)*number_of_samples + globalThreadId;
+
+    if( accumulate ) sample_value += samples[out_idx];
+    samples[out_idx] = sample_value;
+  }
+}
+
+template<unsigned int D> __inline__ __device__ static void
+resolve_wrap( vector_td<int,D> &grid_position, vector_td<unsigned int,D> &matrix_size_os )
+{
+  vector_td<int,D> zero(0);
+  grid_position += vector_less(grid_position, zero)*matrix_size_os;
+  grid_position -= vector_greater_equal(grid_position, matrix_size_os)* matrix_size_os;
+}
+
+template<class REAL, unsigned int D> __inline__ __device__ void
+NFFT_iterate_body( typename reald<REAL,D>::Type alpha, typename reald<REAL,D>::Type beta, REAL W, 
+		   vector_td<unsigned int, D> matrix_size_os, unsigned int number_of_batches, complext<REAL> * __restrict__ image,
+		   unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, vector_td<REAL,D> matrix_size_os_real, unsigned int sharedMemFirstSampleIdx,
+		   vector_td<REAL,D> sample_position, vector_td<int,D> grid_position )
+{
+      
+  // Calculate the distance between current sample and the grid cell
+  vector_td<REAL,D> grid_position_real = vector_td<REAL,D>(grid_position);
+  const vector_td<REAL,D> delta = abs(sample_position-grid_position_real);
+  const vector_td<REAL,D> half_W_vec(half_W );
+  
+  // If cell too distant from sample then move on to the next cell
+  if( weak_greater( delta, half_W_vec ))
+    return;
+
+  // Compute convolution weight.
+  const REAL weight = KaiserBessel<REAL>( delta, matrix_size_os_real, one_over_W, beta );
+
+  // Safety measure. We have occasionally observed a NaN from the KaiserBessel computation
+  if( !isfinite(weight) )
+    return;
+
+  // Resolve wrapping of grid position
+  resolve_wrap<D>( grid_position, matrix_size_os);
+
+  REAL *shared_mem = (REAL*) _shared_mem;
+  
+  for( unsigned int batch=0; batch<number_of_batches; batch++ ){
+    
+    // Read the grid cell value from global memory
+    const complext<REAL> grid_value = 
+      image[ (batch*gridDim.y+blockIdx.y)*prod(matrix_size_os) + co_to_idx<D>( vector_td<unsigned int, D>(grid_position), matrix_size_os ) ];
+    
+    // Add 'weight*grid_value' to the samples in shared memory
+    shared_mem[sharedMemFirstSampleIdx+(batch<<double_warp_size_power)] += (weight*grid_value.vec[0]);
+    shared_mem[sharedMemFirstSampleIdx+(batch<<double_warp_size_power)+warpSize] += (weight*grid_value.vec[1]);
+  }
+}
+
+//
+// This method is deliberately overloaded in 'UINTd' (rather than templatized) to improve performance of the loop iteration
+//
+
+template<class REAL> __inline__ __device__ void
+NFFT_iterate( typename reald<REAL,1>::Type alpha, typename reald<REAL,1>::Type beta, REAL W, 
+	      vector_td<unsigned int,1> matrix_size_os, unsigned int number_of_batches, complext<REAL> * __restrict__ image,
+	      unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, vector_td<REAL,1> matrix_size_os_real, unsigned int sharedMemFirstSampleIdx,
+	      vector_td<REAL,1> sample_position, vector_td<int,1> lower_limit, vector_td<int,1> upper_limit )
+{
+  // Iterate through all grid cells influencing the corresponding sample
+  for( int x = lower_limit.vec[0]; x<=upper_limit.vec[0]; x++ ){
+    
+    const intd<1>::Type grid_position(x);
+    
+    NFFT_iterate_body<REAL,1>( alpha, beta, W, matrix_size_os, number_of_batches, image, double_warp_size_power, half_W, 
+			       one_over_W, matrix_size_os_real, sharedMemFirstSampleIdx, sample_position, grid_position );
+  }
+}
+
+//
+// This method is deliberately overloaded in 'UINTd' (rather than templatized) to improve performance of the loop iteration
+//
+
+template<class REAL> __inline__ __device__ void
+NFFT_iterate( typename reald<REAL,2>::Type alpha, typename reald<REAL,2>::Type beta, REAL W, 
+	      vector_td<unsigned int,2> matrix_size_os, unsigned int number_of_batches, complext<REAL> * __restrict__ image,
+	      unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, vector_td<REAL,2> matrix_size_os_real, unsigned int sharedMemFirstSampleIdx,
+	      vector_td<REAL,2> sample_position, vector_td<int,2> lower_limit, vector_td<int,2> upper_limit )
+{
+  // Iterate through all grid cells influencing the corresponding sample
+  for( int y = lower_limit.vec[1]; y<=upper_limit.vec[1]; y++ ){
+    for( int x = lower_limit.vec[0]; x<=upper_limit.vec[0]; x++ ){
+      
+      const intd<2>::Type grid_position(x,y);
+      
+      NFFT_iterate_body<REAL,2>( alpha, beta, W, matrix_size_os, number_of_batches, image, double_warp_size_power, half_W, 
+				 one_over_W, matrix_size_os_real, sharedMemFirstSampleIdx, sample_position, grid_position );
+    }
+  }
+}
+
+//
+// This method is deliberately overloaded in 'd' (rather than templatized) to improve performance of the loop iteration
+//
+
+template<class REAL> __inline__ __device__ void
+NFFT_iterate( typename reald<REAL,3>::Type alpha, typename reald<REAL,3>::Type beta, REAL W, 
+	      vector_td<unsigned int,3> matrix_size_os, unsigned int number_of_batches, complext<REAL> * __restrict__ image,
+	      unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, vector_td<REAL,3> matrix_size_os_real, unsigned int sharedMemFirstSampleIdx,
+	      vector_td<REAL,3> sample_position, vector_td<int,3> lower_limit, vector_td<int,3> upper_limit )
+{
+  // Iterate through all grid cells influencing the corresponding sample
+  for( int z = lower_limit.vec[2]; z<=upper_limit.vec[2]; z++ ){
+    for( int y = lower_limit.vec[1]; y<=upper_limit.vec[1]; y++ ){
+      for( int x = lower_limit.vec[0]; x<=upper_limit.vec[0]; x++ ){
+	
+	const intd<3>::Type grid_position(x,y,z);
+	
+	NFFT_iterate_body<REAL,3>( alpha, beta, W, matrix_size_os, number_of_batches, image, double_warp_size_power, half_W, 
+				   one_over_W, matrix_size_os_real, sharedMemFirstSampleIdx, sample_position, grid_position );
+      }
+    }
+  }
+}
+
+//
+// This method is deliberately overloaded in 'd' (rather than templatized) to improve performance of the loop iteration
+//
+
+template<class REAL> __inline__ __device__ void
+NFFT_iterate( typename reald<REAL,4>::Type alpha, typename reald<REAL,4>::Type beta, REAL W, 
+	      vector_td<unsigned int,4> matrix_size_os, unsigned int number_of_batches, complext<REAL> * __restrict__ image,
+	      unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, vector_td<REAL,4> matrix_size_os_real, unsigned int sharedMemFirstSampleIdx,
+	      vector_td<REAL,4> sample_position, vector_td<int,4> lower_limit, vector_td<int,4> upper_limit )
+{
+  // Iterate through all grid cells influencing the corresponding sample
+  for( int w = lower_limit.vec[3]; w<=upper_limit.vec[3]; w++ ){
+    for( int z = lower_limit.vec[2]; z<=upper_limit.vec[2]; z++ ){
+      for( int y = lower_limit.vec[1]; y<=upper_limit.vec[1]; y++ ){
+	for( int x = lower_limit.vec[0]; x<=upper_limit.vec[0]; x++ ){
+	  
+	  const intd<4>::Type grid_position(x,y,z,w);
+	  
+	  NFFT_iterate_body<REAL,4>( alpha, beta, W, matrix_size_os, number_of_batches, image, double_warp_size_power, half_W, 
+				     one_over_W, matrix_size_os_real, sharedMemFirstSampleIdx, sample_position, grid_position );
+	}
+      }
+    }
+  }
+}
+
+template<class REAL, unsigned int D> __inline__ __device__ void
+NFFT_convolve( typename reald<REAL,D>::Type alpha, typename reald<REAL,D>::Type beta, REAL W, 
+	       vector_td<unsigned int, D> matrix_size_os, vector_td<unsigned int, D> matrix_size_wrap, 
+	       unsigned int number_of_samples, unsigned int number_of_batches, const vector_td<REAL,D> * __restrict__ traj_positions, complext<REAL> * __restrict__ image,
+	       unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, vector_td<REAL,D> matrix_size_os_real,
+	       unsigned int globalThreadId, unsigned int sharedMemFirstSampleIdx )
+{
+  
+  // Sample position to convolve onto
+  // Computed in preprocessing, which included a wrap zone. Remove this wrapping.
+  const vector_td<REAL,D> half_wrap_real = vector_td<REAL,D>(matrix_size_wrap>>1);
+  const vector_td<REAL,D> sample_position = traj_positions[globalThreadId+blockIdx.y*number_of_samples]-half_wrap_real;
+  
+  // Half the kernel width
+  const vector_td<REAL,D> half_W_vec( half_W );
+  
+  // Limits of the subgrid to consider
+  const vector_td<int,D> lower_limit = vector_td<int,D>( ceil(sample_position-half_W_vec));
+  const vector_td<int,D> upper_limit = vector_td<int,D>( floor(sample_position+half_W_vec));
+
+  // Accumulate contributions from the grid
+  NFFT_iterate<REAL>( alpha, beta, W, matrix_size_os, number_of_batches, image, double_warp_size_power, 
+		      half_W, one_over_W, matrix_size_os_real, sharedMemFirstSampleIdx, sample_position, lower_limit, upper_limit );
+}
+
+//
+// kernel main
+//
+
+template<class REAL, unsigned int D> __global__ void
+NFFT_convolve_kernel( typename reald<REAL,D>::Type alpha, typename reald<REAL,D>::Type beta, REAL W, 
+		      vector_td<unsigned int, D> matrix_size_os, vector_td<unsigned int, D> matrix_size_wrap,
+		      unsigned int number_of_samples, unsigned int number_of_batches, 
+		      const vector_td<REAL,D> * __restrict__ traj_positions, complext<REAL> *image,  complext<REAL> * __restrict__ samples,
+		      unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, bool accumulate, vector_td<REAL,D> matrix_size_os_real )
+{
+
+  // Global thread number	
+  const unsigned int globalThreadId = (blockIdx.x*blockDim.x+threadIdx.x);
+
+  // Check if we are within bounds
+  if( globalThreadId >= number_of_samples )
+    return;
+  
+  // Number of reals to compute/output per thread
+  const unsigned int num_reals = number_of_batches<<1;
+  
+  // All shared memory reals corresponding to domain 'threadIdx.x' are located in bank threadIdx.x%warp_size to limit bank conflicts
+  const unsigned int scatterSharedMemStart = (threadIdx.x/warpSize)*warpSize;
+  const unsigned int scatterSharedMemStartOffset = threadIdx.x&(warpSize-1); // a faster way of saying (threadIdx.x%warpSize) 
+  const unsigned int sharedMemFirstSampleIdx = scatterSharedMemStart*num_reals + scatterSharedMemStartOffset;
+
+  REAL *shared_mem = (REAL*) _shared_mem;
+  const REAL zero = REAL(0);
+
+  // Initialize shared memory
+  for( unsigned int i=0; i<num_reals; i++ )
+    shared_mem[sharedMemFirstSampleIdx+warpSize*i] = zero;
+  
+  // Compute NFFT using arbitrary sample trajectories
+  NFFT_convolve<REAL,D>( alpha, beta, W, matrix_size_os, matrix_size_wrap, number_of_samples, number_of_batches, 
+			 traj_positions, image, double_warp_size_power, half_W, one_over_W, 
+			 matrix_size_os_real, globalThreadId, sharedMemFirstSampleIdx );
+  
+  // Output k-space image to global memory
+  NFFT_output<REAL>( number_of_samples, number_of_batches, samples, double_warp_size_power, globalThreadId, sharedMemFirstSampleIdx, accumulate );
+}
+
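The shared-memory layout used by NFFT_convolve_kernel can be made explicit with a small host-side sketch, not from the imported tree. It assumes warpSize == 32 and double_warp_size_power == 6 (log2 of 2*warpSize), i.e. each batch occupies a stride of 64 reals per warp; lanes of a warp then map to consecutive addresses and hence distinct banks.

    #include <cstdio>

    int main()
    {
        const unsigned int warp_size = 32;
        const unsigned int double_warp_size_power = 6;   // assumed: log2(2 * warp_size)
        const unsigned int number_of_batches = 3;
        const unsigned int num_reals = number_of_batches << 1;

        for (unsigned int tid = 0; tid < 64; tid++)       // two warps, for illustration
        {
            // same index arithmetic as in NFFT_convolve_kernel
            unsigned int start  = (tid / warp_size) * warp_size;
            unsigned int offset = tid & (warp_size - 1);
            unsigned int first  = start * num_reals + offset;

            for (unsigned int batch = 0; batch < number_of_batches; batch++)
            {
                unsigned int re = first + (batch << double_warp_size_power);
                unsigned int im = re + warp_size;
                printf("tid %2u batch %u: re -> %4u (bank %2u), im -> %4u (bank %2u)\n",
                       tid, batch, re, re % warp_size, im, im % warp_size);
            }
        }
        return 0;
    }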
diff --git a/toolboxes/nfft/gpu/NFFT_NC2C_atomic_conv_kernel.cu b/toolboxes/nfft/gpu/NFFT_NC2C_atomic_conv_kernel.cu
new file mode 100644
index 0000000..09ccc33
--- /dev/null
+++ b/toolboxes/nfft/gpu/NFFT_NC2C_atomic_conv_kernel.cu
@@ -0,0 +1,227 @@
+/*
+  CUDA implementation of the NFFT.
+
+  -----------
+
+  Accelerating the Non-equispaced Fast Fourier Transform on Commodity Graphics Hardware.
+  T.S. Sørensen, T. Schaeffter, K.Ø. Noe, M.S. Hansen. 
+  IEEE Transactions on Medical Imaging 2008; 27(4):538-547.
+
+  Real-time Reconstruction of Sensitivity Encoded Radial Magnetic Resonance Imaging Using a Graphics Processing Unit.
+  T.S. Sørensen, D. Atkinson, T. Schaeffter, M.S. Hansen.
+  IEEE Transactions on Medical Imaging 2009; 28(12): 1974-1985. 
+
+  Notice:
+  This version of the code uses atomic writes and thus differs from the two references above.
+*/
+
+//
+// There is no header file accompanying this kernel, so it makes most sense to read the code/file from the end and upwards
+//
+
+//
+// First the implementation of the inner-most loop
+// 
+
+template<class REAL, unsigned int D> static __inline__ __device__ void
+NFFT_iterate_body( typename reald<REAL,D>::Type alpha, typename reald<REAL,D>::Type beta, 
+		   REAL W, vector_td<unsigned int, D> matrix_size_os, 
+		   unsigned int number_of_batches, const complext<REAL> * __restrict__ samples,  complext<REAL> * __restrict__ image,
+		   unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, vector_td<REAL,D> matrix_size_os_real, 
+		   unsigned int frame, unsigned int num_frames,
+		   unsigned int num_samples_per_batch, unsigned int sample_idx_in_batch, 
+		   vector_td<REAL,D> sample_position, vector_td<int,D> grid_position )
+{
+  // Calculate the distance between current sample and the grid cell
+  vector_td<REAL,D> grid_position_real = vector_td<REAL,D>(grid_position);
+  const vector_td<REAL,D> delta = abs(sample_position-grid_position_real);
+  const vector_td<REAL,D> half_W_vec(half_W );
+  
+  // If cell too distant from sample then move on to the next cell
+  if( weak_greater( delta, half_W_vec ))
+    return;
+
+  // Compute convolution weight.
+  const REAL weight = KaiserBessel<REAL>( delta, matrix_size_os_real, one_over_W, beta );
+  
+  // Safety measure. We have occasionally observed a NaN from the KaiserBessel computation
+  if( !isfinite(weight) )
+    return;
+
+  // Resolve wrapping of grid position
+  resolve_wrap<D>( grid_position, matrix_size_os );
+
+  for( unsigned int batch=0; batch<number_of_batches; batch++ ){
+
+    // Read the grid sample value from global memory
+    complext<REAL> sample_value = samples[sample_idx_in_batch+batch*num_samples_per_batch];
+    
+    // Determine the grid cell idx
+    unsigned int grid_idx = 
+      (batch*num_frames+frame)*prod(matrix_size_os) + co_to_idx<D>( vector_td<unsigned int, D>(grid_position), matrix_size_os );
+
+    // Atomic update of real and imaginary component
+    atomicAdd( &(((REAL*)image)[(grid_idx<<1)+0]), weight*real(sample_value) );
+    atomicAdd( &(((REAL*)image)[(grid_idx<<1)+1]), weight*imag(sample_value) );
+  }
+}
+
+//
+// This method is deliberately overloaded in 'UINTd' (rather than templatized) to improve performance of the loop iteration
+//
+
+template<class REAL> __inline__ __device__ void
+NFFT_iterate( typename reald<REAL,1>::Type alpha, typename reald<REAL,1>::Type beta, 
+	      REAL W, vector_td<unsigned int,1> matrix_size_os, 
+	      unsigned int number_of_batches, const complext<REAL> * __restrict__ samples, complext<REAL> * __restrict__ image,
+	      unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, 
+	      vector_td<REAL,1> matrix_size_os_real, 
+	      unsigned int frame, unsigned int num_frames, 
+	      unsigned int num_samples_per_batch, unsigned int sample_idx_in_batch, 
+	      vector_td<REAL,1> sample_position, vector_td<int,1> lower_limit, vector_td<int,1> upper_limit )
+{
+  // Iterate through all grid cells influencing the corresponding sample
+  for( int x = lower_limit.vec[0]; x<=upper_limit.vec[0]; x++ ){
+    
+    const intd<1>::Type grid_position(x);
+    
+    NFFT_iterate_body<REAL,1>( alpha, beta, W, matrix_size_os, number_of_batches, samples, image, double_warp_size_power, 
+			       half_W, one_over_W, matrix_size_os_real, frame, num_frames,
+			       num_samples_per_batch, sample_idx_in_batch, sample_position, grid_position );
+  }
+}
+
+//
+// This method is deliberately overloaded in 'd' (rather than templatized) to improve performance of the loop iteration
+//
+
+template<class REAL> __inline__ __device__ void
+NFFT_iterate( typename reald<REAL,2>::Type alpha, typename reald<REAL,2>::Type beta, 
+	      REAL W, vector_td<unsigned int,2> matrix_size_os, 
+	      unsigned int number_of_batches, const complext<REAL> * __restrict__ samples, complext<REAL> * __restrict__ image,
+	      unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, 
+	      vector_td<REAL,2> matrix_size_os_real, 
+	      unsigned int frame, unsigned int num_frames, 
+	      unsigned int num_samples_per_batch, unsigned int sample_idx_in_batch, 
+	      vector_td<REAL,2> sample_position, vector_td<int,2> lower_limit, vector_td<int,2> upper_limit )
+{
+  // Iterate through all grid cells influencing the corresponding sample
+  for( int y = lower_limit.vec[1]; y<=upper_limit.vec[1]; y++ ){
+    for( int x = lower_limit.vec[0]; x<=upper_limit.vec[0]; x++ ){
+      
+      const intd<2>::Type grid_position(x,y);
+      
+      NFFT_iterate_body<REAL,2>( alpha, beta, W, matrix_size_os, number_of_batches, samples, image, double_warp_size_power, 
+				 half_W, one_over_W, matrix_size_os_real, frame, num_frames,
+				 num_samples_per_batch, sample_idx_in_batch, sample_position, grid_position );
+    }
+  }
+}
+
+//
+// This method is deliberately overloaded in 'd' (rather than templatized) to improve performance of the loop iteration
+//
+
+template<class REAL> __inline__ __device__ void
+NFFT_iterate( typename reald<REAL,3>::Type alpha, typename reald<REAL,3>::Type beta, 
+	      REAL W, vector_td<unsigned int,3> matrix_size_os, 
+	      unsigned int number_of_batches, const complext<REAL> * __restrict__ samples, complext<REAL> * __restrict__ image,
+	      unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, 
+	      vector_td<REAL,3> matrix_size_os_real, 
+	      unsigned int frame, unsigned int num_frames, 	      
+	      unsigned int num_samples_per_batch, unsigned int sample_idx_in_batch, 
+	      vector_td<REAL,3> sample_position, vector_td<int,3> lower_limit, vector_td<int,3> upper_limit )
+{
+  // Iterate through all grid cells influencing the corresponding sample
+  for( int z = lower_limit.vec[2]; z<=upper_limit.vec[2]; z++ ){
+    for( int y = lower_limit.vec[1]; y<=upper_limit.vec[1]; y++ ){
+      for( int x = lower_limit.vec[0]; x<=upper_limit.vec[0]; x++ ){
+	
+	const intd<3>::Type grid_position(x,y,z);
+	
+	NFFT_iterate_body<REAL,3>( alpha, beta, W, matrix_size_os, number_of_batches, samples, image, double_warp_size_power, 
+				   half_W, one_over_W, matrix_size_os_real, frame, num_frames,
+				   num_samples_per_batch, sample_idx_in_batch, sample_position, grid_position );
+      }
+    }
+  }
+}
+
+//
+// This method is deliberately overloaded in 'd' (rather than templatized) to improve performance of the loop iteration
+//
+
+template<class REAL> __inline__ __device__ void
+NFFT_iterate( typename reald<REAL,4>::Type alpha, typename reald<REAL,4>::Type beta, 
+	      REAL W, vector_td<unsigned int,4> matrix_size_os, 
+	      unsigned int number_of_batches, const complext<REAL> * __restrict__ samples, complext<REAL> * __restrict__ image,
+	      unsigned int double_warp_size_power, REAL half_W, REAL one_over_W,
+	      vector_td<REAL,4> matrix_size_os_real, 
+	      unsigned int frame, unsigned int num_frames, 
+	      unsigned int num_samples_per_batch, unsigned int sample_idx_in_batch, 
+	      vector_td<REAL,4> sample_position, vector_td<int,4> lower_limit, vector_td<int,4> upper_limit )
+{
+  // Iterate through all grid cells influencing the corresponding sample
+  for( int w = lower_limit.vec[3]; w<=upper_limit.vec[3]; w++ ){
+    for( int z = lower_limit.vec[2]; z<=upper_limit.vec[2]; z++ ){
+      for( int y = lower_limit.vec[1]; y<=upper_limit.vec[1]; y++ ){
+	for( int x = lower_limit.vec[0]; x<=upper_limit.vec[0]; x++ ){
+	  
+	  const intd<4>::Type grid_position(x,y,z,w);
+	  
+	  NFFT_iterate_body<REAL,4>( alpha, beta, W, matrix_size_os, number_of_batches, samples, image, double_warp_size_power, 
+				     half_W, one_over_W, matrix_size_os_real, frame, num_frames,
+				     num_samples_per_batch, sample_idx_in_batch, sample_position, grid_position );
+	}
+      }
+    }
+  }
+}
+
+//
+// kernel main
+//
+
+template<class REAL, unsigned int D> __global__ void
+NFFT_H_atomic_convolve_kernel( typename reald<REAL,D>::Type alpha, typename reald<REAL,D>::Type beta, REAL W, 
+			       vector_td<unsigned int, D> matrix_size_os, vector_td<unsigned int, D> matrix_size_wrap,
+			       unsigned int num_samples_per_frame, unsigned int num_batches, 
+			       const vector_td<REAL,D> * __restrict__ traj_positions, const complext<REAL> * __restrict__ samples, complext<REAL> * __restrict__ image,
+			       unsigned int double_warp_size_power, REAL half_W, REAL one_over_W,
+			       vector_td<REAL,D> matrix_size_os_real )
+{
+  
+  // A runtime check will prevent this kernel from being run on devices of compute capability 1.x.
+  //
+  
+#if(__CUDA_ARCH__>=200)
+    
+  const unsigned int sample_idx_in_frame = (blockIdx.x*blockDim.x+threadIdx.x);
+
+  // Check if we are within bounds
+  if( sample_idx_in_frame >= num_samples_per_frame )
+    return;
+      
+  const unsigned int frame = blockIdx.y;
+  const unsigned int num_frames = gridDim.y;
+  const unsigned int num_samples_per_batch = num_samples_per_frame*num_frames ;
+  const unsigned int sample_idx_in_batch = sample_idx_in_frame+frame*num_samples_per_frame;
+  
+  // Sample position computed in preprocessing includes a wrap zone. Remove this wrapping.
+  const vector_td<REAL,D> half_wrap_real = vector_td<REAL,D>(matrix_size_wrap>>1);
+  const vector_td<REAL,D> sample_position = traj_positions[sample_idx_in_batch]-half_wrap_real;
+  
+  // Half the kernel width
+  const vector_td<REAL,D> half_W_vec = vector_td<REAL,D>( half_W );
+  
+  // Limits of the subgrid to consider
+  const vector_td<int,D> lower_limit = vector_td<int,D>( ceil(sample_position-half_W_vec));
+  const vector_td<int,D> upper_limit = vector_td<int,D>( floor(sample_position+half_W_vec));
+
+  // Output to the grid
+  NFFT_iterate<REAL>( alpha, beta, W, matrix_size_os, num_batches, samples, image, double_warp_size_power, 
+		      half_W, one_over_W, matrix_size_os_real, 
+		      frame, num_frames, num_samples_per_batch, sample_idx_in_batch, 
+		      sample_position, lower_limit, upper_limit );
+#endif
+}
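As the notice above says, this variant scatters samples directly onto the grid with atomic writes. There is no atomic operation on a whole complex value, so NFFT_iterate_body issues two independent atomicAdd calls for the real and imaginary parts. A host-side C++ analogue of that per-component update, a sketch only and not part of the imported sources:

    #include <atomic>
    #include <complex>

    // Portable atomic add on a float, emulating CUDA's atomicAdd with a CAS loop.
    static void atomic_add_float(std::atomic<float>& target, float value)
    {
        float old = target.load(std::memory_order_relaxed);
        while (!target.compare_exchange_weak(old, old + value, std::memory_order_relaxed))
        {
            // 'old' is refreshed on failure; retry until the update succeeds
        }
    }

    // Accumulate weight * sample into a grid cell stored as two separate floats,
    // mirroring the two atomicAdd calls on the interleaved real/imaginary components.
    void accumulate_cell(std::atomic<float>& cell_re, std::atomic<float>& cell_im,
                         float weight, std::complex<float> sample)
    {
        atomic_add_float(cell_re, weight * sample.real());
        atomic_add_float(cell_im, weight * sample.imag());
    }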
diff --git a/toolboxes/nfft/gpu/NFFT_NC2C_conv_kernel.cu b/toolboxes/nfft/gpu/NFFT_NC2C_conv_kernel.cu
new file mode 100644
index 0000000..ed2ed2c
--- /dev/null
+++ b/toolboxes/nfft/gpu/NFFT_NC2C_conv_kernel.cu
@@ -0,0 +1,142 @@
+/*
+  CUDA implementation of the NFFT.
+
+  -----------
+
+  Accelerating the Non-equispaced Fast Fourier Transform on Commodity Graphics Hardware.
+  T.S. Sørensen, T. Schaeffter, K.Ø. Noe, M.S. Hansen. 
+  IEEE Transactions on Medical Imaging 2008; 27(4):538-547.
+
+  Real-time Reconstruction of Sensitivity Encoded Radial Magnetic Resonance Imaging Using a Graphics Processing Unit.
+  T.S. Sørensen, D. Atkinson, T. Schaeffter, M.S. Hansen.
+  IEEE Transactions on Medical Imaging 2009; 28(12): 1974-1985. 
+*/
+
+//
+// There is no header file accompanying this kernel, so it makes most sense to read the code/file from the end and upwards
+//
+
+//
+// Transfer result from shared memory to global memory.
+//
+
+template<class REAL> __inline__ __device__ void 
+NFFT_H_output( unsigned int number_of_batches, complext<REAL>* __restrict__ image,
+	       unsigned int double_warp_size_power, unsigned int number_of_domains, 
+	       unsigned int globalThreadId, unsigned int sharedMemFirstCellIdx )
+{
+
+  REAL *shared_mem = (REAL*) _shared_mem;
+  
+  for( unsigned int batch=0; batch<number_of_batches; batch++ ){
+    complext<REAL>cell_coefficient;
+    cell_coefficient.vec[0] = shared_mem[sharedMemFirstCellIdx+(batch<<double_warp_size_power)];
+    cell_coefficient.vec[1] = shared_mem[sharedMemFirstCellIdx+(batch<<double_warp_size_power)+warpSize];
+    image[(batch*gridDim.y+blockIdx.y)*number_of_domains+globalThreadId] = cell_coefficient;
+  }
+}
+
+
+template<class REAL, unsigned int D> __inline__ __device__ void
+NFFT_H_convolve( typename reald<REAL,D>::Type alpha, typename reald<REAL,D>::Type beta, REAL W, 
+		 unsigned int number_of_samples, unsigned int number_of_batches, unsigned int number_of_domains,
+		 const vector_td<REAL,D> * __restrict__ traj_positions, complext<REAL>*samples, const unsigned int * __restrict__ tuples_last,
+		 const unsigned int * __restrict__ bucket_begin, const unsigned int * __restrict__ bucket_end,
+		 unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, vector_td<REAL,D> matrix_size_os_real, 
+		 unsigned int globalThreadId, vector_td<unsigned int,D> domainPos, unsigned int sharedMemFirstCellIdx )
+{
+
+  REAL *shared_mem = (REAL*) _shared_mem;
+
+  // Cell position as reald
+  vector_td<REAL,D> cell_pos = vector_td<REAL,D>( domainPos );
+  
+  // Convolve samples onto the domain (shared memory)
+  const unsigned int frame_offset = blockIdx.y*number_of_domains;
+  for( unsigned int i=bucket_begin[globalThreadId+frame_offset]; i<bucket_end[globalThreadId+frame_offset]; i++ )
+    {
+      // Safety precaution TODO
+      unsigned int sampleIdx = tuples_last[i];
+
+      // Safety precaution TODO
+      vector_td<REAL,D> sample_pos = traj_positions[sampleIdx];
+      
+      // Calculate the distance between the cell and the sample
+      vector_td<REAL,D> delta = abs(sample_pos-cell_pos);
+      vector_td<REAL,D> half_W_vec( half_W );
+  
+      // Check if sample will contribute
+      if( weak_greater(delta, half_W_vec ))
+	continue;
+      
+      // Compute convolution weights
+      REAL weight = KaiserBessel<REAL>( delta, matrix_size_os_real, one_over_W, beta );
+      
+      // Safety measure
+      if( !isfinite(weight) )
+      	continue;
+      
+      // Apply Kaiser-Bessel filter to input images
+      for( unsigned int batch=0; batch<number_of_batches; batch++ ){
+	
+	complext<REAL>sample_val = samples[sampleIdx+batch*gridDim.y*number_of_samples];
+
+	// Apply filter to shared memory domain. 
+	shared_mem[sharedMemFirstCellIdx+(batch<<double_warp_size_power)] += (weight*sample_val.vec[0]);
+	shared_mem[sharedMemFirstCellIdx+(batch<<double_warp_size_power)+warpSize] += (weight*sample_val.vec[1]);
+      }
+    }
+}
+
+//
+// kernel main
+//
+
+template<class REAL, unsigned int D> __global__ void
+NFFT_H_convolve_kernel( typename reald<REAL,D>::Type alpha, typename reald<REAL,D>::Type beta, REAL W,
+			vector_td<unsigned int,D> domain_count_grid, unsigned int number_of_samples, unsigned int number_of_batches,
+			const vector_td<REAL,D> * __restrict__ traj_positions, complext<REAL>* __restrict__ image, complext<REAL>* __restrict__ samples,
+			const unsigned int * __restrict__ tuples_last, const unsigned int * __restrict__ bucket_begin, const unsigned int * __restrict__ bucket_end,
+			unsigned int double_warp_size_power,
+			REAL half_W, REAL one_over_W, vector_td<REAL,D> matrix_size_os_real )
+{
+  
+  // Global thread index
+  const unsigned int index = blockIdx.x*blockDim.x + threadIdx.x;
+
+  // Number of domains
+  const unsigned int number_of_domains = prod(domain_count_grid);
+
+  // Check if we are within bounds
+  if( index >= number_of_domains )
+    return;
+  
+  // Mapped global thread index (actually we don't use a map currently)
+  const unsigned int domainIdx = index; 
+
+  // Compute global domain position
+  const vector_td<unsigned int,D> domainPos = idx_to_co<D>( domainIdx, domain_count_grid );
+	
+  // Number of cells
+  const unsigned int num_reals = number_of_batches<<1;
+
+  // All shared memory floats corresponding to domain 'threadIdx.x' are located in bank threadIdx.x%warp_size to limit bank conflicts
+  const unsigned int scatterSharedMemStart = (threadIdx.x/warpSize)*warpSize;
+  const unsigned int scatterSharedMemStartOffset = threadIdx.x&(warpSize-1); // a faster way of saying (threadIdx.x%warpSize) 
+  const unsigned int sharedMemFirstCellIdx = scatterSharedMemStart*num_reals + scatterSharedMemStartOffset;
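+  // Layout sketch for the indexing above (assuming warpSize == 32): for threadIdx.x == 70,
+  // scatterSharedMemStart == (70/32)*32 == 64 and scatterSharedMemStartOffset == 70 & 31 == 6,
+  // so this thread's first cell sits at 64*num_reals + 6 and consecutive cells are
+  // warpSize entries apart (cf. the initialization loop below).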
+
+  REAL *shared_mem = (REAL*) _shared_mem;
+
+  // Initialize shared memory
+  for( unsigned int i=0; i<num_reals; i++ )
+    shared_mem[sharedMemFirstCellIdx+warpSize*i] = REAL(0);
+  
+  // Compute NFFT using arbitrary sample trajectories.
+  NFFT_H_convolve<REAL, D>
+    ( alpha, beta, W, number_of_samples, number_of_batches, number_of_domains,
+      traj_positions, samples, tuples_last, bucket_begin, bucket_end,
+      double_warp_size_power, half_W, one_over_W,  matrix_size_os_real, index, domainPos, sharedMemFirstCellIdx );
+  
+  // Output k-space image to global memory
+  NFFT_H_output<REAL>( number_of_batches, image, double_warp_size_power, number_of_domains, index, sharedMemFirstCellIdx );
+}
diff --git a/toolboxes/nfft/gpu/NFFT_preprocess_kernel.cu b/toolboxes/nfft/gpu/NFFT_preprocess_kernel.cu
new file mode 100644
index 0000000..94c892a
--- /dev/null
+++ b/toolboxes/nfft/gpu/NFFT_preprocess_kernel.cu
@@ -0,0 +1,171 @@
+//
+// NFFT_H preprocessing kernels
+//
+
+// convert input trajectory in [-1/2;1/2] to [0;matrix_size_os+matrix_size_wrap]
+
+template<class REAL, unsigned int D> struct trajectory_scale
+{
+  typename reald<REAL,D>::Type matrix, bias;
+  
+  trajectory_scale( const typename reald<REAL,D>::Type &m, const typename reald<REAL,D>::Type &b ){
+    matrix = m;
+    bias = b;
+  }
+  
+  __host__ __device__
+  typename reald<REAL,D>::Type operator()(const typename reald<REAL,D>::Type &in) const { 
+    return component_wise_mul<REAL,D>(in,matrix)+bias;
+  }
+};
+
+template<class REAL, unsigned int D>
+struct compute_num_cells_per_sample
+{
+  __host__ __device__
+  compute_num_cells_per_sample(REAL _half_W) : half_W(_half_W) {}
+  
+  __host__ __device__
+  unsigned int operator()(typename reald<REAL,D>::Type p) const
+  {
+    unsigned int num_cells = 1;
+    for( unsigned int dim=0; dim<D; dim++ ){
+      unsigned int upper_limit = (unsigned int)floor((((float*)&p)[dim])+half_W);
+      unsigned int lower_limit = (unsigned int)ceil((((float*)&p)[dim])-half_W);
+      num_cells *= (upper_limit-lower_limit+1);
+    }
+    return num_cells;
+  }
+  
+  REAL half_W;
+};
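+
+// Illustrative evaluation of the functor above (hypothetical numbers): with W == 4
+// (half_W == 2) and a 2D sample at p == (10.3, 20.0), the per-dimension cell ranges are
+// [ceil(8.3); floor(12.3)] == [9; 12] (4 cells) and [ceil(18.0); floor(22.0)] == [18; 22]
+// (5 cells), so the functor returns 4*5 == 20 cells influenced by that sample.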
+
+template<class REAL> __inline__ __device__ void
+output_pairs( unsigned int sample_idx, unsigned int frame, 
+	      typename reald<REAL,1>::Type &p, typename uintd<1>::Type &matrix_size_os, typename uintd<1>::Type &matrix_size_wrap, 
+	      REAL half_W, const unsigned int * __restrict__ write_offsets, unsigned int * __restrict__ tuples_first, unsigned int * __restrict__ tuples_last )
+{
+  unsigned int lower_limit_x = (unsigned int)ceil(p.vec[0]-half_W);
+  unsigned int upper_limit_x = (unsigned int)floor(p.vec[0]+half_W);
+
+  unsigned int pair_idx = 0;
+  unsigned int write_offset = (sample_idx==0) ? 0 : write_offsets[sample_idx-1];
+  unsigned int frame_offset = frame*prod(matrix_size_os+matrix_size_wrap);
+  for( unsigned int x=lower_limit_x; x<=upper_limit_x; x++ ){
+    typename uintd<1>::Type co; co.vec[0] = x;
+    tuples_first[write_offset+pair_idx] = co_to_idx<1>(co, matrix_size_os+matrix_size_wrap)+frame_offset;
+    tuples_last[write_offset+pair_idx] = sample_idx;
+    pair_idx++;
+  }
+}
+
+template<class REAL> __inline__ __device__ void
+output_pairs( unsigned int sample_idx, unsigned int frame, 
+	      typename reald<REAL,2>::Type &p, typename uintd<2>::Type &matrix_size_os, typename uintd<2>::Type &matrix_size_wrap, 
+	      REAL half_W, const unsigned int * __restrict__ write_offsets, unsigned int * __restrict__ tuples_first, unsigned int * __restrict__ tuples_last )
+{
+  unsigned int lower_limit_x = (unsigned int)ceil(p.vec[0]-half_W);
+  unsigned int lower_limit_y = (unsigned int)ceil(p.vec[1]-half_W);
+  unsigned int upper_limit_x = (unsigned int)floor(p.vec[0]+half_W);
+  unsigned int upper_limit_y = (unsigned int)floor(p.vec[1]+half_W);
+
+  unsigned int pair_idx = 0;
+  unsigned int write_offset = (sample_idx==0) ? 0 : write_offsets[sample_idx-1];
+  unsigned int frame_offset = frame*prod(matrix_size_os+matrix_size_wrap);
+  for( unsigned int y=lower_limit_y; y<=upper_limit_y; y++ ){
+    for( unsigned int x=lower_limit_x; x<=upper_limit_x; x++ ){
+      typename uintd<2>::Type co; co.vec[0] = x; co.vec[1] = y;
+      tuples_first[write_offset+pair_idx] = co_to_idx<2>(co, matrix_size_os+matrix_size_wrap)+frame_offset;
+      tuples_last[write_offset+pair_idx] = sample_idx;
+      pair_idx++;
+    }
+  }
+}
+
+template <class REAL> __inline__ __device__ void
+output_pairs( unsigned int sample_idx, unsigned int frame, 
+	      typename reald<REAL,3>::Type &p, typename uintd<3>::Type &matrix_size_os, typename uintd<3>::Type &matrix_size_wrap, 
+	      REAL half_W, const unsigned int * __restrict__ write_offsets, unsigned int * __restrict__ tuples_first, unsigned int * __restrict__ tuples_last )
+{
+  unsigned int lower_limit_x = (unsigned int)ceil(p.vec[0]-half_W);
+  unsigned int lower_limit_y = (unsigned int)ceil(p.vec[1]-half_W);
+  unsigned int lower_limit_z = (unsigned int)ceil(p.vec[2]-half_W);
+  unsigned int upper_limit_x = (unsigned int)floor(p.vec[0]+half_W);
+  unsigned int upper_limit_y = (unsigned int)floor(p.vec[1]+half_W);
+  unsigned int upper_limit_z = (unsigned int)floor(p.vec[2]+half_W);
+
+  unsigned int pair_idx = 0;
+  unsigned int write_offset = (sample_idx==0) ? 0 : write_offsets[sample_idx-1];
+  unsigned int frame_offset = frame*prod(matrix_size_os+matrix_size_wrap);
+  for( unsigned int z=lower_limit_z; z<=upper_limit_z; z++ ){
+    for( unsigned int y=lower_limit_y; y<=upper_limit_y; y++ ){
+      for( unsigned int x=lower_limit_x; x<=upper_limit_x; x++ ){
+	typename uintd<3>::Type co; co.vec[0] = x; co.vec[1] = y; co.vec[2] = z;
+	tuples_first[write_offset+pair_idx] = co_to_idx<3>(co, matrix_size_os+matrix_size_wrap)+frame_offset;
+	tuples_last[write_offset+pair_idx] = sample_idx;
+	pair_idx++;
+      }
+    }
+  }
+}
+
+template <class REAL> __inline__ __device__ void
+output_pairs( unsigned int sample_idx, unsigned int frame, 
+	      typename reald<REAL,4>::Type &p, typename uintd<4>::Type &matrix_size_os, typename uintd<4>::Type &matrix_size_wrap, 
+	      REAL half_W, const unsigned int * __restrict__ write_offsets, unsigned int * __restrict__ tuples_first, unsigned int * __restrict__ tuples_last )
+{
+  unsigned int lower_limit_x = (unsigned int)ceil(p.vec[0]-half_W);
+  unsigned int lower_limit_y = (unsigned int)ceil(p.vec[1]-half_W);
+  unsigned int lower_limit_z = (unsigned int)ceil(p.vec[2]-half_W);
+  unsigned int lower_limit_w = (unsigned int)ceil(p.vec[3]-half_W);
+  unsigned int upper_limit_x = (unsigned int)floor(p.vec[0]+half_W);
+  unsigned int upper_limit_y = (unsigned int)floor(p.vec[1]+half_W);
+  unsigned int upper_limit_z = (unsigned int)floor(p.vec[2]+half_W);
+  unsigned int upper_limit_w = (unsigned int)floor(p.vec[3]+half_W);
+
+  unsigned int pair_idx = 0;
+  unsigned int write_offset = (sample_idx==0) ? 0 : write_offsets[sample_idx-1];
+  unsigned int frame_offset = frame*prod(matrix_size_os+matrix_size_wrap);
+  for( unsigned int w=lower_limit_w; w<=upper_limit_w; w++ ){
+    for( unsigned int z=lower_limit_z; z<=upper_limit_z; z++ ){
+      for( unsigned int y=lower_limit_y; y<=upper_limit_y; y++ ){
+	for( unsigned int x=lower_limit_x; x<=upper_limit_x; x++ ){
+	  typename uintd<4>::Type co; co.vec[0] = x; co.vec[1] = y; co.vec[2] = z; co.vec[3] = w;
+	  tuples_first[write_offset+pair_idx] = co_to_idx<4>(co, matrix_size_os+matrix_size_wrap)+frame_offset;
+	  tuples_last[write_offset+pair_idx] = sample_idx;
+	  pair_idx++;
+	}
+      }
+    }
+  }
+}
+
+template<class REAL, unsigned int D> __global__ void
+write_pairs_kernel( typename uintd<D>::Type matrix_size_os, typename uintd<D>::Type matrix_size_wrap, unsigned int num_samples_per_frame, REAL half_W, 
+		   const typename reald<REAL,D>::Type * __restrict__ traj_positions, unsigned int * __restrict__ write_offsets, unsigned int * __restrict__ tuples_first, unsigned int * __restrict__ tuples_last )
+{
+  // Get sample idx
+  unsigned int sample_idx = blockIdx.x*blockDim.x + threadIdx.x;
+  unsigned int frame = blockIdx.y;
+
+  if( sample_idx<num_samples_per_frame ){
+
+    sample_idx += frame*num_samples_per_frame;
+    typename reald<REAL,D>::Type p = traj_positions[sample_idx];
+    output_pairs<REAL>( sample_idx, frame, p, matrix_size_os, matrix_size_wrap, half_W, write_offsets, tuples_first, tuples_last );
+  }
+}
+
+template <class REAL, unsigned int D> void 
+write_pairs( typename uintd<D>::Type matrix_size_os, typename uintd<D>::Type matrix_size_wrap, unsigned int num_samples_per_frame, unsigned int num_frames, REAL W, 
+	     const typename reald<REAL,D>::Type * __restrict__ traj_positions, unsigned int * __restrict__ write_offsets, unsigned int * __restrict__ tuples_first, unsigned int * __restrict__ tuples_last )
+{  
+  dim3 blockDim(256);
+  dim3 gridDim((int)ceil((double)num_samples_per_frame/(double)blockDim.x), num_frames);
+
+  REAL half_W = REAL(0.5)*W;
+  write_pairs_kernel<REAL,D><<< gridDim, blockDim >>>
+    ( matrix_size_os, matrix_size_wrap, num_samples_per_frame, half_W, traj_positions, write_offsets, tuples_first, tuples_last );
+
+ CHECK_FOR_CUDA_ERROR();
+}
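+
+// Launch-configuration sketch for write_pairs above (illustrative numbers only): with
+// num_samples_per_frame == 1000 and blockDim.x == 256 the kernel runs on a
+// ceil(1000/256) == 4 x num_frames grid; the 24 surplus threads of the last block fail
+// the bounds check and write no (grid_idx, sample_idx) pairs.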
diff --git a/toolboxes/nfft/gpu/NFFT_sparseMatrix_kernel.cu b/toolboxes/nfft/gpu/NFFT_sparseMatrix_kernel.cu
new file mode 100644
index 0000000..59c203d
--- /dev/null
+++ b/toolboxes/nfft/gpu/NFFT_sparseMatrix_kernel.cu
@@ -0,0 +1,171 @@
+#include "vector_td.h"
+#include "vector_td_utilities.h"
+#include "setup_grid.h"
+#include "cuSparseMatrix.h"
+#include <thrust/fill.h>
+namespace Gadgetron {
+
+template<unsigned int N> struct iteration_counter{};
+
+template<class REAL, unsigned int D, unsigned int N> __device__ __inline__ REAL ndim_loop(const vector_td<REAL,D> point, unsigned int & loop_counter, complext<REAL> * __restrict__ weights,
+		int * __restrict__ column_indices,
+		vector_td<int, D> & grid_point, const vector_td<int,D> & image_dims, const REAL W, const vector_td<REAL,D> & beta, iteration_counter<N>){
+	REAL wsum = 0;
+	for ( int i = ::ceil(point[N]-W*0.5); i <= ::floor(point[N]+W*0.5); i++){
+		grid_point[N] = i;
+		wsum += ndim_loop(point,loop_counter,weights,column_indices,grid_point,image_dims,W,beta,iteration_counter<N-1>());
+	}
+	return wsum;
+
+
+}
+template<class REAL, unsigned int D> __device__ __inline__ REAL ndim_loop(const vector_td<REAL,D> point, unsigned int & loop_counter, complext<REAL> * __restrict__ weights,
+		int * __restrict__ column_indices,
+		vector_td<int, D> & grid_point, const vector_td<int,D> & image_dims, const REAL W, const vector_td<REAL,D> & beta, iteration_counter<0>){
+
+	REAL wsum =0;
+	for ( int i = ::ceil(point[0]-W*0.5); i <= ::floor(point[0]+W*0.5); i++){
+		grid_point[0] = i;
+		REAL weight = KaiserBessel<REAL>(abs(point-vector_td<REAL,D>(grid_point)),vector_td<REAL,D>(image_dims),REAL(1)/W,beta);
+		weights[loop_counter] = weight;
+		//column_indices[loop_counter] = co_to_idx(grid_point%image_dims,image_dims);
+		column_indices[loop_counter] = co_to_idx(grid_point%image_dims,image_dims);
+		loop_counter++;
+		wsum += weight;
+	}
+	return wsum;
+
+
+}
+
+template<class T> __device__ void index_sort(T* values, int* indices, int nvals){
+
+	// Insertion sort, as we anticipate the data to be mostly sorted already. It might be faster to just sort everything in a separate kernel.
+	for (int i = 0; i < nvals; i++){
+		int index = indices[i];
+		T val = values[i];
+		int j = i;
+		while (j > 0 && indices[j-1] > index){
+			indices[j] = indices[j-1];
+			values[j] = values[j-1];
+			j--;
+		}
+		indices[j] = index;
+		values[j] = val;
+
+	}
+
+}
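+
+// Illustrative behaviour of index_sort (hypothetical data): indices {5, 2, 7, 3} with
+// values {a, b, c, d} are rearranged in place to indices {2, 3, 5, 7} and values
+// {b, d, a, c}, i.e. both arrays end up ordered by column index.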
+
+/**
+ *
+ * @param points non-cartesian points on which to do the NFFT. Size tot_size
+ * @param offsets Array containing the offsets at which the rows should be stored. Size tot_size+1
+ * @param weights Output array which will contain the values of the sparse matrix.
+ * @param column_indices Output array containing the column indices of the sparse matrix
+ * @param tot_size Total number of points
+ */
+template<class REAL, unsigned int D> __global__ void make_NFFT_matrix_kernel(const vector_td<REAL,D> * __restrict__ points, const int * __restrict__ offsets, complext<REAL> * __restrict__ weights, int * __restrict__ column_indices, const vector_td<int,D> image_dims, const vector_td<REAL,D> beta, const REAL W, unsigned int tot_size ){
+	const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+
+	if (idx < tot_size){
+		vector_td<REAL,D> p = points[idx];
+		const int offset = offsets[idx];
+
+
+		complext<REAL> * local_weight = weights+offset;
+		int * local_column_indices = column_indices+offset;
+		unsigned int loop_counter = 0;
+		vector_td<int,D> grid_point;
+		REAL wsum = ndim_loop(p,loop_counter, local_weight,local_column_indices, grid_point,image_dims,W,beta,iteration_counter<D-1>());
+		REAL inv_wsum = 1.0/wsum;
+		for (unsigned int i = offset; i < offsets[idx+1]; i++){
+			weights[i] *= inv_wsum;
+		}
+		index_sort(local_weight,local_column_indices,offsets[idx+1]-offset);
+
+	}
+}
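+
+// CSR layout sketch for the kernel above (hypothetical offsets): if offsets == {0, 3, 7, ...},
+// the thread handling sample 0 writes its 3 (weight, column) pairs to positions 0..2 and the
+// thread for sample 1 writes its 4 pairs to positions 3..6; each row is then normalized by
+// its own wsum and sorted by column index via index_sort.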
+
+template<class REAL> void check_csrMatrix(cuCsrMatrix<complext<REAL> > &matrix){
+
+	if (matrix.csrRow.size() != matrix.m+1){
+		throw std::runtime_error("Malformed CSR matrix: length of CSR vector does not match matrix size");
+	}
+
+	if (matrix.csrColdnd.size() != matrix.nnz){
+		throw std::runtime_error("Malformed CSR matrix: length of column indices vector does not match number of non-zero elements");
+	}
+	if (matrix.data.size() != matrix.nnz ){
+		throw std::runtime_error("Malformed CSR matrix: length of data vector does not match number of non-zero elements");
+	}
+
+	int min_ind = *thrust::min_element(matrix.csrColdnd.begin(),matrix.csrColdnd.end());
+	int max_ind = *thrust::max_element(matrix.csrColdnd.begin(),matrix.csrColdnd.end());
+
+	if (min_ind < 0 || max_ind > matrix.n){
+		std::stringstream ss;
+		ss << "Malformed CSR matrix: column indices vector contains illegal values. Min " << min_ind<< " max " << max_ind;
+		throw std::runtime_error(ss.str());
+	}
+	int min_row = *thrust::min_element(matrix.csrRow.begin(),matrix.csrRow.end());
+	int max_row = *thrust::max_element(matrix.csrRow.begin(),matrix.csrRow.end());
+
+	if (min_row < 0 || max_row != matrix.nnz){
+		throw std::runtime_error("Malformed CSR matrix: CSR row vector contains illegal values");
+	}
+
+
+	if (isnan(abs(thrust::reduce(matrix.data.begin(),matrix.data.end()))))
+		throw std::runtime_error("Matrix contains NaN");
+
+
+}
+
+template<class REAL, unsigned int D> boost::shared_ptr<cuCsrMatrix<complext<REAL> > > make_NFFT_matrix( thrust::device_vector<vector_td<REAL,D> > & points,  const vector_td<size_t,D> image_dims, const vector_td<REAL,D> beta, const REAL W ){
+
+	boost::shared_ptr<cuCsrMatrix<complext<REAL> > > matrix(new cuCsrMatrix<complext<REAL> >);
+
+	matrix->csrRow = thrust::device_vector<int>(points.size()+1);
+	matrix->csrRow[0] = 0;
+	CHECK_FOR_CUDA_ERROR();
+
+	REAL half_W = REAL(0.5)*W;
+	{
+		thrust::device_vector<int> c_p_s(points.size());
+		thrust::transform(points.begin(), points.end(), c_p_s.begin(), compute_num_cells_per_sample<REAL,D>(half_W));
+
+		thrust::inclusive_scan( c_p_s.begin(), c_p_s.end(), matrix->csrRow.begin()+1, thrust::plus<int>()); // prefix sum
+
+
+	}
+	unsigned int num_pairs = matrix->csrRow.back();
+	//cuNDArray<int> row_indices(ind_dims);
+	matrix->csrColdnd = thrust::device_vector<int>(num_pairs);
+	matrix->data = thrust::device_vector<complext<REAL> >(num_pairs);
+	//cuNDArray<complext<REAL> > values(ind_dims);
+
+
+	dim3 dimBlock;
+	dim3 dimGrid;
+	setup_grid(points.size(),&dimBlock,&dimGrid);
+
+	make_NFFT_matrix_kernel<<<dimGrid,dimBlock>>>(thrust::raw_pointer_cast(&points[0]),thrust::raw_pointer_cast(&matrix->csrRow[0]), thrust::raw_pointer_cast(&matrix->data[0]), thrust::raw_pointer_cast(&matrix->csrColdnd[0]),vector_td<int,D>(image_dims),beta,W, points.size() );
+	cudaDeviceSynchronize();
+	CHECK_FOR_CUDA_ERROR();
+	matrix->m = points.size();
+	matrix->n = prod(image_dims);
+	matrix->nnz = num_pairs;
+
+ std::cout << " Matrix sum: " << thrust::reduce(matrix->data.begin(),matrix->data.end()) << std::endl;
+	//cusparseSet
+	//CUSPARSE_CALL(cusparseSetMatType(matrix->descr,CUSPARSE_MATRIX_TYPE_GENERAL));
+	//CUSPARSE_CALL(cusparseSetMatDiagType(matrix->descr,CUSPARSE_DIAG_TYPE_NON_UNIT));
+	//CUSPARSE_CALL(cusparseSetMatIndexBase(matrix->descr,CUSPARSE_INDEX_BASE_ZERO));
+
+	return matrix;
+}
+
+}
+
+
diff --git a/toolboxes/nfft/gpu/cuNFFT.cu b/toolboxes/nfft/gpu/cuNFFT.cu
new file mode 100644
index 0000000..655631f
--- /dev/null
+++ b/toolboxes/nfft/gpu/cuNFFT.cu
@@ -0,0 +1,1455 @@
+/*
+  CUDA implementation of the NFFT.
+
+  -----------
+
+  Accelerating the Non-equispaced Fast Fourier Transform on Commodity Graphics Hardware.
+  T.S. Sørensen, T. Schaeffter, K.Ø. Noe, M.S. Hansen. 
+  IEEE Transactions on Medical Imaging 2008; 27(4):538-547.
+
+  Real-time Reconstruction of Sensitivity Encoded Radial Magnetic Resonance Imaging Using a Graphics Processing Unit.
+  T.S. Sørensen, D. Atkinson, T. Schaeffter, M.S. Hansen.
+  IEEE Transactions on Medical Imaging 2009; 28(12): 1974-1985. 
+*/
+
+// Includes - Thrust
+#include <thrust/scan.h>
+#include <thrust/sort.h>
+#include <thrust/binary_search.h>
+#include <thrust/extrema.h>
+// Includes - Gadgetron
+#include "cuNFFT.h"
+#include "cuNDFFT.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+#include "vector_td_utilities.h"
+#include "vector_td_io.h"
+#include "cudaDeviceManager.h"
+#include "check_CUDA.h"
+
+// Includes - CUDA
+#include <device_functions.h>
+#include <math_constants.h>
+#include <cufft.h>
+
+
+// Includes - stdlibs
+#include <stdio.h>
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <cmath>
+#include <sstream>
+#include <stdexcept>
+
+//using namespace std;
+using std::vector;
+using namespace thrust;
+using namespace Gadgetron;
+
+// Kernel configuration  
+#define NFFT_MAX_COILS_COMPUTE_1x    8
+#define NFFT_MAX_COILS_COMPUTE_2x   16
+#define NFFT_THREADS_PER_KERNEL    192
+
+// Reference to shared memory
+extern __shared__ char _shared_mem[];
+
+// Includes containing the NFFT convolution implementation
+#include "KaiserBessel_kernel.cu"
+#include "NFFT_C2NC_conv_kernel.cu"
+#include "NFFT_NC2C_conv_kernel.cu"
+#include "NFFT_NC2C_atomic_conv_kernel.cu"
+#include "NFFT_preprocess_kernel.cu"
+
+// Default template arguments require C++0x?
+typedef float dummy;
+
+// The declaration of atomic/non-atomic NC2C convolution
+// We would love to hide this inside the class, but the compiler core dumps on us when we try...
+//
+template<class REAL, unsigned int D, bool ATOMICS> struct _convolve_NFFT_NC2C{
+  static bool apply( cuNFFT_plan<REAL,D,ATOMICS> *plan, 
+                     cuNDArray<complext<REAL> > *in, 
+                     cuNDArray<complext<REAL> > *out, 
+                     bool accumulate );
+};
+
+// Common multi-device handling: prepare
+//
+template<class I1, class I2, class I3>
+static bool prepare( int device, int *old_device, 
+                     cuNDArray<I1> *in1,       cuNDArray<I1> **in1_int,
+                     cuNDArray<I2> *in2 = 0x0, cuNDArray<I2> **in2_int = 0x0,
+                     cuNDArray<I3> *in3 = 0x0, cuNDArray<I3> **in3_int = 0x0 )
+{
+  // Get current Cuda device
+  if( cudaGetDevice(old_device) != cudaSuccess ) {
+    throw cuda_error("Error: cuNFFT : unable to get device no");
+  }
+
+  if( device != *old_device && cudaSetDevice(device) != cudaSuccess) {
+    throw cuda_error("Error : cuNFFT : unable to set device no");
+  }
+  
+  // Transfer arrays to compute device if necessary
+  if( in1 ){
+    if( device != in1->get_device() )
+      *in1_int = new cuNDArray<I1>(*in1); // device transfer
+    else
+      *in1_int = in1;
+  }
+  
+  if( in2 ){
+    if( device != in2->get_device() )
+      *in2_int = new cuNDArray<I2>(*in2); // device transfer
+    else
+      *in2_int = in2;
+  }
+
+  if( in3 ){
+    if( device != in3->get_device() )
+      *in3_int = new cuNDArray<I3>(*in3); // device transfer
+    else
+      *in3_int = in3;
+  }
+  
+  return true;
+}  
+
+// Common multi-device handling: restore
+//
+template<class I1, class I2, class I3>
+static bool restore( int old_device, cuNDArray<I1> *out, 
+                     cuNDArray<I1> *in1, cuNDArray<I1> *in1_int,
+                     cuNDArray<I2> *in2 = 0x0, cuNDArray<I2> *in2_int = 0x0,
+                     cuNDArray<I3> *in3 = 0x0, cuNDArray<I3> *in3_int = 0x0 )
+{
+  if( in1 && out && out->get_device() != in1_int->get_device() ){ 
+    *out = *in1_int; // device transfer by assignment
+  } 
+  
+  // Check if the internal arrays need deletion (they do only if they were created in ::prepare())
+  //
+  if( in1 && in1->get_device() != in1_int->get_device() ){
+    delete in1_int;
+  }   
+  if( in2 && in2->get_device() != in2_int->get_device() ){
+    delete in2_int;
+  }   
+  if( in3 && in3->get_device() != in3_int->get_device() ){
+    delete in3_int;
+  }   
+  
+  // Get current Cuda device
+  int device;
+  if( cudaGetDevice(&device) != cudaSuccess ) {
+    throw cuda_error("Error: cuNFFT : unable to get device no");
+  }
+  
+  // Restore old device
+  if( device != old_device && cudaSetDevice(old_device) != cudaSuccess) {
+    throw cuda_error("Error: cuNFFT : unable to restore device no");
+  }
+  
+  return true;
+}
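+
+// Sketch of how prepare()/restore() are paired in the methods below (mirroring fft()):
+//
+//   cuNDArray<complext<REAL> > *data_int = 0x0; int old_device;
+//   prepare<complext<REAL>,dummy,dummy>( device, &old_device, data, &data_int );
+//   // ... run kernels on data_int, which now resides on the plan's device ...
+//   restore<complext<REAL>,dummy,dummy>( old_device, data, data, data_int );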
+
+
+//
+// Public class methods
+//
+
+template<class REAL, unsigned int D, bool ATOMICS> 
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::cuNFFT_plan()
+{
+  // Minimal initialization
+  barebones();
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> 
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::cuNFFT_plan( typename uint64d<D>::Type matrix_size, typename uint64d<D>::Type matrix_size_os, REAL W, int device )
+{
+  // Minimal initialization
+  barebones();
+
+  // Setup plan
+  setup( matrix_size, matrix_size_os, W, device );
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> 
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::~cuNFFT_plan()
+{
+  wipe(NFFT_WIPE_ALL);
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> 
+void Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::setup( typename uint64d<D>::Type matrix_size, typename uint64d<D>::Type matrix_size_os, REAL W, int _device )
+{
+  // Free memory
+  wipe(NFFT_WIPE_ALL);
+
+  //
+  // Check if the device is valid
+  //
+
+  if( _device<0 ){
+    if( cudaGetDevice( &device ) != cudaSuccess ){
+      throw cuda_error("Error: cuNFFT_plan::setup: unable to determine device properties.");
+    }
+  }
+  else
+    device = _device;
+
+  // The convolution does not work properly for very small convolution kernel widths
+  // (experimentally observed limit)
+
+  if( W < REAL(1.8) ) {
+    throw std::runtime_error("Error: the convolution kernel width for the cuNFFT plan is too small.");
+  }
+
+  typename uint64d<D>::Type vec_warp_size( (size_t)(cudaDeviceManager::Instance()->warp_size(device)) );
+
+  //
+  // Check input against certain requirements
+  //
+  
+  if( sum(matrix_size%vec_warp_size) || sum(matrix_size_os%vec_warp_size) ){
+    //GDEBUG_STREAM("Matrix size: " << matrix_size << std::endl);
+    //GDEBUG_STREAM("Matrix size os: " << matrix_size_os << std::endl);
+    //GDEBUG_STREAM("Warp size: " << vec_warp_size << std::endl);
+    throw std::runtime_error("Error: Illegal matrix size for the cuNFFT plan (not a multiple of the warp size)");
+  }
+
+  //
+  // Setup private variables
+  //
+
+  this->matrix_size = matrix_size;
+  this->matrix_size_os = matrix_size_os;
+
+  REAL W_half = REAL(0.5)*W;
+  vector_td<REAL,D> W_vec(W_half);
+
+  matrix_size_wrap = vector_td<size_t,D>( ceil(W_vec) );
+  matrix_size_wrap<<=1; 
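+  // Wrap-size example (illustrative W only): for W == 5.5, W_half == 2.75 and
+  // ceil(W_vec) == 3, so matrix_size_wrap becomes 6 in every dimension, i.e.
+  // ceil(W/2) cells of wrap on either side of the oversampled grid.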
+  
+  alpha = vector_td<REAL,D>(matrix_size_os) / vector_td<REAL,D>(matrix_size);
+  
+  typename reald<REAL,D>::Type ones(REAL(1));
+  if( weak_less( alpha, ones ) ){
+    throw std::runtime_error("Error: cuNFFT : Illegal oversampling ratio suggested");
+  }
+
+  this->W = W;
+  
+  // Compute Kaiser-Bessel beta
+  compute_beta();
+  
+  int device_no_old;
+  if (cudaGetDevice(&device_no_old) != cudaSuccess) {
+    throw cuda_error("Error: cuNFFT_plan::setup: unable to get device no");
+  }  
+  if( device != device_no_old && cudaSetDevice(device) != cudaSuccess) {
+    throw cuda_error("Error: cuNFFT_plan::setup: unable to set device");
+  }  
+  initialized = true;
+
+  if( device != device_no_old && cudaSetDevice(device_no_old) != cudaSuccess) {
+    throw cuda_error("Error: cuNFFT_plan::setup: unable to restore device");
+  }
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> 
+void Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::preprocess( cuNDArray<typename reald<REAL,D>::Type> *trajectory, NFFT_prep_mode mode )
+{
+  if( !trajectory || trajectory->get_number_of_elements()==0 ){
+    throw std::runtime_error("Error: cuNFFT_plan::preprocess: invalid trajectory");
+  }
+  
+  if( !initialized ){
+    throw std::runtime_error("Error: cuNFFT_plan::preprocess: cuNFFT_plan::setup must be invoked prior to preprocessing.");
+  }
+
+  wipe(NFFT_WIPE_PREPROCESSING);
+
+  cuNDArray<typename reald<REAL,D>::Type> *trajectory_int;
+  int old_device;
+
+  if( !prepare<typename reald<REAL,D>::Type,dummy,dummy>(device, &old_device, trajectory, &trajectory_int ) ){
+    throw cuda_error("Error: cuNFFT_plan::preprocess: device preparation error.");
+  }
+    
+  number_of_samples = trajectory_int->get_size(0);
+  number_of_frames = trajectory_int->get_number_of_elements()/number_of_samples;
+
+  // Make sure that the trajectory values are within range [-1/2;1/2]
+  thrust::pair< thrust::device_ptr<REAL>, thrust::device_ptr<REAL> > mm_pair = 
+    thrust::minmax_element( device_pointer_cast<REAL>((REAL*)trajectory_int->get_data_ptr()), 
+                            device_pointer_cast<REAL>(((REAL*)trajectory_int->get_data_ptr())+trajectory_int->get_number_of_elements()*D ));
+  
+  if( *mm_pair.first < REAL(-0.5) || *mm_pair.second > REAL(0.5) ){
+	  std::stringstream ss;
+	  ss << "Error: cuNFFT::preprocess : trajectory [" << *mm_pair.first << "; " << *mm_pair.second << "] out of range [-1/2;1/2]";
+    throw std::runtime_error(ss.str());
+  }
+  
+  // Make Thrust device vector of trajectory and samples
+  device_vector< vector_td<REAL,D> > trajectory_positions_in
+    ( device_pointer_cast< vector_td<REAL,D> >(trajectory_int->get_data_ptr()), 
+      device_pointer_cast< vector_td<REAL,D> >(trajectory_int->get_data_ptr()+trajectory_int->get_number_of_elements() ));
+  
+  trajectory_positions = new device_vector< vector_td<REAL,D> >( trajectory_int->get_number_of_elements() );
+
+  CHECK_FOR_CUDA_ERROR();
+
+  vector_td<REAL,D> matrix_size_os_real = vector_td<REAL,D>( matrix_size_os );
+  vector_td<REAL,D> matrix_size_os_plus_wrap_real = vector_td<REAL,D>( (matrix_size_os+matrix_size_wrap)>>1 );
+
+  // convert input trajectory in [-1/2;1/2] to [0;matrix_size_os]
+  thrust::transform( trajectory_positions_in.begin(), trajectory_positions_in.end(), trajectory_positions->begin(), 
+                     trajectory_scale<REAL,D>(matrix_size_os_real, matrix_size_os_plus_wrap_real) );
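+  // Scaling sketch (illustrative sizes): with matrix_size_os == 128 and matrix_size_wrap == 8
+  // in a dimension, the bias is (128+8)/2 == 68, so an input coordinate of -0.5 maps to
+  // -0.5*128 + 68 == 4, 0.0 maps to 68 and 0.5 maps to 132, i.e. inside [0; 136].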
+  
+  CHECK_FOR_CUDA_ERROR();
+
+  if( !( mode == NFFT_PREP_C2NC || ATOMICS )){
+
+    // allocate storage for and compute temporary prefix-sum variable (#cells influenced per sample)
+    device_vector<unsigned int> c_p_s(trajectory_int->get_number_of_elements());
+    device_vector<unsigned int> c_p_s_ps(trajectory_int->get_number_of_elements());
+    CHECK_FOR_CUDA_ERROR();
+    
+    REAL half_W = REAL(0.5)*W;
+    thrust::plus<unsigned int> binary_op;
+    thrust::transform(trajectory_positions->begin(), trajectory_positions->end(), c_p_s.begin(), compute_num_cells_per_sample<REAL,D>(half_W));
+    inclusive_scan( c_p_s.begin(), c_p_s.end(), c_p_s_ps.begin(), binary_op ); // prefix sum
+    
+    // Build the vector of (grid_idx, sample_idx) tuples. Actually kept in two separate vectors.
+    unsigned int num_pairs = c_p_s_ps.back();
+    c_p_s.clear();
+
+    thrust::device_vector<unsigned int> *tuples_first = new device_vector<unsigned int>(num_pairs);
+    tuples_last = new device_vector<unsigned int>(num_pairs);
+    
+    CHECK_FOR_CUDA_ERROR();
+    
+    // Fill tuple vector
+    write_pairs<REAL,D>( vector_td<unsigned int,D>(matrix_size_os), vector_td<unsigned int,D>(matrix_size_wrap), number_of_samples, number_of_frames, W,
+                         raw_pointer_cast(&(*trajectory_positions)[0]), raw_pointer_cast(&c_p_s_ps[0]), 
+                         raw_pointer_cast(&(*tuples_first)[0]), raw_pointer_cast(&(*tuples_last)[0]) );
+    c_p_s_ps.clear();
+
+    // Sort by grid indices
+    sort_by_key( tuples_first->begin(), tuples_first->end(), tuples_last->begin() );
+    
+    // each bucket_begin[i] indexes the first element of bucket i's list of points
+    // each bucket_end[i] indexes one past the last element of bucket i's list of points
+
+    bucket_begin = new device_vector<unsigned int>(number_of_frames*prod(matrix_size_os+matrix_size_wrap));
+    bucket_end   = new device_vector<unsigned int>(number_of_frames*prod(matrix_size_os+matrix_size_wrap));
+    
+    CHECK_FOR_CUDA_ERROR();
+    
+    // find the beginning of each bucket's list of points
+    counting_iterator<unsigned int> search_begin(0);
+    lower_bound(tuples_first->begin(), tuples_first->end(), search_begin, search_begin + number_of_frames*prod(matrix_size_os+matrix_size_wrap), bucket_begin->begin() );
+    
+    // find the end of each bucket's list of points
+    upper_bound(tuples_first->begin(), tuples_first->end(), search_begin, search_begin + number_of_frames*prod(matrix_size_os+matrix_size_wrap), bucket_end->begin() );
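+    // Bucket sketch (hypothetical sorted grid indices): if tuples_first is {0, 0, 2, 2, 2, 5},
+    // lower_bound/upper_bound give bucket 0 the range [0;2), bucket 1 the empty range [2;2),
+    // bucket 2 the range [2;5) and bucket 5 the range [5;6).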
+  
+    delete tuples_first;
+  }
+
+  preprocessed_C2NC = true;
+
+  if( mode != NFFT_PREP_C2NC )
+    preprocessed_NC2C = true;
+
+  if( !restore<typename reald<REAL,D>::Type,dummy,dummy>(old_device, trajectory, trajectory, trajectory_int) ){
+    throw cuda_error("Error: cuNFFT_plan::preprocess: unable to restore compute device.");
+  }
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::compute( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out,
+                                                 cuNDArray<REAL> *dcw, NFFT_comp_mode mode )
+{  
+  // Validity checks
+  
+  unsigned char components;
+
+  if( mode == NFFT_FORWARDS_C2NC ) 
+    components = _NFFT_CONV_C2NC + _NFFT_FFT + _NFFT_DEAPODIZATION;
+
+  else if( mode == NFFT_FORWARDS_NC2C ) 
+    components = _NFFT_CONV_NC2C + _NFFT_FFT + _NFFT_DEAPODIZATION;
+
+  else if( mode == NFFT_BACKWARDS_NC2C ) 
+    components = _NFFT_CONV_NC2C + _NFFT_FFT + _NFFT_DEAPODIZATION;
+
+  else if( mode == NFFT_BACKWARDS_C2NC ) 
+    components = _NFFT_CONV_C2NC + _NFFT_FFT + _NFFT_DEAPODIZATION;
+  else{
+    throw std::runtime_error("Error: cuNFFT_plan::compute: unknown mode");
+  }
+  
+  {
+    cuNDArray<complext<REAL> > *samples, *image;
+
+    if( mode == NFFT_FORWARDS_C2NC || mode == NFFT_BACKWARDS_C2NC ){
+      image = in; samples = out;
+    } else{
+      image = out; samples = in;
+    }
+    
+    check_consistency( samples, image, dcw, components );
+  }
+  
+  cuNDArray<complext<REAL> > *in_int = 0x0, *out_int = 0x0;
+  cuNDArray<REAL> *dcw_int = 0x0;
+  int old_device;
+
+  if( !prepare<complext<REAL>, complext<REAL>, REAL>
+      (device, &old_device, in, &in_int, out, &out_int, dcw, &dcw_int ) ){
+    throw cuda_error("Error: cuNFFT_plan::compute: device preparation error.");
+  }
+
+  typename uint64d<D>::Type image_dims = from_std_vector<size_t,D>
+    ( (mode == NFFT_FORWARDS_C2NC || mode == NFFT_BACKWARDS_C2NC ) ? *in->get_dimensions() : *out->get_dimensions() );
+  bool oversampled_image = (image_dims==matrix_size_os);
+  
+  vector<size_t> vec_dims = to_std_vector(matrix_size_os);
+  {
+    cuNDArray<complext<REAL> > *image = ((mode == NFFT_FORWARDS_C2NC || mode == NFFT_BACKWARDS_C2NC ) ? in : out );
+    for( unsigned int d=D; d<image->get_number_of_dimensions(); d++ )
+      vec_dims.push_back(image->get_size(d));
+  }
+
+  cuNDArray<complext<REAL> > *working_image = 0x0, *working_samples = 0x0;
+
+  switch(mode){
+
+  case NFFT_FORWARDS_C2NC:
+    
+    if( !oversampled_image ){
+      working_image = new cuNDArray<complext<REAL> >(&vec_dims);
+      pad<complext<REAL>, D>( in_int, working_image );
+    }
+    else{
+      working_image = in_int;
+    }
+    
+    compute_NFFT_C2NC( working_image, out_int );
+
+    if( dcw_int )
+        	*out_int *= *dcw_int;
+
+    if( !oversampled_image ){
+      delete working_image; working_image = 0x0;
+    }    
+    break;
+    
+  case NFFT_FORWARDS_NC2C:
+
+    // Density compensation
+    if( dcw_int ){
+      working_samples = new cuNDArray<complext<REAL> >(*in_int);
+      *working_samples *= *dcw_int;
+    }
+    else{
+      working_samples = in_int;
+    }
+    
+    if( !oversampled_image ){
+      working_image = new cuNDArray<complext<REAL> >(&vec_dims);
+    }
+    else{
+      working_image = out_int;
+    }
+
+    compute_NFFT_NC2C( working_samples, working_image );
+
+    if( !oversampled_image ){
+      crop<complext<REAL>, D>( (matrix_size_os-matrix_size)>>1, working_image, out_int );
+    }
+    
+    if( !oversampled_image ){
+      delete working_image; working_image = 0x0;
+    }
+    
+    if( dcw_int ){
+      delete working_samples; working_samples = 0x0;
+    }    
+    break;
+    
+  case NFFT_BACKWARDS_NC2C:
+    
+    // Density compensation
+    if( dcw_int ){
+      working_samples = new cuNDArray<complext<REAL> >(*in_int);
+      *working_samples *= *dcw_int;
+    }
+    else{
+      working_samples = in_int;
+    }
+    
+    if( !oversampled_image ){
+      working_image = new cuNDArray<complext<REAL> >(&vec_dims);
+    }
+    else{
+      working_image = out_int;
+    }
+    
+    compute_NFFTH_NC2C( working_samples, working_image );
+    
+    if( !oversampled_image ){
+      crop<complext<REAL> ,D>( (matrix_size_os-matrix_size)>>1, working_image, out_int );
+    }
+    
+    if( !oversampled_image ){
+      delete working_image; working_image = 0x0;
+    }
+    
+    if( dcw_int ){
+      delete working_samples; working_samples = 0x0;
+    }    
+    break;
+    
+  case NFFT_BACKWARDS_C2NC:
+    
+    if( !oversampled_image ){
+      working_image = new cuNDArray<complext<REAL> >(&vec_dims);
+      
+      pad<complext<REAL>, D>( in_int, working_image );
+    }
+    else{
+      working_image = in_int;
+    }
+    
+    compute_NFFTH_C2NC( working_image, out_int );
+    
+    if( dcw_int )
+    	*out_int *= *dcw_int;
+
+
+
+    if( !oversampled_image ){
+      delete working_image; working_image = 0x0;
+    }
+    
+    break;
+  };
+  
+  if( !restore<complext<REAL> ,complext<REAL> ,REAL>
+      (old_device, out, out, out_int, in, in_int, dcw, dcw_int ) ){
+    throw cuda_error("Error: cuNFFT_plan::compute: unable to restore compute device.");
+  }
+  
+  CHECK_FOR_CUDA_ERROR();
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::mult_MH_M( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out,
+                                                   cuNDArray<REAL> *dcw, std::vector<size_t> halfway_dims )
+{
+  // Validity checks
+  
+  unsigned char components = _NFFT_CONV_C2NC + _NFFT_CONV_NC2C + _NFFT_FFT + _NFFT_DEAPODIZATION;
+  
+  if( in->get_number_of_elements() != out->get_number_of_elements() ){
+    throw std::runtime_error("Error: cuNFFT_plan::mult_MH_M: in/out image sizes mismatch");
+  }
+  
+  cuNDArray<complext<REAL> > *working_samples = new cuNDArray<complext<REAL> >(&halfway_dims);
+
+  check_consistency( working_samples, in, dcw, components );
+  
+  cuNDArray<complext<REAL> > *in_int = 0x0;
+  cuNDArray<complext<REAL> > *out_int = 0x0;
+  cuNDArray<REAL> *dcw_int = 0x0;
+  int old_device;
+  
+  if( !prepare<complext<REAL>, complext<REAL>, REAL>
+      (device, &old_device, in, &in_int, out, &out_int, dcw, &dcw_int ) ){
+    throw cuda_error("Error: cuNFFT_plan::mult_MH_M: device preparation error.");
+  }
+  
+  cuNDArray<complext<REAL> > *working_image = 0x0;
+
+  typename uint64d<D>::Type image_dims = from_std_vector<size_t,D>(*in->get_dimensions()); 
+  bool oversampled_image = (image_dims==matrix_size_os); 
+ 
+  vector<size_t> vec_dims = to_std_vector(matrix_size_os); 
+  for( unsigned int d=D; d<in->get_number_of_dimensions(); d++ )
+    vec_dims.push_back(in->get_size(d));
+  
+  if( !oversampled_image ){
+    working_image = new cuNDArray<complext<REAL> >(&vec_dims);
+    pad<complext<REAL>, D>( in_int, working_image );
+  }
+  else{
+    working_image = in_int;
+  }
+  
+  compute_NFFT_C2NC( working_image, working_samples );
+  
+  // Density compensation
+  if( dcw ){
+    *working_samples *= *dcw_int;
+    *working_samples *= *dcw_int;
+  }
+    
+  compute_NFFTH_NC2C( working_samples, working_image );
+    
+  delete working_samples;
+  working_samples = 0x0;
+    
+  if( !oversampled_image ){
+    crop<complext<REAL>, D>( (matrix_size_os-matrix_size)>>1, working_image, out_int );
+    delete working_image; working_image = 0x0;
+  }
+        
+  restore<complext<REAL> ,complext<REAL> ,REAL>
+    (old_device, out, out, out_int, in, in_int, dcw, dcw_int );
+    
+  CHECK_FOR_CUDA_ERROR();
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::convolve( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out,
+                                                  cuNDArray<REAL> *dcw, NFFT_conv_mode mode, bool accumulate )
+{
+  unsigned char components;
+
+  if( mode == NFFT_CONV_C2NC ) 
+    components = _NFFT_CONV_C2NC;
+  else
+    components = _NFFT_CONV_NC2C;
+  
+  {
+    cuNDArray<complext<REAL> > *samples, *image;
+    
+    if( mode == NFFT_CONV_C2NC ){
+      image = in; samples = out;
+    } else{
+      image = out; samples = in;
+    }
+    
+    check_consistency( samples, image, dcw, components );
+  }
+  
+  cuNDArray<complext<REAL> > *in_int = 0x0, *out_int = 0x0;
+  cuNDArray<REAL> *dcw_int = 0x0;
+  int old_device;
+  
+  prepare<complext<REAL>, complext<REAL>, REAL>
+    (device, &old_device, in, &in_int, out, &out_int, dcw, &dcw_int );
+  
+  cuNDArray<complext<REAL> > *working_samples = 0x0;
+  
+  typename uint64d<D>::Type image_dims = from_std_vector<size_t, D>
+    (*(((mode == NFFT_CONV_C2NC) ? in : out )->get_dimensions())); 
+  bool oversampled_image = (image_dims==matrix_size_os); 
+  
+  if( !oversampled_image ){
+    throw std::runtime_error("Error: cuNFFT_plan::convolve: oversampled image not provided as input.");
+  }
+
+  vector<size_t> vec_dims = to_std_vector(matrix_size_os); 
+  {
+    cuNDArray<complext<REAL> > *image = ((mode == NFFT_CONV_C2NC) ? in : out );
+    for( unsigned int d=D; d<image->get_number_of_dimensions(); d++ )
+      vec_dims.push_back(image->get_size(d));
+  }
+
+  switch(mode){
+
+  case NFFT_CONV_C2NC:
+  	convolve_NFFT_C2NC( in_int, out_int, accumulate );
+  	if( dcw_int ) *out_int *= *dcw_int;
+    break;
+    
+  case NFFT_CONV_NC2C:
+
+    // Density compensation
+    if( dcw_int ){
+      working_samples = new cuNDArray<complext<REAL> >(*in_int);
+      *working_samples *= *dcw_int;
+    }
+    else{
+      working_samples = in_int;
+    }
+    
+    _convolve_NFFT_NC2C<REAL,D,ATOMICS>::apply( this, working_samples, out_int, accumulate );
+    
+    if( dcw_int ){
+      delete working_samples; working_samples = 0x0;
+    }    
+    break;
+
+  default:
+    throw std::runtime_error( "Error: cuNFFT_plan::convolve: unknown mode.");
+  }
+
+  restore<complext<REAL>, complext<REAL>, REAL>
+    (old_device, out, out, out_int, in, in_int, dcw, dcw_int );
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::fft(cuNDArray<complext<REAL> > *data, NFFT_fft_mode mode, bool do_scale )
+{
+  cuNDArray<complext<REAL> > *data_int = 0x0;
+  int old_device;
+  
+  prepare<complext<REAL>,dummy,dummy>( device, &old_device, data, &data_int );
+  
+  typename uint64d<D>::Type _dims_to_transform = counting_vec<size_t,D>();
+  vector<size_t> dims_to_transform = to_std_vector( _dims_to_transform );
+  
+  if( mode == NFFT_FORWARDS ){
+    cuNDFFT<REAL>::instance()->fft( data_int, &dims_to_transform, do_scale );
+  }
+  else{
+    cuNDFFT<REAL>::instance()->ifft( data_int, &dims_to_transform, do_scale );
+  }
+
+  restore<complext<REAL> ,dummy,dummy>(old_device, data, data, data_int);
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::deapodize( cuNDArray<complext<REAL> > *image, bool fourier_domain)
+{
+  unsigned char components;
+  components = _NFFT_FFT;
+  check_consistency( 0x0, image, 0x0, components );
+
+  cuNDArray<complext<REAL> > *image_int = 0x0;
+  int old_device;
+  
+  prepare<complext<REAL>,dummy,dummy>(device, &old_device, image, &image_int );
+
+  typename uint64d<D>::Type image_dims = from_std_vector<size_t, D>(*image->get_dimensions()); 
+  bool oversampled_image = (image_dims==matrix_size_os); 
+  
+  if( !oversampled_image ){
+    throw std::runtime_error( "Error: cuNFFT_plan::deapodize: oversampled image not provided as input.");
+  }
+  if (fourier_domain){
+  	if (!deapodization_filterFFT)
+  		deapodization_filterFFT = 	compute_deapodization_filter(true);
+  	*image_int *= *deapodization_filterFFT;
+  } else {
+  	if (!deapodization_filter)
+  		deapodization_filter = compute_deapodization_filter(false);
+  	*image_int *= *deapodization_filter;
+  }
+    
+  restore<complext<REAL> ,dummy,dummy>(old_device, image, image, image_int);
+}
+
+//
+// Private class methods
+//
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::check_consistency( cuNDArray<complext<REAL> > *samples, cuNDArray<complext<REAL> > *image,
+                                                           cuNDArray<REAL> *weights, unsigned char components )
+{
+
+  if( !initialized ){
+    throw std::runtime_error( "Error: cuNFFT_plan: Unable to proceed without setup.");
+  }
+  
+  if( (components & _NFFT_CONV_C2NC ) && !preprocessed_C2NC ){
+    throw std::runtime_error("Error: cuNFFT_plan: Unable to compute NFFT before preprocessing.");
+  }
+  
+  if( (components & _NFFT_CONV_NC2C ) && !(preprocessed_NC2C || (preprocessed_C2NC && ATOMICS ) ) ){
+    throw std::runtime_error("Error: cuNFFT_plan: Unable to compute NFFT before preprocessing.");
+  }
+  
+  if( ((components & _NFFT_CONV_C2NC ) || (components & _NFFT_CONV_NC2C )) && !(image && samples) ){
+    throw std::runtime_error("Error: cuNFFT_plan: Unable to process 0x0 input/output.");
+  }
+  
+  if( ((components & _NFFT_FFT) || (components & _NFFT_DEAPODIZATION )) && !image ){
+    throw std::runtime_error("Error: cuNFFT_plan: Unable to process 0x0 input.");
+  }
+
+  if( image->get_number_of_dimensions() < D ){
+    throw std::runtime_error("Error: cuNFFT_plan: Number of image dimensions mismatch the plan.");
+  }    
+
+  typename uint64d<D>::Type image_dims = from_std_vector<size_t,D>( *image->get_dimensions() );
+  bool oversampled_image = (image_dims==matrix_size_os);
+  
+  if( !((oversampled_image) ? (image_dims == matrix_size_os) : (image_dims == matrix_size) )){
+    throw std::runtime_error("Error: cuNFFT_plan: Image dimensions mismatch.");
+  }
+  
+  if( (components & _NFFT_CONV_C2NC ) || (components & _NFFT_CONV_NC2C )){    
+    if( (samples->get_number_of_elements() == 0) || (samples->get_number_of_elements() % (number_of_frames*number_of_samples)) ){
+      printf("\ncuNFFT::check_consistency() failed:\n#elements in the samples array: %ld.\n#samples from preprocessing: %d.\n#frames from preprocessing: %d.\n",samples->get_number_of_elements(), number_of_samples, number_of_frames ); fflush(stdout);
+      throw std::runtime_error("Error: cuNFFT_plan: The number of samples is not a multiple of #samples/frame x #frames as requested through preprocessing");
+    }
+    
+    unsigned int num_batches_in_samples_array = samples->get_number_of_elements()/(number_of_frames*number_of_samples);
+    unsigned int num_batches_in_image_array = 1;
+
+    for( unsigned int d=D; d<image->get_number_of_dimensions(); d++ ){
+      num_batches_in_image_array *= image->get_size(d);
+    }
+    num_batches_in_image_array /= number_of_frames;
+
+    if( num_batches_in_samples_array != num_batches_in_image_array ){
+      printf("\ncuNFFT::check_consistency() failed:\n#elements in the samples array: %ld.\n#samples from preprocessing: %d.\n#frames from preprocessing: %d.\nLeading to %d batches in the samples array.\nThe number of batches in the image array is %d.\n",samples->get_number_of_elements(), number_of_samples, number_of_frames, num_batches_in_samples_array, num_batches_in_image_array ); fflush(stdout);
+      throw std::runtime_error("Error: cuNFFT_plan: Number of batches mismatch between samples and image arrays");
+    }
+  }
+  
+  if( components & _NFFT_CONV_NC2C ){
+    if( weights ){ 
+      if( weights->get_number_of_elements() == 0 ||
+          !( weights->get_number_of_elements() == number_of_samples || 
+             weights->get_number_of_elements() == number_of_frames*number_of_samples) ){
+        printf("\ncuNFFT::check_consistency() failed:\n#elements in the samples array: %ld.\n#samples from preprocessing: %d.\n#frames from preprocessing: %d.\n#weights: %ld.\n",samples->get_number_of_elements(), number_of_samples, number_of_frames, weights->get_number_of_elements() ); fflush(stdout);
+        throw std::runtime_error("Error: cuNFFT_plan: The number of weights should match #samples/frame x #frames as requested through preprocessing");
+      }
+    }
+  }  
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> 
+void Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::barebones()
+{	
+  // These are the fundamental booleans checked before accessing the various member pointers
+  initialized = preprocessed_C2NC = preprocessed_NC2C = false;
+
+  // Clear matrix sizes
+  clear(matrix_size);
+  clear(matrix_size_os);
+
+  // Clear pointers
+  trajectory_positions = 0x0;
+  tuples_last = bucket_begin = bucket_end = 0x0;
+
+  // and specify the device
+  if (cudaGetDevice(&device) != cudaSuccess) {
+    throw cuda_error("Error: cuNFFT_plan::barebones: unable to get device no");
+  }
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> 
+void Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::wipe( NFFT_wipe_mode mode )
+{
+  // Get current Cuda device
+  int old_device;
+  if( cudaGetDevice(&old_device) != cudaSuccess ) {
+    throw cuda_error("Error: cuNFFT_plan::wipe: unable to get device no");
+  }
+
+  if( device != old_device && cudaSetDevice(device) != cudaSuccess) {
+    throw cuda_error("Error: cuNFFT_plan::wipe: unable to set device no");
+  }
+
+  if( mode==NFFT_WIPE_ALL && initialized ){
+    deapodization_filter.reset();
+    initialized = false;
+  }
+    
+  if( preprocessed_NC2C ){
+    if( tuples_last )  delete tuples_last;
+    if( bucket_begin ) delete bucket_begin;
+    if( bucket_end )   delete bucket_end;
+  }
+  
+  if( preprocessed_C2NC || preprocessed_NC2C ){
+    delete trajectory_positions;
+    preprocessed_C2NC = preprocessed_NC2C = false;
+  }
+
+  if( device != old_device && cudaSetDevice(old_device) != cudaSuccess) {
+    throw cuda_error("Error: cuNFFT_plan::wipe: unable to restore device no");
+  }
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> 
+void Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::compute_beta()
+{	
+  // Compute the Kaiser-Bessel beta parameter according to the formula provided in 
+  // Beatty et al. IEEE TMI 2005;24(6):799-808.
+  for( unsigned int d=0; d<D; d++ )
+    beta[d] = (M_PI*std::sqrt((W*W)/(alpha[d]*alpha[d])*(alpha[d]-REAL(0.5))*(alpha[d]-REAL(0.5))-REAL(0.8))); 
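+
+  // In plain notation the line above computes, per dimension d,
+  //   beta_d = pi * sqrt( (W^2/alpha_d^2) * (alpha_d - 0.5)^2 - 0.8 )
+  // i.e. the Beatty et al. expression cited above.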
+}
+
+//
+// Grid fictitious trajectory with a single sample at the origin
+//
+
+template<class REAL, unsigned int D> __global__ void
+compute_deapodization_filter_kernel( typename uintd<D>::Type matrix_size_os, typename reald<REAL,D>::Type matrix_size_os_real, 
+                                     REAL W, REAL half_W, REAL one_over_W, 
+                                     typename reald<REAL,D>::Type beta, complext<REAL> * __restrict__ image_os )
+{
+  const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
+  const unsigned int num_elements = prod(matrix_size_os);
+
+  if( idx <num_elements ){
+
+    // Compute weight from Kaiser-Bessel filter
+    const typename uintd<D>::Type cell_pos = idx_to_co<D>(idx, matrix_size_os);
+
+    // Sample position ("origin")
+    const vector_td<REAL,D> sample_pos = REAL(0.5)*matrix_size_os_real;
+
+    // Calculate the distance between the cell and the sample
+    vector_td<REAL,D> cell_pos_real = vector_td<REAL,D>(cell_pos);
+    const typename reald<REAL,D>::Type delta = abs(sample_pos-cell_pos_real);
+
+    // Compute convolution weight. 
+    REAL weight; 
+    REAL zero = REAL(0);
+    vector_td<REAL,D> half_W_vec( half_W );
+
+    if( weak_greater( delta, half_W_vec ) )
+      weight = zero;
+    else{ 
+      weight = KaiserBessel<REAL>( delta, matrix_size_os_real, one_over_W, beta );
+      //if( !isfinite(weight) )
+      //weight = zero;
+    }
+    
+    // Output weight
+    complext<REAL>  result;
+    result.vec[0] = weight; 
+    result.vec[1] = zero;
+    image_os[idx] = result;
+  }
+}
+
+//
+// Function to calculate the deapodization filter
+//
+
+template<class REAL, unsigned int D, bool ATOMICS> boost::shared_ptr<cuNDArray<complext<REAL> > >
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::compute_deapodization_filter( bool FFTed)
+{
+  std::vector<size_t> tmp_vec_os = to_std_vector(matrix_size_os);
+
+ boost::shared_ptr< cuNDArray<complext<REAL> > > filter( new cuNDArray<complext<REAL> >(tmp_vec_os));
+  vector_td<REAL,D> matrix_size_os_real = vector_td<REAL,D>(matrix_size_os);
+  
+  // Find dimensions of grid/blocks.
+  dim3 dimBlock( 256 );
+  dim3 dimGrid( (prod(matrix_size_os)+dimBlock.x-1)/dimBlock.x );
+
+  // Invoke kernel
+  compute_deapodization_filter_kernel<REAL,D><<<dimGrid, dimBlock>>> 
+    ( vector_td<unsigned int,D>(matrix_size_os), matrix_size_os_real, W, REAL(0.5)*W, REAL(1)/W, beta, filter->get_data_ptr() );
+
+  CHECK_FOR_CUDA_ERROR();
+  
+  // FFT
+  if (FFTed)
+  	fft( filter.get(), NFFT_FORWARDS, false );
+  else
+  	fft( filter.get(), NFFT_BACKWARDS, false );
+  // Reciprocal
+  reciprocal_inplace(filter.get());
+  return filter;
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::compute_NFFT_C2NC( cuNDArray<complext<REAL> > *image, cuNDArray<complext<REAL> > *samples )
+{
+  // private method - no consistency check. We trust in ourselves.
+
+  // Deapodization
+  deapodize( image );
+    
+  // FFT
+  fft( image, NFFT_FORWARDS );
+
+  // Convolution
+  convolve( image, samples, 0x0, NFFT_CONV_C2NC );
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::compute_NFFTH_NC2C( cuNDArray<complext<REAL> > *samples, cuNDArray<complext<REAL> > *image )
+{
+  // private method - no consistency check. We trust in ourselves.
+
+  // Convolution
+  convolve( samples, image, 0x0, NFFT_CONV_NC2C );
+
+  // FFT
+  fft( image, NFFT_BACKWARDS );
+  
+  // Deapodization  
+  deapodize( image );
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::compute_NFFTH_C2NC( cuNDArray<complext<REAL> > *image, cuNDArray<complext<REAL> > *samples )
+{
+  // private method - no consistency check. We trust in ourselves.
+
+  // Deapodization
+  deapodize( image, true );
+ 
+  // FFT
+  fft( image, NFFT_BACKWARDS );
+
+  // Convolution
+  convolve( image, samples, 0x0, NFFT_CONV_C2NC );
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::compute_NFFT_NC2C( cuNDArray<complext<REAL> > *samples, cuNDArray<complext<REAL> > *image )
+{
+  // private method - no consistency check. We trust in ourselves.
+
+  // Convolution
+  convolve( samples, image, 0x0, NFFT_CONV_NC2C );
+  
+  // FFT
+  fft( image, NFFT_FORWARDS );
+  
+  // Deapodization
+  deapodize( image, true );
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::convolve_NFFT_C2NC( cuNDArray<complext<REAL> > *image, cuNDArray<complext<REAL> > *samples, bool accumulate )
+{
+  // private method - no consistency check. We trust in ourselves.
+  
+  unsigned int num_batches = 1;
+  for( unsigned int d=D; d<image->get_number_of_dimensions(); d++ )
+    num_batches *= image->get_size(d);
+  num_batches /= number_of_frames;
+  
+  /*
+    Setup grid and threads
+  */
+
+  size_t threads_per_block;
+  unsigned int max_coils;
+    
+  threads_per_block = NFFT_THREADS_PER_KERNEL;
+  
+  if( cudaDeviceManager::Instance()->major_version(device) == 1 ){
+    max_coils = NFFT_MAX_COILS_COMPUTE_1x;
+  }
+  else{
+    max_coils = NFFT_MAX_COILS_COMPUTE_2x;
+  }
+  
+  // We can (only) convolve max_coils batches per run due to shared memory issues. 
+  unsigned int domain_size_coils_desired = num_batches;
+  unsigned int num_repetitions = domain_size_coils_desired/max_coils + 
+    ( ((domain_size_coils_desired%max_coils)==0) ? 0 : 1 );
+  unsigned int domain_size_coils = (num_repetitions==1) ? domain_size_coils_desired : max_coils;
+  unsigned int domain_size_coils_tail = (num_repetitions==1) ? domain_size_coils_desired : domain_size_coils_desired - (num_repetitions-1)*domain_size_coils;
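+
+  // Batching sketch (illustrative numbers): with max_coils == 16 and num_batches == 40,
+  // num_repetitions == 3, domain_size_coils == 16 and domain_size_coils_tail == 40 - 2*16 == 8,
+  // i.e. two full runs of 16 coils followed by a tail run of 8 coils.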
+
+  // Block and Grid dimensions
+  dim3 dimBlock( (unsigned int)threads_per_block );
+  dim3 dimGrid( (number_of_samples+dimBlock.x-1)/dimBlock.x, number_of_frames );
+
+  // Calculate how much shared memory to use per thread
+  size_t bytes_per_thread = domain_size_coils * sizeof( vector_td<REAL,D> );
+  size_t bytes_per_thread_tail = domain_size_coils_tail * sizeof( vector_td<REAL,D> );
+
+  unsigned int double_warp_size_power=0;
+  unsigned int __tmp = cudaDeviceManager::Instance()->warp_size(device)<<1;
+  while(__tmp!=1){
+    __tmp>>=1;
+    double_warp_size_power++;
+  }
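+
+  // E.g. for a warp size of 32 the loop above yields double_warp_size_power == 6, since
+  // warp_size<<1 == 64 == 2^6; shifting by this power in the kernels is a cheap
+  // multiplication by 2*warpSize.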
+  
+  vector_td<REAL,D> matrix_size_os_real = vector_td<REAL,D>( matrix_size_os );
+
+  /*
+    Invoke kernel
+  */
+
+  for( unsigned int repetition = 0; repetition<num_repetitions; repetition++ ){
+    NFFT_convolve_kernel<REAL,D>
+      <<<dimGrid, dimBlock, ((repetition==num_repetitions-1) ? dimBlock.x*bytes_per_thread_tail : dimBlock.x*bytes_per_thread)>>>
+      ( alpha, beta, W, vector_td<unsigned int,D>(matrix_size_os), vector_td<unsigned int,D>(matrix_size_wrap), number_of_samples,
+        (repetition==num_repetitions-1) ? domain_size_coils_tail : domain_size_coils, 
+        raw_pointer_cast(&(*trajectory_positions)[0]), 
+        image->get_data_ptr()+repetition*prod(matrix_size_os)*number_of_frames*domain_size_coils,
+        samples->get_data_ptr()+repetition*number_of_samples*number_of_frames*domain_size_coils, 
+        double_warp_size_power, REAL(0.5)*W, REAL(1)/(W), accumulate, matrix_size_os_real );
+
+    CHECK_FOR_CUDA_ERROR();    
+  }
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::convolve_NFFT_NC2C( cuNDArray<complext<REAL> > *image, cuNDArray<complext<REAL> > *samples, bool accumulate )
+{
+  _convolve_NFFT_NC2C<REAL,D,ATOMICS>::apply( this, image, samples, accumulate );
+}
+
+template<unsigned int D> struct
+_convolve_NFFT_NC2C<float,D,true>{ // True: use atomic operations variant
+  static bool apply( cuNFFT_plan<float,D,true> *plan, 
+                     cuNDArray<complext<float> > *samples, 
+                     cuNDArray<complext<float> > *image, 
+                     bool accumulate )
+  {   
+    //
+    // Bring in some variables from the plan
+    
+    unsigned int device = plan->device;
+    unsigned int number_of_frames = plan->number_of_frames;
+    unsigned int number_of_samples = plan->number_of_samples;
+    typename uint64d<D>::Type matrix_size_os = plan->matrix_size_os;
+    typename uint64d<D>::Type matrix_size_wrap = plan->matrix_size_wrap;
+    typename reald<float,D>::Type alpha = plan->alpha;
+    typename reald<float,D>::Type beta = plan->beta;
+    float W = plan->W;
+    thrust::device_vector< typename reald<float,D>::Type > *trajectory_positions = plan->trajectory_positions;    
+
+    //
+    // Atomic operations are only supported in compute model 2.0 and up
+    //
+
+    if( cudaDeviceManager::Instance()->major_version(device) == 1 ){
+      throw cuda_error("Error: Atomic NC2C NFFT only supported on device with compute model 2.0 or higher");
+    }
+    
+    // Check if warp_size is a power of two. We do some modulus tricks in the kernels that depend on this...
+    if( !((cudaDeviceManager::Instance()->warp_size(device) & (cudaDeviceManager::Instance()->warp_size(device)-1)) == 0 ) ){
+      throw cuda_error("cuNFFT: unsupported hardware (warpSize is not a power of two)");
+    }
+    
+    unsigned int num_batches = 1;
+    for( unsigned int d=D; d<image->get_number_of_dimensions(); d++ )
+      num_batches *= image->get_size(d);
+    num_batches /= number_of_frames;
+    
+    //
+    //  Setup grid and threads
+    //
+    
+    size_t threads_per_block;
+    unsigned int max_coils;
+    
+    threads_per_block = NFFT_THREADS_PER_KERNEL;
+    max_coils = NFFT_MAX_COILS_COMPUTE_2x;
+    
+    // We can (only) convolve domain_size_coils batches per run due to shared memory issues. 
+    unsigned int domain_size_coils_desired = num_batches;
+    unsigned int num_repetitions = domain_size_coils_desired/max_coils + 
+      ( ((domain_size_coils_desired%max_coils)==0) ? 0 : 1 );
+    unsigned int domain_size_coils = (num_repetitions==1) ? domain_size_coils_desired : max_coils;
+    unsigned int domain_size_coils_tail = (num_repetitions==1) ? domain_size_coils_desired : domain_size_coils_desired - (num_repetitions-1)*domain_size_coils;
+    
+    // Block and Grid dimensions
+    dim3 dimBlock( (unsigned int)threads_per_block ); 
+    dim3 dimGrid( (number_of_samples+dimBlock.x-1)/dimBlock.x, number_of_frames );
+    
+    // Calculate how much shared memory to use per thread
+    size_t bytes_per_thread = domain_size_coils * sizeof( vector_td<float,D> );
+    size_t bytes_per_thread_tail = domain_size_coils_tail * sizeof( vector_td<float,D> );
+    
+    unsigned int double_warp_size_power=0, __tmp = cudaDeviceManager::Instance()->warp_size(device)<<1;
+    while(__tmp!=1){
+      __tmp>>=1;
+      double_warp_size_power++;
+    }
+    
+    vector_td<float,D> matrix_size_os_real = vector_td<float,D>( matrix_size_os );
+    
+    if( !accumulate ){
+      clear(image);
+    }
+    
+    //
+    // Invoke kernel
+    //
+    
+    for( unsigned int repetition = 0; repetition<num_repetitions; repetition++ ){
+      
+      NFFT_H_atomic_convolve_kernel<float,D>
+        <<<dimGrid, dimBlock, ((repetition==num_repetitions-1) ? dimBlock.x*bytes_per_thread_tail : dimBlock.x*bytes_per_thread)>>>
+        ( alpha, beta, W, vector_td<unsigned int,D>(matrix_size_os), vector_td<unsigned int,D>(matrix_size_wrap), number_of_samples,
+          (repetition==num_repetitions-1) ? domain_size_coils_tail : domain_size_coils,
+          raw_pointer_cast(&(*trajectory_positions)[0]), 
+          samples->get_data_ptr()+repetition*number_of_samples*number_of_frames*domain_size_coils,
+          image->get_data_ptr()+repetition*prod(matrix_size_os)*number_of_frames*domain_size_coils,
+          double_warp_size_power, float(0.5)*W, float(1)/(W), matrix_size_os_real );
+    }
+    
+    CHECK_FOR_CUDA_ERROR();
+   
+    return true;
+  }
+};
+
+template<unsigned int D> struct
+_convolve_NFFT_NC2C<double,D,true>{ // True: use atomic operations variant
+  // Atomics don't exist for doubles, so this gives a compile error if you actually try to use it.
+};
+
+template<class REAL, unsigned int D> struct
+_convolve_NFFT_NC2C<REAL,D,false>{ // False: use non-atomic operations variant
+  static void apply( cuNFFT_plan<REAL,D,false> *plan,
+                     cuNDArray<complext<REAL> > *samples, 
+                     cuNDArray<complext<REAL> > *image, 
+                     bool accumulate )
+  {
+    // Bring in some variables from the plan
+    
+    unsigned int device = plan->device;
+    unsigned int number_of_frames = plan->number_of_frames;
+    unsigned int number_of_samples = plan->number_of_samples;
+    typename uint64d<D>::Type matrix_size_os = plan->matrix_size_os;
+    typename uint64d<D>::Type matrix_size_wrap = plan->matrix_size_wrap;
+    typename reald<REAL,D>::Type alpha = plan->alpha;
+    typename reald<REAL,D>::Type beta = plan->beta;
+    REAL W = plan->W;
+    thrust::device_vector< typename reald<REAL,D>::Type > *trajectory_positions = plan->trajectory_positions;    
+    thrust::device_vector<unsigned int> *tuples_last = plan->tuples_last;
+    thrust::device_vector<unsigned int> *bucket_begin = plan->bucket_begin;
+    thrust::device_vector<unsigned int> *bucket_end = plan->bucket_end;
+
+    // private method - no consistency check. We trust in ourselves.
+    // Check if warp_size is a power of two. We do some modulus tricks in the kernels that depend on this...
+    if( !((cudaDeviceManager::Instance()->warp_size(device) & (cudaDeviceManager::Instance()->warp_size(device)-1)) == 0 ) ){
+      throw cuda_error("cuNFFT: unsupported hardware (warpSize is not a power of two)");
+
+    }
+    unsigned int num_batches = 1;
+    for( unsigned int d=D; d<image->get_number_of_dimensions(); d++ )
+      num_batches *= image->get_size(d);
+    num_batches /= number_of_frames;
+    
+    //
+    // Setup grid and threads
+    //
+    
+    size_t threads_per_block;
+    unsigned int max_coils;
+    
+    threads_per_block = NFFT_THREADS_PER_KERNEL;
+    
+    if( cudaDeviceManager::Instance()->major_version(device) == 1 ){
+      max_coils = NFFT_MAX_COILS_COMPUTE_1x;
+    }
+    else{
+      max_coils = NFFT_MAX_COILS_COMPUTE_2x;
+    }
+    
+    // We can (only) convolve domain_size_coils batches per run due to shared memory issues. 
+    unsigned int domain_size_coils_desired = num_batches;
+    unsigned int num_repetitions = domain_size_coils_desired/max_coils + 
+      ( ((domain_size_coils_desired%max_coils)==0) ? 0 : 1 );
+    unsigned int domain_size_coils = (num_repetitions==1) ? domain_size_coils_desired : max_coils;
+    unsigned int domain_size_coils_tail = (num_repetitions==1) ? domain_size_coils_desired : domain_size_coils_desired - (num_repetitions-1)*domain_size_coils;
+    
+    // Block and Grid dimensions
+    dim3 dimBlock( (unsigned int)threads_per_block ); 
+    dim3 dimGrid( (prod(matrix_size_os+matrix_size_wrap)+dimBlock.x-1)/dimBlock.x, number_of_frames );
+    
+    // Calculate how much shared memory to use per thread
+    size_t bytes_per_thread = domain_size_coils * sizeof( vector_td<REAL,D> );
+    size_t bytes_per_thread_tail = domain_size_coils_tail * sizeof( vector_td<REAL,D> );
+    
+    unsigned int double_warp_size_power=0, __tmp = cudaDeviceManager::Instance()->warp_size(device)<<1;
+    while(__tmp!=1){
+      __tmp>>=1;
+      double_warp_size_power++;
+    }
+    
+    vector_td<REAL,D> matrix_size_os_real = vector_td<REAL,D>( matrix_size_os );
+    
+    // Define temporary image that includes a wrapping zone
+    cuNDArray<complext<REAL> > _tmp;
+    
+    vector<size_t> vec_dims = to_std_vector(matrix_size_os+matrix_size_wrap); 
+    if( number_of_frames > 1 )
+      vec_dims.push_back(number_of_frames);
+    if( num_batches > 1 ) 
+      vec_dims.push_back(num_batches);
+    
+    _tmp.create(&vec_dims);
+    
+    //
+    // Invoke kernel
+    //
+    
+    for( unsigned int repetition = 0; repetition<num_repetitions; repetition++ ){
+      
+      NFFT_H_convolve_kernel<REAL,D>
+        <<<dimGrid, dimBlock, ((repetition==num_repetitions-1) ? dimBlock.x*bytes_per_thread_tail : dimBlock.x*bytes_per_thread)>>>
+        ( alpha, beta, W, vector_td<unsigned int,D>(matrix_size_os+matrix_size_wrap), number_of_samples,
+          (repetition==num_repetitions-1) ? domain_size_coils_tail : domain_size_coils, 
+          raw_pointer_cast(&(*trajectory_positions)[0]), 
+          _tmp.get_data_ptr()+repetition*prod(matrix_size_os+matrix_size_wrap)*number_of_frames*domain_size_coils,
+          samples->get_data_ptr()+repetition*number_of_samples*number_of_frames*domain_size_coils, 
+          raw_pointer_cast(&(*tuples_last)[0]), raw_pointer_cast(&(*bucket_begin)[0]), raw_pointer_cast(&(*bucket_end)[0]),
+          double_warp_size_power, REAL(0.5)*W, REAL(1)/(W), matrix_size_os_real );
+    }
+    
+    CHECK_FOR_CUDA_ERROR();
+    
+    plan->image_wrap( &_tmp, image, accumulate );
+  };
+};
+
+// Image wrap kernels
+
+template<class REAL, unsigned int D> __global__ void
+image_wrap_kernel( typename uintd<D>::Type matrix_size_os, typename uintd<D>::Type matrix_size_wrap, bool accumulate,
+                   const complext<REAL> * __restrict__ in, complext<REAL> * __restrict__ out )
+{
+  unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
+  const unsigned int num_elements_per_image_src = prod(matrix_size_os+matrix_size_wrap);
+  const unsigned int image_offset_src = blockIdx.y*num_elements_per_image_src;
+  
+  const typename uintd<D>::Type co = idx_to_co<D>(idx, matrix_size_os);
+  const typename uintd<D>::Type half_wrap = matrix_size_wrap>>1;
+  
+  // Make "boolean" vectors denoting whether wrapping needs to be performed in a given direction (forwards/backwards)
+  vector_td<bool,D> B_l = vector_less( co, half_wrap );
+  vector_td<bool,D> B_r = vector_greater_equal( co, matrix_size_os-half_wrap );
+  
+  complext<REAL>  result = in[co_to_idx<D>(co+half_wrap, matrix_size_os+matrix_size_wrap) + image_offset_src];
+
+  if( sum(B_l+B_r) > 0 ){
+    
+    // Fold back the wrapping zone onto the image ("periodically")
+    //
+    // There are 2^D-1 ways to pick combinations of dimensions in D-dimensional space, e.g. 
+    // 
+    //  { x, y, xy } in 2D
+    //  { x, y, z, xy, xz, yz, xyz } in 3D
+    //
+    // Every "letter" in each combination provides two possible wraps (either end of the dimension)
+    // 
+    // For each of the 2^D-1 combinations DO
+    //   - find the number of dimensions, d, in the combination
+    //   - create 2^(d) stride vectors and test for wrapping using the 'B'-vectors above.
+    //   - accumulate the contributions
+    // 
+    //   The following code represents dimensions as bits in a char.
+    //
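+    // Illustration (2D): the combinations are x (bit 0), y (bit 1) and xy (bits 0 and 1).
+    // For the xy combination d = 2, so four strides are tested: (left,left), (left,right),
+    // (right,left) and (right,right). A corner pixel with both B_l.x and B_l.y set matches
+    // only the (left,left) stride, so it accumulates exactly one wrapped contribution for
+    // that combination.
+    //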
+    
+    for( unsigned char combination = 1; combination < (1<<D); combination++ ){
+    
+      // Find d
+      unsigned char d = 0;
+      for( unsigned char i=0; i<D; i++ )
+        d += ((combination & (1<<i)) > 0 );
+       
+      // Create stride vector for each wrapping test
+      for( unsigned char s = 0; s < (1<<d); s++ ){
+        
+        // Target for stride
+        typename intd<D>::Type stride;
+        char wrap_requests = 0;
+        char skipped_dims = 0;
+	
+        // Fill dimensions of the stride
+        for( unsigned char i=1; i<D+1; i++ ){
+    
+          // Is the stride dimension present in the current combination?
+          if( i & combination ){
+    
+            // A zero bit in s indicates "check for left wrap" and a one bit is interpreted as "check for right wrap" 
+            // ("left/right" for the individual dimension meaning wrapping on either side of the dimension).
+    
+            if( i & (s<<(skipped_dims)) ){
+              if( B_r.vec[i-1] ){ // Wrapping required 
+              	stride[i-1] = -1;
+                wrap_requests++;
+              }
+              else
+              	stride[i-1] = 0;
+            }
+            else{ 
+              if( B_l.vec[i-1] ){ // Wrapping required 
+              	stride[i-1] = 1;
+                wrap_requests++;
+              }
+              else
+              	stride[i-1] = 0;
+            }
+          }
+          else{
+            // Do not test for wrapping in dimension 'i-1' (for this combination)
+          	stride[i-1] = 0;
+            skipped_dims++;
+          }
+        }
+	
+        // Now it is time to do the actual wrapping (if needed)
+        if( wrap_requests == d ){
+          typename intd<D>::Type src_co_int = vector_td<int,D>(co+half_wrap);
+          typename intd<D>::Type matrix_size_os_int = vector_td<int,D>(matrix_size_os);
+          typename intd<D>::Type co_offset_int = src_co_int + component_wise_mul<int,D>(stride,matrix_size_os_int);
+          typename uintd<D>::Type co_offset = vector_td<unsigned int,D>(co_offset_int);
+          result += in[co_to_idx<D>(co_offset, matrix_size_os+matrix_size_wrap) + image_offset_src];
+          break; // only one stride per combination can contribute (e.g. one edge, one corner)
+        } 
+      } 
+    }
+  }
+  
+  // Output
+  const unsigned int image_offset_tgt = blockIdx.y*prod(matrix_size_os);
+  if( accumulate ) result += out[idx+image_offset_tgt];
+  out[idx+image_offset_tgt] = result;
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::image_wrap( cuNDArray<complext<REAL> > *source, cuNDArray<complext<REAL> > *target, bool accumulate )
+{
+  unsigned int num_batches = 1;
+  for( unsigned int d=D; d<source->get_number_of_dimensions(); d++ )
+    num_batches *= source->get_size(d);
+  num_batches /= number_of_frames;
+
+  // Set dimensions of grid/blocks.
+  unsigned int bdim = 256;
+  dim3 dimBlock( bdim );
+  dim3 dimGrid( prod(matrix_size_os)/bdim, number_of_frames*num_batches );
+
+  // Safety check
+  if( (prod(matrix_size_os)%bdim) != 0 ) {
+  	std::stringstream ss;
+  	ss << "Error: cuNFFT : the number of oversampled image elements must be a multiple of the block size: " << bdim;
+    throw std::runtime_error(ss.str());
+  }
+
+  // Invoke kernel
+  image_wrap_kernel<REAL,D><<<dimGrid, dimBlock>>>
+    ( vector_td<unsigned int,D>(matrix_size_os), vector_td<unsigned int,D>(matrix_size_wrap), accumulate, source->get_data_ptr(), target->get_data_ptr() );
+  
+  CHECK_FOR_CUDA_ERROR();
+}	
+
+//
+// Template instantiation
+//
+
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< float, 1, true >;
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< float, 1, false >;
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< double, 1, false >;
+
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< float, 2, true >;
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< float, 2, false >;
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< double, 2, false >;
+
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< float, 3, true >;
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< float, 3, false >;
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< double, 3, false >;
+
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< float, 4, true >;
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< float, 4, false >;
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< double, 4, false >;
diff --git a/toolboxes/nfft/gpu/cuNFFT.h b/toolboxes/nfft/gpu/cuNFFT.h
new file mode 100644
index 0000000..5f70aba
--- /dev/null
+++ b/toolboxes/nfft/gpu/cuNFFT.h
@@ -0,0 +1,294 @@
+/** \file cuNFFT.h
+    \brief Cuda implementation of the non-Cartesian FFT
+
+    Reference information on the CUDA/GPU implementation of the NFFT can be found in the papers
+    
+    Accelerating the Non-equispaced Fast Fourier Transform on Commodity Graphics Hardware.
+    T.S. Sørensen, T. Schaeffter, K.Ø. Noe, M.S. Hansen. 
+    IEEE Transactions on Medical Imaging 2008; 27(4):538-547.
+    
+    Real-time Reconstruction of Sensitivity Encoded Radial Magnetic Resonance Imaging Using a Graphics Processing Unit.
+    T.S. Sørensen, D. Atkinson, T. Schaeffter, M.S. Hansen.
+    IEEE Transactions on Medical Imaging 2009; 28(12):1974-1985. 
+*/
+
+#pragma once
+
+#include "cuNDArray.h"
+#include "vector_td.h"
+#include "complext.h"
+#include "gpunfft_export.h"
+
+#include <thrust/device_vector.h>
+#include <boost/shared_ptr.hpp>
+
+template<class REAL, unsigned int D, bool ATOMICS> struct _convolve_NFFT_NC2C;
+
+namespace Gadgetron{
+
+  /** \class cuNFFT_plan
+      \brief Cuda implementation of the non-Cartesian FFT
+
+      ------------------------------
+      --- NFFT class declaration ---
+      ------------------------------      
+      REAL:  desired precision : float or double
+      D:  dimensionality : { 1,2,3,4 }
+      ATOMICS: use atomic device memory transactions : { true, false }
+      
+      For the tested hardware the implementation using atomic operations is slower than its non-atomic counterpart.
+      However, using atomic operations has the advantage of not requiring any pre-processing.
+      As the preprocessing step can be quite costly in terms of memory usage,
+      the atomic mode can be necessary for very large images or for 3D/4D volumes.
+      Notice: currently no devices support atomic operations in double precision.
+  */
+  template< class REAL, unsigned int D, bool ATOMICS = false > class EXPORTGPUNFFT cuNFFT_plan
+  {
+  
+  public: // Main interface
+    
+    /** 
+        Default constructor
+    */
+    cuNFFT_plan();
+
+    /**
+       Constructor defining the required NFFT parameters.
+       \param matrix_size the matrix size to use for the NFFT. Define as a multiple of 32.
+       \param matrix_size_os intermediate oversampled matrix size. Define as a multiple of 32.
+       The ratio between matrix_size_os and matrix_size defines the oversampling ratio for the NFFT implementation.
+       Use an oversampling ratio between 1 and 2. The higher the ratio, the better the quality of the results, 
+       however at the cost of increased execution time. 
+       \param W the convolution window size used in the NFFT implementation. 
+       The larger W, the better the quality, at the cost of increased runtime.
+       \param device the device (GPU id) to use for the NFFT computation. 
+       The default value of -1 indicates that the currently active device is used.
+    */
+    cuNFFT_plan( typename uint64d<D>::Type matrix_size, typename uint64d<D>::Type matrix_size_os,
+                 REAL W, int device = -1 );
+
+    /**
+       Destructor
+    */
+    virtual ~cuNFFT_plan();
+
+    /** 
+        Enum to specify the desired mode for cleaning up when using the wipe() method.
+    */
+    enum NFFT_wipe_mode { 
+      NFFT_WIPE_ALL, /**< delete all internal memory. */
+      NFFT_WIPE_PREPROCESSING /**< delete internal memory holding the preprocessing data structures. */
+    };
+
+    /** 
+        Clear internal storage
+        \param mode enum defining the wipe mode
+    */
+    void wipe( NFFT_wipe_mode mode );
+
+    /** 
+        Setup the plan. Please see the constructor taking similar arguments for a parameter description.
+    */
+    void setup( typename uint64d<D>::Type matrix_size, typename uint64d<D>::Type matrix_size_os,
+                REAL W, int device = -1 );
+
+    /**
+       Enum to specify the preprocessing mode.
+    */
+    enum NFFT_prep_mode { 
+      NFFT_PREP_C2NC, /**< preprocess to perform a Cartesian to non-Cartesian NFFT. */
+      NFFT_PREP_NC2C, /**< preprocess to perform a non-Cartesian to Cartesian NFFT. */
+      NFFT_PREP_ALL /**< preprocess to perform NFFTs in both directions. */
+    };
+
+    /**
+       Perform NFFT preprocessing for a given trajectory.
+       \param trajectory the NFFT non-Cartesian trajectory normalized to the range [-1/2;1/2]. 
+       \param mode enum specifying the preprocessing mode
+    */
+    void preprocess( cuNDArray<typename reald<REAL,D>::Type> *trajectory, NFFT_prep_mode mode );
+
+    /**
+       Enum defining the desired NFFT operation
+    */
+    enum NFFT_comp_mode { 
+      NFFT_FORWARDS_C2NC, /**< forwards NFFT Cartesian to non-Cartesian. */
+      NFFT_FORWARDS_NC2C, /**< forwards NFFT non-Cartesian to Cartesian. */
+      NFFT_BACKWARDS_C2NC, /**< backwards NFFT Cartesian to non-Cartesian. */
+      NFFT_BACKWARDS_NC2C /**< backwards NFFT non-Cartesian to Cartesian. */
+    };
+
+    /**
+       Execute the NFFT.
+       \param[in] in the input array.
+       \param[out] out the output array.
+       \param[in] dcw optional density compensation weights weighting the input samples according to the sampling density. 
+       If a 0x0 pointer is provided, no density compensation is used.
+       \param mode enum specifying the mode of operation.
+    */
+    void compute( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out,
+                  cuNDArray<REAL> *dcw, NFFT_comp_mode mode );
+
+    /**
+       Execute an NFFT iteration (from Cartesian image space to non-Cartesian Fourier space and back to Cartesian image space).
+       \param[in] in the input array.
+       \param[out] out the output array.
+       \param[in] dcw optional density compensation weights weighting the input samples according to the sampling density. 
+       If a 0x0 pointer is provided, no density compensation is used.
+       \param[in] halfway_dims specifies the dimensions of the intermediate Fourier space (codomain).
+    */
+    void mult_MH_M( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out,
+                    cuNDArray<REAL> *dcw, std::vector<size_t> halfway_dims );
+  
+  public: // Utilities
+  
+    /**
+       Enum specifying the direction of the NFFT standalone convolution
+    */
+    enum NFFT_conv_mode { 
+      NFFT_CONV_C2NC, /**< convolution: Cartesian to non-Cartesian. */
+      NFFT_CONV_NC2C /**< convolution: non-Cartesian to Cartesian. */
+    };
+    
+    /**
+       Perform "standalone" convolution
+       \param[in] in the input array.
+       \param[out] out the output array.
+       \param[in] dcw optional density compensation weights.
+       \param[in] mode enum specifying the mode of the convolution
+       \param[in] accumulate specifies whether the result is added to the output (accumulation) or if the output is overwritten.
+    */
+    void convolve( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out, cuNDArray<REAL> *dcw,
+                   NFFT_conv_mode mode, bool accumulate = false );
+    
+    /**
+       Enum specifying the direction of the NFFT standalone FFT.
+    */
+    enum NFFT_fft_mode { 
+      NFFT_FORWARDS, /**< forwards FFT. */
+      NFFT_BACKWARDS /**< backwards FFT. */
+    };
+
+    /**
+       Cartesian FFT. For completeness, just invokes the cuNDFFT class.
+       \param[in,out] data the data for the inplace FFT.
+       \param mode enum specifying the direction of the FFT.
+       \param do_scale boolean specifying whether FFT normalization is desired.
+    */
+    void fft( cuNDArray<complext<REAL> > *data, NFFT_fft_mode mode, bool do_scale = true );
+  
+    /**
+       NFFT deapodization.
+       \param[in,out] image the image to be deapodized (inplace).
+    */
+    void deapodize( cuNDArray<complext<REAL> > *image, bool fourier_domain=false);
+
+  public: // Setup queries
+    
+    /**
+       Get the matrix size.
+    */
+    inline typename uint64d<D>::Type get_matrix_size(){
+      return matrix_size;
+    }
+
+    /**
+       Get the oversampled matrix size.
+    */
+    inline typename uint64d<D>::Type get_matrix_size_os(){
+      return matrix_size_os;
+    }
+
+    /**
+       Get the convolution kernel size
+    */
+    inline REAL get_W(){
+      return W;
+    }
+    
+    /**
+       Get the assigned device id
+    */
+    inline unsigned int get_device(){
+      return device;
+    }
+    
+    /**
+       Query whether the plan has been set up
+    */
+    inline bool is_setup(){
+      return initialized;
+    }
+    
+    friend struct _convolve_NFFT_NC2C<REAL,D,ATOMICS>;
+  
+  private: // Internal to the implementation
+
+    // Validate setup / arguments
+    enum NFFT_components { _NFFT_CONV_C2NC = 1, _NFFT_CONV_NC2C = 2, _NFFT_FFT = 4, _NFFT_DEAPODIZATION = 8 };
+    void check_consistency( cuNDArray<complext<REAL> > *samples, cuNDArray<complext<REAL> > *image,
+                            cuNDArray<REAL> *dcw, unsigned char components );
+
+    // Shared barebones constructor
+    void barebones();
+    
+    // Compute beta control parameter for Kaiser-Bessel kernel
+    void compute_beta();
+
+    // Compute deapodization filter
+    boost::shared_ptr<cuNDArray<complext<REAL> > > compute_deapodization_filter(bool FFTed = false);
+
+    // Dedicated computes
+    void compute_NFFT_C2NC( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out );
+    void compute_NFFT_NC2C( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out );
+    void compute_NFFTH_NC2C( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out );
+    void compute_NFFTH_C2NC( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out );
+
+    // Dedicated convolutions
+    void convolve_NFFT_C2NC( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out, bool accumulate );
+    void convolve_NFFT_NC2C( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out, bool accumulate );
+  
+    // Internal utility
+    void image_wrap( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out, bool accumulate );
+
+  private:
+    
+    typename uint64d<D>::Type matrix_size;          // Matrix size
+    typename uint64d<D>::Type matrix_size_os;       // Oversampled matrix size
+    typename uint64d<D>::Type matrix_size_wrap;     // Wrap size at border
+
+    typename reald<REAL,D>::Type alpha;           // Oversampling factor (for each dimension)
+    typename reald<REAL,D>::Type beta;            // Kaiser-Bessel convolution kernel control parameter
+
+    REAL W;                                       // Kernel width in oversampled grid
+
+    unsigned int number_of_samples;               // Number of samples per frame per coil
+    unsigned int number_of_frames;                // Number of frames per reconstruction
+    
+    int device;                                   // Associated device id
+
+    //
+    // Internal data structures for convolution and deapodization
+    //
+
+    boost::shared_ptr< cuNDArray<complext<REAL> > > deapodization_filter; // Inverse Fourier transformed deapodization filter
+
+    boost::shared_ptr< cuNDArray<complext<REAL> > > deapodization_filterFFT; // Fourier transformed deapodization filter
+   
+    thrust::device_vector< typename reald<REAL,D>::Type > *trajectory_positions;
+    thrust::device_vector<unsigned int> *tuples_last;
+    thrust::device_vector<unsigned int> *bucket_begin, *bucket_end;
+
+    //
+    // State variables
+    //
+
+    bool preprocessed_C2NC, preprocessed_NC2C;
+    bool initialized;
+  };
+
+  // Pure virtual class to cause compile errors if you try to use NFFT with double and atomics
+  // - since this is not supported on the device
+  template< unsigned int D> class EXPORTGPUNFFT cuNFFT_plan<double,D,true>{ 
+    virtual void atomics_not_supported_for_type_double() = 0; };
+}
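+
+// ---------------------------------------------------------------------------
+// Illustrative usage sketch (editorial addition, not part of the upstream
+// sources). It strings together the public interface documented above --
+// setup(), preprocess() and compute() -- for a 2D single precision plan.
+// The matrix sizes, kernel width and the way the arrays are filled are
+// placeholders chosen for the example only.
+//
+//   using namespace Gadgetron;
+//
+//   uint64d<2>::Type matrix_size = ...;     // e.g. 192x192 reconstruction grid
+//   uint64d<2>::Type matrix_size_os = ...;  // e.g. 256x256 oversampled grid
+//
+//   cuNFFT_plan<float, 2> plan;
+//   plan.setup( matrix_size, matrix_size_os, 5.5f );   // W = kernel width
+//
+//   cuNDArray<floatd2> trajectory = ...;    // k-space positions in [-1/2; 1/2]
+//   plan.preprocess( &trajectory, cuNFFT_plan<float, 2>::NFFT_PREP_ALL );
+//
+//   cuNDArray<float_complext> image = ...;    // Cartesian image
+//   cuNDArray<float_complext> samples = ...;  // non-Cartesian samples
+//   plan.compute( &image, &samples, 0x0 /* no dcw */,
+//                 cuNFFT_plan<float, 2>::NFFT_FORWARDS_C2NC );
+// ---------------------------------------------------------------------------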
diff --git a/toolboxes/nfft/gpu/cuNFFTOperator.cu b/toolboxes/nfft/gpu/cuNFFTOperator.cu
new file mode 100644
index 0000000..4363e11
--- /dev/null
+++ b/toolboxes/nfft/gpu/cuNFFTOperator.cu
@@ -0,0 +1,118 @@
+#include "cuNFFTOperator.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+
+namespace Gadgetron{
+
+  template<class REAL, unsigned int D, bool ATOMICS> void
+  cuNFFTOperator<REAL,D,ATOMICS>::mult_M( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out, bool accumulate )
+  {
+    if( !in || !out ){
+      throw std::runtime_error("cuNFFTOperator::mult_M : 0x0 input/output not accepted");
+    }
+
+    cuNDArray<complext<REAL> > *tmp_out;
+
+    if( accumulate ){
+      tmp_out = new cuNDArray<complext<REAL> >(out->get_dimensions());
+    }
+    else{
+      tmp_out = out;
+    }
+  
+    plan_->compute( in, tmp_out, dcw_.get(), cuNFFT_plan<REAL,D,ATOMICS>::NFFT_FORWARDS_C2NC );
+
+    if( accumulate ){
+      *out += *tmp_out;
+      delete tmp_out;
+    }
+  }
+
+  template<class REAL, unsigned int D, bool ATOMICS> void
+  cuNFFTOperator<REAL,D,ATOMICS>::mult_MH( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out, bool accumulate )
+  {
+    if( !in || !out ){
+      throw std::runtime_error("cuNFFTOperator::mult_MH : 0x0 input/output not accepted");
+    }
+
+    cuNDArray<complext<REAL> > *tmp_out;
+
+    if( accumulate ){
+      tmp_out = new cuNDArray<complext<REAL> >(out->get_dimensions());
+    }
+    else{
+      tmp_out = out;
+    }
+
+    plan_->compute( in, tmp_out, dcw_.get(), cuNFFT_plan<REAL,D,ATOMICS>::NFFT_BACKWARDS_NC2C );
+    if( accumulate ){
+      *out += *tmp_out;
+      delete tmp_out;
+    }
+  }
+
+  template<class REAL, unsigned int D, bool ATOMICS> void
+  cuNFFTOperator<REAL,D,ATOMICS>::mult_MH_M( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out, bool accumulate )
+  {
+    if( !in || !out ){
+      throw std::runtime_error("cuNFFTOperator::mult_MH_M : 0x0 input/output not accepted");
+    }
+    
+    boost::shared_ptr< std::vector<size_t> > codomain_dims = this->get_codomain_dimensions();
+    if( codomain_dims.get() == 0x0 || codomain_dims->size() == 0 ){
+      throw std::runtime_error("cuNFFTOperator::mult_MH_M : operator codomain dimensions not set");
+    }
+
+    cuNDArray<complext<REAL> > *tmp_out;
+    
+    if( accumulate ){
+      tmp_out = new cuNDArray<complext<REAL> >(out->get_dimensions());
+    }
+    else{
+      tmp_out = out;
+    }
+    
+    plan_->mult_MH_M( in, tmp_out, dcw_.get(), *codomain_dims );
+    
+    if( accumulate ){
+      *out += *tmp_out;
+      delete tmp_out;
+    } 
+  }
+  
+  template<class REAL, unsigned int D, bool ATOMICS> void
+  cuNFFTOperator<REAL,D,ATOMICS>::setup( typename uint64d<D>::Type matrix_size, typename uint64d<D>::Type matrix_size_os, REAL W )
+  {  
+    plan_->setup( matrix_size, matrix_size_os, W );  
+  }
+
+  template<class REAL, unsigned int D, bool ATOMICS> void
+  cuNFFTOperator<REAL,D,ATOMICS>::preprocess( cuNDArray<typename reald<REAL,D>::Type> *trajectory )
+  {
+    if( trajectory == 0x0 ){
+      throw std::runtime_error("cuNFFTOperator::preprocess : 0x0 trajectory provided.");
+    }
+    
+    plan_->preprocess( trajectory, cuNFFT_plan<REAL,D,ATOMICS>::NFFT_PREP_ALL );
+  }
+  
+  //
+  // Instantiations
+  //
+
+  template class EXPORTGPUNFFT cuNFFTOperator<float,1,false>;
+  template class EXPORTGPUNFFT cuNFFTOperator<float,2,false>;
+  template class EXPORTGPUNFFT cuNFFTOperator<float,3,false>;
+  template class EXPORTGPUNFFT cuNFFTOperator<float,4,false>;
+
+  template class EXPORTGPUNFFT cuNFFTOperator<float,1,true>;
+  template class EXPORTGPUNFFT cuNFFTOperator<float,2,true>;
+  template class EXPORTGPUNFFT cuNFFTOperator<float,3,true>;
+  template class EXPORTGPUNFFT cuNFFTOperator<float,4,true>;
+
+  template class EXPORTGPUNFFT cuNFFTOperator<double,1>;
+  template class EXPORTGPUNFFT cuNFFTOperator<double,2>;
+  template class EXPORTGPUNFFT cuNFFTOperator<double,3>;
+  template class EXPORTGPUNFFT cuNFFTOperator<double,4>;
+}
diff --git a/toolboxes/nfft/gpu/cuNFFTOperator.h b/toolboxes/nfft/gpu/cuNFFTOperator.h
new file mode 100644
index 0000000..78ffd3d
--- /dev/null
+++ b/toolboxes/nfft/gpu/cuNFFTOperator.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include "linearOperator.h"
+#include "cuNFFT.h"
+#include "gpunfft_export.h"
+
+namespace Gadgetron{
+
+  template<class REAL, unsigned int D, bool ATOMICS=false> class EXPORTGPUNFFT cuNFFTOperator : public virtual linearOperator<cuNDArray< complext<REAL> > >
+  {  
+  public:
+  
+    cuNFFTOperator() : linearOperator<cuNDArray< complext<REAL> > >() {
+      plan_ = boost::shared_ptr< cuNFFT_plan<REAL, D,ATOMICS> >( new cuNFFT_plan<REAL, D, ATOMICS>() );
+    }
+  
+    virtual ~cuNFFTOperator() {}
+  
+    virtual void set_dcw( boost::shared_ptr< cuNDArray<REAL> > dcw ) { dcw_ = dcw; }
+    inline boost::shared_ptr< cuNDArray<REAL> > get_dcw() { return dcw_; }
+
+    inline boost::shared_ptr< cuNFFT_plan<REAL, D,ATOMICS> > get_plan() { return plan_; }
+  
+    virtual void setup( typename uint64d<D>::Type matrix_size, typename uint64d<D>::Type matrix_size_os, REAL W );
+    virtual void preprocess( cuNDArray<typename reald<REAL,D>::Type> *trajectory );
+
+    virtual void mult_M( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate = false );
+    virtual void mult_MH( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate = false );
+    virtual void mult_MH_M( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate = false );
+
+
+  protected:
+    boost::shared_ptr< cuNFFT_plan<REAL, D,ATOMICS> > plan_;
+    boost::shared_ptr< cuNDArray<REAL> > dcw_;
+  };
+}
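+
+// ---------------------------------------------------------------------------
+// Illustrative usage sketch (editorial addition, not part of the upstream
+// sources). A cuNFFTOperator typically serves as the encoding operator of an
+// iterative solver; its setup mirrors the cuNFFT_plan it wraps. The sizes,
+// kernel width and arrays below are placeholders, and set_codomain_dimensions()
+// is assumed to be inherited from linearOperator (mult_MH_M requires the
+// codomain dimensions to be set).
+//
+//   using namespace Gadgetron;
+//
+//   boost::shared_ptr< cuNFFTOperator<float, 2> > E( new cuNFFTOperator<float, 2>() );
+//   E->setup( matrix_size, matrix_size_os, 5.5f );
+//   E->preprocess( &trajectory );
+//   E->set_dcw( dcw );                                    // optional
+//   E->set_codomain_dimensions( samples.get_dimensions().get() );
+// ---------------------------------------------------------------------------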
diff --git a/toolboxes/nfft/gpu/gpunfft_export.h b/toolboxes/nfft/gpu/gpunfft_export.h
new file mode 100644
index 0000000..28f9752
--- /dev/null
+++ b/toolboxes/nfft/gpu/gpunfft_export.h
@@ -0,0 +1,19 @@
+/** \file gpunfft_export.h
+    \brief Required definitions for Windows, importing/exporting dll symbols 
+*/
+
+#ifndef GPUNFFT_EXPORT_H_
+#define GPUNFFT_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_GPUNFFT__) || defined (gpunfft_EXPORTS)
+#define EXPORTGPUNFFT __declspec(dllexport)
+#else
+#define EXPORTGPUNFFT __declspec(dllimport)
+#endif
+#else
+#define EXPORTGPUNFFT
+#endif
+
+
+#endif /* GPUNFFT_EXPORT_H_ */
diff --git a/toolboxes/operators/CMakeLists.txt b/toolboxes/operators/CMakeLists.txt
new file mode 100644
index 0000000..9ec3318
--- /dev/null
+++ b/toolboxes/operators/CMakeLists.txt
@@ -0,0 +1,32 @@
+include_directories(
+  ${Boost_INCLUDE_DIR}
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  )
+
+install(FILES 	
+  generalOperator.h
+  linearOperator.h
+  identityOperator.h
+  diagonalOperator.h
+  diagonalSumOperator.h
+  encodingOperatorContainer.h
+  multiplicationOperatorContainer.h
+  FFTOperator.h
+  imageOperator.h
+  encodedImageOperator.h
+  partialDerivativeOperator.h
+  convolutionOperator.h
+  laplaceOperator.h
+  downsampleOperator.h
+  upsampleOperator.h
+  tvPicsOperator.h
+  DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
+IF (ARMADILLO_FOUND)
+  add_subdirectory(cpu)
+ENDIF (ARMADILLO_FOUND)
+
+IF (CUDA_FOUND)
+  add_subdirectory(gpu)
+ENDIF (CUDA_FOUND)
diff --git a/toolboxes/operators/FFTOperator.h b/toolboxes/operators/FFTOperator.h
new file mode 100644
index 0000000..73a48bf
--- /dev/null
+++ b/toolboxes/operators/FFTOperator.h
@@ -0,0 +1,76 @@
+/** \file FFTOperator.h
+    \brief Device independent implementation of the FFT operator.
+
+    The file FFTOperator.h is a device independent implementation of an operator performing a Cartesian FFT.
+    To simplify the actual instantiation we refer to 
+    - the class(/file) hoFFTOperator(/.h) for a cpu instantiated operator using the hoNDArray class
+    - the class(/file) cuFFTOperator(/.h) for a gpu instantiated operator using the cuNDArray class
+ */
+
+#pragma once
+
+#include "linearOperator.h"
+
+namespace Gadgetron{
+
+template <class ARRAY_TYPE, class FFT> class FFTOperator : public linearOperator<ARRAY_TYPE>
+{
+public:
+
+	FFTOperator() : linearOperator<ARRAY_TYPE>() {}
+	virtual ~FFTOperator() {}
+
+	virtual void mult_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+	{
+		if( in == 0x0 || out == 0x0 ){
+			throw std::runtime_error("Error: FFTOperator::mult_M(): illegal array pointer provided");
+		}
+
+		if( accumulate ){
+			ARRAY_TYPE tmp(*in);
+			FFT::instance()->fft(&tmp);
+			axpy(ELEMENT_TYPE(sqrt(1.0/tmp.get_number_of_elements())),&tmp,out);
+		}
+		else{
+			*out = *in;
+			FFT::instance()->fft(out);
+			*out *= ELEMENT_TYPE(sqrt(1.0/out->get_number_of_elements()));
+		}
+	}
+
+	virtual void mult_MH( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+	{
+		if( in == 0x0 || out == 0x0 ){
+			throw std::runtime_error("Error: FFTOperator::mult_MH(): illegal array pointer provided");
+		}
+
+		if( accumulate ){
+			ARRAY_TYPE tmp(*in);
+			FFT::instance()->ifft(&tmp,false);
+			axpy(ELEMENT_TYPE(sqrt(1.0/tmp.get_number_of_elements())),&tmp,out);
+			//*out += tmp;
+		}
+		else{
+			*out = *in;
+			FFT::instance()->ifft(out,false);
+			*out *= ELEMENT_TYPE(sqrt(1.0/out->get_number_of_elements()));
+		}
+	}
+
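+	// With the orthonormal sqrt(1/N) scaling applied in mult_M/mult_MH the transform is
+	// intended to be unitary (M^H M == I), so mult_MH_M reduces to a copy (or an add).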
+	virtual void mult_MH_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+	{
+		if( accumulate )
+			*out += *in;
+		else
+			*out = *in;
+	}
+
+	virtual boost::shared_ptr< linearOperator< ARRAY_TYPE > > clone()
+	{
+		return linearOperator<ARRAY_TYPE>::clone(this);
+	}
+private:
+	typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+};
+}
diff --git a/toolboxes/operators/convolutionOperator.h b/toolboxes/operators/convolutionOperator.h
new file mode 100644
index 0000000..6ad6fa0
--- /dev/null
+++ b/toolboxes/operators/convolutionOperator.h
@@ -0,0 +1,220 @@
+/** \file convolutionOperator.h
+    \brief Base class for all convolution operators.
+*/
+
+#pragma once
+
+#include "linearOperator.h"
+#include "vector_td_utilities.h"
+
+#include <boost/shared_ptr.hpp>
+#include <vector>
+
+namespace Gadgetron{
+
+  template <class COMPLEX_ARRAY_TYPE, unsigned int D> class convolutionOperator : public linearOperator<COMPLEX_ARRAY_TYPE>
+  {  
+  protected:
+    typedef typename COMPLEX_ARRAY_TYPE::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL;
+    
+  public:
+    
+    convolutionOperator() : linearOperator<COMPLEX_ARRAY_TYPE>() {}
+    virtual ~convolutionOperator() {}
+    
+    // Set the convolution kernel
+    virtual void set_kernel( COMPLEX_ARRAY_TYPE *image_space_kernel )
+    {     
+      if (!image_space_kernel) throw std::runtime_error("convolutionOperator: null pointer kernel provided");
+      COMPLEX_ARRAY_TYPE *freq_kernel = new COMPLEX_ARRAY_TYPE(*image_space_kernel);
+      operator_fft( true, freq_kernel );
+      kernel_ = boost::shared_ptr<COMPLEX_ARRAY_TYPE>(freq_kernel);
+      
+      COMPLEX_ARRAY_TYPE *freq_kernel_adjoint = new COMPLEX_ARRAY_TYPE(freq_kernel->get_dimensions());      
+      origin_mirror( freq_kernel, freq_kernel_adjoint );
+      adjoint_kernel_ = boost::shared_ptr<COMPLEX_ARRAY_TYPE>(freq_kernel_adjoint);           
+    }
+    
+    // Apply image operators
+    //
+    
+    virtual void mult_MH_M( COMPLEX_ARRAY_TYPE *in, COMPLEX_ARRAY_TYPE *out, bool accumulate = false )
+    {    
+      if( !kernel_.get() ){
+	throw std::runtime_error( "convolutionOperator::mult_MH_M failed : kernel is not set");
+      }
+    
+      if( !in || !out || in->get_number_of_elements() != out->get_number_of_elements() ){
+    	throw std::runtime_error( "convolutionOperator::mult_MH_M failed : in/out image dimensions mismatch");
+      }
+      
+      bool use_oversampling;
+      if( in->get_number_of_elements() == kernel_->get_number_of_elements() )
+	use_oversampling = false;
+      else if( (in->get_number_of_elements()<<D) == kernel_->get_number_of_elements() )
+	use_oversampling = true;
+      else{
+	throw std::runtime_error( "convolutionOperator::mult_MH_M failed : in/out image dimensions mismatch the kernel");
+      }
+      
+      // Intermediate variables
+      COMPLEX_ARRAY_TYPE *tmp_out;
+
+      if( use_oversampling ){
+	boost::shared_ptr< std::vector<size_t> > osdims = kernel_->get_dimensions();
+	tmp_out = new COMPLEX_ARRAY_TYPE(osdims);
+	pad<ELEMENT_TYPE,D>( in, tmp_out );
+      }
+      else if( accumulate ){
+	tmp_out = new COMPLEX_ARRAY_TYPE(*in);
+      }
+      else{ 
+	*out = *in;
+	tmp_out = out;
+      } 
+
+      // Forwards fft
+      operator_fft( true, tmp_out );
+
+      // Multiply
+      *tmp_out *= *kernel_;
+      *tmp_out *= *adjoint_kernel_;
+
+      // Inverse fft
+      operator_fft( false, tmp_out );
+
+      if( use_oversampling ) {
+	operator_crop( tmp_out, out );
+	delete tmp_out;
+      }    
+      else if( accumulate ){
+    	*out += *tmp_out;
+	delete tmp_out;
+      }    
+    }
+    
+  
+    virtual void mult_M( COMPLEX_ARRAY_TYPE *in, COMPLEX_ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( !kernel_.get() ){
+    	throw std::runtime_error("convolutionOperator::mult_M failed : kernel is not set");
+      }
+    
+      if( !in || !out || in->get_number_of_elements() != out->get_number_of_elements() ){
+    	throw std::runtime_error( "convolutionOperator::mult_M failed : in/out image dimensions mismatch");
+      }
+
+      bool use_oversampling;
+      if( in->get_number_of_elements() == kernel_->get_number_of_elements() )
+	use_oversampling = false;
+      else if( (in->get_number_of_elements()<<D) == kernel_->get_number_of_elements() )
+	use_oversampling = true;
+      else{
+	throw std::runtime_error( "convolutionOperator::mult_M failed : in/out image dimensions mismatch the kernel");
+      }
+    
+      // Intermediate variables
+      COMPLEX_ARRAY_TYPE *tmp_out;
+
+      if( use_oversampling ){
+	boost::shared_ptr< std::vector<size_t> > osdims = kernel_->get_dimensions();
+	tmp_out = new COMPLEX_ARRAY_TYPE(osdims);
+	pad<ELEMENT_TYPE,D>( in, tmp_out );
+      }
+      else if( accumulate ){
+	tmp_out = new COMPLEX_ARRAY_TYPE(*in);
+      }
+      else{ 
+	*out = *in;
+	tmp_out = out;
+      } 
+
+      // Forwards fft
+      operator_fft( true, tmp_out );
+
+      // Multiply
+      *tmp_out *= *kernel_;
+ 
+      // Inverse fft
+      operator_fft( false, tmp_out );
+
+      if( use_oversampling ) {
+	operator_crop( tmp_out, out );
+	delete tmp_out;
+      }    
+      else if( accumulate ){
+    	*out += *tmp_out;
+	delete tmp_out;
+      }    
+    }
+  
+    virtual void mult_MH( COMPLEX_ARRAY_TYPE *in, COMPLEX_ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( !adjoint_kernel_.get() ){
+	throw std::runtime_error("convolutionOperator::mult_MH failed : kernel is not set");
+      }
+      
+      if( !in || !out || in->get_number_of_elements() != out->get_number_of_elements() ){
+	throw std::runtime_error("convolutionOperator::mult_MH failed : in/out image dimensions mismatch");
+      }
+
+      bool use_oversampling;
+      if( in->get_number_of_elements() == adjoint_kernel_->get_number_of_elements() )
+	use_oversampling = false;
+      else if( (in->get_number_of_elements()<<D) == adjoint_kernel_->get_number_of_elements() )
+	use_oversampling = true;
+      else{
+    	throw std::runtime_error( "convolutionOperator::mult_MH failed : in/out image dimensions mismatch the kernel");
+      }
+      
+      // Intermediate variables
+      COMPLEX_ARRAY_TYPE *tmp_out;
+
+      if( use_oversampling ){
+	boost::shared_ptr< std::vector<size_t> > osdims = adjoint_kernel_->get_dimensions();
+	tmp_out = new COMPLEX_ARRAY_TYPE(osdims);
+	pad<ELEMENT_TYPE,D>( in, tmp_out );
+      }
+      else if( accumulate ){
+	tmp_out = new COMPLEX_ARRAY_TYPE(*in);
+      }
+      else{ 
+	*out = *in;
+	tmp_out = out;
+      } 
+      
+      // Forwards fft
+      operator_fft( true, tmp_out );
+
+      // Multiply
+      *tmp_out *= *adjoint_kernel_;
+
+      // Inverse fft
+      operator_fft( false, tmp_out );
+
+      if( use_oversampling ) {
+	operator_crop( tmp_out, out );
+	delete tmp_out;
+      }    
+      else if( accumulate ){
+    	*out += *tmp_out;
+	delete tmp_out;
+      }
+    }
+
+  protected:
+  
+    virtual void operator_fft( bool forwards_transform, COMPLEX_ARRAY_TYPE *image ) = 0;    
+    virtual void origin_mirror( COMPLEX_ARRAY_TYPE *in, COMPLEX_ARRAY_TYPE *out ) = 0;
+
+    virtual void operator_crop( COMPLEX_ARRAY_TYPE *in, COMPLEX_ARRAY_TYPE *out ){
+      typename uint64d<D>::Type offset = from_std_vector<size_t,D>(*(in->get_dimensions().get()))>>2;
+      crop<ELEMENT_TYPE,D>( offset, in, out );
+    }
+    
+  private:
+    boost::shared_ptr<COMPLEX_ARRAY_TYPE> kernel_;
+    boost::shared_ptr<COMPLEX_ARRAY_TYPE> adjoint_kernel_;
+  };
+}
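+
+// ---------------------------------------------------------------------------
+// Illustrative note (editorial addition, not part of the upstream sources).
+// convolutionOperator realises circular convolution as an element-wise
+// multiplication with the Fourier transformed kernel; a concrete subclass only
+// has to supply the two device dependent pure virtuals. A hypothetical CPU
+// specialisation (name chosen for the example) could look roughly like this:
+//
+//   template<class T, unsigned int D> class myConvolutionOperator
+//     : public Gadgetron::convolutionOperator< Gadgetron::hoNDArray<T>, D >
+//   {
+//     virtual void operator_fft( bool forwards, Gadgetron::hoNDArray<T> *image )
+//     { /* in-place forward or inverse Cartesian FFT of 'image' */ }
+//
+//     virtual void origin_mirror( Gadgetron::hoNDArray<T> *in, Gadgetron::hoNDArray<T> *out )
+//     { /* roughly out(k) = in(-k), producing the adjoint (correlation) kernel */ }
+//   };
+// ---------------------------------------------------------------------------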
diff --git a/toolboxes/operators/cpu/CMakeLists.txt b/toolboxes/operators/cpu/CMakeLists.txt
new file mode 100644
index 0000000..eea1f2a
--- /dev/null
+++ b/toolboxes/operators/cpu/CMakeLists.txt
@@ -0,0 +1,24 @@
+if (WIN32)
+ADD_DEFINITIONS(-D__BUILD_GADGETRON_CPUOPERATORS__)
+endif (WIN32)
+
+include_directories(
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  )
+
+#target_link_libraries(cpuoperators  
+#  )
+
+#install(TARGETS cpuoperators DESTINATION lib)
+
+install(FILES 	
+  hoIdentityOperator.h
+  hoImageOperator.h
+  hoDiagonalOperator.h
+  hoDiagonalSumOperator.h
+  hoFFTOperator.h
+  hoPartialDerivativeOperator.h
+  hoTvOperator.h
+  hoTvPicsOperator.h
+  DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
diff --git a/toolboxes/operators/cpu/hoDiagonalOperator.h b/toolboxes/operators/cpu/hoDiagonalOperator.h
new file mode 100644
index 0000000..e691094
--- /dev/null
+++ b/toolboxes/operators/cpu/hoDiagonalOperator.h
@@ -0,0 +1,20 @@
+/** \file hoDiagonalOperator.h
+    \brief Diagonal matrix operator, CPU instantiation.
+*/
+
+#pragma once
+
+#include "hoNDArray_operators.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDArray_blas.h"
+#include "diagonalOperator.h"
+
+namespace Gadgetron{
+
+  template <class T> class hoDiagonalOperator : public diagonalOperator< hoNDArray<T> >
+  {
+  public:
+    hoDiagonalOperator() : diagonalOperator< hoNDArray<T> >() {}
+    virtual ~hoDiagonalOperator() {}
+  };
+}
diff --git a/toolboxes/operators/cpu/hoDiagonalSumOperator.h b/toolboxes/operators/cpu/hoDiagonalSumOperator.h
new file mode 100644
index 0000000..a24ed3e
--- /dev/null
+++ b/toolboxes/operators/cpu/hoDiagonalSumOperator.h
@@ -0,0 +1,20 @@
+/** \file hoDiagonalSumOperator.h
+    \brief Sum of diagonal matrices operator, CPU instantiation.
+*/
+
+#pragma once
+
+#include "hoNDArray_operators.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDArray_blas.h"
+#include "diagonalSumOperator.h"
+
+namespace Gadgetron{
+
+  template <class T> class hoDiagonalSumOperator : public diagonalSumOperator< hoNDArray<T> >
+  {
+  public:
+    hoDiagonalSumOperator() : diagonalSumOperator< hoNDArray<T> >() {}
+    virtual ~hoDiagonalSumOperator() {}
+  };
+}
diff --git a/toolboxes/operators/cpu/hoFFTOperator.h b/toolboxes/operators/cpu/hoFFTOperator.h
new file mode 100644
index 0000000..716979c
--- /dev/null
+++ b/toolboxes/operators/cpu/hoFFTOperator.h
@@ -0,0 +1,29 @@
+/** \file hoFFTOperator.h
+    \brief Instantiation of the Cartesian FFT operator on the cpu.
+    
+    The file hoFFTOperator.h is a convenience wrapper for the device independent FFTOperator class.
+    The class hoFFTOperator instantiates the FFTOperator for the hoNDArray< std::complex<T> >
+    and the header furthermore includes additional necessary header files.
+*/
+
+#pragma once
+
+#include "hoNDArray_math.h"
+#include "FFTOperator.h"
+#include "hoFFT.h"
+
+namespace Gadgetron{
+  
+  /** \class hoFFTOperator
+      \brief Instantiation of the Cartesian FFT operator on the cpu.
+      
+      The class hoFFTOperator is a convenience wrapper for the device independent FFTOperator.
+      It instantiates the FFTOperator for type hoNDArray< std::complex<T> >.
+  */
+  template <class T> class hoFFTOperator : public FFTOperator< hoNDArray< std::complex<T> >, hoFFT<T> >
+  {
+  public:    
+    hoFFTOperator() : FFTOperator< hoNDArray< std::complex<T> >, hoFFT<T> >() {}
+    virtual ~hoFFTOperator() {}
+  }; 
+}
diff --git a/toolboxes/operators/cpu/hoIdentityOperator.h b/toolboxes/operators/cpu/hoIdentityOperator.h
new file mode 100644
index 0000000..f9876c9
--- /dev/null
+++ b/toolboxes/operators/cpu/hoIdentityOperator.h
@@ -0,0 +1,28 @@
+/** \file hoIdentityOperator.h
+    \brief Instantiation of the identity operator on the cpu.
+    
+    The file hoIdentityOperator.h is a convenience wrapper for the device independent identityOperator class.
+    The class hoIdentityOperator instantiates the identityOperator for the hoNDArray
+    and the header furthermore includes additional necessary header files.
+*/
+
+#pragma once
+
+#include "hoNDArray_math.h"
+#include "identityOperator.h"
+
+namespace Gadgetron{
+  
+  /** \class hoIdentityOperator
+      \brief Instantiation of the identity operator on the cpu.
+      
+      The class hoIdentityOperator is a convenience wrapper for the device independent identityOperator.
+      hoIdentityOperator instantiates the identityOperator for type hoNDArray<T>.
+  */
+  template <class T> class hoIdentityOperator : public identityOperator< hoNDArray<T> >
+  {
+  public:    
+    hoIdentityOperator() : identityOperator< hoNDArray<T> >() {}
+    virtual ~hoIdentityOperator() {}
+  }; 
+}
diff --git a/toolboxes/operators/cpu/hoImageOperator.h b/toolboxes/operators/cpu/hoImageOperator.h
new file mode 100644
index 0000000..0a74746
--- /dev/null
+++ b/toolboxes/operators/cpu/hoImageOperator.h
@@ -0,0 +1,58 @@
+/** \file hoImageOperator.h
+    \brief Image regularization operator, CPU based.
+*/
+
+#pragma once
+
+#include "hoNDArray_math.h"
+#include "complext.h"
+#include "imageOperator.h"
+
+#include <cmath>
+#include <algorithm>
+
+namespace Gadgetron{
+
+  template <class T> class hoImageOperator : public imageOperator< hoNDArray<typename realType<T>::Type >, hoNDArray<T> >
+  {
+  public:
+
+    hoImageOperator() : imageOperator< hoNDArray<typename realType<T>::Type >, hoNDArray<T> >() {}
+    virtual ~hoImageOperator() {}    
+
+    typedef typename imageOperator< hoNDArray<typename realType<T>::Type>, hoNDArray<T> >::REAL REAL;
+
+    virtual boost::shared_ptr< linearOperator< hoNDArray<T> > > clone() {
+      return linearOperator< hoNDArray<T> >::clone(this);
+    }
+
+  protected:
+
+    // Estimate offset to the regularization image
+    virtual REAL estimate_offset()
+    {
+      // Estimation based on simple histogram analysis:
+      // Returns an estimate of the intensity below which the smallest 1% of the image intensities fall (roughly the 1st percentile).
+      //
+      
+      const unsigned int granularity = 50000; 
+      std::vector<unsigned int> histogram(granularity,0);
+      REAL max_value = this->image_->at(amax(this->image_.get()));
+      REAL *d = this->image_->get_data_ptr();
+
+      for( unsigned int i=0; i<this->image_->get_number_of_elements(); i++) {
+	unsigned int bin = std::min(static_cast<unsigned int>(std::floor((d[i]/max_value)*granularity)), granularity-1);
+	histogram[bin]++;
+      }
+      
+      // Find the 1st percentile
+      //
+
+      unsigned int cumsum = 0, counter = 0;
+      while (cumsum < (unsigned int)(REAL(0.01)*this->image_->get_number_of_elements())) {
+	cumsum += histogram[counter++];
+      }      
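+      // Example (illustrative): with granularity = 50000 and max_value = 1000, if the
+      // 1% mark is first reached when adding bin 24 (so counter == 25 on exit), the
+      // method returns 26*1000/50000 = 0.52, i.e. roughly the 1st-percentile intensity.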
+      return REAL(counter+1)*max_value/granularity;
+    }
+  };
+}
diff --git a/toolboxes/operators/cpu/hoPartialDerivativeOperator.h b/toolboxes/operators/cpu/hoPartialDerivativeOperator.h
new file mode 100644
index 0000000..5345ab1
--- /dev/null
+++ b/toolboxes/operators/cpu/hoPartialDerivativeOperator.h
@@ -0,0 +1,107 @@
+/** \file hoPartialDerivativeOperator.h
+\brief Partial derivative regularization operator, CPU based.
+*/
+
+#pragma once
+
+#include "partialDerivativeOperator.h"
+#include "hoNDArray_math.h"
+#include "vector_td_utilities.h"
+
+#ifdef USE_OMP
+#include <omp.h>
+#endif
+
+namespace Gadgetron{
+
+    /** \class hoPartialDerivativeOperator
+    \brief CPU implementation of device dependent portions of the partialDerivative operator.
+    */
+    template <class T, unsigned int D> class hoPartialDerivativeOperator
+        : public partialDerivativeOperator<D, hoNDArray<T> >
+    {
+    public:
+
+        hoPartialDerivativeOperator() : 
+          partialDerivativeOperator< D, hoNDArray<T> >(0) {}
+
+          hoPartialDerivativeOperator( size_t dimension ) : 
+          partialDerivativeOperator<D, hoNDArray<T> >( dimension ) {}
+
+          virtual ~hoPartialDerivativeOperator() {}
+
+          virtual void compute_partial_derivative( typename int64d<D>::Type stride, hoNDArray<T> *in,
+              hoNDArray<T> *out, bool accumulate )
+          {
+              if( !in || !out || in->get_number_of_elements() != out->get_number_of_elements() ){
+                  throw std::runtime_error( "hoPartialDerivativeOperator::compute_partial_derivative : array dimensions mismatch.");
+              }
+
+              if( in->get_number_of_dimensions() != D || out->get_number_of_dimensions() != D ){
+                  throw std::runtime_error("hoPartialDerivativeOperator::compute_partial_derivative : dimensionality mismatch");
+              }
+
+              typename int64d<D>::Type dims = vector_td<long long,D>( from_std_vector<size_t,D>( *(in->get_dimensions().get()) ));
+
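+              // Periodic forward difference along 'stride':
+              // out[idx] = in[(co + stride) mod dims] - in[co]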
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+      for( long long idx=0; idx<in->get_number_of_elements(); idx++ ) {
+
+                  T valN, valC;
+
+                  typename int64d<D>::Type co = idx_to_co<D>(idx, dims);
+                  typename int64d<D>::Type coN = (co+dims+stride)%dims;
+
+                  valN = in->get_data_ptr()[co_to_idx<D>(coN, dims)];
+                  valC = in->get_data_ptr()[co_to_idx<D>(co, dims)];
+
+                  T val = valN-valC;
+
+                  if( accumulate )
+                      out->get_data_ptr()[idx] += val;
+                  else
+                      out->get_data_ptr()[idx] = val;
+              }
+          }
+
+          virtual void compute_second_order_partial_derivative( typename int64d<D>::Type forwards_stride,
+              typename int64d<D>::Type adjoint_stride, 
+              hoNDArray<T> *in, hoNDArray<T> *out, bool accumulate )
+          {
+              if( !in || !out || in->get_number_of_elements() != out->get_number_of_elements() ){
+                  throw std::runtime_error( "hoPartialDerivativeOperator::compute_second_order_partial_derivative : array dimensions mismatch.");
+              }
+
+              if( in->get_number_of_dimensions() != D || out->get_number_of_dimensions() != D ){
+                  throw std::runtime_error( "hoPartialDerivativeOperator::compute_second_order_partial_derivative : dimensionality mismatch");
+              }
+
+              typename int64d<D>::Type dims = vector_td<long long,D>( from_std_vector<size_t,D>( *(in->get_dimensions().get()) ));
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+      for( long long idx=0; idx<in->get_number_of_elements(); idx++ ) {
+
+                  T valN1, valN2, valC;
+
+                  typename int64d<D>::Type co = idx_to_co<D>(idx, dims);
+                  typename int64d<D>::Type coN1 = (co+dims+forwards_stride)%dims;
+                  typename int64d<D>::Type coN2 = (co+dims+adjoint_stride)%dims;
+
+                  valN1 = in->get_data_ptr()[co_to_idx<D>(coN1, dims)];
+                  valN2 = in->get_data_ptr()[co_to_idx<D>(coN2, dims)];
+                  valC = in->get_data_ptr()[co_to_idx<D>(co, dims)];
+
+                  T val = valC+valC-valN1-valN2;
+
+                  if( accumulate )
+                      out->get_data_ptr()[idx] += val;
+                  else
+                      out->get_data_ptr()[idx] = val;
+              }
+          }
+
+    };
+}
diff --git a/toolboxes/operators/cpu/hoTvOperator.h b/toolboxes/operators/cpu/hoTvOperator.h
new file mode 100644
index 0000000..6b2d609
--- /dev/null
+++ b/toolboxes/operators/cpu/hoTvOperator.h
@@ -0,0 +1,117 @@
+#pragma once
+
+#include "hoNDArray_math.h"
+#include "generalOperator.h"
+
+#include "vector_td_operators.h"
+
+#ifdef USE_OMP
+#include <omp.h>
+#endif
+
+namespace Gadgetron{
+
+template<class T, unsigned int D> class hoTvOperator
+		: public generalOperator< hoNDArray<T> >
+{
+protected:
+	typedef typename realType<T>::Type REAL;
+
+public:
+	hoTvOperator() : generalOperator< hoNDArray<T> >(){
+		limit_ = REAL(1e-8);
+	}
+
+	virtual ~hoTvOperator() {}
+
+	void set_limit(REAL limit){
+		limit_ = limit;
+	}
+
+	virtual void gradient( hoNDArray<T> *in_array, hoNDArray<T> *out_array, bool accumulate=false )
+	{
+		if (in_array->get_number_of_elements() != out_array->get_number_of_elements()){
+			throw std::runtime_error("hoTvOperator: input/output array dimensions mismatch");
+		}
+
+		T* in = in_array->get_data_ptr();
+		T* out = out_array->get_data_ptr();
+
+		vector_td<unsigned int,D> dims = from_std_vector<unsigned int, D>(*(in_array->get_dimensions()));
+
+		if (!accumulate)
+			clear(out_array);
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+		for (int idx=0; idx < in_array->get_number_of_elements(); idx++){
+
+			T xi = in[idx];
+			T result = T(0);
+
+			vector_td<unsigned int,D> co = idx_to_co<D>(idx, dims);
+
+			REAL grad = gradient_(in,dims,co);
+
+			if (grad > limit_) {
+				result += REAL(D)*xi/grad;
+				for (int i = 0; i < D; i++){
+					co[i]+=1;
+					result -= in[co_to_idx<D>((co+dims)%dims,dims)]/grad;
+					co[i]-=1;
+				}
+			}
+
+			for (int i = 0; i < D; i++){
+				co[i]-=1;
+				grad = gradient_(in,dims,co);
+				if (grad > limit_) {
+					result +=(xi-in[co_to_idx<D>((co+dims)%dims,dims)])/grad;
+				}
+				co[i]+=1;
+			}
+			out[idx] += this->weight_*result;
+		}
+	}
+
+
+	virtual REAL magnitude( hoNDArray<T> *in_array )
+	{
+
+		T* in = in_array->get_data_ptr();
+
+		vector_td<unsigned int,D> dims = from_std_vector<unsigned int, D>(*(in_array->get_dimensions()));
+
+		REAL result =0;
+#ifdef USE_OMP
+#pragma omp parallel for reduction(+:result)
+#endif
+		for (int idx=0; idx < in_array->get_number_of_elements(); idx++){
+			vector_td<unsigned int,D> co = idx_to_co<D>(idx, dims);
+			REAL grad = gradient_(in,dims,co);
+			result += this->weight_*grad;
+		}
+
+		return result;
+	}
+
+private:
+
+	REAL inline gradient_(T* in, const vector_td<unsigned int,D> dims, vector_td<unsigned int,D> co)
+	{
+		REAL grad = REAL(0);
+		T xi = in[co_to_idx<D>((co+dims)%dims,dims)];
+		for (int i = 0; i < D; i++){
+			co[i]+=1;
+			T dt = in[co_to_idx<D>((co+dims)%dims,dims)];
+			grad += norm(xi-dt);
+			co[i]-=1;
+		}
+		return std::sqrt(grad);
+	}
+
+protected:
+	REAL limit_;
+};
+}
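+
+// A minimal usage sketch (illustrative only; dimensions and parameter values below
+// are assumptions, not part of the upstream code). hoTvOperator computes the gradient
+// of the isotropic total-variation functional
+//   TV(x) = sum_i sqrt( sum_d |x_i - x_{i+e_d}|^2 )
+// on a periodically extended image, treating gradients below limit_ as flat:
+//
+//   std::vector<size_t> dims(2, 128);        // assumed 128x128 test image
+//   hoNDArray<float> image(&dims), grad(&dims);
+//   // ... fill `image` with pixel data ...
+//
+//   hoTvOperator<float, 2> tv;
+//   tv.set_weight(0.01f);                    // regularization weight
+//   tv.set_limit(1e-8f);                     // guards the division by the local gradient
+//   tv.gradient(&image, &grad);              // grad = weight * dTV/dx
+//   float tv_value = tv.magnitude(&image);   // weighted TV of the image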
diff --git a/toolboxes/operators/cpu/hoTvPicsOperator.h b/toolboxes/operators/cpu/hoTvPicsOperator.h
new file mode 100644
index 0000000..fd5fb6e
--- /dev/null
+++ b/toolboxes/operators/cpu/hoTvPicsOperator.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "hoNDArray_math.h"
+#include "hoTvOperator.h"
+#include "tvPicsOperator.h"
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> class hoTvPicsOperator 
+    : public tvPicsOperator< hoNDArray<T>, hoTvOperator<T,D>, typename realType<T>::Type >
+  {
+  public:
+    hoTvPicsOperator() : tvPicsOperator< hoNDArray<T>, hoTvOperator<T,D>, typename realType<T>::Type >() {}
+    virtual ~hoTvPicsOperator() {}
+  };    
+}
diff --git a/toolboxes/operators/diagonalOperator.h b/toolboxes/operators/diagonalOperator.h
new file mode 100644
index 0000000..5d99a62
--- /dev/null
+++ b/toolboxes/operators/diagonalOperator.h
@@ -0,0 +1,74 @@
+/** \file diagonalOperator.h
+    \brief Base class for the diagonal matrix operators.
+*/
+
+#pragma once
+
+#include "linearOperator.h"
+
+namespace Gadgetron {
+
+  template <class ARRAY_TYPE> class diagonalOperator : public linearOperator<ARRAY_TYPE>
+  {
+  public:
+  
+    diagonalOperator() : linearOperator<ARRAY_TYPE>() {}
+    virtual ~diagonalOperator() {}
+  
+    // Set/get diagonal
+    //
+    
+    virtual void set_diagonal( boost::shared_ptr<ARRAY_TYPE> diagonal ) { 
+      diagonal_ = diagonal;
+      diagonal_conj_ = conj(diagonal.get());
+    }
+
+    virtual boost::shared_ptr<ARRAY_TYPE> get_diagonal() { return diagonal_; }
+  
+    virtual void mult_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( accumulate ) {
+        ARRAY_TYPE tmp(*in);
+        tmp *= *diagonal_;
+        *out += tmp;
+      }
+      else{
+        *out = *in;
+        *out *= *diagonal_;
+      }
+    }
+  
+    virtual void mult_MH( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( accumulate ) {
+        ARRAY_TYPE tmp(*in);
+        tmp *= *diagonal_conj_;
+        *out += tmp;
+      }
+      else{
+        *out = *in;
+        *out *= *diagonal_conj_;
+      }
+    }
+    
+    // Apply diagonal operator (twice)
+    virtual void mult_MH_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate )
+    {    
+      if( accumulate ) {
+        ARRAY_TYPE tmp(*in);
+        tmp *= *diagonal_;
+        tmp *= *diagonal_conj_;
+        *out += tmp;
+      }
+      else{
+        *out = *in;
+        *out *= *diagonal_;
+        *out *= *diagonal_conj_;
+      }
+    }
+  
+  protected:
+    boost::shared_ptr<ARRAY_TYPE> diagonal_;
+    boost::shared_ptr<ARRAY_TYPE> diagonal_conj_;
+  };
+}
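+
+// In terms of the diagonal d, the methods above reduce to elementwise products:
+//   mult_M   : out = d .* in
+//   mult_MH  : out = conj(d) .* in
+//   mult_MH_M: out = conj(d) .* d .* in = |d|^2 .* in
+// A minimal sketch, assuming the GPU instantiation from cuDiagonalOperator.h
+// (array names are illustrative):
+//
+//   cuDiagonalOperator<float_complext> D;
+//   D.set_diagonal(d);        // d: boost::shared_ptr< cuNDArray<float_complext> >
+//   D.mult_M(&x, &y);         // y = d .* x
+//   D.mult_MH_M(&x, &z);      // z = |d|^2 .* x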
diff --git a/toolboxes/operators/diagonalSumOperator.h b/toolboxes/operators/diagonalSumOperator.h
new file mode 100644
index 0000000..d839237
--- /dev/null
+++ b/toolboxes/operators/diagonalSumOperator.h
@@ -0,0 +1,92 @@
+/** \file diagonalSumOperator.h
+    \brief Operator to compute the sum over a set of diagonal matrices times a set of corresponding vectors.
+
+    The domain of this operator is a set of images, the codomain a single image. 
+    The sum is computed over the last dimension of the provided diagonal array.
+*/
+
+#pragma once
+
+#include "diagonalOperator.h"
+
+namespace Gadgetron {
+
+  template <class ARRAY_TYPE> class diagonalSumOperator : public diagonalOperator<ARRAY_TYPE>
+  {
+  public:
+  
+    diagonalSumOperator() : diagonalOperator<ARRAY_TYPE>() {}
+    virtual ~diagonalSumOperator() {}
+  
+    virtual void mult_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( !this->diagonal_ ){
+        throw std::runtime_error("diagonalSumOperator::mult_M failed: diagonal not set");
+      }       
+
+      const unsigned int num_phases = this->diagonal_->get_size(this->diagonal_->get_number_of_dimensions()-1);
+      const unsigned int elements_per_phase = this->diagonal_->get_number_of_elements()/num_phases;
+      
+      if( in->get_number_of_elements() != this->diagonal_->get_number_of_elements() ){
+        throw std::runtime_error("diagonalSumOperator::mult_M failed: array size mismatch between input image and diagonal");
+      }
+
+      if( out->get_number_of_elements() != elements_per_phase ){
+        throw std::runtime_error("diagonalSumOperator::mult_M failed: the output image domain should only be a single image");
+      }
+
+      if( !accumulate ) 
+        clear(out);
+
+      std::vector<size_t> dims = *out->get_dimensions();
+     
+      // Iterate over the last dimension of the provided diagonal image
+      //
+
+      for( unsigned int i=0; i<num_phases; i++ ){
+
+        ARRAY_TYPE tmp_in( &dims, in->get_data_ptr()+i*elements_per_phase );
+        ARRAY_TYPE tmp_diag( &dims, this->diagonal_->get_data_ptr()+i*elements_per_phase );
+
+        if(i==0 && !accumulate){
+          *out = tmp_in;
+          *out *= tmp_diag;
+        }
+        else{
+          ARRAY_TYPE tmp(&tmp_in);
+          tmp *= tmp_diag;
+          *out += tmp;
+        }
+      }
+    }
+    
+    virtual void mult_MH( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( !this->diagonal_conj_ ){
+        throw std::runtime_error("diagonalSumOperator::mult_MH failed: diagonal not set");
+      }       
+
+      const unsigned int num_phases = this->diagonal_conj_->get_size(this->diagonal_conj_->get_number_of_dimensions()-1);
+      const unsigned int elements_per_phase = this->diagonal_conj_->get_number_of_elements()/num_phases;
+      
+      if( in->get_number_of_elements() != elements_per_phase ){
+        throw std::runtime_error("diagonalSumOperator::mult_MH failed: the input image domain should only be a single image");
+      }
+
+      if( out->get_number_of_elements() != this->diagonal_conj_->get_number_of_elements() ){
+        throw std::runtime_error("diagonalSumOperator::mult_MH failed: array size mismatch between output image and diagonal");
+      }
+
+      if( !accumulate ){
+        *out = *this->diagonal_conj_;
+        *out *= *in; // multiplies all phases with the input
+      }
+      else{
+        ARRAY_TYPE tmp(this->diagonal_conj_.get());
+        tmp *= *in; // multiplies all phases with the input
+        *out += tmp;
+      }
+    }
+
+  };
+}
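+
+// Data layout implied by the size checks above, with a diagonal of size [X, Y, N]
+// (N "phases" stacked along the last dimension):
+//   mult_M :  in [X, Y, N] -> out [X, Y],    out   = sum_i d_i .* in_i
+//   mult_MH:  in [X, Y]    -> out [X, Y, N], out_i = conj(d_i) .* in
+// A minimal sketch, assuming the GPU instantiation from cuDiagonalSumOperator.h and
+// interpreting the diagonals as coil sensitivity maps (names are illustrative):
+//
+//   cuDiagonalSumOperator<float_complext> S;
+//   S.set_diagonal(csm);                  // size [X, Y, N]
+//   S.mult_M(&coil_images, &combined);    // combine N coil images into one image
+//   S.mult_MH(&combined, &coil_images);   // spread one image back onto N coils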
diff --git a/toolboxes/operators/downsampleOperator.h b/toolboxes/operators/downsampleOperator.h
new file mode 100644
index 0000000..913864e
--- /dev/null
+++ b/toolboxes/operators/downsampleOperator.h
@@ -0,0 +1,51 @@
+/** \file downsampleOperator.h
+    \brief Base class for the downsampling operators.
+
+    For instantiation we refer to
+    - the class(/file) cuDownsampleOperator(/.h) for a gpu instantiated operator using the cuNDArray class
+*/
+
+#pragma once
+
+#include "linearOperator.h"
+#include "vector_td.h"
+
+namespace Gadgetron{
+  
+  template <class ARRAY_TYPE, unsigned int D> class downsampleOperator
+    : public linearOperator<ARRAY_TYPE>
+  {
+    
+  public:
+
+    typedef typename ARRAY_TYPE::element_type T;
+
+    downsampleOperator() : linearOperator<ARRAY_TYPE>() {}
+    virtual ~downsampleOperator() {}
+    
+    virtual void mult_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( accumulate ){
+        boost::shared_ptr<ARRAY_TYPE> tmp = downsample<T,D>(in);
+        *out += *tmp;
+      }
+      else
+        downsample<T,D>(in,out);
+    }
+    
+    virtual void mult_MH( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( accumulate ){
+        boost::shared_ptr<ARRAY_TYPE> tmp = upsample<T,D>(in);
+        *out += *tmp;
+      }
+      else
+        upsample<T,D>(in,out);
+    }
+
+    virtual boost::shared_ptr< linearOperator< ARRAY_TYPE > > clone()
+    {
+      return linearOperator<ARRAY_TYPE>::clone(this);
+    }    
+  };
+}
diff --git a/toolboxes/operators/encodedImageOperator.h b/toolboxes/operators/encodedImageOperator.h
new file mode 100644
index 0000000..e39fb33
--- /dev/null
+++ b/toolboxes/operators/encodedImageOperator.h
@@ -0,0 +1,48 @@
+/** \file encodedImageOperator.h
+    \brief Regularization operator for encoded images. Careful: this operator only implements mult_MH_M and not (yet) mult_M and mult_MH.
+*/
+
+#pragma once
+
+#include "imageOperator.h"
+
+namespace Gadgetron{
+
+  template <class ARRAY_TYPE_REAL, class ARRAY_TYPE_OPERATOR> class encodedImageOperator
+    : public imageOperator<ARRAY_TYPE_REAL, ARRAY_TYPE_OPERATOR>
+  {
+  
+  public:
+  
+    encodedImageOperator() : imageOperator<ARRAY_TYPE_REAL, ARRAY_TYPE_OPERATOR>() {}
+    virtual ~encodedImageOperator() {}
+ 
+    // Set encoding operator for the regularization image
+    virtual void set_encoding_operator( boost::shared_ptr< linearOperator<ARRAY_TYPE_OPERATOR> > encoding_operator )
+
+    {
+      encoding_operator_ = encoding_operator;
+    }
+  
+    // Apply regularization image operator
+    virtual void mult_MH_M( ARRAY_TYPE_OPERATOR *in, ARRAY_TYPE_OPERATOR *out, bool accumulate = false )
+    {    
+      if( !encoding_operator_.get() ){
+        throw std::runtime_error("encodedImageOperator::mult_MH_M failed : encoding operator not set");
+      }
+    
+      ARRAY_TYPE_OPERATOR tmp(in->get_dimensions());
+
+      encoding_operator_->mult_M( in, &tmp );
+ 
+      ARRAY_TYPE_OPERATOR tmp2(in->get_dimensions());
+
+      imageOperator<ARRAY_TYPE_REAL, ARRAY_TYPE_OPERATOR>::mult_MH_M( &tmp, &tmp2 );
+    
+      encoding_operator_->mult_MH( &tmp2, out, accumulate );
+    }  
+  
+  private:
+    boost::shared_ptr< linearOperator<ARRAY_TYPE_OPERATOR> > encoding_operator_;
+  };
+}
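+
+// Per the composition above, mult_MH_M applies
+//   out = E^H ( R^H R ( E in ) )    (added to out when accumulate is true)
+// where E is the operator passed to set_encoding_operator() and R^H R is the
+// imageOperator regularization acting in the encoded domain. A minimal sketch,
+// assuming a GPU FFT encoding of a complex image (names are illustrative):
+//
+//   boost::shared_ptr< cuFFTOperator<float> > E( new cuFFTOperator<float>() );
+//   encodedImageOperator< cuNDArray<float>, cuNDArray<float_complext> > R;
+//   R.set_encoding_operator(E);
+//   // ... configure the regularization image on the imageOperator base as usual ...
+//   R.mult_MH_M(&x, &y);                  // y = E^H R^H R E x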
diff --git a/toolboxes/operators/encodingOperatorContainer.h b/toolboxes/operators/encodingOperatorContainer.h
new file mode 100644
index 0000000..13bd95d
--- /dev/null
+++ b/toolboxes/operators/encodingOperatorContainer.h
@@ -0,0 +1,231 @@
+/** \file encodingOperatorContainer.h
+    \brief An encoding operator that can contain multiple other encoding operators. Use it when more than one encoding operator is required in a solver.
+*/
+
+#pragma once
+
+#include "linearOperator.h"
+
+#include <iostream>
+#include <vector>
+#include <boost/shared_ptr.hpp>
+#include <sstream>
+#include <stdexcept>
+
+namespace Gadgetron{
+
+  template <class ARRAY_TYPE> class encodingOperatorContainer : public linearOperator<ARRAY_TYPE>
+  {
+  public:
+
+    encodingOperatorContainer() : linearOperator<ARRAY_TYPE>() { num_elements_ = 0; }
+    virtual ~encodingOperatorContainer(){}
+
+    // The domain and codomain dimensions of this container cannot be set. 
+    // They should be set indirectly through the contained operators instead.
+    //
+    virtual void set_domain_dimensions( std::vector<size_t>* ){
+      throw std::runtime_error( "Error: encodingOperatorContainer::set_domain_dimensions() : operation not supported." );
+    }
+    
+    virtual void set_codomain_dimensions( std::vector<size_t>* ){
+      throw std::runtime_error( "Error: encodingOperatorContainer::set_codomain_dimensions() : operation not supported." );
+    }
+    
+    // Get domain and codomain dimensions:
+    // The domain should match between the individual operators.
+    // The codomain is a concatenation of the individual operators' codomains.
+    //
+    virtual boost::shared_ptr< std::vector<size_t> > get_domain_dimensions() 
+    { 
+      if( operators_.size() == 0 ){
+	throw std::runtime_error( "Error: encodingOperatorContainer::get_domain_dimensions() : no operators present." );
+      }
+      
+      boost::shared_ptr< std::vector<size_t> > dims = (operators_[0])->get_domain_dimensions();
+      for( size_t i=1; i<operators_.size(); i++ )
+	if( *dims != *((operators_[i])->get_domain_dimensions()) ){
+	  throw std::runtime_error( "Error: encodingOperatorContainer::get_domain_dimensions() : inconsistent operator dimensions." );
+	}
+      return dims;
+    }
+    
+    virtual boost::shared_ptr< std::vector<size_t> > get_codomain_dimensions() 
+    { 
+      if( num_elements_ == 0 ){
+	throw std::runtime_error( "Error: encodingOperatorContainer::get_codomain_dimensions() : no operators present." );
+      }
+      
+      std::vector<size_t> *dims = new std::vector<size_t>();
+      dims->push_back(num_elements_);
+      return boost::shared_ptr< std::vector<size_t> >(dims);
+    }
+
+    // Get domain and codomain for the individual operators
+    //
+    virtual boost::shared_ptr< std::vector<size_t> > get_domain_dimensions(size_t i) 
+    { 
+      if( i>=operators_.size() )
+	throw std::runtime_error("encodingOperatorContainer::get_domain_dimensions : illegal index provided");
+      return operators_[i]->get_domain_dimensions(); 
+    }
+  
+    virtual boost::shared_ptr< std::vector<size_t> > get_codomain_dimensions(size_t i) 
+    { 
+      if( i>=operators_.size() )
+	throw std::runtime_error("encodingOperatorContainer::get_codomain_dimensions : illegal index provided");
+      return operators_[i]->get_codomain_dimensions(); 
+    }
+  
+    // Allocate an array of the codomain dimensions
+    //
+    boost::shared_ptr< ARRAY_TYPE> create_codomain() 
+    {
+      return boost::shared_ptr<ARRAY_TYPE>(new ARRAY_TYPE(get_codomain_dimensions()));
+    }
+  
+    // Concatenate a vector of codomains into a single array
+    //
+    boost::shared_ptr< ARRAY_TYPE> create_codomain( std::vector<ARRAY_TYPE*> codoms )
+    {
+      if (codoms.size() != operators_.size())
+	throw std::runtime_error("encodingOperatorContainer::create_codomain: number of operators and number of codomains do not match");
+
+      boost::shared_ptr<ARRAY_TYPE> codomain(new ARRAY_TYPE(get_codomain_dimensions()));
+      size_t offset = 0;
+
+      for (size_t i = 0; i < operators_.size(); i++){
+
+	if (!codoms[i]->dimensions_equal(get_codomain_dimensions(i).get())){
+	  std::stringstream ss;
+	  ss << "encodingOperatorContainer::create_codomain: input codomain " << i << " does not match corresponding operator codomain" << std::endl;
+	  ss << "Input codomain: ";
+	  std::vector<size_t> ico = *codoms[i]->get_dimensions();
+	  for (size_t k = 0; k < ico.size(); k++) ss << ico[k] << " ";
+	  ss << std::endl;
+	  ss << "Operator codomain: ";
+	  ico = *get_codomain_dimensions(i);
+	  GDEBUG_STREAM("SIZE: " << ico.size() << std::endl);
+	  for (size_t k = 0; k < ico.size(); k++) ss << ico[k] << " ";
+	  ss << std::endl;
+	  throw std::runtime_error(ss.str());
+	}
+
+	ARRAY_TYPE slice;
+	slice.create(codoms[i]->get_dimensions().get(),codomain->get_data_ptr()+offset);
+	if (codoms[i])
+		slice = *codoms[i];
+	offset += slice.get_number_of_elements();
+      }
+
+      return codomain;    
+    }
+
+    // Get individual operators
+    //
+    boost::shared_ptr< linearOperator<ARRAY_TYPE> > get_operator(size_t i)
+    {
+      if( i>=operators_.size() )
+	throw std::runtime_error("encodingOperatorContainer::get_operator : illegal index provided");
+      return operators_[i];
+    }
+
+    // Get pointer offset into codomain for individual operators "sub-codomains"
+    //
+    size_t get_offset(size_t i)
+    {
+      if( i>=operators_.size() )
+	throw std::runtime_error("encodingOperatorContainer::get_offset : illegal index provided");
+      return offsets_[i];
+    }
+  
+    // Add operator to the container
+    //
+    void add_operator( boost::shared_ptr< linearOperator<ARRAY_TYPE> > op )
+    {
+      boost::shared_ptr< std::vector<size_t> > codomain = op->get_codomain_dimensions();
+      
+      if( codomain->size() == 0 ){
+	throw std::runtime_error("encodingOperatorContainer::add_operator : codomain dimensions not set on operator");
+      }
+
+      size_t elements = 1;
+      for (size_t i=0; i<codomain->size(); i++){
+	elements *= codomain->at(i);
+      }
+    
+      if( elements == 0 ){
+	throw std::runtime_error("encodingOperatorContainer::add_operator : illegal codomain dimensions on operator");
+      }
+
+      if (offsets_.size() == 0){
+	offsets_.push_back(0);
+      } else{
+	offsets_.push_back(num_elements_);
+      }
+
+      num_elements_ += elements;
+      operators_.push_back(op);
+    }
+  
+    virtual void mult_M( ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate = false )
+    {
+      for (size_t i=0; i<operators_.size(); i++){
+	ARRAY_TYPE tmp_data(operators_[i]->get_codomain_dimensions(),out->get_data_ptr()+offsets_[i]);
+	operators_[i]->mult_M( in, &tmp_data, accumulate );
+      }
+    }
+
+    virtual void mult_MH( ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate = false )
+    {
+      ARRAY_TYPE tmp_image(get_domain_dimensions());
+        
+      for (size_t i=0; i<operators_.size(); i++){
+      
+	boost::shared_ptr< linearOperator<ARRAY_TYPE> > op = operators_[i];
+	ARRAY_TYPE tmp_data(op->get_codomain_dimensions(),in->get_data_ptr()+offsets_[i]);
+      
+	// This operator is special in that it needs to apply the "internal" operator weights
+	//
+
+	op->mult_MH( &tmp_data, &tmp_image );
+
+	if( i == 0 && !accumulate ){
+	  *out = tmp_image;
+	  *out *= op->get_weight();
+	}
+	else {
+	  axpy( op->get_weight(), &tmp_image, out );
+	}
+      }
+    }
+  
+    virtual void mult_MH_M( ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate = false )
+    {
+
+      ARRAY_TYPE tmp_image(get_domain_dimensions());
+    
+      for (size_t i=0; i<operators_.size(); i++){
+      
+	boost::shared_ptr< linearOperator<ARRAY_TYPE> > op = operators_[i];
+      
+	// This operator is special in that it needs to apply the "internal" operator weights
+	//
+      
+	op->mult_MH_M( in, &tmp_image );
+	if( i == 0 && !accumulate ){
+	  *out = tmp_image;
+	  *out *= op->get_weight();
+	}
+	else {
+	  axpy( op->get_weight(), &tmp_image, out ) ;
+	}
+      }
+    }
+
+  protected:
+    std::vector< boost::shared_ptr< linearOperator<ARRAY_TYPE> > > operators_;
+    std::vector<size_t> offsets_;
+    size_t num_elements_;
+  };
+}
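+
+// A minimal sketch of the intended use (operator and array names are illustrative):
+// stack a data-fidelity encoding operator and a regularization operator so that a
+// solver only ever sees a single operator with one concatenated codomain.
+//
+//   encodingOperatorContainer< cuNDArray<float_complext> > C;
+//   C.add_operator(E);    // e.g. a Cartesian FFT encoding; codomain = k-space
+//   C.add_operator(R);    // e.g. a partial derivative operator; codomain = image
+//
+//   std::vector< cuNDArray<float_complext>* > codoms;
+//   codoms.push_back(kspace_data.get());
+//   codoms.push_back(zeros.get());        // zero "data" for the regularization term
+//   boost::shared_ptr< cuNDArray<float_complext> > rhs = C.create_codomain(codoms);
+//
+//   C.mult_MH(rhs.get(), &image);         // weighted sum of the individual adjoints
+//
+// Note that add_operator() requires each operator to have its codomain dimensions set.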
diff --git a/toolboxes/operators/generalOperator.h b/toolboxes/operators/generalOperator.h
new file mode 100644
index 0000000..14d8f6c
--- /dev/null
+++ b/toolboxes/operators/generalOperator.h
@@ -0,0 +1,88 @@
+/** \file generalOperator.h
+    \brief Base class for all operators on which we can compute a gradient.
+*/
+
+#pragma once
+
+
+#include "complext.h"
+
+#include <boost/shared_ptr.hpp>
+#include <vector>
+#include <stdexcept>
+
+namespace Gadgetron{
+
+  template <class ARRAY> class generalOperator
+  {
+   public:
+
+    typedef typename ARRAY::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL;
+
+    generalOperator() : weight_(REAL(1)){}
+
+    generalOperator(std::vector<size_t> *dims) : weight_(REAL(1)){
+      set_domain_dimensions(dims);
+    }
+
+    virtual ~generalOperator();
+
+    /**
+     * @brief Calculates the gradient of the operator at point "in"
+     * @param[in] in Input point at which to calculate the gradient
+     * @param[in,out] out Gradient
+     * @param[in] accumulate If false, overrides the output array. Otherwise adds result.
+     */
+    virtual void gradient(ARRAY* in, ARRAY* out, bool accumulate = false ) = 0;
+
+    /**
+     * @brief Calculates the function value of the operator
+     * @param[in] in Point at which to calculate the value
+     * @return Function value at point "in"
+     */
+    virtual REAL magnitude(ARRAY* in)=0;
+
+    /**
+     * Set the domain dimension (image size) of the operator
+     * @param[in] dims Domain dimensions
+     */
+    virtual void set_domain_dimensions( std::vector<size_t> *dims )
+    {
+      if( dims == 0x0 ) throw std::runtime_error("Null pointer provided");
+      domain_dims_ = *dims;  
+    }
+
+    /**
+     *
+     * @return The domain dimensions (image size) of the operator
+     */
+    virtual boost::shared_ptr< std::vector<size_t> > get_domain_dimensions()
+    {
+      std::vector<size_t> *dims = new std::vector<size_t>();
+      *dims = domain_dims_;
+      return boost::shared_ptr< std::vector<size_t> >(dims);
+    }
+
+    /**
+     * Sets the weight of the operator
+     * @param[in] weight
+     */
+    virtual void set_weight( REAL weight ){ weight_ = weight; }
+
+    /**
+     *
+     * @return Weight of the operator
+     */
+    virtual REAL get_weight(){ return weight_; }
+
+  protected:
+    REAL weight_;
+    std::vector<size_t> domain_dims_;
+  };
+
+  template <class ARRAY> 
+  generalOperator<ARRAY>::~generalOperator()
+  {
+  }
+}
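+
+// A minimal sketch of how a solver typically consumes this interface (illustrative
+// only; assumes a concrete operator such as hoTvOperator<float,2>, an existing
+// iterate x, and a fixed step size):
+//
+//   hoTvOperator<float, 2> op;
+//   op.set_weight(0.05f);
+//
+//   hoNDArray<float> g(x.get_dimensions());
+//   for (int it = 0; it < 100; it++) {
+//     op.gradient(&x, &g);       // g = weight * dF/dx at the current iterate
+//     axpy(-0.1f, &g, &x);       // simple gradient step: x <- x - step * g
+//     // op.magnitude(&x) can be monitored for convergence
+//   }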
diff --git a/toolboxes/operators/gpu/CMakeLists.txt b/toolboxes/operators/gpu/CMakeLists.txt
new file mode 100644
index 0000000..8fecf72
--- /dev/null
+++ b/toolboxes/operators/gpu/CMakeLists.txt
@@ -0,0 +1,67 @@
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_GPUOPERATORS__)
+endif (WIN32)
+
+if(WIN32)
+link_directories(${Boost_LIBRARY_DIRS})
+endif(WIN32)
+
+include_directories(
+  ${CUDA_INCLUDE_DIRS}
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/fft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  )
+
+cuda_add_library(gadgetron_toolbox_gpuoperators SHARED 
+  ../generalOperator.h
+  ../linearOperator.h
+  cuPartialDerivativeOperator.h
+  cuLaplaceOperator.h
+  cuTvOperator.h
+  cuTv1dOperator.h
+  cuConvolutionOperator.h
+  cuPartialDerivativeOperator.cu
+  cuPartialDerivativeOperator2.cu
+  cuLaplaceOperator.cu
+  cuTvOperator.cu
+  cuTv1dOperator.cu
+  cuConvolutionOperator.cu
+  )
+
+set_target_properties(gadgetron_toolbox_gpuoperators PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(gadgetron_toolbox_gpuoperators 
+  gadgetron_toolbox_gpucore 
+  gadgetron_toolbox_gpunfft
+  ${Boost_LIBRARIES}
+  ${CUDA_LIBRARIES}
+  ${CUDA_CUBLAS_LIBRARIES} 
+  )
+
+install(TARGETS gadgetron_toolbox_gpuoperators DESTINATION lib COMPONENT main)
+
+install(FILES 
+  cuImageOperator.h
+  cuDiagonalOperator.h
+  cuDiagonalSumOperator.h
+  cuPartialDerivativeOperator.h
+  cuPartialDerivativeOperator2.h
+  cuConvolutionOperator.h
+  cuLaplaceOperator.h
+  cuTvOperator.h
+  cuTvPicsOperator.h
+  cuTv1dOperator.h
+  cuDownsampleOperator.h
+  cuFFTOperator.h
+  cuUpsampleOperator.h
+  hoCuIdentityOperator.h
+  hoCuPartialDerivativeOperator.h
+  hoCuTvOperator.h
+  hoCuTvPicsOperator.h
+  hoCuEncodingOperatorContainer.h
+  gpuoperators_export.h
+  hoCuOperator.h
+  DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
diff --git a/toolboxes/operators/gpu/cuConvolutionOperator.cu b/toolboxes/operators/gpu/cuConvolutionOperator.cu
new file mode 100644
index 0000000..65898c6
--- /dev/null
+++ b/toolboxes/operators/gpu/cuConvolutionOperator.cu
@@ -0,0 +1,89 @@
+#include "cuConvolutionOperator.h"
+#include "vector_td_utilities.h"
+#include "cudaDeviceManager.h"
+#include "setup_grid.h"
+
+namespace Gadgetron {
+
+  // Mirror, but keep the origin unchanged
+  template<class T, unsigned int D> __global__ void
+  origin_mirror_kernel( vector_td<unsigned int,D> matrix_size, vector_td<unsigned int,D> origin, const T * __restrict__ in, T * __restrict__ out, bool zero_fill )
+  {
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+    
+    if( idx < prod(matrix_size) ){
+      
+      vector_td<unsigned int,D> in_co = idx_to_co<D>( idx, matrix_size );
+      vector_td<unsigned int,D> out_co = matrix_size-in_co;
+    
+      bool wrap = false;
+      for( unsigned int d=0; d<D; d++ ){
+	if( out_co.vec[d] == matrix_size.vec[d] ){
+	  out_co.vec[d] = 0;
+	  wrap = true;
+	}
+      }
+    
+      const unsigned int in_idx = co_to_idx<D>(in_co, matrix_size);
+      const unsigned int out_idx = co_to_idx<D>(out_co, matrix_size);
+
+      if( wrap && zero_fill )
+	out[out_idx] = T(0);
+      else
+	out[out_idx] = in[in_idx];
+    }
+  }
+  
+  // Mirror around the origin -- !! leaving the origin unchanged !!
+  // This creates empty space "on the left" that can be filled by zero (default) or the left-over entry.
+  template<class REAL, unsigned int D> void
+  cuConvolutionOperator<REAL,D>::origin_mirror( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out )
+  {
+    if( in == 0x0 || out == 0x0 ){
+      throw std::runtime_error( "origin_mirror: 0x0 ndarray provided");
+    }
+    
+    if( !in->dimensions_equal(out) ){
+      throw std::runtime_error("origin_mirror: image dimensions mismatch");
+    }
+    
+    if( in->get_number_of_dimensions() != D ){
+      std::stringstream ss;
+      ss << "origin_mirror: number of image dimensions is not " << D;
+      throw std::runtime_error(ss.str());
+    }
+
+    typename uint64d<D>::Type matrix_size = from_std_vector<size_t,D>( *in->get_dimensions() );
+  
+    // Setup block/grid dimensions
+    dim3 blockDim; dim3 gridDim;
+    setup_grid( prod(matrix_size), &blockDim, &gridDim );
+
+    // Invoke kernel
+    origin_mirror_kernel<complext<REAL>,D><<< gridDim, blockDim >>> 
+      ( vector_td<unsigned int,D>(matrix_size), vector_td<unsigned int,D>(matrix_size>>1), in->get_data_ptr(), out->get_data_ptr(), true );
+    
+    CHECK_FOR_CUDA_ERROR();
+  }
+
+
+  template <class REAL, unsigned int D> void 
+  cuConvolutionOperator<REAL,D>::operator_fft( bool forwards_transform, cuNDArray< complext<REAL> > *image )
+  {
+    if( forwards_transform )
+      cuNDFFT<REAL>::instance()->fft(image);
+    else
+      cuNDFFT<REAL>::instance()->ifft(image);
+  }    
+  
+  template EXPORTGPUOPERATORS class cuConvolutionOperator<float,1>;
+  template EXPORTGPUOPERATORS class cuConvolutionOperator<float,2>;
+  template EXPORTGPUOPERATORS class cuConvolutionOperator<float,3>;
+  template EXPORTGPUOPERATORS class cuConvolutionOperator<float,4>;
+
+  template EXPORTGPUOPERATORS class cuConvolutionOperator<double,1>;
+  template EXPORTGPUOPERATORS class cuConvolutionOperator<double,2>;
+  template EXPORTGPUOPERATORS class cuConvolutionOperator<double,3>;
+  template EXPORTGPUOPERATORS class cuConvolutionOperator<double,4>;
+  
+}
diff --git a/toolboxes/operators/gpu/cuConvolutionOperator.h b/toolboxes/operators/gpu/cuConvolutionOperator.h
new file mode 100644
index 0000000..f0f18c4
--- /dev/null
+++ b/toolboxes/operators/gpu/cuConvolutionOperator.h
@@ -0,0 +1,28 @@
+/** \file cuConvolutionOperator.h
+    \brief Convolution operator, GPU based.
+*/
+
+#pragma once
+
+#include "gpuoperators_export.h"
+#include "cuNDArray_math.h"
+#include "cuNDFFT.h"
+#include "vector_td_utilities.h"
+#include "convolutionOperator.h"
+
+namespace Gadgetron{
+
+  template <class REAL, unsigned int D> class EXPORTGPUOPERATORS cuConvolutionOperator 
+    : public convolutionOperator<cuNDArray<complext<REAL> >, D >
+  {
+    
+  public:
+  
+    cuConvolutionOperator() : convolutionOperator<cuNDArray<complext<REAL> >, D>() {  }
+    virtual ~cuConvolutionOperator() {}
+        
+    virtual void operator_fft( bool forwards_transform, cuNDArray< complext<REAL> > *image );
+    virtual void origin_mirror( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out );
+    
+  };
+}
diff --git a/toolboxes/operators/gpu/cuDiagonalOperator.h b/toolboxes/operators/gpu/cuDiagonalOperator.h
new file mode 100644
index 0000000..83dadde
--- /dev/null
+++ b/toolboxes/operators/gpu/cuDiagonalOperator.h
@@ -0,0 +1,20 @@
+/** \file cuDiagonalOperator.h
+    \brief Diagonal matrix operator, GPU instantiation.
+*/
+
+#pragma once
+
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "diagonalOperator.h"
+
+namespace Gadgetron{
+
+  template <class T> class cuDiagonalOperator : public diagonalOperator< cuNDArray<T> >
+  {
+  public:
+    cuDiagonalOperator() : diagonalOperator< cuNDArray<T> >() {}
+    virtual ~cuDiagonalOperator() {}
+  };
+}
diff --git a/toolboxes/operators/gpu/cuDiagonalSumOperator.h b/toolboxes/operators/gpu/cuDiagonalSumOperator.h
new file mode 100644
index 0000000..e6900e4
--- /dev/null
+++ b/toolboxes/operators/gpu/cuDiagonalSumOperator.h
@@ -0,0 +1,20 @@
+/** \file cuDiagonalSumOperator.h
+    \brief Sum of diagonal matrices, GPU instantiation.
+*/
+
+#pragma once
+
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "diagonalSumOperator.h"
+
+namespace Gadgetron{
+
+  template <class T> class cuDiagonalSumOperator : public diagonalSumOperator< cuNDArray<T> >
+  {
+  public:
+    cuDiagonalSumOperator() : diagonalSumOperator< cuNDArray<T> >() {}
+    virtual ~cuDiagonalSumOperator() {}
+  };
+}
diff --git a/toolboxes/operators/gpu/cuDownsampleOperator.h b/toolboxes/operators/gpu/cuDownsampleOperator.h
new file mode 100644
index 0000000..57d3912
--- /dev/null
+++ b/toolboxes/operators/gpu/cuDownsampleOperator.h
@@ -0,0 +1,28 @@
+/** \file cuDownsampleOperator.h
+    \brief Instantiation of the downsampling operator on the gpu.
+    
+    The file cuDownsampleOperator.h is a convenience wrapper for the device independent downsampleOperator class.
+    The class cuDownsampleOperator instantiates the downsampleOperator for the cuNDArray
+    and the header furthermore includes additional necessary header files.
+*/
+
+#pragma once
+
+#include "cuNDArray_utils.h"
+#include "downsampleOperator.h"
+
+namespace Gadgetron{
+  
+  /** \class cuDownsampleOperator
+      \brief Instantiation of the downsample operator on the gpu.
+      
+      The class cuDownsampleOperator is a convenience wrapper for the device independent downsampleOperator.
+      cuDownsampleOperator instantiates the downsampleOperator for type cuNDArray<T>.
+  */
+  template <class T, unsigned int D> class cuDownsampleOperator : public downsampleOperator<cuNDArray<T>,D>
+  {
+  public:    
+    cuDownsampleOperator() : downsampleOperator<cuNDArray<T>,D>() {}
+    virtual ~cuDownsampleOperator() {}
+  }; 
+}
diff --git a/toolboxes/operators/gpu/cuFFTOperator.h b/toolboxes/operators/gpu/cuFFTOperator.h
new file mode 100644
index 0000000..5316019
--- /dev/null
+++ b/toolboxes/operators/gpu/cuFFTOperator.h
@@ -0,0 +1,29 @@
+/** \file cuFFTOperator.h
+    \brief Instantiation of the Cartesian FFT operator on the gpu.
+    
+    The file cuFFTOperator.h is a convenience wrapper for the device independent FFTOperator class.
+    The class cuFFTOperator instantiates the FFTOperator for cuNDArray< complext<T> >
+    and the header furthermore includes additional necessary header files.
+*/
+
+#pragma once
+
+#include "cuNDArray_math.h"
+#include "FFTOperator.h"
+#include "cuNDFFT.h"
+
+namespace Gadgetron{
+  
+  /** \class cuFFTOperator
+      \brief Instantiation of the Cartesian FFT operator on the gpu.
+      
+      The class cuFFTOperator is a convenience wrapper for the device independent FFTOperator.
+      It instantiates the FFTOperator for type cuNDArray<T>.
+  */
+  template <class T> class cuFFTOperator : public FFTOperator< cuNDArray< complext<T> >, cuNDFFT<T> >
+  {
+  public:    
+    cuFFTOperator() : FFTOperator< cuNDArray< complext<T> >, cuNDFFT<T> >() {}
+    virtual ~cuFFTOperator() {}
+  }; 
+}
diff --git a/toolboxes/operators/gpu/cuIdentityOperator.h b/toolboxes/operators/gpu/cuIdentityOperator.h
new file mode 100644
index 0000000..57158e2
--- /dev/null
+++ b/toolboxes/operators/gpu/cuIdentityOperator.h
@@ -0,0 +1,28 @@
+/** \file cuIdentityOperator.h
+    \brief Instantiation of the identity operator on the gpu.
+    
+    The file cuIdentityOperator.h is a convenience wrapper for the device independent identityOperator class.
+    The class cuIdentityOperator instantiates the identityOperator for the cuNDArray
+    and the header furthermore includes additional necessary header files.
+*/
+
+#pragma once
+
+#include "cuNDArray_math.h"
+#include "identityOperator.h"
+
+namespace Gadgetron{
+  
+  /** \class cuIdentityOperator
+      \brief Instantiation of the identity operator on the gpu.
+      
+      The class cuIdentityOperator is a convenience wrapper for the device independent identityOperator.
+      cuIdentityOperator instantiates the identityOperator for type cuNDArray<T>.
+  */
+  template <class T> class cuIdentityOperator : public identityOperator< cuNDArray<T> >
+  {
+  public:    
+    cuIdentityOperator() : identityOperator< cuNDArray<T> >() {}
+    virtual ~cuIdentityOperator() {}
+  }; 
+}
diff --git a/toolboxes/operators/gpu/cuImageOperator.h b/toolboxes/operators/gpu/cuImageOperator.h
new file mode 100644
index 0000000..6aa14d4
--- /dev/null
+++ b/toolboxes/operators/gpu/cuImageOperator.h
@@ -0,0 +1,66 @@
+/** \file cuImageOperator.h
+    \brief Image regularization operator, GPU based.
+*/
+
+#pragma once
+
+#include "cuNDArray_math.h"
+#include "complext.h"
+#include "imageOperator.h"
+
+#include <cmath>
+#include <algorithm>
+
+namespace Gadgetron{
+
+  template <class T> class cuImageOperator : public imageOperator< cuNDArray<typename realType<T>::Type >, cuNDArray<T> >
+  {
+  public:
+
+    cuImageOperator() : imageOperator< cuNDArray<typename realType<T>::Type >, cuNDArray<T> >() {}
+    virtual ~cuImageOperator() {}    
+
+    typedef typename imageOperator< cuNDArray<typename realType<T>::Type>, cuNDArray<T> >::REAL REAL;
+
+
+  protected:
+
+    // Windows/CUDA has caused some real issues with defining min
+    // - so for now we resort to defining our own:
+    const unsigned int& my_min(const unsigned int& a, const unsigned int& b) {
+        return (a>b)?b:a;
+    }
+
+    // Estimate offset to the regularization image
+    virtual REAL estimate_offset()
+    {
+      // Estimation based on a simple histogram analysis:
+      // returns an estimate of the "average" intensity of the lowest-intensity fraction (here 1%) of the image.
+      //
+      
+      // This simple code is fast enough (<.5 ms on a 192x192 image) that we can just reuse the hoImageOperator's host code
+      //
+
+      const unsigned int granularity = 50000; 
+      std::vector<unsigned int> histogram(granularity,0);
+      REAL max_value = this->image_->at(amax(this->image_.get()));
+      boost::shared_ptr<hoNDArray<REAL> > tmp = this->image_->to_host();
+      REAL *d = tmp->get_data_ptr();
+      
+      for( unsigned int i=0; i<this->image_->get_number_of_elements(); i++) {
+	unsigned int bin = my_min(static_cast<unsigned int>(std::floor((d[i]/max_value)*granularity)), granularity-1);
+	histogram[bin]++;
+      }
+      
+      // Find the 1st percentile
+      //
+      
+      unsigned int cumsum = 0, counter = 0;
+      while (cumsum < (unsigned int)(REAL(0.01)*this->image_->get_number_of_elements())) {
+	cumsum += histogram[counter++];
+      }      
+
+      return  REAL(counter+1)*max_value/granularity;
+    }
+  };
+}
diff --git a/toolboxes/operators/gpu/cuLaplaceOperator.cu b/toolboxes/operators/gpu/cuLaplaceOperator.cu
new file mode 100644
index 0000000..24ff704
--- /dev/null
+++ b/toolboxes/operators/gpu/cuLaplaceOperator.cu
@@ -0,0 +1,95 @@
+#include "cuLaplaceOperator.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "vector_td.h"
+#include "vector_td_utilities.h"
+#include "check_CUDA.h"
+
+namespace Gadgetron{
+
+  // Template Power function
+  template<unsigned int i, unsigned int j>
+  struct Pow
+  {
+    enum { Value = i*Pow<i,j-1>::Value};
+  };
+
+  template <unsigned int i>
+  struct Pow<i,1>
+  {
+    enum { Value = i};
+  };
+
+  template<class T, unsigned int D, unsigned int dim> class inner_laplace_functor{
+  public:
+		static __device__ __inline__ void apply(T& val,const T* __restrict__ in, const typename intd<D>::Type dims,const typename intd<D>::Type co, typename intd<D>::Type& stride){
+			// visit all three offsets (-1,0,+1) along this dimension before recursing
+			for (int d = -1; d < 2; d++){
+				stride[dim]=d;
+				inner_laplace_functor<T,D,dim-1>::apply(val,in,dims,co,stride);
+			}
+		}
+  };
+  template<class T, unsigned int D> class inner_laplace_functor<T,D,0>{
+  public:
+  	static __device__ __inline__ void apply(T& val,const T* __restrict__ in, const typename intd<D>::Type dims,const typename intd<D>::Type co, typename intd<D>::Type& stride){
+  		typename intd<D>::Type coN = (co+dims+stride)%dims;
+  		val -= in[co_to_idx<D>(coN,dims)];
+  	}
+  };
+
+  template<class REAL, class T, unsigned int D> __global__ void
+  laplace_kernel( typename intd<D>::Type dims, const T * __restrict__ in, T * __restrict__ out )
+  {  
+    const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+    if( idx < prod(dims) ){
+    
+      T val = T(0);
+      typename intd<D>::Type coN;
+
+      typename intd<D>::Type co = idx_to_co<D>(idx, dims);
+
+      typename intd<D>::Type stride(0);
+
+
+      inner_laplace_functor<T,D,D-1>::apply(val,in,dims,co,stride);
+      out[idx] = val+in[co_to_idx<D>(co, dims)]*((REAL) Pow<3,D>::Value);
+    }
+  }
+
+  template< class T, unsigned int D> void
+  cuLaplaceOperator<T,D>::compute_laplace( cuNDArray<T> *in, cuNDArray<T> *out, bool accumulate )
+  {
+  
+    if( !in || !out || in->get_number_of_elements() != out->get_number_of_elements() ){
+      throw std::runtime_error("laplaceOperator::compute_laplace : array dimensions mismatch.");
+
+    }
+  
+    typename intd<D>::Type dims = vector_td<int,D>( from_std_vector<size_t,D>( *(in->get_dimensions().get()) ));
+
+    dim3 dimBlock( dims[0] );
+    dim3 dimGrid( prod(dims)/dims[0] );
+  
+    // Invoke kernel
+    laplace_kernel<typename realType<T>::Type ,T,D><<< dimGrid, dimBlock >>> (dims, in->get_data_ptr(), out->get_data_ptr() );
+  
+    CHECK_FOR_CUDA_ERROR();
+  }
+  
+  // Instantiations
+
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<float, 1>;
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<float, 2>;
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<float, 3>;
+
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<float_complext, 1>;
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<float_complext, 2>;
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<float_complext, 3>;
+
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<double, 1>;
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<double, 2>;
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<double, 3>;
+
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<double_complext, 1>;
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<double_complext, 2>;
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<double_complext, 3>;
+}
diff --git a/toolboxes/operators/gpu/cuLaplaceOperator.h b/toolboxes/operators/gpu/cuLaplaceOperator.h
new file mode 100644
index 0000000..124290d
--- /dev/null
+++ b/toolboxes/operators/gpu/cuLaplaceOperator.h
@@ -0,0 +1,24 @@
+/** \file cuLaplaceOperator.h
+    \brief Laplace regularization operator, GPU based.
+*/
+
+#pragma once
+
+#include "gpuoperators_export.h"
+#include "cuNDArray_math.h"
+#include "laplaceOperator.h"
+
+namespace Gadgetron{
+
+  template < class T, unsigned int D> class EXPORTGPUOPERATORS cuLaplaceOperator : public laplaceOperator<D, cuNDArray<T> >
+  {    
+  public:
+    
+    cuLaplaceOperator() : laplaceOperator< D, cuNDArray<T> >() {}
+    virtual ~cuLaplaceOperator() {}
+    
+
+  protected:
+    virtual void compute_laplace( cuNDArray<T> *in, cuNDArray<T> *out, bool accumulate );    
+  };
+}
diff --git a/toolboxes/operators/gpu/cuMultiplicationOperatorContainer.h b/toolboxes/operators/gpu/cuMultiplicationOperatorContainer.h
new file mode 100644
index 0000000..c2c00b7
--- /dev/null
+++ b/toolboxes/operators/gpu/cuMultiplicationOperatorContainer.h
@@ -0,0 +1,23 @@
+/** \file cuMultiplicationOperatorContainer.h
+    \brief Operator used to chain together (concatenate) a series of operators by multiplication, GPU version.
+*/
+
+#pragma once
+
+#include "cuNDArray_math.h"
+#include "multiplicationOperatorContainer.h"
+
+
+namespace Gadgetron{
+template <class REAL, class T> class cuMultiplicationOperatorContainer 
+  : public multiplicationOperatorContainer< REAL, cuNDArray<T> >
+{
+public:
+  cuMultiplicationOperatorContainer() : multiplicationOperatorContainer< REAL, cuNDArray<T> >() {}
+  virtual ~cuMultiplicationOperatorContainer() {}
+  
+  virtual boost::shared_ptr< linearOperator< REAL, cuNDArray<T> > > clone(){
+    return linearOperator< REAL, cuNDArray<T> >::clone(this);
+  }  
+};
+}
diff --git a/toolboxes/operators/gpu/cuPartialDerivativeOperator.cu b/toolboxes/operators/gpu/cuPartialDerivativeOperator.cu
new file mode 100644
index 0000000..1f2b7d1
--- /dev/null
+++ b/toolboxes/operators/gpu/cuPartialDerivativeOperator.cu
@@ -0,0 +1,145 @@
+/** \file cuPartialDerivativeOperator.cu
+    \brief Implementation of the partial derivative operator for the gpu.
+*/
+
+#include "cuPartialDerivativeOperator.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "vector_td_utilities.h"
+#include "check_CUDA.h"
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> __global__ void
+  first_order_partial_derivative_kernel( typename intd<D>::Type stride, 
+                                         typename intd<D>::Type dims, 
+                                         const T  * __restrict__ in, T * __restrict__ out )
+  {
+    const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+    if( idx < prod(dims) ){
+
+      T valN, valC;
+
+      typename intd<D>::Type co = idx_to_co<D>(idx, dims);
+      typename intd<D>::Type coN = (co+dims+stride)%dims;
+    
+      valN = in[co_to_idx<D>(coN, dims)];
+      valC = in[co_to_idx<D>(co, dims)];
+    
+      T val = valN-valC;
+    
+      out[idx] += val;
+    }
+  }
+
+  template<class T, unsigned int D> __global__ void
+  second_order_partial_derivative_kernel( typename intd<D>::Type forwards_stride, 
+                                          typename intd<D>::Type adjoint_stride, 
+                                          typename intd<D>::Type dims, 
+                                          const T  * __restrict__ in, T * __restrict__ out )
+  {
+    const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+    if( idx < prod(dims) ){
+
+      T valN1, valN2, valC;
+
+      typename intd<D>::Type co = idx_to_co<D>(idx, dims);
+      typename intd<D>::Type coN1 = (co+dims+forwards_stride)%dims;
+      typename intd<D>::Type coN2 = (co+dims+adjoint_stride)%dims;
+    
+      valN1 = in[co_to_idx<D>(coN1, dims)];
+      valN2 = in[co_to_idx<D>(coN2, dims)];
+      valC = in[co_to_idx<D>(co, dims)];
+    
+      T val = valC+valC-valN1-valN2;
+    
+      out[idx] += val;
+    }
+  }
+
+  template< class T, unsigned int D> void
+  cuPartialDerivativeOperator<T,D>::compute_partial_derivative( typename int64d<D>::Type stride,
+                                                                cuNDArray<T> *in, 
+                                                                cuNDArray<T> *out, 
+                                                                bool accumulate )
+  {
+    if( !in || !out || in->get_number_of_elements() != out->get_number_of_elements() ){
+      throw std::runtime_error( "partialDerivativeOperator::compute_partial_derivative : array dimensions mismatch.");
+
+    }
+
+
+    if (!accumulate) clear(out);
+    
+    typename int64d<D>::Type dims = vector_td<long long,D>( from_std_vector<size_t,D>( *(in->get_dimensions().get()) ));
+    dim3 dimBlock( dims.vec[0] );
+    dim3 dimGrid( 1, dims.vec[D-1] );
+  
+    for(int d=1; d<D-1; d++ )
+      dimGrid.x *= dims.vec[d];
+  
+    size_t elements = in->get_number_of_elements();
+
+    // Invoke kernel
+    for (size_t i = 0; i < elements/prod(dims); i++)
+    	first_order_partial_derivative_kernel<T,D><<< dimGrid, dimBlock >>> 
+        ( vector_td<int,D>(stride), vector_td<int,D>(dims),
+          in->get_data_ptr()+i*prod(dims), out->get_data_ptr()+i*prod(dims));
+  
+    CHECK_FOR_CUDA_ERROR();
+  }
+
+  template<class T, unsigned int D> void
+  cuPartialDerivativeOperator<T,D>::compute_second_order_partial_derivative( typename int64d<D>::Type forwards_stride,
+                                                                             typename int64d<D>::Type adjoint_stride, 
+                                                                             cuNDArray<T> *in, cuNDArray<T> *out, 
+                                                                             bool accumulate )
+  {  
+    if( !in || !out || in->get_number_of_elements() != out->get_number_of_elements() ){
+      throw std::runtime_error( "partialDerivativeOperator::compute_second_order_partial_derivative : array dimensions mismatch.");
+    }
+    
+    if (!accumulate) clear(out);
+
+    typename int64d<D>::Type dims = vector_td<long long,D>( from_std_vector<size_t,D>( *(in->get_dimensions().get()) ));
+    dim3 dimBlock( dims.vec[0] );
+    dim3 dimGrid( 1, dims.vec[D-1] );
+  
+    for(int d=1; d<D-1; d++ )
+      dimGrid.x *= dims.vec[d];
+  
+    size_t elements = in->get_number_of_elements();
+
+    // Invoke kernel
+    for (size_t i = 0; i < elements/prod(dims); i++)
+      second_order_partial_derivative_kernel<T,D><<< dimGrid, dimBlock >>> 
+        ( vector_td<int,D>(forwards_stride), vector_td<int,D>(adjoint_stride), vector_td<int,D>(dims),
+          in->get_data_ptr()+i*prod(dims), out->get_data_ptr()+i*prod(dims) );
+    
+    CHECK_FOR_CUDA_ERROR();
+  }
+
+  //
+  // Instantiations
+  //
+
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<float, 1>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<float, 2>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<float, 3>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<float, 4>;
+
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<float_complext, 1>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<float_complext, 2>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<float_complext, 3>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<float_complext, 4>;
+
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<double, 1>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<double, 2>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<double, 3>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<double, 4>;
+
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<double_complext, 1>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<double_complext, 2>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<double_complext, 3>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<double_complext, 4>;
+}
diff --git a/toolboxes/operators/gpu/cuPartialDerivativeOperator.h b/toolboxes/operators/gpu/cuPartialDerivativeOperator.h
new file mode 100644
index 0000000..7725a3c
--- /dev/null
+++ b/toolboxes/operators/gpu/cuPartialDerivativeOperator.h
@@ -0,0 +1,35 @@
+/** \file cuPartialDerivativeOperator.h
+    \brief Partial derivative regularization operator, GPU based.
+*/
+
+#pragma once
+
+#include "gpuoperators_export.h"
+#include "cuNDArray_math.h"
+#include "partialDerivativeOperator.h"
+
+namespace Gadgetron{
+
+  template <class T, unsigned int D> class EXPORTGPUOPERATORS cuPartialDerivativeOperator 
+    : public partialDerivativeOperator<D, cuNDArray<T> >
+  {
+  public:
+    
+    cuPartialDerivativeOperator() : 
+      partialDerivativeOperator< D, cuNDArray<T> >(0) {}
+    
+    cuPartialDerivativeOperator( size_t dimension ) : 
+      partialDerivativeOperator<D, cuNDArray<T> >( dimension ) {}
+    
+    virtual ~cuPartialDerivativeOperator() {}
+    
+    virtual void compute_partial_derivative( typename int64d<D>::Type stride, cuNDArray<T> *in,
+                                             cuNDArray<T> *out, bool accumulate );  
+    
+    virtual void compute_second_order_partial_derivative( typename int64d<D>::Type forwards_stride,
+                                                          typename int64d<D>::Type adjoint_stride, 
+                                                          cuNDArray<T> *in, cuNDArray<T> *out, bool accumulate );  
+    
+
+  };
+}
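+
+// With the stride vector e_d selecting the fixed dimension d, the CUDA kernels in
+// cuPartialDerivativeOperator.cu above compute the periodic forward difference and
+// the corresponding normal-equation stencil:
+//   first order : (D_d x)_i       = x_{i+e_d} - x_i
+//   second order: (D_d^H D_d x)_i = 2 x_i - x_{i+e_d} - x_{i-e_d}
+// A minimal usage sketch (lambda and the array names are illustrative):
+//
+//   cuPartialDerivativeOperator<float_complext, 2> Dy(1);  // derivative along dimension 1
+//   Dy.set_weight(lambda);
+//   Dy.mult_M(&image, &diff);                              // forward differences along y
+//   Dy.mult_MH_M(&image, &lap);                            // second-order stencil along y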
diff --git a/toolboxes/operators/gpu/cuPartialDerivativeOperator2.cu b/toolboxes/operators/gpu/cuPartialDerivativeOperator2.cu
new file mode 100644
index 0000000..53f4273
--- /dev/null
+++ b/toolboxes/operators/gpu/cuPartialDerivativeOperator2.cu
@@ -0,0 +1,217 @@
+/*
+ * cuPartialDerivativeOperator2.cu
+ *
+ *  Created on: Jul 27, 2012
+ *      Author: David C Hansen
+ */
+
+
+#include "cuPartialDerivativeOperator2.h"
+#include "vector_td_utilities.h"
+#include "check_CUDA.h"
+#include "cuNDArray_math.h"
+#define MAX_THREADS_PER_BLOCK 512
+
+using namespace Gadgetron;
+template<class T, unsigned int D> __global__ void
+partial_derivative_kernel2_forwards( typename intd<D>::Type dims,
+					T *in, T *out )
+{
+
+  const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+  if( idx < prod(dims) ){
+
+    T valN1, valC;
+
+    const typename intd<D>::Type co = idx_to_co<D>(idx, dims);
+
+    typename intd<D>::Type coN1 = co;
+    coN1[D-1] +=1;
+	if (co[D-1] == dims[D-1]-1)  coN1[D-1] -= 2;
+
+
+
+    valN1 = in[co_to_idx<D>(coN1, dims)];
+
+    valC = in[idx];
+    out[idx] += valC-valN1;
+  }
+}
+
+template<class T, unsigned int D> __global__ void
+partial_derivative_kernel2_backwards( typename intd<D>::Type dims,
+					T *in, T *out )
+{
+
+  const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+  if( idx < prod(dims) ){
+
+    T valN1, valC;
+
+    const typename intd<D>::Type co = idx_to_co<D>(idx, dims);
+
+    if (co[D-1] == 0) out[idx] += in[idx];
+    else {
+		typename intd<D>::Type coN1 = co;
+		coN1[D-1] -=1;
+
+
+		valN1 = in[co_to_idx<D>(coN1, dims)];
+
+		valC = in[idx];
+		T val = valC-valN1;
+		if (co[D-1]== dims[D-1]-2){
+			coN1[D-1] += 2;
+			val -= in[co_to_idx<D>(coN1, dims)];
+		}
+		out[idx] += val;
+    }
+  }
+}
+
+template<class T, unsigned int D> __global__ void
+second_order_partial_derivative_kernel2( typename intd<D>::Type dims,
+					T *in, T *out )
+{
+  const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+  if( idx < prod(dims) ){
+
+    T valN1, valN2, valC;
+
+    const typename intd<D>::Type co = idx_to_co<D>(idx, dims);
+
+    typename intd<D>::Type coN1 = co;
+    coN1[D-1] +=1;
+	if (co[D-1] == dims[D-1]-1)  coN1[D-1] -= 3;
+    typename intd<D>::Type coN2 = co;
+    coN2[D-1] -=1;
+
+
+
+    valN1 = in[co_to_idx<D>(coN1, dims)];
+    if (co[D-1] == 0)  valN2 = 0;
+    else valN2 = in[co_to_idx<D>(coN2, dims)];
+
+    valC = in[idx];
+    T val;
+
+    if (co[D-1] < dims[D-1]-2 ) val = valC+valC-valN1-valN2;
+    else if (co[D-1] == dims[D-1]-2 ) val =  2*valC+valC-2*valN1-valN2;
+    else val =  2*valC-valN1-2*valN2;
+
+    out[idx] += val;
+  }
+}
+
+
+template< class T, unsigned int D> void
+cuPartialDerivativeOperator2<T,D>::mult_MH_M(cuNDArray<T> *in, cuNDArray<T> *out,
+										bool accumulate )
+{
+
+	cuNDArray<T> tmp = *in;
+	mult_M(in,&tmp,false);
+	mult_MH(&tmp,out,accumulate);
+
+}
+
+template< class T, unsigned int D> void
+cuPartialDerivativeOperator2<T,D>::mult_MH(cuNDArray<T> *in, cuNDArray<T> *out,
+										bool accumulate )
+{
+  if( !in || !out || in->get_number_of_elements() != out->get_number_of_elements() ){
+	  throw std::runtime_error( "partialDerivativeOperator2::mult_MH : array dimensions mismatch." );
+  }
+
+  if( in->get_number_of_dimensions() != D || out->get_number_of_dimensions() != D ){
+	  throw std::runtime_error( "partialDerivativeOperator2::mult_MH  : dimensionality mismatch" );
+  }
+
+  typename uintd<D>::Type _dims = vector_td<unsigned int,D>(from_std_vector<size_t,D>( *(in->get_dimensions().get()) ));
+  typename intd<D>::Type dims;
+  for( unsigned int i=0; i<D; i++ ){
+    dims.vec[i] = (int)_dims.vec[i];
+  }
+
+
+
+  if (!accumulate) clear(out);
+  int threadsPerBlock =std::min(prod(dims),MAX_THREADS_PER_BLOCK);
+  dim3 dimBlock( threadsPerBlock);
+  int totalBlocksPerGrid = (prod(dims)+MAX_THREADS_PER_BLOCK-1)/MAX_THREADS_PER_BLOCK;
+  dim3 dimGrid(totalBlocksPerGrid);
+
+  // Invoke kernel
+  partial_derivative_kernel2_backwards<T,D><<< dimGrid, dimBlock >>> ( dims, in->get_data_ptr(), out->get_data_ptr() );
+
+  CHECK_FOR_CUDA_ERROR();
+
+
+
+}
+
+template< class T, unsigned int D> void
+cuPartialDerivativeOperator2<T,D>::mult_M(cuNDArray<T> *in, cuNDArray<T> *out,
+										bool accumulate )
+{
+  if( !in || !out || in->get_number_of_elements() != out->get_number_of_elements() ){
+    throw std::runtime_error( "partialDerivativeOperator2::mult_M : array dimensions mismatch.");
+
+  }
+
+  if( in->get_number_of_dimensions() != D || out->get_number_of_dimensions() != D ){
+	  throw std::runtime_error( "partialDerivativeOperator2::mult_M  : dimensionality mismatch" );
+  }
+
+  typename uintd<D>::Type _dims = vector_td<unsigned int,D>(from_std_vector<size_t,D>( *(in->get_dimensions().get()) ));
+  typename intd<D>::Type dims;
+  for( unsigned int i=0; i<D; i++ ){
+    dims.vec[i] = (int)_dims.vec[i];
+  }
+
+
+
+  if (!accumulate) clear(out);
+
+  int threadsPerBlock = std::min(prod(dims),MAX_THREADS_PER_BLOCK);
+  dim3 dimBlock( threadsPerBlock);
+  int totalBlocksPerGrid = (prod(dims)+MAX_THREADS_PER_BLOCK-1)/MAX_THREADS_PER_BLOCK;
+  dim3 dimGrid(totalBlocksPerGrid);
+
+  // Invoke kernel
+  partial_derivative_kernel2_forwards<T,D><<< dimGrid, dimBlock >>> ( dims, in->get_data_ptr(), out->get_data_ptr() );
+
+  CHECK_FOR_CUDA_ERROR();
+
+
+
+}
+
+
+
+
+
+template class EXPORTGPUOPERATORS cuPartialDerivativeOperator2<float,1>;
+
+template class EXPORTGPUOPERATORS cuPartialDerivativeOperator2<float,2>;
+template class EXPORTGPUOPERATORS cuPartialDerivativeOperator2<float,3>;
+template class EXPORTGPUOPERATORS cuPartialDerivativeOperator2<float,4>;
+
+template class EXPORTGPUOPERATORS cuPartialDerivativeOperator2<double,1>;
+template class EXPORTGPUOPERATORS cuPartialDerivativeOperator2<double,2>;
+template class EXPORTGPUOPERATORS cuPartialDerivativeOperator2<double,3>;
+template class EXPORTGPUOPERATORS cuPartialDerivativeOperator2<double,4>;
+
+
+template class EXPORTGPUOPERATORS cuPartialDerivativeOperator2<float_complext,1>;
+
+template class EXPORTGPUOPERATORS cuPartialDerivativeOperator2<float_complext,2>;
+template class EXPORTGPUOPERATORS cuPartialDerivativeOperator2<float_complext,3>;
+template class EXPORTGPUOPERATORS cuPartialDerivativeOperator2<float_complext,4>;
+
+
+template class EXPORTGPUOPERATORS cuPartialDerivativeOperator2<double_complext,1>;
+template class EXPORTGPUOPERATORS cuPartialDerivativeOperator2<double_complext,2>;
+template class EXPORTGPUOPERATORS cuPartialDerivativeOperator2<double_complext,3>;
+template class EXPORTGPUOPERATORS cuPartialDerivativeOperator2<double_complext,4>;
+
diff --git a/toolboxes/operators/gpu/cuPartialDerivativeOperator2.h b/toolboxes/operators/gpu/cuPartialDerivativeOperator2.h
new file mode 100644
index 0000000..f491979
--- /dev/null
+++ b/toolboxes/operators/gpu/cuPartialDerivativeOperator2.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include "linearOperator.h"
+#include "vector_td.h"
+#include "cuNDArray.h"
+
+#include "gpuoperators_export.h"
+
+namespace Gadgetron {
+template <class T, unsigned int D> class EXPORTGPUOPERATORS cuPartialDerivativeOperator2
+	: public linearOperator<cuNDArray<T> >
+{
+
+public:
+
+  cuPartialDerivativeOperator2() : linearOperator<cuNDArray<T> >() {}
+  virtual ~cuPartialDerivativeOperator2() {}
+
+  virtual void mult_M( cuNDArray<T> *in,cuNDArray<T> *out, bool accumulate = false );
+
+
+  virtual void mult_MH(cuNDArray<T> *in, cuNDArray<T> *out, bool accumulate = false );
+
+  virtual void mult_MH_M( cuNDArray<T> *in, cuNDArray<T>*out, bool accumulate = false );
+
+};
+}
diff --git a/toolboxes/operators/gpu/cuTv1dOperator.cu b/toolboxes/operators/gpu/cuTv1dOperator.cu
new file mode 100644
index 0000000..ec302c8
--- /dev/null
+++ b/toolboxes/operators/gpu/cuTv1dOperator.cu
@@ -0,0 +1,129 @@
+#include "cuTv1dOperator.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "vector_td_utilities.h"
+#include "complext.h"
+#include "check_CUDA.h"
+#include "cudaDeviceManager.h"
+
+#include <iostream>
+
+using namespace Gadgetron;
+
+template<class REAL, class T, unsigned int D> static inline	__device__ REAL gradient(const T* __restrict__ in, const vector_td<int,D>& dims, vector_td<int,D>& co){
+
+	T xi = in[co_to_idx<D>((co+dims)%dims,dims)];
+
+	co[D-1]+=1;
+	T dt = in[co_to_idx<D>((co+dims)%dims,dims)];
+	REAL grad = norm(xi-dt);
+	co[D-1]-=1;
+
+	return sqrt(grad);
+}
+
+
+template<class REAL, class T, unsigned int D> static __global__ void tvGradient_kernel(const T* __restrict__ in, T* __restrict__ out, const vector_td<int,D> dims,REAL limit,REAL weight){
+	const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+	if( idx < prod(dims) ){
+		T xi = in[idx];
+		T result=T(0);
+
+		vector_td<int,D> co = idx_to_co<D>(idx, dims);
+
+		REAL grad = gradient<REAL,T,D>(in,dims,co);
+
+
+		if (grad > limit) {
+			result += xi/grad;
+
+			co[D-1]+=1;
+			result -= in[co_to_idx<D>((co+dims)%dims,dims)]/grad;
+			co[D-1]-=1;
+
+		}
+
+		co[D-1]-=1;
+		grad = gradient<REAL,T,D>(in,dims,co);
+		if (grad > limit) {
+			result +=(xi-in[co_to_idx<D>((co+dims)%dims,dims)])/grad;
+		}
+		co[D-1]+=1;
+
+		out[idx] += weight*result;
+
+	}
+}
+
+
+template<class T, unsigned int D> void cuTv1DOperator<T,D>::gradient (cuNDArray<T> * in,cuNDArray<T> * out, bool accumulate){
+	if (!accumulate) clear(out);
+
+	const typename intd<D>::Type dims = vector_td<int,D>( from_std_vector<size_t,D>(*(in->get_dimensions())));
+	int elements = in->get_number_of_elements();
+
+	int threadsPerBlock =std::min(prod(dims),cudaDeviceManager::Instance()->max_blockdim());
+	dim3 dimBlock( threadsPerBlock);
+	int totalBlocksPerGrid = (prod(dims)+threadsPerBlock-1)/threadsPerBlock;
+	dim3 dimGrid(totalBlocksPerGrid);
+
+	for (int i =0; i < (elements/prod(dims)); i++){
+		tvGradient_kernel<<<dimGrid,dimBlock>>>(in->get_data_ptr()+i*prod(dims),out->get_data_ptr()+i*prod(dims),dims,limit_,this->weight_);
+	}
+
+
+	cudaDeviceSynchronize();
+	CHECK_FOR_CUDA_ERROR();
+}
+
+template<class REAL, class T, unsigned int D> static __global__ void tvMagnitude_kernel(const T* in,T* out,const vector_td<int,D> dims,REAL limit,REAL weight)
+{
+	const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+	if( idx < prod(dims) ){
+		vector_td<int,D> co = idx_to_co<D>(idx, dims);
+		REAL grad = gradient<REAL,T,D>(in,dims,co);
+		out[idx] = grad*weight;
+	}
+}
+
+template<class T, unsigned int D> typename realType<T>::Type cuTv1DOperator<T,D>::magnitude (cuNDArray<T> * in){
+
+	cuNDArray<T> out(*in);
+	const typename intd<D>::Type dims = vector_td<int,D>( from_std_vector<size_t,D>(*(in->get_dimensions())));
+	int elements = in->get_number_of_elements();
+
+	int threadsPerBlock =std::min(prod(dims),cudaDeviceManager::Instance()->max_blockdim());
+	dim3 dimBlock( threadsPerBlock);
+	int totalBlocksPerGrid = (prod(dims)+threadsPerBlock-1)/threadsPerBlock;
+	dim3 dimGrid(totalBlocksPerGrid);
+
+	for (int i =0; i < (elements/prod(dims)); i++){
+		tvMagnitude_kernel<<<dimGrid,dimBlock>>>(in->get_data_ptr()+i*prod(dims),out.get_data_ptr()+i*prod(dims),dims,limit_,this->weight_);
+	}
+
+
+	cudaDeviceSynchronize();
+	CHECK_FOR_CUDA_ERROR();
+	return asum(&out);
+}
+
+
+template class EXPORTGPUOPERATORS cuTv1DOperator<float,1>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<float,2>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<float,3>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<float,4>;
+
+template class EXPORTGPUOPERATORS cuTv1DOperator<double,1>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<double,2>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<double,3>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<double,4>;
+
+template class EXPORTGPUOPERATORS cuTv1DOperator<float_complext,1>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<float_complext,2>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<float_complext,3>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<float_complext,4>;
+
+template class EXPORTGPUOPERATORS cuTv1DOperator<double_complext,1>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<double_complext,2>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<double_complext,3>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<double_complext,4>;
diff --git a/toolboxes/operators/gpu/cuTv1dOperator.h b/toolboxes/operators/gpu/cuTv1dOperator.h
new file mode 100644
index 0000000..d6663f6
--- /dev/null
+++ b/toolboxes/operators/gpu/cuTv1dOperator.h
@@ -0,0 +1,37 @@
+/** \file cuTv1dOperator.h
+    \brief Total variation regularization operator, GPU based. Optimized 1D version.
+*/
+
+#pragma once
+
+#include "gpuoperators_export.h"
+#include "cuNDArray_math.h"
+#include "generalOperator.h"
+#include "complext.h"
+
+namespace Gadgetron{
+  
+  template<class T, unsigned int D> class EXPORTGPUOPERATORS cuTv1DOperator : public generalOperator< cuNDArray<T> >
+  {    
+
+    
+  public:
+    
+    typedef typename realType<T>::Type REAL;
+    cuTv1DOperator() : generalOperator< cuNDArray<T> >(){
+      limit_ = REAL(1e-8);      
+    }
+    
+    virtual ~cuTv1DOperator(){};
+
+    void set_limit(REAL limit){
+      limit_ = limit;
+    }
+
+    virtual void gradient(cuNDArray<T>*,cuNDArray<T>*, bool accumulate=false);
+    virtual REAL magnitude(cuNDArray<T>*);
+
+  protected:
+    REAL limit_;    
+  };  
+}
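
A hedged usage sketch of the 1D TV operator above; the weight and limit values are illustrative, and set_weight comes from the generalOperator base class (it is forwarded the same way by hoCuTvOperator later in this patch).

    #include "cuTv1dOperator.h"

    using namespace Gadgetron;

    void tv1d_demo(cuNDArray<float>* image, cuNDArray<float>* grad)
    {
      cuTv1DOperator<float, 3> tv;   // TV along the last of three dimensions
      tv.set_weight(0.05f);          // regularization weight (illustrative)
      tv.set_limit(1e-8f);           // guards the division by the local gradient norm
      tv.gradient(image, grad);      // adds weight * dTV/dx into grad (grad is cleared first)
      float tv_value = tv.magnitude(image);   // weighted total variation of the image
      (void) tv_value;
    }
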
diff --git a/toolboxes/operators/gpu/cuTvOperator.cu b/toolboxes/operators/gpu/cuTvOperator.cu
new file mode 100644
index 0000000..a7c91f5
--- /dev/null
+++ b/toolboxes/operators/gpu/cuTvOperator.cu
@@ -0,0 +1,141 @@
+#include "cuTvOperator.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "vector_td_utilities.h"
+#include "complext.h"
+#include <iostream>
+#include "check_CUDA.h"
+#include "cudaDeviceManager.h"
+#include <stdio.h>
+
+using namespace Gadgetron;
+
+template<class REAL, class T, unsigned int D> static inline  __device__ REAL gradient(const T* __restrict__ in, const vector_td<int,D>& dims, vector_td<int,D>& co)
+{
+	REAL grad = REAL(0);
+	T xi = in[co_to_idx<D>((co+dims)%dims,dims)];
+	for (int i = 0; i < D; i++){
+		co[i]+=1;
+		T dt = in[co_to_idx<D>((co+dims)%dims,dims)];
+		grad += norm(xi-dt);
+		co[i]-=1;
+	}
+	return sqrt(grad);
+}
+
+
+template<class REAL, class T, unsigned int D> static __global__ void tvGradient_kernel(const T* __restrict__ in, T* __restrict__ out, const vector_td<int,D> dims,REAL limit,REAL weight)
+{
+	const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+	if( idx < prod(dims) ){
+		T xi = in[idx];
+		T result=T(0);
+
+		vector_td<int,D> co = idx_to_co<D>(idx, dims);
+
+		REAL grad = ::max(gradient<REAL,T,D>(in,dims,co),limit);
+
+		if (grad > limit) {
+			//result += REAL(D)*xi/grad;
+			for (int i = 0; i < D; i++){
+				co[i]+=1;
+				result += (xi-in[co_to_idx<D>((co+dims)%dims,dims)])/grad;
+				co[i]-=1;
+			}
+		}
+
+		for (int i = 0; i < D; i++){
+			co[i]-=1;
+			grad = ::max(gradient<REAL,T,D>(in,dims,co),limit);
+
+			if (grad > limit) {
+				result +=(xi-in[co_to_idx<D>((co+dims)%dims,dims)])/grad;
+			}
+			co[i]+=1;
+		}
+		out[idx] += result*weight;
+	}
+}
+
+
+
+
+
+
+
+
+template<class T, unsigned int D> void cuTvOperator<T,D>::gradient (cuNDArray<T> * in,cuNDArray<T> * out, bool accumulate)
+{
+	if (!accumulate)
+		clear(out);
+
+	const typename intd<D>::Type dims = vector_td<int,D>( from_std_vector<size_t,D>(*(in->get_dimensions())));
+	int elements = in->get_number_of_elements();
+
+	int threadsPerBlock =std::min(prod(dims),256); //Using hardcoded blockSize because we use quite a lot of registers
+
+	dim3 dimBlock( threadsPerBlock);
+	int totalBlocksPerGridx = std::min(std::max(1,prod(dims)/threadsPerBlock),cudaDeviceManager::Instance()->max_griddim());
+	int totalBlocksPerGridy = (prod(dims)-1)/(threadsPerBlock*totalBlocksPerGridx)+1;
+	dim3 dimGrid(totalBlocksPerGridx,totalBlocksPerGridy);
+
+	for (int i =0; i < (elements/prod(dims)); i++){
+		tvGradient_kernel<<<dimGrid,dimBlock>>>(in->get_data_ptr()+i*prod(dims),out->get_data_ptr()+i*prod(dims),dims,limit_,this->weight_);
+	}
+
+	//cudaDeviceSynchronize();
+	//CHECK_FOR_CUDA_ERROR();
+}
+
+template<class REAL, class T, unsigned int D> static __global__ void tvMagnitude_kernel(const  T* __restrict__  in,T* __restrict__ out,const vector_td<int,D> dims,REAL limit,REAL weight)
+{
+	const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+	if( idx < prod(dims) ){
+		vector_td<int,D> co = idx_to_co<D>(idx, dims);
+		REAL grad = gradient<REAL,T,D>(in,dims,co);
+		//out[idx] =  (grad > limit) ? grad*weight : REAL(0);
+		out[idx] = grad*weight;
+	}
+}
+
+
+template<class T, unsigned int D> typename realType<T>::Type cuTvOperator<T,D>::magnitude (cuNDArray<T> * in)
+{
+	cuNDArray<T> out(in->get_dimensions());
+	const typename intd<D>::Type dims = vector_td<int,D>( from_std_vector<size_t,D>(*(in->get_dimensions())));
+	int elements = in->get_number_of_elements();
+
+	int threadsPerBlock =std::min(prod(dims),256); //Using hardcoded blockSize because we use quite a lot of registers
+	dim3 dimBlock( threadsPerBlock);
+	int totalBlocksPerGridx = std::min(std::max(1,prod(dims)/threadsPerBlock),cudaDeviceManager::Instance()->max_griddim());
+	int totalBlocksPerGridy = (prod(dims)-1)/(threadsPerBlock*totalBlocksPerGridx)+1;
+	dim3 dimGrid(totalBlocksPerGridx,totalBlocksPerGridy);
+
+	for (int i =0; i < (elements/prod(dims)); i++){
+		tvMagnitude_kernel<<<dimGrid,dimBlock>>>(in->get_data_ptr()+i*prod(dims),out.get_data_ptr()+i*prod(dims),dims,limit_,this->weight_);
+	}
+
+	//cudaDeviceSynchronize();
+	//CHECK_FOR_CUDA_ERROR();
+	return asum(&out);
+}
+
+template class EXPORTGPUOPERATORS cuTvOperator<float,1>;
+template class EXPORTGPUOPERATORS cuTvOperator<float,2>;
+template class EXPORTGPUOPERATORS cuTvOperator<float,3>;
+template class EXPORTGPUOPERATORS cuTvOperator<float,4>;
+
+template class EXPORTGPUOPERATORS cuTvOperator<double,1>;
+template class EXPORTGPUOPERATORS cuTvOperator<double,2>;
+template class EXPORTGPUOPERATORS cuTvOperator<double,3>;
+template class EXPORTGPUOPERATORS cuTvOperator<double,4>;
+
+template class EXPORTGPUOPERATORS cuTvOperator<float_complext,1>;
+template class EXPORTGPUOPERATORS cuTvOperator<float_complext,2>;
+template class EXPORTGPUOPERATORS cuTvOperator<float_complext,3>;
+template class EXPORTGPUOPERATORS cuTvOperator<float_complext,4>;
+
+template class EXPORTGPUOPERATORS cuTvOperator<double_complext,1>;
+template class EXPORTGPUOPERATORS cuTvOperator<double_complext,2>;
+template class EXPORTGPUOPERATORS cuTvOperator<double_complext,3>;
+template class EXPORTGPUOPERATORS cuTvOperator<double_complext,4>;
diff --git a/toolboxes/operators/gpu/cuTvOperator.h b/toolboxes/operators/gpu/cuTvOperator.h
new file mode 100644
index 0000000..452c09d
--- /dev/null
+++ b/toolboxes/operators/gpu/cuTvOperator.h
@@ -0,0 +1,41 @@
+/** \file cuTvOperator.h
+    \brief Total variation regularization operator, GPU based.
+*/
+
+#pragma once
+
+#include "gpuoperators_export.h"
+#include "cuNDArray_math.h"
+#include "generalOperator.h"
+
+#include "complext.h"
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> class EXPORTGPUOPERATORS cuTvOperator 
+    : public generalOperator<cuNDArray<T> > 
+  {
+
+
+  public:
+    typedef typename realType<T>::Type REAL;
+
+    cuTvOperator() : generalOperator<cuNDArray<T> >(){
+      limit_ = REAL(0);
+    }
+
+    virtual ~cuTvOperator(){};
+
+    void set_limit(REAL limit){
+      limit_ = limit;
+    }
+
+    virtual void gradient(cuNDArray<T>*,cuNDArray<T>*, bool accumulate=false);
+    virtual REAL magnitude(cuNDArray<T>*);
+
+  protected:
+
+  protected:    
+    REAL limit_;
+  };
+}
diff --git a/toolboxes/operators/gpu/cuTvPicsOperator.h b/toolboxes/operators/gpu/cuTvPicsOperator.h
new file mode 100644
index 0000000..f321082
--- /dev/null
+++ b/toolboxes/operators/gpu/cuTvPicsOperator.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "cuNDArray_math.h"
+#include "cuTvOperator.h"
+#include "tvPicsOperator.h"
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> class cuTvPicsOperator 
+    : public tvPicsOperator< cuNDArray<T>, cuTvOperator<T,D>, typename realType<T>::Type >
+  {
+  public:
+    cuTvPicsOperator() : tvPicsOperator< cuNDArray<T>, cuTvOperator<T,D>, typename realType<T>::Type >() {}
+    virtual ~cuTvPicsOperator() {}
+  };    
+}
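
This wrapper only fixes the template arguments of tvPicsOperator (added further down in this patch), which penalizes the total variation of the difference to a prior image. A brief sketch with a hypothetical prior:

    #include "cuTvPicsOperator.h"

    using namespace Gadgetron;

    void pics_demo(cuNDArray<float>* x, boost::shared_ptr<cuNDArray<float> > prior, cuNDArray<float>* grad)
    {
      cuTvPicsOperator<float, 3> op;
      op.set_prior(prior);           // penalty becomes TV(x - prior) instead of TV(x)
      op.set_weight(0.1f);           // illustrative weight
      op.gradient(x, grad, false);   // accumulate has no default argument in tvPicsOperator
    }
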
diff --git a/toolboxes/operators/gpu/cuUpsampleOperator.h b/toolboxes/operators/gpu/cuUpsampleOperator.h
new file mode 100644
index 0000000..06fd812
--- /dev/null
+++ b/toolboxes/operators/gpu/cuUpsampleOperator.h
@@ -0,0 +1,28 @@
+/** \file cuUpsampleOperator.h
+    \brief Instantiation of the upsampling operator on the gpu.
+    
+    The file cuUpsampleOperator.h is a convenience wrapper for the device independent upsampleOperator class.
+    The class cuUpsampleOperator instantiates the upsampleOperator for the cuNDArray
+    and the header furthermore includes additional necessary header files.
+*/
+
+#pragma once
+
+#include "cuNDArray_utils.h"
+#include "upsampleOperator.h"
+
+namespace Gadgetron{
+  
+  /** \class cuUpsampleOperator
+      \brief Instantiation of the upsample operator on the gpu.
+      
+      The class cuUpsampleOperator is a convenience wrapper for the device independent upsampleOperator.
+      cuUpsampleOperator instantiates the upsampleOperator for type cuNDArray<T>.
+  */
+  template <class T, unsigned int D> class cuUpsampleOperator : public upsampleOperator<cuNDArray<T>, D>
+  {
+  public:    
+    cuUpsampleOperator() : upsampleOperator<cuNDArray<T>,D>() {}
+    virtual ~cuUpsampleOperator() {}
+  }; 
+}
diff --git a/toolboxes/operators/gpu/gpuoperators_export.h b/toolboxes/operators/gpu/gpuoperators_export.h
new file mode 100644
index 0000000..0a9622b
--- /dev/null
+++ b/toolboxes/operators/gpu/gpuoperators_export.h
@@ -0,0 +1,18 @@
+/** \file gpuoperators_export.h
+    \brief Required definitions for Windows, importing/exporting dll symbols 
+*/
+
+#ifndef GPUOPERATORS_EXPORT_H_
+#define GPUOPERATORS_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_GPUOPERATORS__)
+#define EXPORTGPUOPERATORS __declspec(dllexport)
+#else
+#define EXPORTGPUOPERATORS __declspec(dllimport)
+#endif
+#else
+#define EXPORTGPUOPERATORS
+#endif
+
+#endif /* GPUOPERATORS_EXPORT_H_ */
diff --git a/toolboxes/operators/gpu/hoCuDiagonalOperator.h b/toolboxes/operators/gpu/hoCuDiagonalOperator.h
new file mode 100644
index 0000000..e262cf1
--- /dev/null
+++ b/toolboxes/operators/gpu/hoCuDiagonalOperator.h
@@ -0,0 +1,20 @@
+/** \file hoCuDiagonalOperator.h
+    \brief Diagonal matrix regularization operator for array type hoCuNDarray
+*/
+
+#pragma once
+
+#include "hoCuNDArray_operators.h"
+#include "hoCuNDArray_elemwise.h"
+#include "hoCuNDArray_blas.h"
+#include "diagonalOperator.h"
+
+namespace Gadgetron{
+
+  template <class T> class hoCuDiagonalOperator : public diagonalOperator< hoCuNDArray<T> >
+  {
+  public:
+    hoCuDiagonalOperator() : diagonalOperator< hoCuNDArray<T> >() {}
+    virtual ~hoCuDiagonalOperator() {}
+  };
+}
diff --git a/toolboxes/operators/gpu/hoCuEncodingOperatorContainer.h b/toolboxes/operators/gpu/hoCuEncodingOperatorContainer.h
new file mode 100644
index 0000000..0cf3890
--- /dev/null
+++ b/toolboxes/operators/gpu/hoCuEncodingOperatorContainer.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "hoCuNDArray.h"
+#include "hoCuNDArray_operators.h"
+#include "hoCuNDArray_elemwise.h"
+#include "hoCuNDArray_blas.h"
+#include "encodingOperatorContainer.h"
+
+namespace Gadgetron{
+  
+  template<class T> class hoCuEncodingOperatorContainer 
+    : public encodingOperatorContainer< hoCuNDArray<T> >
+  {
+  public:
+    hoCuEncodingOperatorContainer() : encodingOperatorContainer< hoCuNDArray<T> >() {}
+    virtual ~hoCuEncodingOperatorContainer() {}
+    
+    virtual boost::shared_ptr< linearOperator< hoCuNDArray<T> > > clone(){
+      return linearOperator< hoCuNDArray<T> >::clone(this);
+    }  
+  }; 
+}
diff --git a/toolboxes/operators/gpu/hoCuIdentityOperator.h b/toolboxes/operators/gpu/hoCuIdentityOperator.h
new file mode 100644
index 0000000..40d7a0b
--- /dev/null
+++ b/toolboxes/operators/gpu/hoCuIdentityOperator.h
@@ -0,0 +1,28 @@
+/** \file hoCuIdentityOperator.h
+    \brief Instantiation of the identity operator for array type hoCuNDArray
+    
+    The file hoCuIdentityOperator.h is a convenience wrapper for the device independent identityOperator class.
+    The class hoCuIdentityOperator instantiates the identityOperator for the hoCuNDArray
+    and the header furthermore includes additional necessary header files.
+*/
+
+#pragma once
+
+#include "hoCuNDArray_math.h"
+#include "identityOperator.h"
+
+namespace Gadgetron{
+  
+  /** \class hoCuIdentityOperator
+      \brief Instantiation of the identity operator for array type hoCuNDArray
+      
+      The class hoCuIdentityOperator is a convenience wrapper for the device independent identityOperator.
+      hoCuIdentityOperator instantiates the identityOperator for type hoCuNDArray<T>.
+  */
+  template <class T> class hoCuIdentityOperator : public identityOperator< hoCuNDArray<T> >
+  {
+  public:    
+    hoCuIdentityOperator() : identityOperator< hoCuNDArray<T> >() {}
+    virtual ~hoCuIdentityOperator() {}
+  }; 
+}
diff --git a/toolboxes/operators/gpu/hoCuOperator.h b/toolboxes/operators/gpu/hoCuOperator.h
new file mode 100644
index 0000000..d22025c
--- /dev/null
+++ b/toolboxes/operators/gpu/hoCuOperator.h
@@ -0,0 +1,55 @@
+#pragma once
+#include "hoCuNDArray_math.h"
+#include "linearOperator.h"
+
+#include <boost/shared_ptr.hpp>
+namespace Gadgetron{
+
+
+template<class T > class hoCuOperator : public linearOperator<hoCuNDArray<T> > {
+
+	public:
+		hoCuOperator(){};
+		hoCuOperator(boost::shared_ptr<linearOperator<hoNDArray<T> > > _op): op(_op) {};
+		virtual ~hoCuOperator(){};
+
+		virtual void mult_M(hoCuNDArray<T>* in, hoCuNDArray<T>* out, bool accumulate=false){
+			op->mult_M(in,out,accumulate);
+		}
+		virtual void mult_MH(hoCuNDArray<T>* in, hoCuNDArray<T>* out, bool accumulate=false){
+			op->mult_MH(in,out,accumulate);
+		}
+
+		virtual void gradient(hoCuNDArray<T>* in, hoCuNDArray<T>* out, bool accumulate=false){
+			op->gradient(in,out,accumulate);
+		}
+		virtual void mult_MH_M(hoCuNDArray<T>* in, hoCuNDArray<T>* out, bool accumulate=false){
+			op->mult_MH_M(in,out,accumulate);
+		}
+
+		virtual boost::shared_ptr< linearOperator< hoCuNDArray<T> > > clone() {
+			return linearOperator< hoCuNDArray<T> >::clone(this);
+		}
+		virtual boost::shared_ptr< std::vector<size_t> > get_codomain_dimensions(){
+			return op->get_codomain_dimensions();
+		}
+		virtual boost::shared_ptr< std::vector<size_t> > get_domain_dimensions(){
+			return op->get_domain_dimensions();
+		}
+		virtual void set_weight( typename realType<T>::Type weight ){ op->set_weight(weight); };
+		virtual typename realType<T>::Type get_weight(){ return op->get_weight(); };
+		virtual void set_codomain_dimensions( std::vector<size_t> *dims ){
+			op->set_codomain_dimensions(dims);
+		}
+		virtual void set_domain_dimensions( std::vector<size_t> *dims ){
+			op->set_domain_dimensions(dims);
+		}
+	private:
+	 boost::shared_ptr<linearOperator<hoNDArray<T> > > op;
+};
+
+template<class T> boost::shared_ptr<linearOperator<hoCuNDArray<T> > > to_hoCu(boost::shared_ptr<linearOperator<hoNDArray<T> > > _op){
+	return boost::shared_ptr<hoCuOperator<T> > (new hoCuOperator<T>(_op));
+}
+
+}
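
hoCuOperator forwards every call to a wrapped hoNDArray-based operator, so existing CPU operators can be reused where an hoCuNDArray operator is expected. A minimal sketch of the to_hoCu helper; hoPartialDerivativeOperator is assumed available here (it is referenced later in this patch by hoCuPartialDerivativeOperator.h).

    #include "hoCuOperator.h"
    #include "hoPartialDerivativeOperator.h"

    using namespace Gadgetron;

    void wrap_demo()
    {
      boost::shared_ptr<linearOperator<hoNDArray<float> > > cpu_op(
          new hoPartialDerivativeOperator<float, 2>(0));   // derivative along dimension 0

      // The wrapper forwards mult_M/mult_MH/mult_MH_M/gradient to cpu_op.
      boost::shared_ptr<linearOperator<hoCuNDArray<float> > > adapted = to_hoCu(cpu_op);
      (void) adapted;
    }
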
diff --git a/toolboxes/operators/gpu/hoCuPartialDerivativeOperator.h b/toolboxes/operators/gpu/hoCuPartialDerivativeOperator.h
new file mode 100644
index 0000000..4c2f2ff
--- /dev/null
+++ b/toolboxes/operators/gpu/hoCuPartialDerivativeOperator.h
@@ -0,0 +1,90 @@
+#pragma once
+
+#include "partialDerivativeOperator.h"
+#include "cuPartialDerivativeOperator.h"
+#include "hoCuNDArray.h"
+#include "cudaDeviceManager.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+
+#include "hoPartialDerivativeOperator.h"
+
+namespace Gadgetron{
+
+  template <class T, unsigned int D> class hoCuPartialDerivativeOperator :
+    public linearOperator<hoCuNDArray<T> >
+  {
+  public: 
+  
+    hoCuPartialDerivativeOperator() : 
+      linearOperator<hoCuNDArray<T> >(),dev(),hoDev(),_dimension(0) {}
+  
+    hoCuPartialDerivativeOperator( unsigned int dimension ) : 
+      linearOperator<hoCuNDArray<T> >(),dev(dimension),hoDev(dimension), _dimension(dimension){ }
+
+    virtual ~hoCuPartialDerivativeOperator() {}
+
+    //TODO: Generalize to work if we can fit just a single dimension on the GPU
+    virtual void mult_M(hoCuNDArray<T>* in, hoCuNDArray<T>* out, bool accumulate)
+    {
+      size_t free = cudaDeviceManager::Instance()->getFreeMemory();
+
+      if( free/sizeof(T) < in->get_number_of_elements()*2)
+	throw std::runtime_error("hoCuPartialDerivativeOperator: not enough device memory");
+      cuNDArray<T> cuIn(in);
+      cuNDArray<T> cuOut(out->get_dimensions());
+
+      if (accumulate) cuOut =cuNDArray<T>(out);
+
+      dev.mult_M(&cuIn,&cuOut,accumulate);
+
+      cudaMemcpy(out->get_data_ptr(),cuOut.get_data_ptr(),out->get_number_of_elements()*sizeof(T),cudaMemcpyDeviceToHost);
+    	//hoDev.mult_M(in,out,accumulate);
+    }
+
+    //TODO: Generalize to work if we can fit just a single dimension on the GPU
+    virtual void mult_MH(hoCuNDArray<T>* in, hoCuNDArray<T>* out, bool accumulate)
+    {
+
+      size_t free = cudaDeviceManager::Instance()->getFreeMemory();
+
+      if( free/sizeof(T) < in->get_number_of_elements()*2)
+	throw std::runtime_error("hoCuPartialDerivativeOperator: not enough device memory");
+      cuNDArray<T> cuIn(in);
+      cuNDArray<T> cuOut(out->get_dimensions());
+
+      if (accumulate) cuOut =cuNDArray<T>(out);
+
+      dev.mult_MH(&cuIn,&cuOut,accumulate);
+
+      cudaMemcpy(out->get_data_ptr(),cuOut.get_data_ptr(),out->get_number_of_elements()*sizeof(T),cudaMemcpyDeviceToHost);
+
+    	//hoDev.mult_MH(in,out,accumulate);
+    }
+
+    //TODO: Generalize to work if we can fit just a single dimension on the GPU
+    virtual void mult_MH_M(hoCuNDArray<T>* in, hoCuNDArray<T>* out, bool accumulate)
+    {
+
+      size_t free = cudaDeviceManager::Instance()->getFreeMemory();
+
+      if( free/sizeof(T) < in->get_number_of_elements()*2)
+	throw std::runtime_error("hoCuPartialDerivativeOperator: not enough device memory");
+      cuNDArray<T> cuIn(in);
+      cuNDArray<T> cuOut(out->get_dimensions());
+
+      if (accumulate) cuOut =cuNDArray<T>(out);
+
+      dev.mult_MH_M(&cuIn,&cuOut,accumulate);
+
+      cudaMemcpy(out->get_data_ptr(),cuOut.get_data_ptr(),out->get_number_of_elements()*sizeof(T),cudaMemcpyDeviceToHost);
+
+    	//hoDev.mult_MH_M(in,out,accumulate);
+    }
+
+  protected:
+    cuPartialDerivativeOperator<T,D> dev;
+    hoPartialDerivativeOperator<T,D> hoDev;
+    unsigned int _dimension;
+  };
+}
diff --git a/toolboxes/operators/gpu/hoCuTvOperator.h b/toolboxes/operators/gpu/hoCuTvOperator.h
new file mode 100644
index 0000000..5dd1b3d
--- /dev/null
+++ b/toolboxes/operators/gpu/hoCuTvOperator.h
@@ -0,0 +1,84 @@
+#pragma once
+
+#include "hoCuNDArray_math.h"
+#include "generalOperator.h"
+#include "hoCuNDArray.h"
+#include "cuTvOperator.h"
+#include "vector_td_utilities.h"
+
+namespace Gadgetron{
+
+template<class T, size_t D> class hoCuTvOperator :
+public generalOperator< hoCuNDArray<T> >
+{
+
+protected:
+	typedef typename realType<T>::Type REAL;
+
+public:
+
+	hoCuTvOperator() : generalOperator< hoCuNDArray<T> >(){
+		limit_ = REAL(1e-8);
+		cuTV.set_limit(limit_);
+	}
+
+	virtual ~hoCuTvOperator(){}
+
+	void set_limit( REAL limit ){
+		limit_ = limit;
+		cuTV.set_limit(limit);
+	}
+
+	virtual void gradient( hoCuNDArray<T> *in, hoCuNDArray<T> *out, bool accumulate=false )
+	{
+		if (in->get_number_of_elements() != out->get_number_of_elements()){
+			throw std::runtime_error("hoCuTvOperator: input/output array dimensions mismatch");
+		}
+
+		const vector_td<size_t,D> dims = from_std_vector<size_t, D>(*(in->get_dimensions()));
+		int elements = in->get_number_of_elements();
+
+		for (int i=0; i < (elements/prod(dims)); i++){
+
+			std::vector<size_t> dimensions = to_std_vector(dims);
+
+			hoNDArray<T> tmp_in;
+			tmp_in.create(&dimensions,in->get_data_ptr()+i*prod(dims));
+
+			hoNDArray<T> tmp_out;
+			tmp_out.create(&dimensions,out->get_data_ptr()+i*prod(dims));
+
+			cuNDArray<T> cuIn(&tmp_in);
+			cuNDArray<T> cuOut(&tmp_out);
+
+			cuTV.gradient(&cuIn,&cuOut,accumulate);
+			boost::shared_ptr< hoNDArray<T> > tmp = cuOut.to_host();
+			tmp_out = *tmp;
+		}
+	}
+
+	virtual REAL magnitude( hoCuNDArray<T> *in)
+	{
+		const vector_td<size_t,D> dims = from_std_vector<size_t, D>(*(in->get_dimensions()));
+		int elements = in->get_number_of_elements();
+		REAL result = 0;
+		for (int i=0; i < (elements/prod(dims)); i++){
+			std::vector<size_t> dimensions = to_std_vector(dims);
+			hoNDArray<T> tmp_in;
+			tmp_in.create(&dimensions,in->get_data_ptr()+i*prod(dims));
+			cuNDArray<T> cuIn(&tmp_in);
+			result += cuTV.magnitude(&cuIn);
+		}
+		return result;
+	}
+
+	virtual void set_weight(REAL weight){
+		this->weight_ = weight;
+		cuTV.set_weight(weight);
+	}
+
+protected:
+	REAL limit_;
+	cuTvOperator<T,D> cuTV;
+};
+}
diff --git a/toolboxes/operators/gpu/hoCuTvPicsOperator.h b/toolboxes/operators/gpu/hoCuTvPicsOperator.h
new file mode 100644
index 0000000..35ad216
--- /dev/null
+++ b/toolboxes/operators/gpu/hoCuTvPicsOperator.h
@@ -0,0 +1,16 @@
+#pragma once
+
+
+#include "hoCuTvOperator.h"
+#include "tvPicsOperator.h"
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> class hoCuTvPicsOperator 
+    : public tvPicsOperator< hoCuNDArray<T>, hoCuTvOperator<T,D>, typename realType<T>::Type >
+  {
+  public:
+    hoCuTvPicsOperator() : tvPicsOperator< hoCuNDArray<T>, hoCuTvOperator<T,D>, typename realType<T>::Type >() {}
+    virtual ~hoCuTvPicsOperator() {}
+  };    
+}
diff --git a/toolboxes/operators/identityOperator.h b/toolboxes/operators/identityOperator.h
new file mode 100644
index 0000000..13a010d
--- /dev/null
+++ b/toolboxes/operators/identityOperator.h
@@ -0,0 +1,51 @@
+/** \file identityOperator.h
+    \brief Device independent implementation of the identity operator.
+
+    The file identityOperator.h is a device independent implementation of the identity operator.
+    To simplify the actual instantiation we refer to 
+    - the class(/file) hoIdentityOperator(/.h) for a cpu instantiated operator using the hoNDArray class
+    - the class(/file) cuIdentityOperator(/.h) for a gpu instantiated operator using the cuNDArray class
+*/
+
+#pragma once
+
+#include "linearOperator.h"
+
+namespace Gadgetron{
+
+  template <class ARRAY_TYPE> class identityOperator : public linearOperator<ARRAY_TYPE>
+  {
+  public:
+    
+    identityOperator() : linearOperator<ARRAY_TYPE>() {}
+    virtual ~identityOperator() {}
+    
+    virtual void mult_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( in == 0x0 || out == 0x0 ){
+	throw std::runtime_error("Error: identityOperator::mult_{M,MH,MHM}: illegal array pointer provided");
+      }
+
+      // We will do only the most basic dimensionality checking
+      if( in->get_number_of_elements() != out->get_number_of_elements() ){
+	throw std::runtime_error("Error: identityOperator: in/out dimensions mismatch");
+      }
+        
+      if( accumulate )
+    	*out += *in;
+      else 
+	*out = *in;           
+    }
+    
+    virtual void mult_MH( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      return mult_M(in, out, accumulate);
+    }
+    
+    virtual void mult_MH_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      return mult_M(in, out, accumulate);
+    }
+
+  };
+}
diff --git a/toolboxes/operators/imageOperator.h b/toolboxes/operators/imageOperator.h
new file mode 100644
index 0000000..bb3accc
--- /dev/null
+++ b/toolboxes/operators/imageOperator.h
@@ -0,0 +1,99 @@
+/** \file imageOperator.h
+    \brief Base class for the image regularization operators.
+*/
+
+#pragma once
+
+#include "linearOperator.h"
+#include "GadgetronTimer.h"
+
+namespace Gadgetron{
+  
+  template <class ARRAY_TYPE_REAL, class ARRAY_TYPE_OPERATOR> class imageOperator : public linearOperator<ARRAY_TYPE_OPERATOR>
+  {
+
+  protected:
+    typedef typename ARRAY_TYPE_REAL::element_type REAL;
+    typedef typename ARRAY_TYPE_OPERATOR::element_type ELEMENT_TYPE;
+    
+  public:
+    
+    imageOperator() : linearOperator<ARRAY_TYPE_OPERATOR>(), offset_(REAL(0)) {}
+    virtual ~imageOperator() {}
+  
+    // Get regularization image
+    virtual boost::shared_ptr<ARRAY_TYPE_REAL> get() { return image_; }
+    
+    // Compute regularization image
+    virtual void compute( ARRAY_TYPE_OPERATOR *image, bool offset_estimation = true )
+    {
+      // Make temporary copy of input
+      ARRAY_TYPE_OPERATOR tmp(*image);
+
+      // Normalize to an average energy of "one intensity unit per image element"
+      REAL sum = asum( &tmp );
+      REAL scale = ( (REAL) tmp.get_number_of_elements()/sum );
+      tmp *= scale;
+
+      image_ =  abs(&tmp);
+
+      if( offset_estimation )
+	offset_ = estimate_offset();
+      
+      // Reciprocalize image
+      if(offset_ > REAL(0)) *image_ += offset_;      
+      reciprocal_inplace(image_.get());
+    }
+    
+    // Apply regularization image operator
+    virtual void mult_MH_M( ARRAY_TYPE_OPERATOR *in, ARRAY_TYPE_OPERATOR *out, bool accumulate = false )
+    {        
+      ARRAY_TYPE_OPERATOR *tmp;
+      if( !accumulate ){
+    	tmp = out;
+    	*tmp = *in;
+      } 
+      else
+    	tmp = new ARRAY_TYPE_OPERATOR(*in);
+      
+      *tmp *= *image_;
+      *tmp *= *image_;
+      
+      if (accumulate){
+    	*out += *tmp;
+    	delete tmp;
+      }
+    }
+  
+    virtual void mult_M( ARRAY_TYPE_OPERATOR *in, ARRAY_TYPE_OPERATOR *out, bool accumulate = false )
+    {
+      ARRAY_TYPE_OPERATOR *tmp;
+      if( !accumulate ){
+	tmp = out;
+	*tmp = *in;
+      } else
+	tmp = new ARRAY_TYPE_OPERATOR(*in);
+
+      *tmp *= *image_;
+
+      if (accumulate){
+	*out += *tmp;
+	delete tmp;
+      }
+    }
+  
+    virtual void mult_MH( ARRAY_TYPE_OPERATOR *in, ARRAY_TYPE_OPERATOR *out, bool accumulate = false )
+    {
+      mult_M(in,out,accumulate);
+    }
+
+  
+  protected:
+    // Estimate offset to the regularization image
+    virtual REAL estimate_offset()=0;
+
+  protected:
+    boost::shared_ptr< ARRAY_TYPE_REAL > image_;
+    REAL offset_;
+  };
+}
diff --git a/toolboxes/operators/laplaceOperator.h b/toolboxes/operators/laplaceOperator.h
new file mode 100644
index 0000000..92f8b0c
--- /dev/null
+++ b/toolboxes/operators/laplaceOperator.h
@@ -0,0 +1,31 @@
+/** \file laplaceOperator.h
+    \brief Base class for the Laplacian operator implementations.
+*/
+
+#pragma once
+
+#include "linearOperator.h"
+
+namespace Gadgetron{
+  
+  template <unsigned int D, class ARRAY_TYPE> class laplaceOperator : public linearOperator<ARRAY_TYPE>
+  {    
+  public:
+    
+    laplaceOperator( ) : linearOperator<ARRAY_TYPE>() { }
+    virtual ~laplaceOperator() {}
+    
+    virtual void mult_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      compute_laplace( in, out, accumulate );
+    }
+  
+    virtual void mult_MH( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      compute_laplace( in, out, accumulate );
+    }
+    
+  protected:
+    virtual void compute_laplace( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate ) = 0;
+  };
+}
diff --git a/toolboxes/operators/linearOperator.h b/toolboxes/operators/linearOperator.h
new file mode 100644
index 0000000..bd4f6f1
--- /dev/null
+++ b/toolboxes/operators/linearOperator.h
@@ -0,0 +1,93 @@
+/** \file linearOperator.h
+    \brief Base class for all linear operators.
+*/
+
+#pragma once
+
+#include "generalOperator.h"
+
+namespace Gadgetron{
+
+  /** \class linearOperator
+      \brief Base class for all linear Operators
+  */
+  template <class ARRAY_TYPE> class linearOperator : public generalOperator<ARRAY_TYPE>
+  {
+  public:
+    typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL;
+  linearOperator() : generalOperator<ARRAY_TYPE>() {}
+
+  linearOperator(std::vector<size_t> *dims) : generalOperator<ARRAY_TYPE>(dims) {
+      set_codomain_dimensions(dims);
+    }
+
+  linearOperator(std::vector<size_t> *dims, std::vector<size_t> *codims)
+    : generalOperator<ARRAY_TYPE>(dims) {
+      set_codomain_dimensions(codims);
+    }
+
+    virtual ~linearOperator() {}
+
+    /**
+     * The gradient of a linear operator corresponds to mult_MH_M, times the weight of the operator.
+     * @param[in] in Input array.
+     * @param[in,out] out Output Array.
+     * @param accumulate If true, adds result to out. If false, overwrites out.
+     */
+    virtual void gradient(ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate = false)
+    {
+      if( in == 0x0 || out == 0x0 )
+	throw std::runtime_error("linearOperator::gradient(): Invalid input and/or output array");
+
+      ARRAY_TYPE* tmp = out;
+      if (accumulate) {
+	tmp = new ARRAY_TYPE(out->get_dimensions());
+      }
+      mult_MH_M(in,tmp,false);
+      *tmp *= this->weight_;
+      if (accumulate){
+	*out += *tmp;
+	delete tmp;
+      }
+    }
+
+
+    virtual REAL magnitude(ARRAY_TYPE* in){
+      ARRAY_TYPE tmp(&this->codomain_dims_);
+      this->mult_M(in,&tmp);
+      return std::sqrt(this->get_weight())*real(dot(&tmp,&tmp));
+    }
+    virtual void set_codomain_dimensions( std::vector<size_t> *dims )
+    {
+      if( dims == 0x0 )
+	throw std::runtime_error("linearOperator::set_codomain_dimensions: illegal dimensions array provided");
+      codomain_dims_ = *dims;
+    }
+
+    virtual boost::shared_ptr< std::vector<size_t> > get_codomain_dimensions()
+      {
+	std::vector<size_t> *dims = new std::vector<size_t>();
+	*dims = codomain_dims_;
+	return boost::shared_ptr< std::vector<size_t> >(dims);
+      }
+
+    virtual void mult_M( ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate = false) = 0;
+    virtual void mult_MH( ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate = false) = 0;
+
+    virtual void mult_MH_M( ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate = false )
+    {
+      if( codomain_dims_.size() == 0 ){
+	throw std::runtime_error("Error: linearOperator::mult_MH_M : codomain dimensions not set");
+      }
+
+      ARRAY_TYPE tmp;
+      tmp.create(&codomain_dims_);
+      mult_M( in, &tmp, false );
+      mult_MH( &tmp, out, accumulate );
+    }
+
+  protected:
+    std::vector<size_t> codomain_dims_;
+  };
+}
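
To make the contract concrete, a minimal sketch of a subclass; the element-wise hoNDArray arithmetic it uses is assumed available (it is used the same way elsewhere in this patch). Only mult_M and mult_MH are provided; the base class then supplies gradient() as weight * M^H M x and a default mult_MH_M that chains mult_M and mult_MH through the codomain dimensions.

    #include "linearOperator.h"
    #include "hoNDArray_math.h"

    using namespace Gadgetron;

    // Hypothetical operator that multiplies by a fixed real scalar (self-adjoint).
    class scaleOperator : public linearOperator<hoNDArray<float> >
    {
    public:
      scaleOperator(float s) : linearOperator<hoNDArray<float> >(), s_(s) {}

      virtual void mult_M(hoNDArray<float>* in, hoNDArray<float>* out, bool accumulate = false)
      {
        if (accumulate) {
          hoNDArray<float> tmp(*in);
          tmp *= s_;
          *out += tmp;
        } else {
          *out = *in;
          *out *= s_;
        }
      }

      virtual void mult_MH(hoNDArray<float>* in, hoNDArray<float>* out, bool accumulate = false)
      {
        mult_M(in, out, accumulate);   // the operator is its own adjoint
      }

    private:
      float s_;
    };
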
diff --git a/toolboxes/operators/multiplicationOperatorContainer.h b/toolboxes/operators/multiplicationOperatorContainer.h
new file mode 100644
index 0000000..d0432bf
--- /dev/null
+++ b/toolboxes/operators/multiplicationOperatorContainer.h
@@ -0,0 +1,163 @@
+/** \file multiplicationOperatorContainer.h
+    \brief Operator used to chain together (concatenate) a series of operators by multiplication.
+ */
+
+#pragma once
+
+#include "linearOperator.h"
+#include <iostream>
+#include <vector>
+
+namespace Gadgetron{
+
+template <class ARRAY_TYPE> class multiplicationOperatorContainer
+		: public linearOperator<ARRAY_TYPE>
+{
+	typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+	typedef typename realType<ELEMENT_TYPE>::Type REAL;
+public:
+	multiplicationOperatorContainer() : linearOperator<ARRAY_TYPE>() {}
+	virtual ~multiplicationOperatorContainer(){}
+
+	// Set/get domain and codomain dimensions.
+	//
+
+	virtual void set_domain_dimensions( std::vector<size_t>* )
+	{
+		throw std::runtime_error("Warning: multiplicationOperatorContainer::set_domain_dimensions : dimensions ignored, using dimensions of the individual operators instead");
+	}
+
+	virtual void set_codomain_dimensions( std::vector<size_t> *dims )
+	{
+		throw std::runtime_error("Warning: multiplicationOperatorContainer::set_codomain_dimensions : dimensions ignored, using dimensions of the individual operators instead");
+
+	}
+
+	virtual boost::shared_ptr< std::vector<size_t> > get_domain_dimensions()
+  		{
+		if( operators_.size() == 0 )
+			return boost::shared_ptr< std::vector<size_t> >();
+		else
+			return operators_[0]->get_domain_dimensions();
+  		}
+
+	virtual boost::shared_ptr< std::vector<size_t> > get_codomain_dimensions()
+  		{
+		if( operators_.size() == 0 )
+			return boost::shared_ptr< std::vector<size_t> >();
+		else
+			return operators_[operators_.size()-1]->get_codomain_dimensions();
+  		}
+
+	virtual void set_weight( REAL weight ){
+		REAL op_weight = REAL(1);
+		for( int i=0; i<operators_.size(); i++ )
+			op_weight *= operators_[i]->get_weight();
+		this->weight_ = weight*op_weight;
+	}
+
+	// Add operator to the container
+	//
+	void add_operator( boost::shared_ptr< linearOperator<ARRAY_TYPE> > op )
+	{
+		if( op.get() == 0x0 ){
+			throw std::runtime_error( "Error: multiplicationOperatorContainer::add_operator : illegal operator" );
+		}
+
+		// All operators needs the domain and codomain dimensions set
+		//
+		if( op->get_domain_dimensions()->size() == 0 ){
+			throw std::runtime_error(  "Error: multiplicationOperatorContainer::add_operator : domain dimensions not set on operator" );
+
+		}
+		if( op->get_codomain_dimensions()->size() == 0 ){
+			throw std::runtime_error( "Error: multiplicationOperatorContainer::add_operator : codomain dimensions not set on operator");
+		}
+		operators_.push_back( op );
+		this->weight_ *= op->get_weight();
+	}
+
+	virtual void mult_M( ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate = false )
+	{
+		if( operators_.size() == 0 ){
+			throw std::runtime_error("Error: multiplicationOperatorContainer::mult_M : no operators added");
+
+		}
+
+		ARRAY_TYPE *tmp_in = in, *tmp_out = 0x0;
+		ARRAY_TYPE ping, pong;
+
+		if( operators_.size() > 1 ){
+			ping.create( operators_[0]->get_codomain_dimensions().get() );
+			tmp_out = &ping;
+		}
+		else{
+			tmp_out = out;
+		}
+
+		// Loop over operators
+		//
+		for( int i=0; i < operators_.size(); i++ ){
+
+			operators_[i]->mult_M( tmp_in, tmp_out, (i==operators_.size()-1) ? accumulate : false );
+
+			ARRAY_TYPE *tmp_tmp_out = (i==0) ? &pong : tmp_in;
+			tmp_in = tmp_out;
+
+			if( operators_.size() > 2 && i < operators_.size()-2 ){
+				tmp_tmp_out->create( operators_[i+1]->get_codomain_dimensions().get() );
+				tmp_out = tmp_tmp_out;
+			}
+			else if( i == operators_.size()-2 ){
+				tmp_out = out;
+			}
+		}
+
+	}
+
+	virtual void mult_MH( ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate = false )
+	{
+		if( operators_.size() == 0 ){
+			throw std::runtime_error("Error: multiplicationOperatorContainer::mult_MH : no operators added");
+		}
+
+		ARRAY_TYPE *tmp_in = in, *tmp_out = 0x0;
+		ARRAY_TYPE ping, pong;
+
+		if( operators_.size() > 1 ){
+			ping.create( operators_[operators_.size()-1]->get_domain_dimensions().get() );
+			tmp_out = &ping;
+		}
+		else{
+			tmp_out = out;
+		}
+
+		// Loop over operators
+		//
+		for( int i=operators_.size()-1; i>=0; i-- ){
+
+			operators_[i]->mult_MH( tmp_in, tmp_out, (i==0) ? accumulate : false );
+
+			ARRAY_TYPE *tmp_tmp_out = (i==operators_.size()-1) ? &pong : tmp_in;
+			tmp_in = tmp_out;
+
+			if( i > 1 ){
+				tmp_tmp_out->create( operators_[i-1]->get_domain_dimensions().get() );
+				tmp_out = tmp_tmp_out;
+			}
+			else if( i == 1 ){
+				tmp_out = out;
+			}
+		}
+	}
+
+	virtual boost::shared_ptr< linearOperator<ARRAY_TYPE> > clone() {
+		return linearOperator<ARRAY_TYPE>::clone(this);
+	}
+
+
+
+protected:
+	std::vector< boost::shared_ptr< linearOperator<ARRAY_TYPE> > > operators_;
+};
+}
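
A brief composition sketch: mult_M applies the operators in the order they were added (so the first operator defines the container's domain and the last its codomain), while mult_MH applies the adjoints in reverse order; every operator must have its domain and codomain dimensions set before add_operator is called. The operator handles below are illustrative.

    #include "multiplicationOperatorContainer.h"
    #include "hoNDArray.h"

    using namespace Gadgetron;

    void compose_demo(boost::shared_ptr<linearOperator<hoNDArray<float> > > A,
                      boost::shared_ptr<linearOperator<hoNDArray<float> > > B)
    {
      multiplicationOperatorContainer<hoNDArray<float> > C;   // C = B * A
      C.add_operator(A);   // applied first by mult_M
      C.add_operator(B);   // applied last by mult_M, first by mult_MH
    }
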
diff --git a/toolboxes/operators/partialDerivativeOperator.h b/toolboxes/operators/partialDerivativeOperator.h
new file mode 100644
index 0000000..bfae587
--- /dev/null
+++ b/toolboxes/operators/partialDerivativeOperator.h
@@ -0,0 +1,71 @@
+/** \file partialDerivativeOperator.h
+    \brief Base class for the partialDerivative operators.
+
+    The file partialDerivativeOperator.h is a device independent partial implementation 
+    of a partial derivative operator.
+    To simplify the actual instantiation we refer to 
+    - the class(/file) hoPartialDerivativeOperator(/.h) for a cpu instantiated operator using the hoNDArray class
+    - the class(/file) cuPartialDerivativeOperator(/.h) for a gpu instantiated operator using the cuNDArray class
+*/
+#pragma once
+
+#include "linearOperator.h"
+#include "vector_td.h"
+
+namespace Gadgetron{
+  
+  template < unsigned int D, class ARRAY_TYPE> class partialDerivativeOperator 
+    : public linearOperator<ARRAY_TYPE>
+  {
+    
+  public:
+    
+    partialDerivativeOperator( size_t dimension ) : 
+      linearOperator<ARRAY_TYPE>() { compute_stride(dimension); }
+    
+    virtual ~partialDerivativeOperator() {}
+    
+    virtual void mult_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      compute_partial_derivative( forwards_stride_, in, out, accumulate );
+    }
+    
+    virtual void mult_MH( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      compute_partial_derivative( adjoint_stride_, in, out, accumulate );
+    }
+    
+    virtual void mult_MH_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {    
+      compute_second_order_partial_derivative( forwards_stride_, adjoint_stride_, in, out, accumulate );
+    }
+    
+    virtual void compute_partial_derivative
+    ( typename int64d<D>::Type stride, 
+      ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate ) = 0;  
+    
+    virtual void compute_second_order_partial_derivative
+    ( typename int64d<D>::Type forwards_stride, typename int64d<D>::Type adjoint_stride, 
+      ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate ) = 0;    
+    
+  protected:
+    
+    virtual void compute_stride( size_t _dimension )
+    {
+      size_t dim = _dimension;
+      
+      if( _dimension > D-1 ){
+        throw std::runtime_error("Error: partialDerivativeOperator: dimension out of range");
+      }
+      
+      for( unsigned int d=0; d<D; d++ ){
+        forwards_stride_.vec[d] = (d==dim) ? 1 : 0;
+        adjoint_stride_.vec[d] = (d==dim) ? -1 : 0;
+      }    
+    }
+    
+  private:
+    typename int64d<D>::Type forwards_stride_;
+    typename int64d<D>::Type adjoint_stride_;
+  };
+}
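
As a concrete example of the stride bookkeeping above: with D=3 and a constructor argument of dimension=1, compute_stride() yields forwards_stride_ = (0,1,0) and adjoint_stride_ = (0,-1,0), so mult_M requests the finite difference along the second array dimension and mult_MH the corresponding negated-offset (adjoint) difference; the actual differencing is left to the compute_partial_derivative implementations in the ho/cu subclasses.
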
diff --git a/toolboxes/operators/permutationOperator.h b/toolboxes/operators/permutationOperator.h
new file mode 100644
index 0000000..f447351
--- /dev/null
+++ b/toolboxes/operators/permutationOperator.h
@@ -0,0 +1,48 @@
+#pragma once
+
+#include "linearOperator.h"
+namespace Gadgetron{
+
+
+template <class ARRAY_TYPE> class permutationOperator : public linearOperator<ARRAY_TYPE>{
+public:
+
+	virtual ~permutationOperator(){};
+
+	virtual void mult_M(ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate=false){
+		if (accumulate)
+			*out += *permute(in,&order);
+		else
+			permute(in,out,&order);
+	}
+
+	virtual void mult_MH(ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate=false){
+		if (accumulate)
+			*out += *permute(in,&transpose_order);
+		else
+			permute(in,out,&transpose_order);
+	}
+
+	virtual void mult_MH_M(ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate=false){
+		if (accumulate)
+			*out += *in;
+		else
+			*out = *in;
+	}
+
+	virtual void set_order(std::vector<size_t> order){
+		this->order =order;
+		transpose_order = std::vector<size_t>(order.size(),0);
+		for (unsigned int i = 0; i < order.size(); i++)
+			transpose_order[order[i]] = i;
+	}
+
+
+  virtual boost::shared_ptr< linearOperator<ARRAY_TYPE> > clone() {
+    return linearOperator<ARRAY_TYPE>::clone(this);
+  }
+protected:
+	 std::vector<size_t> order;
+	 std::vector<size_t> transpose_order;
+};
+}
diff --git a/toolboxes/operators/subsetOperator.h b/toolboxes/operators/subsetOperator.h
new file mode 100644
index 0000000..2aafe93
--- /dev/null
+++ b/toolboxes/operators/subsetOperator.h
@@ -0,0 +1,72 @@
+/*
+ * Base class for handling operations on a subset of the data. It is used as the operator class for all
+ * ordered subset solvers.
+ */
+#pragma once
+
+#include "linearOperator.h"
+#include <numeric>
+#include <functional>
+namespace Gadgetron{
+
+
+template<class ARRAY_TYPE> class subsetOperator : public virtual linearOperator<ARRAY_TYPE>{
+private:
+  typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+  typedef typename realType<ELEMENT_TYPE>::Type REAL;
+public:
+
+	subsetOperator(int _number_of_subsets) : number_of_subsets(_number_of_subsets){};
+	subsetOperator() : number_of_subsets(1){};
+
+	virtual ~subsetOperator(){};
+	virtual void mult_M(ARRAY_TYPE* in, ARRAY_TYPE* out, int subset, bool accumulate)=0;
+	virtual void mult_MH(ARRAY_TYPE* in, ARRAY_TYPE* out, int subset, bool accumulate)=0;
+	virtual void mult_MH_M(ARRAY_TYPE* in, ARRAY_TYPE* out, int subset, bool accumulate){
+		auto codim = this->get_codomain_dimensions(subset);
+		ARRAY_TYPE tmp(codim);
+		this->mult_M(in,&tmp,subset,false);
+		this->mult_MH(&tmp,out,subset,accumulate);
+	}
+
+	virtual void mult_M(ARRAY_TYPE* in, ARRAY_TYPE* out,bool accumulate){
+		if (!accumulate) clear(out);
+		std::vector<boost::shared_ptr<ARRAY_TYPE> > projections = projection_subsets(out);
+
+		for (int i = 0; i < this->get_number_of_subsets(); i++) mult_M(in,projections[i].get(),i,true);
+	}
+
+	virtual void mult_MH(ARRAY_TYPE* in, ARRAY_TYPE* out,bool accumulate){
+			if (!accumulate) clear(out);
+			std::vector<boost::shared_ptr<ARRAY_TYPE> > projections = projection_subsets(in);
+			for (int i = 0; i < this->get_number_of_subsets(); i++) mult_MH(projections[i].get(),out,i,true);
+	}
+
+	virtual boost::shared_ptr< std::vector<size_t> > get_codomain_dimensions(int subset)=0;
+/*
+ 	virtual void set_codomain_subsets(std::vector< std::vector<unsigned int> > & _dims){
+		codomain_dimensions = std::vector< std::vector<unsigned int> >(_dims);
+	}
+*/
+	virtual int get_number_of_subsets(){return number_of_subsets;}
+
+
+
+	virtual std::vector<boost::shared_ptr<ARRAY_TYPE> > projection_subsets(ARRAY_TYPE* projections){
+
+		std::vector<boost::shared_ptr<ARRAY_TYPE> > res;
+		ELEMENT_TYPE* curPtr = projections->get_data_ptr();
+		for (int subset = 0; subset < this->get_number_of_subsets(); subset++){
+			std::vector<size_t> subset_dim = *get_codomain_dimensions(subset);
+			res.push_back(boost::shared_ptr<ARRAY_TYPE>(new ARRAY_TYPE(&subset_dim,curPtr)));
+			curPtr += std::accumulate(subset_dim.begin(),subset_dim.end(),1,std::multiplies<unsigned int>());
+		}
+		return res;
+	}
+protected:
+	virtual void set_number_of_subsets(int nsubsets){ number_of_subsets=nsubsets;}
+private:
+	int number_of_subsets;
+
+};
+}
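
A sketch of the loop an ordered-subsets solver might run on top of this interface; OP stands for any concrete subclass working on hoNDArray<float>, and the image-update step itself is omitted. Only members declared above are used.

    #include "subsetOperator.h"
    #include "hoNDArray.h"

    using namespace Gadgetron;

    template<class OP>
    void os_sweep(OP& op, hoNDArray<float>& update, hoNDArray<float>& projections)
    {
      // Split the stacked projection data into per-subset views (no data is copied).
      std::vector<boost::shared_ptr<hoNDArray<float> > > subsets =
          op.projection_subsets(&projections);

      for (int s = 0; s < op.get_number_of_subsets(); s++) {
        // Back-project a single subset, accumulating into the running update.
        op.mult_MH(subsets[s].get(), &update, s, true);
        // ... an ordered-subsets solver would apply its image update here ...
      }
    }
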
diff --git a/toolboxes/operators/tvPicsOperator.h b/toolboxes/operators/tvPicsOperator.h
new file mode 100644
index 0000000..5d490d6
--- /dev/null
+++ b/toolboxes/operators/tvPicsOperator.h
@@ -0,0 +1,44 @@
+#pragma once
+
+namespace Gadgetron{
+
+  template<class ARRAY_TYPE, class TV_OPERATOR, class REAL> class tvPicsOperator 
+    : public generalOperator<ARRAY_TYPE>
+  {
+  public:
+    
+    tvPicsOperator() : generalOperator<ARRAY_TYPE>() {}
+    virtual ~tvPicsOperator() {}
+
+    void set_prior(boost::shared_ptr<ARRAY_TYPE> prior){
+      prior_ = prior;
+    }
+
+    virtual void gradient(ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate){
+      ARRAY_TYPE tmp = *in;
+      tmp -= *prior_;
+      op_.gradient(&tmp, out, accumulate);
+    }
+
+    virtual REAL magnitude(ARRAY_TYPE *x){
+    	ARRAY_TYPE tmp = *x;
+    	tmp -= *prior_;
+    	return op_.magnitude(&tmp);
+    }
+    void set_limit(REAL limit){
+      op_.set_limit(limit);
+    }
+
+    virtual void set_weight(REAL weight){
+      op_.set_weight(weight);
+    }
+
+    virtual REAL get_weight(){
+      return op_.get_weight();
+    }
+
+  protected:
+    TV_OPERATOR op_;
+    boost::shared_ptr<ARRAY_TYPE> prior_;
+  };
+}
diff --git a/toolboxes/operators/upsampleOperator.h b/toolboxes/operators/upsampleOperator.h
new file mode 100644
index 0000000..6576891
--- /dev/null
+++ b/toolboxes/operators/upsampleOperator.h
@@ -0,0 +1,51 @@
+/** \file upsampleOperator.h
+    \brief Base class for the upsampling operators.
+
+    For instantiation we refer to
+    - the class(/file) cuUpsampleOperator(/.h) for a gpu instantiated operator using the cuNDArray class
+*/
+
+#pragma once
+
+#include "linearOperator.h"
+#include "vector_td.h"
+
+namespace Gadgetron{
+  
+  template <class ARRAY_TYPE, unsigned int D> class upsampleOperator
+    : public linearOperator<ARRAY_TYPE>
+  {
+    
+  public:
+    
+    upsampleOperator() : linearOperator<ARRAY_TYPE>() {}
+    virtual ~upsampleOperator() {}
+    
+    typedef typename ARRAY_TYPE::element_type T;
+
+    virtual void mult_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( accumulate ){
+        boost::shared_ptr<ARRAY_TYPE> tmp = upsample<T,D>(in);
+        *out += *tmp;
+      }
+      else
+        upsample<T,D>(in,out);
+    }
+    
+    virtual void mult_MH( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( accumulate ){
+        boost::shared_ptr<ARRAY_TYPE> tmp = downsample<T,D>(in);
+        *out += *tmp;
+      }
+      else
+        downsample<T,D>(in,out);
+    }
+
+    virtual boost::shared_ptr< linearOperator< ARRAY_TYPE > > clone()
+    {
+      return linearOperator<ARRAY_TYPE>::clone(this);
+    }    
+  };
+}
diff --git a/toolboxes/python/CMakeLists.txt b/toolboxes/python/CMakeLists.txt
new file mode 100644
index 0000000..c27cabd
--- /dev/null
+++ b/toolboxes/python/CMakeLists.txt
@@ -0,0 +1,43 @@
+include_directories(
+    ${CMAKE_SOURCE_DIR}/apps/gadgetron
+    ${CMAKE_BINARY_DIR}/apps/gadgetron
+    ${CMAKE_SOURCE_DIR}/toolboxes/core
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+    ${ISMRMRD_INCLUDE_DIR}
+    ${Boost_INCLUDE_DIR}
+    ${PYTHON_INCLUDE_PATH}
+    ${NUMPY_INCLUDE_DIRS}
+    ${ACE_INCLUDE_DIR})
+
+add_library(gadgetron_toolbox_python SHARED
+    python_toolbox.cpp
+    python_toolbox.h
+    python_export.h)
+
+target_link_libraries(gadgetron_toolbox_python
+    gadgetron_toolbox_cpucore
+    ${ISMRMRD_LIBRARIES}
+    ${PYTHON_LIBRARIES}
+    ${Boost_LIBRARIES}
+    ${ACE_LIBRARIES}
+    gadgetron_toolbox_log)
+
+set_target_properties(gadgetron_toolbox_python PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+if (WIN32)
+    # only add this definition to the library, not the example binaries
+    set_target_properties(gadgetron_toolbox_python PROPERTIES COMPILE_DEFINITIONS "__BUILD_GADGETRON_TOOLBOX_PYTHON__")
+endif (WIN32)
+
+install(TARGETS gadgetron_toolbox_python DESTINATION lib COMPONENT main)
+
+install(FILES
+    python_toolbox.h
+    python_converters.h
+    python_tuple_converter.h
+    python_hoNDArray_converter.h
+    python_ismrmrd_converter.h
+    python_numpy_wrappers.h
+    python_export.h
+    DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
+add_subdirectory(example)
diff --git a/toolboxes/python/example/CMakeLists.txt b/toolboxes/python/example/CMakeLists.txt
new file mode 100644
index 0000000..5f620f5
--- /dev/null
+++ b/toolboxes/python/example/CMakeLists.txt
@@ -0,0 +1,9 @@
+include_directories(${CMAKE_SOURCE_DIR}/toolboxes/python ${ISMRMRD_INCLUDE_DIR})
+
+add_executable(gadgetron_test_python test_python.cpp)
+target_link_libraries(gadgetron_test_python gadgetron_toolbox_python)
+
+add_executable(python_demo demo.cpp)
+target_link_libraries(python_demo gadgetron_toolbox_python ${ISMRMRD_LIBRARIES})
+
+install(TARGETS gadgetron_test_python DESTINATION bin COMPONENT main)
diff --git a/toolboxes/python/example/demo.cpp b/toolboxes/python/example/demo.cpp
new file mode 100644
index 0000000..047a8bf
--- /dev/null
+++ b/toolboxes/python/example/demo.cpp
@@ -0,0 +1,87 @@
+#include "python_toolbox.h"
+#include "ismrmrd/ismrmrd.h"
+
+using namespace Gadgetron;
+
+int main(int argc, char** argv)
+{
+    int a = -42;
+    float b = 3.141592;
+    std::string c("hello, world");
+    unsigned int d(117);
+    std::complex<double> e(2.12894, -1.103103);
+
+    std::vector<size_t> dims;
+    dims.push_back(4);
+    dims.push_back(4);
+    dims.push_back(4);
+    hoNDArray<std::complex<float> > arr(dims);
+
+    // Call a function with no return value (print all arguments)
+    PythonFunction<> foo("__builtin__", "print");
+    foo(a, b, c, d, e, arr);
+
+    // Call a function with a single return value
+    PythonFunction<float> atan2("math", "atan2");
+    int x = 7, y = 4;
+    float atan = atan2(x, y);
+    std::cout << atan << std::endl;
+
+    // Call a function that returns a tuple
+    PythonFunction<float,float> divmod("__builtin__", "divmod");
+    float w = 6.89;
+    float z = 4.12;
+    float fsum = 0, fdiff = 0;
+    std::tie(fsum, fdiff) = divmod(w, z);
+    std::cout << fsum << ", " << fdiff << std::endl;
+
+    // Call a function that expects an iterable argument (tuple)
+    PythonFunction<int> tuplen("__builtin__", "len");
+    int l = tuplen(std::make_tuple(-7, 0, 7));
+    std::cout << "tuple length: " << l << std::endl;
+
+    // Generate an hoNDArray of even #s using numpy
+    PythonFunction<hoNDArray<float>> arange("numpy", "arange");
+    hoNDArray<float> evens = arange(0, 100, 2, "f64");
+    std::cout << "number of even numbers between 0 and 100: " <<
+            evens.get_number_of_elements() << std::endl;
+
+    {
+        GILLock gl;     // this is needed
+        boost::python::object main(boost::python::import("__main__"));
+        boost::python::object global(main.attr("__dict__"));
+        boost::python::exec("def modify(head): head.version = 42; return head",
+                global, global);
+    }
+
+    ISMRMRD::ImageHeader img_head, img_head2;
+    img_head.version = 0;
+    std::cout << "version before: " << img_head.version << std::endl;
+    PythonFunction<ISMRMRD::ImageHeader> modify_img_header("__main__", "modify");
+    img_head2 = modify_img_header(img_head);
+    std::cout << "version after: " << img_head2.version << std::endl;
+
+    ISMRMRD::AcquisitionHeader acq_head, acq_head2;
+    acq_head.version = 0;
+    std::cout << "version before: " << img_head.version << std::endl;
+    PythonFunction<ISMRMRD::AcquisitionHeader> modify_acq_header("__main__", "modify");
+    acq_head2 = modify_acq_header(acq_head);
+    std::cout << "version after: " << acq_head2.version << std::endl;
+
+    {
+        GILLock gl;     // this is needed
+        boost::python::object main(boost::python::import("__main__"));
+        boost::python::object global(main.attr("__dict__"));
+        boost::python::exec("from numpy.random import random\n"
+                "def rand_cplx_array(length): \n"
+                "    return random(length) + 1j * random(length)\n",
+                global, global);
+    }
+
+    std::vector<std::complex<double> > vec;
+    PythonFunction<std::vector<std::complex<double> > > make_vec("__main__", "rand_cplx_array");
+    vec = make_vec(32);
+    std::cout << vec[16] << std::endl;
+
+    return 0;
+}
diff --git a/toolboxes/python/example/test_python.cpp b/toolboxes/python/example/test_python.cpp
new file mode 100644
index 0000000..0854660
--- /dev/null
+++ b/toolboxes/python/example/test_python.cpp
@@ -0,0 +1,33 @@
+#include "log.h"
+#include "python_toolbox.h"
+#include "hoNDArray_fileio.h"
+
+using namespace Gadgetron;
+
+int main(int argc, char** argv)
+{
+  GINFO("This is the Python test application\n");
+
+  if (argc < 2) {
+    GERROR("You must supply an input file\n");
+    return -1;
+  }
+
+  boost::shared_ptr< hoNDArray< std::complex<float> > > source_data = read_nd_array< std::complex<float> >(argv[1]);
+
+  size_t coils = source_data->get_size(2);
+  size_t ny = source_data->get_size(1);
+  size_t nx = source_data->get_size(0);
+  GINFO("Array dimensions [%d, %d, %d]\n", nx, ny, coils);
+
+  hoNDArray< std::complex<float> > unmix;
+  hoNDArray<float> gmap;
+
+  PythonFunction<hoNDArray<std::complex<float> >, hoNDArray<float> > calculate_grappa_unmixing("ismrmrdtools.grappa", "calculate_grappa_unmixing");
+
+  std::tie(unmix, gmap) = calculate_grappa_unmixing(*source_data.get(), 3);
+
+  write_nd_array(&unmix, "unmix.cplx");
+
+  return 0;
+}
diff --git a/toolboxes/python/python_converters.h b/toolboxes/python/python_converters.h
new file mode 100644
index 0000000..c5c99c7
--- /dev/null
+++ b/toolboxes/python/python_converters.h
@@ -0,0 +1,33 @@
+#ifndef GADGETRON_PYTHON_MATH_CONVERSIONS_H
+#define GADGETRON_PYTHON_MATH_CONVERSIONS_H
+
+#include "ismrmrd/ismrmrd.h"
+
+namespace Gadgetron {
+
+/// Interface for registering C++ <-> NumPy type converters.
+/// A static function on the `python_converter` struct allows
+/// for partial template specialization.
+template <typename T>
+struct python_converter {
+    static void create() { }
+};
+
+/// Convenience wrapper for `python_converter<TS...>::create()`
+template <typename ...TS>
+void register_converter(void) {
+    // Parameter packs can only be expanded in specific semantic situations.
+    // This creates a fake array to expand and create converters for each
+    // variadic type.
+    using expander = int[];
+    (void) expander {0, (python_converter<TS>::create(), 0)...};
+}
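+
+/// Expansion sketch (illustrative only, not additional upstream API): calling
+/// `register_converter<hoNDArray<float>, ISMRMRD::ImageHeader>()` expands to
+///     python_converter<hoNDArray<float> >::create();
+///     python_converter<ISMRMRD::ImageHeader>::create();
+/// one call per type in pack order; the trailing `, 0` in the expander simply
+/// turns each void call into an int element of the throwaway array.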
+
+}
+
+#include "python_tuple_converter.h"
+#include "python_hoNDArray_converter.h"
+#include "python_ismrmrd_converter.h"
+#include "python_vector_converter.h"
+
+#endif // GADGETRON_PYTHON_MATH_CONVERSIONS_H
diff --git a/toolboxes/python/python_export.h b/toolboxes/python/python_export.h
new file mode 100644
index 0000000..5e236a6
--- /dev/null
+++ b/toolboxes/python/python_export.h
@@ -0,0 +1,14 @@
+#ifndef PYTHON_EXPORT_H_
+#define PYTHON_EXPORT_H_
+
+#if defined (WIN32)
+    #if defined (__BUILD_GADGETRON_PYTHON__) || defined (gadgetron_toolbox_python_EXPORTS)
+        #define EXPORTPYTHON __declspec(dllexport)
+    #else
+        #define EXPORTPYTHON __declspec(dllimport)
+    #endif
+#else
+    #define EXPORTPYTHON
+#endif
+
+#endif
diff --git a/toolboxes/python/python_hoNDArray_converter.h b/toolboxes/python/python_hoNDArray_converter.h
new file mode 100644
index 0000000..dcbde01
--- /dev/null
+++ b/toolboxes/python/python_hoNDArray_converter.h
@@ -0,0 +1,103 @@
+#ifndef GADGETRON_PYTHON_HONDARRAY_CONVERTER_H
+#define GADGETRON_PYTHON_HONDARRAY_CONVERTER_H
+
+#include "python_toolbox.h"
+#include "python_numpy_wrappers.h"
+
+#include "hoNDArray.h"
+#include "log.h"
+
+#include <boost/python.hpp>
+namespace bp = boost::python;
+
+namespace Gadgetron {
+
+/// Used for making a NumPy array from an hoNDArray
+template <typename T>
+struct hoNDArray_to_numpy_array {
+    static PyObject* convert(const hoNDArray<T>& arr) {
+        size_t ndim = arr.get_number_of_dimensions();
+        std::vector<npy_intp> dims2(ndim);
+        for (size_t i = 0; i < ndim; i++) {
+            dims2[ndim-i-1] = static_cast<npy_intp>(arr.get_size(i));
+        }
+        PyObject *obj = NumPyArray_SimpleNew(dims2.size(), &dims2[0], get_numpy_type<T>());
+        if (sizeof(T) != NumPyArray_ITEMSIZE(obj)) {
+            GERROR("sizeof(T): %d, ITEMSIZE: %d\n", sizeof(T), NumPyArray_ITEMSIZE(obj));
+            throw std::runtime_error("hondarray_to_numpy_array: "
+                    "python object and array data type sizes do not match");
+        }
+
+        // Copy data
+        memcpy(NumPyArray_DATA(obj), arr.get_data_ptr(),
+                arr.get_number_of_elements() * sizeof(T));
+
+        // increment the reference count so it exists after `return`
+        return bp::incref(obj);
+    }
+};
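+
+/// Worked example (illustrative sizes): hoNDArray stores its first dimension
+/// fastest, so an array with get_size(0)=128 (nx), get_size(1)=64 (ny) and
+/// get_size(2)=8 (coils) is handed to NumPy with the dimensions reversed,
+/// i.e. a C-ordered ndarray of shape (8, 64, 128), which lets the memcpy
+/// above copy the data without any reordering.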
+
+/// Used for making an hoNDArray from a NumPy array
+template <typename T>
+struct hoNDArray_from_numpy_array {
+    hoNDArray_from_numpy_array() {
+        // actually register this converter with Boost
+        bp::converter::registry::push_back(
+                &convertible,
+                &construct,
+                bp::type_id<hoNDArray<T> >());
+    }
+
+    /// Returns NULL if the NumPy array is not convertible
+    static void* convertible(PyObject* obj) {
+        if (sizeof(T) != NumPyArray_ITEMSIZE(obj)) {
+            GERROR("sizeof(T): %d, ITEMSIZE: %d\n", sizeof(T), NumPyArray_ITEMSIZE(obj));
+            return NULL;
+        }
+        return obj;
+    }
+
+    /// Construct an hoNDArray in-place
+    static void construct(PyObject* obj, bp::converter::rvalue_from_python_stage1_data* data) {
+        void* storage = ((bp::converter::rvalue_from_python_storage<hoNDArray<T> >*)data)->storage.bytes;
+        data->convertible = storage;
+
+        size_t ndim = NumPyArray_NDIM(obj);
+        std::vector<size_t> dims(ndim);
+        for (size_t i = 0; i < ndim; i++) {
+            dims[ndim - i - 1] = NumPyArray_DIM(obj, i);
+        }
+
+        // Placement-new of hoNDArray in memory provided by Boost
+        hoNDArray<T>* arr = new (storage) hoNDArray<T>(dims);
+        memcpy(arr->get_data_ptr(), NumPyArray_DATA(obj),
+                sizeof(T) * arr->get_number_of_elements());
+    }
+};
+
+/// Create and register hoNDArray converter as necessary
+template <typename T> void create_hoNDArray_converter() {
+    bp::type_info info = bp::type_id<hoNDArray<T> >();
+    const bp::converter::registration* reg = bp::converter::registry::query(info);
+    // only register if not already registered!
+    if (nullptr == reg || nullptr == (*reg).m_to_python) {
+        bp::to_python_converter<hoNDArray<T>, hoNDArray_to_numpy_array<T> >();
+        hoNDArray_from_numpy_array<T>();
+    }
+}
+
+/// Partial specialization of `python_converter` for hoNDArray
+template <typename T>
+struct python_converter<hoNDArray<T> > {
+    static void create()
+    {
+        // ensure NumPy C-API is initialized
+        initialize_numpy();
+        // register hoNDArray converter
+        create_hoNDArray_converter<T>();
+    }
+};
+
+}
+
+#endif // GADGETRON_PYTHON_HONDARRAY_CONVERTER_H
diff --git a/toolboxes/python/python_ismrmrd_converter.h b/toolboxes/python/python_ismrmrd_converter.h
new file mode 100644
index 0000000..0838225
--- /dev/null
+++ b/toolboxes/python/python_ismrmrd_converter.h
@@ -0,0 +1,359 @@
+#ifndef GADGETRON_PYTHON_ISMRMRD_CONVERTER_H
+#define GADGETRON_PYTHON_ISMRMRD_CONVERTER_H
+
+#include "python_toolbox.h" // for pyerr_to_string()
+#include "ismrmrd/ismrmrd.h"
+
+#include <boost/python.hpp>
+namespace bp = boost::python;
+
+namespace Gadgetron {
+
+struct AcquisitionHeader_to_PythonAcquisitionHeader {
+    static PyObject* convert(const ISMRMRD::AcquisitionHeader& head) {
+        try {
+            bp::object module = bp::import("ismrmrd");
+            bp::object pyhead = module.attr("AcquisitionHeader")();
+
+            pyhead.attr("version") = head.version;
+            pyhead.attr("flags") = head.flags;
+            pyhead.attr("measurement_uid") = head.measurement_uid;
+            pyhead.attr("scan_counter") = head.scan_counter;
+            pyhead.attr("acquisition_time_stamp") = head.acquisition_time_stamp;
+            for (int i = 0; i < ISMRMRD::ISMRMRD_PHYS_STAMPS; i++) {
+                pyhead.attr("physiology_time_stamp")[i] = head.physiology_time_stamp[i];
+            }
+            pyhead.attr("number_of_samples") = head.number_of_samples;
+            pyhead.attr("available_channels") = head.available_channels;
+            pyhead.attr("active_channels") = head.active_channels;
+
+            for (int i = 0; i < ISMRMRD::ISMRMRD_CHANNEL_MASKS; i++) {
+                pyhead.attr("channel_mask")[i] = head.channel_mask[i];
+            }
+
+            pyhead.attr("discard_pre") = head.discard_pre;
+            pyhead.attr("discard_post") = head.discard_post;
+            pyhead.attr("center_sample") = head.center_sample;
+            pyhead.attr("encoding_space_ref") = head.encoding_space_ref;
+            pyhead.attr("trajectory_dimensions") = head.trajectory_dimensions;
+            pyhead.attr("sample_time_us") = head.sample_time_us;
+
+            for (int i = 0; i < ISMRMRD::ISMRMRD_POSITION_LENGTH; i++) {
+                pyhead.attr("position")[i] = head.position[i];
+            }
+            for (int i = 0; i < ISMRMRD::ISMRMRD_DIRECTION_LENGTH; i++) {
+                pyhead.attr("read_dir")[i] = head.read_dir[i];
+            }
+            for (int i = 0; i < ISMRMRD::ISMRMRD_DIRECTION_LENGTH; i++) {
+                pyhead.attr("phase_dir")[i] = head.phase_dir[i];
+            }
+            for (int i = 0; i < ISMRMRD::ISMRMRD_DIRECTION_LENGTH; i++) {
+                pyhead.attr("slice_dir")[i] = head.slice_dir[i];
+            }
+            for (int i = 0; i < ISMRMRD::ISMRMRD_POSITION_LENGTH; i++) {
+                pyhead.attr("patient_table_position")[i] = head.patient_table_position[i];
+            }
+
+            pyhead.attr("idx").attr("kspace_encode_step_1") = head.idx.kspace_encode_step_1;
+            pyhead.attr("idx").attr("kspace_encode_step_2") = head.idx.kspace_encode_step_2;
+            pyhead.attr("idx").attr("average") = head.idx.average;
+            pyhead.attr("idx").attr("slice") = head.idx.slice;
+            pyhead.attr("idx").attr("contrast") = head.idx.contrast;
+            pyhead.attr("idx").attr("phase") = head.idx.phase;
+            pyhead.attr("idx").attr("repetition") = head.idx.repetition;
+            pyhead.attr("idx").attr("set") = head.idx.set;
+            pyhead.attr("idx").attr("segment") = head.idx.segment;
+            for (int i = 0; i < ISMRMRD::ISMRMRD_USER_INTS; i++) {
+                pyhead.attr("idx").attr("user")[i] = head.idx.user[i];
+            }
+
+            for (int i = 0; i < ISMRMRD::ISMRMRD_USER_INTS; i++) {
+                pyhead.attr("user_int")[i] = head.user_int[i];
+            }
+            for (int i = 0; i < ISMRMRD::ISMRMRD_USER_FLOATS; i++) {
+                pyhead.attr("user_float")[i] = head.user_float[i];
+            }
+
+            // increment the reference count so it exists after `return`
+            return bp::incref(pyhead.ptr());
+        } catch (const bp::error_already_set&) {
+            std::string err = pyerr_to_string();
+            GERROR("%s", err.c_str());
+            throw std::runtime_error(err);
+        }
+    }
+};
+
+struct AcquisitionHeader_from_PythonAcquisitionHeader {
+    AcquisitionHeader_from_PythonAcquisitionHeader() {
+        // actually register this converter with Boost
+        bp::converter::registry::push_back(
+                &convertible,
+                &construct,
+                bp::type_id<ISMRMRD::AcquisitionHeader>());
+    }
+
+    /// Returns NULL if the Python AcquisitionHeader is not convertible
+    static void* convertible(PyObject* obj) {
+        return obj;
+    }
+
+    /// Construct an ISMRMRD::AcquisitionHeader in-place
+    static void construct(PyObject* obj, bp::converter::rvalue_from_python_stage1_data* data) {
+        void* storage = ((bp::converter::rvalue_from_python_storage<ISMRMRD::AcquisitionHeader>*)data)->storage.bytes;
+
+        // Placement-new of ISMRMRD::AcquisitionHeader in memory provided by Boost
+        ISMRMRD::AcquisitionHeader* head = new (storage) ISMRMRD::AcquisitionHeader;
+        data->convertible = storage;
+
+        try {
+            bp::object pyhead((bp::handle<>(bp::borrowed(obj))));
+
+            head->version = bp::extract<uint16_t>(pyhead.attr("version"));
+            head->flags = bp::extract<uint64_t>(pyhead.attr("flags"));
+            head->measurement_uid = bp::extract<uint32_t>(pyhead.attr("measurement_uid"));
+            head->scan_counter = bp::extract<uint32_t>(pyhead.attr("scan_counter"));
+            head->acquisition_time_stamp = bp::extract<uint32_t>(pyhead.attr("acquisition_time_stamp"));
+            for (int i = 0; i < ISMRMRD::ISMRMRD_PHYS_STAMPS; i++) {
+                head->physiology_time_stamp[i] = bp::extract<uint32_t>(pyhead.attr("physiology_time_stamp")[i]);
+            }
+            head->number_of_samples = bp::extract<uint16_t>(pyhead.attr("number_of_samples"));
+            head->available_channels = bp::extract<uint16_t>(pyhead.attr("available_channels"));
+            head->active_channels = bp::extract<uint16_t>(pyhead.attr("active_channels"));
+            for (int i = 0; i < ISMRMRD::ISMRMRD_CHANNEL_MASKS; i++) {
+                head->channel_mask[i] = bp::extract<uint64_t>(pyhead.attr("channel_mask")[i]);
+            }
+            head->discard_pre = bp::extract<uint16_t>(pyhead.attr("discard_pre"));
+            head->discard_post = bp::extract<uint16_t>(pyhead.attr("discard_post"));
+            head->center_sample = bp::extract<uint16_t>(pyhead.attr("center_sample"));
+            head->encoding_space_ref = bp::extract<uint16_t>(pyhead.attr("encoding_space_ref"));
+            head->trajectory_dimensions = bp::extract<uint16_t>(pyhead.attr("trajectory_dimensions"));
+            head->sample_time_us = bp::extract<float>(pyhead.attr("sample_time_us"));
+            for (int i = 0; i < ISMRMRD::ISMRMRD_POSITION_LENGTH; i++) {
+                head->position[i] = bp::extract<float>(pyhead.attr("position")[i]);
+            }
+            for (int i = 0; i < ISMRMRD::ISMRMRD_DIRECTION_LENGTH; i++) {
+                head->read_dir[i] = bp::extract<float>(pyhead.attr("read_dir")[i]);
+            }
+            for (int i = 0; i < ISMRMRD::ISMRMRD_DIRECTION_LENGTH; i++) {
+                head->phase_dir[i] = bp::extract<float>(pyhead.attr("phase_dir")[i]);
+            }
+            for (int i = 0; i < ISMRMRD::ISMRMRD_DIRECTION_LENGTH; i++) {
+                head->slice_dir[i] = bp::extract<float>(pyhead.attr("slice_dir")[i]);
+            }
+            for (int i = 0; i < ISMRMRD::ISMRMRD_POSITION_LENGTH; i++) {
+                head->patient_table_position[i] = bp::extract<float>(pyhead.attr("patient_table_position")[i]);
+            }
+
+            head->idx.kspace_encode_step_1 = bp::extract<uint16_t>(pyhead.attr("idx").attr("kspace_encode_step_1"));
+            head->idx.kspace_encode_step_2 = bp::extract<uint16_t>(pyhead.attr("idx").attr("kspace_encode_step_2"));
+            head->idx.average = bp::extract<uint16_t>(pyhead.attr("idx").attr("average"));
+            head->idx.slice = bp::extract<uint16_t>(pyhead.attr("idx").attr("slice"));
+            head->idx.contrast = bp::extract<uint16_t>(pyhead.attr("idx").attr("contrast"));
+            head->idx.phase = bp::extract<uint16_t>(pyhead.attr("idx").attr("phase"));
+            head->idx.repetition = bp::extract<uint16_t>(pyhead.attr("idx").attr("repetition"));
+            head->idx.set = bp::extract<uint16_t>(pyhead.attr("idx").attr("set"));
+            head->idx.segment = bp::extract<uint16_t>(pyhead.attr("idx").attr("segment"));
+            for (int i = 0; i < ISMRMRD::ISMRMRD_USER_INTS; i++) {
+                head->idx.user[i] = bp::extract<uint16_t>(pyhead.attr("idx").attr("user")[i]);
+            }
+
+            for (int i = 0; i < ISMRMRD::ISMRMRD_USER_INTS; i++) {
+                head->user_int[i] = bp::extract<int32_t>(pyhead.attr("user_int")[i]);
+            }
+            for (int i = 0; i < ISMRMRD::ISMRMRD_USER_FLOATS; i++) {
+                head->user_float[i] = bp::extract<float>(pyhead.attr("user_float")[i]);
+            }
+        } catch (const bp::error_already_set&) {
+            std::string err = pyerr_to_string();
+            GERROR("%s", err.c_str());
+            throw std::runtime_error(err);
+        }
+    }
+};
+
+
+struct ImageHeader_to_PythonImageHeader {
+    static PyObject* convert(const ISMRMRD::ImageHeader& head) {
+        try {
+            bp::object module = bp::import("ismrmrd");
+            bp::object pyhead = module.attr("ImageHeader")();
+
+            pyhead.attr("version") = head.version;
+            pyhead.attr("data_type") = head.data_type;
+            pyhead.attr("flags") = head.flags;
+            pyhead.attr("measurement_uid") = head.measurement_uid;
+            for (int i = 0; i < ISMRMRD::ISMRMRD_POSITION_LENGTH; i++) {
+                pyhead.attr("matrix_size")[i] = head.matrix_size[i];
+            }
+            for (int i = 0; i < ISMRMRD::ISMRMRD_POSITION_LENGTH; i++) {
+                pyhead.attr("field_of_view")[i] = head.field_of_view[i];
+            }
+            pyhead.attr("channels") = head.channels;
+            for (int i = 0; i < ISMRMRD::ISMRMRD_POSITION_LENGTH; i++) {
+                pyhead.attr("position")[i] = head.position[i];
+            }
+            for (int i = 0; i < ISMRMRD::ISMRMRD_DIRECTION_LENGTH; i++) {
+                pyhead.attr("read_dir")[i] = head.read_dir[i];
+            }
+            for (int i = 0; i < ISMRMRD::ISMRMRD_DIRECTION_LENGTH; i++) {
+                pyhead.attr("phase_dir")[i] = head.phase_dir[i];
+            }
+            for (int i = 0; i < ISMRMRD::ISMRMRD_DIRECTION_LENGTH; i++) {
+                pyhead.attr("slice_dir")[i] = head.slice_dir[i];
+            }
+            for (int i = 0; i < ISMRMRD::ISMRMRD_POSITION_LENGTH; i++) {
+                pyhead.attr("patient_table_position")[i] = head.patient_table_position[i];
+            }
+            pyhead.attr("average") = head.average;
+            pyhead.attr("slice") = head.slice;
+            pyhead.attr("contrast") = head.contrast;
+            pyhead.attr("phase") = head.phase;
+            pyhead.attr("repetition") = head.repetition;
+            pyhead.attr("set") = head.set;
+            pyhead.attr("acquisition_time_stamp") = head.acquisition_time_stamp;
+            for (int i = 0; i < ISMRMRD::ISMRMRD_PHYS_STAMPS; i++) {
+                pyhead.attr("physiology_time_stamp")[i] = head.physiology_time_stamp[i];
+            }
+            pyhead.attr("image_type") = head.image_type;
+            pyhead.attr("image_index") = head.image_index;
+            pyhead.attr("image_series_index") = head.image_series_index;
+            for (int i = 0; i < ISMRMRD::ISMRMRD_USER_INTS; i++) {
+                pyhead.attr("user_int")[i] = head.user_int[i];
+            }
+            for (int i = 0; i < ISMRMRD::ISMRMRD_USER_FLOATS; i++) {
+                pyhead.attr("user_float")[i] = head.user_float[i];
+            }
+            pyhead.attr("attribute_string_len") = head.attribute_string_len;
+
+            // increment the reference count so it exists after `return`
+            return bp::incref(pyhead.ptr());
+        } catch (const bp::error_already_set&) {
+            std::string err = pyerr_to_string();
+            GERROR("%s", err.c_str());
+            throw std::runtime_error(err);
+        }
+    }
+};
+
+struct ImageHeader_from_PythonImageHeader {
+    ImageHeader_from_PythonImageHeader() {
+        // actually register this converter with Boost
+        bp::converter::registry::push_back(
+                &convertible,
+                &construct,
+                bp::type_id<ISMRMRD::ImageHeader>());
+    }
+
+    /// Returns NULL if the Python ImageHeader is not convertible
+    static void* convertible(PyObject* obj) {
+        return obj;
+    }
+
+    /// Construct an ISMRMRD::ImageHeader in-place
+    static void construct(PyObject* obj, bp::converter::rvalue_from_python_stage1_data* data) {
+        void* storage = ((bp::converter::rvalue_from_python_storage<ISMRMRD::ImageHeader>*)data)->storage.bytes;
+
+        // Placement-new of ISMRMRD::ImageHeader in memory provided by Boost
+        ISMRMRD::ImageHeader* head = new (storage) ISMRMRD::ImageHeader;
+        data->convertible = storage;
+
+        try {
+            bp::object pyhead((bp::handle<>(bp::borrowed(obj))));
+
+            head->version = bp::extract<uint16_t>(pyhead.attr("version"));
+            head->data_type = bp::extract<uint16_t>(pyhead.attr("data_type"));
+            head->flags = bp::extract<uint64_t>(pyhead.attr("flags"));
+            head->measurement_uid = bp::extract<uint32_t>(pyhead.attr("measurement_uid"));
+            for (int i = 0; i < ISMRMRD::ISMRMRD_POSITION_LENGTH; i++) {
+                head->matrix_size[i] = bp::extract<uint16_t>(pyhead.attr("matrix_size")[i]);
+            }
+            for (int i = 0; i < ISMRMRD::ISMRMRD_POSITION_LENGTH; i++) {
+                head->field_of_view[i] = bp::extract<float>(pyhead.attr("field_of_view")[i]);
+            }
+            head->channels = bp::extract<uint16_t>(pyhead.attr("channels"));
+            for (int i = 0; i < ISMRMRD::ISMRMRD_POSITION_LENGTH; i++) {
+                head->position[i] = bp::extract<float>(pyhead.attr("position")[i]);
+            }
+            for (int i = 0; i < ISMRMRD::ISMRMRD_DIRECTION_LENGTH; i++) {
+                head->read_dir[i] = bp::extract<float>(pyhead.attr("read_dir")[i]);
+            }
+            for (int i = 0; i < ISMRMRD::ISMRMRD_DIRECTION_LENGTH; i++) {
+                head->phase_dir[i] = bp::extract<float>(pyhead.attr("phase_dir")[i]);
+            }
+            for (int i = 0; i < ISMRMRD::ISMRMRD_DIRECTION_LENGTH; i++) {
+                head->slice_dir[i] = bp::extract<float>(pyhead.attr("slice_dir")[i]);
+            }
+            for (int i = 0; i < ISMRMRD::ISMRMRD_POSITION_LENGTH; i++) {
+                head->patient_table_position[i] = bp::extract<float>(pyhead.attr("patient_table_position")[i]);
+            }
+            head->average = bp::extract<uint16_t>(pyhead.attr("average"));
+            head->slice = bp::extract<uint16_t>(pyhead.attr("slice"));
+            head->contrast = bp::extract<uint16_t>(pyhead.attr("contrast"));
+            head->phase = bp::extract<uint16_t>(pyhead.attr("phase"));
+            head->repetition = bp::extract<uint16_t>(pyhead.attr("repetition"));
+            head->set = bp::extract<uint16_t>(pyhead.attr("set"));
+            head->acquisition_time_stamp = bp::extract<uint32_t>(pyhead.attr("acquisition_time_stamp"));
+            for (int i = 0; i < ISMRMRD::ISMRMRD_PHYS_STAMPS; i++) {
+                head->physiology_time_stamp[i] = bp::extract<uint32_t>(pyhead.attr("physiology_time_stamp")[i]);
+            }
+            head->image_type = bp::extract<uint16_t>(pyhead.attr("image_type"));
+            head->image_index = bp::extract<uint16_t>(pyhead.attr("image_index"));
+            head->image_series_index = bp::extract<uint16_t>(pyhead.attr("image_series_index"));
+            for (int i = 0; i < ISMRMRD::ISMRMRD_USER_INTS; i++) {
+                head->user_int[i] = bp::extract<int32_t>(pyhead.attr("user_int")[i]);
+            }
+            for (int i = 0; i < ISMRMRD::ISMRMRD_USER_FLOATS; i++) {
+                head->user_float[i] = bp::extract<float>(pyhead.attr("user_float")[i]);
+            }
+            head->attribute_string_len = bp::extract<uint32_t>(pyhead.attr("attribute_string_len"));
+        } catch (const bp::error_already_set&) {
+            std::string err = pyerr_to_string();
+            GERROR("%s", err.c_str());
+            throw std::runtime_error(err);
+        }
+    }
+};
+
+/// Create and register AcquisitionHeader converter as necessary
+inline void create_ismrmrd_AcquisitionHeader_converter() {
+    bp::type_info info = bp::type_id<ISMRMRD::AcquisitionHeader>();
+    const bp::converter::registration* reg = bp::converter::registry::query(info);
+    // only register if not already registered!
+    if (nullptr == reg || nullptr == (*reg).m_to_python) {
+        bp::to_python_converter<ISMRMRD::AcquisitionHeader, AcquisitionHeader_to_PythonAcquisitionHeader>();
+        AcquisitionHeader_from_PythonAcquisitionHeader();
+    }
+}
+
+/// Create and register ImageHeader converter as necessary
+inline void create_ismrmrd_ImageHeader_converter() {
+    bp::type_info info = bp::type_id<ISMRMRD::ImageHeader>();
+    const bp::converter::registration* reg = bp::converter::registry::query(info);
+    // only register if not already registered!
+    if (nullptr == reg || nullptr == (*reg).m_to_python) {
+        bp::to_python_converter<ISMRMRD::ImageHeader, ImageHeader_to_PythonImageHeader>();
+        ImageHeader_from_PythonImageHeader();
+    }
+}
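+
+/// Usage sketch (mirrors the example program earlier in this patch; the
+/// Python-side "modify" function is assumed to be defined in __main__):
+///
+///     PythonFunction<ISMRMRD::ImageHeader> modify("__main__", "modify");
+///     ISMRMRD::ImageHeader out = modify(in);   // C++ -> Python -> C++ round trip
+///
+/// The PythonFunction constructor triggers the registrations above via
+/// register_converter<ISMRMRD::ImageHeader>().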
+
+
+/// Partial specialization of `python_converter` for ISMRMRD::AcquisitionHeader
+template<> struct python_converter<ISMRMRD::AcquisitionHeader> {
+    static void create()
+    {
+        create_ismrmrd_AcquisitionHeader_converter();
+    }
+};
+
+/// Partial specialization of `python_converter` for ISMRMRD::ImageHeader
+template<> struct python_converter<ISMRMRD::ImageHeader> {
+    static void create()
+    {
+        create_ismrmrd_ImageHeader_converter();
+    }
+};
+
+}
+
+#endif /* GADGETRON_PYTHON_ISMRMRD_CONVERTER_H */
diff --git a/toolboxes/python/python_numpy_wrappers.h b/toolboxes/python/python_numpy_wrappers.h
new file mode 100644
index 0000000..b585a7f
--- /dev/null
+++ b/toolboxes/python/python_numpy_wrappers.h
@@ -0,0 +1,40 @@
+#ifndef GADGETRON_PYTHON_NUMPY_WRAPPERS_H
+#define GADGETRON_PYTHON_NUMPY_WRAPPERS_H
+
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#include "numpy/ndarraytypes.h"
+
+namespace Gadgetron {
+
+/// Wrappers for NumPy C-API functions. These functions must be used
+/// in the same C++ source file as the call to `import_array()`. In this
+/// case, that is python_toolbox.cpp. The simplest solution is to lightly wrap the API.
+EXPORTPYTHON int NumPyArray_NDIM(PyObject* obj);
+EXPORTPYTHON npy_intp NumPyArray_DIM(PyObject* obj, int i);
+EXPORTPYTHON void *NumPyArray_DATA(PyObject* obj);
+EXPORTPYTHON int NumPyArray_ITEMSIZE(PyObject* obj);
+EXPORTPYTHON npy_intp NumPyArray_SIZE(PyObject* obj);
+EXPORTPYTHON PyObject *NumPyArray_SimpleNew(int nd, npy_intp* dims, int typenum);
+
+/// return the enumerated numpy type for a given C++ type
+template <typename T> int get_numpy_type() { return NPY_VOID; }
+template <> inline int get_numpy_type< bool >() { return NPY_BOOL; }
+template <> inline int get_numpy_type< char >() { return NPY_INT8; }
+template <> inline int get_numpy_type< unsigned char >() { return NPY_UINT8; }
+template <> inline int get_numpy_type< short >() { return NPY_INT16; }
+template <> inline int get_numpy_type< unsigned short >() { return NPY_UINT16; }
+template <> inline int get_numpy_type< int >() { return NPY_INT32; }
+template <> inline int get_numpy_type< unsigned int >() { return NPY_UINT32; }
+template <> inline int get_numpy_type< long >() { return NPY_INT64; }
+template <> inline int get_numpy_type< unsigned long >() { return NPY_UINT64; }
+template <> inline int get_numpy_type< float >() { return NPY_FLOAT32; }
+template <> inline int get_numpy_type< double >() { return NPY_FLOAT64; }
+template <> inline int get_numpy_type< std::complex<float> >() { return NPY_COMPLEX64; }
+template <> inline int get_numpy_type< std::complex<double> >() { return NPY_COMPLEX128; }
+/* Don't define these for now */
+/* template <> inline int get_numpy_type< char* >() { return NPY_STRING; } */
+/* template <> inline int get_numpy_type< std::string >() { return NPY_STRING; } */
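+
+/// Illustrative mapping check (matches the table above):
+///     get_numpy_type<float>()                 == NPY_FLOAT32
+///     get_numpy_type< std::complex<float> >() == NPY_COMPLEX64
+/// so, e.g., an hoNDArray< std::complex<float> > converts to a complex64 ndarray.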
+
+}
+
+#endif // GADGETRON_PYTHON_NUMPY_WRAPPERS_H
diff --git a/toolboxes/python/python_toolbox.cpp b/toolboxes/python/python_toolbox.cpp
new file mode 100644
index 0000000..361ae05
--- /dev/null
+++ b/toolboxes/python/python_toolbox.cpp
@@ -0,0 +1,148 @@
+#include "python_toolbox.h"
+
+#include "Gadget.h"             // for GADGET_OK/FAIL
+#include "gadgetron_paths.h"    // for get_gadgetron_home()
+#include "gadgetron_config.h"   // for GADGETRON_PYTHON_PATH
+
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#include <numpy/numpyconfig.h>
+#include <numpy/arrayobject.h>
+
+#include <boost/thread/mutex.hpp>
+#include <boost/algorithm/string.hpp>
+
+
+namespace Gadgetron
+{
+
+static bool python_initialized = false;
+static bool numpy_initialized = false;
+static boost::mutex python_initialize_mtx;
+static boost::mutex numpy_initialize_mtx;
+
+int initialize_python(void)
+{
+    // lock here so only one thread can initialize Python
+    boost::mutex::scoped_lock lock(python_initialize_mtx);
+
+    if (!python_initialized) {
+        Py_Initialize();
+        initialize_numpy();
+
+        PyEval_InitThreads();
+
+        //Swap out and return current thread state and release the GIL
+        //Must be done, otherwise subsequent calls to PyGILState_Ensure()
+        //will not be guaranteed to acquire lock
+        PyThreadState* tstate = PyEval_SaveThread();
+        if (!tstate) {
+            GDEBUG("Error occurred returning lock to Python\n");
+            return GADGET_FAIL;
+        }
+
+        python_initialized = true; // interpreter successfully initialized
+
+        //Let's first get the path set for the library folder
+        std::string gadgetron_home = get_gadgetron_home();
+        std::string path_name = gadgetron_home + std::string("/") + std::string(GADGETRON_PYTHON_PATH);
+
+        if (gadgetron_home.size() != 0) {
+            if (add_python_path(path_name) == GADGET_FAIL) {
+                GDEBUG("python_toolbox failed to add path %s\n", path_name.c_str());
+                return GADGET_FAIL;
+            }
+        }
+    }
+    return GADGET_OK;
+}
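+
+// Minimal caller sketch (illustrative; the embedded script is arbitrary):
+//
+//     initialize_python();        // idempotent, thread-safe via the mutex above
+//     {
+//         GILLock lock;           // re-acquire the GIL released by PyEval_SaveThread()
+//         boost::python::exec("print('hello from gadgetron')",
+//                 boost::python::import("__main__").attr("__dict__"));
+//     }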
+
+int initialize_numpy(void)
+{
+    // lock here so only one thread can initialize NumPy
+    boost::mutex::scoped_lock lock(numpy_initialize_mtx);
+
+    if (!numpy_initialized) {
+        _import_array();    // import NumPy
+        numpy_initialized = true; // numpy successfully initialized
+    }
+    return GADGET_OK;
+}
+
+int add_python_path(const std::string& path)
+{
+    GILLock lock;   // Lock the GIL
+
+    std::string add_path_cmd;
+    if (path.size() > 0) {
+        std::vector<std::string> paths;
+        boost::split(paths, path, boost::is_any_of(";"));
+        for (unsigned int i = 0; i < paths.size(); i++) {
+            add_path_cmd = std::string("import sys;\nif sys.path.count(\"") +
+                    paths[i] + std::string("\") == 0:\n\tsys.path.insert(0, \"") +
+                    paths[i] + std::string("\")\n");
+            //GDEBUG("Executing path command:\n%s\n", path_cmd.c_str());
+            boost::python::exec(add_path_cmd.c_str(),
+                    boost::python::import("__main__").attr("__dict__"));
+        }
+    }
+
+    return GADGET_OK;
+}
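+
+// Example (paths are hypothetical): several directories may be passed at once,
+// separated by ';'; each one is prepended to sys.path only if not already present.
+//
+//     add_python_path("/usr/local/share/gadgetron/python;/home/user/my_python_tools");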
+
+/// Adapted from http://stackoverflow.com/a/6576177/1689220
+std::string pyerr_to_string(void)
+{
+    PyObject *exc, *val, *tb;
+    bp::object formatted_list, formatted;
+    PyErr_Fetch(&exc, &val, &tb);
+    // wrap exception, value, traceback with bp::handle for auto memory management
+    bp::handle<> hexc(exc), hval(bp::allow_null(val)), htb(bp::allow_null(tb));
+    // import "traceback" module
+    bp::object traceback(bp::import("traceback"));
+    if (!tb) {
+        bp::object format_exception_only(traceback.attr("format_exception_only"));
+        formatted_list = format_exception_only(hexc, hval);
+    } else {
+        bp::object format_exception(traceback.attr("format_exception"));
+        formatted_list = format_exception(hexc, hval, htb);
+    }
+    formatted = bp::str("\n").join(formatted_list);
+    return bp::extract<std::string>(formatted);
+}
+
+/// Wraps PyArray_NDIM
+int NumPyArray_NDIM(PyObject* obj)
+{
+    return PyArray_NDIM((PyArrayObject*)obj);
+}
+
+/// Wraps PyArray_DIM
+npy_intp NumPyArray_DIM(PyObject* obj, int i)
+{
+    return PyArray_DIM((PyArrayObject*)obj, i);
+}
+
+/// Wraps PyArray_DATA
+void* NumPyArray_DATA(PyObject* obj)
+{
+    return PyArray_DATA((PyArrayObject*)obj);
+}
+
+/// Wraps PyArray_ITEMSIZE
+int NumPyArray_ITEMSIZE(PyObject* obj)
+{
+    return PyArray_ITEMSIZE((PyArrayObject*)obj);
+}
+
+npy_intp NumPyArray_SIZE(PyObject* obj)
+{
+    return PyArray_SIZE((PyArrayObject*)obj);
+}
+
+/// Wraps PyArray_SimpleNew
+PyObject* NumPyArray_SimpleNew(int nd, npy_intp* dims, int typenum)
+{
+    return PyArray_SimpleNew(nd, dims, typenum);
+}
+
+}
diff --git a/toolboxes/python/python_toolbox.h b/toolboxes/python/python_toolbox.h
new file mode 100644
index 0000000..f9bdee3
--- /dev/null
+++ b/toolboxes/python/python_toolbox.h
@@ -0,0 +1,154 @@
+#ifndef GADGETRON_PYTHON_H
+#define GADGETRON_PYTHON_H
+
+#include "python_export.h"
+
+#include <boost/python.hpp>
+namespace bp = boost::python;
+
+namespace Gadgetron
+{
+
+/// Initialize Python and NumPy. Called by each PythonFunction constructor
+EXPORTPYTHON int initialize_python(void);
+/// Initialize NumPy
+EXPORTPYTHON int initialize_numpy(void);
+/// Add a path to the PYTHONPATH
+EXPORTPYTHON int add_python_path(const std::string& path);
+
+/// Extracts the exception/traceback to build and return a std::string
+EXPORTPYTHON std::string pyerr_to_string(void);
+
+}
+
+// Include converters after declaring above functions
+#include "python_converters.h"
+
+namespace Gadgetron
+{
+/// Utility class for RAII handling of the Python GIL. Usage:
+///
+///    GILLock lg;  // at the top of a block
+///
+class GILLock
+{
+public:
+    GILLock() { gstate_ = PyGILState_Ensure(); }
+    ~GILLock() { PyGILState_Release(gstate_); }
+private:
+    // noncopyable
+    GILLock(const GILLock&);
+    GILLock& operator=(const GILLock&);
+
+    PyGILState_STATE gstate_;
+};
+
+/// Base class for templated PythonFunction class. Do not use directly.
+class PythonFunctionBase
+{
+protected:
+    PythonFunctionBase(const std::string& module, const std::string& funcname)
+    {
+        initialize_python(); // ensure Python and NumPy are initialized
+        GILLock lg; // Lock the GIL, releasing at the end of constructor
+        try {
+            // import the module and load the function
+            bp::object mod(bp::import(module.c_str()));
+            fn_ = mod.attr(funcname.c_str());
+        } catch (const bp::error_already_set&) {
+            std::string err = pyerr_to_string();
+            GERROR("%s", err.c_str());
+            throw std::runtime_error(err);
+        }
+    }
+
+    bp::object fn_;
+};
+
+/// PythonFunction for multiple return types (std::tuple)
+template <typename... ReturnTypes>
+class PythonFunction : public PythonFunctionBase
+{
+public:
+    typedef std::tuple<ReturnTypes...> TupleType;
+
+    PythonFunction(const std::string& module, const std::string& funcname)
+      : PythonFunctionBase(module, funcname)
+    {
+        // register the tuple return type converter
+        register_converter<TupleType>();
+    }
+
+    template <typename... TS>
+    TupleType operator()(const TS&... args)
+    {
+        // register type converter for each parameter type
+        register_converter<TS...>();
+        GILLock lg; // lock GIL and release at function exit
+        try {
+            bp::object res = fn_(args...);
+            return bp::extract<TupleType>(res);
+        } catch (bp::error_already_set const &) {
+            std::string err = pyerr_to_string();
+            GERROR("%s", err.c_str());
+            throw std::runtime_error(err);
+        }
+    }
+};
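+
+/// Usage sketch for the multi-return form (mirrors test_python.cpp in this patch):
+///
+///     PythonFunction<hoNDArray<std::complex<float> >, hoNDArray<float> >
+///             calc("ismrmrdtools.grappa", "calculate_grappa_unmixing");
+///     std::tie(unmix, gmap) = calc(source_data, 3);  // Python returns a 2-tuple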
+
+/// PythonFunction for a single return type
+template <typename RetType>
+class PythonFunction<RetType> : public PythonFunctionBase
+{
+public:
+    PythonFunction(const std::string& module, const std::string& funcname)
+      : PythonFunctionBase(module, funcname)
+    {
+        // register the return type converter
+        register_converter<RetType>();
+    }
+
+    template <typename... TS>
+    RetType operator()(const TS&... args)
+    {
+        // register type converter for each parameter type
+        register_converter<TS...>();
+        GILLock lg; // lock GIL and release at function exit
+        try {
+            bp::object res = fn_(args...);
+            return bp::extract<RetType>(res);
+        } catch (bp::error_already_set const &) {
+            std::string err = pyerr_to_string();
+            GERROR("%s", err.c_str());
+            throw std::runtime_error(err);
+        }
+    }
+};
+
+/// PythonFunction returning nothing
+template <>
+class PythonFunction<>  : public PythonFunctionBase
+{
+public:
+    PythonFunction(const std::string& module, const std::string& funcname)
+      : PythonFunctionBase(module, funcname) {}
+
+    template <typename... TS>
+    void operator()(const TS&... args)
+    {
+        // register type converter for each parameter type
+        register_converter<TS...>();
+        GILLock lg; // lock GIL and release at function exit
+        try {
+            bp::object res = fn_(args...);
+        } catch (bp::error_already_set const &) {
+            std::string err = pyerr_to_string();
+            GERROR("%s", err.c_str());
+            throw std::runtime_error(err);
+        }
+    }
+};
+
+}
+
+#endif // GADGETRON_PYTHON_H
diff --git a/toolboxes/python/python_tuple_converter.h b/toolboxes/python/python_tuple_converter.h
new file mode 100644
index 0000000..058fccd
--- /dev/null
+++ b/toolboxes/python/python_tuple_converter.h
@@ -0,0 +1,120 @@
+// Source: https://gist.github.com/niwibe/3729459
+#ifndef GADGETRON_PYTHON_TUPLE_CONVERTER_H
+#define GADGETRON_PYTHON_TUPLE_CONVERTER_H
+
+#include <boost/python.hpp>
+namespace bp = boost::python;
+
+namespace Gadgetron {
+
+/// indices trick
+template<int ...> struct seq{};
+template<int N, int ...S> struct gens : gens<N-1, N-1, S...>{};
+template<int ...S> struct gens<0, S...> {typedef seq<S...> type;};
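+
+/// Worked expansion (for reference): gens<3> derives from gens<2, 2>, then
+/// gens<1, 1, 2>, then gens<0, 0, 1, 2>, whose specialization above yields
+/// seq<0, 1, 2> -- the index sequence used below to unpack a 3-element tuple.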
+
+/// Used for expanding a C++ std::tuple into a boost::python::tuple
+template <typename ...Args>
+struct cpptuple2pytuple_wrapper {
+    std::tuple<Args...> params;
+    cpptuple2pytuple_wrapper(const std::tuple<Args...>& _params):params(_params){}
+
+    bp::tuple delayed_dispatch() {
+        return callFunc(typename gens<sizeof...(Args)>::type());
+    }
+
+    template<int ...S>
+    bp::tuple callFunc(seq<S...>) {
+        return bp::make_tuple(std::get<S>(params) ...);
+    }
+};
+
+/// Used for expanding a boost::python::tuple into a C++ std::tuple
+template <typename ...Args>
+struct pytuple2cpptuple_wrapper {
+    bp::tuple params;
+    pytuple2cpptuple_wrapper(const bp::tuple& _params):params(_params){}
+
+    std::tuple<Args...> delayed_dispatch() {
+        return callFunc(typename gens<sizeof...(Args)>::type());
+    }
+
+    template<int ...S>
+    std::tuple<Args...> callFunc(seq<S...>) {
+        return std::make_tuple((static_cast<Args>(bp::extract<Args>(params[S])))...);
+    }
+};
+
+/// Convert C++ std::tuple to boost::python::tuple as PyObject*.
+template<typename ... Args> PyObject* cpptuple2pytuple(const std::tuple<Args...>& t) {
+    cpptuple2pytuple_wrapper<Args...> wrapper(t);
+    bp::tuple bpt = wrapper.delayed_dispatch();
+    return bp::incref(bp::object(bpt).ptr());
+}
+
+/// Convert boost::python::tuple to C++ std::tuple.
+template<typename ... Args> std::tuple<Args...> pytuple2cpptuple(PyObject* obj) {
+    bp::tuple tup(bp::borrowed(obj));
+    pytuple2cpptuple_wrapper<Args...> wrapper(tup);
+    std::tuple<Args...> bpt = wrapper.delayed_dispatch();
+    return bpt;
+}
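+
+/// Minimal round-trip sketch (illustrative; must be run with the GIL held):
+///
+///     std::tuple<int, double> t(7, 3.5);
+///     PyObject* py = cpptuple2pytuple(t);                 // new reference to (7, 3.5)
+///     std::tuple<int, double> back = pytuple2cpptuple<int, double>(py);
+///     Py_DECREF(py);                                      // balance the incref above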
+
+/// To-Python converter used by Boost
+template<typename ... Args>
+struct cpptuple_to_python_tuple {
+    static PyObject* convert(const std::tuple<Args...>& t) {
+        return cpptuple2pytuple<Args...>(t);
+    }
+};
+
+/// From-Python converter used by Boost
+template<typename ... Args>
+struct cpptuple_from_python_tuple {
+    cpptuple_from_python_tuple() {
+        // actually register this converter
+        bp::converter::registry::push_back(&convertible, &construct, bp::type_id<std::tuple<Args...> >());
+    }
+
+    /// Returns NULL if the bp::tuple is not convertible
+    static void* convertible(PyObject* obj_ptr) {
+        if (!PyTuple_CheckExact(obj_ptr)) {
+            return NULL;
+        }
+        return obj_ptr;
+    }
+
+    /// Construct the std::tuple in place
+    static void construct(PyObject* obj_ptr, bp::converter::rvalue_from_python_stage1_data* data) {
+        void* storage = ((bp::converter::rvalue_from_python_storage<std::tuple<Args...> >*)data)->storage.bytes;
+        // Use placement-new to make std::tuple in memory provided by Boost
+        new (storage) std::tuple<Args...>(pytuple2cpptuple<Args...>(obj_ptr));
+        data->convertible = storage;
+    }
+};
+
+/// Create and register tuple converter as necessary
+template <typename ...TS> void create_tuple_converter() {
+    bp::type_info info = bp::type_id<std::tuple<TS...> >();
+    const bp::converter::registration* reg = bp::converter::registry::query(info);
+    // only register if not already registered!
+    if (nullptr == reg || nullptr == (*reg).m_to_python) {
+        bp::to_python_converter<std::tuple<TS...>, cpptuple_to_python_tuple<TS...> >();
+        cpptuple_from_python_tuple<TS...>();
+    }
+}
+
+/// Partial specialization of `python_converter` for std::tuple
+template <typename ...TS>
+struct python_converter<std::tuple<TS...> > {
+    static void create()
+    {
+        // register tuple converter
+        create_tuple_converter<TS...>();
+        // register converter for each type in the tuple
+        register_converter<TS...>();
+    }
+};
+
+}
+
+#endif // GADGETRON_PYTHON_TUPLE_CONVERTER_H
diff --git a/toolboxes/python/python_vector_converter.h b/toolboxes/python/python_vector_converter.h
new file mode 100644
index 0000000..f774dbb
--- /dev/null
+++ b/toolboxes/python/python_vector_converter.h
@@ -0,0 +1,92 @@
+#ifndef GADGETRON_PYTHON_VECTOR_CONVERTER_H
+#define GADGETRON_PYTHON_VECTOR_CONVERTER_H
+
+#include "python_toolbox.h"
+#include "python_numpy_wrappers.h"
+#include "log.h"
+
+#include <vector>
+
+#include <boost/python.hpp>
+namespace bp = boost::python;
+
+namespace Gadgetron {
+
+template <typename T>
+struct vector_to_numpy_array {
+    static PyObject* convert(const std::vector<T>& vec)
+    {
+        std::vector<npy_intp> dims(1);
+        dims[0] = vec.size();
+
+        // TODO: This probably only works for types that map to NumPy types
+        // so e.g. a std::vector<std::string> shouldn't work
+        PyObject* obj = NumPyArray_SimpleNew(dims.size(), &dims[0], get_numpy_type<T>());
+        if (sizeof(T) != NumPyArray_ITEMSIZE(obj)) {
+            GERROR("sizeof(T): %d, ITEMSIZE: %d\n", sizeof(T), NumPyArray_ITEMSIZE(obj));
+            throw std::runtime_error("vector_to_numpy_array: "
+                    "python object and std::vector data type sizes do not match");
+        }
+
+        // Copy data; std::vector storage is contiguous, so memcpy is safe for trivially copyable T
+        memcpy(NumPyArray_DATA(obj), &vec[0], vec.size() * sizeof(T));
+
+        // increment the reference count so it exists after `return`
+        return bp::incref(obj);
+    }
+};
+
+template <typename T>
+struct vector_from_numpy_array {
+    vector_from_numpy_array() {
+        // actually register this converter with Boost
+        bp::converter::registry::push_back(
+                &convertible,
+                &construct,
+                bp::type_id<std::vector<T> >());
+    }
+
+    static void* convertible(PyObject* obj) {
+        if (sizeof(T) != NumPyArray_ITEMSIZE(obj)) {
+            GERROR("sizeof(T): %d, ITEMSIZE: %d\n", sizeof(T), NumPyArray_ITEMSIZE(obj));
+            return NULL;
+        }
+        return obj;
+    }
+
+    static void construct(PyObject* obj, bp::converter::rvalue_from_python_stage1_data* data) {
+        void* storage = ((bp::converter::rvalue_from_python_storage<std::vector<T> >*)data)->storage.bytes;
+        data->convertible = storage;
+
+        size_t length = NumPyArray_SIZE(obj);
+        std::vector<T>* vec = new (storage) std::vector<T>(length);
+        memcpy(&(*vec)[0], NumPyArray_DATA(obj), sizeof(T) * length);
+    }
+};
+
+/// Create and register vector converter as necessary
+template <typename T> void create_vector_converter() {
+    bp::type_info info = bp::type_id<std::vector<T> >();
+    const bp::converter::registration* reg = bp::converter::registry::query(info);
+    // only register if not already registered!
+    if (nullptr == reg || nullptr == (*reg).m_to_python) {
+        bp::to_python_converter<std::vector<T>, vector_to_numpy_array<T> >();
+        vector_from_numpy_array<T>();
+    }
+}
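+
+/// Illustrative tie-in (see the rand_cplx_array example earlier in this patch):
+/// once std::vector<std::complex<double> > is registered, a 1-D complex128
+/// ndarray returned from Python is copied element-for-element into the vector:
+///
+///     register_converter<std::vector<std::complex<double> > >();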
+
+/// Partial specialization of `python_converter` for std::vector
+template <typename T>
+struct python_converter<std::vector<T> > {
+    static void create()
+    {
+        // ensure NumPy C-API is initialized
+        initialize_numpy();
+        // register std::vector converter
+        create_vector_converter<T>();
+    }
+};
+
+}
+
+#endif /* GADGETRON_PYTHON_VECTOR_CONVERTER_H */
diff --git a/toolboxes/registration/CMakeLists.txt b/toolboxes/registration/CMakeLists.txt
new file mode 100644
index 0000000..b19925d
--- /dev/null
+++ b/toolboxes/registration/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(optical_flow)
diff --git a/toolboxes/registration/optical_flow/CMakeLists.txt b/toolboxes/registration/optical_flow/CMakeLists.txt
new file mode 100644
index 0000000..bb88f84
--- /dev/null
+++ b/toolboxes/registration/optical_flow/CMakeLists.txt
@@ -0,0 +1,32 @@
+include_directories(   
+  ${Boost_INCLUDE_DIR}
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+)
+
+install(FILES
+  registrationSolver.h
+  multiresRegistrationSolver.h
+  opticalFlowSolver.h 
+  resampleOperator.h
+  opticalFlowOperator.h 
+  DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
+if(ARMADILLO_FOUND)
+  if(${ARMADILLO_VERSION_STRING} VERSION_GREATER "3.819" )
+    message("Compiling cpu based optical flow registration toolbox.")
+    set(BUILD_CPU_OPTIMAL_FLOW_REG On)
+  else (${ARMADILLO_VERSION_STRING} VERSION_GREATER "3.819" )
+    message("Armadillo (at least version 3.820) not found. Not compiling cpu-based optical flow registration toolbox. ")  
+    set(BUILD_CPU_OPTIMAL_FLOW_REG Off)
+  endif(${ARMADILLO_VERSION_STRING} VERSION_GREATER "3.819" )
+endif(ARMADILLO_FOUND)
+
+add_subdirectory(cpu)
+
+if (CUDA_FOUND)
+  message("Compiling gpu based optical flow registration toolbox.")
+  add_subdirectory(gpu)
+endif (CUDA_FOUND)
diff --git a/toolboxes/registration/optical_flow/cpu/CMakeLists.txt b/toolboxes/registration/optical_flow/cpu/CMakeLists.txt
new file mode 100644
index 0000000..056f842
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/CMakeLists.txt
@@ -0,0 +1,131 @@
+    if (WIN32)
+        ADD_DEFINITIONS(-D__BUILD_GADGETRON_CPUREG__)
+    endif (WIN32)
+
+    if(WIN32)
+        link_directories(${Boost_LIBRARY_DIRS})
+    endif(WIN32)
+
+    include_directories(
+        ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow
+        ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu/transformation
+        ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu/solver
+        ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu/warper
+        ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu/dissimilarity
+        ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu/register
+        ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu/application
+        ${CMAKE_SOURCE_DIR}/toolboxes/core
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/image
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/algorithm
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/core
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/operators
+        ${CMAKE_SOURCE_DIR}/toolboxes/operators/cpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+        ${CMAKE_SOURCE_DIR}/toolboxes/solvers/cpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools
+        ${Boost_INCLUDE_DIR}
+        ${ARMADILLO_INCLUDE_DIR} 
+        ${ACE_INCLUDE_DIR}
+        ${ISMRMRD_INCLUDE_DIR} )
+
+    set(opticalflow_files 
+                hoCKOpticalFlowSolver.cpp
+                hoCKOpticalFlowSolver.h
+                hoHSOpticalFlowSolver.cpp
+                hoHSOpticalFlowSolver.h
+                hoLinearResampleOperator.cpp
+                hoLinearResampleOperator.h
+                hoOpticalFlowSolver.cpp
+                hoOpticalFlowSolver.h )
+
+    set(transformation_files transformation/hoImageRegTransformation.h
+                             transformation/hoImageRegParametricTransformation.h 
+                             transformation/hoImageRegHomogenousTransformation.h 
+                             transformation/hoImageRegRigid2DTransformation.h 
+                             transformation/hoImageRegRigid3DTransformation.h 
+                             transformation/hoImageRegNonParametricTransformation.h 
+                             transformation/hoImageRegDeformationField.h )
+
+    set(solver_files solver/hoImageRegSolver.h 
+                     solver/hoImageRegParametricSolver.h
+                     solver/hoImageRegParametricDownHillSolver.h
+                     solver/hoImageRegParametricGradientDescentSolver.h
+                     solver/hoImageRegNonParametricSolver.h
+                     solver/hoImageRegDeformationFieldSolver.h 
+                     solver/hoImageRegDeformationFieldBidirectionalSolver.h )
+
+    set(warper_files warper/hoImageRegWarper.h)
+
+    set(similarity_files dissimilarity/hoImageRegDissimilarity.h
+                         dissimilarity/hoImageRegDissimilarityHistogramBased.h
+                         dissimilarity/hoImageRegDissimilarityLocalCCR.h
+                         dissimilarity/hoImageRegDissimilarityMutualInformation.h
+                         dissimilarity/hoImageRegDissimilarityNormalizedMutualInformation.h
+                         dissimilarity/hoImageRegDissimilaritySSD.h )
+
+    set(register_files register/hoImageRegRegister.h
+                       register/hoImageRegParametricRegister.h
+                       register/hoImageRegNonParametricRegister.h
+                       register/hoImageRegDeformationFieldRegister.h 
+                       register/hoImageRegDeformationFieldBidirectionalRegister.h )
+
+    set(application_files application/hoImageRegContainer2DRegistration.h )
+
+    if ( BUILD_CPU_OPTIMAL_FLOW_REG )
+
+        add_library(gadgetron_toolbox_cpureg SHARED 
+                    cpureg_export.h
+                    ${opticalflow_files} 
+                    ${transformation_files} 
+                    ${solver_files} 
+                    ${warper_files}
+                    ${similarity_files} 
+                    ${register_files}
+                    ${application_files} )
+
+        set_target_properties(gadgetron_toolbox_cpureg PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+
+        target_link_libraries(gadgetron_toolbox_cpureg 
+                gadgetron_toolbox_cpucore
+		gadgetron_toolbox_log
+                gadgetron_toolbox_cpucore_math 
+                ${ARMADILLO_LIBRARIES}
+                optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} )
+
+    else ( BUILD_CPU_OPTIMAL_FLOW_REG )
+        message("cpureg library will not be built ... ")
+    endif ( BUILD_CPU_OPTIMAL_FLOW_REG )
+
+    if ( BUILD_CPU_OPTIMAL_FLOW_REG )
+        source_group(opticalflow FILES ${opticalflow_files})
+        install(FILES ${opticalflow_files} DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+    endif ( BUILD_CPU_OPTIMAL_FLOW_REG )
+
+    source_group(transformation FILES ${transformation_files})
+    source_group(solver FILES ${solver_files})
+    source_group(warper FILES ${warper_files})
+    source_group(similarity FILES ${similarity_files})
+    source_group(register FILES ${register_files})
+    source_group(application FILES ${application_files})
+
+    if ( BUILD_CPU_OPTIMAL_FLOW_REG )
+        install(TARGETS gadgetron_toolbox_cpureg DESTINATION lib COMPONENT main)
+    endif ( BUILD_CPU_OPTIMAL_FLOW_REG )
+
+    install(FILES
+        ${transformation_files}
+        ${solver_files}
+        ${warper_files}
+        ${similarity_files}
+        ${register_files}
+        ${application_files}
+        DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
diff --git a/toolboxes/registration/optical_flow/cpu/application/hoImageRegContainer2DRegistration.h b/toolboxes/registration/optical_flow/cpu/application/hoImageRegContainer2DRegistration.h
new file mode 100644
index 0000000..3863c29
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/application/hoImageRegContainer2DRegistration.h
@@ -0,0 +1,1418 @@
+/** \file   hoImageRegContainer2DRegistration.h
+    \brief  Define the class to perform image registration over a 2D image container
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include <sstream>
+#include "hoNDArray.h"
+#include "hoNDImage.h"
+#include "hoNDInterpolator.h"
+#include "hoNDBoundaryHandler.h"
+#include "hoMatrix.h"
+#include "hoNDArray_utils.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDImage_util.h"
+#include "gtPlusISMRMRDReconUtil.h"
+
+// transformation
+#include "hoImageRegTransformation.h"
+#include "hoImageRegParametricTransformation.h"
+#include "hoImageRegDeformationField.h"
+
+// warper
+#include "hoImageRegWarper.h"
+
+// solver
+#include "hoImageRegDeformationFieldSolver.h"
+#include "hoImageRegParametricSolver.h"
+#include "hoImageRegDeformationFieldBidirectionalSolver.h"
+
+// dissimilarity
+#include "hoImageRegDissimilaritySSD.h"
+#include "hoImageRegDissimilarityLocalCCR.h"
+#include "hoImageRegDissimilarityMutualInformation.h"
+#include "hoImageRegDissimilarityNormalizedMutualInformation.h"
+
+// register
+#include "hoImageRegDeformationFieldRegister.h"
+#include "hoImageRegDeformationFieldBidirectionalRegister.h"
+
+// container2D
+#include "hoNDImageContainer2D.h"
+
+namespace Gadgetron
+{
+    template <typename ObjType> void printInfo(const ObjType& obj)
+    {
+        std::ostringstream outs;
+        obj.print(outs);
+        outs << std::ends;
+        std::string msg(outs.str());
+        GDEBUG_STREAM(msg.c_str());
+    }
+
+    enum GT_IMAGE_REG_CONTAINER_MODE
+    {
+        GT_IMAGE_REG_CONTAINER_PAIR_WISE,
+        GT_IMAGE_REG_CONTAINER_FIXED_REFERENCE,
+        GT_IMAGE_REG_CONTAINER_PROGRESSIVE
+    };
+
+    inline std::string getImageRegContainerModeName(GT_IMAGE_REG_CONTAINER_MODE v)
+    {
+        std::string name;
+
+        switch (v)
+        {
+            case GT_IMAGE_REG_CONTAINER_PAIR_WISE:
+                name = "Pair-wise";
+                break;
+
+            case GT_IMAGE_REG_CONTAINER_FIXED_REFERENCE:
+                name = "FixedReference";
+                break;
+
+            case GT_IMAGE_REG_CONTAINER_PROGRESSIVE:
+                name = "Progressive";
+                break;
+
+            default:
+                GERROR_STREAM("Unrecognized image registration container mode type : " << v);
+        }
+
+        return name;
+    }
+
+    inline GT_IMAGE_REG_CONTAINER_MODE getImageRegContainerModeType(const std::string& name)
+    {
+        GT_IMAGE_REG_CONTAINER_MODE v;
+
+        if ( name == "Pair-wise" )
+        {
+            v = GT_IMAGE_REG_CONTAINER_PAIR_WISE;
+        }
+        else if ( name == "FixedReference" )
+        {
+            v = GT_IMAGE_REG_CONTAINER_FIXED_REFERENCE;
+        }
+        else if ( name == "Progressive" )
+        {
+            v = GT_IMAGE_REG_CONTAINER_PROGRESSIVE;
+        }
+        else
+        {
+            GERROR_STREAM("Unrecognized image registration container mode name : " << name);
+        }
+
+        return v;
+    }
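+
+    /// Illustrative round trip between the two helpers above:
+    ///
+    ///     getImageRegContainerModeName(GT_IMAGE_REG_CONTAINER_FIXED_REFERENCE) == "FixedReference"
+    ///     getImageRegContainerModeType("FixedReference")  == GT_IMAGE_REG_CONTAINER_FIXED_REFERENCE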
+
+    /// perform the image registration over an image container2D
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    class hoImageRegContainer2DRegistration
+    {
+    public:
+
+        typedef hoImageRegContainer2DRegistration<ValueType, CoordType, DIn, DOut> Self;
+
+        typedef hoNDImage<ValueType, DOut> TargetType;
+        typedef hoNDImage<ValueType, DIn> SourceType;
+
+        typedef hoNDImage<ValueType, 2> Target2DType;
+        typedef Target2DType Source2DType;
+
+        typedef hoNDImage<ValueType, 3> Target3DType;
+        typedef Target3DType Source3DType;
+
+        typedef ValueType T;
+        typedef ValueType element_type;
+        typedef ValueType value_type;
+
+        typedef CoordType coord_type;
+
+        /// boundary handler and interpolator for target image
+        typedef hoNDBoundaryHandler<TargetType> BoundaryHandlerTargetType;
+        typedef hoNDBoundaryHandlerFixedValue<TargetType> BoundaryHandlerTargetFixedValueType;
+        typedef hoNDBoundaryHandlerBorderValue<TargetType> BoundaryHandlerTargetBorderValueType;
+        typedef hoNDBoundaryHandlerPeriodic<TargetType> BoundaryHandlerTargetPeriodicType;
+        typedef hoNDBoundaryHandlerMirror<TargetType> BoundaryHandlerTargetMirrorType;
+
+        typedef hoNDInterpolator<TargetType> InterpTargetType;
+        typedef hoNDInterpolatorLinear<TargetType> InterpTargetLinearType;
+        typedef hoNDInterpolatorNearestNeighbor<TargetType> InterpTargetNearestNeighborType;
+        typedef hoNDInterpolatorBSpline<TargetType, DIn> InterpTargetBSplineType;
+
+        /// boundary handler and interpolator for source image
+        typedef hoNDBoundaryHandler<SourceType> BoundaryHandlerSourceType;
+        typedef hoNDBoundaryHandlerFixedValue<SourceType> BoundaryHandlerSourceFixedValueType;
+        typedef hoNDBoundaryHandlerBorderValue<SourceType> BoundaryHandlerSourceBorderValueType;
+        typedef hoNDBoundaryHandlerPeriodic<SourceType> BoundaryHandlerSourcePeriodicType;
+        typedef hoNDBoundaryHandlerMirror<SourceType> BoundaryHandlerSourceMirrorType;
+
+        typedef hoNDInterpolator<SourceType> InterpSourceType;
+        typedef hoNDInterpolatorLinear<SourceType> InterpSourceLinearType;
+        typedef hoNDInterpolatorNearestNeighbor<SourceType> InterpSourceNearestNeighborType;
+        typedef hoNDInterpolatorBSpline<SourceType, DIn> InterpSourceBSplineType;
+
+        /// warper type
+        typedef hoImageRegWarper<ValueType, CoordType, DIn, DOut> WarperType;
+
+        /// image dissimilarity type
+        typedef hoImageRegDissimilarity<ValueType, DOut> DissimilarityType;
+
+        /// transformation
+        typedef hoImageRegParametricTransformation<CoordType, DIn, DOut> TransformationParametricType;
+
+        typedef hoImageRegDeformationField<CoordType, DIn> TransformationDeformationFieldType;
+        typedef typename TransformationDeformationFieldType::input_point_type input_point_type;
+        typedef typename TransformationDeformationFieldType::output_point_type output_point_type;
+        typedef typename TransformationDeformationFieldType::jacobian_position_type jacobian_position_type;
+        typedef typename TransformationDeformationFieldType::DeformationFieldType DeformationFieldType;
+
+        /// container
+        typedef hoNDImageContainer2D<TargetType> TargetContinerType;
+        typedef hoNDImageContainer2D<SourceType> SourceContinerType;
+        typedef hoNDImageContainer2D<DeformationFieldType> DeformationFieldContinerType;
+
+        hoImageRegContainer2DRegistration(unsigned int resolution_pyramid_levels=3, bool use_world_coordinates=false, ValueType bg_value=ValueType(0));
+        virtual ~hoImageRegContainer2DRegistration();
+
+        /// set the default parameters
+        virtual bool setDefaultParameters(unsigned int resolution_pyramid_levels=3, bool use_world_coordinates=false);
+
+        /// register two images
+        /// transform or deform can carry the initial transformation or deformation
+        /// if warped == NULL, the warped image will not be computed
+        virtual bool registerTwoImagesParametric(const TargetType& target, const SourceType& source, bool initial, TargetType* warped, TransformationParametricType& transform);
+        virtual bool registerTwoImagesDeformationField(const TargetType& target, const SourceType& source, bool initial, TargetType* warped, DeformationFieldType** deform);
+        virtual bool registerTwoImagesDeformationFieldBidirectional(const TargetType& target, const SourceType& source, bool initial, TargetType* warped, DeformationFieldType** deform, DeformationFieldType** deformInv);
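+        /// a minimal illustrative call, assuming 2D float images "target" and "source" and a registration
+        /// object "reg" of this class; the variable names are placeholders and not part of the interface:
+        ///
+        ///     typedef hoImageRegContainer2DRegistration<float, float, 2, 2> RegType;
+        ///     RegType::DeformationFieldType dx, dy;
+        ///     RegType::DeformationFieldType* deform[2] = { &dx, &dy };
+        ///     hoNDImage<float, 2> warpedImage;
+        ///     reg.registerTwoImagesDeformationField(target, source, false, &warpedImage, deform);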
+
+        /// if warped is true, the warped images will be computed; if initial is true, the registration will be initialized by deformation_field_ and deformation_field_inverse_
+        virtual bool registerOverContainer2DPairWise(TargetContinerType& targetContainer, SourceContinerType& sourceContainer, bool warped, bool initial = false);
+        virtual bool registerOverContainer2DFixedReference(TargetContinerType& targetContainer, const std::vector<unsigned int>& referenceFrame, bool warped, bool initial = false);
+        virtual bool registerOverContainer2DProgressive(TargetContinerType& targetContainer, const std::vector<unsigned int>& referenceFrame);
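+        /// a minimal driver sketch for the container-level interface, assuming a filled
+        /// hoNDImageContainer2D< hoNDImage<float, 2> > "frames" and one reference frame index per row in
+        /// "refFrames" (illustrative only; the variable names are placeholders):
+        ///
+        ///     hoImageRegContainer2DRegistration<float, float, 2, 2> reg(3, false, 0.0f);
+        ///     reg.container_reg_transformation_ = GT_IMAGE_REG_TRANSFORMATION_DEFORMATION_FIELD;
+        ///     reg.registerOverContainer2DFixedReference(frames, refFrames, true);
+        ///     // warped images are in reg.warped_container_; deformation fields in reg.deformation_field_[0], [1]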
+
+        /// warp image containers
+        template <typename ValueType2> 
+        bool warpContainer2D(const hoNDImageContainer2D< hoNDImage<ValueType2, DOut> >& targetContainer, 
+                             const hoNDImageContainer2D< hoNDImage<ValueType2, DIn> >& sourceContainer, 
+                             DeformationFieldContinerType deformation_field[], 
+                             hoNDImageContainer2D< hoNDImage<ValueType2, DOut> >& warppedContainer,
+                             Gadgetron::GT_BOUNDARY_CONDITION bh=GT_BOUNDARY_CONDITION_FIXEDVALUE)
+        {
+            try
+            {
+                typedef hoNDImage<ValueType2, DOut> ImageTargetType;
+                typedef hoNDImage<ValueType2, DIn> ImageSourceType;
+
+                size_t R = sourceContainer.rows();
+                std::vector<size_t> cols = sourceContainer.cols();
+
+                GADGET_CHECK_RETURN_FALSE(targetContainer.dimensions_equal_container(sourceContainer));
+                GADGET_CHECK_RETURN_FALSE(targetContainer.dimensions_equal_container(deformation_field[0]));
+
+                if ( !targetContainer.dimensions_equal_container(warppedContainer) )
+                {
+                    GADGET_CHECK_RETURN_FALSE(warppedContainer.copyFrom(targetContainer));
+                }
+
+                if ( R == 1 )
+                {
+                    long long N = (long long)cols[0];
+
+                    long long c;
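+                    // parallelize across the columns of the single-row container; every thread owns its own
+                    // boundary handlers, B-spline interpolator and warper, and the parallel region is only
+                    // enabled for 2D inputs ( if ( DIn==2 ) )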
+                    #pragma omp parallel private(c) shared(N, targetContainer, sourceContainer, warppedContainer, deformation_field, bh) if ( DIn==2 )
+                    {
+                        hoImageRegDeformationField<CoordType, DIn> deformTransform;
+                        hoNDBoundaryHandlerFixedValue< ImageSourceType > bhFixedValue;
+                        hoNDBoundaryHandlerBorderValue< ImageSourceType > bhBorderValue;
+                        hoNDBoundaryHandlerPeriodic< ImageSourceType > bhPeriodic;
+                        hoNDBoundaryHandlerMirror< ImageSourceType > bhMirror;
+
+                        hoNDInterpolatorBSpline<ImageSourceType, DIn> interpBSpline(5);
+
+                        hoImageRegWarper<ValueType2, CoordType, DIn, DOut> warper;
+                        warper.setBackgroundValue(bg_value_);
+                        warper.setTransformation(deformTransform);
+                        warper.setInterpolator(interpBSpline);
+
+                        #pragma omp for 
+                        for ( c=0; c<N; c++ )
+                        {
+                            const ImageTargetType& target = targetContainer(0, c);
+                            ImageSourceType& source = const_cast<ImageSourceType&>(sourceContainer(0, c));
+                            ImageTargetType& warpped = warppedContainer(0, c);
+
+                            bhFixedValue.setArray( source );
+                            interpBSpline.setArray( source );
+
+                            if ( bh == GT_BOUNDARY_CONDITION_FIXEDVALUE )
+                                interpBSpline.setBoundaryHandler(bhFixedValue);
+                            else if ( bh == GT_BOUNDARY_CONDITION_BORDERVALUE )
+                                interpBSpline.setBoundaryHandler(bhBorderValue);
+                            else if ( bh == GT_BOUNDARY_CONDITION_PERIODIC )
+                                interpBSpline.setBoundaryHandler(bhPeriodic);
+                            else if ( bh == GT_BOUNDARY_CONDITION_MIRROR )
+                                interpBSpline.setBoundaryHandler(bhMirror);
+                            else
+                                interpBSpline.setBoundaryHandler(bhFixedValue);
+
+                            for ( unsigned int ii=0; ii<DIn; ii++ )
+                            {
+                                deformTransform.setDeformationField( deformation_field[ii](0, c), ii );
+                            }
+
+                            warper.warp(target, source, use_world_coordinates_, warpped);
+                        }
+                    }
+                }
+                else
+                {
+
+                    long long r, c;
+                    #pragma omp parallel default(none) private(r, c) shared(targetContainer, sourceContainer, warppedContainer, deformation_field, R, cols, bh) if ( DIn==2 )
+                    {
+                        hoImageRegDeformationField<CoordType, DIn> deformTransform;
+                        hoNDBoundaryHandlerFixedValue< ImageSourceType > bhFixedValue;
+                        hoNDBoundaryHandlerBorderValue< ImageSourceType > bhBorderValue;
+                        hoNDBoundaryHandlerPeriodic< ImageSourceType > bhPeriodic;
+                        hoNDBoundaryHandlerMirror< ImageSourceType > bhMirror;
+
+                        hoNDInterpolatorBSpline<ImageSourceType, DIn> interpBSpline(5);
+
+                        hoImageRegWarper<ValueType2, CoordType, DIn, DOut> warper;
+                        warper.setBackgroundValue(bg_value_);
+                        warper.setTransformation(deformTransform);
+                        warper.setInterpolator(interpBSpline);
+
+                        #pragma omp for 
+                        for ( r=0; r<(long long)R; r++ )
+                        {
+                            long long N = (long long)cols[r];
+                            for ( c=0; c<N; c++ )
+                            {
+                                const ImageTargetType& target = targetContainer(r, c);
+                                ImageSourceType& source = const_cast<ImageSourceType&>(sourceContainer(r, c));
+                                ImageTargetType& warpped = warppedContainer(r, c);
+
+                                bhFixedValue.setArray( source );
+                                interpBSpline.setArray( source );
+
+                                if ( bh == GT_BOUNDARY_CONDITION_FIXEDVALUE )
+                                    interpBSpline.setBoundaryHandler(bhFixedValue);
+                                else if ( bh == GT_BOUNDARY_CONDITION_BORDERVALUE )
+                                    interpBSpline.setBoundaryHandler(bhBorderValue);
+                                else if ( bh == GT_BOUNDARY_CONDITION_PERIODIC )
+                                    interpBSpline.setBoundaryHandler(bhPeriodic);
+                                else if ( bh == GT_BOUNDARY_CONDITION_MIRROR )
+                                    interpBSpline.setBoundaryHandler(bhMirror);
+                                else
+                                    interpBSpline.setBoundaryHandler(bhFixedValue);
+
+                                for ( unsigned int ii=0; ii<DIn; ii++ )
+                                {
+                                    deformTransform.setDeformationField( deformation_field[ii](r, c), ii );
+                                }
+
+                                warper.warp(target, source, use_world_coordinates_, warpped);
+                            }
+                        }
+                    }
+                }
+            }
+            catch(...)
+            {
+                GERROR_STREAM("Errors happened in hoImageRegContainer2DRegistration<...>::warpContainer2D(...) ... ");
+                return false;
+            }
+
+            return true;
+        }
+
+        /// print the class information
+        virtual void print(std::ostream& os) const;
+
+        // ----------------------------------
+        // parameters
+        // ----------------------------------
+
+        /// mode for registration over the container
+        GT_IMAGE_REG_CONTAINER_MODE container_reg_mode_;
+
+        /// mode for transformation
+        GT_IMAGE_REG_TRANSFORMATION container_reg_transformation_;
+
+        /// background value, used to mark regions in the target image which will not be warped
+        ValueType bg_value_;
+
+        /// whether to perform world coordinate registration
+        bool use_world_coordinates_;
+
+        /// number of resolution pyramid levels
+        unsigned int resolution_pyramid_levels_;
+
+        /// number of iterations for every pyramid level
+        std::vector<unsigned int> max_iter_num_pyramid_level_;
+
+        /// dissimilarity
+        GT_IMAGE_DISSIMILARITY dissimilarity_type_;
+
+        /// threshold for dissimilarity for every pyramid level
+        std::vector<ValueType> dissimilarity_thres_pyramid_level_;
+
+        /// number of search size division for every pyramid level
+        std::vector<unsigned int> div_num_pyramid_level_;
+
+        /// parameters for dissimilarity measures, for every pyramid level
+        /// LocalCCR
+        std::vector<std::vector<ValueType> > dissimilarity_LocalCCR_sigmaArg_;
+
+        /// Histogram based
+        /// Mutual information
+        std::vector<ValueType> dissimilarity_MI_betaArg_;
+
+        /// regularization strength for every pyramid level
+        /// if regularization_hilbert_strength_world_coordinate_=true, the strength is in world-coordinate units (e.g. mm)
+        /// if regularization_hilbert_strength_world_coordinate_=false, the strength is in pixel units
+        bool regularization_hilbert_strength_world_coordinate_;
+        std::vector< std::vector<ValueType> > regularization_hilbert_strength_pyramid_level_;
+
+        /// boundary handler type
+        std::vector<GT_BOUNDARY_CONDITION> boundary_handler_type_warper_;
+        std::vector<GT_IMAGE_INTERPOLATOR> interp_type_warper_;
+
+        /// number of iterations to improve the estimation of the inverse transform
+        std::vector<unsigned int> inverse_deform_enforce_iter_pyramid_level_;
+        /// weight to update the estimation of the inverse transform, must be within [0 1]
+        std::vector<CoordType> inverse_deform_enforce_weight_pyramid_level_;
+
+        /// in-FOV constraint
+        bool apply_in_FOV_constraint_;
+
+        /// verbose mode
+        bool verbose_;
+
+        // ----------------------------------
+        // debug and timing
+        // ----------------------------------
+        // clock for timing
+        Gadgetron::GadgetronTimer gt_timer1_;
+        Gadgetron::GadgetronTimer gt_timer2_;
+        Gadgetron::GadgetronTimer gt_timer3_;
+
+        bool performTiming_;
+
+        // exporter
+        Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+        // debug folder
+        std::string debugFolder_;
+
+        // ----------------------------------
+        // registration results
+        // ----------------------------------
+
+        /// warped images
+        TargetContinerType warped_container_;
+
+        /// for parametric registration
+        std::vector< std::vector<TransformationParametricType*> > parametric_tranformation_;
+
+        /// deformation field registration
+        DeformationFieldContinerType deformation_field_[DIn];
+        DeformationFieldContinerType deformation_field_inverse_[DIn];
+
+    protected:
+
+        bool initialize(const TargetContinerType& targetContainer, bool warped);
+
+    };
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    hoImageRegContainer2DRegistration<ValueType, CoordType, DIn, DOut>::
+    hoImageRegContainer2DRegistration(unsigned int resolution_pyramid_levels, bool use_world_coordinates, ValueType bg_value) 
+    : bg_value_(bg_value), use_world_coordinates_(use_world_coordinates), resolution_pyramid_levels_(resolution_pyramid_levels), performTiming_(false)
+    {
+        gt_timer1_.set_timing_in_destruction(false);
+        gt_timer2_.set_timing_in_destruction(false);
+        gt_timer3_.set_timing_in_destruction(false);
+
+        GADGET_CHECK_THROW(this->setDefaultParameters(resolution_pyramid_levels, use_world_coordinates));
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    hoImageRegContainer2DRegistration<ValueType, CoordType, DIn, DOut>::
+    ~hoImageRegContainer2DRegistration()
+    {
+        if ( !parametric_tranformation_.empty() )
+        {
+            size_t r, c;
+            for ( r=0; r<parametric_tranformation_.size(); r++ )
+            {
+                if ( !parametric_tranformation_[r].empty() )
+                {
+                    for ( c=0; c<parametric_tranformation_[r].size(); c++ )
+                    {
+                        if ( parametric_tranformation_[r][c] != NULL )
+                        {
+                            delete parametric_tranformation_[r][c];
+                            parametric_tranformation_[r][c] = NULL;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    bool hoImageRegContainer2DRegistration<ValueType, CoordType, DIn, DOut>::setDefaultParameters(unsigned int resolution_pyramid_levels, bool use_world_coordinates)
+    {
+        unsigned int ii;
+
+        use_world_coordinates_ = use_world_coordinates;
+        resolution_pyramid_levels_ = resolution_pyramid_levels;
+
+        container_reg_mode_ = GT_IMAGE_REG_CONTAINER_PAIR_WISE;
+        container_reg_transformation_ = GT_IMAGE_REG_TRANSFORMATION_DEFORMATION_FIELD;
+
+        max_iter_num_pyramid_level_.clear();
+        max_iter_num_pyramid_level_.resize(resolution_pyramid_levels_, 32);
+        max_iter_num_pyramid_level_[0] = 16;
+
+        dissimilarity_type_ = GT_IMAGE_DISSIMILARITY_LocalCCR;
+
+        dissimilarity_thres_pyramid_level_.clear();
+        dissimilarity_thres_pyramid_level_.resize(resolution_pyramid_levels_, (ValueType)(1e-5) );
+
+        div_num_pyramid_level_.clear();
+        div_num_pyramid_level_.resize(resolution_pyramid_levels_, 2);
+
+        dissimilarity_LocalCCR_sigmaArg_.clear();
+        dissimilarity_LocalCCR_sigmaArg_.resize(resolution_pyramid_levels_);
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            dissimilarity_LocalCCR_sigmaArg_[ii].resize(DIn, 2.0);
+        }
+
+        dissimilarity_MI_betaArg_.clear();
+        dissimilarity_MI_betaArg_.resize(resolution_pyramid_levels_, 2);
+
+        regularization_hilbert_strength_world_coordinate_ = false;
+        regularization_hilbert_strength_pyramid_level_.resize(resolution_pyramid_levels_);
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            regularization_hilbert_strength_pyramid_level_[ii].resize(DIn, 12.0);
+        }
+
+        boundary_handler_type_warper_.clear();
+        boundary_handler_type_warper_.resize(resolution_pyramid_levels_, GT_BOUNDARY_CONDITION_BORDERVALUE);
+
+        interp_type_warper_.clear();
+        interp_type_warper_.resize(resolution_pyramid_levels_, GT_IMAGE_INTERPOLATOR_LINEAR);
+
+        inverse_deform_enforce_iter_pyramid_level_.clear();
+        inverse_deform_enforce_iter_pyramid_level_.resize(resolution_pyramid_levels_, 10);
+
+        inverse_deform_enforce_weight_pyramid_level_.clear();
+        inverse_deform_enforce_weight_pyramid_level_.resize(resolution_pyramid_levels_, 0.5);
+
+        apply_in_FOV_constraint_ = false;
+
+        verbose_ = false;
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    bool hoImageRegContainer2DRegistration<ValueType, CoordType, DIn, DOut>::
+    registerTwoImagesParametric(const TargetType& target, const SourceType& source, bool initial, TargetType* warped, TransformationParametricType& transform)
+    {
+        try
+        {
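+            // parametric two-image registration is not implemented here; this method is currently a
+            // placeholder which returns true without performing any work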
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in hoImageRegContainer2DRegistration<ValueType, CoordType, DIn, DOut>::registerTwoImagesParametric(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    bool hoImageRegContainer2DRegistration<ValueType, CoordType, DIn, DOut>::
+    registerTwoImagesDeformationField(const TargetType& target, const SourceType& source, bool initial, TargetType* warped, DeformationFieldType** deform)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(DIn==DOut);
+            GADGET_CHECK_RETURN_FALSE(deform!=NULL);
+
+            hoImageRegDeformationFieldRegister<ValueType, CoordType, DIn> reg(resolution_pyramid_levels_, use_world_coordinates_, bg_value_);
+
+            if ( !debugFolder_.empty() )
+            {
+                reg.debugFolder_ = debugFolder_;
+            }
+
+            GADGET_CHECK_RETURN_FALSE(reg.setDefaultParameters(resolution_pyramid_levels_, use_world_coordinates_));
+
+            reg.max_iter_num_pyramid_level_ = max_iter_num_pyramid_level_;
+            reg.div_num_pyramid_level_ = div_num_pyramid_level_;
+            reg.dissimilarity_MI_betaArg_ = dissimilarity_MI_betaArg_;
+            reg.regularization_hilbert_strength_world_coordinate_ = regularization_hilbert_strength_world_coordinate_;
+            reg.regularization_hilbert_strength_pyramid_level_ = regularization_hilbert_strength_pyramid_level_;
+            reg.dissimilarity_LocalCCR_sigmaArg_ = dissimilarity_LocalCCR_sigmaArg_;
+            reg.boundary_handler_type_warper_ = boundary_handler_type_warper_;
+            reg.interp_type_warper_ = interp_type_warper_;
+            reg.apply_in_FOV_constraint_ = apply_in_FOV_constraint_;
+            reg.verbose_ = verbose_;
+
+            reg.dissimilarity_type_.clear();
+            reg.dissimilarity_type_.resize(resolution_pyramid_levels_, dissimilarity_type_);
+
+            reg.setTarget( const_cast<TargetType&>(target) );
+            reg.setSource( const_cast<SourceType&>(source) );
+
+            if ( verbose_ )
+            {
+                std::ostringstream outs;
+                reg.print(outs);
+                GDEBUG_STREAM(outs.str());
+            }
+
+            GADGET_CHECK_RETURN_FALSE(reg.initialize());
+
+            unsigned int d;
+
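+            // if the supplied deformation fields already match the target geometry, they are kept and used
+            // as the initial estimate when initial==true; otherwise they are re-allocated to the target
+            // geometry and zero-filled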
+            if ( target.dimensions_equal( *(deform[0]) ) )
+            {
+                if ( initial )
+                {
+                    for ( d=0; d<DIn; d++ )
+                    {
+                        reg.transform_->setDeformationField( *(deform[d]), d);
+                    }
+                }
+            }
+            else
+            {
+                for ( d=0; d<DIn; d++ )
+                {
+                    deform[d]->copyImageInfo(target);
+                    Gadgetron::clear( *(deform[d]) );
+                }
+            }
+
+            GADGET_CHECK_RETURN_FALSE(reg.performRegistration());
+
+            for ( d=0; d<DIn; d++ )
+            {
+                *(deform[d]) = reg.transform_->getDeformationField(d);
+            }
+
+            if ( warped != NULL )
+            {
+                /// bspline warp
+                hoNDBoundaryHandlerFixedValue<SourceType> bhFixedValue;
+                bhFixedValue.setArray( const_cast<SourceType&>(source) );
+
+                hoNDInterpolatorBSpline<SourceType, DIn> interpBSpline(5);
+                interpBSpline.setArray( const_cast<SourceType&>(source) );
+                interpBSpline.setBoundaryHandler(bhFixedValue);
+
+                hoImageRegWarper<ValueType, CoordType, DIn, DOut> warper;
+                warper.setBackgroundValue(bg_value_);
+                warper.setTransformation(*reg.transform_);
+                warper.setInterpolator(interpBSpline);
+
+                warper.warp(target, source, use_world_coordinates_, *warped);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in hoImageRegContainer2DRegistration<ValueType, CoordType, DIn, DOut>::registerTwoImagesDeformationField(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    bool hoImageRegContainer2DRegistration<ValueType, CoordType, DIn, DOut>::
+    registerTwoImagesDeformationFieldBidirectional(const TargetType& target, const SourceType& source, bool initial, TargetType* warped, DeformationFieldType** deform, DeformationFieldType** deformInv)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(DIn==DOut);
+            GADGET_CHECK_RETURN_FALSE(deform!=NULL);
+            GADGET_CHECK_RETURN_FALSE(deformInv!=NULL);
+
+            hoImageRegDeformationFieldBidirectionalRegister<ValueType, coord_type, DIn> reg(resolution_pyramid_levels_, use_world_coordinates_, bg_value_);
+
+            if ( !debugFolder_.empty() )
+            {
+                reg.debugFolder_ = debugFolder_;
+            }
+
+            GADGET_CHECK_RETURN_FALSE(reg.setDefaultParameters(resolution_pyramid_levels_, use_world_coordinates_));
+
+            reg.max_iter_num_pyramid_level_ = max_iter_num_pyramid_level_;
+            reg.div_num_pyramid_level_ = div_num_pyramid_level_;
+            reg.dissimilarity_MI_betaArg_ = dissimilarity_MI_betaArg_;
+            reg.regularization_hilbert_strength_world_coordinate_ = regularization_hilbert_strength_world_coordinate_;
+            reg.regularization_hilbert_strength_pyramid_level_ = regularization_hilbert_strength_pyramid_level_;
+            reg.dissimilarity_LocalCCR_sigmaArg_ = dissimilarity_LocalCCR_sigmaArg_;
+            reg.boundary_handler_type_warper_ = boundary_handler_type_warper_;
+            reg.interp_type_warper_ = interp_type_warper_;
+            reg.inverse_deform_enforce_iter_pyramid_level_ = inverse_deform_enforce_iter_pyramid_level_;
+            reg.inverse_deform_enforce_weight_pyramid_level_ = inverse_deform_enforce_weight_pyramid_level_;
+            reg.apply_in_FOV_constraint_ = apply_in_FOV_constraint_;
+
+            reg.verbose_ = verbose_;
+
+            reg.dissimilarity_type_.clear();
+            reg.dissimilarity_type_.resize(resolution_pyramid_levels_, dissimilarity_type_);
+
+            reg.setTarget( const_cast<TargetType&>(target) );
+            reg.setSource( const_cast<SourceType&>(source) );
+
+            if ( verbose_ )
+            {
+                Gadgetron::printInfo(reg);
+            }
+
+            GADGET_CHECK_RETURN_FALSE(reg.initialize());
+
+            unsigned int d;
+
+            if ( target.dimensions_equal( *(deform[0]) ) )
+            {
+                if ( initial )
+                {
+                    for ( d=0; d<DIn; d++ )
+                    {
+                        reg.transform_->setDeformationField( *(deform[d]), d);
+                        reg.transform_inverse_->setDeformationField( *(deformInv[d]), d);
+                    }
+                }
+            }
+            else
+            {
+                for ( d=0; d<DIn; d++ )
+                {
+                    deform[d]->copyImageInfo(target);
+                    Gadgetron::clear( *(deform[d]) );
+                    deformInv[d]->copyImageInfo(target);
+                    Gadgetron::clear( *(deformInv[d]) );
+                }
+            }
+
+            GADGET_CHECK_RETURN_FALSE(reg.performRegistration());
+
+            for ( d=0; d<DIn; d++ )
+            {
+                *(deform[d]) = reg.transform_->getDeformationField(d);
+                *(deformInv[d]) = reg.transform_inverse_->getDeformationField(d);
+            }
+
+            if ( warped != NULL )
+            {
+                /// bspline warp, using a border-value boundary handler
+                hoNDBoundaryHandlerBorderValue<SourceType> bhBorderValue;
+                bhBorderValue.setArray(const_cast<SourceType&>(source));
+
+                hoNDInterpolatorBSpline<SourceType, DIn> interpBSpline(5);
+                interpBSpline.setArray(const_cast<SourceType&>(source));
+                interpBSpline.setBoundaryHandler(bhBorderValue);
+
+                hoImageRegWarper<ValueType, CoordType, DIn, DOut> warper;
+                warper.setBackgroundValue(bg_value_);
+                warper.setTransformation(*reg.transform_);
+                warper.setInterpolator(interpBSpline);
+
+                GADGET_CHECK_RETURN_FALSE(warper.warp(target, source, use_world_coordinates_, *warped));
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in hoImageRegContainer2DRegistration<ValueType, CoordType, DIn, DOut>::registerTwoImagesDeformationFieldBidirectional(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    bool hoImageRegContainer2DRegistration<ValueType, CoordType, DIn, DOut>::
+    initialize(const TargetContinerType& targetContainer, bool warped)
+    {
+        try
+        {
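+            // allocate the outputs: the warped image container mirrors the target container, and one
+            // deformation-field container per input dimension is created and zero-filled (plus the inverse
+            // fields for the bidirectional mode)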
+            if ( warped )
+            {
+                GADGET_CHECK_RETURN_FALSE(warped_container_.copyFrom(targetContainer));
+            }
+
+            std::vector<size_t> col = targetContainer.cols();
+
+            unsigned int ii;
+
+            if ( container_reg_transformation_ == GT_IMAGE_REG_TRANSFORMATION_DEFORMATION_FIELD )
+            {
+                for ( ii=0; ii<DIn; ii++ )
+                {
+                    GADGET_CHECK_RETURN_FALSE(deformation_field_[ii].create(col));
+                    GADGET_CHECK_RETURN_FALSE(deformation_field_[ii].fillWithZeros());
+                }
+            }
+            else if ( container_reg_transformation_ == GT_IMAGE_REG_TRANSFORMATION_DEFORMATION_FIELD_BIDIRECTIONAL )
+            {
+                for ( ii=0; ii<DIn; ii++ )
+                {
+                    GADGET_CHECK_RETURN_FALSE(deformation_field_[ii].create(col));
+                    GADGET_CHECK_RETURN_FALSE(deformation_field_[ii].fillWithZeros());
+
+                    GADGET_CHECK_RETURN_FALSE(deformation_field_inverse_[ii].create(col));
+                    GADGET_CHECK_RETURN_FALSE(deformation_field_inverse_[ii].fillWithZeros());
+                }
+            }
+            else if ( container_reg_transformation_==GT_IMAGE_REG_TRANSFORMATION_RIGID 
+                        || container_reg_transformation_==GT_IMAGE_REG_TRANSFORMATION_AFFINE )
+            {
+                GDEBUG_STREAM("To be implemented ...");
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in hoImageRegContainer2DRegistration<ValueType, CoordType, DIn, DOut>::initialize(const TargetContinerType& targetContainer) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    bool hoImageRegContainer2DRegistration<ValueType, CoordType, DIn, DOut>::
+    registerOverContainer2DPairWise(TargetContinerType& targetContainer, SourceContinerType& sourceContainer, bool warped, bool initial)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(this->initialize(targetContainer, warped));
+
+            std::vector<TargetType*> targetImages;
+            targetContainer.get_all_images(targetImages);
+
+            std::vector<SourceType*> sourceImages;
+            sourceContainer.get_all_images(sourceImages);
+
+            long long numOfImages = targetImages.size();
+
+            GADGET_CHECK_RETURN_FALSE(numOfImages==sourceImages.size());
+
+            std::vector<SourceType*> warpedImages(numOfImages, NULL);
+            if ( warped )
+            {
+                warped_container_.get_all_images(warpedImages);
+            }
+
+            GDEBUG_STREAM("registerOverContainer2DPairWise - threading ... ");
+
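+            // enable nested OpenMP only when there are fewer images than processors, so the per-pair
+            // registration can still use the remaining cores; otherwise keep parallelism at the image level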
+            #ifdef USE_OMP
+                int numOfProcs = omp_get_num_procs();
+                int nested = omp_get_nested();
+                if ( numOfImages < numOfProcs-1 )
+                {
+                    omp_set_nested(1);
+                    GDEBUG_STREAM("registerOverContainer2DPairWise - nested openMP on ... ");
+                }
+                else
+                {
+                    omp_set_nested(0);
+                    GDEBUG_STREAM("registerOverContainer2DPairWise - nested openMP off ... ");
+                }
+            #endif // USE_OMP
+
+            unsigned int ii;
+            long long n;
+
+            if ( container_reg_transformation_ == GT_IMAGE_REG_TRANSFORMATION_DEFORMATION_FIELD )
+            {
+                std::vector< std::vector<DeformationFieldType*> > deform(DIn);
+
+                for ( ii=0; ii<DIn; ii++ )
+                {
+                    deformation_field_[ii].get_all_images(deform[ii]);
+                }
+
+                #pragma omp parallel default(none) private(n, ii) shared(numOfImages, initial, targetImages, sourceImages, deform, warpedImages)
+                {
+                    DeformationFieldType* deformCurr[DIn];
+
+                    #pragma omp for 
+                    for ( n=0; n<numOfImages; n++ )
+                    {
+                        TargetType& target = *(targetImages[n]);
+                        SourceType& source = *(sourceImages[n]);
+
+                        if ( &target == &source )
+                        {
+                            for ( ii=0; ii<DIn; ii++ )
+                            {
+                                deform[ii][n]->create(target.get_dimensions());
+                                Gadgetron::clear( *deform[ii][n] );
+                            }
+                        }
+                        else
+                        {
+                            for ( ii=0; ii<DIn; ii++ )
+                            {
+                                deformCurr[ii] = deform[ii][n];
+                            }
+
+                            registerTwoImagesDeformationField(target, source, initial, warpedImages[n], deformCurr);
+                        }
+                    }
+                }
+            }
+            else if ( container_reg_transformation_ == GT_IMAGE_REG_TRANSFORMATION_DEFORMATION_FIELD_BIDIRECTIONAL )
+            {
+                std::vector< std::vector<DeformationFieldType*> > deform(DIn);
+                std::vector< std::vector<DeformationFieldType*> > deformInv(DIn);
+
+                for ( ii=0; ii<DIn; ii++ )
+                {
+                    deformation_field_[ii].get_all_images(deform[ii]);
+                    deformation_field_inverse_[ii].get_all_images(deformInv[ii]);
+                }
+
+                #pragma omp parallel default(none) private(n, ii) shared(numOfImages, initial, targetImages, sourceImages, deform, deformInv, warpedImages)
+                {
+                    DeformationFieldType* deformCurr[DIn];
+                    DeformationFieldType* deformInvCurr[DIn];
+
+                    #pragma omp for 
+                    for ( n=0; n<numOfImages; n++ )
+                    {
+                        TargetType& target = *(targetImages[n]);
+                        SourceType& source = *(sourceImages[n]);
+
+                        if ( &target == &source )
+                        {
+                            for ( ii=0; ii<DIn; ii++ )
+                            {
+                                deform[ii][n]->create(target.get_dimensions());
+                                Gadgetron::clear( *deform[ii][n] );
+
+                                deformInv[ii][n]->create(source.get_dimensions());
+                                Gadgetron::clear( *deformInv[ii][n] );
+                            }
+                        }
+                        else
+                        {
+                            for ( ii=0; ii<DIn; ii++ )
+                            {
+                                deformCurr[ii] = deform[ii][n];
+                                deformInvCurr[ii] = deformInv[ii][n];
+                            }
+
+                            registerTwoImagesDeformationFieldBidirectional(target, source, initial, warpedImages[n], deformCurr, deformInvCurr);
+                        }
+                    }
+                }
+            }
+            else if ( container_reg_transformation_==GT_IMAGE_REG_TRANSFORMATION_RIGID 
+                        || container_reg_transformation_==GT_IMAGE_REG_TRANSFORMATION_AFFINE )
+            {
+                GDEBUG_STREAM("To be implemented ...");
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in hoImageRegContainer2DRegistration<ValueType, CoordType, DIn, DOut>::registerOverContainer2DPairWise(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    bool hoImageRegContainer2DRegistration<ValueType, CoordType, DIn, DOut>::
+    registerOverContainer2DFixedReference(TargetContinerType& imageContainer, const std::vector<unsigned int>& referenceFrame, bool warped, bool initial)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(this->initialize(imageContainer, warped));
+
+            size_t row = imageContainer.rows();
+            std::vector<size_t> col = imageContainer.cols();
+
+            GADGET_CHECK_RETURN_FALSE(referenceFrame.size() == col.size());
+
+            std::vector<SourceType*> sourceImages;
+            imageContainer.get_all_images(sourceImages);
+
+            long long numOfImages = (long long)sourceImages.size();
+
+            // warped images
+            std::vector<SourceType*> warpedImages(numOfImages, NULL);
+            if ( warped )
+            {
+                warped_container_.get_all_images(warpedImages);
+            }
+
+            unsigned int ii;
+            long long n;
+            size_t r, c;
+
+            // fill in the reference frames
+            std::vector<TargetType*> targetImages(numOfImages, NULL);
+
+            size_t ind=0;
+            for ( r=0; r<row; r++ )
+            {
+                TargetType& ref = const_cast<TargetType&>(imageContainer(r, referenceFrame[r]));
+
+                for ( c=0; c<col[r]; c++ )
+                {
+                    targetImages[ind] = &ref;
+                    ind++;
+                }
+            }
+
+            GADGET_CHECK_RETURN_FALSE(numOfImages==targetImages.size());
+
+            #ifdef USE_OMP
+                int numOfProcs = omp_get_num_procs();
+                int nested = omp_get_nested();
+                if ( numOfImages < numOfProcs-1 )
+                {
+                    omp_set_nested(1);
+                    GDEBUG_STREAM("registerOverContainer2DFixedReference - nested openMP on ... ");
+                }
+                else
+                {
+                    omp_set_nested(0);
+                    GDEBUG_STREAM("registerOverContainer2DFixedReference - nested openMP off ... ");
+                }
+            #endif // USE_OMP
+
+            if ( container_reg_transformation_ == GT_IMAGE_REG_TRANSFORMATION_DEFORMATION_FIELD )
+            {
+                std::vector< std::vector<DeformationFieldType*> > deform(DIn);
+
+                for ( ii=0; ii<DIn; ii++ )
+                {
+                    deformation_field_[ii].get_all_images(deform[ii]);
+                }
+
+                #pragma omp parallel default(none) private(n, ii) shared(numOfImages, initial, targetImages, sourceImages, deform, warpedImages)
+                {
+                    DeformationFieldType* deformCurr[DIn];
+
+                    #pragma omp for 
+                    for ( n=0; n<numOfImages; n++ )
+                    {
+                        if ( targetImages[n] == sourceImages[n] )
+                        {
+                            if ( warpedImages[n] != NULL )
+                            {
+                                *(warpedImages[n]) = *(targetImages[n]);
+                            }
+
+                            for ( ii=0; ii<DIn; ii++ )
+                            {
+                                deform[ii][n]->create(targetImages[n]->get_dimensions());
+                                Gadgetron::clear(*deform[ii][n]);
+                            }
+
+                            continue;
+                        }
+
+                        TargetType& target = *(targetImages[n]);
+                        SourceType& source = *(sourceImages[n]);
+
+                        for ( ii=0; ii<DIn; ii++ )
+                        {
+                            deformCurr[ii] = deform[ii][n];
+                        }
+
+                        registerTwoImagesDeformationField(target, source, initial, warpedImages[n], deformCurr);
+                    }
+                }
+            }
+            else if ( container_reg_transformation_ == GT_IMAGE_REG_TRANSFORMATION_DEFORMATION_FIELD_BIDIRECTIONAL )
+            {
+                std::vector< std::vector<DeformationFieldType*> > deform(DIn);
+                std::vector< std::vector<DeformationFieldType*> > deformInv(DIn);
+
+                for ( ii=0; ii<DIn; ii++ )
+                {
+                    deformation_field_[ii].get_all_images(deform[ii]);
+                    deformation_field_inverse_[ii].get_all_images(deformInv[ii]);
+                }
+
+                #pragma omp parallel default(none) private(n, ii) shared(numOfImages, initial, targetImages, sourceImages, deform, deformInv, warpedImages)
+                {
+                    DeformationFieldType* deformCurr[DIn];
+                    DeformationFieldType* deformInvCurr[DIn];
+
+                    #pragma omp for 
+                    for ( n=0; n<numOfImages; n++ )
+                    {
+                        if ( targetImages[n] == sourceImages[n] )
+                        {
+                            if ( warpedImages[n] != NULL )
+                            {
+                                *(warpedImages[n]) = *(targetImages[n]);
+                            }
+
+                            for ( ii=0; ii<DIn; ii++ )
+                            {
+                                deform[ii][n]->create(targetImages[n]->get_dimensions());
+                                Gadgetron::clear(*deform[ii][n]);
+
+                                deformInv[ii][n]->create(targetImages[n]->get_dimensions());
+                                Gadgetron::clear(*deformInv[ii][n]);
+                            }
+
+                            continue;
+                        }
+
+                        TargetType& target = *(targetImages[n]);
+                        SourceType& source = *(sourceImages[n]);
+
+                        for ( ii=0; ii<DIn; ii++ )
+                        {
+                            deformCurr[ii] = deform[ii][n];
+                            deformInvCurr[ii] = deformInv[ii][n];
+                        }
+
+                        registerTwoImagesDeformationFieldBidirectional(target, source, initial, warpedImages[n], deformCurr, deformInvCurr);
+                    }
+                }
+            }
+            else if ( container_reg_transformation_==GT_IMAGE_REG_TRANSFORMATION_RIGID 
+                        || container_reg_transformation_==GT_IMAGE_REG_TRANSFORMATION_AFFINE )
+            {
+                GDEBUG_STREAM("To be implemented ...");
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in hoImageRegContainer2DRegistration<ValueType, CoordType, DIn, DOut>::registerOverContainer2DFixedReference(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    bool hoImageRegContainer2DRegistration<ValueType, CoordType, DIn, DOut>::
+    registerOverContainer2DProgressive(TargetContinerType& imageContainer, const std::vector<unsigned int>& referenceFrame)
+    {
+        try
+        {
+            bool warped = true;
+            GADGET_CHECK_RETURN_FALSE(this->initialize(imageContainer, warped));
+
+            long long row = (long long)imageContainer.rows();
+            std::vector<size_t> col = imageContainer.cols();
+
+            GADGET_CHECK_RETURN_FALSE(referenceFrame.size() == col.size());
+
+            unsigned int ii;
+            long long n;
+            long long r, c;
+
+            // for every row, two registration tasks are formed: one running forward from the reference frame and one running backward
+
+            long long numOfTasks = (long long)(2*row);
+            GDEBUG_STREAM("hoImageRegContainer2DRegistration<...>::registerOverContainer2DProgressive(...), numOfTasks : " << numOfTasks);
+
+            std::vector< std::vector<TargetType*> > regImages(numOfTasks);
+            std::vector< std::vector<TargetType*> > warpedImages(numOfTasks);
+
+            std::vector< std::vector< std::vector<DeformationFieldType*> > > deform(DIn);
+            std::vector< std::vector< std::vector<DeformationFieldType*> > > deformInv(DIn);
+
+            for ( ii=0; ii<DIn; ii++ )
+            {
+                deform[ii].resize(numOfTasks);
+                deformInv[ii].resize(numOfTasks);
+            }
+
+            for ( r=0; r<row; r++ )
+            {
+                unsigned int refFrame = referenceFrame[r];
+
+                regImages[2*r].resize(col[r]-refFrame);
+                regImages[2*r+1].resize(1+refFrame);
+
+                warpedImages[2*r].resize(col[r]-refFrame);
+                warpedImages[2*r+1].resize(1+refFrame);
+
+                // copy over the reference frame
+                warped_container_(r, refFrame) = imageContainer(r, refFrame);
+
+                if ( container_reg_transformation_ == GT_IMAGE_REG_TRANSFORMATION_DEFORMATION_FIELD )
+                {
+                    for ( ii=0; ii<DIn; ii++ )
+                    {
+                        deformation_field_[ii](r, refFrame).create(imageContainer(r, refFrame).get_dimensions());
+                        Gadgetron::clear(deformation_field_[ii](r, refFrame));
+                    }
+                }
+
+                if ( container_reg_transformation_ == GT_IMAGE_REG_TRANSFORMATION_DEFORMATION_FIELD_BIDIRECTIONAL )
+                {
+                    for ( ii=0; ii<DIn; ii++ )
+                    {
+                        deformation_field_[ii](r, refFrame).create(imageContainer(r, refFrame).get_dimensions());
+                        Gadgetron::clear(deformation_field_[ii](r, refFrame));
+
+                        deformation_field_inverse_[ii](r, refFrame).create(imageContainer(r, refFrame).get_dimensions());
+                        Gadgetron::clear(deformation_field_inverse_[ii](r, refFrame));
+                    }
+                }
+
+                // task one
+                for ( c=refFrame; c<(long long)col[r]; c++ )
+                {
+                    regImages[2*r][c-refFrame] = &(imageContainer(r, c));
+                    warpedImages[2*r][c-refFrame] = &(warped_container_(r, c));
+                }
+
+                // task two
+                for ( c=refFrame; c>=0; c-- )
+                {
+                    regImages[2*r+1][refFrame-c] = &(imageContainer(r, c));
+                    warpedImages[2*r+1][refFrame-c] = &(warped_container_(r, c));
+                }
+
+                for ( ii=0; ii<DIn; ii++ )
+                {
+                    if ( container_reg_transformation_ == GT_IMAGE_REG_TRANSFORMATION_DEFORMATION_FIELD )
+                    {
+                        deform[ii][2*r].resize(col[r]-refFrame);
+                        deform[ii][2*r+1].resize(1+refFrame);
+
+                        // task one
+                        for ( c=refFrame; c<(long long)col[r]; c++ )
+                        {
+                            deform[ii][2*r][c-refFrame] = &(deformation_field_[ii](r, c));
+                        }
+
+                        // task two
+                        for ( c=refFrame; c>=0; c-- )
+                        {
+                            deform[ii][2*r+1][refFrame-c] = &(deformation_field_[ii](r, c));
+                        }
+                    }
+
+                    if ( container_reg_transformation_ == GT_IMAGE_REG_TRANSFORMATION_DEFORMATION_FIELD_BIDIRECTIONAL )
+                    {
+                        deform[ii][2*r].resize(col[r]-refFrame);
+                        deform[ii][2*r+1].resize(1+refFrame);
+
+                        deformInv[ii][2*r].resize(col[r]-refFrame);
+                        deformInv[ii][2*r+1].resize(1+refFrame);
+
+                        // task one
+                        for ( c=refFrame; c<(long long)col[r]; c++ )
+                        {
+                            deform[ii][2*r][c-refFrame] = &(deformation_field_[ii](r, c));
+                            deformInv[ii][2*r][c-refFrame] = &(deformation_field_inverse_[ii](r, c));
+                        }
+
+                        // task two
+                        for ( c=refFrame; c>=0; c-- )
+                        {
+                            deform[ii][2*r+1][refFrame-c] = &(deformation_field_[ii](r, c));
+                            deformInv[ii][2*r+1][refFrame-c] = &(deformation_field_inverse_[ii](r, c));
+                        }
+                    }
+                }
+            }
+
+            if ( container_reg_transformation_ == GT_IMAGE_REG_TRANSFORMATION_DEFORMATION_FIELD )
+            {
+                bool initial = false;
+
+                #pragma omp parallel default(none) private(n, ii) shared(numOfTasks, initial, regImages, warpedImages, deform)
+                {
+                    DeformationFieldType* deformCurr[DIn];
+
+                    #pragma omp for 
+                    for ( n=0; n<numOfTasks; n++ )
+                    {
+                        size_t numOfImages = regImages[n].size();
+
+                        // no need to copy the reference frame to warped
+
+                        size_t k;
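+                        // progressive chaining: each frame is registered against the warped result of the
+                        // previous frame, so the estimate propagates away from the reference frame one step
+                        // at a time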
+                        for ( k=1; k<numOfImages; k++ )
+                        {
+                            TargetType& target = *(warpedImages[n][k-1]);
+                            SourceType& source = *(regImages[n][k]);
+
+                            for ( ii=0; ii<DIn; ii++ )
+                            {
+                                deformCurr[ii] = deform[ii][n][k];
+                            }
+
+                            registerTwoImagesDeformationField(target, source, initial, warpedImages[n][k], deformCurr);
+                        }
+                    }
+                }
+            }
+            else if ( container_reg_transformation_ == GT_IMAGE_REG_TRANSFORMATION_DEFORMATION_FIELD_BIDIRECTIONAL )
+            {
+                bool initial = false;
+
+                #pragma omp parallel default(none) private(n, ii) shared(numOfTasks, initial, regImages, warpedImages, deform, deformInv)
+                {
+                    DeformationFieldType* deformCurr[DIn];
+                    DeformationFieldType* deformInvCurr[DIn];
+
+                    #pragma omp for 
+                    for ( n=0; n<numOfTasks; n++ )
+                    {
+                        size_t numOfImages = regImages[n].size();
+
+                        size_t k;
+                        for ( k=1; k<numOfImages; k++ )
+                        {
+                            TargetType& target = *(warpedImages[n][k-1]);
+                            SourceType& source = *(regImages[n][k]);
+
+                            for ( ii=0; ii<DIn; ii++ )
+                            {
+                                deformCurr[ii] = deform[ii][n][k];
+                                deformInvCurr[ii] = deformInv[ii][n][k];
+                            }
+
+                            registerTwoImagesDeformationFieldBidirectional(target, source, initial, warpedImages[n][k], deformCurr, deformInvCurr);
+                        }
+                    }
+                }
+            }
+            else if ( container_reg_transformation_==GT_IMAGE_REG_TRANSFORMATION_RIGID 
+                        || container_reg_transformation_==GT_IMAGE_REG_TRANSFORMATION_AFFINE )
+            {
+                GDEBUG_STREAM("To be implemented ...");
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in hoImageRegContainer2DRegistration<ValueType, CoordType, DIn, DOut>::registerOverContainer2DProgressive(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    void hoImageRegContainer2DRegistration<ValueType, CoordType, DIn, DOut>::print(std::ostream& os) const
+    {
+        using namespace std;
+
+        unsigned int ii, jj;
+
+        os << "--------------Gadgetron image registration container 2D -------------" << endl;
+
+        os << "Input dimension is : " << DIn << endl;
+        os << "Output dimension is : " << DOut << endl;
+
+        std::string elemTypeName = std::string(typeid(ValueType).name());
+        os << "Image data type is : " << elemTypeName << std::endl;
+
+        elemTypeName = std::string(typeid(CoordType).name());
+        os << "Transformation coordinate data type is : " << elemTypeName << std::endl;
+
+        os << "Whether to perform world coordinate registration is : " << use_world_coordinates_ << std::endl;
+        os << "Number of resolution pyramid levels is : " << resolution_pyramid_levels_ << std::endl;
+
+        os << "------------" << std::endl;
+        os << "Number of iterations is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - " 
+                << max_iter_num_pyramid_level_[ii] << std::endl;
+        }
+
+        os << "------------" << std::endl;
+        os << "Image dissimilarity is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - " 
+                << getDissimilarityName(dissimilarity_type_) << std::endl;
+        }
+
+        os << "------------" << std::endl;
+        os << "Threshold for dissimilarity is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - " 
+                << dissimilarity_thres_pyramid_level_[ii] << std::endl;
+        }
+
+        os << "------------" << std::endl;
+        os << "Number of search size division is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - " 
+                << div_num_pyramid_level_[ii] << std::endl;
+        }
+
+        os << "------------" << std::endl;
+        if ( regularization_hilbert_strength_world_coordinate_ )
+        {
+            os << "Regularization strength is in physical-world units, e.g. mm ... " << std::endl;
+        }
+        else
+        {
+            os << "Regularization strength is in units of image pixels ... " << std::endl;
+        }
+
+        os << "Regularization strength for every pyramid level is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - [ ";
+            for( jj=0; jj<DIn; jj++ )
+            {
+                os << regularization_hilbert_strength_pyramid_level_[ii][jj] << " ";
+            } 
+            os << " ] " << std::endl;
+        }
+
+        os << "------------" << std::endl;
+        os << "Boundary handler and interpolator type for warper is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - " 
+                << getBoundaryHandlerName(boundary_handler_type_warper_[ii]) 
+                << " - " << getInterpolatorName(interp_type_warper_[ii]) << std::endl;
+        }
+
+        os << "------------" << std::endl;
+        os << "Number of iterations to improve the estimation of the inverse transform is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - " 
+                << inverse_deform_enforce_iter_pyramid_level_[ii] << std::endl;
+        }
+
+        os << "------------" << std::endl;
+        os << "Weight to update the estimation of the inverse transform is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - " 
+                << inverse_deform_enforce_weight_pyramid_level_[ii] << std::endl;
+        }
+        os << "------------" << std::endl;
+    }
+}
diff --git a/toolboxes/registration/optical_flow/cpu/cpureg_export.h b/toolboxes/registration/optical_flow/cpu/cpureg_export.h
new file mode 100644
index 0000000..ad929f1
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/cpureg_export.h
@@ -0,0 +1,14 @@
+#ifndef _CPUREG_EXPORT_H_
+#define _CPUREG_EXPORT_H_
+
+#if defined (WIN32)
+    #if defined (__BUILD_GADGETRON_CPUREG__) || defined (cpureg_EXPORTS)
+        #define EXPORTCPUREG __declspec(dllexport)
+    #else
+        #define EXPORTCPUREG __declspec(dllimport)
+    #endif
+#else
+#define EXPORTCPUREG
+#endif
+
+#endif /* _CPUREG_EXPORT_H_ */
diff --git a/toolboxes/registration/optical_flow/cpu/dissimilarity/hoImageRegDissimilarity.h b/toolboxes/registration/optical_flow/cpu/dissimilarity/hoImageRegDissimilarity.h
new file mode 100644
index 0000000..db68847
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/dissimilarity/hoImageRegDissimilarity.h
@@ -0,0 +1,251 @@
+/** \file   hoImageRegDissimilarity.h
+    \brief  Define the class to compute image dissimilarity in gadgetron registration
+
+            Four different types of image dissimilarity measures are implemented here:
+
+            SSD: sum-of-square difference
+            LocalCCR: localized cross-correlation
+            MI: mutual information
+            NMI: normalized mutual information
+
+            For SSD, LocalCCR and MI, the analytical derivatives are computed.
+
+            The analytical derivatives are computed by using the formula proposed at:
+
+            [1] Gerardo Hermosillo, Christophe Chefd'Hotel, Olivier Faugeras. Variational Methods for Multimodal Image Matching. 
+            International Journal of Computer Vision. December 2002, Volume 50, Issue 3, pp 329-343.
+            http://link.springer.com/article/10.1023%2FA%3A1020830525823
+
+            [2] Gerardo Hermosillo. Variational Methods for Multimodal Image Matching. PhD Thesis, UNIVERSITÉ DE NICE - SOPHIA ANTIPOLIS. May 2002.
+            http://webdocs.cs.ualberta.ca/~dana/readingMedIm/papers/hermosilloPhD.pdf
+
+            The derivative computation code is based on the source code listed on pages 179 - 185 of ref [2] and extended.
+
+            [3] Christophe Chefd'Hotel, Gerardo Hermosillo, Olivier D. Faugeras: Flows of diffeomorphisms for multimodal image registration. ISBI 2002: 753-756.
+            http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=1029367&tag=1
+
+            [4] C. Studholme, D.L.G. Hill, D.J. Hawkes. An overlap invariant entropy measure of 3D medical image alignment. Pattern Recognition, 32, 71-86, 1999.
+            http://eecs.vanderbilt.edu/courses/cs359/other_links/papers/studholme_NMI_1999.pdf
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoNDArray.h"
+#include "hoNDImage.h"
+#include "hoNDInterpolator.h"
+#include "hoNDBoundaryHandler.h"
+#include "hoMatrix.h"
+#include "hoNDArray_utils.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDImage_util.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "GtPrepUtil.h"
+
+#ifdef USE_OMP
+    #include <omp.h>
+#endif // USE_OMP
+
+namespace Gadgetron
+{
+    // define the image dissimilarity type
+    enum GT_IMAGE_DISSIMILARITY
+    {
+        GT_IMAGE_DISSIMILARITY_SSD,
+        GT_IMAGE_DISSIMILARITY_LocalCCR,
+        GT_IMAGE_DISSIMILARITY_MI,
+        GT_IMAGE_DISSIMILARITY_NMI
+    };
+
+    inline std::string getDissimilarityName(GT_IMAGE_DISSIMILARITY v)
+    {
+        std::string name;
+
+        switch (v)
+        {
+            case GT_IMAGE_DISSIMILARITY_SSD:
+                name = "SSD";
+                break;
+
+            case GT_IMAGE_DISSIMILARITY_LocalCCR:
+                name = "LocalCCR";
+                break;
+
+            case GT_IMAGE_DISSIMILARITY_MI:
+                name = "MutualInformation";
+                break;
+
+            case GT_IMAGE_DISSIMILARITY_NMI:
+                name = "NormalizedMutualInformation";
+                break;
+
+            default:
+                GERROR_STREAM("Unrecognized image dissimilarity type : " << v);
+        }
+
+        return name;
+    }
+
+    inline GT_IMAGE_DISSIMILARITY getDissimilarityType(const std::string& name)
+    {
+        GT_IMAGE_DISSIMILARITY v;
+
+        if ( name == "SSD" )
+        {
+            v = GT_IMAGE_DISSIMILARITY_SSD;
+        }
+        else if ( name == "LocalCCR" )
+        {
+            v = GT_IMAGE_DISSIMILARITY_LocalCCR;
+        }
+        else if ( name == "MutualInformation" )
+        {
+            v = GT_IMAGE_DISSIMILARITY_MI;
+        }
+        else if ( name == "NormalizedMutualInformation" )
+        {
+            v = GT_IMAGE_DISSIMILARITY_NMI;
+        }
+        else
+        {
+            GERROR_STREAM("Unrecognized image dissimilarity name : " << name);
+        }
+
+        return v;
+    }
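
    [Editor's note: a brief usage sketch of the two helper functions above; the calling code is hypothetical and only illustrates the round trip between the enum and its string name.]

        // Hypothetical caller: round-trip between the dissimilarity name and its enum value.
        #include "hoImageRegDissimilarity.h"
        #include <iostream>

        void dissimilarity_name_roundtrip()
        {
            Gadgetron::GT_IMAGE_DISSIMILARITY v = Gadgetron::getDissimilarityType("LocalCCR");
            std::cout << Gadgetron::getDissimilarityName(v) << std::endl; // prints "LocalCCR"
        }
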
+
+    /// compute the image dissimilarity measures
+    /// if possible, compute the analytical derivatives
+    template<typename ValueType, unsigned int D> 
+    class hoImageRegDissimilarity
+    {
+    public:
+
+        typedef hoImageRegDissimilarity<ValueType, D> Self;
+
+        typedef hoNDImage<ValueType, D> ImageType;
+
+        typedef hoNDInterpolator<ImageType> InterpolatorType;
+
+        typedef ValueType T;
+        typedef ValueType element_type;
+        typedef ValueType value_type;
+
+        typedef float coord_type;
+
+        hoImageRegDissimilarity(ValueType bg_value=ValueType(0));
+        virtual ~hoImageRegDissimilarity();
+
+        /// initialize the dissimilarity
+        virtual void initialize(ImageType& t);
+
+        const ImageType& getDeriv() const { return deriv_; }
+
+        ValueType getDissimilarity() const { return dissimilarity_; }
+
+        void setBackgroundValue(ValueType bg_value) { bg_value_ = bg_value; }
+
+        /// compute the dissimilarity value
+        virtual ValueType evaluate(ImageType& w);
+
+        /// compute the derivative and dissimilarity value
+        virtual bool evaluateDeriv(ImageType& w) = 0;
+
+        virtual void print(std::ostream& os) const;
+
+        // ----------------------------------
+        // debug and timing
+        // ----------------------------------
+        // clock for timing
+        Gadgetron::GadgetronTimer gt_timer1_;
+        Gadgetron::GadgetronTimer gt_timer2_;
+        Gadgetron::GadgetronTimer gt_timer3_;
+
+        bool performTiming_;
+
+        // exporter
+        Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+        // debug folder
+        std::string debugFolder_;
+
+    protected:
+
+        ImageType* target_;
+        ImageType* warpped_;
+
+        std::vector<size_t> image_dim_;
+
+        /// background pixel value
+        ValueType bg_value_;
+
+        /// derivative with respect to spatial locations
+        ImageType deriv_;
+
+        /// dissimilarity value
+        ValueType dissimilarity_;
+
+        hoNDArray<ValueType> target;
+        hoNDArray<ValueType> warped;
+        hoNDArray<ValueType> deriv;
+    };
+
+    template<typename ValueType, unsigned int D> 
+    hoImageRegDissimilarity<ValueType, D>::hoImageRegDissimilarity(ValueType bg_value) 
+        : target_(NULL), warpped_(NULL), bg_value_(bg_value), dissimilarity_(0), performTiming_(false)
+    {
+        gt_timer1_.set_timing_in_destruction(false);
+        gt_timer2_.set_timing_in_destruction(false);
+        gt_timer3_.set_timing_in_destruction(false);
+    }
+
+    template<typename ValueType, unsigned int D> 
+    hoImageRegDissimilarity<ValueType, D>::~hoImageRegDissimilarity()
+    {
+    }
+
+    template<typename ValueType, unsigned int D> 
+    void hoImageRegDissimilarity<ValueType, D>::initialize(ImageType& t)
+    {
+        target_ = &t;
+
+        if ( !deriv_.dimensions_equal(*target_) )
+        {
+            deriv_.create(target_->get_dimensions());
+        }
+        memset( deriv_.get_data_ptr(), 0, deriv_.get_number_of_elements()*sizeof(ValueType));
+
+        target_->get_dimensions(image_dim_);
+
+        /// these conversions can be removed if more utility functions are added for hoNDImage
+        target.create(image_dim_, target_->begin(), false);
+        deriv.create(image_dim_, deriv_.begin(), false);
+    }
+
+    template<typename ValueType, unsigned int D> 
+    ValueType hoImageRegDissimilarity<ValueType, D>::evaluate(ImageType& w)
+    {
+        if ( warpped_ != &w )
+        {
+            warpped_ = &w;
+            GADGET_CHECK_THROW(warpped_->dimensions_equal(*target_));
+            warped.create(image_dim_, warpped_->begin(), false);
+        }
+
+        this->dissimilarity_ = 0;
+
+        return this->dissimilarity_;
+    }
+
+    template<typename ValueType, unsigned int D> 
+    void hoImageRegDissimilarity<ValueType, D>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "--------------Gagdgetron image dissimilarity measure -------------" << endl;
+        os << "Image dimension is : " << D << endl;
+
+        std::string elemTypeName = std::string(typeid(ValueType).name());
+        os << "Transformation data type is : " << elemTypeName << endl << ends;
+    }
+}
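
[Editor's note: as a rough illustration of the intended call sequence for the base class above, here is a hedged sketch using the SSD specialization added later in this patch; the image setup and the 2D/float template arguments are assumptions made purely for the example.]

    // Hedged sketch: bind a target image, evaluate a warped image, read the derivative.
    #include "hoImageRegDissimilaritySSD.h"

    void ssd_usage(Gadgetron::hoNDImage<float, 2>& target,
                   Gadgetron::hoNDImage<float, 2>& warped)
    {
        Gadgetron::hoImageRegDissimilaritySSD<float, 2> ssd;
        ssd.setBackgroundValue(0.0f);
        ssd.initialize(target);            // bind the target image and allocate the derivative
        float d = ssd.evaluate(warped);    // dissimilarity value
        ssd.evaluateDeriv(warped);         // fills the derivative image as well
        const Gadgetron::hoNDImage<float, 2>& g = ssd.getDeriv();
        (void)d; (void)g;
    }
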
diff --git a/toolboxes/registration/optical_flow/cpu/dissimilarity/hoImageRegDissimilarityHistogramBased.h b/toolboxes/registration/optical_flow/cpu/dissimilarity/hoImageRegDissimilarityHistogramBased.h
new file mode 100644
index 0000000..7b7b984
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/dissimilarity/hoImageRegDissimilarityHistogramBased.h
@@ -0,0 +1,226 @@
+/** \file   hoImageRegDissimilarityHistogramBased.h
+    \brief  Define the class to compute image dissimilarity based on histogram
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include <limits>
+#include "hoMatrix.h"
+#include "hoImageRegDissimilarity.h"
+
+namespace Gadgetron
+{
+    template<typename ValueType, unsigned int D> 
+    class hoImageRegDissimilarityHistogramBased : public hoImageRegDissimilarity<ValueType, D>
+    {
+    public:
+
+        typedef hoImageRegDissimilarityHistogramBased<ValueType, D> Self;
+        typedef hoImageRegDissimilarity<ValueType, D> BaseClass;
+
+        typedef typename BaseClass::ImageType ImageType;
+        typedef typename BaseClass::InterpolatorType InterpolatorType;
+
+        typedef ValueType T;
+        typedef ValueType element_type;
+        typedef ValueType value_type;
+
+        typedef typename BaseClass::coord_type coord_type;
+
+        typedef double hist_value_type;
+
+        hoImageRegDissimilarityHistogramBased(unsigned int num_bin_target=64, unsigned int num_bin_warpped=64, ValueType bg_value=ValueType(0));
+        virtual ~hoImageRegDissimilarityHistogramBased();
+
+        virtual ValueType evaluate(ImageType& w);
+
+        virtual bool evaluateDeriv(ImageType& w) = 0;
+
+        virtual void print(std::ostream& os) const;
+
+        /// number of intensity bins
+        unsigned int num_bin_target_;
+        unsigned int num_bin_warpped_;
+
+        /// whether to perform partial volume (PV) interpolation for the histogram
+        bool pv_interpolation_;
+
+        /// step size to ignore pixels when creating histogram
+        size_t step_size_ignore_pixel_;
+
+        using BaseClass::gt_timer1_;
+        using BaseClass::gt_timer2_;
+        using BaseClass::gt_timer3_;
+        using BaseClass::performTiming_;
+        using BaseClass::gt_exporter_;
+        using BaseClass::debugFolder_;
+
+    protected:
+
+        using BaseClass::target_;
+        using BaseClass::warpped_;
+        using BaseClass::deriv_;
+        using BaseClass::bg_value_;
+        using BaseClass::dissimilarity_;
+        using BaseClass::target;
+        using BaseClass::warped;
+        using BaseClass::deriv;
+        using BaseClass::image_dim_;
+
+        /// store the 2D histogram
+        hoMatrix<hist_value_type> hist_;
+
+        /// min/max intensities of target and warped
+        ValueType min_target_;
+        ValueType max_target_;
+
+        ValueType min_warpped_;
+        ValueType max_warpped_;
+
+        size_t num_samples_in_hist_;
+    };
+
+    template<typename ValueType, unsigned int D> 
+    hoImageRegDissimilarityHistogramBased<ValueType, D>::
+    hoImageRegDissimilarityHistogramBased(unsigned int num_bin_target, unsigned int num_bin_warpped, ValueType bg_value) 
+        : BaseClass(bg_value), num_bin_target_(num_bin_target), num_bin_warpped_(num_bin_warpped), pv_interpolation_(false), step_size_ignore_pixel_(1)
+    {
+    }
+
+    template<typename ValueType, unsigned int D> 
+    hoImageRegDissimilarityHistogramBased<ValueType, D>::~hoImageRegDissimilarityHistogramBased()
+    {
+    }
+
+    template<typename ValueType, unsigned int D> 
+    ValueType hoImageRegDissimilarityHistogramBased<ValueType, D>::evaluate(ImageType& w)
+    {
+        try
+        {
+            BaseClass::evaluate(w);
+
+            // allocate histogram
+            hist_.createMatrix(num_bin_target_, num_bin_warpped_);
+            Gadgetron::clear(hist_);
+
+            // intensity range
+            min_target_ = std::numeric_limits<ValueType>::max();
+            max_target_ = -std::numeric_limits<ValueType>::max(); // min() would be the smallest positive value for floating-point types
+
+            min_warpped_ = min_target_;
+            max_warpped_ = max_target_;
+
+            size_t N = target_->get_number_of_elements();
+
+            long long n;
+            for ( n=0; n<(long long)N; n++ )
+            {
+                ValueType vt = target(n);
+                if ( vt < min_target_ ) min_target_ = vt;
+                if ( vt > max_target_ ) max_target_ = vt;
+
+                ValueType vw = warped(n);
+                if ( vw < min_warpped_ ) min_warpped_ = vw;
+                if ( vw > max_warpped_ ) max_warpped_ = vw;
+            }
+
+            ValueType range_t = ValueType(1.0)/(max_target_ - min_target_ + std::numeric_limits<ValueType>::epsilon());
+            ValueType range_w = ValueType(1.0)/(max_warpped_ - min_warpped_ + std::numeric_limits<ValueType>::epsilon());
+
+            num_samples_in_hist_ = 0;
+
+            if ( pv_interpolation_ )
+            {
+                #pragma omp parallel for default(none) private(n) shared(N, range_t, range_w)
+                for ( n=0; n<(long long)N; n+=(long long)step_size_ignore_pixel_ )
+                {
+                    ValueType vt = target(n);
+                    ValueType vw = warped(n);
+
+                    if ( std::abs(vt-bg_value_)<FLT_EPSILON 
+                        && std::abs(vw-bg_value_)<FLT_EPSILON )
+                    {
+                        continue;
+                    }
+
+                    ValueType xT = range_t*(vt-min_target_)*(num_bin_target_-1);
+                    ValueType xW = range_w*(vw-min_warpped_)*(num_bin_warpped_-1);
+
+                    size_t indT = static_cast<size_t>(xT);
+                    size_t indW = static_cast<size_t>(xW);
+
+                    ValueType sT, s1T, sW, s1W;
+
+                    sT = xT - indT; s1T = 1 - sT;
+                    sW = xW - indW; s1W = 1 - sW;
+
+                    #pragma omp critical
+                    {
+                        hist_(indT, indW) += s1T*s1W;
+
+                        if ( indT<num_bin_target_-1 && indW<num_bin_warpped_-1 )
+                        {
+                            hist_(indT, indW+1) += s1T*sW;
+                            hist_(indT+1, indW) += sT*s1W;
+                            hist_(indT+1, indW+1) += sT*sW;
+                        }
+                    }
+
+                    #pragma omp atomic
+                    num_samples_in_hist_++;
+                }
+            }
+            else
+            {
+                #pragma omp parallel for default(none) private(n) shared(N, range_t, range_w)
+                for ( n=0; n<(long long)N; n+=(long long)step_size_ignore_pixel_ )
+                {
+                    ValueType vt = target(n);
+                    ValueType vw = warped(n);
+
+                    if ( std::abs(vt-bg_value_)<FLT_EPSILON 
+                        && std::abs(vw-bg_value_)<FLT_EPSILON )
+                    {
+                        continue;
+                    }
+
+                    size_t indT = static_cast<size_t>( range_t*(vt-min_target_)*(num_bin_target_-1) + 0.5 );
+                    size_t indW = static_cast<size_t>( range_w*(vw-min_warpped_)*(num_bin_warpped_-1) + 0.5 );
+
+                    #pragma omp critical
+                    {
+                        hist_(indT, indW)++;
+                    }
+
+                    #pragma omp atomic
+                    num_samples_in_hist_++;
+                }
+            }
+
+            if ( !debugFolder_.empty() ) {  gt_exporter_.exportArray(hist_, debugFolder_+"hist2D"); }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDissimilarityHistogramBased<ValueType, D>::evaluate(ImageType& t, ImageType& w) ... ");
+        }
+
+        return this->dissimilarity_;
+    }
+
+    template<typename ValueType, unsigned int D> 
+    void hoImageRegDissimilarityHistogramBased<ValueType, D>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "--------------Gagdgetron image dissimilarity with histogram -------------" << endl;
+        os << "Image dimension is : " << D << endl;
+
+        std::string elemTypeName = std::string(typeid(ValueType).name());
+        os << "Transformation data type is : " << elemTypeName << std::endl;
+
+        os << "Number of intensity bins for target is : " << num_bin_target_ << endl;
+        os << "Number of intensity bins for warped is : " << num_bin_warpped_ << endl;
+        os << "PV interpolation for histogram is : " << pv_interpolation_ << endl;
+        os << "Step size to ignore pixels when creating histogram is : " << step_size_ignore_pixel_ << endl << ends;
+    }
+}
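
[Editor's note: to make the binning in evaluate() above easier to follow, here is a minimal standalone sketch, not part of the patch, of the partial-volume splat used when pv_interpolation_ is enabled: an intensity pair is mapped to fractional bin coordinates and its unit weight is distributed over the four surrounding bins of a row-major joint histogram. All names are hypothetical.]

    #include <cstddef>
    #include <vector>

    // hist is a row-major nbt x nbw joint histogram; range_t = 1/(max_t - min_t + eps), etc.
    void pv_splat(std::vector<double>& hist, std::size_t nbt, std::size_t nbw,
                  float vt, float vw,
                  float min_t, float range_t, float min_w, float range_w)
    {
        float xT = range_t * (vt - min_t) * (nbt - 1);   // fractional target bin
        float xW = range_w * (vw - min_w) * (nbw - 1);   // fractional warped bin
        std::size_t iT = static_cast<std::size_t>(xT);
        std::size_t iW = static_cast<std::size_t>(xW);
        float sT = xT - iT, sW = xW - iW;

        hist[iT * nbw + iW] += (1 - sT) * (1 - sW);
        if (iT + 1 < nbt && iW + 1 < nbw)                // same guard as the member code above
        {
            hist[iT * nbw + (iW + 1)]       += (1 - sT) * sW;
            hist[(iT + 1) * nbw + iW]       += sT * (1 - sW);
            hist[(iT + 1) * nbw + (iW + 1)] += sT * sW;
        }
    }
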
diff --git a/toolboxes/registration/optical_flow/cpu/dissimilarity/hoImageRegDissimilarityLocalCCR.h b/toolboxes/registration/optical_flow/cpu/dissimilarity/hoImageRegDissimilarityLocalCCR.h
new file mode 100644
index 0000000..90118c7
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/dissimilarity/hoImageRegDissimilarityLocalCCR.h
@@ -0,0 +1,405 @@
+/** \file   hoImageRegDissimilarityLocalCCR.h
+    \brief  Define the class to compute image Local Cross CorRelation (LocalCCR) in gadgetron registration
+
+            The analytical derivatives are computed by using the formula proposed at:
+
+            [1] Gerardo Hermosillo, Christophe Chefd'Hotel, Olivier Faugeras. Variational Methods for Multimodal Image Matching. 
+            International Journal of Computer Vision. December 2002, Volume 50, Issue 3, pp 329-343.
+            http://link.springer.com/article/10.1023%2FA%3A1020830525823
+
+            [2] Gerardo Hermosillo. Variational Methods for Multimodal Image Matching. PhD Thesis, UNIVERSITÉ DE NICE - SOPHIA ANTIPOLIS. May 2002.
+            http://webdocs.cs.ualberta.ca/~dana/readingMedIm/papers/hermosilloPhD.pdf
+
+            This derivative computation code is based on the source code listed on pages 183 - 185 of ref [2].
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include <limits>
+#include "hoImageRegDissimilarity.h"
+
+namespace Gadgetron
+{
+    template<typename ValueType, unsigned int D> 
+    class hoImageRegDissimilarityLocalCCR : public hoImageRegDissimilarity<ValueType, D>
+    {
+    public:
+
+        typedef hoImageRegDissimilarityLocalCCR<ValueType, D> Self;
+        typedef hoImageRegDissimilarity<ValueType, D> BaseClass;
+
+        typedef typename BaseClass::ImageType ImageType;
+        typedef typename BaseClass::InterpolatorType InterpolatorType;
+
+        typedef ValueType T;
+        typedef ValueType element_type;
+        typedef ValueType value_type;
+
+        typedef double computing_value_type;
+
+        typedef typename BaseClass::coord_type coord_type;
+
+        hoImageRegDissimilarityLocalCCR(computing_value_type betaArg=std::numeric_limits<ValueType>::epsilon() );
+        hoImageRegDissimilarityLocalCCR(ValueType sigmaArg[D], computing_value_type betaArg=std::numeric_limits<ValueType>::epsilon() );
+        virtual ~hoImageRegDissimilarityLocalCCR();
+
+        void initialize(ImageType& t);
+
+        virtual ValueType evaluate(ImageType& w);
+        virtual bool evaluateDeriv(ImageType& w);
+
+        virtual void print(std::ostream& os) const;
+
+        /// these parameter names are kept the same as in the source code on pages 183 - 185 of ref [2]
+        computing_value_type sigmaArg_[D]; // kernel size of local weighting function
+
+        computing_value_type betaArg_;
+
+        using BaseClass::gt_timer1_;
+        using BaseClass::gt_timer2_;
+        using BaseClass::gt_timer3_;
+        using BaseClass::performTiming_;
+        using BaseClass::gt_exporter_;
+        using BaseClass::debugFolder_;
+
+    protected:
+
+        using BaseClass::target_;
+        using BaseClass::warpped_;
+        using BaseClass::deriv_;
+        using BaseClass::bg_value_;
+        using BaseClass::dissimilarity_;
+        using BaseClass::target;
+        using BaseClass::warped;
+        using BaseClass::deriv;
+        using BaseClass::image_dim_;
+
+        /// these parameter names are kept the same as in the source code on pages 183 - 185 of ref [2]
+        hoNDArray<computing_value_type> cc; computing_value_type* p_cc;
+        hoNDArray<computing_value_type> mu1; computing_value_type* p_mu1;
+        hoNDArray<computing_value_type> mu2; computing_value_type* p_mu2;
+        hoNDArray<computing_value_type> v1; computing_value_type* p_v1;
+        hoNDArray<computing_value_type> v2; computing_value_type* p_v2;
+        hoNDArray<computing_value_type> v12; computing_value_type* p_v12;
+
+        //hoNDArray<computing_value_type> vv1; computing_value_type* p_vv1;
+        //hoNDArray<computing_value_type> vv2; computing_value_type* p_vv2;
+        //hoNDArray<computing_value_type> vv12; computing_value_type* p_vv12;
+
+        hoNDArray<computing_value_type> mem_;
+
+        computing_value_type eps_;
+    };
+
+    template<typename ValueType, unsigned int D> 
+    hoImageRegDissimilarityLocalCCR<ValueType, D>::hoImageRegDissimilarityLocalCCR(computing_value_type betaArg) 
+        : BaseClass(), betaArg_(betaArg)
+    {
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            sigmaArg_[ii] = (computing_value_type)(2.0);
+        }
+    }
+
+    template<typename ValueType, unsigned int D> 
+    hoImageRegDissimilarityLocalCCR<ValueType, D>::hoImageRegDissimilarityLocalCCR(ValueType sigmaArg[D], computing_value_type betaArg) 
+        : BaseClass(), betaArg_(betaArg)
+    {
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            sigmaArg_[ii] = (computing_value_type)(sigmaArg[ii]);
+        }
+    }
+
+    template<typename ValueType, unsigned int D> 
+    hoImageRegDissimilarityLocalCCR<ValueType, D>::~hoImageRegDissimilarityLocalCCR()
+    {
+    }
+
+    template<typename ValueType, unsigned int D> 
+    void hoImageRegDissimilarityLocalCCR<ValueType, D>::initialize(ImageType& t)
+    {
+        BaseClass::initialize(t);
+
+        // allocate arrays for the computation
+        cc.create(image_dim_); p_cc = cc.begin();
+        mu1.create(image_dim_); p_mu1 = mu1.begin();
+        mu2.create(image_dim_); p_mu2 = mu2.begin();
+        v1.create(image_dim_); p_v1 = v1.begin();
+        v2.create(image_dim_); p_v2 = v2.begin();
+        v12.create(image_dim_); p_v12 = v12.begin();
+
+        //vv1.create(image_dim_); p_vv1 = vv1.begin();
+        //vv2.create(image_dim_); p_vv2 = vv2.begin();
+        //vv12.create(image_dim_); p_vv12 = vv12.begin();
+
+        #ifdef WIN32
+            size_t v=0;
+            for ( size_t ii=0; ii<image_dim_.size(); ii++ ) v+=image_dim_[ii];
+            mem_.create(2*v);
+        #endif // WIN32
+
+        eps_ = std::numeric_limits<computing_value_type>::epsilon();
+    }
+
+    template<typename ValueType, unsigned int D> 
+    ValueType hoImageRegDissimilarityLocalCCR<ValueType, D>::evaluate(ImageType& w)
+    {
+        try
+        {
+            /// in ref [2], the code is:
+            /*
+            Image<float>
+                mu1(I1.domain()), mu2(I1.domain()),
+                v1(I1.domain()), v2(I1.domain()),
+                v12(I1.domain()), f1(I1.domain()),
+                f2(I1.domain()), f3(I1.domain());
+                Map(I1,x) {
+                const real i1 = I1[x];
+                const real i2 = I2[x];
+                mu1[x] = i1; v1[x] = i1 * i1;
+                mu2[x] = i2; v12[x] = i1 * i2;
+                v2[x] = i2 * i2;
+                }
+                mu1.SelfRecSmoothZeroBC(sigma); v1.SelfRecSmoothZeroBC(sigma);
+                mu2.SelfRecSmoothZeroBC(sigma); v2.SelfRecSmoothZeroBC(sigma);
+                v12.SelfRecSmoothZeroBC(sigma);
+
+                criter = 0;
+                Map(v1,x) {
+                const real u1 = mu1[x];
+                const real u2 = mu2[x];
+                const real vv1 = v1[x] + beta - u1 * u1;
+                const real vv2 = v2[x] + beta - u2 * u2;
+                const real vv12 = v12[x] - u1 * u2;
+                const real ff1 = vv12 / (vv1 * vv2);
+                const real CC = vv12 * ff1;
+                const real ff2 = - CC / vv2;
+                const real ff3 =  - (ff2 * u2 + ff1 * u1);
+                f1[x] = ff1; f2[x] = ff2; f3[x] = ff3;
+                cc[x] = -CC;
+                criter += -CC;
+                }
+                f1.SelfRecSmoothZeroBC(sigma);
+                f2.SelfRecSmoothZeroBC(sigma);
+                f3.SelfRecSmoothZeroBC(sigma);
+
+                norm = 0;
+                Map(f1,x) {
+                const float val = 2.0 * ( f1[x] * I1[x] + f2[x] * I2[x] + f3[x] ) ;
+                dist[x] = val;
+                norm += val * val;
+                }
+            */
+
+            /// we rewrite this code for gadgetron
+
+            //if ( performTiming_ ) { gt_timer1_.start("1"); }
+            BaseClass::evaluate(w);
+            //if ( performTiming_ ) { gt_timer1_.stop(); }
+
+            long long N = (long long)target.get_number_of_elements();
+
+            //if ( performTiming_ ) { gt_timer1_.start("2"); }
+            //mu1.copyFrom(target);
+            //mu2.copyFrom(warped);
+            //Gadgetron::multiply(mu1, mu1, v1);
+            //Gadgetron::multiply(mu2, mu2, v2);
+            //Gadgetron::multiply(mu1, mu2, v12);
+
+            long long n;
+
+            ValueType* pT = target.begin();
+            ValueType* pW = warped.begin();
+
+            for ( n=0; n<N; ++n )
+            {
+                const computing_value_type v1 = (computing_value_type)pT[n];
+                const computing_value_type v2 = (computing_value_type)pW[n];
+
+                p_mu1[n] = v1;
+                p_mu2[n] = v2;
+                p_v1[n] = v1*v1;
+                p_v2[n] = v2*v2;
+                p_v12[n] = v1*v2;
+            }
+
+                //#ifdef WIN32
+                    Gadgetron::filterGaussian(mu1, sigmaArg_, mem_.begin());
+                    Gadgetron::filterGaussian(mu2, sigmaArg_, mem_.begin());
+                    Gadgetron::filterGaussian(v1, sigmaArg_, mem_.begin());
+                    Gadgetron::filterGaussian(v2, sigmaArg_, mem_.begin());
+                    Gadgetron::filterGaussian(v12, sigmaArg_, mem_.begin());
+                //#else
+                //    Gadgetron::filterGaussian(mu1, sigmaArg_);
+                //    Gadgetron::filterGaussian(mu2, sigmaArg_);
+                //    Gadgetron::filterGaussian(v1, sigmaArg_);
+                //    Gadgetron::filterGaussian(v2, sigmaArg_);
+                //    Gadgetron::filterGaussian(v12, sigmaArg_);
+                //#endif // WIN32
+
+            //if ( 0 )
+            //{
+            //    //#pragma omp parallel sections if ( D==2 )
+            //    {
+            //        //#pragma omp section
+            //        {
+            //            Gadgetron::multiply(mu1, mu1, vv1);
+            //            Gadgetron::subtract(v1, vv1, vv1);
+            //            Gadgetron::addEpsilon(vv1);
+            //        }
+
+            //        //#pragma omp section
+            //        {
+            //            Gadgetron::multiply(mu2, mu2, vv2);
+            //            Gadgetron::subtract(v2, vv2, vv2);
+            //            Gadgetron::addEpsilon(vv2);
+            //        }
+
+            //        //#pragma omp section
+            //        {
+            //            Gadgetron::multiply(mu1, mu2, vv12);
+            //            Gadgetron::subtract(v12, vv12, vv12);
+            //        }
+            //    }
+
+            //    Gadgetron::multiply(vv1, vv2, vv1);
+            //    Gadgetron::divide(vv12, vv1, v1); // ff1
+
+            //    Gadgetron::multiply(vv12, v1, cc); // cc
+
+            //    Gadgetron::divide(cc, vv2, v2); // ff2
+            //    Gadgetron::scal( (computing_value_type)(-1), v2);
+
+            //    Gadgetron::multiply(v2, mu2, v12);
+            //    Gadgetron::multiply(v1, mu1, vv12);
+            //    Gadgetron::add(v12, vv12, v12);
+
+            //    computing_value_type v=0;
+            //    Gadgetron::norm1(cc, v);
+
+            //    dissimilarity_ = static_cast<T>(-v/N);
+            //}
+
+            dissimilarity_ = 0;
+            computing_value_type v=0;
+
+            //#pragma omp parallel for private(n)
+            for ( n=0; n<N; ++n )
+            {
+                const computing_value_type u1 = p_mu1[n];
+                const computing_value_type u2 = p_mu2[n];
+
+                const computing_value_type vv1 = p_v1[n] - u1 * u1;
+                const computing_value_type vv2 = p_v2[n] - u2 * u2;
+                const computing_value_type vv12 = p_v12[n] - u1 * u2;
+
+                const computing_value_type ff1 = vv12 / (vv1 * vv2);
+                const computing_value_type lcc = vv12 * ff1;
+
+                const computing_value_type ff2 = - lcc / vv2;
+                const computing_value_type ff3 = ff2 * u2 + ff1 * u1;
+
+                p_v1[n] = ff1; p_v2[n] = ff2; p_v12[n] = ff3;
+
+                p_cc[n] = lcc;
+            }
+
+            computing_value_type lcc = 0;
+
+            // #pragma omp parallel for reduction(+:lcc)
+            for (n=0; n<N; n++)
+            {
+                lcc += cc[n];
+            }
+
+            dissimilarity_ = -lcc/N;
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDissimilarityLocalCCR<ValueType, D>::evaluate(w) ... ");
+        }
+
+        return this->dissimilarity_;
+    }
+
+    template<typename ValueType, unsigned int D> 
+    bool hoImageRegDissimilarityLocalCCR<ValueType, D>::evaluateDeriv(ImageType& w)
+    {
+        try
+        {
+            this->evaluate(w);
+
+            size_t N = target.get_number_of_elements();
+
+            long long n;
+
+            //#pragma omp parallel sections if ( D==2 )
+            {
+                //#ifdef WIN32
+                    //#pragma omp section
+                    {
+                        Gadgetron::filterGaussian(v1, sigmaArg_, mem_.begin());
+                    }
+
+                    //#pragma omp section
+                    {
+                        Gadgetron::filterGaussian(v2, sigmaArg_, mem_.begin());
+                    }
+
+                    //#pragma omp section
+                    {
+                        Gadgetron::filterGaussian(v12, sigmaArg_, mem_.begin());
+                    }
+                //#else
+                //    Gadgetron::filterGaussian(v1, sigmaArg_);
+                //    Gadgetron::filterGaussian(v2, sigmaArg_);
+                //    Gadgetron::filterGaussian(v12, sigmaArg_);
+                //#endif // WIN32
+            }
+
+            // deriv = f1*i1 + f2*i2 + f3, we don't need to multiply this by 2.0
+
+            //if ( typeid(ValueType) == typeid(computing_value_type) )
+            //{
+                //Gadgetron::multiply(v1, target, mu1);
+                //Gadgetron::multiply(v2, warped, mu2);
+                //Gadgetron::add(mu1, mu2, deriv);
+                //Gadgetron::subtract(deriv, v12, deriv);
+            //}
+            //else
+            //{
+                T* pT = target.begin();
+                T* pW = warped.begin();
+
+                // #pragma omp parallel for default(none) shared(N, pT, pW)
+                for ( n=0; n<(long long)N; n++ )
+                {
+                    deriv(n) = static_cast<T>( p_v1[n]* (computing_value_type)pT[n] + ( p_v2[n]*(computing_value_type)pW[n] - p_v12[n] ) );
+                }
+            //}
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDissimilarityLocalCCR<ValueType, D>::evaluateDeriv(w) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, unsigned int D> 
+    void hoImageRegDissimilarityLocalCCR<ValueType, D>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "--------------Gagdgetron image dissimilarity LocalCCR measure -------------" << endl;
+        os << "Image dimension is : " << D << endl;
+
+        std::string elemTypeName = std::string(typeid(ValueType).name());
+        os << "Transformation data type is : " << elemTypeName << endl << ends;
+    }
+}
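
[Editor's note: the per-pixel quantity accumulated in evaluate() above reduces to the squared local correlation coefficient built from Gaussian-smoothed moments. A minimal sketch, assuming mu1 = G*I1, mu2 = G*I2, v1 = G*(I1*I1), v2 = G*(I2*I2) and v12 = G*(I1*I2) have already been computed at a pixel; the helper name is hypothetical.]

    // Squared local correlation at one pixel; the dissimilarity is the negated mean over pixels.
    double local_cc(double mu1, double mu2, double v1, double v2, double v12)
    {
        double var1  = v1  - mu1 * mu1;   // local variance of the target
        double var2  = v2  - mu2 * mu2;   // local variance of the warped image
        double cov12 = v12 - mu1 * mu2;   // local covariance
        return (cov12 * cov12) / (var1 * var2);
    }
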
diff --git a/toolboxes/registration/optical_flow/cpu/dissimilarity/hoImageRegDissimilarityMutualInformation.h b/toolboxes/registration/optical_flow/cpu/dissimilarity/hoImageRegDissimilarityMutualInformation.h
new file mode 100644
index 0000000..96d3fff
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/dissimilarity/hoImageRegDissimilarityMutualInformation.h
@@ -0,0 +1,289 @@
+/** \file   hoImageRegDissimilarityMutualInformation.h
+    \brief  Define the class to compute mutual information.
+
+            The analytical derivatives are computed by using the formula proposed at:
+
+            [1] Gerardo Hermosillo, Christophe Chefd'Hotel, Olivier Faugeras. Variational Methods for Multimodal Image Matching. 
+            International Journal of Computer Vision. December 2002, Volume 50, Issue 3, pp 329-343.
+            http://link.springer.com/article/10.1023%2FA%3A1020830525823
+
+            [2] Gerardo Hermosillo. Variational Methods for Multimodal Image Matching. PhD Thesis, UNIVERSITÉ DE NICE - SOPHIA ANTIPOLIS. May 2002.
+            http://webdocs.cs.ualberta.ca/~dana/readingMedIm/papers/hermosilloPhD.pdf
+
+            This derivative computation code is based on the source code listed on pages 172 - 174 of ref [2].
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoImageRegDissimilarityHistogramBased.h"
+
+namespace Gadgetron
+{
+    template<typename ValueType, unsigned int D> 
+    class hoImageRegDissimilarityMutualInformation : public hoImageRegDissimilarityHistogramBased<ValueType, D>
+    {
+    public:
+
+        typedef hoImageRegDissimilarityMutualInformation<ValueType, D> Self;
+        typedef hoImageRegDissimilarityHistogramBased<ValueType, D> BaseClass;
+
+        typedef typename BaseClass::ImageType ImageType;
+        typedef typename BaseClass::InterpolatorType InterpolatorType;
+
+        typedef ValueType T;
+        typedef ValueType element_type;
+        typedef ValueType value_type;
+
+        typedef typename BaseClass::coord_type coord_type;
+
+        typedef typename BaseClass::hist_value_type hist_value_type;
+
+        hoImageRegDissimilarityMutualInformation(ValueType betaArg=ValueType(2.0), unsigned int num_bin_target=64, unsigned int num_bin_warpped=64, ValueType bg_value=ValueType(0));
+        virtual ~hoImageRegDissimilarityMutualInformation();
+
+        virtual ValueType evaluate(ImageType& w);
+        virtual bool evaluateDeriv(ImageType& w);
+
+        virtual void print(std::ostream& os) const;
+
+        /// kernel size for density estimation
+        ValueType betaArg_[2];
+
+        using BaseClass::num_bin_target_;
+        using BaseClass::num_bin_warpped_;
+        using BaseClass::pv_interpolation_;
+        using BaseClass::step_size_ignore_pixel_;
+        using BaseClass::gt_timer1_;
+        using BaseClass::gt_timer2_;
+        using BaseClass::gt_timer3_;
+        using BaseClass::performTiming_;
+        using BaseClass::gt_exporter_;
+        using BaseClass::debugFolder_;
+
+    protected:
+
+        using BaseClass::target_;
+        using BaseClass::warpped_;
+        using BaseClass::deriv_;
+        using BaseClass::bg_value_;
+        using BaseClass::dissimilarity_;
+        using BaseClass::target;
+        using BaseClass::warped;
+        using BaseClass::deriv;
+        using BaseClass::image_dim_;
+        using BaseClass::hist_;
+        using BaseClass::min_target_;
+        using BaseClass::max_target_;
+        using BaseClass::min_warpped_;
+        using BaseClass::max_warpped_;
+        using BaseClass::num_samples_in_hist_;
+
+        hoNDArray<hist_value_type> hist_target_;
+        hoNDArray<hist_value_type> hist_warpped_;
+
+        /// these variable names are kept the same as in ref [2].
+        ho2DArray<hist_value_type> Hy;
+        hoNDArray<hist_value_type> hy;
+
+        ho2DArray<hist_value_type> P;
+        hoNDArray<hist_value_type> p;
+
+        ho2DArray<hist_value_type> Dist;
+    };
+
+    template<typename ValueType, unsigned int D> 
+    hoImageRegDissimilarityMutualInformation<ValueType, D>::
+    hoImageRegDissimilarityMutualInformation(ValueType betaArg, unsigned int num_bin_target, unsigned int num_bin_warpped, ValueType bg_value) 
+        : BaseClass(num_bin_target, num_bin_warpped, bg_value) 
+    {
+        betaArg_[0] = betaArg;
+        betaArg_[1] = betaArg;
+    }
+
+    template<typename ValueType, unsigned int D> 
+    hoImageRegDissimilarityMutualInformation<ValueType, D>::~hoImageRegDissimilarityMutualInformation()
+    {
+    }
+
+    template<typename ValueType, unsigned int D> 
+    ValueType hoImageRegDissimilarityMutualInformation<ValueType, D>::evaluate(ImageType& w)
+    {
+        try
+        {
+            BaseClass::evaluate(w);
+
+            /// compute entropy
+            hist_target_.create(num_bin_target_);
+            Gadgetron::clear(hist_target_);
+
+            hist_warpped_.create(num_bin_warpped_);
+            Gadgetron::clear(hist_warpped_);
+
+            if ( betaArg_[0] > 0 )
+            {
+                Gadgetron::filterGaussian(hist_, betaArg_);
+            }
+
+            hist_value_type histSum=0;
+            Gadgetron::norm1(hist_, histSum);
+            Gadgetron::scal( hist_value_type(1.0/histSum), hist_);
+
+            hist_.sumOverRow(hist_target_);
+            hist_.sumOverCol(hist_warpped_);
+
+            dissimilarity_ = 0;
+
+            size_t t, w;
+
+            hist_value_type log2R = hist_value_type(1.0)/log( hist_value_type(2.0) );
+
+            for ( w=0; w<num_bin_warpped_; w++ )
+            {
+                hist_value_type prob_w = hist_warpped_(w);
+
+                for ( t=0; t<num_bin_target_; t++ )
+                {
+                    hist_value_type prob = hist_(t, w);
+                    if ( prob > 0 )
+                    {
+                        hist_value_type prob_t = hist_target_(t);
+
+                        dissimilarity_ -= prob * log( prob / (prob_t * prob_w) ) * log2R;
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDissimilarityMutualInformation<ValueType, D>::evaluate(ImageType& t, ImageType& w) ... ");
+        }
+
+        return this->dissimilarity_;
+    }
+
+    template<typename ValueType, unsigned int D> 
+    bool hoImageRegDissimilarityMutualInformation<ValueType, D>::evaluateDeriv(ImageType& w)
+    {
+        try
+        {
+            this->evaluate(w);
+
+            Hy.createArray(num_bin_target_, num_bin_warpped_);
+            hy.create(num_bin_warpped_);
+
+            P.createArray(num_bin_target_, num_bin_warpped_);
+            p.create(num_bin_warpped_);
+
+            Dist.createArray(num_bin_target_, num_bin_warpped_);
+
+            // hoNDBoundaryHandlerFixedValue< hoMatrix<hist_value_type> > bh_hist(hist_, 0);
+
+            size_t t, w;
+
+            for ( t=0; t<num_bin_target_; t++ )
+            {
+                Hy(t, 0) = ( hist_(t, 1) - 0 );
+                Hy(t, num_bin_warpped_-1) = ( 0 - hist_(t, num_bin_warpped_-2) );
+
+                for ( w=1; w<num_bin_warpped_-1; w++ )
+                {
+                    Hy(t, w) = ( hist_(t, w+1) - hist_(t, w-1) );
+                }
+            }
+
+            Gadgetron::scal( (hist_value_type)(0.5), Hy);
+
+            hoNDBoundaryHandlerFixedValue< hoNDArray<hist_value_type> > bh_hist_warpped(hist_warpped_, 0);
+            for ( w=0; w<num_bin_warpped_; w++ )
+            {
+                hy(w) = (hist_value_type)(0.5) * ( bh_hist_warpped(w+1) - bh_hist_warpped(w-1) );
+            }
+
+            P = Hy;
+            p = hy;
+
+            for ( t=0; t<num_bin_target_; t++ )
+            {
+                for ( w=0; w<num_bin_warpped_; w++ )
+                {
+                    hist_value_type v = hist_(t, w);
+
+                    if ( v > 0 )
+                    {
+                        P(t, w) = Hy(t, w)/v;
+                    }
+                }
+            }
+
+            for ( w=0; w<num_bin_warpped_; w++ )
+            {
+                hist_value_type v = hist_warpped_(w);
+
+                if ( v > 0 )
+                {
+                    p(w) = hy(w)/v;
+                }
+            }
+
+            for ( t=0; t<num_bin_target_; t++ )
+            {
+                for ( w=0; w<num_bin_warpped_; w++ )
+                {
+                    Dist(t, w) = P(t, w) - p(w);
+                }
+            }
+
+            if ( betaArg_[0] > 0 )
+            {
+                Gadgetron::filterGaussian(Dist, betaArg_);
+            }
+
+            hoNDBoundaryHandlerFixedValue< ho2DArray<hist_value_type> > bh_Dist(Dist, 0);
+            hoNDInterpolatorLinear< ho2DArray<hist_value_type> > interp_Dist(Dist, bh_Dist);
+
+            size_t N = target_->get_number_of_elements();
+
+            ValueType range_t = ValueType(1.0)/(max_target_ - min_target_ + std::numeric_limits<ValueType>::epsilon());
+            ValueType range_w = ValueType(1.0)/(max_warpped_ - min_warpped_ + std::numeric_limits<ValueType>::epsilon());
+
+            long long n;
+
+            ValueType v = (ValueType)(1.0/N);
+            for ( n=0; n<(long long)N; n++ )
+            {
+                coord_type it = (coord_type)(range_t*(target(n)-min_target_)*(num_bin_target_-1));
+                coord_type iw = (coord_type)(range_w*(warped(n)-min_warpped_)*(num_bin_warpped_-1));
+
+                deriv_(n) = ValueType( interp_Dist(it, iw) ) * v;
+            }
+
+            // Gadgetron::math::scal(deriv_.get_number_of_elements(), ValueType(1.0/N), deriv_.begin());
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDissimilarityMutualInformation<ValueType, D>::evaluate() ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, unsigned int D> 
+    void hoImageRegDissimilarityMutualInformation<ValueType, D>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "--------------Gagdgetron mutual information image dissimilarity meausre -------------" << endl;
+        os << "Image dimension is : " << D << endl;
+
+        std::string elemTypeName = std::string(typeid(ValueType).name());
+        os << "Transformation data type is : " << elemTypeName << std::endl;
+
+        os << "Number of intensity bins for target is : " << num_bin_target_ << endl;
+        os << "Number of intensity bins for warped is : " << num_bin_warpped_ << endl;
+        os << "PV interpolation for histogram is : " << pv_interpolation_ << endl;
+        os << "Step size to ignore pixels when creating histogram is : " << step_size_ignore_pixel_ << endl;
+        os << "Kernel size for probability density estimation is : " << betaArg_[0] << " x " << betaArg_[1] << endl;
+    }
+}
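
[Editor's note: the double loop in evaluate() above implements the usual mutual-information sum over the normalized joint histogram. A minimal sketch with hypothetical names, assuming the joint probabilities p(t,w) and the marginals pt(t), pw(w) have already been formed; the class stores the negated value as the dissimilarity.]

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // p is row-major nbt x nbw and sums to one.
    double mutual_information(const std::vector<double>& p,
                              const std::vector<double>& pt,
                              const std::vector<double>& pw,
                              std::size_t nbt, std::size_t nbw)
    {
        const double log2r = 1.0 / std::log(2.0);   // convert natural log to bits
        double mi = 0;
        for (std::size_t w = 0; w < nbw; w++)
        {
            for (std::size_t t = 0; t < nbt; t++)
            {
                double prob = p[t * nbw + w];
                if (prob > 0)
                {
                    mi += prob * std::log(prob / (pt[t] * pw[w])) * log2r;
                }
            }
        }
        return mi;
    }
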
diff --git a/toolboxes/registration/optical_flow/cpu/dissimilarity/hoImageRegDissimilarityNormalizedMutualInformation.h b/toolboxes/registration/optical_flow/cpu/dissimilarity/hoImageRegDissimilarityNormalizedMutualInformation.h
new file mode 100644
index 0000000..69968ec
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/dissimilarity/hoImageRegDissimilarityNormalizedMutualInformation.h
@@ -0,0 +1,173 @@
+/** \file   hoImageRegDissimilarityNormalizedMutualInformation.h
+    \brief  Define the class to compute normalized mutual information.
+
+            C. Studholme, D.L.G. Hill, D.J. Hawkes. An overlap invariant entropy measure of 3D medical image alignment. Pattern Recognition, 32, 71-86, 1999.
+            http://eecs.vanderbilt.edu/courses/cs359/other_links/papers/studholme_NMI_1999.pdf
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoImageRegDissimilarityHistogramBased.h"
+
+namespace Gadgetron
+{
+    template<typename ValueType, unsigned int D> 
+    class hoImageRegDissimilarityNormalizedMutualInformation : public hoImageRegDissimilarityHistogramBased<ValueType, D>
+    {
+    public:
+
+        typedef hoImageRegDissimilarityNormalizedMutualInformation<ValueType, D> Self;
+        typedef hoImageRegDissimilarityHistogramBased<ValueType, D> BaseClass;
+
+        typedef typename BaseClass::ImageType ImageType;
+        typedef typename BaseClass::InterpolatorType InterpolatorType;
+
+        typedef ValueType T;
+        typedef ValueType element_type;
+        typedef ValueType value_type;
+
+        typedef typename BaseClass::coord_type coord_type;
+
+        typedef typename BaseClass::hist_value_type hist_value_type;
+
+        hoImageRegDissimilarityNormalizedMutualInformation(unsigned int num_bin_target=64, unsigned int num_bin_warpped=64, ValueType bg_value=ValueType(0));
+        virtual ~hoImageRegDissimilarityNormalizedMutualInformation();
+
+        virtual ValueType evaluate(ImageType& w);
+
+        virtual bool evaluateDeriv(ImageType& /*w*/) { return true; }
+
+        virtual void print(std::ostream& os) const;
+
+        using BaseClass::num_bin_target_;
+        using BaseClass::num_bin_warpped_;
+        using BaseClass::pv_interpolation_;
+        using BaseClass::step_size_ignore_pixel_;
+        using BaseClass::gt_timer1_;
+        using BaseClass::gt_timer2_;
+        using BaseClass::gt_timer3_;
+        using BaseClass::performTiming_;
+        using BaseClass::gt_exporter_;
+        using BaseClass::debugFolder_;
+
+    protected:
+
+        using BaseClass::target_;
+        using BaseClass::warpped_;
+        using BaseClass::deriv_;
+        using BaseClass::image_dim_;
+        using BaseClass::bg_value_;
+        using BaseClass::dissimilarity_;
+        using BaseClass::target;
+        using BaseClass::warped;
+        using BaseClass::deriv;
+        using BaseClass::hist_;
+        using BaseClass::min_target_;
+        using BaseClass::max_target_;
+        using BaseClass::min_warpped_;
+        using BaseClass::max_warpped_;
+        using BaseClass::num_samples_in_hist_;
+
+        hoNDArray<hist_value_type> hist_target_;
+        hoNDArray<hist_value_type> hist_warpped_;
+    };
+
+    template<typename ValueType, unsigned int D> 
+    hoImageRegDissimilarityNormalizedMutualInformation<ValueType, D>::
+    hoImageRegDissimilarityNormalizedMutualInformation(unsigned int num_bin_target, unsigned int num_bin_warpped, ValueType bg_value) 
+        : BaseClass(num_bin_target, num_bin_warpped, bg_value)
+    {
+    }
+
+    template<typename ValueType, unsigned int D> 
+    hoImageRegDissimilarityNormalizedMutualInformation<ValueType, D>::~hoImageRegDissimilarityNormalizedMutualInformation()
+    {
+    }
+
+    template<typename ValueType, unsigned int D> 
+    ValueType hoImageRegDissimilarityNormalizedMutualInformation<ValueType, D>::evaluate(ImageType& w)
+    {
+        try
+        {
+            BaseClass::evaluate(w);
+
+            // convert to probabilities
+            if ( num_samples_in_hist_ > 0 )
+            {
+                Gadgetron::scal((hist_value_type)(1.0/num_samples_in_hist_), hist_);
+            }
+
+            /// compute entropy
+            hist_target_.create(num_bin_target_);
+            Gadgetron::clear(hist_target_);
+
+            hist_warpped_.create(num_bin_warpped_);
+            Gadgetron::clear(hist_warpped_);
+
+            hist_.sumOverRow(hist_target_);
+            hist_.sumOverCol(hist_warpped_);
+
+            hist_value_type entropy_t(0), entropy_w(0), joint_entropy(0);
+
+            size_t t, w;
+
+            hist_value_type log2 = hist_value_type(1.0)/log( hist_value_type(2.0) );
+
+            for ( t=0; t<(size_t)num_bin_target_; t++ )
+            {
+                hist_value_type prob = hist_target_(t);
+                if ( prob > 0 )
+                {
+                    entropy_t -= prob * log(prob) * log2;
+                }
+            }
+
+            for ( w=0; w<num_bin_warpped_; w++ )
+            {
+                hist_value_type prob = hist_warpped_(w);
+                if ( prob > 0 )
+                {
+                    entropy_w -= prob * log(prob) * log2;
+                }
+            }
+
+            for ( w=0; w<num_bin_warpped_; w++ )
+            {
+                for ( t=0; t<num_bin_target_; t++ )
+                {
+                    hist_value_type prob = hist_(t, w);
+                    if ( prob > 0 )
+                    {
+                        joint_entropy -= prob * log(prob) * log2;
+                    }
+                }
+            }
+
+            dissimilarity_ = - (entropy_t + entropy_w) / joint_entropy;
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDissimilarityNormalizedMutualInformation<ValueType, D>::evaluate(ImageType& t, ImageType& w) ... ");
+        }
+
+        return this->dissimilarity_;
+    }
+
+    template<typename ValueType, unsigned int D> 
+    void hoImageRegDissimilarityNormalizedMutualInformation<ValueType, D>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "--------------Gagdgetron image dissimilarity with histogram -------------" << endl;
+        os << "Image dimension is : " << D << endl;
+
+        std::string elemTypeName = std::string(typeid(ValueType).name());
+        os << "Transformation data type is : " << elemTypeName << std::endl;
+
+        os << "Number of intensity bins for target is : " << num_bin_target_ << endl;
+        os << "Number of intensity bins for warped is : " << num_bin_warpped_ << endl;
+        os << "PV interpolation for histogram is : " << pv_interpolation_ << endl;
+        os << "Step size to ignore pixels when creating histogram is : " << step_size_ignore_pixel_ << endl;
+    }
+}
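
[Editor's note: for reference, a minimal sketch with hypothetical helpers, not part of the patch, of the Studholme normalized mutual information computed in evaluate() above: marginal and joint entropies in bits, combined as (H(T) + H(W)) / H(T,W); the class stores the negated value as the dissimilarity.]

    #include <cmath>
    #include <cstddef>
    #include <vector>

    double entropy_bits(const std::vector<double>& p)
    {
        const double log2r = 1.0 / std::log(2.0);
        double h = 0;
        for (std::size_t i = 0; i < p.size(); i++)
        {
            if (p[i] > 0) h -= p[i] * std::log(p[i]) * log2r;
        }
        return h;
    }

    // joint is the normalized joint histogram flattened to a vector; pt and pw are its marginals.
    double normalized_mutual_information(const std::vector<double>& joint,
                                         const std::vector<double>& pt,
                                         const std::vector<double>& pw)
    {
        return (entropy_bits(pt) + entropy_bits(pw)) / entropy_bits(joint);
    }
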
diff --git a/toolboxes/registration/optical_flow/cpu/dissimilarity/hoImageRegDissimilaritySSD.h b/toolboxes/registration/optical_flow/cpu/dissimilarity/hoImageRegDissimilaritySSD.h
new file mode 100644
index 0000000..cdfbceb
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/dissimilarity/hoImageRegDissimilaritySSD.h
@@ -0,0 +1,108 @@
+/** \file   hoImageRegDissimilaritySSD.h
+    \brief  Define the class to compute the image sum of squared differences (SSD) in gadgetron registration
+
+            The analytical derivatives are computed by using the formula proposed at:
+
+            [1] Gerardo Hermosillo, Christophe Chefd'Hotel, Olivier Faugeras. Variational Methods for Multimodal Image Matching. 
+            International Journal of Computer Vision. December 2002, Volume 50, Issue 3, pp 329-343.
+            http://link.springer.com/article/10.1023%2FA%3A1020830525823
+
+            [2] Gerardo Hermosillo. Variational Methods for Multimodal Image Matching. PhD Thesis, UNIVERSITÉ DE NICE - SOPHIA ANTIPOLIS. May 2002.
+            http://webdocs.cs.ualberta.ca/~dana/readingMedIm/papers/hermosilloPhD.pdf
+
+            The derivative computation code is modified from the source code listed on pages 179 - 185 of ref [2].
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoImageRegDissimilarity.h"
+
+namespace Gadgetron
+{
+    template<typename ValueType, unsigned int D> 
+    class hoImageRegDissimilaritySSD : public hoImageRegDissimilarity<ValueType, D>
+    {
+    public:
+
+        typedef hoImageRegDissimilaritySSD<ValueType, D> Self;
+        typedef hoImageRegDissimilarity<ValueType, D> BaseClass;
+
+        typedef typename BaseClass::ImageType ImageType;
+        typedef typename BaseClass::InterpolatorType InterpolatorType;
+
+        typedef ValueType T;
+        typedef ValueType element_type;
+        typedef ValueType value_type;
+
+        typedef typename BaseClass::coord_type coord_type;
+
+        hoImageRegDissimilaritySSD();
+        virtual ~hoImageRegDissimilaritySSD();
+
+        virtual ValueType evaluate(ImageType& w);
+        virtual bool evaluateDeriv(ImageType& w) { this->evaluate(w); return true; }
+
+        virtual void print(std::ostream& os) const;
+
+        using BaseClass::gt_timer1_;
+        using BaseClass::gt_timer2_;
+        using BaseClass::gt_timer3_;
+        using BaseClass::performTiming_;
+        using BaseClass::gt_exporter_;
+        using BaseClass::debugFolder_;
+
+    protected:
+
+        using BaseClass::target_;
+        using BaseClass::warpped_;
+        using BaseClass::deriv_;
+        using BaseClass::bg_value_;
+        using BaseClass::dissimilarity_;
+        using BaseClass::target;
+        using BaseClass::warped;
+        using BaseClass::deriv;
+    };
+
+    template<typename ValueType, unsigned int D> 
+    hoImageRegDissimilaritySSD<ValueType, D>::hoImageRegDissimilaritySSD() : BaseClass()
+    {
+    }
+
+    template<typename ValueType, unsigned int D> 
+    hoImageRegDissimilaritySSD<ValueType, D>::~hoImageRegDissimilaritySSD()
+    {
+    }
+
+    template<typename ValueType, unsigned int D> 
+    ValueType hoImageRegDissimilaritySSD<ValueType, D>::evaluate(ImageType& w)
+    {
+        try
+        {
+            BaseClass::evaluate(w);
+
+            Gadgetron::subtract(target, warped, deriv);
+            Gadgetron::norm2(deriv, dissimilarity_);
+
+            dissimilarity_ = (dissimilarity_*dissimilarity_) / (ValueType)(target.get_number_of_elements());
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDissimilaritySSD<ValueType, D>::evaluate(w) ... ");
+        }
+
+        return this->dissimilarity_;
+    }
+
+    template<typename ValueType, unsigned int D> 
+    void hoImageRegDissimilaritySSD<ValueType, D>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "--------------Gagdgetron image dissimilarity SSD measure -------------" << endl;
+        os << "Image dimension is : " << D << endl;
+
+        std::string elemTypeName = std::string(typeid(ValueType).name());
+        os << "Transformation data type is : " << elemTypeName << std::endl;
+    }
+}
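
[Editor's note: the SSD value computed in evaluate() above is just the mean squared intensity difference between the target and warped images; a minimal standalone sketch on raw buffers, with a hypothetical helper name.]

    #include <cstddef>

    float ssd_value(const float* target, const float* warped, std::size_t n)
    {
        double sum = 0;
        for (std::size_t i = 0; i < n; i++)
        {
            double d = static_cast<double>(target[i]) - static_cast<double>(warped[i]);
            sum += d * d;
        }
        return static_cast<float>(sum / n);   // matches norm2(target - warped)^2 / N above
    }
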
diff --git a/toolboxes/registration/optical_flow/cpu/hoCKOpticalFlowSolver.cpp b/toolboxes/registration/optical_flow/cpu/hoCKOpticalFlowSolver.cpp
new file mode 100644
index 0000000..147a5cc
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/hoCKOpticalFlowSolver.cpp
@@ -0,0 +1,297 @@
+#include "hoCKOpticalFlowSolver.h"
+#include "vector_td_utilities.h"
+
+#ifdef USE_OMP
+#include <omp.h>
+#endif
+
+namespace Gadgetron{
+
+  // Helpers
+  //
+  
+  template<unsigned int D> inline bool
+  is_border_pixel_for_stride( typename int64d<D>::Type stride, typename uint64d<D>::Type co, typename uint64d<D>::Type dims )
+  {
+    for( size_t d=0; d<D; d++ ){
+      if( stride.vec[d] == -1 ){
+	if( co.vec[d] == 0 ){
+	  return true;
+	}
+      }
+      else if( stride.vec[d] == 1 ){
+	if( co.vec[d] == (dims.vec[d]-1) ){
+	  return true;
+	}
+      }
+    }
+    return false;
+  }
+
+  template<size_t i, size_t j> struct Pow
+  {
+    enum { Value = i*Pow<i,j-1>::Value };
+  };
+  
+  template <size_t i> struct Pow<i,1>
+  {
+    enum { Value = i };
+  };
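
    [Editor's note: Pow<i,j> above evaluates i^j by template recursion, and the solver later uses Pow<3,D>::Value as the size of the {-1,0,1}^D neighbourhood stencil. A hypothetical sanity check, not part of the patch:]

        #include <cassert>

        // Pow<i,j> expands to i^j at compile time.
        inline void check_pow_neighbourhood_sizes()
        {
            assert((Gadgetron::Pow<3, 2>::Value) == 9);    // 3x3 stencil in 2D
            assert((Gadgetron::Pow<3, 3>::Value) == 27);   // 3x3x3 stencil in 3D
        }
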
+  
+  //
+  // Implementation
+  //
+
+  template<class T, unsigned int D> boost::shared_ptr< hoNDArray<T> >
+  hoCKOpticalFlowSolver<T,D>::core_solver( hoNDArray<T> *_gradient_image, hoNDArray<T> *_stencil_image )
+  {
+    // Sanity checks
+    //
+  
+    if( !_gradient_image ){
+      throw std::runtime_error("hoCKOpticalFlowSolver::core_solver(): illegal input gradient image received.");
+    }
+
+    if( _gradient_image->get_number_of_dimensions() <= D ){
+      throw std::runtime_error("hoCKOpticalFlowSolver::core_solver(): number of gradient image dimensions is too small.");
+    }
+  
+    // The dimensions of the displacement field should match the gradient field
+    //
+  
+    boost::shared_ptr< std::vector<size_t> > disp_dims = _gradient_image->get_dimensions();
+    boost::shared_ptr< hoNDArray<T> > displacements_ping( new hoNDArray<T>(disp_dims.get()) );
+    boost::shared_ptr< hoNDArray<T> > displacements_pong( new hoNDArray<T>(disp_dims.get()) );
+    clear(displacements_ping.get());
+    clear(displacements_pong.get());
+    
+    // We use "shared memory" to hold the averaged displacements
+    boost::shared_ptr< hoNDArray<T> > _shared_mem(new hoNDArray<T>(disp_dims.get()));
+    T *shared_mem = _shared_mem->get_data_ptr();
+    clear( _shared_mem.get());
+
+    typename uint64d<D>::Type matrix_size = from_std_vector<size_t,D>( *disp_dims );  
+    size_t number_of_elements = prod(matrix_size);
+    size_t num_batches = 1;
+
+    for( size_t d=D; d<_gradient_image->get_number_of_dimensions()-1; d++ ){
+      num_batches *= _gradient_image->get_size(d);
+    }
+  
+    // Get ready
+    // 
+
+    size_t iteration_no = 0;
+    hoNDArray<T> *ping = displacements_ping.get();
+    hoNDArray<T> *pong = displacements_pong.get(); 
+
+    if( this->output_mode_ >= hoOpticalFlowSolver<T,D>::OUTPUT_VERBOSE ) {
+      GDEBUG_STREAM(std::endl);
+    }
+
+    //
+    // Main Jacobi loop
+    //
+    
+    while(true){
+    
+      if( this->output_mode_ >= hoOpticalFlowSolver<T,D>::OUTPUT_VERBOSE ) {
+	GDEBUG_STREAM("."; std::cout.flush());
+      }
+    
+      // Continuation flag used for early Jacobi termination
+      size_t continue_flag = 0;
+
+      // Number of elements per batch
+      const size_t num_elements_per_batch = prod(matrix_size);
+  
+      // Number of elements per dim
+      const size_t num_elements_per_dim = num_elements_per_batch*num_batches;
+
+      T *in_disp = ping->get_data_ptr();
+      T *out_disp = pong->get_data_ptr();
+      T *gradient_image = _gradient_image->get_data_ptr();
+      T *stencil_image = (_stencil_image) ? _stencil_image->get_data_ptr() : 0x0;
+
+      //
+      // Find the average velocities (shared memory)
+      //
+      
+      for( size_t dim = 0; dim < D+1; dim++ ){
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+	for( long long idx = 0; idx < (long long)num_elements_per_dim; idx++ ){
+	  	  
+	  // Index to the shared memory
+	  const size_t shared_idx = dim*num_elements_per_dim+idx;
+	  
+	  // Batch idx (second slowest varying dimension)   
+	  const size_t batch_idx = idx/num_elements_per_batch;
+	  
+	  // Local index to the image (or batch in our terminology)
+	  const size_t idx_in_batch = idx-batch_idx*num_elements_per_batch;
+    	  
+	  if( stencil_image && stencil_image[idx_in_batch] > T(0) )
+	    continue;
+
+	  // Local co to the image
+	  const typename uint64d<D>::Type co = idx_to_co<D>( idx_in_batch, matrix_size );    
+	  const typename int64d<D>::Type zeros(0);
+	  const typename int64d<D>::Type ones(1);
+	  const typename int64d<D>::Type threes(3);
+	  
+	  const int num_neighbors = Pow<3,D>::Value;
+	  T num_contribs = T(0);
+	  
+	  shared_mem[shared_idx] = T(0);
+	  
+	  // Compute average of neighbors
+	  //
+	  
+	  for( long long i=0; i<num_neighbors; i++ ){
+	    
+	    // Find the stride of the neighbor {-1, 0, 1}^D
+	    const typename int64d<D>::Type stride = idx_to_co<D>( i, threes ) - ones;
+	    
+	    size_t neighbor_idx;
+	    
+	    const size_t base_offset = dim*num_elements_per_dim + batch_idx*num_elements_per_batch;
+	    
+	    // Verify that the neighbor is not out of bounds (and not the thread itself)
+	    if( !is_border_pixel_for_stride<D>( stride, co, matrix_size ) && !(stride==zeros) ){	
+	      neighbor_idx = (size_t) co_to_idx<D>( vector_td<long long,D>(co)+stride, vector_td<long long,D>(matrix_size)) + base_offset;
+	    }
+	    else{
+	      neighbor_idx = idx_in_batch + base_offset;
+	    }
+	    
+	    shared_mem[shared_idx] += in_disp[neighbor_idx];
+	    num_contribs += T(1);
+	  }
+	  
+	  // Normalize
+	  shared_mem[shared_idx] /= num_contribs;
+	}
+      }
+      
+      //
+      // Update displacement field (Jacobi iteration)
+      //
+      
+      const T disp_thresh_sqr = this->limit_*this->limit_;
+
+      for( size_t dim = 0; dim < D+1; dim++ ){
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+	for( long long idx = 0; idx < (long long)num_elements_per_dim; idx++ ){
+	  // Index to the shared memory
+	  const size_t shared_idx = dim*num_elements_per_dim+idx;
+	  
+	  // Batch idx (second slowest varying dimension)   
+	  const size_t batch_idx = idx/num_elements_per_batch;
+	  
+	  // Local index to the image (or batch in our terminology)
+	  const size_t idx_in_batch = idx-batch_idx*num_elements_per_batch;
+    	  
+	  if( stencil_image && stencil_image[idx_in_batch] > T(0) )
+	    continue;
+
+	  T phi = T(0);
+	  T norm = T(0);
+	  
+	  typename reald<T,D>::Type derivatives;
+	  
+	  // Contributions from the spatial dimensions
+	  //
+	  
+	  for( size_t d=0; d<D; d++ ){
+	    derivatives.vec[d] = gradient_image[d*num_elements_per_dim+idx];
+	    const size_t shared_idx_d = d*num_elements_per_dim+idx;
+	    phi += (shared_mem[shared_idx_d]*derivatives.vec[d]);
+	    norm += (derivatives.vec[d]*derivatives.vec[d]);
+	  }
+	  
+	  // Contributions from the temporal dimension
+	  //
+	  
+	  phi += gradient_image[D*num_elements_per_dim+idx];
+	  
+	  // Contribution from the intensity attenuation estimation
+	  //
+	  
+	  phi -= shared_mem[D*num_elements_per_dim+idx];
+	  
+	  // Normalize
+	  //
+	  
+	  phi /= ((alpha_/beta_)*(alpha_/beta_)+alpha_*alpha_+norm);
+	  
+	  // Form result displacement
+	  //
+	  
+	  T result;
+	  
+	  if( dim<D )
+	    result = shared_mem[shared_idx]-derivatives.vec[dim]*phi;
+	  else
+	    result = shared_mem[D*num_elements_per_dim+idx]+(alpha_/beta_)*(alpha_/beta_)*phi;
+	  
+	  // Clear the "termination" flag if the displacement field has changed above the threshold
+	  //
+	  
+	  T delta = result-in_disp[dim*num_elements_per_dim+idx];
+	  if( dim < D && delta*delta > disp_thresh_sqr )
+	    continue_flag = 1;
+	  
+	  // Output result
+	  //
+	  
+	  out_disp[dim*num_elements_per_dim+idx] = result;
+	}
+      }
+      
+      // Swap in/out buffers
+      //
+      
+      hoNDArray<T> *tmp = ping;
+      ping = pong;
+      pong = tmp;
+
+      // Check termination criteria
+      //
+      
+      if( continue_flag == 0 ){
+	if( this->output_mode_ >= hoOpticalFlowSolver<T,D>::OUTPUT_VERBOSE ) {
+	  GDEBUG_STREAM(std::endl << "Break after " << iteration_no+1 << " iterations" << std::endl);
+	}
+	break;
+      }
+      
+      if( iteration_no > this->max_num_iterations_per_level_ ) 
+	break;    
+      
+      iteration_no++;
+    }
+    
+    if( ping == displacements_ping.get() )   
+      return displacements_ping;
+    else
+      return displacements_pong;
+  }
+  
+  // 
+  // Template instantiation
+  //
+
+  template class EXPORTCPUREG hoCKOpticalFlowSolver<float,1>;
+  template class EXPORTCPUREG hoCKOpticalFlowSolver<float,2>;
+  template class EXPORTCPUREG hoCKOpticalFlowSolver<float,3>;
+  template class EXPORTCPUREG hoCKOpticalFlowSolver<float,4>;
+
+  template class EXPORTCPUREG hoCKOpticalFlowSolver<double,1>;
+  template class EXPORTCPUREG hoCKOpticalFlowSolver<double,2>;
+  template class EXPORTCPUREG hoCKOpticalFlowSolver<double,3>;
+  template class EXPORTCPUREG hoCKOpticalFlowSolver<double,4>;  
+}
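
For reference, the Jacobi sweep implemented by hoCKOpticalFlowSolver::core_solver() above can be summarised as follows. Writing g_d for the spatial gradient components, g_t for the temporal gradient, \bar{u}_d for the 3^D-point neighbourhood average of the current displacement estimate (the centre pixel standing in for out-of-bounds neighbours) and \bar{e} for the corresponding average of the intensity-change component, the update read directly from the loop body is

    \phi = \frac{\sum_{d=1}^{D} g_d \bar{u}_d + g_t - \bar{e}}{(\alpha/\beta)^2 + \alpha^2 + \sum_{d=1}^{D} g_d^2},
    \qquad u_d \leftarrow \bar{u}_d - g_d\,\phi,
    \qquad e \leftarrow \bar{e} + (\alpha/\beta)^2\,\phi.

The sweep terminates when no displacement component changes by more than limit_ between iterations, or when max_num_iterations_per_level_ is exceeded.
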
diff --git a/toolboxes/registration/optical_flow/cpu/hoCKOpticalFlowSolver.h b/toolboxes/registration/optical_flow/cpu/hoCKOpticalFlowSolver.h
new file mode 100644
index 0000000..0db9689
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/hoCKOpticalFlowSolver.h
@@ -0,0 +1,55 @@
+/** \file hoCKOpticalFlowSolver.h
+    \brief CPU-based Cornelius-Kanade optical flow registration solver.
+
+    References to the solver implementation and some usage scenarios can be found in:
+
+    An optimised multi-baseline approach for on-line MR-temperature monitoring on commodity graphics hardware
+    BD de Senneville, KØ Noe, M Ries, M Pedersen, CTW Moonen, TS Sørensen.
+    5th IEEE International Symposium on Biomedical Imaging: From Nano to Macro, 2008. ISBI 2008. pp. 1513-1516.
+
+    Acceleration and validation of optical flow based deformable registration for image-guided radiotherapy.
+    KØ Noe, BD de Senneville, UV Elstrøm, K Tanderup, TS Sørensen.
+    Acta Oncologica 2008; 47(7): 1286-1293.
+
+    Retrospective reconstruction of high temporal resolution cine images from real‐time MRI using iterative motion correction
+    MS Hansen, TS Sørensen, AE Arai, P Kellman.
+    Magnetic Resonance in Medicine 2012; 68(3): 741-750.
+*/
+
+#pragma once
+
+#include "hoOpticalFlowSolver.h"
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> class EXPORTCPUREG hoCKOpticalFlowSolver 
+    : public hoOpticalFlowSolver<T, D>
+  {
+  
+  public:
+
+    // Constructors / destructors
+    //
+  
+    hoCKOpticalFlowSolver() : hoOpticalFlowSolver<T,D>(){ 
+      alpha_ = T(0.05); 
+      beta_ = T(1.0); 
+    } 
+  
+    virtual ~hoCKOpticalFlowSolver() {}
+  
+    // Set the regularization weight
+    //
+  
+    inline void set_alpha( T alpha ) { alpha_ = alpha; }
+    inline void set_beta( T beta ) { beta_ = beta; }
+  
+  protected:  
+    virtual boost::shared_ptr< hoNDArray<T> > 
+      core_solver( hoNDArray<T> *gradient_image, hoNDArray<T> *stencil );  
+    
+  protected:
+    T alpha_;
+    T beta_;
+  };
+}
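
A minimal usage sketch for this solver follows. Only set_alpha()/set_beta() are declared in this header, and the hoNDArray constructor taking a std::vector<size_t>* appears in the sources above; the commented-out solve() call is an assumption about the opticalFlowSolver base-class interface (not shown in this diff) and the 128x128 geometry is purely illustrative.

    #include "hoCKOpticalFlowSolver.h"

    #include <vector>

    using namespace Gadgetron;

    int main()
    {
        // Hypothetical 128x128 fixed/moving image pair (contents left uninitialized here)
        std::vector<size_t> dims(2, 128);
        hoNDArray<float> fixed(&dims);
        hoNDArray<float> moving(&dims);

        hoCKOpticalFlowSolver<float, 2> ck;
        ck.set_alpha(0.05f);   // regularization weight (same default as the constructor above)
        ck.set_beta(1.0f);     // intensity-change weight (same default as the constructor above)

        // Assumed base-class call (declared in opticalFlowSolver.h, not shown in this diff);
        // the exact name/signature may differ:
        // boost::shared_ptr< hoNDArray<float> > disp = ck.solve(&fixed, &moving);

        return 0;
    }
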
diff --git a/toolboxes/registration/optical_flow/cpu/hoHSOpticalFlowSolver.cpp b/toolboxes/registration/optical_flow/cpu/hoHSOpticalFlowSolver.cpp
new file mode 100644
index 0000000..8a50319
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/hoHSOpticalFlowSolver.cpp
@@ -0,0 +1,286 @@
+#include "hoHSOpticalFlowSolver.h"
+#include "vector_td_utilities.h"
+#include "vector_td_operators.h"
+
+#ifdef USE_OMP
+#include <omp.h>
+#endif 
+
+namespace Gadgetron{
+
+  // Helpers
+  //
+  
+  template<unsigned int D> inline bool
+  is_border_pixel_for_stride( typename int64d<D>::Type stride, typename uint64d<D>::Type co, typename uint64d<D>::Type dims )
+  {
+    for( size_t d=0; d<D; d++ ){
+      if( stride.vec[d] == -1 ){
+	if( co.vec[d] == 0 ){
+	  return true;
+	}
+      }
+      else if( stride.vec[d] == 1 ){
+	if( co.vec[d] == (dims.vec[d]-1) ){
+	  return true;
+	}
+      }
+    }
+    return false;
+  }
+  
+  template<size_t i, size_t j> struct Pow
+  {
+    enum { Value = i*Pow<i,j-1>::Value };
+  };
+  
+  template <size_t i> struct Pow<i,1>
+  {
+    enum { Value = i };
+  };
+
+  //
+  // Implementation
+  //
+
+  template<class T, unsigned int D> boost::shared_ptr< hoNDArray<T> >
+  hoHSOpticalFlowSolver<T,D>::core_solver( hoNDArray<T> *_gradient_image, hoNDArray<T> *_stencil_image )
+  {
+    // Sanity checks
+    //
+  
+    if( !_gradient_image ){
+      throw std::runtime_error("hoHSOpticalFlowSolver::core_solver(): illegal input gradient image received.");
+    }
+  
+    if( _gradient_image->get_number_of_dimensions() <= D ){
+      throw std::runtime_error("hoHSOpticalFlowSolver::core_solver(): number of gradient image dimensions is too small.");
+    }
+    
+    // The dimensions of the displacement field should match the gradient field
+    // - when removing the temporal gradient component (replacing D+1 with D)
+    //
+  
+    boost::shared_ptr< std::vector<size_t> > disp_dims = _gradient_image->get_dimensions();
+    disp_dims->pop_back(); disp_dims->push_back(D);
+
+    boost::shared_ptr< hoNDArray<T> > displacements_ping(new hoNDArray<T>(disp_dims.get()));
+    boost::shared_ptr< hoNDArray<T> > displacements_pong(new hoNDArray<T>(disp_dims.get()));
+  
+    clear(displacements_ping.get());
+    clear(displacements_pong.get());
+
+    // We use "shared memory" to hold the averaged displacements
+    boost::shared_ptr< hoNDArray<T> > _shared_mem(new hoNDArray<T>(disp_dims.get()));
+    T *shared_mem = _shared_mem->get_data_ptr();
+    clear( _shared_mem.get());
+   
+    typename uint64d<D>::Type matrix_size = from_std_vector<size_t,D>( *_gradient_image->get_dimensions() );  
+    size_t number_of_elements = prod(matrix_size);
+    size_t num_batches = 1;
+    
+    for( size_t d=D; d<_gradient_image->get_number_of_dimensions()-1; d++ ){
+      num_batches *= _gradient_image->get_size(d);
+    }
+    
+    // Get ready...
+    //
+
+    size_t iteration_no = 0;
+    hoNDArray<T> *ping = displacements_ping.get();
+    hoNDArray<T> *pong = displacements_pong.get();
+
+    if( this->output_mode_ >= hoOpticalFlowSolver<T,D>::OUTPUT_VERBOSE ) {
+      GDEBUG_STREAM(std::endl);
+    }
+
+    //
+    // Main Jacobi loop
+    //
+
+    while(true){
+    
+      if( this->output_mode_ >= hoOpticalFlowSolver<T,D>::OUTPUT_VERBOSE ) {
+	GDEBUG_STREAM("."; std::cout.flush());
+      }
+    
+      // Continuation flag used for early Jacobi termination      
+      size_t continue_flag = 0;
+
+      // Number of elements per batch
+      const size_t num_elements_per_batch = prod(matrix_size);
+      
+      // Number of elements per dim
+      const size_t num_elements_per_dim = num_elements_per_batch*num_batches;
+      
+      T *in_disp = ping->get_data_ptr();
+      T *out_disp = pong->get_data_ptr();
+      T *gradient_image = _gradient_image->get_data_ptr();
+      T *stencil_image = (_stencil_image) ? _stencil_image->get_data_ptr() : 0x0;
+
+      //
+      // Find the average velocities (shared memory)
+      //
+      
+      for( size_t dim = 0; dim < D; dim++ ){
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+      for( long long idx = 0; idx < (long long)num_elements_per_dim; idx++ ){
+	  
+	  // Index to the shared memory
+	  const size_t shared_idx = dim*num_elements_per_dim+idx;
+	  
+	  // Batch idx (second slowest varying dimension)   
+	  const size_t batch_idx = idx/num_elements_per_batch;
+	  
+	  // Local index to the image (or batch in our terminology)
+	  const size_t idx_in_batch = idx-batch_idx*num_elements_per_batch;
+    	  
+	  if( stencil_image && stencil_image[idx_in_batch] > T(0) )
+	    continue;
+
+	  // Local co to the image
+	  const typename uint64d<D>::Type co = idx_to_co<D>( idx_in_batch, matrix_size );	  
+	  const typename int64d<D>::Type zeros(0);
+	  const typename int64d<D>::Type ones(1);
+	  const typename int64d<D>::Type threes(3);
+	  
+	  const long long num_neighbors = Pow<3,D>::Value;
+	  T num_contribs = T(0);
+      	  
+	  for( long long i=0; i<num_neighbors; i++ ){
+	    
+	    // Find the stride of the neighbor {-1, 0, 1}^D
+	    const typename int64d<D>::Type stride = idx_to_co<D>( i, threes ) - ones;
+	    
+	    // Verify that the neighbor is not out of bounds (and not the thread itself)
+	    if( !is_border_pixel_for_stride<D>( stride, co, matrix_size ) && !(stride==zeros) ){
+	  
+	      // Compute average of neighbors
+	      //
+	      
+	      const size_t base_offset = dim*num_elements_per_dim + batch_idx*num_elements_per_batch;
+	      const size_t neighbor_idx = (size_t) co_to_idx<D>( vector_td<long long,D>(co)+stride, vector_td<long long,D>(matrix_size)) + base_offset;
+	  
+	      shared_mem[shared_idx] += in_disp[neighbor_idx];
+	      num_contribs += T(1);
+	    }
+	  }
+      
+	  // Normalize
+	  shared_mem[shared_idx] /= num_contribs;       	
+	}
+      }
+      
+      //
+      // Update displacement field (Jacobi iteration)
+      //
+      
+      const T disp_thresh_sqr = this->limit_*this->limit_;
+      
+      for( size_t dim = 0; dim < D; dim++ ){
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+      for( long long idx = 0; idx < (long long)num_elements_per_dim; idx++ ){
+
+	  // Batch idx (second slowest varying dimension)   
+	  const size_t batch_idx = idx/num_elements_per_batch;
+	  
+	  // Local index to the image (or batch in our terminology)
+	  const size_t idx_in_batch = idx-batch_idx*num_elements_per_batch;
+    	  
+	  if( stencil_image && stencil_image[idx_in_batch] > T(0) )
+	    continue;
+
+	  // Index to the shared memory
+	  const size_t shared_idx = dim*num_elements_per_dim+idx;
+
+	  T phi = T(0);
+	  T norm = T(0);
+	  
+	  typename reald<T,D>::Type derivatives;
+	  
+	  // Contributions from the spatial dimensions
+	  //
+	  
+	  for( size_t d=0; d<D; d++ ){
+	    derivatives.vec[d] = gradient_image[d*num_elements_per_dim+idx];
+	    const size_t shared_idx_d = d*num_elements_per_dim+idx;
+	    phi += (shared_mem[shared_idx_d]*derivatives.vec[d]);
+	    norm += (derivatives.vec[d]*derivatives.vec[d]);
+	  }
+	  
+	  // Contributions from the temporal dimension
+	  //
+	  
+	  phi += gradient_image[D*num_elements_per_dim+idx];
+	  
+	  // Normalize
+	  //
+	  
+	  phi /= (alpha_*alpha_+norm);
+	  
+	  // Form result displacement
+	  //
+	  
+	  T result = shared_mem[shared_idx]-derivatives.vec[dim]*phi;
+	  
+	  // Clear the "termination" flag if the displacement field has changed above the threshold
+	  //
+	  
+	  T delta = result-in_disp[dim*num_elements_per_dim+idx];
+	  if( delta*delta > disp_thresh_sqr )
+	    continue_flag = 1;
+	  
+	  // Output result
+	  //
+	  
+	  out_disp[dim*num_elements_per_dim+idx] = result;
+	}
+      }
+      
+      // Swap in/out buffers
+      //
+      
+      hoNDArray<T> *tmp = ping;
+      ping = pong;
+      pong = tmp;
+      
+      // Check termination criteria
+      //
+
+      if( continue_flag == 0 ){
+	if( this->output_mode_ >= hoOpticalFlowSolver<T,D>::OUTPUT_VERBOSE ) {
+	  GDEBUG_STREAM(std::endl << "Break after " << iteration_no+1 << " iterations" << std::endl);
+	}
+	break;
+      }
+    
+      if( iteration_no > this->max_num_iterations_per_level_ ) 
+	break;    
+      
+      iteration_no++;
+    }
+  
+    if( ping == displacements_ping.get() )   
+      return displacements_ping;
+    else
+      return displacements_pong;
+  }
+     
+  // 
+  // Template instantiation
+  //
+  
+  template class EXPORTCPUREG hoHSOpticalFlowSolver<float,1>;
+  template class EXPORTCPUREG hoHSOpticalFlowSolver<float,2>;
+  template class EXPORTCPUREG hoHSOpticalFlowSolver<float,3>;
+  template class EXPORTCPUREG hoHSOpticalFlowSolver<float,4>;
+  
+  template class EXPORTCPUREG hoHSOpticalFlowSolver<double,1>;
+  template class EXPORTCPUREG hoHSOpticalFlowSolver<double,2>;
+  template class EXPORTCPUREG hoHSOpticalFlowSolver<double,3>;
+  template class EXPORTCPUREG hoHSOpticalFlowSolver<double,4>;
+}
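
Analogously, the Horn-Schunck update read directly from the loop in hoHSOpticalFlowSolver::core_solver() above is

    \phi = \frac{\sum_{d=1}^{D} g_d \bar{u}_d + g_t}{\alpha^2 + \sum_{d=1}^{D} g_d^2},
    \qquad u_d \leftarrow \bar{u}_d - g_d\,\phi,

where \bar{u}_d here averages only the existing (in-bounds, non-centre) neighbours, g_d and g_t are the spatial and temporal gradients, and \alpha is the regularization weight. The same limit_ / max_num_iterations_per_level_ termination test is applied.
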
diff --git a/toolboxes/registration/optical_flow/cpu/hoHSOpticalFlowSolver.h b/toolboxes/registration/optical_flow/cpu/hoHSOpticalFlowSolver.h
new file mode 100644
index 0000000..ab1df40
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/hoHSOpticalFlowSolver.h
@@ -0,0 +1,52 @@
+/** \file hoHSOpticalFlowSolver.h
+    \brief CPU-based Horn-Schunck optical flow registration solver.
+
+    References to the solver implementation and some usage scenarios can be found in:
+
+    An optimised multi-baseline approach for on-line MR-temperature monitoring on commodity graphics hardware
+    BD de Senneville, KØ Noe, M Ries, M Pedersen, CTW Moonen, TS Sørensen.
+    5th IEEE International Symposium on Biomedical Imaging: From Nano to Macro, 2008. ISBI 2008. pp. 1513-1516.
+
+    Acceleration and validation of optical flow based deformable registration for image-guided radiotherapy.
+    KØ Noe, BD de Senneville, UV Elstrøm, K Tanderup, TS Sørensen.
+    Acta Oncologica 2008; 47(7): 1286-1293.
+
+    Retrospective reconstruction of high temporal resolution cine images from real‐time MRI using iterative motion correction
+    MS Hansen, TS Sørensen, AE Arai, P Kellman.
+    Magnetic Resonance in Medicine 2012; 68(3): 741-750.
+*/
+
+#pragma once
+
+#include "hoOpticalFlowSolver.h"
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> class EXPORTCPUREG hoHSOpticalFlowSolver 
+    : public hoOpticalFlowSolver<T, D>
+  {
+    
+  public:
+
+    // Constructors / destructors
+    //
+  
+    hoHSOpticalFlowSolver() : hoOpticalFlowSolver<T,D>(){ 
+      alpha_ = T(0.1); 
+    } 
+  
+    virtual ~hoHSOpticalFlowSolver() {}
+  
+    // Set the regularization weight
+    //
+  
+    inline void set_alpha( T alpha ) { alpha_ = alpha; }
+  
+  protected:  
+    virtual boost::shared_ptr< hoNDArray<T> > 
+      core_solver( hoNDArray<T> *gradient_image, hoNDArray<T> *stencil_image );
+    
+  protected:
+    T alpha_;
+  };
+}
diff --git a/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator.cpp b/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator.cpp
new file mode 100644
index 0000000..f2d3a9d
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator.cpp
@@ -0,0 +1,203 @@
+#include "hoLinearResampleOperator.h"
+#include "vector_td_utilities.h"
+#include "vector_td_operators.h"
+#include "hoArmadillo.h"
+
+#include <stdio.h>
+#include <cmath>
+
+namespace Gadgetron{
+
+  template <class T, unsigned int D>
+  void hoLinearResampleOperator<T,D>::mult_M( hoNDArray<T> *in, hoNDArray<T> *out, bool accumulate )
+  {
+    if( !this->preprocessed_ ){
+      throw std::runtime_error("hoLinearResampleOperator::mult_M(): displacements not set." );
+    }
+  
+    if( !in || !in->get_data_ptr() || !out || !out->get_data_ptr() ){
+      throw std::runtime_error("hoLinearResampleOperator::mult_M(): illegal input/output array." );
+    }
+  
+    arma::Row<typename stdType<T>::Type > in_vec = as_arma_row(in);
+    arma::Row<typename stdType<T>::Type > out_vec = as_arma_row(out);
+    out_vec = in_vec*R_T_;
+  }
+
+  template <class T, unsigned int D>
+  void hoLinearResampleOperator<T,D>::mult_MH( hoNDArray<T> *in, hoNDArray<T> *out, bool accumulate )
+  {
+    if( !this->preprocessed_ ){
+      throw std::runtime_error("hoLinearResampleOperator::mult_M(): displacements not set." );
+    }
+  
+    if( !in || !in->get_data_ptr() || !out || !out->get_data_ptr() ){
+      throw std::runtime_error("hoLinearResampleOperator::mult_M(): illegal input/output array." );
+    }
+
+    arma::Col<typename stdType<T>::Type > in_vec = as_arma_col(in);
+    arma::Col<typename stdType<T>::Type > out_vec = as_arma_col(out);
+    out_vec = R_T_ * in_vec;
+  }
+  
+  template <class T, unsigned int D>
+  void hoLinearResampleOperator<T,D>::reset()
+  {
+    R_T_.reset();
+    resampleOperator< hoNDArray<typename realType<T>::Type>, hoNDArray<T> >::reset();
+  }
+  
+  template <class T, unsigned int D> void
+  hoLinearResampleOperator<T,D>::set_displacement_field( boost::shared_ptr< hoNDArray<typename realType<T>::Type> > displacements )
+  {
+    typedef typename realType<T>::Type REAL;
+    this->preprocessed_ = false;
+
+    if( displacements.get() == 0x0 ){
+      throw std::runtime_error("hoLinearResampleOperator::set_displacement_field : displacements ptr is 0x0." );
+    }  
+  
+    const int surplus = displacements->get_number_of_dimensions()-D;
+  
+    if( !( surplus == 1 || surplus == 2 ) ){
+      throw std::runtime_error("hoLinearResampleOperator::set_displacement_field : unexpected array dimensionality." );
+    }  
+  
+    // Determine the number of registrations performed
+    const unsigned int extended_dim = (surplus == 1) ? 1 : displacements->get_size(D); 
+    const unsigned int field_dim = (surplus == 1) ? displacements->get_size(D) : displacements->get_size(D+1);
+
+    if( !(field_dim == D || field_dim == D+1 )){
+      throw std::runtime_error("hoLinearResampleOperator::set_displacement_field : illegal tailing array dim" );
+    }
+  
+    const typename uint64d<D>::Type matrix_size = from_std_vector<size_t,D>( *(displacements->get_dimensions()));
+    const size_t num_elements_mat = prod(matrix_size);
+    const size_t num_elements_ext = prod(matrix_size)*extended_dim;
+    
+    const unsigned int num_neighbors = this->get_num_neighbors();
+    arma::umat locations(2,num_elements_ext*num_neighbors);
+    arma::Col<typename realType<T>::Type > values(num_elements_ext*num_neighbors);
+    size_t location_index = 0;
+
+    for( size_t idx=0; idx<num_elements_ext; idx++ ){
+    
+      const size_t batch_no = idx/num_elements_mat;
+      const size_t idx_in_batch = idx-batch_no*num_elements_mat;
+    
+      const typename uint64d<D>::Type co = idx_to_co<D>( idx_in_batch, matrix_size );
+      typename reald<REAL,D>::Type co_disp = vector_td<REAL,D>(co);
+      for( unsigned int dim=0; dim<D; dim++ ){
+        REAL tmp = displacements->get_data_ptr()[dim*num_elements_ext+batch_no*num_elements_mat+idx_in_batch];
+        co_disp.vec[dim] += tmp;
+      } 
+    
+      // Determine the number of neighbors
+      //
+    
+      const typename uint64d<D>::Type twos(2);
+    
+      // Weights are non-zero only if all neighbors exist
+      //
+    
+      if( this->is_border_pixel(co_disp, matrix_size) )
+        continue;
+    
+      // Iterate over all neighbors
+      //
+    
+      size_t mat_j = idx;
+      size_t mat_i;
+    
+      for( unsigned int i=0; i<num_neighbors; i++ ){
+      
+        // Determine image coordinate of current neighbor
+        //
+        
+        const typename uint64d<D>::Type stride = idx_to_co<D>( i, twos );
+        
+        if( weak_greater_equal( stride, matrix_size ) ) continue; // For dimensions of size 1
+        
+        typename reald<REAL,D>::Type co_stride;
+      
+        for( unsigned int dim=0; dim<D; dim++ ){
+          if( stride.vec[dim] == 0 ){
+            co_stride.vec[dim] = std::floor(co_disp.vec[dim]);
+          }
+          else{
+            co_stride.vec[dim] = std::ceil(co_disp.vec[dim]);
+            if( co_stride.vec[dim] == co_disp.vec[dim] )
+              co_stride.vec[dim] += REAL(1.0);
+          }
+        }
+
+        // Validate that the coordinate is within the expected range
+        //
+
+        typename uint64d<D>::Type ones(1);
+        typename uint64d<D>::Type co_stride_uint64d = vector_td<size_t,D>(co_stride);
+
+        if( weak_greater( co_stride_uint64d, matrix_size-ones ) ){
+
+          for( unsigned int dim=0; dim<D; dim++ ){
+            if( co_stride[dim] < REAL(0) )
+              co_stride_uint64d[dim] = 0;
+            if( co_stride[dim] > (REAL(matrix_size[dim])-REAL(1)) )
+              co_stride_uint64d[dim] = matrix_size[dim]-1;
+          }
+        }
+	
+        mat_i = co_to_idx<D>(co_stride_uint64d, matrix_size)+batch_no*num_elements_mat;
+      
+        // Determine weight
+        //
+      
+        REAL weight = REAL(1);
+      
+        for( unsigned int dim=0; dim<D; dim++ ){	  
+          if( stride.vec[dim] == 0 ){
+            weight *= (REAL(1.0)-(co_disp.vec[dim]-co_stride.vec[dim])); }
+          else{
+            weight *= (REAL(1.0)-(co_stride.vec[dim]-co_disp.vec[dim])); }
+        }
+      
+        locations(0,location_index) = mat_i;
+        locations(1,location_index) = mat_j;
+        values(location_index) = weight;
+        location_index++;
+      }
+    }
+    locations.resize(2,location_index);
+    values.resize(location_index);
+    R_T_ = arma::SpMat<REAL>( locations, values, num_elements_mat*extended_dim, num_elements_ext, false );
+    this->preprocessed_ = true;
+  }
+
+  template <class T, unsigned int D> bool
+  hoLinearResampleOperator<T,D>::is_border_pixel( typename reald<typename realType<T>::Type,D>::Type co, typename uint64d<D>::Type dims )
+  {
+    typedef typename realType<T>::Type REAL;
+
+    for( unsigned int dim=0; dim<D; dim++ ){
+      if( dims[dim] > 1 && ( co[dim] < REAL(0) || co[dim] >= (REAL(dims[dim])-REAL(1)) ) )
+        return true;
+    }
+    return false;
+  }
+
+  template <class T, unsigned int D> unsigned int
+  hoLinearResampleOperator<T,D>::get_num_neighbors()
+  {
+    return 1 << D;
+  }
+  
+  template class EXPORTCPUREG hoLinearResampleOperator<float,1>;
+  template class EXPORTCPUREG hoLinearResampleOperator<float,2>;
+  template class EXPORTCPUREG hoLinearResampleOperator<float,3>;
+  template class EXPORTCPUREG hoLinearResampleOperator<float,4>;
+
+  template class EXPORTCPUREG hoLinearResampleOperator<double,1>;
+  template class EXPORTCPUREG hoLinearResampleOperator<double,2>;
+  template class EXPORTCPUREG hoLinearResampleOperator<double,3>;
+  template class EXPORTCPUREG hoLinearResampleOperator<double,4>;
+}
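
The sparse matrix assembled by set_displacement_field() above holds ordinary multilinear (bilinear/trilinear) interpolation weights. For a displaced coordinate p and each of the 2^D grid points n of the surrounding cell, the per-dimension floor/ceil branches in the weight loop evaluate

    w(n) = \prod_{d=1}^{D} \bigl(1 - |p_d - n_d|\bigr).

Displaced points classified by is_border_pixel() contribute no entries, so the corresponding columns of the transposed matrix stay empty and mult_M() returns zero at those samples.
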
diff --git a/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator.h b/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator.h
new file mode 100644
index 0000000..c00d689
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include "hoNDArray_math.h"
+#include "resampleOperator.h"
+#include "complext.h"
+#include "cpureg_export.h"
+#include "hoArmadillo.h"
+
+namespace Gadgetron{
+
+  template <class T, unsigned int D>
+  class EXPORTCPUREG hoLinearResampleOperator : public resampleOperator<hoNDArray<typename realType<T>::Type>, hoNDArray<T> >
+  {  
+  public:
+    
+    hoLinearResampleOperator() : resampleOperator<hoNDArray<typename realType<T>::Type>, hoNDArray<T> >() {}
+    virtual ~hoLinearResampleOperator() {}
+  
+    virtual void mult_M( hoNDArray<T> *in, hoNDArray<T> *out, bool accumulate = false);
+    virtual void mult_MH( hoNDArray<T> *in, hoNDArray<T> *out, bool accumulate = false);
+    virtual void set_displacement_field( boost::shared_ptr< hoNDArray<typename realType<T>::Type> > offsets );
+    virtual void reset();
+  
+  private:
+    inline bool is_border_pixel( typename reald<typename realType<T>::Type,D>::Type co, typename uint64d<D>::Type dims );
+    inline unsigned int get_num_neighbors();
+  
+  protected:
+    arma::SpMat<typename realType<T>::Type> R_T_; //Contains the TRANSPOSED resampling matrix.
+  };
+}
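
A minimal sketch of driving this operator, using only the interface declared above plus the hoNDArray constructor and the free clear() function seen in the sources earlier in this diff; the 128x128 geometry is purely illustrative.

    #include "hoLinearResampleOperator.h"
    #include "hoNDArray_elemwise.h"   // clear()

    #include <boost/shared_ptr.hpp>
    #include <vector>

    using namespace Gadgetron;

    int main()
    {
        // Illustrative 128x128 image with one 2D displacement vector per pixel
        std::vector<size_t> img_dims(2, 128);
        std::vector<size_t> disp_dims(img_dims);
        disp_dims.push_back(2);

        boost::shared_ptr< hoNDArray<float> > disp(new hoNDArray<float>(&disp_dims));
        clear(disp.get());                              // zero displacements = identity warp

        hoNDArray<float> in(&img_dims), out(&img_dims); // source image and resampled result

        hoLinearResampleOperator<float, 2> R;
        R.set_displacement_field(disp);                 // assembles the transposed sparse matrix R_T_
        R.mult_M(&in, &out);                            // out = 'in' resampled along the displacement field
        // R.mult_MH(&out, &in);                        // adjoint, e.g. inside iterative linear solvers

        return 0;
    }
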
diff --git a/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator_eigen.cpp b/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator_eigen.cpp
new file mode 100644
index 0000000..7b610f8
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator_eigen.cpp
@@ -0,0 +1,206 @@
+#include "hoLinearResampleOperator_eigen.h"
+#include "vector_td_utilities.h"
+#include "vector_td_operators.h"
+
+#include "GadgetronTimer.h"
+
+#include <stdio.h>
+#include <algorithm>
+#include <Eigen/Core>
+
+namespace Gadgetron{
+
+    template <class T, unsigned int D> void
+        hoLinearResampleOperator_eigen<T,D>::mult_M( hoNDArray<T> *in, hoNDArray<T> *out, bool accumulate )
+    {
+        if( !this->preprocessed_ ){
+            throw std::runtime_error("hoLinearResampleOperator::mult_M(): displacements not set." );
+        }
+
+        if( !in || !in->get_data_ptr() || !out || !out->get_data_ptr() ){
+            throw std::runtime_error("hoLinearResampleOperator::mult_M(): illegal input/output array." );
+        }
+
+        Eigen::Map< Eigen::Matrix<typename realType<T>::Type, 1, Eigen::Dynamic> > in_vec( in->get_data_ptr(), in->get_number_of_elements() );
+        Eigen::Map< Eigen::Matrix<typename realType<T>::Type, 1, Eigen::Dynamic> > out_vec( out->get_data_ptr(), out->get_number_of_elements() );
+
+        out_vec = in_vec * (*R_);
+    }
+
+    template <class T, unsigned int D> void
+        hoLinearResampleOperator_eigen<T,D>::mult_MH( hoNDArray<T> *in, hoNDArray<T> *out, bool accumulate )
+    {
+        if( !this->preprocessed_ ){
+            throw std::runtime_error("hoLinearResampleOperator::mult_M(): displacements not set." );
+        }
+
+        if( !in || !in->get_data_ptr() || !out || !out->get_data_ptr() ){
+            throw std::runtime_error("hoLinearResampleOperator::mult_M(): illegal input/output array." );
+        }
+
+        Eigen::Map< Eigen::Matrix<typename realType<T>::Type, Eigen::Dynamic, 1> > in_vec( in->get_data_ptr(), in->get_number_of_elements() );
+        Eigen::Map< Eigen::Matrix<typename realType<T>::Type, Eigen::Dynamic, 1> > out_vec( out->get_data_ptr(), out->get_number_of_elements() );
+
+        out_vec = (*R_) * in_vec;
+    }
+
+    template <class T, unsigned int D> void
+        hoLinearResampleOperator_eigen<T,D>::set_displacement_field( boost::shared_ptr< hoNDArray<typename realType<T>::Type> > displacements )
+    {
+        if( displacements.get() == 0x0 ){
+            throw std::runtime_error("hoLinearResampleOperator_eigen_eigen::set_displacement_field : displacements ptr is 0x0." );
+        }  
+
+        const int surplus = displacements->get_number_of_dimensions()-D;
+
+        if( !( surplus == 1 || surplus == 2 ) ){
+            throw std::runtime_error("hoLinearResampleOperator_eigen::set_displacement_field : unexpected array dimensionality." );
+        }  
+
+        // Determine the number of registrations performed
+        const size_t extended_dim = (surplus == 1) ? 1 : displacements->get_size(D); 
+        temporal_dim_size_ = extended_dim;
+
+        const size_t field_dim = (surplus == 1) ? displacements->get_size(D) : displacements->get_size(D+1);
+
+        if( !(field_dim == D || field_dim == D+1 )){
+            throw std::runtime_error("hoLinearResampleOperator_eigen::set_displacement_field : illegal tailing array dim" );
+        }
+
+        const typename uint64d<D>::Type matrix_size = from_std_vector<size_t,D>( *(displacements->get_dimensions()));
+
+        const size_t num_elements_mat = prod(matrix_size);
+        const size_t num_elements_ext = prod(matrix_size)*extended_dim;
+
+        R_ = boost::shared_ptr< Eigen::SparseMatrix<typename realType<T>::Type> >
+            ( new Eigen::SparseMatrix<typename realType<T>::Type>( num_elements_mat, num_elements_ext ) );
+
+        std::vector< Eigen::Triplet<typename realType<T>::Type> > coefficients;
+
+        for( size_t idx=0; idx<num_elements_ext; idx++ ){
+
+            const size_t batch_no = idx/num_elements_mat;
+            const size_t idx_in_batch = idx-batch_no*num_elements_mat;
+
+            const typename uint64d<D>::Type co = idx_to_co<D>( idx_in_batch, matrix_size );
+
+            typename reald<typename realType<T>::Type,D>::Type co_disp = to_reald<typename realType<T>::Type,size_t,D>(co);
+            for( size_t dim=0; dim<D; dim++ ){
+                typename realType<T>::Type tmp = displacements->get_data_ptr()[dim*num_elements_ext+batch_no*num_elements_mat+idx_in_batch];
+                co_disp.vec[dim] += tmp;
+            } 
+
+            // Determine the number of neighbors
+            //
+
+            const typename uint64d<D>::Type twos = to_vector_td<size_t,D>(2);
+            const size_t num_neighbors = this->get_num_neighbors();
+
+            // Weights are non-zero only if all neighbors exist
+            //
+
+            if( this->is_border_pixel(co_disp, matrix_size) )
+                continue;
+
+            // Iterate over all neighbors
+            //
+
+            //
+            // Eigen asks us to build the matrix column by column;
+            // it is therefore easier to construct the transpose
+            //
+
+            size_t mat_j = idx;
+            size_t mat_i;
+
+            for( size_t i=0; i<num_neighbors; i++ ){
+
+                // Determine image coordinate of current neighbor
+                //
+
+                const typename uint64d<D>::Type stride = idx_to_co<D>( i, twos );
+
+                if( weak_greater_equal( stride, matrix_size ) ) continue; // For dimensions of size 1
+
+                typename reald<typename realType<T>::Type,D>::Type co_stride;
+
+                for( size_t dim=0; dim<D; dim++ ){
+                    if( stride.vec[dim] == 0 ){
+                        co_stride.vec[dim] = std::floor(co_disp.vec[dim]);
+                    }
+                    else{
+                        co_stride.vec[dim] = std::ceil(co_disp.vec[dim]);
+                        if( co_stride.vec[dim] == co_disp.vec[dim] )
+                            co_stride.vec[dim] += typename realType<T>::Type(1.0);
+                    }
+                }
+
+                // Validate that the coordinate is within the expected range
+                //
+
+                typename uint64d<D>::Type ones = to_vector_td<size_t,D>(1);
+                typename uint64d<D>::Type co_stride_uint64d = to_uint64d<typename realType<T>::Type,D>(co_stride);
+
+                if( weak_greater( co_stride_uint64d, matrix_size-ones ) ){
+
+                    for( size_t dim=0; dim<D; dim++ ){
+                        if( co_stride[dim] < typename realType<T>::Type(0) )
+                            co_stride_uint64d[dim] = 0;
+                        if( co_stride[dim] > (typename realType<T>::Type(matrix_size[dim])-typename realType<T>::Type(1)) )
+                            co_stride_uint64d[dim] = matrix_size[dim]-1;
+                    }
+                }
+
+                mat_i = co_to_idx<D>(co_stride_uint64d, matrix_size);
+
+                // Determine weight
+                //
+
+                typename realType<T>::Type weight = typename realType<T>::Type(1);
+
+                for( size_t dim=0; dim<D; dim++ ){	  
+                    if( stride.vec[dim] == 0 ){
+                        weight *= (typename realType<T>::Type(1.0)-(co_disp.vec[dim]-co_stride.vec[dim])); }
+                    else{
+                        weight *= (typename realType<T>::Type(1.0)-(co_stride.vec[dim]-co_disp.vec[dim])); }
+                }
+
+                // Insert weight in resampling matrix R_
+                //
+
+                //R_->insert( mat_i, mat_j ) =  weight;
+                coefficients.push_back(Eigen::Triplet<typename realType<T>::Type>(mat_i, mat_j, weight));
+            }
+        }  
+        //R_->finalize();
+        R_->setFromTriplets(coefficients.begin(), coefficients.end());
+        this->preprocessed_ = true;
+    }
+
+    template <class T, unsigned int D> bool
+        hoLinearResampleOperator_eigen<T,D>::is_border_pixel( typename reald<typename realType<T>::Type,D>::Type co, typename uint64d<D>::Type dims )
+    {
+        for( size_t dim=0; dim<D; dim++ ){
+            if( dims[dim] > 1 && ( co[dim] < typename realType<T>::Type(0) || co[dim] >= (typename realType<T>::Type(dims[dim])-typename realType<T>::Type(1)) ) )
+                return true;
+        }
+        return false;
+    }
+
+    template <class T, unsigned int D> size_t
+        hoLinearResampleOperator_eigen<T,D>::get_num_neighbors()
+    {
+        return 1 << D;
+    }
+
+
+    template class EXPORTCPUREG hoLinearResampleOperator_eigen<float,1>;
+    template class EXPORTCPUREG hoLinearResampleOperator_eigen<float,2>;
+    template class EXPORTCPUREG hoLinearResampleOperator_eigen<float,3>;
+    template class EXPORTCPUREG hoLinearResampleOperator_eigen<float,4>;
+
+    template class EXPORTCPUREG hoLinearResampleOperator_eigen<double,1>;
+    template class EXPORTCPUREG hoLinearResampleOperator_eigen<double,2>;
+    template class EXPORTCPUREG hoLinearResampleOperator_eigen<double,3>;
+    template class EXPORTCPUREG hoLinearResampleOperator_eigen<double,4>;
+}
diff --git a/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator_eigen.h b/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator_eigen.h
new file mode 100644
index 0000000..61b63a2
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator_eigen.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include "hoNDArray_math.h"
+
+#include "resampleOperator.h"
+#include "complext.h"
+#include "cpureg_export.h"
+
+#include <armadillo>
+#include <Eigen/Sparse>
+
+namespace Gadgetron{
+
+  template <class T, unsigned int D>
+  class EXPORTCPUREG hoLinearResampleOperator_eigen : public resampleOperator<hoNDArray<typename realType<T>::Type>, hoNDArray<T> >
+  {  
+  public:
+  
+    hoLinearResampleOperator_eigen() : resampleOperator<hoNDArray<typename realType<T>::Type>, hoNDArray<T> >() {}
+    virtual ~hoLinearResampleOperator_eigen() {}
+  
+    virtual void mult_M( hoNDArray<T> *in, hoNDArray<T> *out, bool accumulate = false);
+    virtual void mult_MH( hoNDArray<T> *in, hoNDArray<T> *out, bool accumulate = false);
+    virtual void set_displacement_field( boost::shared_ptr< hoNDArray<typename realType<T>::Type> > offsets );
+  
+    virtual size_t get_temporal_dimension_size() { return temporal_dim_size_; }
+  
+    virtual boost::shared_ptr< linearOperator< hoNDArray<T> > > clone() {
+      return linearOperator< hoNDArray<T> >::clone(this);
+    }
+  
+  private:
+    inline bool is_border_pixel( typename reald<typename realType<T>::Type,D>::Type co, typename uint64d<D>::Type dims );
+    inline size_t get_num_neighbors();
+  
+  protected:
+    boost::shared_ptr< Eigen::SparseMatrix<typename realType<T>::Type> > R_;
+    size_t temporal_dim_size_;
+  };
+}
diff --git a/toolboxes/registration/optical_flow/cpu/hoOpticalFlowSolver.cpp b/toolboxes/registration/optical_flow/cpu/hoOpticalFlowSolver.cpp
new file mode 100644
index 0000000..147eae9
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/hoOpticalFlowSolver.cpp
@@ -0,0 +1,183 @@
+#include "hoOpticalFlowSolver.h"
+#include "vector_td_utilities.h"
+
+#include <algorithm>
+
+namespace Gadgetron{
+
+  // Helpers
+  //
+
+  template<unsigned int D> inline typename uint64d<D>::Type 
+  compute_stride( size_t dim )
+  {
+    typename uint64d<D>::Type res;
+  
+    for( size_t d=0; d<D; d++ ){
+      res.vec[d] = (d==dim) ? 1 : 0;
+    }
+    return res;
+  }
+
+  template<unsigned int D> inline bool 
+  is_border_pixel_in_stride_dim_before( size_t dim, typename uint64d<D>::Type co, typename uint64d<D>::Type dims )
+  {
+    if( co.vec[dim] == 0 )
+      return true;
+    else
+      return false;
+  }
+
+  template<unsigned int D> inline bool 
+  is_border_pixel_in_stride_dim_after( size_t dim, typename uint64d<D>::Type co, typename uint64d<D>::Type dims )
+  {
+    if( co.vec[dim] == (dims.vec[dim]-1) )
+      return true;
+    else
+      return false;
+  }
+    
+  template<class T, unsigned int D> void
+  hoOpticalFlowSolver<T,D>::core_grad_spatial( T *fixed_image, T *moving_image, T *gradient_image, 
+						  typename uint64d<D>::Type matrix_size, 
+						  size_t num_batches_fixed, 
+						  size_t num_batches_moving )
+  {        
+    // Number of elements per partial derivative
+    const size_t num_elements_per_batch = prod(matrix_size);
+    const size_t num_elements_per_pdev_fixed = num_elements_per_batch*num_batches_fixed;
+    const size_t num_elements_per_pdev_moving = num_elements_per_batch*num_batches_moving;
+
+    // Total number of elements for all partial derivatives
+    const size_t num_elements_total = std::max(num_elements_per_pdev_fixed, num_elements_per_pdev_moving)*D;
+  
+    for( size_t idx = 0; idx<num_elements_total; idx++ ){
+    
+      // The (minimum) index in the slowest varying output dimension determines which partial derivative to compute 
+      const size_t stride_dim_fixed = idx/(num_elements_per_pdev_fixed);
+      const size_t stride_dim_moving = idx/(num_elements_per_pdev_moving);
+      const size_t stride_dim = std::min(stride_dim_fixed, stride_dim_moving);
+
+      // Local index to the partial derivative
+      const size_t idx_in_pdev_fixed = idx-stride_dim_fixed*num_elements_per_pdev_fixed;
+      const size_t idx_in_pdev_moving = idx-stride_dim_moving*num_elements_per_pdev_moving;
+
+      // Batch idx (second slowest varying dimension)   
+      const size_t batch_idx_fixed = idx_in_pdev_fixed/num_elements_per_batch;
+      const size_t batch_idx_moving = idx_in_pdev_moving/num_elements_per_batch;
+
+      // Local index to the batch (should be identical for the fixed/moving image)
+      const size_t idx_in_batch = idx_in_pdev_moving-batch_idx_moving*num_elements_per_batch;
+
+      // Local co to the image
+      const typename uint64d<D>::Type co = idx_to_co<D>( idx_in_batch, matrix_size );
+ 
+      T res;
+      size_t count = 0;
+
+      //
+      // Find partial derivatives using central differences
+      //
+    
+      const typename uint64d<D>::Type stride = compute_stride<D>(stride_dim);
+      const size_t base_idx_moving = batch_idx_moving*num_elements_per_batch;
+      const size_t base_idx_fixed = batch_idx_fixed*num_elements_per_batch;
+
+      size_t stride_base_idx, fixed_idx, moving_idx;
+     
+      // Neighbor "plus stride" side
+      if( !is_border_pixel_in_stride_dim_after<D>( stride_dim, co, matrix_size )){
+        stride_base_idx = co_to_idx<D>(co+stride, matrix_size);
+        count++;
+      }
+      else{
+        stride_base_idx = idx_in_batch;
+      }
+    
+      fixed_idx = stride_base_idx+base_idx_fixed;
+      moving_idx = stride_base_idx+base_idx_moving;
+      
+      res = (fixed_image[fixed_idx]+moving_image[moving_idx])*T(0.5);
+
+      // Neighbor "minus stride" side
+      if( !is_border_pixel_in_stride_dim_before<D>( stride_dim, co, matrix_size )){
+        stride_base_idx = co_to_idx<D>(co-stride, matrix_size);
+        count++;
+      }
+      else{
+        stride_base_idx = co_to_idx<D>(co, matrix_size);
+      }
+    
+      fixed_idx = stride_base_idx+base_idx_fixed;
+      moving_idx = stride_base_idx+base_idx_moving;
+    
+      res -= (fixed_image[fixed_idx]+moving_image[moving_idx])*T(0.5);
+
+      if( count == 2 ) // Both neighbors exist
+        res /= T(2);
+
+      // Output result
+      //
+    
+      gradient_image[idx] = res;
+    }
+  }
+  
+  template<class T, unsigned int D> void
+  hoOpticalFlowSolver<T,D>::core_grad_temporal( T *fixed_image, T *moving_image, T *gradient_image, 
+						   typename uint64d<D>::Type matrix_size, 
+						   size_t num_batches_fixed, 
+						   size_t num_batches_moving )
+  {        
+    // Number of elements per partial derivative
+    const size_t num_elements_per_batch = prod(matrix_size);
+    const size_t num_elements_per_pdev_fixed = num_elements_per_batch*num_batches_fixed;
+    const size_t num_elements_per_pdev_moving = num_elements_per_batch*num_batches_moving;
+
+    // Total number of elements for all partial derivatives
+    const size_t num_elements_total = std::max(num_elements_per_pdev_fixed, num_elements_per_pdev_moving);
+  
+    for( size_t idx =0; idx < num_elements_total; idx++ ){
+      
+      // Local index to the partial derivative
+      const size_t stride_dim_fixed = idx/(num_elements_per_pdev_fixed);
+      const size_t stride_dim_moving = idx/(num_elements_per_pdev_moving);
+      const size_t idx_in_pdev_fixed = idx-stride_dim_fixed*num_elements_per_pdev_fixed;
+      const size_t idx_in_pdev_moving = idx-stride_dim_moving*num_elements_per_pdev_moving;
+
+      // Batch idx (second slowest varying dimension)   
+      const size_t batch_idx_fixed = idx_in_pdev_fixed/num_elements_per_batch;
+      const size_t batch_idx_moving = idx_in_pdev_moving/num_elements_per_batch;
+
+      // Local index to the batch (should be identical for the fixed/moving image)
+      const size_t idx_in_batch = idx_in_pdev_moving-batch_idx_moving*num_elements_per_batch;
+      const size_t base_idx_fixed = batch_idx_fixed*num_elements_per_batch;
+      const size_t base_idx_moving = batch_idx_moving*num_elements_per_batch;
+    
+      // Ctr pixel
+      const size_t fixed_idx = idx_in_batch+base_idx_fixed;
+      const size_t moving_idx = idx_in_batch+base_idx_moving;
+    
+      const T res = moving_image[moving_idx]-fixed_image[fixed_idx];
+    
+      // Output result
+      //
+    
+      gradient_image[idx] = res;        
+    }    
+  }
+  
+  // 
+  // Template instantiation
+  //
+
+  template class EXPORTCPUREG hoOpticalFlowSolver<float,1>;
+  template class EXPORTCPUREG hoOpticalFlowSolver<float,2>;
+  template class EXPORTCPUREG hoOpticalFlowSolver<float,3>;
+  template class EXPORTCPUREG hoOpticalFlowSolver<float,4>;
+
+  template class EXPORTCPUREG hoOpticalFlowSolver<double,1>;
+  template class EXPORTCPUREG hoOpticalFlowSolver<double,2>;
+  template class EXPORTCPUREG hoOpticalFlowSolver<double,3>;
+  template class EXPORTCPUREG hoOpticalFlowSolver<double,4>;
+}
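
Read from the two kernels above: at interior pixels, core_grad_spatial() forms the central difference of the fixed/moving average,

    g_d(x) = \frac{(I_f + I_m)(x + e_d) - (I_f + I_m)(x - e_d)}{4},

falling back to a one-sided difference of the same averaged image when the neighbour in direction e_d lies outside the image, while core_grad_temporal() returns the plain intensity difference

    g_t(x) = I_m(x) - I_f(x).
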
diff --git a/toolboxes/registration/optical_flow/cpu/hoOpticalFlowSolver.h b/toolboxes/registration/optical_flow/cpu/hoOpticalFlowSolver.h
new file mode 100644
index 0000000..9bc6dd6
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/hoOpticalFlowSolver.h
@@ -0,0 +1,46 @@
+/** \file hoOpticalFlowSolver.h
+    \brief Abstract class for a CPU-based optical flow registration solver.
+
+    hoOpticalFlowSolver is derived from class opticalFlowSolver 
+    and implements the computation of the spatial and temporal gradients.
+    The specific algorithm (Horn-Schunck, Cornelius-Kanade) is supplied by subclasses through a pure virtual function.
+*/
+
+#pragma once
+
+#include "hoNDArray.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDArray_utils.h"
+#include "hoNDArray_math.h"
+#include "opticalFlowSolver.h"
+#include "cpureg_export.h"
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> class EXPORTCPUREG hoOpticalFlowSolver 
+    : public opticalFlowSolver< hoNDArray<T>,D >
+  {  
+  public:
+  
+    hoOpticalFlowSolver() : opticalFlowSolver< hoNDArray<T>,D >() {}   
+    virtual ~hoOpticalFlowSolver() {}
+    
+  protected:
+
+    // Inherited and still pure virtual...
+    //virtual boost::shared_ptr< hoNDArray<T> > core_solver( hoNDArray<T> *gradient_image, hoNDArray<T> *stencil_image ) = 0;      
+
+    // CPU-based computation of the spatial and temporal image gradient
+    //
+    
+    virtual void core_grad_spatial( T *fixed_image, T *moving_image, T *gradient_image, 
+				    typename uint64d<D>::Type matrix_size_moving, 
+				    size_t number_of_batches_fixed, 
+				    size_t number_of_batches_moving );
+    
+    virtual void core_grad_temporal( T *fixed_image, T *moving_image, T *gradient_image, 
+				     typename uint64d<D>::Type matrix_size_moving, 
+				     size_t number_of_batches_fixed, 
+				     size_t number_of_batches_moving );
+  };  
+}
diff --git a/toolboxes/registration/optical_flow/cpu/register/hoImageRegDeformationFieldBidirectionalRegister.h b/toolboxes/registration/optical_flow/cpu/register/hoImageRegDeformationFieldBidirectionalRegister.h
new file mode 100644
index 0000000..54e643d
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/register/hoImageRegDeformationFieldBidirectionalRegister.h
@@ -0,0 +1,501 @@
+/** \file   hoImageRegDeformationFieldBidirectionalRegister.h
+    \brief  Define the class to perform non-rigid image registration to estimate bi-directional variational deformation fields
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoImageRegDeformationFieldRegister.h"
+
+namespace Gadgetron
+{
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    class hoImageRegDeformationFieldBidirectionalRegister : public hoImageRegDeformationFieldRegister<ValueType, CoordType, D>
+    {
+    public:
+
+        typedef hoImageRegDeformationFieldBidirectionalRegister<ValueType, CoordType, D> Self;
+        typedef hoImageRegDeformationFieldRegister<ValueType, CoordType, D> BaseClass;
+        typedef hoImageRegNonParametricRegister<ValueType, CoordType, D, D> NonParametricRegisterClass;
+
+        typedef typename BaseClass::TargetType TargetType;
+        typedef typename BaseClass::SourceType SourceType;
+
+        typedef typename BaseClass::Target2DType Target2DType;
+        typedef typename BaseClass::Source2DType Source2DType;
+
+        typedef typename BaseClass::Target3DType Target3DType;
+        typedef typename BaseClass::Source3DType Source3DType;
+
+        typedef ValueType T;
+        typedef ValueType element_type;
+        typedef ValueType value_type;
+
+        /// boundary handler and interpolator for target image
+        typedef typename BaseClass::BoundaryHandlerTargetType BoundaryHandlerTargetType;
+        typedef typename BaseClass::BoundaryHandlerTargetFixedValueType BoundaryHandlerTargetFixedValueType;
+        typedef typename BaseClass::BoundaryHandlerTargetBorderValueType BoundaryHandlerTargetBorderValueType;
+        typedef typename BaseClass::BoundaryHandlerTargetPeriodicType BoundaryHandlerTargetPeriodicType;
+        typedef typename BaseClass::BoundaryHandlerTargetMirrorType BoundaryHandlerTargetMirrorType;
+
+        typedef typename BaseClass::InterpTargetType InterpTargetType;
+        typedef typename BaseClass::InterpTargetLinearType InterpTargetLinearType;
+        typedef typename BaseClass::InterpTargetNearestNeighborType InterpTargetNearestNeighborType;
+        typedef typename BaseClass::InterpTargetBSplineType InterpTargetBSplineType;
+
+        /// boundary handler and interpolator for source image
+        typedef typename BaseClass::BoundaryHandlerSourceType BoundaryHandlerSourceType;
+        typedef typename BaseClass::BoundaryHandlerSourceFixedValueType BoundaryHandlerSourceFixedValueType;
+        typedef typename BaseClass::BoundaryHandlerSourceBorderValueType BoundaryHandlerSourceBorderValueType;
+        typedef typename BaseClass::BoundaryHandlerSourcePeriodicType BoundaryHandlerSourcePeriodicType;
+        typedef typename BaseClass::BoundaryHandlerSourceMirrorType BoundaryHandlerSourceMirrorType;
+
+        typedef typename BaseClass::InterpSourceType InterpSourceType;
+        typedef typename BaseClass::InterpSourceLinearType InterpSourceLinearType;
+        typedef typename BaseClass::InterpSourceNearestNeighborType InterpSourceNearestNeighborType;
+        typedef typename BaseClass::InterpSourceBSplineType InterpSourceBSplineType;
+
+        /// warper type
+        typedef typename BaseClass::WarperType WarperType;
+
+        /// image dissimilarity type
+        typedef typename BaseClass::DissimilarityType DissimilarityType;
+
+        /// transformation type
+        typedef hoImageRegDeformationField<CoordType, D> TransformationType;
+        typedef typename TransformationType::input_point_type input_point_type;
+        typedef typename TransformationType::output_point_type output_point_type;
+        typedef typename TransformationType::jacobian_position_type jacobian_position_type;
+        typedef typename TransformationType::DeformationFieldType DeformationFieldType;
+        typedef typename TransformationType::coord_type coord_type;
+
+        /// solver type
+        typedef hoImageRegDeformationFieldBidirectionalSolver<ValueType, CoordType, D> SolverType;
+
+        hoImageRegDeformationFieldBidirectionalRegister(unsigned int resolution_pyramid_levels=3, bool use_world_coordinates=false, ValueType bg_value=ValueType(0));
+        virtual ~hoImageRegDeformationFieldBidirectionalRegister();
+
+        /// initialize the registration
+        /// should be called after all images and parameters of registration are set
+        virtual bool initialize();
+
+        /// perform the registration
+        virtual bool performRegistration();
+
+        virtual void printContent(std::ostream& os) const;
+        virtual void print(std::ostream& os) const;
+
+        /// parameters
+
+        using BaseClass::use_world_coordinates_;
+        using BaseClass::resolution_pyramid_divided_by_2_;
+        using BaseClass::resolution_pyramid_levels_;
+        using BaseClass::resolution_pyramid_downsample_ratio_;
+        using BaseClass::resolution_pyramid_blurring_sigma_;
+        using BaseClass::boundary_handler_type_warper_;
+        using BaseClass::interp_type_warper_;
+        using BaseClass::boundary_handler_type_pyramid_construction_;
+        using BaseClass::interp_type_pyramid_construction_;
+        using BaseClass::dissimilarity_type_;
+        using BaseClass::apply_in_FOV_constraint_;
+        using BaseClass::solver_type_;
+
+        using BaseClass::gt_timer1_;
+        using BaseClass::gt_timer2_;
+        using BaseClass::gt_timer3_;
+        using BaseClass::performTiming_;
+        using BaseClass::gt_exporter_;
+        using BaseClass::debugFolder_;
+
+        using BaseClass::max_iter_num_pyramid_level_;
+        using BaseClass::dissimilarity_thres_pyramid_level_;
+        using BaseClass::div_num_pyramid_level_;
+        using BaseClass::step_size_para_pyramid_level_;
+        using BaseClass::step_size_div_para_pyramid_level_;
+        using BaseClass::regularization_hilbert_strength_world_coordinate_;
+        using BaseClass::regularization_hilbert_strength_pyramid_level_;
+        using BaseClass::verbose_;
+
+        /// number of iterations to improve the estimation of the inverse transform
+        std::vector<unsigned int> inverse_deform_enforce_iter_pyramid_level_;
+        /// weight used to update the estimation of the inverse transform; must be within [0, 1]
+        std::vector<CoordType> inverse_deform_enforce_weight_pyramid_level_;
+
+        /// set the default parameters
+        virtual bool setDefaultParameters(unsigned int resolution_pyramid_levels, bool use_world_coordinates);
+
+        /// deformation field transformation, defined in the grid of target image
+        using BaseClass::transform_;
+
+        TransformationType* transform_inverse_;
+
+        /// solver
+        std::vector<SolverType> solver_pyramid_inverse_;
+
+    protected:
+
+        using BaseClass::target_;
+        using BaseClass::source_;
+        using BaseClass::bg_value_;
+        using BaseClass::target_pyramid_;
+        using BaseClass::source_pyramid_;
+        using BaseClass::target_bh_warper_;
+        using BaseClass::target_interp_warper_;
+        using BaseClass::source_bh_warper_;
+        using BaseClass::source_interp_warper_;
+        using BaseClass::target_bh_pyramid_construction_;
+        using BaseClass::target_interp_pyramid_construction_;
+        using BaseClass::source_bh_pyramid_construction_;
+        using BaseClass::source_interp_pyramid_construction_;
+        using BaseClass::warper_pyramid_;
+        using BaseClass::warper_pyramid_inverse_;
+        using BaseClass::dissimilarity_pyramid_;
+        using BaseClass::dissimilarity_pyramid_inverse_;
+        using BaseClass::preset_transform_;
+    };
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    hoImageRegDeformationFieldBidirectionalRegister<ValueType, CoordType, D>::
+    hoImageRegDeformationFieldBidirectionalRegister(unsigned int resolution_pyramid_levels, bool use_world_coordinates, ValueType bg_value) 
+    : BaseClass(resolution_pyramid_levels, use_world_coordinates, bg_value), transform_inverse_(NULL)
+    {
+        inverse_deform_enforce_iter_pyramid_level_.clear();
+        inverse_deform_enforce_iter_pyramid_level_.resize(resolution_pyramid_levels, 10);
+
+        inverse_deform_enforce_weight_pyramid_level_.clear();
+        inverse_deform_enforce_weight_pyramid_level_.resize(resolution_pyramid_levels, 0.5);
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    hoImageRegDeformationFieldBidirectionalRegister<ValueType, CoordType, D>::~hoImageRegDeformationFieldBidirectionalRegister()
+    {
+        if ( !preset_transform_ )
+        {
+            // delete transform_;
+            delete transform_inverse_;
+            transform_inverse_ = NULL;
+        }
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    bool hoImageRegDeformationFieldBidirectionalRegister<ValueType, CoordType, D>::setDefaultParameters(unsigned int resolution_pyramid_levels, bool use_world_coordinates)
+    {
+        BaseClass::setDefaultParameters(resolution_pyramid_levels, use_world_coordinates);
+
+        inverse_deform_enforce_iter_pyramid_level_.clear();
+        inverse_deform_enforce_iter_pyramid_level_.resize(resolution_pyramid_levels, 10);
+
+        inverse_deform_enforce_weight_pyramid_level_.clear();
+        inverse_deform_enforce_weight_pyramid_level_.resize(resolution_pyramid_levels, 0.5);
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    bool hoImageRegDeformationFieldBidirectionalRegister<ValueType, CoordType, D>::initialize()
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(NonParametricRegisterClass::initialize());
+
+            if ( transform_ == NULL )
+            {
+                transform_ = new TransformationType(*target_);
+                transform_inverse_ = new TransformationType(*source_);
+
+                preset_transform_ = false;
+            }
+
+            warper_pyramid_.resize(resolution_pyramid_levels_);
+            warper_pyramid_inverse_.resize(resolution_pyramid_levels_);
+
+            solver_pyramid_inverse_.resize(resolution_pyramid_levels_);
+
+            unsigned int ii, jj;
+            for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+            {
+                warper_pyramid_[ii].setTransformation(*transform_);
+                warper_pyramid_[ii].setInterpolator( *source_interp_warper_[ii] );
+                warper_pyramid_[ii].setBackgroundValue(bg_value_);
+                warper_pyramid_[ii].debugFolder_ = this->debugFolder_;
+
+                warper_pyramid_inverse_[ii].setTransformation(*transform_inverse_);
+                warper_pyramid_inverse_[ii].setInterpolator( *target_interp_warper_[ii] );
+                warper_pyramid_inverse_[ii].setBackgroundValue(bg_value_);
+                warper_pyramid_inverse_[ii].debugFolder_ = this->debugFolder_;
+
+                solver_pyramid_inverse_[ii].setTransform(*transform_);
+                solver_pyramid_inverse_[ii].setTransformInverse(*transform_inverse_);
+
+                if ( regularization_hilbert_strength_world_coordinate_ )
+                {
+                    // world to pixel
+                    std::vector<coord_type> pixelSize;
+                    target_->get_pixel_size(pixelSize);
+
+                    for ( jj=0; jj<D; jj++ )
+                    {
+                        solver_pyramid_inverse_[ii].regularization_hilbert_strength_[jj] = (regularization_hilbert_strength_pyramid_level_[ii][jj] / pixelSize[jj]);
+                    }
+                }
+                else
+                {
+                    for ( jj=0; jj<D; jj++ )
+                    {
+                        solver_pyramid_inverse_[ii].regularization_hilbert_strength_[jj] = regularization_hilbert_strength_pyramid_level_[ii][jj];
+                    }
+                }
+
+                solver_pyramid_inverse_[ii].max_iter_num_ = max_iter_num_pyramid_level_[ii];
+                solver_pyramid_inverse_[ii].dissimilarity_thres_ = dissimilarity_thres_pyramid_level_[ii];
+                solver_pyramid_inverse_[ii].div_num_ = div_num_pyramid_level_[ii];
+                solver_pyramid_inverse_[ii].step_size_para_ = step_size_para_pyramid_level_[ii];
+                solver_pyramid_inverse_[ii].step_size_div_para_ = step_size_div_para_pyramid_level_[ii];
+                solver_pyramid_inverse_[ii].verbose_ = verbose_;
+                solver_pyramid_inverse_[ii].debugFolder_ = this->debugFolder_;
+
+                solver_pyramid_inverse_[ii].setTarget(target_pyramid_[ii]);
+                solver_pyramid_inverse_[ii].setSource(source_pyramid_[ii]);
+
+                solver_pyramid_inverse_[ii].setDissimilarity(*dissimilarity_pyramid_[ii]);
+                solver_pyramid_inverse_[ii].setWarper(warper_pyramid_[ii]);
+                solver_pyramid_inverse_[ii].setInterpolator(*source_interp_warper_[ii]);
+
+                solver_pyramid_inverse_[ii].setDissimilarityInverse(*dissimilarity_pyramid_inverse_[ii]);
+                solver_pyramid_inverse_[ii].setWarperInverse(warper_pyramid_inverse_[ii]);
+                solver_pyramid_inverse_[ii].setInterpolatorInverse(*target_interp_warper_[ii]);
+
+                solver_pyramid_inverse_[ii].setBackgroundValue(bg_value_);
+                solver_pyramid_inverse_[ii].setUseWorldCoordinate(use_world_coordinates_);
+
+                solver_pyramid_inverse_[ii].inverse_deform_enforce_iter_ = inverse_deform_enforce_iter_pyramid_level_[ii];
+                solver_pyramid_inverse_[ii].inverse_deform_enforce_weight_ = inverse_deform_enforce_weight_pyramid_level_[ii];
+
+                solver_pyramid_inverse_[ii].apply_in_FOV_constraint_ = apply_in_FOV_constraint_;
+            }
+
+            // downsample the deformation field if necessary
+            if ( !transform_->getDeformationField(0).dimensions_equal(target_pyramid_[resolution_pyramid_levels_-1]) )
+            {
+                std::vector<size_t> dim;
+                target_pyramid_[resolution_pyramid_levels_-1].get_dimensions(dim);
+
+                std::vector<size_t> dimInv;
+                source_pyramid_[resolution_pyramid_levels_-1].get_dimensions(dimInv);
+
+                for ( jj=0; jj<D; jj++ )
+                {
+                    DeformationFieldType& deField = transform_->getDeformationField(jj);
+                    DeformationFieldType& deField_inverse = transform_inverse_->getDeformationField(jj);
+
+                    if ( preset_transform_ )
+                    {
+                        // forward
+                        DeformationFieldType deFieldResampled;
+
+                        hoNDBoundaryHandlerBorderValue<DeformationFieldType> bhBorderValue(deField);
+                        hoNDInterpolatorLinear<DeformationFieldType> interpLinear(deField, bhBorderValue);
+
+                        GADGET_CHECK_RETURN_FALSE(Gadgetron::resampleImage(deField, interpLinear, dim, deFieldResampled));
+
+                        deField = deFieldResampled;
+                        deField.copyImageInfoWithoutImageSize(target_pyramid_[resolution_pyramid_levels_-1]);
+
+                        // inverse
+                        DeformationFieldType deFieldResampled_inverse;
+
+                        bhBorderValue.setArray(deField_inverse);
+                        interpLinear.setArray(deField_inverse);
+                        GADGET_CHECK_RETURN_FALSE(Gadgetron::resampleImage(deField_inverse, interpLinear, dimInv, deFieldResampled_inverse));
+
+                        deField_inverse = deFieldResampled_inverse;
+                        deField_inverse.copyImageInfoWithoutImageSize(source_pyramid_[resolution_pyramid_levels_-1]);
+                    }
+                    else
+                    {
+                        deField.createFrom(target_pyramid_[resolution_pyramid_levels_-1]);
+                        Gadgetron::clear(deField);
+
+                        deField_inverse.createFrom(source_pyramid_[resolution_pyramid_levels_-1]);
+                        Gadgetron::clear(deField_inverse);
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationFieldBidirectionalRegister<ValueType, CoordType, D>::initialize() ... ");
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    bool hoImageRegDeformationFieldBidirectionalRegister<ValueType, CoordType, D>::performRegistration()
+    {
+        try
+        {
+            // start from the coarsest level
+
+            int level;
+            for ( level=(int)resolution_pyramid_levels_-1; level>=0; level-- )
+            {
+                // update the transform for multi-resolution pyramid
+                transform_->update();
+                transform_inverse_->update();
+
+                GADGET_CHECK_RETURN_FALSE(solver_pyramid_inverse_[level].solve());
+
+                if ( !debugFolder_.empty() )
+                {
+                    unsigned int jj;
+                    for ( jj=0; jj<D; jj++ )
+                    {
+                        std::ostringstream ostr;
+                        ostr << "deform_" << jj;
+
+                        gt_exporter_.exportImage(transform_->getDeformationField(jj), debugFolder_+ostr.str());
+
+                        std::ostringstream ostr2;
+                        ostr2 << "deform_inverse_" << jj;
+
+                        gt_exporter_.exportImage(transform_inverse_->getDeformationField(jj), debugFolder_+ostr2.str());
+                    }
+                }
+
+                // expand the deformation field for next resolution level
+                if ( level>0 )
+                {
+                    std::vector<float> ratio = resolution_pyramid_downsample_ratio_[level-1];
+
+                    unsigned int jj;
+                    bool downsampledBy2 = true;
+                    for ( jj=0; jj<D; jj++ )
+                    {
+                        if ( std::abs(ratio[jj]-2.0f) > FLT_EPSILON )
+                        {
+                            downsampledBy2 = false;
+                            break;
+                        }
+                    }
+
+                    DeformationFieldType deformExpanded;
+                    deformExpanded.createFrom(target_pyramid_[level-1]);
+                    Gadgetron::clear(deformExpanded);
+
+                    DeformationFieldType deformInverseExpanded;
+                    deformInverseExpanded.createFrom(source_pyramid_[level-1]);
+                    Gadgetron::clear(deformInverseExpanded);
+
+                    if ( downsampledBy2 )
+                    {
+                        for ( jj=0; jj<D; jj++ )
+                        {
+                            DeformationFieldType& deform = transform_->getDeformationField(jj);
+                            Gadgetron::expandImageBy2(deform, *target_bh_pyramid_construction_, deformExpanded);
+
+                            if ( !use_world_coordinates_ )
+                            {
+                                Gadgetron::scal(ValueType(2.0), deformExpanded); // the deformation vector should be doubled in length
+                            }
+
+                            deform = deformExpanded;
+
+                            DeformationFieldType& deformInv = transform_inverse_->getDeformationField(jj);
+                            Gadgetron::expandImageBy2(deformInv, *source_bh_pyramid_construction_, deformInverseExpanded);
+
+                            if ( !use_world_coordinates_ )
+                            {
+                                Gadgetron::scal(ValueType(2.0), deformInverseExpanded); // the deformation vector should be doubled in length
+                            }
+
+                            deformInv = deformInverseExpanded;
+                        }
+                    }
+                    else
+                    {
+                        for ( jj=0; jj<D; jj++ )
+                        {
+                            DeformationFieldType& deform = transform_->getDeformationField(jj);
+                            Gadgetron::upsampleImage(deform, *target_interp_pyramid_construction_, deformExpanded, &ratio[0]);
+
+                            if ( !use_world_coordinates_ )
+                            {
+                                Gadgetron::scal(ValueType(ratio[jj]), deformExpanded);
+                            }
+
+                            deform = deformExpanded;
+
+                            DeformationFieldType& deformInv = transform_inverse_->getDeformationField(jj);
+                            Gadgetron::upsampleImage(deformInv, *source_interp_pyramid_construction_, deformInverseExpanded, &ratio[0]);
+
+                            if ( !use_world_coordinates_ )
+                            {
+                                Gadgetron::scal(ValueType(ratio[jj]), deformInverseExpanded);
+                            }
+
+                            deformInv = deformInverseExpanded;
+                        }
+                    }
+                }
+
+                if ( !debugFolder_.empty() )
+                {
+                    unsigned int jj;
+                    for ( jj=0; jj<D; jj++ )
+                    {
+                        std::ostringstream ostr;
+                        ostr << "deformExpanded_" << jj;
+
+                        gt_exporter_.exportImage(transform_->getDeformationField(jj), debugFolder_+ostr.str());
+
+                        std::ostringstream ostr2;
+                        ostr2 << "deformExpanded_inverse_" << jj;
+
+                        gt_exporter_.exportImage(transform_inverse_->getDeformationField(jj), debugFolder_+ostr2.str());
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationFieldBidirectionalRegister<ValueType, CoordType, D>::performRegistration() ... ");
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    void hoImageRegDeformationFieldBidirectionalRegister<ValueType, CoordType, D>::printContent(std::ostream& os) const
+    {
+        using namespace std;
+        BaseClass::printContent(os);
+
+        unsigned int ii;
+
+        os << "------------" << std::endl;
+        os << "Number of iterations to improve the estimation of the inverse transform is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - " 
+                << inverse_deform_enforce_iter_pyramid_level_[ii] << std::endl;
+        }
+
+        os << "------------" << std::endl;
+        os << "Weight to update the estimation of the inverse transform is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - " 
+                << inverse_deform_enforce_weight_pyramid_level_[ii] << std::endl;
+        }
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    void hoImageRegDeformationFieldBidirectionalRegister<ValueType, CoordType, D>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "--------------Gagdgetron non-parametric bi-directional deformation field image register -------------" << endl;
+        this->printContent(os);
+        os << "--------------------------------------------------------------------" << endl << ends;
+    }
+}
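
For orientation, the block below is a minimal usage sketch (not part of the upstream diff) of the bidirectional register defined above. The concrete template arguments, the image dimension, and the step that attaches the target and source images are illustrative assumptions; the image setters live in the base register class and are not shown in this hunk.

    // hypothetical usage, inside some function with the register header included
    typedef Gadgetron::hoImageRegDeformationFieldBidirectionalRegister<float, double, 2> RegisterType;

    RegisterType reg(3, /*use_world_coordinates=*/false, /*bg_value=*/0.0f);

    // per-level tuning of the inverse-consistency enforcement (public vectors declared above)
    reg.inverse_deform_enforce_iter_pyramid_level_.assign(3, 10);
    reg.inverse_deform_enforce_weight_pyramid_level_.assign(3, 0.5);

    // ... attach target and source images via the base-class setters (not shown in this hunk) ...

    if ( reg.initialize() && reg.performRegistration() )
    {
        // forward field is defined on the target grid, inverse field on the source grid
        RegisterType::TransformationType* forward = reg.transform_;
        RegisterType::TransformationType* inverse = reg.transform_inverse_;
    }
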
diff --git a/toolboxes/registration/optical_flow/cpu/register/hoImageRegDeformationFieldRegister.h b/toolboxes/registration/optical_flow/cpu/register/hoImageRegDeformationFieldRegister.h
new file mode 100644
index 0000000..d4753cd
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/register/hoImageRegDeformationFieldRegister.h
@@ -0,0 +1,527 @@
+/** \file   hoImageRegDeformationFieldRegister.h
+    \brief  Define the class to perform non-rigid image registration to estimate variational deformation field
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoImageRegNonParametricRegister.h"
+
+namespace Gadgetron
+{
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    class hoImageRegDeformationFieldRegister : public hoImageRegNonParametricRegister<ValueType, CoordType, D, D>
+    {
+    public:
+
+        typedef hoImageRegDeformationFieldRegister<ValueType, CoordType, D> Self;
+        typedef hoImageRegNonParametricRegister<ValueType, CoordType, D, D> BaseClass;
+
+        typedef typename BaseClass::TargetType TargetType;
+        typedef typename BaseClass::SourceType SourceType;
+
+        typedef typename BaseClass::Target2DType Target2DType;
+        typedef typename BaseClass::Source2DType Source2DType;
+
+        typedef typename BaseClass::Target3DType Target3DType;
+        typedef typename BaseClass::Source3DType Source3DType;
+
+        typedef ValueType T;
+        typedef ValueType element_type;
+        typedef ValueType value_type;
+
+        /// boundary handler and interpolator for target image
+        typedef typename BaseClass::BoundaryHandlerTargetType BoundaryHandlerTargetType;
+        typedef typename BaseClass::BoundaryHandlerTargetFixedValueType BoundaryHandlerTargetFixedValueType;
+        typedef typename BaseClass::BoundaryHandlerTargetBorderValueType BoundaryHandlerTargetBorderValueType;
+        typedef typename BaseClass::BoundaryHandlerTargetPeriodicType BoundaryHandlerTargetPeriodicType;
+        typedef typename BaseClass::BoundaryHandlerTargetMirrorType BoundaryHandlerTargetMirrorType;
+
+        typedef typename BaseClass::InterpTargetType InterpTargetType;
+        typedef typename BaseClass::InterpTargetLinearType InterpTargetLinearType;
+        typedef typename BaseClass::InterpTargetNearestNeighborType InterpTargetNearestNeighborType;
+        typedef typename BaseClass::InterpTargetBSplineType InterpTargetBSplineType;
+
+        /// boundary handler and interpolator for source image
+        typedef typename BaseClass::BoundaryHandlerSourceType BoundaryHandlerSourceType;
+        typedef typename BaseClass::BoundaryHandlerSourceFixedValueType BoundaryHandlerSourceFixedValueType;
+        typedef typename BaseClass::BoundaryHandlerSourceBorderValueType BoundaryHandlerSourceBorderValueType;
+        typedef typename BaseClass::BoundaryHandlerSourcePeriodicType BoundaryHandlerSourcePeriodicType;
+        typedef typename BaseClass::BoundaryHandlerSourceMirrorType BoundaryHandlerSourceMirrorType;
+
+        typedef typename BaseClass::InterpSourceType InterpSourceType;
+        typedef typename BaseClass::InterpSourceLinearType InterpSourceLinearType;
+        typedef typename BaseClass::InterpSourceNearestNeighborType InterpSourceNearestNeighborType;
+        typedef typename BaseClass::InterpSourceBSplineType InterpSourceBSplineType;
+
+        /// warper type
+        typedef typename BaseClass::WarperType WarperType;
+
+        /// image dissimilarity type
+        typedef typename BaseClass::DissimilarityType DissimilarityType;
+
+        /// transformation type
+        typedef hoImageRegDeformationField<CoordType, D> TransformationType;
+        typedef typename TransformationType::input_point_type input_point_type;
+        typedef typename TransformationType::output_point_type output_point_type;
+        typedef typename TransformationType::jacobian_position_type jacobian_position_type;
+        typedef typename TransformationType::DeformationFieldType DeformationFieldType;
+        typedef typename TransformationType::coord_type coord_type;
+
+        /// solver type
+        typedef hoImageRegDeformationFieldSolver<ValueType, CoordType, D> SolverType;
+
+        hoImageRegDeformationFieldRegister(unsigned int resolution_pyramid_levels=3, bool use_world_coordinates=false, ValueType bg_value=ValueType(0));
+        virtual ~hoImageRegDeformationFieldRegister();
+
+        /// initialize the registration
+        /// should be called after all images and parameters of registration are set
+        virtual bool initialize();
+
+        /// perform the registration
+        virtual bool performRegistration();
+
+        virtual void printContent(std::ostream& os) const;
+        virtual void print(std::ostream& os) const;
+
+        /// parameters
+
+        using BaseClass::use_world_coordinates_;
+        using BaseClass::resolution_pyramid_divided_by_2_;
+        using BaseClass::resolution_pyramid_levels_;
+        using BaseClass::resolution_pyramid_downsample_ratio_;
+        using BaseClass::resolution_pyramid_blurring_sigma_;
+        using BaseClass::boundary_handler_type_warper_;
+        using BaseClass::interp_type_warper_;
+        using BaseClass::boundary_handler_type_pyramid_construction_;
+        using BaseClass::interp_type_pyramid_construction_;
+        using BaseClass::dissimilarity_type_;
+        using BaseClass::solver_type_;
+
+        using BaseClass::gt_timer1_;
+        using BaseClass::gt_timer2_;
+        using BaseClass::gt_timer3_;
+        using BaseClass::performTiming_;
+        using BaseClass::gt_exporter_;
+        using BaseClass::debugFolder_;
+
+        /// number of iterations for every pyramid level
+        std::vector<unsigned int> max_iter_num_pyramid_level_;
+        /// threshold for dissimilarity for every pyramid level
+        std::vector<ValueType> dissimilarity_thres_pyramid_level_;
+        /// number of search size division for every pyramid level
+        std::vector<unsigned int> div_num_pyramid_level_;
+        /// solver step size for every pyramid level
+        std::vector<ValueType> step_size_para_pyramid_level_;
+        /// step size division ratio for every pyramid level
+        std::vector<ValueType> step_size_div_para_pyramid_level_;
+        /// regularization strength for every pyramid level
+        /// if regularization_hilbert_strength_world_coordinate_ is true, the strength is given in world-coordinate units (e.g. mm)
+        /// if regularization_hilbert_strength_world_coordinate_ is false, the strength is given in pixel units
+        bool regularization_hilbert_strength_world_coordinate_;
+        std::vector< std::vector<ValueType> > regularization_hilbert_strength_pyramid_level_;
+
+        /// in-FOV constraint
+        bool apply_in_FOV_constraint_;
+
+        /// verbose mode
+        bool verbose_;
+
+        /// set the default parameters
+        virtual bool setDefaultParameters(unsigned int resolution_pyramid_levels, bool use_world_coordinates);
+
+        /// deformation field transformation, defined in the world coordinate of target image
+        TransformationType* transform_;
+
+        /// solver
+        std::vector<SolverType> solver_pyramid_;
+
+    protected:
+
+        using BaseClass::target_;
+        using BaseClass::source_;
+        using BaseClass::bg_value_;
+        using BaseClass::target_pyramid_;
+        using BaseClass::source_pyramid_;
+        using BaseClass::target_bh_warper_;
+        using BaseClass::target_interp_warper_;
+        using BaseClass::source_bh_warper_;
+        using BaseClass::source_interp_warper_;
+        using BaseClass::target_bh_pyramid_construction_;
+        using BaseClass::target_interp_pyramid_construction_;
+        using BaseClass::source_bh_pyramid_construction_;
+        using BaseClass::source_interp_pyramid_construction_;
+        using BaseClass::warper_pyramid_;
+        using BaseClass::dissimilarity_pyramid_;
+        using BaseClass::warper_pyramid_inverse_;
+        using BaseClass::dissimilarity_pyramid_inverse_;
+
+        /// whether the transformation is preset or not
+        /// the preset transformation can be used to pass in an initial deformation field
+        bool preset_transform_;
+    };
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    hoImageRegDeformationFieldRegister<ValueType, CoordType, D>::
+    hoImageRegDeformationFieldRegister(unsigned int resolution_pyramid_levels, bool use_world_coordinates, ValueType bg_value) 
+    : BaseClass(resolution_pyramid_levels, bg_value), regularization_hilbert_strength_world_coordinate_(false), verbose_(false), transform_(NULL), preset_transform_(false)
+    {
+        this->setDefaultParameters(resolution_pyramid_levels, use_world_coordinates);
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    hoImageRegDeformationFieldRegister<ValueType, CoordType, D>::~hoImageRegDeformationFieldRegister()
+    {
+        if ( !preset_transform_ )
+        {
+            delete transform_;
+            transform_ = NULL;
+        }
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    bool hoImageRegDeformationFieldRegister<ValueType, CoordType, D>::setDefaultParameters(unsigned int resolution_pyramid_levels, bool use_world_coordinates)
+    {
+        use_world_coordinates_ = use_world_coordinates;
+        resolution_pyramid_levels_ = resolution_pyramid_levels;
+
+        resolution_pyramid_downsample_ratio_.clear();
+        resolution_pyramid_downsample_ratio_.resize(resolution_pyramid_levels_-1, std::vector<float>(D, 2.0) );
+
+        resolution_pyramid_blurring_sigma_.clear();
+        resolution_pyramid_blurring_sigma_.resize(resolution_pyramid_levels_, std::vector<float>(D, 0.0) );
+
+        boundary_handler_type_warper_.clear();
+        // boundary_handler_type_warper_.resize(resolution_pyramid_levels_, GT_BOUNDARY_CONDITION_FIXEDVALUE);
+        boundary_handler_type_warper_.resize(resolution_pyramid_levels_, GT_BOUNDARY_CONDITION_BORDERVALUE);
+
+        interp_type_warper_.clear();
+        interp_type_warper_.resize(resolution_pyramid_levels_, GT_IMAGE_INTERPOLATOR_LINEAR);
+
+        boundary_handler_type_pyramid_construction_ = GT_BOUNDARY_CONDITION_BORDERVALUE;
+        interp_type_pyramid_construction_ = GT_IMAGE_INTERPOLATOR_LINEAR;
+
+        dissimilarity_type_.clear();
+        dissimilarity_type_.resize(resolution_pyramid_levels_, GT_IMAGE_DISSIMILARITY_LocalCCR);
+
+        solver_type_.clear();
+        solver_type_.resize(resolution_pyramid_levels_, GT_IMAGE_REG_SOLVER_PDE_TIME_INTEGRATION);
+
+        max_iter_num_pyramid_level_.clear();
+        max_iter_num_pyramid_level_.resize(resolution_pyramid_levels_, 32);
+
+        dissimilarity_thres_pyramid_level_.clear();
+        dissimilarity_thres_pyramid_level_.resize(resolution_pyramid_levels_, 1e-6);
+
+        div_num_pyramid_level_.clear();
+        div_num_pyramid_level_.resize(resolution_pyramid_levels_, 2);
+
+        step_size_para_pyramid_level_.clear();
+        step_size_para_pyramid_level_.resize(resolution_pyramid_levels_, 0.8);
+
+        step_size_div_para_pyramid_level_.clear();
+        step_size_div_para_pyramid_level_.resize(resolution_pyramid_levels_, 0.5);
+
+        regularization_hilbert_strength_world_coordinate_ = false;
+
+        regularization_hilbert_strength_pyramid_level_.clear();
+        regularization_hilbert_strength_pyramid_level_.resize(resolution_pyramid_levels_);
+
+        unsigned int ii;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            regularization_hilbert_strength_pyramid_level_[ii].resize(D, 12.0);
+        }
+
+        apply_in_FOV_constraint_ = false;
+
+        verbose_ = false;
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    bool hoImageRegDeformationFieldRegister<ValueType, CoordType, D>::initialize()
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(BaseClass::initialize());
+
+            if ( transform_ == NULL )
+            {
+                transform_ = new TransformationType(*target_);
+                preset_transform_ = false;
+            }
+
+            warper_pyramid_.resize(resolution_pyramid_levels_);
+            solver_pyramid_.resize(resolution_pyramid_levels_);
+
+            unsigned int ii, jj;
+            for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+            {
+                warper_pyramid_[ii].setTransformation(*transform_);
+                warper_pyramid_[ii].setInterpolator( *source_interp_warper_[ii] );
+                warper_pyramid_[ii].setBackgroundValue(bg_value_);
+                warper_pyramid_[ii].debugFolder_ = this->debugFolder_;
+
+                solver_pyramid_[ii].setTransform(*transform_);
+
+                if ( regularization_hilbert_strength_world_coordinate_ )
+                {
+                    // world to pixel
+                    std::vector<coord_type> pixelSize;
+                    target_->get_pixel_size(pixelSize);
+
+                    for ( jj=0; jj<D; jj++ )
+                    {
+                        solver_pyramid_[ii].regularization_hilbert_strength_[jj] = (regularization_hilbert_strength_pyramid_level_[ii][jj] / pixelSize[jj]);
+                    }
+                }
+                else
+                {
+                    for ( jj=0; jj<D; jj++ )
+                    {
+                        solver_pyramid_[ii].regularization_hilbert_strength_[jj] = regularization_hilbert_strength_pyramid_level_[ii][jj];
+                    }
+                }
+
+                solver_pyramid_[ii].max_iter_num_ = max_iter_num_pyramid_level_[ii];
+                solver_pyramid_[ii].dissimilarity_thres_ = dissimilarity_thres_pyramid_level_[ii];
+                solver_pyramid_[ii].div_num_ = div_num_pyramid_level_[ii];
+                solver_pyramid_[ii].step_size_para_ = step_size_para_pyramid_level_[ii];
+                solver_pyramid_[ii].step_size_div_para_ = step_size_div_para_pyramid_level_[ii];
+                solver_pyramid_[ii].verbose_ = verbose_;
+                solver_pyramid_[ii].debugFolder_ = this->debugFolder_;
+
+                solver_pyramid_[ii].setTarget(target_pyramid_[ii]);
+                solver_pyramid_[ii].setSource(source_pyramid_[ii]);
+                solver_pyramid_[ii].setDissimilarity(*dissimilarity_pyramid_[ii]);
+                solver_pyramid_[ii].setWarper(warper_pyramid_[ii]);
+                solver_pyramid_[ii].setInterpolator(*source_interp_warper_[ii]);
+                solver_pyramid_[ii].setBackgroundValue(bg_value_);
+                solver_pyramid_[ii].setUseWorldCoordinate(use_world_coordinates_);
+
+                solver_pyramid_[ii].apply_in_FOV_constraint_ = apply_in_FOV_constraint_;
+            }
+
+            // downsample the deformation field if necessary
+            if ( !transform_->getDeformationField(0).dimensions_equal(target_pyramid_[resolution_pyramid_levels_-1]) )
+            {
+                std::vector<size_t> dim;
+                target_pyramid_[resolution_pyramid_levels_-1].get_dimensions(dim);
+
+                for ( jj=0; jj<D; jj++ )
+                {
+                    DeformationFieldType& deField = transform_->getDeformationField(jj);
+
+                    if ( preset_transform_ )
+                    {
+                        DeformationFieldType deFieldResampled;
+
+                        hoNDBoundaryHandlerBorderValue<DeformationFieldType> bhBorderValue(deField);
+                        hoNDInterpolatorLinear<DeformationFieldType> interpLinear(deField, bhBorderValue);
+
+                        GADGET_CHECK_RETURN_FALSE(Gadgetron::resampleImage(deField, interpLinear, dim, deFieldResampled));
+
+                        deField = deFieldResampled;
+                        deField.copyImageInfoWithoutImageSize(target_pyramid_[resolution_pyramid_levels_-1]);
+                    }
+                    else
+                    {
+                        deField.createFrom(target_pyramid_[resolution_pyramid_levels_-1]);
+                        Gadgetron::clear(deField);
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationFieldRegister<ValueType, CoordType, D>::initialize() ... ");
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    bool hoImageRegDeformationFieldRegister<ValueType, CoordType, D>::performRegistration()
+    {
+        try
+        {
+            // start from the coarsest level
+
+            int level;
+            for ( level=(int)resolution_pyramid_levels_-1; level>=0; level-- )
+            {
+                // update the transform for multi-resolution pyramid
+                transform_->update();
+
+                // GADGET_CHECK_RETURN_FALSE(solver_pyramid_[level].initialize());
+                GADGET_CHECK_RETURN_FALSE(solver_pyramid_[level].solve());
+
+                if ( !debugFolder_.empty() )
+                {
+                    unsigned int jj;
+                    for ( jj=0; jj<D; jj++ )
+                    {
+                        std::ostringstream ostr;
+                        ostr << "deform_" << jj;
+
+                        gt_exporter_.exportImage(transform_->getDeformationField(jj), debugFolder_+ostr.str());
+                    }
+                }
+
+                // expand the deformation field for next resolution level
+                if ( level>0 )
+                {
+                    std::vector<float> ratio = resolution_pyramid_downsample_ratio_[level-1];
+
+                    unsigned int jj;
+                    bool downsampledBy2 = true;
+                    for ( jj=0; jj<D; jj++ )
+                    {
+                        if ( std::abs(ratio[jj]-2.0f) > FLT_EPSILON )
+                        {
+                            downsampledBy2 = false;
+                            break;
+                        }
+                    }
+
+                    DeformationFieldType deformExpanded;
+                    deformExpanded.createFrom(target_pyramid_[level-1]);
+                    // Gadgetron::clear(deformExpanded);
+                    memset(deformExpanded.begin(), 0, deformExpanded.get_number_of_bytes());
+
+                    if ( downsampledBy2 || resolution_pyramid_divided_by_2_ )
+                    {
+                        for ( jj=0; jj<D; jj++ )
+                        {
+                            DeformationFieldType& deform = transform_->getDeformationField(jj);
+                            Gadgetron::expandImageBy2(deform, *target_bh_pyramid_construction_, deformExpanded);
+
+                            if ( !use_world_coordinates_ )
+                            {
+                                Gadgetron::scal(ValueType(2.0), deformExpanded); // the deformation vector should be doubled in length
+                            }
+
+                            deform = deformExpanded;
+                        }
+                    }
+                    else
+                    {
+                        for ( jj=0; jj<D; jj++ )
+                        {
+                            DeformationFieldType& deform = transform_->getDeformationField(jj);
+                            Gadgetron::upsampleImage(deform, *target_interp_pyramid_construction_, deformExpanded, &ratio[0]);
+
+                            if ( !use_world_coordinates_ )
+                            {
+                                Gadgetron::scal(ValueType(ratio[jj]), deformExpanded);
+                            }
+
+                            deform = deformExpanded;
+                        }
+                    }
+
+                    if ( !debugFolder_.empty() )
+                    {
+                        for ( jj=0; jj<D; jj++ )
+                        {
+                            std::ostringstream ostr;
+                            ostr << "deformExpanded_" << jj;
+
+                            gt_exporter_.exportImage(transform_->getDeformationField(jj), debugFolder_+ostr.str());
+                        }
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationFieldRegister<ValueType, CoordType, D>::performRegistration() ... ");
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    void hoImageRegDeformationFieldRegister<ValueType, CoordType, D>::printContent(std::ostream& os) const
+    {
+        using namespace std;
+        BaseClass::printContent(os);
+
+        unsigned int ii, jj;
+
+        os << "------------" << std::endl;
+        os << "Maximal iteration number for every pyramid level is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - " 
+                << max_iter_num_pyramid_level_[ii] << std::endl;
+        }
+
+        os << "------------" << std::endl;
+        os << "Threshold for dissimilarity for every pyramid level is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - " 
+                << dissimilarity_thres_pyramid_level_[ii] << std::endl;
+        }
+
+        os << "------------" << std::endl;
+        os << "Number of search size division for every pyramid level is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - " 
+                << div_num_pyramid_level_[ii] << std::endl;
+        }
+
+        os << "------------" << std::endl;
+        os << "Solver step size for every pyramid level is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - " 
+                << step_size_para_pyramid_level_[ii] << std::endl;
+        }
+
+        os << "------------" << std::endl;
+        os << "Step size division ratio for every pyramid level is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - " 
+                << step_size_div_para_pyramid_level_[ii] << std::endl;
+        }
+
+        os << "------------" << std::endl;
+        if ( regularization_hilbert_strength_world_coordinate_ )
+        {
+            os << "Regularization strength  is in the unit of physical metric, e.g. mm ... ";
+        }
+        else
+        {
+            os << "Regularization strength  is in the unit of image pixel size ... ";
+        }
+
+        os << "Regularization strength for every pyramid level is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - [ ";
+            for( jj=0; jj<D; jj++ )
+            {
+                os << regularization_hilbert_strength_pyramid_level_[ii][jj] << " ";
+            } 
+            os << " ] " << std::endl;
+        }
+
+        os << "------------" << std::endl;
+        os << "Verbose mode is : " << verbose_ << std::endl;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    void hoImageRegDeformationFieldRegister<ValueType, CoordType, D>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "--------------Gagdgetron non-parametric deformation field image register -------------" << endl;
+        this->printContent(os);
+        os << "--------------------------------------------------------------------" << endl << ends;
+    }
+}
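
As above, a brief usage sketch (illustrative, not part of the upstream diff) for this single-direction register; the template arguments and the image-attachment step are assumptions, since the target/source setters are defined in the base register class outside this hunk.

    typedef Gadgetron::hoImageRegDeformationFieldRegister<float, double, 2> RegisterType;

    RegisterType reg(4, /*use_world_coordinates=*/false, /*bg_value=*/0.0f);

    // the per-level solver settings are public vectors; e.g. raise the iteration budget
    reg.max_iter_num_pyramid_level_.assign(4, 64);
    reg.regularization_hilbert_strength_pyramid_level_.assign(4, std::vector<float>(2, 12.0f));

    // ... attach target and source images via the base-class setters (not shown in this hunk) ...

    if ( reg.initialize() && reg.performRegistration() )
    {
        // one deformation-field component per dimension, defined on the target grid
        RegisterType::DeformationFieldType& dx = reg.transform_->getDeformationField(0);
        RegisterType::DeformationFieldType& dy = reg.transform_->getDeformationField(1);
    }
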
diff --git a/toolboxes/registration/optical_flow/cpu/register/hoImageRegNonParametricRegister.h b/toolboxes/registration/optical_flow/cpu/register/hoImageRegNonParametricRegister.h
new file mode 100644
index 0000000..426913e
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/register/hoImageRegNonParametricRegister.h
@@ -0,0 +1,148 @@
+/** \file   hoImageRegNonParametricRegister.h
+    \brief  Define the class to perform non-parametric image registration in gadgetron
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoImageRegRegister.h"
+
+namespace Gadgetron
+{
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    class hoImageRegNonParametricRegister : public hoImageRegRegister<ValueType, CoordType, DIn, DOut>
+    {
+    public:
+
+        typedef hoImageRegNonParametricRegister<ValueType, CoordType, DIn, DOut> Self;
+        typedef hoImageRegRegister<ValueType, CoordType, DIn, DOut> BaseClass;
+
+        typedef typename BaseClass::TargetType TargetType;
+        typedef typename BaseClass::SourceType SourceType;
+
+        typedef typename BaseClass::Target2DType Target2DType;
+        typedef typename BaseClass::Source2DType Source2DType;
+
+        typedef typename BaseClass::Target3DType Target3DType;
+        typedef typename BaseClass::Source3DType Source3DType;
+
+        typedef ValueType T;
+        typedef ValueType element_type;
+        typedef ValueType value_type;
+
+        typedef CoordType coord_type;
+
+        /// boundary handler and interpolator for target image
+        typedef typename BaseClass::BoundaryHandlerTargetType BoundaryHandlerTargetType;
+        typedef typename BaseClass::BoundaryHandlerTargetFixedValueType BoundaryHandlerTargetFixedValueType;
+        typedef typename BaseClass::BoundaryHandlerTargetBorderValueType BoundaryHandlerTargetBorderValueType;
+        typedef typename BaseClass::BoundaryHandlerTargetPeriodicType BoundaryHandlerTargetPeriodicType;
+        typedef typename BaseClass::BoundaryHandlerTargetMirrorType BoundaryHandlerTargetMirrorType;
+
+        typedef typename BaseClass::InterpTargetType InterpTargetType;
+        typedef typename BaseClass::InterpTargetLinearType InterpTargetLinearType;
+        typedef typename BaseClass::InterpTargetNearestNeighborType InterpTargetNearestNeighborType;
+        typedef typename BaseClass::InterpTargetBSplineType InterpTargetBSplineType;
+
+        /// boundary handler and interpolator for source image
+        typedef typename BaseClass::BoundaryHandlerSourceType BoundaryHandlerSourceType;
+        typedef typename BaseClass::BoundaryHandlerSourceFixedValueType BoundaryHandlerSourceFixedValueType;
+        typedef typename BaseClass::BoundaryHandlerSourceBorderValueType BoundaryHandlerSourceBorderValueType;
+        typedef typename BaseClass::BoundaryHandlerSourcePeriodicType BoundaryHandlerSourcePeriodicType;
+        typedef typename BaseClass::BoundaryHandlerSourceMirrorType BoundaryHandlerSourceMirrorType;
+
+        typedef typename BaseClass::InterpSourceType InterpSourceType;
+        typedef typename BaseClass::InterpSourceLinearType InterpSourceLinearType;
+        typedef typename BaseClass::InterpSourceNearestNeighborType InterpSourceNearestNeighborType;
+        typedef typename BaseClass::InterpSourceBSplineType InterpSourceBSplineType;
+
+        /// warper type
+        typedef typename BaseClass::WarperType WarperType;
+
+        /// image dissimilarity type
+        typedef typename BaseClass::DissimilarityType DissimilarityType;
+
+        hoImageRegNonParametricRegister(unsigned int resolution_pyramid_levels=3, ValueType bg_value=ValueType(0));
+        virtual ~hoImageRegNonParametricRegister();
+
+        /// initialize the registration
+        /// should be called after all images and parameters of registration are set
+        virtual bool initialize() { return BaseClass::initialize(); }
+
+        /// perform the registration
+        virtual bool performRegistration() = 0;
+
+        virtual void print(std::ostream& os) const;
+
+        /// parameters
+
+        using BaseClass::use_world_coordinates_;
+        using BaseClass::resolution_pyramid_divided_by_2_;
+        using BaseClass::resolution_pyramid_levels_;
+        using BaseClass::resolution_pyramid_downsample_ratio_;
+        using BaseClass::resolution_pyramid_blurring_sigma_;
+        using BaseClass::boundary_handler_type_warper_;
+        using BaseClass::interp_type_warper_;
+        using BaseClass::boundary_handler_type_pyramid_construction_;
+        using BaseClass::interp_type_pyramid_construction_;
+        using BaseClass::dissimilarity_type_;
+        using BaseClass::solver_type_;
+
+        using BaseClass::dissimilarity_LocalCCR_sigmaArg_;
+        using BaseClass::dissimilarity_hist_num_bin_target_;
+        using BaseClass::dissimilarity_hist_num_bin_warpped_;
+        using BaseClass::dissimilarity_hist_pv_interpolation_;
+        using BaseClass::dissimilarity_hist_step_size_ignore_pixel_;
+
+        using BaseClass::dissimilarity_MI_betaArg_;
+
+        using BaseClass::gt_timer1_;
+        using BaseClass::gt_timer2_;
+        using BaseClass::gt_timer3_;
+        using BaseClass::performTiming_;
+        using BaseClass::gt_exporter_;
+        using BaseClass::debugFolder_;
+
+    protected:
+
+        using BaseClass::target_;
+        using BaseClass::source_;
+        using BaseClass::bg_value_;
+        using BaseClass::target_pyramid_;
+        using BaseClass::source_pyramid_;
+        using BaseClass::target_bh_warper_;
+        using BaseClass::target_interp_warper_;
+        using BaseClass::source_bh_warper_;
+        using BaseClass::source_interp_warper_;
+        using BaseClass::target_bh_pyramid_construction_;
+        using BaseClass::target_interp_pyramid_construction_;
+        using BaseClass::source_bh_pyramid_construction_;
+        using BaseClass::source_interp_pyramid_construction_;
+        using BaseClass::warper_pyramid_;
+        using BaseClass::dissimilarity_pyramid_;
+        using BaseClass::warper_pyramid_inverse_;
+        using BaseClass::dissimilarity_pyramid_inverse_;
+    };
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    hoImageRegNonParametricRegister<ValueType, CoordType, DIn, DOut>::
+    hoImageRegNonParametricRegister(unsigned int resolution_pyramid_levels, ValueType bg_value) : BaseClass(resolution_pyramid_levels, bg_value)
+    {
+
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    hoImageRegNonParametricRegister<ValueType, CoordType, DIn, DOut>::~hoImageRegNonParametricRegister()
+    {
+
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    void hoImageRegNonParametricRegister<ValueType, CoordType, DIn, DOut>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "--------------Gagdgetron non-parametric image register -------------" << endl;
+        BaseClass::printContent(os);
+        os << "--------------------------------------------------------------------" << endl << ends;
+    }
+}
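
Because performRegistration() is pure virtual, this class only provides the shared machinery (image pyramids, warpers, dissimilarity measures); a concrete register has to derive from it. A minimal illustrative subclass (not upstream code) could look like:

    template<typename ValueType, typename CoordType, unsigned int D>
    class MySimpleNonParametricRegister : public Gadgetron::hoImageRegNonParametricRegister<ValueType, CoordType, D, D>
    {
    public:
        typedef Gadgetron::hoImageRegNonParametricRegister<ValueType, CoordType, D, D> BaseClass;

        MySimpleNonParametricRegister(unsigned int levels = 3, ValueType bg = ValueType(0))
            : BaseClass(levels, bg) {}

        // reuse the base initialization of pyramids, warpers and dissimilarity measures
        virtual bool initialize() { return BaseClass::initialize(); }

        // the concrete multi-resolution registration loop goes here
        virtual bool performRegistration() { return true; }
    };
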
diff --git a/toolboxes/registration/optical_flow/cpu/register/hoImageRegParametricRegister.h b/toolboxes/registration/optical_flow/cpu/register/hoImageRegParametricRegister.h
new file mode 100644
index 0000000..e874786
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/register/hoImageRegParametricRegister.h
@@ -0,0 +1,408 @@
+/** \file   hoImageRegParametricRegister.h
+    \brief  Define the class to perform parametric image registration in gadgetron
+            By default, the multi-level multi-step parametric solver is used
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoImageRegRegister.h"
+
+namespace Gadgetron
+{
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    class hoImageRegParametricRegister : public hoImageRegRegister<ValueType, CoordType, DIn, DOut>
+    {
+    public:
+
+        typedef hoImageRegParametricRegister<ValueType, CoordType, DIn, DOut> Self;
+        typedef hoImageRegRegister<ValueType, CoordType, DIn, DOut> BaseClass;
+
+        typedef typename BaseClass::TargetType TargetType;
+        typedef typename BaseClass::SourceType SourceType;
+
+        typedef typename BaseClass::Target2DType Target2DType;
+        typedef typename BaseClass::Source2DType Source2DType;
+
+        typedef typename BaseClass::Target3DType Target3DType;
+        typedef typename BaseClass::Source3DType Source3DType;
+
+        typedef ValueType T;
+        typedef ValueType element_type;
+        typedef ValueType value_type;
+
+        typedef CoordType coord_type;
+
+        /// boundary handler and interpolator for target image
+        typedef typename BaseClass::BoundaryHandlerTargetType BoundaryHandlerTargetType;
+        typedef typename BaseClass::BoundaryHandlerTargetFixedValueType BoundaryHandlerTargetFixedValueType;
+        typedef typename BaseClass::BoundaryHandlerTargetBorderValueType BoundaryHandlerTargetBorderValueType;
+        typedef typename BaseClass::BoundaryHandlerTargetPeriodicType BoundaryHandlerTargetPeriodicType;
+        typedef typename BaseClass::BoundaryHandlerTargetMirrorType BoundaryHandlerTargetMirrorType;
+
+        typedef typename BaseClass::InterpTargetType InterpTargetType;
+        typedef typename BaseClass::InterpTargetLinearType InterpTargetLinearType;
+        typedef typename BaseClass::InterpTargetNearestNeighborType InterpTargetNearestNeighborType;
+        typedef typename BaseClass::InterpTargetBSplineType InterpTargetBSplineType;
+
+        /// boundary handler and interpolator for source image
+        typedef typename BaseClass::BoundaryHandlerSourceType BoundaryHandlerSourceType;
+        typedef typename BaseClass::BoundaryHandlerSourceFixedValueType BoundaryHandlerSourceFixedValueType;
+        typedef typename BaseClass::BoundaryHandlerSourceBorderValueType BoundaryHandlerSourceBorderValueType;
+        typedef typename BaseClass::BoundaryHandlerSourcePeriodicType BoundaryHandlerSourcePeriodicType;
+        typedef typename BaseClass::BoundaryHandlerSourceMirrorType BoundaryHandlerSourceMirrorType;
+
+        typedef typename BaseClass::InterpSourceType InterpSourceType;
+        typedef typename BaseClass::InterpSourceLinearType InterpSourceLinearType;
+        typedef typename BaseClass::InterpSourceNearestNeighborType InterpSourceNearestNeighborType;
+        typedef typename BaseClass::InterpSourceBSplineType InterpSourceBSplineType;
+
+        /// warper type
+        typedef typename BaseClass::WarperType WarperType;
+
+        /// image dissimilarity type
+        typedef typename BaseClass::DissimilarityType DissimilarityType;
+
+        /// transformation type
+        typedef hoImageRegParametricTransformation<CoordType, DIn, DOut> TransformationType;
+        typedef typename TransformationType::input_point_type input_point_type;
+        typedef typename TransformationType::output_point_type output_point_type;
+        typedef typename TransformationType::jacobian_position_type jacobian_position_type;
+
+        /// solver type
+        typedef hoImageRegParametricSolver<ValueType, CoordType, DIn, DOut> SolverType;
+
+        hoImageRegParametricRegister(unsigned int resolution_pyramid_levels=3, bool use_world_coordinates=true, ValueType bg_value=ValueType(0));
+        virtual ~hoImageRegParametricRegister();
+
+        /// initialize the registration
+        /// should be called after all images and parameters of registration are set
+        virtual bool initialize();
+
+        /// create parametric solver
+        SolverType* createParametricSolver(GT_IMAGE_REG_SOLVER v, unsigned int level);
+
+        /// set the default parameters
+        virtual bool setDefaultParameters(unsigned int resolution_pyramid_levels, bool use_world_coordinates);
+
+        /// perform the registration
+        virtual bool performRegistration();
+
+        virtual void print(std::ostream& os) const;
+
+        /// parameters
+
+        using BaseClass::use_world_coordinates_;
+        using BaseClass::resolution_pyramid_divided_by_2_;
+        using BaseClass::resolution_pyramid_levels_;
+        using BaseClass::resolution_pyramid_downsample_ratio_;
+        using BaseClass::resolution_pyramid_blurring_sigma_;
+        using BaseClass::boundary_handler_type_warper_;
+        using BaseClass::interp_type_warper_;
+        using BaseClass::boundary_handler_type_pyramid_construction_;
+        using BaseClass::interp_type_pyramid_construction_;
+        using BaseClass::dissimilarity_type_;
+        using BaseClass::solver_type_;
+
+        using BaseClass::dissimilarity_LocalCCR_sigmaArg_;
+        using BaseClass::dissimilarity_hist_num_bin_target_;
+        using BaseClass::dissimilarity_hist_num_bin_warpped_;
+        using BaseClass::dissimilarity_hist_pv_interpolation_;
+        using BaseClass::dissimilarity_hist_step_size_ignore_pixel_;
+
+        using BaseClass::dissimilarity_MI_betaArg_;
+
+        using BaseClass::gt_timer1_;
+        using BaseClass::gt_timer2_;
+        using BaseClass::gt_timer3_;
+        using BaseClass::performTiming_;
+        using BaseClass::gt_exporter_;
+        using BaseClass::debugFolder_;
+
+        /// verbose mode
+        bool verbose_;
+
+        /// deformation field transformation, defined in the world coordinate of target image
+        TransformationType* transform_;
+
+        /// solver
+        std::vector<SolverType*> solver_pyramid_;
+
+        /// for solver of every pyramid level
+
+        /// maximal number of iterations
+        std::vector<unsigned int> max_iter_num_pyramid_level_;
+
+        /// threshold for minimal dissimilarity changes
+        ValueType dissimilarity_thres_;
+
+        /// threshold for minimal parameter changes
+        ValueType parameter_thres_;
+
+        /// number of search division
+        std::vector<unsigned int> div_num_pyramid_level_;
+
+        /// step size for every parameter
+        std::vector< std::vector<ValueType> > step_size_para_pyramid_level_;
+
+        /// step size division ratio
+        /// step_size_para_ = step_size_para_ .* step_size_div_para_ to reduce search step size
+        std::vector< std::vector<ValueType> > step_size_div_para_pyramid_level_;
+
+    protected:
+
+        using BaseClass::target_;
+        using BaseClass::source_;
+        using BaseClass::bg_value_;
+        using BaseClass::target_pyramid_;
+        using BaseClass::source_pyramid_;
+        using BaseClass::target_bh_warper_;
+        using BaseClass::target_interp_warper_;
+        using BaseClass::source_bh_warper_;
+        using BaseClass::source_interp_warper_;
+        using BaseClass::target_bh_pyramid_construction_;
+        using BaseClass::target_interp_pyramid_construction_;
+        using BaseClass::source_bh_pyramid_construction_;
+        using BaseClass::source_interp_pyramid_construction_;
+        using BaseClass::warper_pyramid_;
+        using BaseClass::dissimilarity_pyramid_;
+        using BaseClass::warper_pyramid_inverse_;
+        using BaseClass::dissimilarity_pyramid_inverse_;
+
+        bool preset_transform_;
+    };
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    hoImageRegParametricRegister<ValueType, CoordType, DIn, DOut>::
+    hoImageRegParametricRegister(unsigned int resolution_pyramid_levels, bool use_world_coordinates, ValueType bg_value) : BaseClass(resolution_pyramid_levels, bg_value), verbose_(false), transform_(NULL), preset_transform_(false)
+    {
+        GADGET_CHECK_THROW(this->setDefaultParameters(resolution_pyramid_levels, use_world_coordinates));
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    hoImageRegParametricRegister<ValueType, CoordType, DIn, DOut>::~hoImageRegParametricRegister()
+    {
+
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    bool hoImageRegParametricRegister<ValueType, CoordType, DIn, DOut>::setDefaultParameters(unsigned int resolution_pyramid_levels, bool use_world_coordinates)
+    {
+        use_world_coordinates_ = use_world_coordinates;
+        resolution_pyramid_levels_ = resolution_pyramid_levels;
+
+        resolution_pyramid_downsample_ratio_.clear();
+        resolution_pyramid_downsample_ratio_.resize(resolution_pyramid_levels_-1, std::vector<float>(std::max(DIn, DOut), 2.0) );
+
+        resolution_pyramid_blurring_sigma_.clear();
+        resolution_pyramid_blurring_sigma_.resize(resolution_pyramid_levels_, std::vector<float>(std::max(DIn, DOut), 0.0) );
+
+        boundary_handler_type_warper_.clear();
+        boundary_handler_type_warper_.resize(resolution_pyramid_levels_, GT_BOUNDARY_CONDITION_FIXEDVALUE);
+
+        interp_type_warper_.clear();
+        interp_type_warper_.resize(resolution_pyramid_levels_, GT_IMAGE_INTERPOLATOR_LINEAR);
+
+        boundary_handler_type_pyramid_construction_ = GT_BOUNDARY_CONDITION_BORDERVALUE;
+        interp_type_pyramid_construction_ = GT_IMAGE_INTERPOLATOR_LINEAR;
+
+        dissimilarity_type_.clear();
+        dissimilarity_type_.resize(resolution_pyramid_levels_, GT_IMAGE_DISSIMILARITY_NMI);
+
+        solver_type_.clear();
+        solver_type_.resize(resolution_pyramid_levels_, GT_IMAGE_REG_SOLVER_DOWNHILL);
+
+        max_iter_num_pyramid_level_.clear();
+        max_iter_num_pyramid_level_.resize(resolution_pyramid_levels_, 100);
+
+        dissimilarity_thres_ = 1e-6;
+
+        div_num_pyramid_level_.clear();
+        div_num_pyramid_level_.resize(resolution_pyramid_levels_, 5);
+
+        step_size_para_pyramid_level_.clear();
+        step_size_para_pyramid_level_.resize(resolution_pyramid_levels_);
+
+        step_size_div_para_pyramid_level_.clear();
+        step_size_div_para_pyramid_level_.resize(resolution_pyramid_levels_);
+
+        size_t maxParaNum = 4096;
+
+        step_size_para_pyramid_level_[resolution_pyramid_levels_-1].resize(maxParaNum, 3.2);
+        step_size_div_para_pyramid_level_[resolution_pyramid_levels_-1].resize(maxParaNum, 0.5);
+
+        int ii;
+        unsigned int jj;
+        for ( ii=(int)resolution_pyramid_levels_-2; ii>=0; ii-- )
+        {
+            step_size_div_para_pyramid_level_[ii].resize(maxParaNum, 0.5);
+            step_size_para_pyramid_level_[ii].resize(maxParaNum);
+
+            for ( jj=0; jj<maxParaNum; jj++ )
+            {
+                step_size_para_pyramid_level_[ii][jj] = step_size_div_para_pyramid_level_[ii][jj]*step_size_para_pyramid_level_[ii+1][jj];
+            }
+        }
+
+        verbose_ = false;
+
+        return true;
+    }
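
As a check of the step-size cascade computed above, here is a minimal standalone sketch (plain C++, no Gadgetron dependencies; the values 3.2 and 0.5 mirror the defaults set in this function, and the four-level depth is only illustrative) that prints the per-level search step sizes:

    #include <cstdio>
    #include <vector>

    int main()
    {
        const unsigned int levels = 4;              // illustrative pyramid depth
        std::vector<double> step(levels);
        std::vector<double> div_ratio(levels, 0.5); // step_size_div_para_ default

        step[levels - 1] = 3.2;                     // coarsest-level default

        // finer levels reuse the coarser step scaled by the division ratio,
        // exactly as in the loop of setDefaultParameters() above
        for (int ii = (int)levels - 2; ii >= 0; ii--)
        {
            step[ii] = div_ratio[ii] * step[ii + 1];
        }

        for (unsigned int ii = 0; ii < levels; ii++)
        {
            std::printf("level %u : step size %.3f\n", ii, step[ii]);
        }

        return 0;
    }

With these defaults a four-level pyramid searches with steps 3.2, 1.6, 0.8 and 0.4 from coarsest to finest (0.4 = 3.2 * 0.5^3).
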
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    bool hoImageRegParametricRegister<ValueType, CoordType, DIn, DOut>::initialize()
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(BaseClass::initialize());
+
+            GADGET_CHECK_RETURN_FALSE( transform_ != NULL );
+
+            warper_pyramid_.resize(resolution_pyramid_levels_);
+            solver_pyramid_.resize(resolution_pyramid_levels_);
+
+            unsigned int ii, jj;
+            for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+            {
+                warper_pyramid_[ii].setTransformation(*transform_);
+                warper_pyramid_[ii].setInterpolator( *source_interp_warper_[ii] );
+                warper_pyramid_[ii].setBackgroundValue(bg_value_);
+                warper_pyramid_[ii].debugFolder_ = this->debugFolder_;
+
+                solver_pyramid_[ii] = this->createParametricSolver(solver_type_[ii], ii);
+
+                solver_pyramid_[ii]->setTransform(*transform_);
+
+                solver_pyramid_[ii]->verbose_ = verbose_;
+                solver_pyramid_[ii]->debugFolder_ = this->debugFolder_;
+
+                solver_pyramid_[ii]->setTarget(target_pyramid_[ii]);
+                solver_pyramid_[ii]->setSource(source_pyramid_[ii]);
+                solver_pyramid_[ii]->setDissimilarity(*dissimilarity_pyramid_[ii]);
+                solver_pyramid_[ii]->setWarper(warper_pyramid_[ii]);
+                solver_pyramid_[ii]->setInterpolator(*source_interp_warper_[ii]);
+                solver_pyramid_[ii]->setBackgroundValue(bg_value_);
+                solver_pyramid_[ii]->setUseWorldCoordinate(use_world_coordinates_);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationFieldRegister<ValueType, CoordType, D>::initialize() ... ");
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    bool hoImageRegParametricRegister<ValueType, CoordType, DIn, DOut>::performRegistration()
+    {
+        try
+        {
+            // start from the coarsest level
+
+            if ( verbose_ )
+            {
+                GDEBUG_STREAM("Initial transformation : ");
+                transform_->print(std::cout);
+            }
+
+            int level;
+            for ( level=(int)resolution_pyramid_levels_-1; level>=0; level-- )
+            {
+                // GADGET_CHECK_RETURN_FALSE(solver_pyramid_[level].initialize());
+                GADGET_CHECK_RETURN_FALSE(solver_pyramid_[level]->solve());
+
+                if ( verbose_ )
+                {
+                    GDEBUG_STREAM("Transformation for level " << level << " : ");
+                    transform_->printTransform(std::cout);
+                }
+
+                // adjust transformation for the next resolution level
+                if ( level>0 )
+                {
+                    if ( !use_world_coordinates_ )
+                    {
+                        hoMatrix<ValueType> lowResI2W, highResI2W;
+                        source_pyramid_[level].image_to_world_matrix(lowResI2W);
+                        source_pyramid_[level-1].image_to_world_matrix(highResI2W);
+
+                        GADGET_CHECK_RETURN_FALSE(transform_->adjustForResolutionPyramid(lowResI2W, highResI2W));
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegParametricRegister<ValueType, CoordType, DIn, DOut>::performRegistration() ... ");
+        }
+
+        return true;
+    }
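
The adjustForResolutionPyramid() call above rescales the transform when stepping to the next finer level. Below is a minimal sketch of the underlying idea, assuming a pure translation, pixel (non-world) coordinates and the default downsample-by-2 pyramid; the real call works from the two image-to-world matrices and handles general parametric transforms, so this is illustrative only:

    #include <cstdio>

    int main()
    {
        // Pixel-space translation estimated at a coarse level (downsample-by-2 pyramid).
        double tx_coarse = 1.5, ty_coarse = -0.75;   // in coarse-level pixels
        double ratio = 2.0;                          // default per-level downsample ratio

        // The same physical shift spans 'ratio' times more pixels on the finer grid.
        double tx_fine = ratio * tx_coarse;
        double ty_fine = ratio * ty_coarse;

        std::printf("coarse translation : (%.2f, %.2f) px\n", tx_coarse, ty_coarse);
        std::printf("fine   translation : (%.2f, %.2f) px\n", tx_fine, ty_fine);
        return 0;
    }
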
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    hoImageRegParametricSolver<ValueType, CoordType, DIn, DOut>* hoImageRegParametricRegister<ValueType, CoordType, DIn, DOut>::createParametricSolver(GT_IMAGE_REG_SOLVER v, unsigned int level)
+    {
+        SolverType* res = NULL;
+
+        unsigned int ii;
+
+        switch (v)
+        {
+            case GT_IMAGE_REG_SOLVER_DOWNHILL:
+                res = new hoImageRegParametricDownHillSolver<ValueType, CoordType, DIn, DOut>();
+                break;
+
+            case GT_IMAGE_REG_SOLVER_GRADIENT_DESCENT:
+                res = new hoImageRegParametricGradientDescentSolver<ValueType, CoordType, DIn, DOut>();
+                break;
+
+            default:
+                GERROR_STREAM("Unrecognized parametric solver type : " << v);
+        }
+
+        if ( res == NULL ) return NULL;
+
+        res->max_iter_num_ = max_iter_num_pyramid_level_[level];
+        res->dissimilarity_thres_ = dissimilarity_thres_;
+        res->parameter_thres_ = parameter_thres_;
+        res->div_num_ = div_num_pyramid_level_[level];
+        res->step_size_para_ = step_size_para_pyramid_level_[level];
+        res->step_size_div_para_ = step_size_div_para_pyramid_level_[level];
+
+        return res;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    void hoImageRegParametricRegister<ValueType, CoordType, DIn, DOut>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "--------------Gagdgetron parametric image register -------------" << endl;
+        BaseClass::printContent(os);
+
+        unsigned int ii, jj;
+
+        os << "------------" << std::endl;
+        os << "Maximal iteration number for every pyramid level is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - " 
+                << max_iter_num_pyramid_level_[ii] << std::endl;
+        }
+
+        os << "------------" << std::endl;
+        os << "Threshold for dissimilarity for every pyramid level is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - " 
+                << dissimilarity_thres_ << std::endl;
+        }
+
+        os << "------------" << std::endl;
+        os << "Number of search size division for every pyramid level is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - " 
+                << div_num_pyramid_level_[ii] << std::endl;
+        }
+        os << "--------------------------------------------------------------------" << endl << ends;
+    }
+}
diff --git a/toolboxes/registration/optical_flow/cpu/register/hoImageRegRegister.h b/toolboxes/registration/optical_flow/cpu/register/hoImageRegRegister.h
new file mode 100644
index 0000000..929de6d
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/register/hoImageRegRegister.h
@@ -0,0 +1,651 @@
+/** \file   hoImageRegRegister.h
+    \brief  Define the class to perform image registration in gadgetron
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoNDArray.h"
+#include "hoNDImage.h"
+#include "hoNDInterpolator.h"
+#include "hoNDBoundaryHandler.h"
+#include "hoMatrix.h"
+#include "hoNDArray_utils.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDImage_util.h"
+#include "gtPlusISMRMRDReconUtil.h"
+
+// transformation
+#include "hoImageRegTransformation.h"
+#include "hoImageRegParametricTransformation.h"
+#include "hoImageRegTransformation.h"
+#include "hoImageRegHomogenousTransformation.h"
+#include "hoImageRegRigid2DTransformation.h"
+#include "hoImageRegRigid3DTransformation.h"
+
+// warper
+#include "hoImageRegWarper.h"
+
+// solver
+#include "hoImageRegDeformationFieldSolver.h"
+#include "hoImageRegParametricSolver.h"
+#include "hoImageRegDeformationFieldBidirectionalSolver.h"
+#include "hoImageRegParametricDownHillSolver.h"
+#include "hoImageRegParametricGradientDescentSolver.h"
+
+// dissimilarity
+#include "hoImageRegDissimilaritySSD.h"
+#include "hoImageRegDissimilarityLocalCCR.h"
+#include "hoImageRegDissimilarityMutualInformation.h"
+#include "hoImageRegDissimilarityNormalizedMutualInformation.h"
+#include "GtPrepUtil.h"
+
+namespace Gadgetron
+{
+    /// perform image registration using a multi-resolution pyramid scheme
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    class hoImageRegRegister
+    {
+    public:
+
+        typedef hoImageRegRegister<ValueType, CoordType, DIn, DOut> Self;
+
+        typedef hoNDImage<ValueType, DOut> TargetType;
+        typedef hoNDImage<ValueType, DIn> SourceType;
+
+        typedef hoNDImage<ValueType, 2> Target2DType;
+        typedef Target2DType Source2DType;
+
+        typedef hoNDImage<ValueType, 3> Target3DType;
+        typedef Target3DType Source3DType;
+
+        typedef ValueType T;
+        typedef ValueType element_type;
+        typedef ValueType value_type;
+
+        typedef CoordType coord_type;
+
+        /// boundary handler and interpolator for target image
+        typedef hoNDBoundaryHandler<TargetType> BoundaryHandlerTargetType;
+        typedef hoNDBoundaryHandlerFixedValue<TargetType> BoundaryHandlerTargetFixedValueType;
+        typedef hoNDBoundaryHandlerBorderValue<TargetType> BoundaryHandlerTargetBorderValueType;
+        typedef hoNDBoundaryHandlerPeriodic<TargetType> BoundaryHandlerTargetPeriodicType;
+        typedef hoNDBoundaryHandlerMirror<TargetType> BoundaryHandlerTargetMirrorType;
+
+        typedef hoNDInterpolator<TargetType> InterpTargetType;
+        typedef hoNDInterpolatorLinear<TargetType> InterpTargetLinearType;
+        typedef hoNDInterpolatorNearestNeighbor<TargetType> InterpTargetNearestNeighborType;
+        typedef hoNDInterpolatorBSpline<TargetType, DIn> InterpTargetBSplineType;
+
+        /// boundary handler and interpolator for source image
+        typedef hoNDBoundaryHandler<SourceType> BoundaryHandlerSourceType;
+        typedef hoNDBoundaryHandlerFixedValue<SourceType> BoundaryHandlerSourceFixedValueType;
+        typedef hoNDBoundaryHandlerBorderValue<SourceType> BoundaryHandlerSourceBorderValueType;
+        typedef hoNDBoundaryHandlerPeriodic<SourceType> BoundaryHandlerSourcePeriodicType;
+        typedef hoNDBoundaryHandlerMirror<SourceType> BoundaryHandlerSourceMirrorType;
+
+        typedef hoNDInterpolator<SourceType> InterpSourceType;
+        typedef hoNDInterpolatorLinear<SourceType> InterpSourceLinearType;
+        typedef hoNDInterpolatorNearestNeighbor<SourceType> InterpSourceNearestNeighborType;
+        typedef hoNDInterpolatorBSpline<SourceType, DIn> InterpSourceBSplineType;
+
+        /// warper type
+        typedef hoImageRegWarper<ValueType, CoordType, DIn, DOut> WarperType;
+
+        /// image dissimilarity type
+        typedef hoImageRegDissimilarity<ValueType, DOut> DissimilarityType;
+
+        hoImageRegRegister(unsigned int resolution_pyramid_levels=3, ValueType bg_value=ValueType(0));
+        virtual ~hoImageRegRegister();
+
+        /// initialize the registration
+        /// should be called after all images and parameters of registration are set
+        virtual bool initialize();
+
+        /// set target and source, create the multi-resolution pyramid and set up the interpolators
+        virtual void setTarget(TargetType& target);
+        virtual void setSource(SourceType& source);
+
+        /// create dissimilarity measures
+        DissimilarityType* createDissimilarity(GT_IMAGE_DISSIMILARITY v, unsigned int level);
+
+        /// perform the registration
+        virtual bool performRegistration() = 0;
+
+        /// print the class information
+        virtual void printContent(std::ostream& os) const;
+        virtual void print(std::ostream& os) const;
+
+        /// parameters
+
+        /// whether to perform world coordinate registration
+        bool use_world_coordinates_;
+
+        /// number of resolution pyramid levels
+        unsigned int resolution_pyramid_levels_;
+
+        /// use fast pyramid creation by dividing the image size by 2
+        /// if use_world_coordinates_ == true and resolution_pyramid_divided_by_2_ == true, resolution_pyramid_downsample_ratio_
+        /// and resolution_pyramid_blurring_sigma_ will be ignored
+        bool resolution_pyramid_divided_by_2_;
+
+        /// downsample ratio of the resolution pyramid for every dimension and every level
+        /// e.g. ratio=2 halves the image size along that dimension
+        std::vector< std::vector<float> > resolution_pyramid_downsample_ratio_;
+
+        /// extra gaussian blurring can be applied on every resolution pyramid
+        /// if use_world_coordinates_=true, sigma is in the unit of world coordinate
+        /// otherwise, it is in the unit of image pixel
+        std::vector< std::vector<float> > resolution_pyramid_blurring_sigma_;
+
+        /// boundary handler and interpolator type for the warper; a different interpolator can be used for every resolution level
+        std::vector<GT_BOUNDARY_CONDITION> boundary_handler_type_warper_;
+        std::vector<GT_IMAGE_INTERPOLATOR> interp_type_warper_;
+
+        /// boundary handler and interpolator type for pyramid construction
+        GT_BOUNDARY_CONDITION boundary_handler_type_pyramid_construction_;
+        GT_IMAGE_INTERPOLATOR interp_type_pyramid_construction_;
+
+        /// image dissimilarity
+        /// a different dissimilarity can be used for every pyramid level
+        std::vector<GT_IMAGE_DISSIMILARITY> dissimilarity_type_;
+
+        /// solver for every pyramid level
+        std::vector<GT_IMAGE_REG_SOLVER> solver_type_;
+
+        ///// whether to set the origin of target/source to image center
+        //bool orgin_at_image_center_;
+
+        /// parameters for dissimilarity measures, for every pyramid level
+        /// LocalCCR
+        std::vector<std::vector<ValueType> > dissimilarity_LocalCCR_sigmaArg_;
+
+        /// Histogram based
+        std::vector<unsigned int> dissimilarity_hist_num_bin_target_;
+        std::vector<unsigned int> dissimilarity_hist_num_bin_warpped_;
+        bool dissimilarity_hist_pv_interpolation_;
+        std::vector<size_t> dissimilarity_hist_step_size_ignore_pixel_;
+
+        /// Mutual information
+        std::vector<ValueType> dissimilarity_MI_betaArg_;
+
+        // ----------------------------------
+        // debug and timing
+        // ----------------------------------
+        // clock for timing
+        Gadgetron::GadgetronTimer gt_timer1_;
+        Gadgetron::GadgetronTimer gt_timer2_;
+        Gadgetron::GadgetronTimer gt_timer3_;
+
+        bool performTiming_;
+
+        // exporter
+        Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+        // debug folder
+        std::string debugFolder_;
+
+    protected:
+
+        TargetType* target_;
+        SourceType* source_;
+
+        /// background value, used to mark regions in the target image which will not be warped
+        ValueType bg_value_;
+
+        /// store the multi-resolution images for every pyramid level
+        std::vector<TargetType> target_pyramid_;
+        std::vector<TargetType> source_pyramid_;
+
+        /// store the boundary handler and interpolator for warpers
+        std::vector<BoundaryHandlerTargetType*> target_bh_warper_;
+        std::vector<InterpTargetType*> target_interp_warper_;
+
+        std::vector<BoundaryHandlerSourceType*> source_bh_warper_;
+        std::vector<InterpSourceType*> source_interp_warper_;
+
+        /// store the boundary handler and interpolator for pyramid construction
+        BoundaryHandlerTargetType* target_bh_pyramid_construction_;
+        InterpTargetType* target_interp_pyramid_construction_;
+
+        BoundaryHandlerSourceType* source_bh_pyramid_construction_;
+        InterpSourceType* source_interp_pyramid_construction_;
+
+        /// store warpers for every pyramid level
+        std::vector<WarperType> warper_pyramid_;
+
+        /// store the image dissimilarity for every pyramid level
+        std::vector<DissimilarityType*> dissimilarity_pyramid_;
+
+        /// store warpers for every pyramid level
+        std::vector<WarperType> warper_pyramid_inverse_;
+
+        /// store the image dissimilarity for every pyramid level
+        std::vector<DissimilarityType*> dissimilarity_pyramid_inverse_;
+    };
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    hoImageRegRegister<ValueType, CoordType, DIn, DOut>::
+    hoImageRegRegister(unsigned int resolution_pyramid_levels, ValueType bg_value) 
+    : target_(NULL), source_(NULL), bg_value_(bg_value), performTiming_(false)
+    {
+        gt_timer1_.set_timing_in_destruction(false);
+        gt_timer2_.set_timing_in_destruction(false);
+        gt_timer3_.set_timing_in_destruction(false);
+
+        use_world_coordinates_ = true;
+
+        resolution_pyramid_levels_ = resolution_pyramid_levels;
+
+        resolution_pyramid_divided_by_2_ = true;
+
+        resolution_pyramid_downsample_ratio_.resize(resolution_pyramid_levels_-1, std::vector<float>(DIn, 2.0f) );
+
+        resolution_pyramid_blurring_sigma_.resize(resolution_pyramid_levels_, std::vector<float>(DIn, 0.0f));
+
+        boundary_handler_type_warper_.resize(resolution_pyramid_levels_, GT_BOUNDARY_CONDITION_FIXEDVALUE);
+        interp_type_warper_.resize(resolution_pyramid_levels_, GT_IMAGE_INTERPOLATOR_LINEAR);
+
+        boundary_handler_type_pyramid_construction_ = GT_BOUNDARY_CONDITION_BORDERVALUE;
+        interp_type_pyramid_construction_ = GT_IMAGE_INTERPOLATOR_LINEAR;
+
+        dissimilarity_type_.resize(resolution_pyramid_levels_, GT_IMAGE_DISSIMILARITY_NMI);
+
+        solver_type_.resize(resolution_pyramid_levels_, GT_IMAGE_REG_SOLVER_DOWNHILL);
+
+        target_bh_warper_.resize(resolution_pyramid_levels_, NULL);
+        target_interp_warper_.resize(resolution_pyramid_levels_, NULL);
+
+        source_bh_warper_.resize(resolution_pyramid_levels_, NULL);
+        source_interp_warper_.resize(resolution_pyramid_levels_, NULL);
+
+        target_bh_pyramid_construction_ = NULL;
+        target_interp_pyramid_construction_ = NULL;
+
+        source_bh_pyramid_construction_ = NULL;
+        source_interp_pyramid_construction_ = NULL;
+
+        dissimilarity_pyramid_.resize(resolution_pyramid_levels_, NULL);
+        dissimilarity_pyramid_inverse_.resize(resolution_pyramid_levels_, NULL);
+
+        dissimilarity_LocalCCR_sigmaArg_.resize(resolution_pyramid_levels_, std::vector<ValueType>(DOut, 2.0) );
+
+        dissimilarity_hist_num_bin_target_.resize(resolution_pyramid_levels_, 64);
+        dissimilarity_hist_num_bin_warpped_.resize(resolution_pyramid_levels_, 64);
+        dissimilarity_hist_pv_interpolation_ = false;
+        dissimilarity_hist_step_size_ignore_pixel_.resize(resolution_pyramid_levels_, 1);
+
+        dissimilarity_MI_betaArg_.resize(resolution_pyramid_levels_, 2.0);
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    hoImageRegRegister<ValueType, CoordType, DIn, DOut>::~hoImageRegRegister()
+    {
+        unsigned int ii;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            delete target_bh_warper_[ii];
+            delete target_interp_warper_[ii];
+
+            delete source_bh_warper_[ii];
+            delete source_interp_warper_[ii];
+
+            delete dissimilarity_pyramid_[ii];
+            delete dissimilarity_pyramid_inverse_[ii];
+        }
+
+        delete target_bh_pyramid_construction_;
+        delete target_interp_pyramid_construction_;
+
+        delete source_bh_pyramid_construction_;
+        delete source_interp_pyramid_construction_;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    hoImageRegDissimilarity<ValueType, DOut>* hoImageRegRegister<ValueType, CoordType, DIn, DOut>::createDissimilarity(GT_IMAGE_DISSIMILARITY v, unsigned int level)
+    {
+        hoImageRegDissimilarity<ValueType, DOut>* res = NULL;
+
+        unsigned int ii;
+
+        switch (v)
+        {
+            case GT_IMAGE_DISSIMILARITY_SSD:
+                res = new hoImageRegDissimilaritySSD<ValueType, DOut>();
+                break;
+
+            case GT_IMAGE_DISSIMILARITY_LocalCCR:
+            {
+                hoImageRegDissimilarityLocalCCR<ValueType, DOut>* ptr = new hoImageRegDissimilarityLocalCCR<ValueType, DOut>();
+                for ( ii=0; ii<DOut; ii++ )
+                {
+                    ptr->sigmaArg_[ii] = dissimilarity_LocalCCR_sigmaArg_[level][ii];
+                }
+
+                res = ptr;
+            }
+                break;
+
+            case GT_IMAGE_DISSIMILARITY_MI:
+            {
+                hoImageRegDissimilarityMutualInformation<ValueType, DOut>* ptr = new hoImageRegDissimilarityMutualInformation<ValueType, DOut>();
+
+                ptr->betaArg_[0] = dissimilarity_MI_betaArg_[level];
+                ptr->betaArg_[1] = dissimilarity_MI_betaArg_[level];
+                ptr->num_bin_target_ = dissimilarity_hist_num_bin_target_[level];
+                ptr->num_bin_warpped_ = dissimilarity_hist_num_bin_warpped_[level];
+                ptr->pv_interpolation_ = dissimilarity_hist_pv_interpolation_;
+                ptr->step_size_ignore_pixel_ = dissimilarity_hist_step_size_ignore_pixel_[level];
+
+                res = ptr;
+            }
+                break;
+
+            case GT_IMAGE_DISSIMILARITY_NMI:
+            {
+                hoImageRegDissimilarityNormalizedMutualInformation<ValueType, DOut>* ptr = new hoImageRegDissimilarityNormalizedMutualInformation<ValueType, DOut>();
+
+                ptr->num_bin_target_ = dissimilarity_hist_num_bin_target_[level];
+                ptr->num_bin_warpped_ = dissimilarity_hist_num_bin_warpped_[level];
+                ptr->pv_interpolation_ = dissimilarity_hist_pv_interpolation_;
+                ptr->step_size_ignore_pixel_ = dissimilarity_hist_step_size_ignore_pixel_[level];
+
+                res = ptr;
+            }
+                break;
+
+            default:
+                GERROR_STREAM("Unrecognized image dissimilarity type : " << v);
+        }
+
+        if ( res == NULL ) return NULL;
+
+        res->setBackgroundValue(bg_value_);
+
+        return res;
+    }
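
Of the dissimilarity measures instantiated above, SSD is the simplest. The following standalone sketch assumes SSD here means the plain sum of squared intensity differences between the target and the warped source over all pixels; the Gadgetron classes add background handling and derivative computation on top of this:

    #include <cstdio>
    #include <vector>

    // sum of squared differences between two equally sized images
    static double ssd(const std::vector<double>& target, const std::vector<double>& warped)
    {
        double s = 0.0;
        for (size_t i = 0; i < target.size(); i++)
        {
            double d = target[i] - warped[i];
            s += d * d;
        }
        return s;
    }

    int main()
    {
        std::vector<double> target = { 10, 20, 30, 40 };
        std::vector<double> warped = { 11, 18, 33, 40 };
        std::printf("SSD = %.1f\n", ssd(target, warped));   // 1 + 4 + 9 + 0 = 14
        return 0;
    }
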
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    bool hoImageRegRegister<ValueType, CoordType, DIn, DOut>::initialize()
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(target_!=NULL);
+            GADGET_CHECK_RETURN_FALSE(source_!=NULL);
+
+            GADGET_CHECK_RETURN_FALSE(resolution_pyramid_downsample_ratio_.size()==resolution_pyramid_levels_-1);
+            GADGET_CHECK_RETURN_FALSE(resolution_pyramid_blurring_sigma_.size()==resolution_pyramid_levels_);
+
+            GADGET_CHECK_RETURN_FALSE(boundary_handler_type_warper_.size()==resolution_pyramid_levels_);
+            GADGET_CHECK_RETURN_FALSE(interp_type_warper_.size()==resolution_pyramid_levels_);
+
+            GADGET_CHECK_RETURN_FALSE(dissimilarity_type_.size()==resolution_pyramid_levels_);
+            GADGET_CHECK_RETURN_FALSE(solver_type_.size()==resolution_pyramid_levels_);
+
+            target_pyramid_.resize(resolution_pyramid_levels_);
+            source_pyramid_.resize(resolution_pyramid_levels_);
+
+            target_pyramid_[0] = *target_;
+            source_pyramid_[0] = *source_;
+
+            target_bh_pyramid_construction_ = createBoundaryHandler<TargetType>(boundary_handler_type_pyramid_construction_);
+            target_interp_pyramid_construction_ = createInterpolator<TargetType, DOut>(interp_type_pyramid_construction_);
+            target_interp_pyramid_construction_->setBoundaryHandler(*target_bh_pyramid_construction_);
+
+            source_bh_pyramid_construction_ = createBoundaryHandler<SourceType>(boundary_handler_type_pyramid_construction_);
+            source_interp_pyramid_construction_ = createInterpolator<SourceType, DIn>(interp_type_pyramid_construction_);
+            source_interp_pyramid_construction_->setBoundaryHandler(*source_bh_pyramid_construction_);
+
+            /// allocate all objects
+            unsigned int ii, jj;
+            for ( ii=0; ii<resolution_pyramid_levels_-1; ii++ )
+            {
+                // create pyramid
+                target_bh_pyramid_construction_->setArray(target_pyramid_[ii]);
+                target_interp_pyramid_construction_->setArray(target_pyramid_[ii]);
+
+                if ( use_world_coordinates_ )
+                {
+                    if ( resolution_pyramid_divided_by_2_ )
+                    {
+                        Gadgetron::downsampleImageBy2WithAveraging(target_pyramid_[ii], *target_bh_pyramid_construction_, target_pyramid_[ii+1]);
+                    }
+                    else
+                    {
+                        std::vector<float> ratio = resolution_pyramid_downsample_ratio_[ii];
+                        Gadgetron::downsampleImage(target_pyramid_[ii], *target_interp_pyramid_construction_, target_pyramid_[ii+1], &ratio[0]);
+
+                        std::vector<float> sigma = resolution_pyramid_blurring_sigma_[ii+1];
+                        for ( jj=0; jj<DOut; jj++ )
+                        {
+                            sigma[jj] /= target_pyramid_[ii+1].get_pixel_size(jj); // world to pixel
+                        }
+
+                        Gadgetron::filterGaussian(target_pyramid_[ii+1], &sigma[0]);
+                    }
+                }
+                else
+                {
+                    std::vector<float> ratio = resolution_pyramid_downsample_ratio_[ii];
+
+                    bool downsampledBy2 = true;
+                    for ( jj=0; jj<DOut; jj++ )
+                    {
+                        if ( std::abs(ratio[jj]-2.0f) > FLT_EPSILON )
+                        {
+                            downsampledBy2 = false;
+                            break;
+                        }
+                    }
+
+                    if ( downsampledBy2 )
+                    {
+                        Gadgetron::downsampleImageBy2WithAveraging(target_pyramid_[ii], *target_bh_pyramid_construction_, target_pyramid_[ii+1]);
+                        // Gadgetron::downsampleImage(target_pyramid_[ii], *target_interp_pyramid_construction_, target_pyramid_[ii+1], &ratio[0]);
+                    }
+                    else
+                    {
+                        Gadgetron::downsampleImage(target_pyramid_[ii], *target_interp_pyramid_construction_, target_pyramid_[ii+1], &ratio[0]);
+                        std::vector<float> sigma = resolution_pyramid_blurring_sigma_[ii+1];
+                        Gadgetron::filterGaussian(target_pyramid_[ii+1], &sigma[0]);
+                    }
+                }
+
+                // source
+
+                source_bh_pyramid_construction_->setArray(source_pyramid_[ii]);
+                source_interp_pyramid_construction_->setArray(source_pyramid_[ii]);
+
+                if ( use_world_coordinates_ )
+                {
+                    if ( resolution_pyramid_divided_by_2_ )
+                    {
+                        Gadgetron::downsampleImageBy2WithAveraging(source_pyramid_[ii], *source_bh_pyramid_construction_, source_pyramid_[ii+1]);
+                    }
+                    else
+                    {
+                        std::vector<float> ratio = resolution_pyramid_downsample_ratio_[ii];
+                        Gadgetron::downsampleImage(source_pyramid_[ii], *source_interp_pyramid_construction_, source_pyramid_[ii+1], &ratio[0]);
+
+                        std::vector<float> sigma = resolution_pyramid_blurring_sigma_[ii+1];
+                        for ( jj=0; jj<DOut; jj++ )
+                        {
+                            sigma[jj] /= source_pyramid_[ii+1].get_pixel_size(jj); // world to pixel
+                        }
+
+                        Gadgetron::filterGaussian(source_pyramid_[ii+1], &sigma[0]);
+                    }
+                }
+                else
+                {
+                    std::vector<float> ratio = resolution_pyramid_downsample_ratio_[ii];
+
+                    bool downsampledBy2 = true;
+                    for ( jj=0; jj<DOut; jj++ )
+                    {
+                        if ( std::abs(ratio[jj]-2.0f) > FLT_EPSILON )
+                        {
+                            downsampledBy2 = false;
+                            break;
+                        }
+                    }
+
+                    if ( downsampledBy2 )
+                    {
+                        Gadgetron::downsampleImageBy2WithAveraging(source_pyramid_[ii], *source_bh_pyramid_construction_, source_pyramid_[ii+1]);
+                        //Gadgetron::downsampleImage(source_pyramid_[ii], *source_interp_pyramid_construction_, source_pyramid_[ii+1], &ratio[0]);
+                    }
+                    else
+                    {
+                        Gadgetron::downsampleImage(source_pyramid_[ii], *source_interp_pyramid_construction_, source_pyramid_[ii+1], &ratio[0]);
+                        std::vector<float> sigma = resolution_pyramid_blurring_sigma_[ii+1];
+                        Gadgetron::filterGaussian(source_pyramid_[ii+1], &sigma[0]);
+                    }
+                }
+            }
+
+            for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+            {
+                target_bh_warper_[ii] = createBoundaryHandler<TargetType>(boundary_handler_type_warper_[ii]);
+                target_bh_warper_[ii]->setArray(target_pyramid_[ii]);
+
+                target_interp_warper_[ii] = createInterpolator<TargetType, DOut>(interp_type_warper_[ii]);
+                target_interp_warper_[ii]->setArray(target_pyramid_[ii]);
+                target_interp_warper_[ii]->setBoundaryHandler(*target_bh_warper_[ii]);
+
+                source_bh_warper_[ii] = createBoundaryHandler<SourceType>(boundary_handler_type_warper_[ii]);
+                source_bh_warper_[ii]->setArray(source_pyramid_[ii]);
+
+                source_interp_warper_[ii] = createInterpolator<SourceType, DIn>(interp_type_warper_[ii]);
+                source_interp_warper_[ii]->setArray(source_pyramid_[ii]);
+                source_interp_warper_[ii]->setBoundaryHandler(*source_bh_warper_[ii]);
+
+                dissimilarity_pyramid_[ii] = createDissimilarity(dissimilarity_type_[ii], ii);
+                dissimilarity_pyramid_[ii]->initialize(target_pyramid_[ii]);
+                dissimilarity_pyramid_[ii]->debugFolder_ = this->debugFolder_;
+
+                dissimilarity_pyramid_inverse_[ii] = createDissimilarity(dissimilarity_type_[ii], ii);
+                dissimilarity_pyramid_inverse_[ii]->initialize(source_pyramid_[ii]);
+                dissimilarity_pyramid_inverse_[ii]->debugFolder_ = this->debugFolder_;
+            }
+
+            if ( !debugFolder_.empty() )
+            {
+                for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+                {
+                    std::ostringstream ostr_t;
+                    ostr_t << "target_" << ii;
+
+                    gt_exporter_.exportImage(target_pyramid_[ii], debugFolder_+ostr_t.str());
+
+                    std::ostringstream ostr_s;
+                    ostr_s << "source_" << ii;
+
+                    gt_exporter_.exportImage(source_pyramid_[ii], debugFolder_+ostr_s.str());
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegRegister<ValueType, CoordType, DIn, DOut>::initialize() ... ");
+        }
+
+        return true;
+    }
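
A small sketch of the bookkeeping done while building the pyramid above, assuming a downsample-by-2 pyramid with illustrative matrix and pixel sizes; it shows how a blurring sigma given in world units (e.g. mm) is converted to pixel units by dividing by the pixel size, as in the world-coordinate filterGaussian branch of initialize():

    #include <cstdio>

    int main()
    {
        unsigned int levels = 3;
        unsigned int size = 256;          // finest-level matrix size (illustrative)
        double pixel_size = 1.0;          // finest-level pixel size in mm (illustrative)
        double sigma_world = 2.0;         // blurring sigma in mm (illustrative)

        for (unsigned int level = 0; level < levels; level++)
        {
            double sigma_pixel = sigma_world / pixel_size;   // world -> pixel, as in initialize()
            std::printf("level %u : %u pixels, pixel size %.1f mm, sigma %.2f px\n",
                        level, size, pixel_size, sigma_pixel);

            size /= 2;                    // downsample-by-2
            pixel_size *= 2.0;            // pixel size doubles accordingly
        }
        return 0;
    }
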
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    inline void hoImageRegRegister<ValueType, CoordType, DIn, DOut>::setTarget(TargetType& target)
+    {
+        target_ = &target;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    inline void hoImageRegRegister<ValueType, CoordType, DIn, DOut>::setSource(SourceType& source)
+    {
+        source_ = &source;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    void hoImageRegRegister<ValueType, CoordType, DIn, DOut>::printContent(std::ostream& os) const
+    {
+        using namespace std;
+        os << "Input dimension is : " << DIn << endl;
+        os << "Output dimension is : " << DOut << endl;
+
+        std::string elemTypeName = std::string(typeid(ValueType).name());
+        os << "Image data type is : " << elemTypeName << std::endl;
+
+        elemTypeName = std::string(typeid(CoordType).name());
+        os << "Transformation coordinate data type is : " << elemTypeName << std::endl;
+
+        os << "Whether to perform world coordinate registration is : " << use_world_coordinates_ << std::endl;
+        os << "Number of resolution pyramid levels is : " << resolution_pyramid_levels_ << std::endl;
+
+        os << "------------" << std::endl;
+        os << "Downsample ratio of the resolution pyramid for every dimension and every level is : " << std::endl;
+
+        unsigned int ii, jj;
+        for ( ii=0; ii<resolution_pyramid_levels_-1; ii++ )
+        {
+            os << "Level " << ii << " [ ";
+            for ( jj=0; jj<resolution_pyramid_downsample_ratio_[ii].size(); jj++ )
+            {
+                os << resolution_pyramid_downsample_ratio_[ii][jj] << " ";
+            }
+            os << " ] " << std::endl;
+        }
+
+        os << "------------" << std::endl;
+        os << "Gaussian blurring sigma for every dimension and every level is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << "Level " << ii << " [ ";
+            for ( jj=0; jj<resolution_pyramid_blurring_sigma_[ii].size(); jj++ )
+            {
+                os << resolution_pyramid_blurring_sigma_[ii][jj] << " ";
+            }
+            os << " ] " << std::endl;
+        }
+
+        os << "------------" << std::endl;
+        os << "Boundary handler and interpolator type for warper is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - " 
+                << getBoundaryHandlerName(boundary_handler_type_warper_[ii]) 
+                << " - " << getInterpolatorName(interp_type_warper_[ii]) << std::endl;
+        }
+
+        os << "------------" << std::endl;
+        os << "Boundary handler and interpolator type for pyramid construction is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - " 
+                << getBoundaryHandlerName(boundary_handler_type_pyramid_construction_) 
+                << " - " << getInterpolatorName(interp_type_pyramid_construction_) << std::endl;
+        }
+
+        os << "------------" << std::endl;
+        os << "Image dissimilarity is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - " 
+                << getDissimilarityName(dissimilarity_type_[ii]) << std::endl;
+        }
+
+        os << "------------" << std::endl;
+        os << "Image registration solver is : " << std::endl;
+        for ( ii=0; ii<resolution_pyramid_levels_; ii++ )
+        {
+            os << " Level " << ii << " - " 
+                << getImageRegSolverName(solver_type_[ii]) << std::endl;
+        }
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    void hoImageRegRegister<ValueType, CoordType, DIn, DOut>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "--------------Gagdgetron image register -------------" << endl;
+        this->printContent(os);
+        os << "-----------------------------------------------------" << std::endl;
+    }
+}
diff --git a/toolboxes/registration/optical_flow/cpu/solver/hoImageRegDeformationFieldBidirectionalSolver.h b/toolboxes/registration/optical_flow/cpu/solver/hoImageRegDeformationFieldBidirectionalSolver.h
new file mode 100644
index 0000000..481e792
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/solver/hoImageRegDeformationFieldBidirectionalSolver.h
@@ -0,0 +1,602 @@
+/** \file   hoImageRegDeformationFieldBidirectionalSolver.h
+    \brief  Implement the PDE solver for bidirectional deformation field non-linear image registration
+
+            The PDE solver is a classical gradient descent method, derived from the calculus of variations:
+
+            [1] Gerardo Hermosillo, Christophe Chefd'Hotel, Olivier Faugeras. Variational Methods for Multimodal Image Matching. 
+            International Journal of Computer Vision. December 2002, Volume 50, Issue 3, pp 329-343.
+            http://link.springer.com/article/10.1023%2FA%3A1020830525823
+
+            [2] Gerardo Hermosillo. Variational Methods for Multimodal Image Matching. PhD Thesis, UNIVERSITÉ DE NICE - SOPHIA ANTIPOLIS. May 2002.
+            http://webdocs.cs.ualberta.ca/~dana/readingMedIm/papers/hermosilloPhD.pdf
+
+            [3] Christophe Chefd'Hotel, Gerardo Hermosillo, Olivier D. Faugeras: Flows of diffeomorphisms for multimodal image registration. ISBI 2002: 753-756.
+            http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=1029367&tag=1
+
+            [4] Christophe Chefd'Hotel, Geometric Methods in Computer Vision and Image Processing : Contributions and Applications. PhD Thesis, April 2005.
+
+            The code is based on the source code listed on pages 185 - 187 of ref [2] and extended according to refs [3] and [4].
+
+            [5] Christoph Guetter, Hui Xue, Christophe Chefd'Hotel, Jens Guehring: Efficient symmetric and inverse-consistent deformable registration through interleaved optimization. ISBI 2011: 590-593.
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoImageRegDeformationFieldSolver.h"
+
+#ifdef max
+#undef max
+#endif // max
+
+#ifdef min
+#undef min
+#endif // min
+
+namespace Gadgetron
+{
+    /// ValueType: image pixel value type
+    /// CoordType: transformation data type
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    class hoImageRegDeformationFieldBidirectionalSolver : public hoImageRegDeformationFieldSolver<ValueType, CoordType, D>
+    {
+    public:
+
+        typedef hoImageRegDeformationFieldBidirectionalSolver<ValueType, CoordType, D> Self;
+        typedef hoImageRegDeformationFieldSolver<ValueType, CoordType, D> BaseClass;
+
+        typedef hoNDImage<ValueType, D> TargetType;
+        typedef hoNDImage<ValueType, D> SourceType;
+
+        typedef hoNDImage<ValueType, 2> Target2DType;
+        typedef Target2DType Source2DType;
+
+        typedef hoNDImage<ValueType, 3> Target3DType;
+        typedef Target3DType Source3DType;
+
+        typedef ValueType T;
+        typedef ValueType element_type;
+        typedef ValueType value_type;
+
+        typedef CoordType coord_type;
+
+        typedef typename BaseClass::InterpolatorType InterpolatorType;
+
+        typedef hoImageRegDeformationField<CoordType, D> TransformationType;
+        typedef typename TransformationType::input_point_type input_point_type;
+        typedef typename TransformationType::output_point_type output_point_type;
+        typedef typename TransformationType::jacobian_position_type jacobian_position_type;
+        typedef typename TransformationType::DeformationFieldType DeformationFieldType;
+
+        typedef typename BaseClass::ImageRegWarperType ImageRegWarperType;
+
+        typedef typename BaseClass::ImageRegDissimilarityType ImageRegDissimilarityType;
+
+        hoImageRegDeformationFieldBidirectionalSolver();
+        virtual ~hoImageRegDeformationFieldBidirectionalSolver();
+
+        void setTransform(TransformationType& transform) { transform_ = &transform; }
+        void setTransformInverse(TransformationType& transform) { transform_inverse_ = &transform; }
+
+        virtual bool initialize();
+
+        virtual bool solve();
+
+        virtual void print(std::ostream& os) const;
+
+        void setDissimilarityInverse(ImageRegDissimilarityType& dissimilarity) { dissimilarity_inverse_ = &dissimilarity; }
+        void setWarperInverse(ImageRegWarperType& warper) { warper_inverse_ = &warper; }
+        void setInterpolatorInverse(InterpolatorType& interp) { interp_inverse_ = &interp; }
+
+        virtual bool enforceInverseTransform(TransformationType* transform, TransformationType* transform_inverse, DeformationFieldType* deform_delta_inverse, unsigned int iter_num=10);
+
+        /// number of iterations to improve the estimation of the inverse transform
+        unsigned int inverse_deform_enforce_iter_;
+        /// weight to update the estimation of the inverse transform, must be within [0 1]
+        CoordType inverse_deform_enforce_weight_;
+
+        using BaseClass::regularization_hilbert_strength_;
+        using BaseClass::apply_in_FOV_constraint_;
+        using BaseClass::iter_num_;
+        using BaseClass::max_iter_num_;
+        using BaseClass::dissimilarity_thres_;
+        using BaseClass::parameter_thres_;
+        using BaseClass::div_num_;
+        using BaseClass::step_size_para_;
+        using BaseClass::step_size_div_para_;
+        using BaseClass::verbose_;
+        using BaseClass::gt_timer1_;
+        using BaseClass::gt_timer2_;
+        using BaseClass::gt_timer3_;
+        using BaseClass::performTiming_;
+        using BaseClass::gt_exporter_;
+        using BaseClass::debugFolder_;
+
+    protected:
+
+        using BaseClass::transform_;
+
+        using BaseClass::curr_dissimilarity_;
+        using BaseClass::prev_dissimilarity_;
+
+        using BaseClass::deform_delta_;
+        using BaseClass::deform_updated_;
+        using BaseClass::deform_norm_;
+        using BaseClass::deform_norm_one_dim_;
+        using BaseClass::gradient_warpped_;
+
+        using BaseClass::target_;
+        using BaseClass::source_;
+        using BaseClass::warpped_;
+        using BaseClass::bg_value_;
+        using BaseClass::interp_;
+        using BaseClass::warper_;
+        using BaseClass::dissimilarity_;
+        using BaseClass::use_world_coordinate_;
+        using BaseClass::deform_delta_scale_factor_;
+
+        /// for the inverse transformation
+
+        SourceType warpped_inverse_;
+
+        InterpolatorType* interp_inverse_;
+
+        ImageRegWarperType* warper_inverse_;
+
+        ImageRegDissimilarityType* dissimilarity_inverse_;
+
+        TransformationType* transform_inverse_;
+
+        ValueType curr_dissimilarity_inverse_;
+        ValueType prev_dissimilarity_inverse_;
+
+        DeformationFieldType deform_delta_inverse_[D];
+        DeformationFieldType deform_updated_inverse_[D];
+
+        DeformationFieldType deform_norm_inverse_;
+        DeformationFieldType deform_norm_one_dim_inverse_;
+
+        TargetType gradient_warpped_inverse_[D];
+
+        coord_type deform_delta_scale_factor_inverse_[D];
+    };
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    hoImageRegDeformationFieldBidirectionalSolver<ValueType, CoordType, D>::
+    hoImageRegDeformationFieldBidirectionalSolver() : BaseClass(), inverse_deform_enforce_iter_(10), inverse_deform_enforce_weight_(0.5)
+    {
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    hoImageRegDeformationFieldBidirectionalSolver<ValueType, CoordType, D>::~hoImageRegDeformationFieldBidirectionalSolver()
+    {
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    bool hoImageRegDeformationFieldBidirectionalSolver<ValueType, CoordType, D>::initialize()
+    {
+        GADGET_CHECK_RETURN_FALSE(interp_inverse_!=NULL);
+        GADGET_CHECK_RETURN_FALSE(warper_inverse_!=NULL);
+        GADGET_CHECK_RETURN_FALSE(dissimilarity_inverse_!=NULL);
+        GADGET_CHECK_RETURN_FALSE(transform_inverse_!=NULL);
+
+        GADGET_CHECK_RETURN_FALSE(BaseClass::initialize());
+
+        warper_inverse_->setInterpolator(*interp_inverse_);
+        warper_inverse_->setBackgroundValue(bg_value_);
+
+        dissimilarity_inverse_->setBackgroundValue(bg_value_);
+
+        if ( !warpped_inverse_.dimensions_equal(*source_) )
+        {
+            warpped_inverse_ = *source_;
+        }
+
+        dissimilarity_inverse_->initialize(*source_);
+
+        warper_inverse_->setTransformation(*transform_inverse_);
+
+        std::vector<size_t> dim;
+        source_->get_dimensions(dim);
+
+        deform_norm_inverse_.copyImageInfo(*source_);
+        deform_norm_one_dim_inverse_.copyImageInfo(*source_);
+
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            deform_delta_inverse_[ii].copyImageInfo(*source_);
+            Gadgetron::clear(deform_delta_inverse_[ii]);
+
+            deform_updated_inverse_[ii].copyImageInfo(*source_);
+            Gadgetron::clear(deform_updated_inverse_[ii]);
+
+            gradient_warpped_inverse_[ii].copyImageInfo(*source_);
+        }
+
+        deform_delta_scale_factor_inverse_[0] = 1;
+        for ( ii=0; ii<D; ii++ )
+        {
+            deform_delta_scale_factor_inverse_[ii] = source_->get_pixel_size(0)/source_->get_pixel_size(ii);
+        }
+
+        return true;
+    }
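
The scale factors computed at the end of initialize() compensate for anisotropic voxels: each dimension is scaled by pixel_size(0)/pixel_size(ii). A standalone sketch with an illustrative 1.0 x 1.0 x 2.0 mm voxel:

    #include <cstdio>

    int main()
    {
        const unsigned int D = 3;
        double pixel_size[D] = { 1.0, 1.0, 2.0 };        // illustrative anisotropic voxel (mm)
        double scale[D];

        for (unsigned int ii = 0; ii < D; ii++)
        {
            scale[ii] = pixel_size[0] / pixel_size[ii];  // same formula as in initialize()
            std::printf("dimension %u : scale factor %.2f\n", ii, scale[ii]);
        }
        return 0;
    }
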
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    bool hoImageRegDeformationFieldBidirectionalSolver<ValueType, CoordType, D>::solve()
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(this->initialize());
+
+            prev_dissimilarity_ = std::numeric_limits<ValueType>::max();
+            prev_dissimilarity_inverse_ = std::numeric_limits<ValueType>::max();
+
+            unsigned int divTimes = 0;
+
+            dissimilarity_->initialize(*target_);
+            dissimilarity_inverse_->initialize(*source_);
+
+            bool computeForwardTransform = false;
+            bool stopIteration = false;
+
+            for ( iter_num_=0; iter_num_<max_iter_num_; iter_num_++ )
+            {
+                if ( computeForwardTransform )
+                {
+                    GADGET_CHECK_RETURN_FALSE( this->solve_once(target_, source_, warpped_, iter_num_, max_iter_num_, 
+                                                                divTimes, curr_dissimilarity_, prev_dissimilarity_, 
+                                                                transform_, *warper_, *dissimilarity_,
+                                                                stopIteration, 
+                                                                gradient_warpped_, deform_delta_, 
+                                                                deform_updated_, deform_norm_, deform_norm_one_dim_,
+                                                                deform_delta_scale_factor_) );
+
+                    if ( stopIteration ) break;
+
+                    GADGET_CHECK_RETURN_FALSE(this->enforceInverseTransform(transform_, transform_inverse_, deform_delta_inverse_, 6));
+                }
+                else
+                {
+                    GADGET_CHECK_RETURN_FALSE( this->solve_once(source_, target_, warpped_inverse_, iter_num_, max_iter_num_, 
+                                                                divTimes, curr_dissimilarity_inverse_, prev_dissimilarity_inverse_, 
+                                                                transform_inverse_, *warper_inverse_, *dissimilarity_inverse_,
+                                                                stopIteration, 
+                                                                gradient_warpped_inverse_, deform_delta_inverse_, 
+                                                                deform_updated_inverse_, deform_norm_inverse_, deform_norm_one_dim_inverse_,
+                                                                deform_delta_scale_factor_inverse_) );
+
+                    if ( stopIteration ) break;
+
+                    GADGET_CHECK_RETURN_FALSE(this->enforceInverseTransform(transform_inverse_, transform_, deform_delta_, 6));
+                }
+
+                computeForwardTransform = !computeForwardTransform;
+            }
+
+            GADGET_CHECK_RETURN_FALSE( this->enforceInverseTransform(transform_inverse_, transform_, deform_delta_, inverse_deform_enforce_iter_) );
+
+            if ( verbose_ ) { GDEBUG_STREAM("----> Total iteration number : " << iter_num_); }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationFieldBidirectionalSolver<ValueType, CoordType, D>::solve() ... ");
+            return false;
+        }
+
+        return true;
+    }
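
The enforceInverseTransform() method below samples the forward field at the positions produced by the inverse field, i.e. it evaluates d(x + d_inverse(x)). For a perfectly inverse-consistent pair this composition cancels the inverse displacement, d_inverse(x) + d(x + d_inverse(x)) = 0. The following one-dimensional standalone check illustrates that condition for a simple linear field; the weighting step that uses inverse_deform_enforce_weight_ lies outside this excerpt and is not reproduced:

    #include <cstdio>
    #include <cmath>

    int main()
    {
        // Forward 1-D field: d(x) = a*x, so the warp is y = x + a*x = (1+a)*x.
        // Its exact inverse displacement is d_inv(y) = y/(1+a) - y.
        const double a = 0.25;
        double worst = 0.0;

        for (int i = 0; i <= 10; i++)
        {
            double y     = (double)i;
            double d_inv = y / (1.0 + a) - y;          // inverse displacement at y
            double p     = y + d_inv;                  // point y is mapped to (what the loops
                                                       // in enforceInverseTransform compute)
            double d_fwd = a * p;                      // forward displacement sampled at p
            double residual = d_inv + d_fwd;           // vanishes for a consistent pair
            worst = std::fmax(worst, std::fabs(residual));
        }

        std::printf("worst inverse-consistency residual = %.3e\n", worst);
        return 0;
    }
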
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    bool hoImageRegDeformationFieldBidirectionalSolver<ValueType, CoordType, D>::
+    enforceInverseTransform(TransformationType* transform, TransformationType* transform_inverse, DeformationFieldType* deform_delta, unsigned int iter_num)
+    {
+        try
+        {
+
+            std::vector<size_t> dim, dim_inverse;
+
+            DeformationFieldType& deform = transform->getDeformationField(0);
+            deform.get_dimensions(dim);
+
+            DeformationFieldType& deform_inverse = transform_inverse->getDeformationField(0);
+            deform_inverse.get_dimensions(dim_inverse);
+
+            unsigned int iter_enforce;
+            for ( iter_enforce=0; iter_enforce<iter_num; iter_enforce++ )
+            {
+                if ( use_world_coordinate_ )
+                {
+                    if ( D == 2 )
+                    {
+                        long long sx = (long long)dim_inverse[0];
+                        long long sy = (long long)dim_inverse[1];
+
+                        long long y;
+                        // #pragma omp parallel default(none) private(y) shared(sx, sy, transform, transform_inverse, deform_delta, deform, deform_inverse) if(sx*sy>64*1024) num_threads(2)
+                        {
+                            CoordType ix, iy, px, py, px_inverse, py_inverse, dx, dy, dx_inverse, dy_inverse;
+                            size_t offset;
+
+                            // #pragma omp for 
+                            for ( y=0; y<(long long)sy; y++ )
+                            {
+                                for ( size_t x=0; x<sx; x++ )
+                                {
+                                    transform_inverse->get(x, (size_t)y, dx_inverse, dy_inverse);
+
+                                    deform_inverse.image_to_world(x, y, px_inverse, py_inverse);
+                                    px = px_inverse + dx_inverse;
+                                    py = py_inverse + dy_inverse;
+
+                                    deform.world_to_image(px, py, ix, iy);
+
+                                    transform->get(ix, iy, dx, dy);
+
+                                    offset = x + y*sx;
+
+                                    deform_delta[0](offset) = dx;
+                                    deform_delta[1](offset) = dy;
+                                }
+                            }
+                        }
+                    }
+                    else if ( D == 3 )
+                    {
+                        long long sx = (long long)dim_inverse[0];
+                        long long sy = (long long)dim_inverse[1];
+                        long long sz = (long long)dim_inverse[2];
+
+                        long long z;
+                        #pragma omp parallel default(none) private(z) shared(sx, sy, sz, transform, transform_inverse, deform_delta, deform, deform_inverse)
+                        {
+                            CoordType ix, iy, iz, px, py, pz, px_inverse, py_inverse, pz_inverse, dx, dy, dz, dx_inverse, dy_inverse, dz_inverse;
+
+                            #pragma omp for 
+                            for ( z=0; z<(long long)sz; z++ )
+                            {
+                                for ( size_t y=0; y<sy; y++ )
+                                {
+                                    size_t offset = z*sx*sy + y*sx;
+
+                                    for ( size_t x=0; x<sx; x++ )
+                                    {
+                                        transform_inverse->get(x, y, (size_t)z, dx_inverse, dy_inverse, dz_inverse);
+
+                                        deform_inverse.image_to_world(x, y, z, px_inverse, py_inverse, pz_inverse);
+                                        px = px_inverse + dx_inverse;
+                                        py = py_inverse + dy_inverse;
+                                        pz = pz_inverse + dz_inverse;
+
+                                        deform.world_to_image(px, py, pz, ix, iy, iz);
+
+                                        transform->get(ix, iy, iz, dx, dy, dz);
+
+                                        deform_delta[0](offset+x) = dx;
+                                        deform_delta[1](offset+x) = dy;
+                                        deform_delta[2](offset+x) = dz;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    else
+                    {
+                        size_t N = deform_inverse.get_number_of_elements();
+
+                        long long n;
+                        #pragma omp parallel default(none) private(n) shared(N, transform, transform_inverse, deform_delta, deform, deform_inverse)
+                        {
+                            size_t ind[D];
+                            CoordType wind[D], wind_inverse[D], d_inverse[D], pt[D], d[D];
+
+                            for ( n=0; n<(long long)N; n++ )
+                            {
+                                deform_inverse.calculate_index( (unsigned long long)(n), ind);
+                                deform_inverse.image_to_world(ind, wind_inverse);
+
+                                transform_inverse->get(ind, d_inverse);
+
+                                unsigned int ii;
+                                for ( ii=0; ii<D; ii++ ) pt[ii] = wind_inverse[ii] + d_inverse[ii];
+
+                                deform.world_to_image(pt, wind);
+
+                                transform->get(wind, d);
+                                for ( ii=0; ii<D; ii++ )
+                                {
+                                    deform_delta[ii](n) = d[ii];
+                                }
+                            }
+                        }
+                    }
+                }
+                else
+                {
+                    if ( D == 2 )
+                    {
+                        long long sx = (long long)dim_inverse[0];
+                        long long sy = (long long)dim_inverse[1];
+
+                        long long y;
+                        // #pragma omp parallel default(none) private(y) shared(sx, sy, transform, transform_inverse, deform_delta) if(sx*sy>64*1024) num_threads(2)
+                        {
+                            CoordType px, py, dx, dy, dx_inverse, dy_inverse;
+                            size_t offset;
+
+                            // #pragma omp for 
+                            for ( y=0; y<(long long)sy; y++ )
+                            {
+                                for ( size_t x=0; x<sx; x++ )
+                                {
+                                    transform_inverse->get(x, (size_t)y, dx_inverse, dy_inverse);
+
+                                    px = x + dx_inverse;
+                                    py = y + dy_inverse;
+
+                                    transform->get(px, py, dx, dy);
+
+                                    offset = x + y*sx;
+
+                                    deform_delta[0](offset) = dx;
+                                    deform_delta[1](offset) = dy;
+                                }
+                            }
+                        }
+                    }
+                    else if ( D == 3 )
+                    {
+                        long long sx = (long long)dim_inverse[0];
+                        long long sy = (long long)dim_inverse[1];
+                        long long sz = (long long)dim_inverse[2];
+
+                        long long z;
+                        #pragma omp parallel default(none) private(z) shared(sx, sy, sz, transform, transform_inverse, deform_delta)
+                        {
+                            CoordType px, py, pz, dx, dy, dz, dx_inverse, dy_inverse, dz_inverse;
+                            size_t offset;
+
+                            #pragma omp for 
+                            for ( z=0; z<(long long)sz; z++ )
+                            {
+                                for ( size_t y=0; y<sy; y++ )
+                                {
+                                    offset = z*sx*sy + y*sx;
+
+                                    for ( size_t x=0; x<sx; x++ )
+                                    {
+                                        transform_inverse->get(x, y, (size_t)z, dx_inverse, dy_inverse, dz_inverse);
+
+                                        px = x + dx_inverse;
+                                        py = y + dy_inverse;
+                                        pz = z + dz_inverse;
+
+                                        transform->get(px, py, pz, dx, dy, dz);
+
+                                        deform_delta[0](offset+x) = dx;
+                                        deform_delta[1](offset+x) = dy;
+                                        deform_delta[2](offset+x) = dz;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    else
+                    {
+                        size_t N = deform_inverse.get_number_of_elements();
+
+                        long long n;
+                        #pragma omp parallel default(none) private(n) shared(N, transform, transform_inverse, deform_delta, deform_inverse)
+                        {
+                            size_t ind[D];
+                            CoordType d_inverse[D], pt[D], d[D];
+
+                            #pragma omp for 
+                            for ( n=0; n<(long long)N; n++ )
+                            {
+                                deform_inverse.calculate_index( (unsigned long long)(n), ind);
+
+                                transform_inverse->get(ind, d_inverse);
+
+                                unsigned int ii;
+                                for ( ii=0; ii<D; ii++ ) pt[ii] = ind[ii] + d_inverse[ii];
+
+                                transform->get(pt, d);
+                                for ( ii=0; ii<D; ii++ )
+                                {
+                                    deform_delta[ii](n) = d[ii];
+                                }
+                            }
+                        }
+                    }
+                }
+
+                unsigned int ii;
+                for ( ii=0; ii<D; ii++ )
+                {
+                    DeformationFieldType& deform_inverse = transform_inverse->getDeformationField(ii);
+
+                    Gadgetron::scal( CoordType(1-inverse_deform_enforce_weight_), deform_inverse);
+
+                    Gadgetron::scal( CoordType(-1*inverse_deform_enforce_weight_), deform_delta[ii]);
+
+                    Gadgetron::add(deform_delta[ii], deform_inverse, deform_inverse);
+                }
+
+                if ( apply_in_FOV_constraint_ )
+                {
+                    if ( !use_world_coordinate_ )
+                    {
+                        if ( D == 2 )
+                        {
+                            long long sx = (long long)dim_inverse[0];
+                            long long sy = (long long)dim_inverse[1];
+
+                            DeformationFieldType& dxInv = transform_inverse->getDeformationField(0);
+                            DeformationFieldType& dyInv = transform_inverse->getDeformationField(1);
+
+                            long long x, y;
+                            // #pragma omp parallel for default(none) private(y, x) shared(sx, sy, dxInv, dyInv) if(sx*sy>64*1024) num_threads(2)
+                            for ( y=0; y<sy; y++ )
+                            {
+                                for ( x=0; x<sx; x++ )
+                                {
+                                    size_t offset = x + y*sx;
+
+                                    CoordType tx = x + dxInv(offset);
+                                    CoordType ty = y + dyInv(offset);
+
+                                    if ( tx < 0 )
+                                    {
+                                        dxInv(offset) = FLT_EPSILON - x;
+                                    }
+                                    else if (tx > sx-1 )
+                                    {
+                                        dxInv(offset) = sx-1-FLT_EPSILON - x;
+                                    }
+
+                                    if ( ty < 0 )
+                                    {
+                                        dyInv(offset) = FLT_EPSILON - y;
+                                    }
+                                    else if (ty > sy-1 )
+                                    {
+                                        dyInv(offset) = sy-1-FLT_EPSILON - y;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationFieldBidirectionalSolver<ValueType, CoordType, D>::enforceInverseTransform(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    void hoImageRegDeformationFieldBidirectionalSolver<ValueType, CoordType, D>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "--------------Gadgetron image registration non-parametric solver for pixel-wise bidirectional deformation field -------------" << endl;
+        os << "Image dimension is : " << D << endl;
+        os << "Image data type is : " << std::string(typeid(ValueType).name()) << std::endl;
+        os << "Transformation data type is : " << std::string(typeid(CoordType).name()) << std::endl;
+        os << "Use world coordinate is : " << use_world_coordinate_ << std::endl;
+        os << "Maximal iteration number is : " << max_iter_num_ << std::endl;
+        os << "Dissimilarity threshold is : " << dissimilarity_thres_ << std::endl;
+        os << "Parameter threshold is : " << parameter_thres_ << std::endl;
+        os << "Number of search size division is : " << div_num_ << std::endl;
+        os << "Solver step size is : " << step_size_para_ << std::endl;
+        os << "Step size division ratio is : " << step_size_div_para_ << std::endl;
+        os << "Number of iterations to improve the estimation of the inverse transform is : " << inverse_deform_enforce_iter_ << std::endl;
+        os << "Weight to update the estimation of the inverse transform is : " << inverse_deform_enforce_weight_ << std::endl;
+    }
+}
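For reference, the inverse-consistency update applied by enforceInverseTransform() above reduces to a per-element blend of the inverse field with the forward field sampled at the points the inverse maps to: new_inverse = (1 - w) * inverse - w * delta. A minimal standalone sketch of that blend, assuming plain std::vector<float> buffers instead of Gadgetron's DeformationFieldType:

    #include <cstddef>
    #include <vector>

    // Element-wise inverse-consistency blend; mirrors the Gadgetron::scal /
    // Gadgetron::add sequence applied above to each field component.
    void blend_inverse_field(std::vector<float>& inverse,      // one component of the inverse field
                             const std::vector<float>& delta,  // forward field sampled through the inverse
                             float w)                          // the enforcement weight
    {
        for (std::size_t i = 0; i < inverse.size(); ++i)
            inverse[i] = (1.0f - w) * inverse[i] - w * delta[i];
    }
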
diff --git a/toolboxes/registration/optical_flow/cpu/solver/hoImageRegDeformationFieldSolver.h b/toolboxes/registration/optical_flow/cpu/solver/hoImageRegDeformationFieldSolver.h
new file mode 100644
index 0000000..1e46852
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/solver/hoImageRegDeformationFieldSolver.h
@@ -0,0 +1,673 @@
+/** \file   hoImageRegDeformationFieldSolver.h
+    \brief  Implement the PDE solver for deformation field non-linear image registration
+
+            The PDE solver is a classical gradient descent method, derived from the calculus of variations:
+
+            [1] Gerardo Hermosillo, Christophe Chefd'Hotel, Olivier Faugeras. Variational Methods for Multimodal Image Matching. 
+            International Journal of Computer Vision. December 2002, Volume 50, Issue 3, pp 329-343.
+            http://link.springer.com/article/10.1023%2FA%3A1020830525823
+
+            [2] Gerardo Hermosillo. Variational Methods for Multimodal Image Matching. PhD Thesis, UNIVERSITÉ DE NICE - SOPHIA ANTIPOLIS. May 2002.
+            http://webdocs.cs.ualberta.ca/~dana/readingMedIm/papers/hermosilloPhD.pdf
+
+            [3] Christophe Chefd'Hotel, Gerardo Hermosillo, Olivier D. Faugeras: Flows of diffeomorphisms for multimodal image registration. ISBI 2002: 753-756.
+            http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=1029367&tag=1
+
+            [4] Christophe Chefd'Hotel, Geometric Methods in Computer Vision and Image Processing : Contributions and Applications. PhD Thesis, April 2005.
+
+            The code is based on the source code listed on pages 185 - 187 of ref [2], and extended according to refs [3] and [4].
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoImageRegNonParametricSolver.h"
+#include "hoImageRegDeformationField.h"
+
+#ifdef max
+#undef max
+#endif // max
+
+#ifdef min
+#undef min
+#endif // min
+
+namespace Gadgetron
+{
+    /// ValueType: image pixel value type
+    /// CoordType: transformation data type
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    class hoImageRegDeformationFieldSolver : public hoImageRegNonParametricSolver<ValueType, CoordType, D, D>
+    {
+    public:
+
+        typedef hoImageRegDeformationFieldSolver<ValueType, CoordType, D> Self;
+        typedef hoImageRegNonParametricSolver<ValueType, CoordType, D, D> BaseClass;
+
+        typedef hoNDImage<ValueType, D> TargetType;
+        typedef hoNDImage<ValueType, D> SourceType;
+
+        typedef hoNDImage<ValueType, 2> Target2DType;
+        typedef Target2DType Source2DType;
+
+        typedef hoNDImage<ValueType, 3> Target3DType;
+        typedef Target3DType Source3DType;
+
+        typedef ValueType T;
+        typedef ValueType element_type;
+        typedef ValueType value_type;
+
+        typedef CoordType coord_type;
+
+        typedef typename BaseClass::InterpolatorType InterpolatorType;
+
+        typedef hoImageRegDeformationField<CoordType, D> TransformationType;
+        typedef typename TransformationType::input_point_type input_point_type;
+        typedef typename TransformationType::output_point_type output_point_type;
+        typedef typename TransformationType::jacobian_position_type jacobian_position_type;
+        typedef typename TransformationType::DeformationFieldType DeformationFieldType;
+
+        typedef typename BaseClass::ImageRegWarperType ImageRegWarperType;
+
+        typedef typename BaseClass::ImageRegDissimilarityType ImageRegDissimilarityType;
+
+        hoImageRegDeformationFieldSolver();
+        virtual ~hoImageRegDeformationFieldSolver();
+
+        void setTransform(TransformationType& transform) { transform_ = &transform; }
+
+        virtual bool initialize();
+
+        virtual bool solve();
+
+        /// perform one iteration of optimization
+        virtual bool solve_once(TargetType* target, SourceType* source, TargetType& warped, 
+                                unsigned int iter_num, unsigned int max_iter_num, 
+                                unsigned int& divTimes, 
+                                ValueType& curr_dissimilarity, ValueType& prev_dissimilarity, 
+                                TransformationType* transform, ImageRegWarperType& warper, ImageRegDissimilarityType& dissimilarity,
+                                bool& stopIteration, 
+                                TargetType* gradient_warpped, DeformationFieldType* deform_delta, DeformationFieldType* deform_updated, 
+                                DeformationFieldType& deform_norm , DeformationFieldType& deform_norm_one_dim,
+                                CoordType* deform_delta_scale_factor);
+
+        virtual void print(std::ostream& os) const;
+
+        /// the regularization method in ref [3] is used
+        /// in the unit of pixel
+        ValueType regularization_hilbert_strength_[D];
+
+        /// whether the deformation can warp a point outside the FOV
+        /// InFOV constraint
+        bool apply_in_FOV_constraint_;
+
+        using BaseClass::iter_num_;
+        using BaseClass::max_iter_num_;
+        using BaseClass::dissimilarity_thres_;
+        using BaseClass::parameter_thres_;
+        using BaseClass::div_num_;
+        using BaseClass::step_size_para_;
+        using BaseClass::step_size_div_para_;
+        using BaseClass::verbose_;
+        using BaseClass::gt_timer1_;
+        using BaseClass::gt_timer2_;
+        using BaseClass::gt_timer3_;
+        using BaseClass::performTiming_;
+        using BaseClass::gt_exporter_;
+        using BaseClass::debugFolder_;
+
+    protected:
+
+        TransformationType* transform_;
+
+        ValueType curr_dissimilarity_;
+        ValueType prev_dissimilarity_;
+
+        DeformationFieldType deform_delta_[D];
+        DeformationFieldType deform_updated_[D];
+
+        DeformationFieldType deform_norm_;
+        DeformationFieldType deform_norm_one_dim_;
+
+        TargetType gradient_warpped_[D];
+
+        /// compensate for the non-isotropic pixel sizes
+        coord_type deform_delta_scale_factor_[D];
+
+        using BaseClass::target_;
+        using BaseClass::source_;
+        using BaseClass::warpped_;
+        using BaseClass::bg_value_;
+        using BaseClass::interp_;
+        using BaseClass::warper_;
+        using BaseClass::dissimilarity_;
+        using BaseClass::use_world_coordinate_;
+    };
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    hoImageRegDeformationFieldSolver<ValueType, CoordType, D>::
+    hoImageRegDeformationFieldSolver() : BaseClass()
+    {
+        for ( unsigned int ii=0; ii<D; ii++ )
+        {
+            regularization_hilbert_strength_[ii] = 12;
+            deform_delta_scale_factor_[ii] = 1;
+        }
+
+        apply_in_FOV_constraint_ = false;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    hoImageRegDeformationFieldSolver<ValueType, CoordType, D>::~hoImageRegDeformationFieldSolver()
+    {
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    bool hoImageRegDeformationFieldSolver<ValueType, CoordType, D>::initialize()
+    {
+        GADGET_CHECK_RETURN_FALSE(BaseClass::initialize());
+        warper_->setTransformation(*transform_);
+
+        std::vector<size_t> dim;
+        target_->get_dimensions(dim);
+
+        deform_norm_.copyImageInfo(*target_);
+        deform_norm_one_dim_.copyImageInfo(*target_);
+
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            deform_delta_[ii].copyImageInfo(*target_);
+            Gadgetron::clear(deform_delta_[ii]);
+
+            deform_updated_[ii].copyImageInfo(*target_);
+            Gadgetron::clear(deform_updated_[ii]);
+
+            gradient_warpped_[ii].copyImageInfo(*target_);
+        }
+
+        deform_delta_scale_factor_[0] = 1;
+        for ( ii=0; ii<D; ii++ )
+        {
+            deform_delta_scale_factor_[ii] = target_->get_pixel_size(0)/target_->get_pixel_size(ii);
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    bool hoImageRegDeformationFieldSolver<ValueType, CoordType, D>::
+    solve_once(TargetType* target, SourceType* source, TargetType& warped, 
+                unsigned int iter_num, unsigned int max_iter_num, 
+                unsigned int& divTimes, 
+                ValueType& curr_dissimilarity, ValueType& prev_dissimilarity, 
+                TransformationType* transform, ImageRegWarperType& warper, ImageRegDissimilarityType& dissimilarity,
+                bool& stopIteration, 
+                TargetType* gradient_warpped, DeformationFieldType* deform_delta, DeformationFieldType* deform_updated, 
+                DeformationFieldType& deform_norm , DeformationFieldType& deform_norm_one_dim,
+                CoordType* deform_delta_scale_factor)
+    {
+        try
+        {
+            unsigned int ii;
+
+            long long sx = (long long)(target_->get_size(0));
+            long long sy = (long long)(target_->get_size(1));
+            long long sz = (long long)(target_->get_size(2));
+
+            long long x, y, z;
+
+            if ( !debugFolder_.empty() )
+            {
+                for ( ii=0; ii<D; ii++ )
+                {
+                    std::ostringstream ostr;
+                    ostr << "DeformationFieldSolver_deformfield_" << ii;
+                    const DeformationFieldType& def = transform->getDeformationField(ii);
+                    gt_exporter_.exportImage(def, debugFolder_+ostr.str());
+                }
+            }
+
+            // warp the source
+
+            if ( use_world_coordinate_ )
+            {
+                GADGET_CHECK_RETURN_FALSE(warper.warpWithDeformationFieldWorldCoordinate(*target, *source, warped));
+            }
+            else
+            {
+                GADGET_CHECK_RETURN_FALSE(warper.warp(*target, *source, use_world_coordinate_, warped));
+            }
+
+            if ( !debugFolder_.empty() ) { gt_exporter_.exportImage(warped, debugFolder_+"DeformationFieldSolver_warpped"); }
+
+            // evaluate the dissimilarity and get the intensity comparison function
+            GADGET_CHECK_RETURN_FALSE(dissimilarity.evaluateDeriv(warped));
+
+            curr_dissimilarity = dissimilarity.getDissimilarity();
+            if ( verbose_ ) { GDEBUG_STREAM("--> Iteration " << iter_num << " [out of " << max_iter_num << "] : \t" << curr_dissimilarity); }
+
+            if ( prev_dissimilarity < curr_dissimilarity + dissimilarity_thres_ )
+            {
+                if ( ++divTimes > div_num_ )
+                {
+                    stopIteration = true;
+                    return true;
+                }
+
+                step_size_para_ *= step_size_div_para_;
+
+                if ( verbose_ ) { GDEBUG_STREAM("----> Parameter division " << divTimes << " [out of " << div_num_ << "] "); }
+            }
+
+            prev_dissimilarity = curr_dissimilarity;
+
+            /// gradient is in the 1/pixel unit
+            Gadgetron::gradient(warped, gradient_warpped);
+
+            const TargetType& deriv = dissimilarity.getDeriv();
+
+            for ( ii=0; ii<D; ii++ )
+            {
+                Gadgetron::multiply(gradient_warpped[ii], deriv, deform_delta[ii]);
+            }
+
+            if ( !debugFolder_.empty() )
+            {
+                gt_exporter_.exportImage(deriv, debugFolder_+"DeformationFieldSolver_deriv");
+
+                for ( ii=0; ii<D; ii++ )
+                {
+                    std::ostringstream ostr;
+                    ostr << "DeformationFieldSolver_gradient_warpped_" << ii;
+
+                    gt_exporter_.exportImage(gradient_warpped[ii], debugFolder_+ostr.str());
+
+                    std::ostringstream ostr2;
+                    ostr2 << "DeformationFieldSolver_deform_delta_" << ii;
+
+                    gt_exporter_.exportImage(deform_delta[ii], debugFolder_+ostr2.str());
+                }
+            }
+
+            /// compensate for non-isotropic pixel sizes
+            for ( ii=0; ii<D; ii++ )
+            {
+                if ( std::abs(deform_delta_scale_factor[ii]-1) > FLT_EPSILON )
+                {
+                    Gadgetron::scal(deform_delta_scale_factor[ii], deform_delta[ii]);
+                }
+            }
+
+            /// filter sigma is in the unit of pixel size
+            for ( ii=0; ii<D; ii++ )
+            {
+                Gadgetron::filterGaussian(deform_delta[ii], regularization_hilbert_strength_);
+            }
+
+            if ( !debugFolder_.empty() )
+            {
+                for ( ii=0; ii<D; ii++ )
+                {
+                    std::ostringstream ostr;
+                    ostr << "DeformationFieldSolver_deform_delta_filtered_" << ii;
+
+                    gt_exporter_.exportImage(deform_delta[ii], debugFolder_+ostr.str());
+                }
+            }
+
+            // compute the max norm of the Hilbert derivative
+            Gadgetron::clear(deform_norm);
+            for ( ii=0; ii<D; ii++ )
+            {
+                Gadgetron::multiply(deform_delta[ii], deform_delta[ii], deform_norm_one_dim);
+                Gadgetron::add(deform_norm_one_dim, deform_norm, deform_norm);
+            }
+
+            CoordType* pDeformNorm = deform_norm.begin();
+
+            ValueType max_norm_deform_delta = pDeformNorm[0];
+            // size_t max_ind;
+
+            size_t numNormElem = deform_norm.get_number_of_elements();
+            for ( size_t nn=1; nn<numNormElem; nn++ )
+            {
+                if ( max_norm_deform_delta < pDeformNorm[nn] ) max_norm_deform_delta = pDeformNorm[nn];
+            }
+
+            // Gadgetron::maxAbsolute(deform_norm, max_norm_deform_delta, max_ind);
+
+            ValueType PDE_time_integration_step_size = 0;
+            if ( max_norm_deform_delta > 1e-5 )
+            {
+                PDE_time_integration_step_size = step_size_para_ / std::sqrt(max_norm_deform_delta);
+            }
+
+            if ( PDE_time_integration_step_size > 0 )
+            {
+                for ( ii=0; ii<D; ii++ )
+                {
+                    Gadgetron::scal(PDE_time_integration_step_size, deform_delta[ii]);
+                }
+
+                if ( use_world_coordinate_ )
+                {
+                    // Note: deform_delta is in pixel units so far; it needs to be converted to world coordinates
+
+                    if ( D == 2 )
+                    {
+                        CoordType ix, iy, wx, wy, pX, pY, deltaWX, deltaWY;
+
+                        // #pragma omp parallel for default(none) private(y, x, ix, iy, wx, wy, pX, pY, deltaWX, deltaWY) shared(sx, sy, target, deform_delta, deform_updated, transform) num_threads(2)
+                        for ( y=0; y<sy; y++ )
+                        {
+                            for ( x=0; x<sx; x++ )
+                            {
+                                size_t offset = x + y*sx;
+
+                                target->image_to_world( (size_t)x, (size_t)y, wx, wy);
+
+                                CoordType deltaX = deform_delta[0](offset);
+                                CoordType deltaY = deform_delta[1](offset);
+
+                                // because the delta deformation is in the pixel size unit, it needs to be converted to world coordinate
+                                target->image_to_world( deltaX, deltaY, deltaWX, deltaWY);
+
+                                target->world_to_image(wx+deltaWX, wy+deltaWY, ix, iy);
+
+                                transform->get(ix, iy, pX, pY);
+
+                                deform_updated[0](offset) = deltaWX + pX;
+                                deform_updated[1](offset) = deltaWY + pY;
+                            }
+                        }
+                    }
+                    else if ( D == 3 )
+                    {
+                        CoordType ix, iy, iz, wx, wy, wz, pX, pY, pZ, deltaWX, deltaWY, deltaWZ;
+
+                        #pragma omp parallel for default(none) private(y, x, z, ix, iy, iz, wx, wy, wz, pX, pY, pZ, deltaWX, deltaWY, deltaWZ) shared(sx, sy, sz, target, deform_delta, deform_updated, transform)
+                        for ( z=0; z<sz; z++ )
+                        {
+                            for ( y=0; y<sy; y++ )
+                            {
+                                for ( x=0; x<sx; x++ )
+                                {
+                                    size_t offset = x + y*sx + z*sx*sy;
+
+                                    target->image_to_world( (size_t)x, (size_t)y, (size_t)z, wx, wy, wz);
+
+                                    CoordType deltaX = deform_delta[0](offset);
+                                    CoordType deltaY = deform_delta[1](offset);
+                                    CoordType deltaZ = deform_delta[2](offset);
+
+                                    target->image_to_world( deltaX, deltaY, deltaZ, deltaWX, deltaWY, deltaWZ);
+
+                                    target->world_to_image(wx+deltaWX, wy+deltaWY, wz+deltaWZ, ix, iy, iz);
+
+                                    transform->get(ix, iy, iz, pX, pY, pZ);
+
+                                    deform_updated[0](offset) = deltaWX + pX;
+                                    deform_updated[1](offset) = deltaWY + pY;
+                                    deform_updated[2](offset) = deltaWZ + pZ;
+                                }
+                            }
+                        }
+                    }
+                    else
+                    {
+                        size_t N = target_->get_number_of_elements();
+
+                        long long n;
+
+                        #pragma omp parallel default(none) private(n, ii) shared(N, target, deform_delta, deform_updated, transform)
+                        {
+                            size_t ind[D];
+                            CoordType pos[D];
+                            CoordType pDelta[D];
+                            CoordType pDeltaWorld[D];
+                            CoordType indDeform[D];
+                            CoordType pDeform[D];
+
+                            #pragma omp for 
+                            for ( n=0; n<(long long)N; n++ )
+                            {
+                                deform_delta[0].calculate_index(n, ind);
+
+                                target->image_to_world( ind, pos);
+
+                                for ( ii=0; ii<D; ii++ )
+                                {
+                                    pDelta[ii] = deform_delta[ii](n);
+                                }
+
+                                target->image_to_world( pDelta, pDeltaWorld);
+
+                                for ( ii=0; ii<D; ii++ )
+                                {
+                                    pDeltaWorld[ii] += pos[ii];
+                                }
+
+                                target->world_to_image(pDeltaWorld, indDeform);
+                                transform->get(indDeform, pDeform);
+
+                                for ( ii=0; ii<D; ii++ )
+                                {
+                                    deform_updated[ii](n) = pDeltaWorld[ii] + pDeform[ii];
+                                }
+                            }
+                        }
+                    }
+                }
+                else
+                {
+                    if ( D == 2 )
+                    {
+                        CoordType pX, pY;
+
+                        // #pragma omp parallel for default(none) private(y, x, pX, pY) shared(sx, sy, deform_delta, deform_updated, transform) num_threads(2)
+                        for ( y=0; y<sy; y++ )
+                        {
+                            for ( x=0; x<sx; x++ )
+                            {
+                                size_t offset = x + y*sx;
+
+                                CoordType deltaX = deform_delta[0](offset);
+                                CoordType deltaY = deform_delta[1](offset);
+
+                                transform->get(x+deltaX, y+deltaY, pX, pY);
+
+                                deform_updated[0](offset) = deltaX + pX;
+                                deform_updated[1](offset) = deltaY + pY;
+                            }
+                        }
+                    }
+                    else if ( D == 3 )
+                    {
+                        CoordType pX, pY, pZ;
+
+                        #pragma omp parallel for default(none) private(y, x, z, pX, pY, pZ) shared(sx, sy, sz, deform_delta, deform_updated, transform)
+                        for ( z=0; z<sz; z++ )
+                        {
+                            for ( y=0; y<sy; y++ )
+                            {
+                                for ( x=0; x<sx; x++ )
+                                {
+                                    size_t offset = x + y*sx + z*sx*sy;
+
+                                    CoordType deltaX = deform_delta[0](offset);
+                                    CoordType deltaY = deform_delta[1](offset);
+                                    CoordType deltaZ = deform_delta[2](offset);
+
+                                    transform->get(x+deltaX, y+deltaY, z+deltaZ, pX, pY, pZ);
+
+                                    deform_updated[0](offset) = deltaX + pX;
+                                    deform_updated[1](offset) = deltaY + pY;
+                                    deform_updated[2](offset) = deltaZ + pZ;
+                                }
+                            }
+                        }
+                    }
+                    else
+                    {
+                        size_t N = target_->get_number_of_elements();
+
+                        long long n;
+
+                        #pragma omp parallel default(none) private(n, ii) shared(N, deform_delta, deform_updated, transform)
+                        {
+                            size_t ind[D];
+                            CoordType pDelta[D];
+                            CoordType indDeform[D];
+                            CoordType pDeform[D];
+
+                            #pragma omp for 
+                            for ( n=0; n<(long long)N; n++ )
+                            {
+                                deform_delta[0].calculate_index(n, ind);
+
+                                for ( ii=0; ii<D; ii++ )
+                                {
+                                    pDelta[ii] = deform_delta[ii](n);
+                                    indDeform[ii] = ind[ii] + pDelta[ii];
+                                }
+
+                                transform->get(indDeform, pDeform);
+
+                                for ( ii=0; ii<D; ii++ )
+                                {
+                                    deform_updated[ii](n) = pDelta[ii] + pDeform[ii];
+                                }
+                            }
+                        }
+                    }
+                }
+
+                if ( !debugFolder_.empty() )
+                {
+                    for ( ii=0; ii<D; ii++ )
+                    {
+                        std::ostringstream ostr;
+                        ostr << "DeformationFieldSolver_deform_updated_" << ii;
+                        gt_exporter_.exportImage(deform_updated[ii], debugFolder_+ostr.str());
+                    }
+                }
+
+                // add the InFOV constraint
+                if ( apply_in_FOV_constraint_ )
+                {
+                    if ( !use_world_coordinate_ )
+                    {
+                        if ( D == 2 )
+                        {
+                            CoordType pX, pY;
+
+                            // #pragma omp parallel for default(none) private(y, x, pX, pY) shared(sx, sy, deform_updated) num_threads(2)
+                            for ( y=0; y<sy; y++ )
+                            {
+                                for ( x=0; x<sx; x++ )
+                                {
+                                    size_t offset = x + y*sx;
+
+                                    CoordType tx = x + deform_updated[0](offset);
+                                    CoordType ty = y + deform_updated[1](offset);
+
+                                    if ( tx < 0 )
+                                    {
+                                        deform_updated[0](offset) = FLT_EPSILON - x;
+                                    }
+                                    else if (tx > sx-1 )
+                                    {
+                                        deform_updated[0](offset) = sx-1-FLT_EPSILON - x;
+                                    }
+
+                                    if ( ty < 0 )
+                                    {
+                                        deform_updated[1](offset) = FLT_EPSILON - y;
+                                    }
+                                    else if (ty > sy-1 )
+                                    {
+                                        deform_updated[1](offset) = sy-1-FLT_EPSILON - y;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+
+                for ( ii=0; ii<D; ii++ )
+                {
+                    transform->setDeformationField(deform_updated[ii], ii);
+                }
+            }
+        }
+        catch(...)
+        {
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    bool hoImageRegDeformationFieldSolver<ValueType, CoordType, D>::solve()
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(this->initialize());
+
+            prev_dissimilarity_ = std::numeric_limits<ValueType>::max();
+
+            unsigned int divTimes = 0;
+
+            dissimilarity_->initialize(*target_);
+
+            if ( !debugFolder_.empty() )
+            {
+                gt_exporter_.exportImage(*target_, debugFolder_+"DeformationFieldSolver_target");
+                gt_exporter_.exportImage(*source_, debugFolder_+"DeformationFieldSolver_source");
+            }
+
+            bool stopIteration = false;
+
+            if ( verbose_ ) { GDEBUG_STREAM("--> DeformationFieldSolver ... "); }
+            for ( iter_num_=0; iter_num_<max_iter_num_; iter_num_++ )
+            {
+                GADGET_CHECK_RETURN_FALSE( this->solve_once(target_, source_, warpped_, iter_num_, max_iter_num_, 
+                                                            divTimes, curr_dissimilarity_, prev_dissimilarity_, 
+                                                            transform_, *warper_, *dissimilarity_,
+                                                            stopIteration, 
+                                                            gradient_warpped_, deform_delta_, deform_updated_, 
+                                                            deform_norm_ , deform_norm_one_dim_, deform_delta_scale_factor_) );
+
+                if ( stopIteration ) break;
+            }
+
+            if ( verbose_ ) { GDEBUG_STREAM("----> Total iteration number : " << iter_num_); }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationFieldSolver<ValueType, CoordType, D>::solve() ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int D> 
+    void hoImageRegDeformationFieldSolver<ValueType, CoordType, D>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "--------------Gadgetron image registration non-parametric solver for pixel-wise deformation field -------------" << endl;
+        os << "Image dimension is : " << D << endl;
+        os << "Image data type is : " << std::string(typeid(ValueType).name()) << std::endl;
+        os << "Transformation data type is : " << std::string(typeid(CoordType).name()) << std::endl;
+        os << "Use world coordinate is : " << use_world_coordinate_ << std::endl;
+        os << "Maximal iteration number is : " << max_iter_num_ << std::endl;
+        os << "Dissimilarity threshold is : " << dissimilarity_thres_ << std::endl;
+        os << "Parameter threshold is : " << parameter_thres_ << std::endl;
+        os << "Number of search size division is : " << div_num_ << std::endl;
+        os << "Solver step size is : " << step_size_para_ << std::endl;
+        os << "Step size division ratio is : " << step_size_div_para_ << std::endl;
+    }
+}
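The update step implemented by solve_once() above has three ingredients: a driving force (the spatial gradient of the warped image times the dissimilarity derivative), Gaussian smoothing of that force, and a time step normalized by the maximal force norm. A minimal 1D sketch of one such step, assuming plain std::vector buffers and omitting the Gaussian regularization; the names and the 1e-5 guard simply echo the code above and are illustrative, not the Gadgetron API:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // One regularized gradient-descent step on a 1D deformation field u (in pixels).
    void pde_step_1d(std::vector<double>& u,
                     const std::vector<double>& grad_warped,   // gradient of the warped source
                     const std::vector<double>& deriv,         // derivative of the dissimilarity
                     double step_size)                          // solver step size
    {
        std::vector<double> delta(u.size());
        double max_norm = 0.0;
        for (std::size_t i = 0; i < u.size(); ++i)
        {
            delta[i] = grad_warped[i] * deriv[i];              // driving force (smoothing omitted here)
            max_norm = std::max(max_norm, delta[i] * delta[i]);
        }
        if (max_norm > 1e-5)
        {
            const double dt = step_size / std::sqrt(max_norm); // PDE time-integration step size
            for (std::size_t i = 0; i < u.size(); ++i)
                u[i] += dt * delta[i];
        }
    }
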
diff --git a/toolboxes/registration/optical_flow/cpu/solver/hoImageRegNonParametricSolver.h b/toolboxes/registration/optical_flow/cpu/solver/hoImageRegNonParametricSolver.h
new file mode 100644
index 0000000..16ba86a
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/solver/hoImageRegNonParametricSolver.h
@@ -0,0 +1,162 @@
+/** \file   hoImageRegNonParametricSolver.h
+    \brief  Define the base class of image registration solver for non-parametric image transformation
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoImageRegSolver.h"
+
+namespace Gadgetron
+{
+    /// ValueType: image pixel value type
+    /// CoordType: transformation data type
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    class hoImageRegNonParametricSolver : public hoImageRegSolver<ValueType, CoordType, DIn, DOut>
+    {
+    public:
+
+        typedef hoImageRegNonParametricSolver<ValueType, CoordType, DIn, DOut> Self;
+        typedef hoImageRegSolver<ValueType, CoordType, DIn, DOut> BaseClass;
+
+        typedef hoNDImage<ValueType, DOut> TargetType;
+        typedef hoNDImage<ValueType, DIn> SourceType;
+
+        typedef hoNDImage<ValueType, 2> Target2DType;
+        typedef Target2DType Source2DType;
+
+        typedef hoNDImage<ValueType, 3> Target3DType;
+        typedef Target3DType Source3DType;
+
+        typedef typename BaseClass::InterpolatorType InterpolatorType;
+
+        typedef hoImageRegNonParametricTransformation<CoordType, DIn, DOut> TransformationType;
+
+        typedef ValueType T;
+        typedef ValueType element_type;
+        typedef ValueType value_type;
+
+        typedef CoordType coord_type;
+
+        typedef typename TransformationType::input_point_type input_point_type;
+        typedef typename TransformationType::output_point_type output_point_type;
+        typedef typename TransformationType::jacobian_position_type jacobian_position_type;
+
+        typedef typename BaseClass::ImageRegWarperType ImageRegWarperType;
+
+        typedef typename BaseClass::ImageRegDissimilarityType ImageRegDissimilarityType;
+
+        hoImageRegNonParametricSolver();
+        virtual ~hoImageRegNonParametricSolver();
+
+        virtual bool initialize();
+
+        /// solve the minimization and find the optimal transformation
+        virtual bool solve() = 0;
+
+        virtual void print(std::ostream& os) const;
+
+        /// number of performed iterations
+        unsigned int iter_num_;
+
+        /// maximal number of iterations
+        unsigned int max_iter_num_;
+
+        /// threshold for minimal dissimilarity changes
+        ValueType dissimilarity_thres_;
+
+        /// threshold for minimal parameter changes
+        ValueType parameter_thres_;
+
+        /// number of search size division
+        unsigned int div_num_;
+
+        /// solver step size
+        ValueType step_size_para_;
+        /// step size division ratio
+        ValueType step_size_div_para_;
+
+        using BaseClass::verbose_;
+        using BaseClass::gt_timer1_;
+        using BaseClass::gt_timer2_;
+        using BaseClass::gt_timer3_;
+        using BaseClass::performTiming_;
+        using BaseClass::gt_exporter_;
+        using BaseClass::debugFolder_;
+
+    protected:
+
+        ValueType curr_dissimilarity_;
+        ValueType prev_dissimilarity_;
+
+        using BaseClass::target_;
+        using BaseClass::source_;
+        using BaseClass::warpped_;
+        using BaseClass::bg_value_;
+        using BaseClass::interp_;
+        using BaseClass::warper_;
+        using BaseClass::dissimilarity_;
+        using BaseClass::use_world_coordinate_;
+    };
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    hoImageRegNonParametricSolver<ValueType, CoordType, DIn, DOut>::hoImageRegNonParametricSolver() 
+        : BaseClass(), dissimilarity_thres_(0), parameter_thres_( (ValueType)1e-8 ), div_num_(3), step_size_para_( (ValueType)0.8 ), step_size_div_para_( (ValueType)0.5 )
+    {
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    hoImageRegNonParametricSolver<ValueType, CoordType, DIn, DOut>::~hoImageRegNonParametricSolver()
+    {
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    bool hoImageRegNonParametricSolver<ValueType, CoordType, DIn, DOut>::initialize()
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(target_!=NULL);
+            GADGET_CHECK_RETURN_FALSE(source_!=NULL);
+            GADGET_CHECK_RETURN_FALSE(interp_!=NULL);
+            GADGET_CHECK_RETURN_FALSE(warper_!=NULL);
+            GADGET_CHECK_RETURN_FALSE(dissimilarity_!=NULL);
+
+            warper_->setInterpolator(*interp_);
+            warper_->setBackgroundValue(bg_value_);
+
+            dissimilarity_->setBackgroundValue(bg_value_);
+
+            if ( !warpped_.dimensions_equal(*target_) )
+            {
+                warpped_ = *target_;
+            }
+
+            dissimilarity_->initialize(*target_);
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegNonParametricSolver<ValueType, CoordType, DIn, DOut>::initialize() ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    void hoImageRegNonParametricSolver<ValueType, CoordType, DIn, DOut>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "--------------Gadgetron image registration non-parametric solver -------------" << endl;
+        os << "Target image dimension is : " << DOut << endl;
+        os << "Source image dimension is : " << DIn << endl;
+        os << "Image data type is : " << std::string(typeid(ValueType).name()) << std::endl;
+        os << "Transformation data type is : " << std::string(typeid(CoordType).name()) << std::endl;
+        os << "Use world coordinate is : " << use_world_coordinate_ << std::endl;
+        os << "Maximal iteration number is : " << max_iter_num_ << std::endl;
+        os << "Dissimilarity threshold is : " << dissimilarity_thres_ << std::endl;
+        os << "Parameter threshold is : " << parameter_thres_ << std::endl;
+        os << "Number of search size division is : " << div_num_ << std::endl;
+        os << "Solver step size is : " << step_size_para_ << std::endl;
+        os << "Step size division ratio is : " << step_size_div_para_ << std::endl;
+    }
+}
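The iteration-control members declared above (max_iter_num_, dissimilarity_thres_, div_num_, step_size_para_, step_size_div_para_) are used by the derived solvers as a simple step-size division schedule: when the dissimilarity fails to improve by more than the threshold, the step is shrunk by the division ratio, and the solver stops after div_num such divisions. A hedged sketch of that logic, with illustrative names rather than the Gadgetron members:

    #include <limits>

    struct DivisionSchedule
    {
        double step;                 // current search step size
        double div_ratio;            // step size division ratio
        double thres;                // minimal dissimilarity improvement
        unsigned int div_num;        // maximal number of divisions
        unsigned int divisions = 0;
        double prev = std::numeric_limits<double>::max();

        // Feed the latest dissimilarity; returns false when iteration should stop.
        bool update(double curr)
        {
            if (prev < curr + thres)                   // no sufficient improvement
            {
                if (++divisions > div_num) return false;
                step *= div_ratio;                     // reduce the search step size
            }
            prev = curr;
            return true;
        }
    };
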
diff --git a/toolboxes/registration/optical_flow/cpu/solver/hoImageRegParametricDownHillSolver.h b/toolboxes/registration/optical_flow/cpu/solver/hoImageRegParametricDownHillSolver.h
new file mode 100644
index 0000000..a6e6c96
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/solver/hoImageRegParametricDownHillSolver.h
@@ -0,0 +1,166 @@
+/** \file   hoImageRegParametricDownHillSolver.h
+    \brief  Define the class of simple down-hill solver for parametric image transformation
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoImageRegParametricSolver.h"
+
+namespace Gadgetron
+{
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    class hoImageRegParametricDownHillSolver : public hoImageRegParametricSolver<ValueType, CoordType, DIn, DOut>
+    {
+    public:
+
+        typedef hoImageRegParametricDownHillSolver<ValueType, CoordType, DIn, DOut> Self;
+        typedef hoImageRegParametricSolver<ValueType, CoordType, DIn, DOut> BaseClass;
+
+        typedef hoNDImage<ValueType, DOut> TargetType;
+        typedef hoNDImage<ValueType, DIn> SourceType;
+
+        typedef hoNDImage<ValueType, 2> Target2DType;
+        typedef Target2DType Source2DType;
+
+        typedef hoNDImage<ValueType, 3> Target3DType;
+        typedef Target3DType Source3DType;
+
+        typedef typename BaseClass::InterpolatorType InterpolatorType;
+
+        typedef typename BaseClass::TransformationType TransformationType;
+        typedef typename BaseClass::ParaStatusType ParaStatusType;
+
+        typedef ValueType T;
+        typedef ValueType element_type;
+        typedef ValueType value_type;
+
+        typedef CoordType coord_type;
+
+        typedef typename TransformationType::input_point_type input_point_type;
+        typedef typename TransformationType::output_point_type output_point_type;
+
+        typedef typename TransformationType::jacobian_parameter_type jacobian_parameter_type;
+        typedef typename TransformationType::jacobian_position_type jacobian_position_type;
+
+        typedef typename BaseClass::ImageRegWarperType ImageRegWarperType;
+
+        typedef typename BaseClass::ImageRegDissimilarityType ImageRegDissimilarityType;
+
+        hoImageRegParametricDownHillSolver();
+        virtual ~hoImageRegParametricDownHillSolver();
+
+        /// perform one iteration of optimization
+        virtual ValueType solver_once(ValueType curr_dissimilarity);
+
+        virtual void print(std::ostream& os) const;
+
+        using BaseClass::iter_num_;
+        using BaseClass::max_iter_num_;
+        using BaseClass::dissimilarity_thres_;
+        using BaseClass::parameter_thres_;
+        using BaseClass::div_num_;
+        using BaseClass::step_size_para_;
+        using BaseClass::step_size_div_para_;
+
+        using BaseClass::verbose_;
+        using BaseClass::gt_timer1_;
+        using BaseClass::gt_timer2_;
+        using BaseClass::gt_timer3_;
+        using BaseClass::performTiming_;
+        using BaseClass::gt_exporter_;
+        using BaseClass::debugFolder_;
+
+    protected:
+
+        using BaseClass::transform_;
+
+        using BaseClass::curr_dissimilarity_;
+        using BaseClass::prev_dissimilarity_;
+
+        using BaseClass::target_;
+        using BaseClass::source_;
+        using BaseClass::warpped_;
+        using BaseClass::bg_value_;
+        using BaseClass::interp_;
+        using BaseClass::warper_;
+        using BaseClass::dissimilarity_;
+        using BaseClass::use_world_coordinate_;
+    };
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    hoImageRegParametricDownHillSolver<ValueType, CoordType, DIn, DOut>::hoImageRegParametricDownHillSolver() : BaseClass()
+    {
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    hoImageRegParametricDownHillSolver<ValueType, CoordType, DIn, DOut>::~hoImageRegParametricDownHillSolver()
+    {
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    ValueType hoImageRegParametricDownHillSolver<ValueType, CoordType, DIn, DOut>::solver_once(ValueType curr_dissimilarity)
+    {
+        ValueType prevValue = curr_dissimilarity;
+        ValueType currValue;
+
+        size_t optimizedDimIndex = 0;
+        ValueType positiveStepFlag = 0;
+
+        size_t numOfPara = transform_->get_number_of_parameters();
+        size_t i;
+
+        ValueType currPara(0);
+        for ( i=0; i<numOfPara; i++ )
+        {
+            if ( transform_->get_para_status(i) == TransformationType::Active)
+            {
+                currPara = transform_->get_parameter(i);
+
+                // positive
+                transform_->set_parameter(i, currPara + step_size_para_[i]);
+
+                GADGET_CHECK_RETURN_FALSE(warper_->warp(*target_, *source_, use_world_coordinate_, warpped_));
+                currValue = dissimilarity_->evaluate(warpped_);
+
+                if ( currValue < curr_dissimilarity + dissimilarity_thres_ )
+                {
+                    curr_dissimilarity = currValue;
+                    optimizedDimIndex =  i;
+                    positiveStepFlag = 1;
+                }
+
+                // negative
+                transform_->set_parameter(i, currPara - step_size_para_[i]);
+
+                GADGET_CHECK_RETURN_FALSE(warper_->warp(*target_, *source_, use_world_coordinate_, warpped_));
+                currValue = dissimilarity_->evaluate(warpped_);
+
+                if ( currValue < curr_dissimilarity + dissimilarity_thres_ )
+                {
+                    curr_dissimilarity = currValue;
+                    optimizedDimIndex =  i;
+                    positiveStepFlag = -1;
+                }
+
+                transform_->set_parameter(i, currPara);
+            }
+        }
+
+        if ( curr_dissimilarity < prevValue )
+        {
+            currPara = transform_->get_parameter(optimizedDimIndex);
+            transform_->set_parameter(optimizedDimIndex, currPara+positiveStepFlag*step_size_para_[optimizedDimIndex]);
+        }
+
+        return curr_dissimilarity;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    void hoImageRegParametricDownHillSolver<ValueType, CoordType, DIn, DOut>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "-------------- Gadgetron DownHill image registration solver -------------" << endl;
+        BaseClass::print(os);
+    }
+}
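The down-hill step above probes every active parameter with plus and minus its step size, remembers the single most improving move, and commits that move only if it lowered the dissimilarity. A standalone sketch of the same logic against a generic cost callback; the names are illustrative, and the Gadgetron warper/dissimilarity pipeline is replaced by the callback:

    #include <cstddef>
    #include <functional>
    #include <initializer_list>
    #include <vector>

    double downhill_once(std::vector<double>& p,                      // transform parameters
                         const std::vector<double>& step,             // per-parameter step sizes
                         double curr_cost, double thres,
                         const std::function<double(const std::vector<double>&)>& cost)
    {
        const double initial_cost = curr_cost;
        std::size_t best_i = 0;
        double best_sign = 0.0;

        for (std::size_t i = 0; i < p.size(); ++i)
        {
            for (double sign : {1.0, -1.0})
            {
                const double saved = p[i];
                p[i] = saved + sign * step[i];
                const double c = cost(p);
                if (c < curr_cost + thres) { curr_cost = c; best_i = i; best_sign = sign; }
                p[i] = saved;                                          // restore before the next probe
            }
        }

        if (curr_cost < initial_cost)                                  // commit the best single move
            p[best_i] += best_sign * step[best_i];

        return curr_cost;
    }
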
diff --git a/toolboxes/registration/optical_flow/cpu/solver/hoImageRegParametricGradientDescentSolver.h b/toolboxes/registration/optical_flow/cpu/solver/hoImageRegParametricGradientDescentSolver.h
new file mode 100644
index 0000000..22e5b8f
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/solver/hoImageRegParametricGradientDescentSolver.h
@@ -0,0 +1,146 @@
+/** \file   hoImageRegParametricGradientDescentSolver.h
+    \brief  Define the class of simple gradient descent solver for parametric image transformation, no line search is performed in this solver
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoImageRegParametricSolver.h"
+
+namespace Gadgetron
+{
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    class hoImageRegParametricGradientDescentSolver : public hoImageRegParametricSolver<ValueType, CoordType, DIn, DOut>
+    {
+    public:
+
+        typedef hoImageRegParametricGradientDescentSolver<ValueType, CoordType, DIn, DOut> Self;
+        typedef hoImageRegParametricSolver<ValueType, CoordType, DIn, DOut> BaseClass;
+
+        typedef hoNDImage<ValueType, DOut> TargetType;
+        typedef hoNDImage<ValueType, DIn> SourceType;
+
+        typedef hoNDImage<ValueType, 2> Target2DType;
+        typedef Target2DType Source2DType;
+
+        typedef hoNDImage<ValueType, 3> Target3DType;
+        typedef Target3DType Source3DType;
+
+        typedef typename BaseClass::InterpolatorType InterpolatorType;
+
+        typedef typename BaseClass::TransformationType TransformationType;
+
+        typedef ValueType T;
+        typedef ValueType element_type;
+        typedef ValueType value_type;
+
+        typedef CoordType coord_type;
+
+        typedef typename TransformationType::input_point_type input_point_type;
+        typedef typename TransformationType::output_point_type output_point_type;
+
+        typedef typename TransformationType::jacobian_parameter_type jacobian_parameter_type;
+        typedef typename TransformationType::jacobian_position_type jacobian_position_type;
+
+        typedef typename BaseClass::ImageRegWarperType ImageRegWarperType;
+
+        typedef typename BaseClass::ImageRegDissimilarityType ImageRegDissimilarityType;
+
+        hoImageRegParametricGradientDescentSolver();
+        virtual ~hoImageRegParametricGradientDescentSolver();
+
+        /// perform one iteration of optimization
+        virtual ValueType solver_once(ValueType curr_dissimilarity);
+
+        virtual void print(std::ostream& os) const;
+
+        using BaseClass::iter_num_;
+        using BaseClass::max_iter_num_;
+        using BaseClass::dissimilarity_thres_;
+        using BaseClass::parameter_thres_;
+        using BaseClass::div_num_;
+        using BaseClass::step_size_para_;
+        using BaseClass::step_size_div_para_;
+
+        using BaseClass::verbose_;
+        using BaseClass::gt_timer1_;
+        using BaseClass::gt_timer2_;
+        using BaseClass::gt_timer3_;
+        using BaseClass::performTiming_;
+        using BaseClass::gt_exporter_;
+        using BaseClass::debugFolder_;
+
+    protected:
+
+        using BaseClass::transform_;
+
+        using BaseClass::curr_dissimilarity_;
+        using BaseClass::prev_dissimilarity_;
+
+        using BaseClass::target_;
+        using BaseClass::source_;
+        using BaseClass::warpped_;
+        using BaseClass::bg_value_;
+        using BaseClass::interp_;
+        using BaseClass::warper_;
+        using BaseClass::dissimilarity_;
+        using BaseClass::use_world_coordinate_;
+    };
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    hoImageRegParametricGradientDescentSolver<ValueType, CoordType, DIn, DOut>::hoImageRegParametricGradientDescentSolver() : BaseClass()
+    {
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    hoImageRegParametricGradientDescentSolver<ValueType, CoordType, DIn, DOut>::~hoImageRegParametricGradientDescentSolver()
+    {
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    ValueType hoImageRegParametricGradientDescentSolver<ValueType, CoordType, DIn, DOut>::solver_once(ValueType curr_dissimilarity)
+    {
+        std::vector<ValueType> deriv;
+        GADGET_CHECK_RETURN_FALSE(this->evaluateDeriv(transform_, dissimilarity_, step_size_para_, deriv));
+
+        size_t numOfPara = transform_->get_number_of_parameters();
+        size_t i;
+        ValueType prevValue, currPara(0);
+
+        while ( 1 )
+        {
+            prevValue = curr_dissimilarity;
+
+            for ( i=0; i<numOfPara; i++ )
+            {
+                currPara = transform_->get_parameter(i);
+                transform_->set_parameter( i, currPara-step_size_para_[i]*deriv[i] );
+            }
+
+            GADGET_CHECK_RETURN_FALSE(warper_->warp(*target_, *source_, use_world_coordinate_, warpped_));
+            curr_dissimilarity = dissimilarity_->evaluate(warpped_);
+
+            if ( curr_dissimilarity > prevValue + dissimilarity_thres_ )
+            {
+                break;
+            }
+        }
+
+        // rewind
+        for ( i=0; i<numOfPara; i++ )
+        {
+            currPara = transform_->get_parameter(i);
+            transform_->set_parameter( i, currPara+step_size_para_[i]*deriv[i] );
+        }
+
+        return prevValue;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    void hoImageRegParametricGradientDescentSolver<ValueType, CoordType, DIn, DOut>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "-------------- Gadgetron gradient descent image registration solver -------------" << endl;
+        BaseClass::print(os);
+    }
+}
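The gradient-descent step above first asks the base class for a central-difference derivative of the dissimilarity with respect to each parameter (see evaluateDeriv in hoImageRegParametricSolver.h below), then marches along the negative gradient with fixed per-parameter steps until the cost stops decreasing, and finally rewinds the last, worsening step. A standalone sketch with a generic cost callback in place of the warper/dissimilarity pair; the names are illustrative:

    #include <cstddef>
    #include <functional>
    #include <vector>

    double gradient_descent_once(std::vector<double>& p,
                                 const std::vector<double>& step,     // finite-difference width and step size
                                 double curr_cost, double thres,
                                 const std::function<double(const std::vector<double>&)>& cost)
    {
        // central-difference derivative of the cost w.r.t. each parameter
        std::vector<double> deriv(p.size());
        for (std::size_t i = 0; i < p.size(); ++i)
        {
            const double saved = p[i];
            p[i] = saved + step[i]; const double fp = cost(p);
            p[i] = saved - step[i]; const double fm = cost(p);
            p[i] = saved;
            deriv[i] = (fp - fm) / (2.0 * step[i]);
        }

        // march until the cost stops decreasing, then rewind the last step
        double prev = curr_cost;
        while (true)
        {
            prev = curr_cost;
            for (std::size_t i = 0; i < p.size(); ++i)
                p[i] -= step[i] * deriv[i];
            curr_cost = cost(p);
            if (curr_cost > prev + thres) break;
        }
        for (std::size_t i = 0; i < p.size(); ++i)
            p[i] += step[i] * deriv[i];

        return prev;
    }
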
diff --git a/toolboxes/registration/optical_flow/cpu/solver/hoImageRegParametricSolver.h b/toolboxes/registration/optical_flow/cpu/solver/hoImageRegParametricSolver.h
new file mode 100644
index 0000000..ac8c7b6
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/solver/hoImageRegParametricSolver.h
@@ -0,0 +1,326 @@
+/** \file   hoImageRegParametricSolver.h
+    \brief  Define the base class of image registration solver for parametric image transformation
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoImageRegSolver.h"
+
+namespace Gadgetron
+{
+    /// ValueType: image pixel value type
+    /// CoordType: transformation data type
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    class hoImageRegParametricSolver : public hoImageRegSolver<ValueType, CoordType, DIn, DOut>
+    {
+    public:
+
+        typedef hoImageRegParametricSolver<ValueType, CoordType, DIn, DOut> Self;
+        typedef hoImageRegSolver<ValueType, CoordType, DIn, DOut> BaseClass;
+
+        typedef hoNDImage<ValueType, DOut> TargetType;
+        typedef hoNDImage<ValueType, DIn> SourceType;
+
+        typedef hoNDImage<ValueType, 2> Target2DType;
+        typedef Target2DType Source2DType;
+
+        typedef hoNDImage<ValueType, 3> Target3DType;
+        typedef Target3DType Source3DType;
+
+        typedef typename BaseClass::InterpolatorType InterpolatorType;
+
+        typedef hoImageRegParametricTransformation<CoordType, DIn, DOut> TransformationType;
+        typedef typename TransformationType::ParaStatus ParaStatusType;
+
+        typedef ValueType T;
+        typedef ValueType element_type;
+        typedef ValueType value_type;
+
+        typedef CoordType coord_type;
+
+        typedef typename TransformationType::input_point_type input_point_type;
+        typedef typename TransformationType::output_point_type output_point_type;
+
+        typedef typename TransformationType::jacobian_parameter_type jacobian_parameter_type;
+        typedef typename TransformationType::jacobian_position_type jacobian_position_type;
+
+        typedef typename BaseClass::ImageRegWarperType ImageRegWarperType;
+
+        typedef typename BaseClass::ImageRegDissimilarityType ImageRegDissimilarityType;
+
+        hoImageRegParametricSolver();
+        virtual ~hoImageRegParametricSolver();
+
+        void setTransform(TransformationType& transform) { transform_ = &transform; }
+
+        virtual bool initialize();
+
+        /// solve the minimization and find the optimal transformation
+        virtual bool solve();
+
+        /// perform one iteration of optimization
+        virtual ValueType solver_once(ValueType curr_dissimilarity) = 0;
+
+        /// compute the derivative of the dissimilarity measure with respect to the transformation parameters
+        /// if the analytic derivative is not available, a central difference approximation is used
+        /// deriv_step_size is the step size used for the central differences
+        virtual bool evaluateDeriv(TransformationType* transform, ImageRegDissimilarityType* dissimilarity, const std::vector<ValueType>& deriv_step_size, std::vector<ValueType>& deriv);
+
+        virtual void print(std::ostream& os) const;
+
+        /// number of performed iterations
+        unsigned int iter_num_;
+
+        /// maximal number of iterations
+        unsigned int max_iter_num_;
+
+        /// threshold for minimal dissimilarity changes
+        ValueType dissimilarity_thres_;
+
+        /// threshold for minimal parameter changes
+        ValueType parameter_thres_;
+
+        /// number of search divisions
+        unsigned int div_num_;
+
+        /// step size for every parameter used in the optimization
+        /// depending on the optimization algorithm, this variable may not be used
+        std::vector<ValueType> step_size_para_;
+        /// step size division ratio
+        /// step_size_para_ = step_size_para_ .* step_size_div_para_ to reduce search step size
+        std::vector<ValueType> step_size_div_para_;
+
+        using BaseClass::verbose_;
+        using BaseClass::gt_timer1_;
+        using BaseClass::gt_timer2_;
+        using BaseClass::gt_timer3_;
+        using BaseClass::performTiming_;
+        using BaseClass::gt_exporter_;
+        using BaseClass::debugFolder_;
+
+    protected:
+
+        TransformationType* transform_;
+
+        ValueType curr_dissimilarity_;
+        ValueType prev_dissimilarity_;
+
+        using BaseClass::target_;
+        using BaseClass::source_;
+        using BaseClass::warpped_;
+        using BaseClass::bg_value_;
+        using BaseClass::interp_;
+        using BaseClass::warper_;
+        using BaseClass::dissimilarity_;
+        using BaseClass::use_world_coordinate_;
+    };
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    hoImageRegParametricSolver<ValueType, CoordType, DIn, DOut>::hoImageRegParametricSolver() 
+        : BaseClass(), dissimilarity_thres_(1e-8), parameter_thres_(1e-8)
+    {
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    hoImageRegParametricSolver<ValueType, CoordType, DIn, DOut>::~hoImageRegParametricSolver()
+    {
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    bool hoImageRegParametricSolver<ValueType, CoordType, DIn, DOut>::initialize()
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(target_!=NULL);
+            GADGET_CHECK_RETURN_FALSE(source_!=NULL);
+            GADGET_CHECK_RETURN_FALSE(interp_!=NULL);
+            GADGET_CHECK_RETURN_FALSE(warper_!=NULL);
+            GADGET_CHECK_RETURN_FALSE(dissimilarity_!=NULL);
+            GADGET_CHECK_RETURN_FALSE(transform_!=NULL);
+
+            warper_->setTransformation(*transform_);
+            warper_->setInterpolator(*interp_);
+            warper_->setBackgroundValue(bg_value_);
+
+            dissimilarity_->setBackgroundValue(bg_value_);
+
+            if ( !warpped_.dimensions_equal(*target_) )
+            {
+                warpped_ = *target_;
+            }
+
+            dissimilarity_->initialize(*target_);
+
+            if ( step_size_para_.size() != transform_->get_number_of_parameters() )
+            {
+                step_size_para_.resize(transform_->get_number_of_parameters(), 1.0);
+            }
+
+            if ( step_size_div_para_.size() != transform_->get_number_of_parameters() )
+            {
+                step_size_div_para_.resize(transform_->get_number_of_parameters(), 0.5);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegParametricSolver<ValueType, CoordType, DIn, DOut>::initialize() ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    bool hoImageRegParametricSolver<ValueType, CoordType, DIn, DOut>::solve()
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(this->initialize());
+
+            size_t numOfPara = transform_->get_number_of_parameters();
+
+            GADGET_CHECK_RETURN_FALSE(warper_->warp(*target_, *source_, use_world_coordinate_, warpped_));
+            curr_dissimilarity_ = dissimilarity_->evaluate(warpped_);
+
+            if ( verbose_ ) { GDEBUG_STREAM("----> Initial image dissimilarity : " << curr_dissimilarity_); }
+
+            unsigned int totalIterNum = 0;
+
+            unsigned int div;
+            for ( div=0; div<div_num_; div++ )
+            {
+                if ( verbose_ ) { GDEBUG_STREAM("----> Parameter division " << div << " [out of " << div_num_ << "] "); }
+
+                for ( iter_num_=0; iter_num_<max_iter_num_; iter_num_++ )
+                {
+                    if ( verbose_ ) { GDEBUG_STREAM("--> Iteration " << iter_num_ << " [out of " << max_iter_num_ << "] : \t" << curr_dissimilarity_); }
+
+                    prev_dissimilarity_ = curr_dissimilarity_;
+
+                    curr_dissimilarity_ = this->solver_once(prev_dissimilarity_);
+
+                    // if the dissimilarity stops decreasing
+                    if ( prev_dissimilarity_ < curr_dissimilarity_ + dissimilarity_thres_ )
+                    {
+                        break;
+                    }
+                }
+                if ( verbose_ ) { transform_->printTransform(std::cout); }
+
+                totalIterNum += iter_num_;
+
+                // reduce the step size
+                size_t p;
+                for ( p=0; p<numOfPara; p++ )
+                {
+                    step_size_para_[p] *= step_size_div_para_[p];
+                }
+            }
+
+            if ( verbose_ ) { GDEBUG_STREAM("----> Total iteration number : " << totalIterNum); }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegParametricSolver<ValueType, CoordType, DIn, DOut>::solve() ... ");
+            return false;
+        }
+
+        return true;
+    }
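+
+    /// Editorial illustration, not part of the imported source: solve() above wraps solver_once()
+    /// in a coarse-to-fine search; whenever the dissimilarity stops improving, the per-parameter
+    /// step sizes are multiplied by their division ratios and the iterations start again. A minimal
+    /// sketch of that outer loop; the example_* name is hypothetical.
+    template <typename OneStep> 
+    inline double example_divided_search(OneStep solver_once, double dissimilarity,
+        std::vector<double>& step, const std::vector<double>& step_div,
+        unsigned int div_num, unsigned int max_iter, double thres)
+    {
+        for ( unsigned int div=0; div<div_num; div++ )
+        {
+            for ( unsigned int iter=0; iter<max_iter; iter++ )
+            {
+                double prev = dissimilarity;
+                dissimilarity = solver_once(prev);
+                if ( prev < dissimilarity + thres ) break; // no further improvement at this step size
+            }
+            // reduce the search step size for the next division
+            for ( size_t p=0; p<step.size(); p++ ) step[p] *= step_div[p];
+        }
+        return dissimilarity;
+    }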
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    bool hoImageRegParametricSolver<ValueType, CoordType, DIn, DOut>::
+    evaluateDeriv(TransformationType* transform, ImageRegDissimilarityType* dissimilarity, const std::vector<ValueType>& deriv_step_size, std::vector<ValueType>& deriv)
+    {
+        try
+        {
+            bool has_analytic_deriv = false;
+
+            /// for some transformation and dissimilarity combinations, the analytical derivative can be computed directly
+
+            /// otherwise, fall back to the central difference numerical derivative
+            if ( !has_analytic_deriv )
+            {
+                size_t numOfPara = transform_->get_number_of_parameters();
+                size_t i;
+
+                deriv.resize(numOfPara, 0);
+
+                ValueType currPara(0), positiveValue(0), negativeValue(0), normDeriv(0);
+                for ( i=0; i<numOfPara; i++ )
+                {
+                    if ( transform_->get_para_status(i) == TransformationType::Active )
+                    {
+                        currPara = transform_->get_parameter(i);
+
+                        // positive
+                        transform_->set_parameter(i, currPara + deriv_step_size[i]);
+
+                        GADGET_CHECK_RETURN_FALSE(warper_->warp(*target_, *source_, use_world_coordinate_, warpped_));
+                        positiveValue = dissimilarity_->evaluate(warpped_);
+
+                        // negative
+                        transform_->set_parameter(i, currPara - deriv_step_size[i]);
+
+                        GADGET_CHECK_RETURN_FALSE(warper_->warp(*target_, *source_, use_world_coordinate_, warpped_));
+                        negativeValue = dissimilarity_->evaluate(warpped_);
+
+                        deriv[i] = (positiveValue - negativeValue)/(2*deriv_step_size[i]);
+                        normDeriv += deriv[i]*deriv[i];
+
+                        transform_->set_parameter(i, currPara);
+                    }
+                }
+
+                if ( normDeriv > 0 )
+                {
+                    ValueType distDeriv=std::sqrt(normDeriv);
+
+                    for ( i=0; i<numOfPara; i++ )
+                    {
+                        deriv[i] /= distDeriv;
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in evaluateDeriv(TransformationType* transform, ImageRegDissimilarityType* dissimilarity, const std::vector<ValueType>& deriv_step_size, std::vector<ValueType>& deriv) ... ");
+            return false;
+        }
+
+        return true;
+    }
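+
+    /// Editorial illustration, not part of the imported source: evaluateDeriv() above approximates
+    /// every active parameter derivative by the central difference
+    /// d_i = ( f(p_i + h_i) - f(p_i - h_i) ) / (2*h_i) and then scales the gradient to unit length.
+    /// A minimal sketch on a plain cost functor; the example_* name is hypothetical.
+    template <typename Cost> 
+    inline void example_numeric_gradient(Cost cost, std::vector<double>& p,
+        const std::vector<double>& h, std::vector<double>& deriv)
+    {
+        deriv.assign(p.size(), 0.0);
+        double norm2 = 0;
+
+        for ( size_t i=0; i<p.size(); i++ )
+        {
+            const double p0 = p[i];
+            p[i] = p0 + h[i]; const double fp = cost(p);
+            p[i] = p0 - h[i]; const double fm = cost(p);
+            p[i] = p0; // restore the parameter
+
+            deriv[i] = (fp - fm)/(2*h[i]);
+            norm2 += deriv[i]*deriv[i];
+        }
+
+        if ( norm2 > 0 )
+        {
+            const double norm = std::sqrt(norm2);
+            for ( size_t i=0; i<p.size(); i++ ) deriv[i] /= norm;
+        }
+    }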
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    void hoImageRegParametricSolver<ValueType, CoordType, DIn, DOut>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "--------------Gagdgetron parametric image registration solver -------------" << endl;
+        os << "Target image dimension is : " << DIn << endl;
+        os << "Source image dimension is : " << DOut << endl;
+        os << "Image data type is : " << std::string(typeid(ValueType).name()) << std::endl;
+        os << "Transformation data type is : " << std::string(typeid(CoordType).name()) << std::endl;
+        os << "Use world coordinate is : " << use_world_coordinate_ << std::endl;
+        os << "Maximal iteration number is : " << max_iter_num_ << std::endl;
+        os << "Dissimilarity threshold is : " << dissimilarity_thres_ << std::endl;
+        os << "Parameter threshold is : " << parameter_thres_ << std::endl;
+        os << "Number of search division is : " << div_num_ << std::endl;
+
+        os << "Step size for every parameters used in optimization is : [ ";
+        unsigned int ii;
+        for ( ii=0; ii<step_size_para_.size(); ii++ )
+        {
+            os << step_size_para_[ii] << " ";
+        }
+        os << " ] " << endl;
+
+        os << "Step size division ratio is : [ ";
+        for ( ii=0; ii<step_size_div_para_.size(); ii++ )
+        {
+            os << step_size_div_para_[ii] << " ";
+        }
+        os << " ] " << endl;
+    }
+}
diff --git a/toolboxes/registration/optical_flow/cpu/solver/hoImageRegSolver.h b/toolboxes/registration/optical_flow/cpu/solver/hoImageRegSolver.h
new file mode 100644
index 0000000..da07f0d
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/solver/hoImageRegSolver.h
@@ -0,0 +1,210 @@
+/** \file   hoImageRegSolver.h
+    \brief  Define the base class of image registration solvers for Gadgetron
+
+            The solver takes in the image warper, the dissimilarity measure, and the target and source
+            images, and solves for an optimal image transformation.
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoNDArray.h"
+#include "hoNDImage.h"
+#include "hoNDInterpolator.h"
+#include "hoNDBoundaryHandler.h"
+#include "hoMatrix.h"
+#include "hoNDArray_utils.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDImage_util.h"
+#include "hoImageRegTransformation.h"
+#include "hoImageRegWarper.h"
+#include "hoImageRegDissimilarity.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "GtPrepUtil.h"
+
+#ifdef USE_OMP
+    #include <omp.h>
+#endif // USE_OMP
+
+namespace Gadgetron
+{
+    // define the solver type
+    enum GT_IMAGE_REG_SOLVER
+    {
+        GT_IMAGE_REG_SOLVER_DOWNHILL,
+        GT_IMAGE_REG_SOLVER_GRADIENT_DESCENT,
+        GT_IMAGE_REG_SOLVER_PDE_TIME_INTEGRATION,
+        GT_IMAGE_REG_SOLVER_PDE_TIME_INTEGRATION_INV
+    };
+
+    inline std::string getImageRegSolverName(GT_IMAGE_REG_SOLVER v)
+    {
+        std::string name;
+
+        switch (v)
+        {
+            case GT_IMAGE_REG_SOLVER_DOWNHILL:
+                name = "DownHill";
+                break;
+
+            case GT_IMAGE_REG_SOLVER_GRADIENT_DESCENT:
+                name = "GradientDescent";
+                break;
+
+            case GT_IMAGE_REG_SOLVER_PDE_TIME_INTEGRATION:
+                name = "PDE_Time_Integration";
+                break;
+
+            case GT_IMAGE_REG_SOLVER_PDE_TIME_INTEGRATION_INV:
+                name = "PDE_Time_Integration_Inv";
+                break;
+
+            default:
+                GERROR_STREAM("Unrecognized image registration solver type : " << v);
+        }
+
+        return name;
+    }
+
+    inline GT_IMAGE_REG_SOLVER getImageRegSolverType(const std::string& name)
+    {
+        GT_IMAGE_REG_SOLVER v;
+
+        if ( name == "DownHill" )
+        {
+            v = GT_IMAGE_REG_SOLVER_DOWNHILL;
+        }
+        else if ( name == "GradientDescent" )
+        {
+            v = GT_IMAGE_REG_SOLVER_GRADIENT_DESCENT;
+        }
+        else if ( name == "PDE_Time_Integration" )
+        {
+            v = GT_IMAGE_REG_SOLVER_PDE_TIME_INTEGRATION;
+        }
+        else if ( name == "PDE_Time_Integration_Inv" )
+        {
+            v = GT_IMAGE_REG_SOLVER_PDE_TIME_INTEGRATION_INV;
+        }
+        else
+        {
+            GERROR_STREAM("Unrecognized image registration solver name : " << name);
+        }
+
+        return v;
+    }
+
+    /// ValueType: image pixel value type
+    /// CoordType: transformation data type
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    class hoImageRegSolver
+    {
+    public:
+
+        typedef hoImageRegSolver<ValueType, CoordType, DIn, DOut> Self;
+
+        typedef hoNDImage<ValueType, DOut> TargetType;
+        typedef hoNDImage<ValueType, DIn> SourceType;
+
+        typedef hoNDImage<ValueType, 2> Target2DType;
+        typedef Target2DType Source2DType;
+
+        typedef hoNDImage<ValueType, 3> Target3DType;
+        typedef Target3DType Source3DType;
+
+        typedef hoNDInterpolator<SourceType> InterpolatorType;
+
+        typedef ValueType T;
+        typedef ValueType element_type;
+        typedef ValueType value_type;
+
+        typedef CoordType coord_type;
+
+        typedef hoImageRegWarper<ValueType, CoordType, DIn, DOut> ImageRegWarperType;
+
+        typedef hoImageRegDissimilarity<ValueType, DOut> ImageRegDissimilarityType;
+
+        hoImageRegSolver();
+        virtual ~hoImageRegSolver();
+
+        void setTarget(TargetType& target) { target_ = &target; }
+        void setSource(SourceType& source) { source_ = &source; }
+
+        void setDissimilarity(ImageRegDissimilarityType& dissimilarity) { dissimilarity_ = &dissimilarity; }
+        void setWarper(ImageRegWarperType& warper) { warper_ = &warper; }
+        void setInterpolator(InterpolatorType& interp) { interp_ = &interp; }
+        void setBackgroundValue(ValueType bg_value) { bg_value_ = bg_value; }
+
+        void setUseWorldCoordinate(bool use_world_coordinate) { use_world_coordinate_ = use_world_coordinate; }
+
+        virtual bool solve() = 0;
+
+        virtual void print(std::ostream& os) const;
+
+        /// if true, print out more intermediate information
+        bool verbose_;
+
+        // ----------------------------------
+        // debug and timing
+        // ----------------------------------
+        // clock for timing
+        Gadgetron::GadgetronTimer gt_timer1_;
+        Gadgetron::GadgetronTimer gt_timer2_;
+        Gadgetron::GadgetronTimer gt_timer3_;
+
+        bool performTiming_;
+
+        // exporter
+        Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+        // debug folder
+        std::string debugFolder_;
+
+    protected:
+
+        TargetType* target_;
+        SourceType* source_;
+
+        // warped image
+        TargetType warpped_;
+
+        ValueType bg_value_;
+
+        InterpolatorType* interp_;
+
+        ImageRegWarperType* warper_;
+
+        ImageRegDissimilarityType* dissimilarity_;
+
+        /// whether to perform registration using the world coordinates
+        bool use_world_coordinate_;
+    };
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    hoImageRegSolver<ValueType, CoordType, DIn, DOut>::hoImageRegSolver() 
+        : target_(NULL), source_(NULL), bg_value_(0), interp_(NULL), warper_(NULL), dissimilarity_(NULL), verbose_(false), use_world_coordinate_(true), performTiming_(false)
+    {
+        gt_timer1_.set_timing_in_destruction(false);
+        gt_timer2_.set_timing_in_destruction(false);
+        gt_timer3_.set_timing_in_destruction(false);
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    hoImageRegSolver<ValueType, CoordType, DIn, DOut>::~hoImageRegSolver()
+    {
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    void hoImageRegSolver<ValueType, CoordType, DIn, DOut>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "--------------Gagdgetron image registration solver -------------" << endl;
+        os << "Target image dimension is : " << DIn << endl;
+        os << "Source image dimension is : " << DOut << endl;
+        os << "Image data type is : " << std::string(typeid(ValueType).name()) << std::endl;
+        os << "Transformation data type is : " << std::string(typeid(CoordType).name()) << std::endl;
+        os << "Use world coordinate is : " << use_world_coordinate_ << std::endl;
+        os << "verbose flag is : " << verbose_ << std::endl;
+    }
+}
diff --git a/toolboxes/registration/optical_flow/cpu/transformation/hoImageRegDeformationField.h b/toolboxes/registration/optical_flow/cpu/transformation/hoImageRegDeformationField.h
new file mode 100644
index 0000000..22c3c87
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/transformation/hoImageRegDeformationField.h
@@ -0,0 +1,965 @@
+/** \file   hoImageRegDeformationField.h
+    \brief  Define the geometry transformation using a deformation field
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoImageRegNonParametricTransformation.h"
+
+namespace Gadgetron
+{
+    /// the deformation field is stored as one hoNDImage per dimension
+    /// the deformation field can be accessed directly at integer image pixel locations
+    /// if non-integer pixel locations are used to access the deformation field, an image interpolator is applied
+    /// a linear interpolator is used for the deformation field
+    /// the stored deformation field is in pixel units, not in world coordinates
+    template<typename ValueType, unsigned int D> 
+    class  hoImageRegDeformationField: public hoImageRegNonParametricTransformation<ValueType, D, D>
+    {
+    public:
+
+        typedef hoImageRegTransformation<ValueType, D, D> Self;
+        typedef hoImageRegNonParametricTransformation<ValueType, D, D> BaseClass;
+
+        typedef ValueType T;
+
+        typedef typename BaseClass::input_point_type input_point_type;
+        typedef typename BaseClass::output_point_type output_point_type;
+
+        typedef typename BaseClass::jacobian_position_type jacobian_position_type;
+
+        typedef hoNDImage<T, D> DeformationFieldType;
+
+        typedef typename DeformationFieldType::coord_type coord_type;
+
+        typedef typename DeformationFieldType::axis_type axis_type;
+
+        typedef hoNDBoundaryHandler<DeformationFieldType> BoundHanlderType;
+        typedef hoNDInterpolator<DeformationFieldType> InterpolatorType;
+
+        typedef hoNDInterpolatorLinear<DeformationFieldType> DefaultInterpolatorType;
+        typedef hoNDBoundaryHandlerBorderValue<DeformationFieldType> DefaultBoundHanlderType;
+
+        hoImageRegDeformationField();
+        hoImageRegDeformationField(const std::vector<size_t>& dimensions);
+        hoImageRegDeformationField(const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin, const axis_type& axis);
+        hoImageRegDeformationField(const hoNDImage<ValueType, D>& im);
+
+        virtual ~hoImageRegDeformationField();
+
+        virtual bool invertTransformation();
+
+        virtual bool setIdentity();
+
+        /// update the internal status after the deformation fields are changed
+        virtual bool update();
+
+        /// transform a point
+        /// the point is in the non-integer image pixel indexes
+        /// image interpolator is used
+        virtual bool transform(const T* pt_in, T* pt_out) const;
+        virtual bool transform(const T& xi, const T& yi, T& xo, T& yo) const;
+        virtual bool transform(const T& xi, const T& yi, const T& zi, T& xo, T& yo, T& zo) const;
+
+        /// transform a point
+        /// the point is in the integer image pixel indexes
+        /// image interpolator is not used
+        /// pt_in, pt_out store a point as an array
+        virtual bool transform(const size_t* pt_in, T* pt_out) const;
+        virtual bool transform(const size_t* pt_in, size_t N, T* pt_out) const;
+
+        /// for 2D - 2D transformation
+        virtual bool transform(const size_t& xi, const size_t& yi, T& xo, T& yo) const;
+        virtual bool transform(const size_t* xi, const size_t* yi, size_t N, T* xo, T* yo) const;
+
+        /// for 3D - 3D transformation
+        virtual bool transform(const size_t& xi, const size_t& yi, const size_t& zi, T& xo, T& yo, T& zo) const;
+        virtual bool transform(const size_t* xi, const size_t* yi, const size_t* zi, size_t N, T* xo, T* yo, T* zo) const;
+
+        /// compute the jacobian matrix with respect to the spatial position
+        /// the jacobian matrix is computed with the compensation for non-isotropic pixel sizes
+        /// e.g. dxdy = ( dx(x,y+dh)*sx - dx(x, y-dh)*sx ) / (2*dh*sy); sx, sy: pixel sizes for x and y directions
+        /// DOut*DIn matrix
+        virtual bool jacobianPosition(const input_point_type& /*pos*/, jacobian_position_type& jac);
+
+        /// compute jacobian matrix on the deformation grid
+        /// jac is [DOut Din dimensions] array, storing the jacobian matrix for every point in the deformation field
+        virtual bool jacobianPosition(hoNDArray<T>& jac, DeformationFieldType* deform_field[D], unsigned int borderWidth=1);
+        virtual bool jacobianPosition(hoNDArray<T>& jac, unsigned int borderWidth=1);
+
+        /// compute summary measures from the deformation field and its jacobian matrix
+        /// in world coordinates
+        virtual bool analyzeJacobianAndDeformation(const hoNDArray<T>& jac, DeformationFieldType* deform_field[D], T& meanDeform, T& maxDeform, T& meanLogJac, T& maxLogJac, unsigned int borderWidth=1);
+        virtual bool analyzeJacobianAndDeformation(const hoNDArray<T>& jac, T& meanDeform, T& maxDeform, T& meanLogJac, T& maxLogJac, unsigned int borderWidth=1);
+
+        /// get/set the deformation vector on the deformation grid (image coordinate)
+        /// given the index idx[DIn], output the deformation value for outDim
+        T& operator()( size_t idx[D], size_t outDim );
+        const T& operator()( size_t idx[D], size_t outDim ) const;
+
+        void get(size_t idx[D], T deform[D]);
+        void get(size_t x, size_t y, T& dx, T& dy);
+        void get(size_t x, size_t y, size_t z, T& dx, T& dy, T& dz);
+
+        void set(size_t idx[D], T deform[D]);
+        void set(size_t x, size_t y, T dx, T dy);
+        void set(size_t x, size_t y, size_t z, T dx, T dy, T dz);
+
+        /// get/set the deformation vector on the world coordinate
+        /// given the position pos[DIn], output the deformation value for outDim
+        T operator()( coord_type pos[D], size_t outDim );
+
+        void get(coord_type pos[D], T deform[D]);
+        void get(coord_type px, coord_type py, T& dx, T& dy);
+        void get(coord_type px, coord_type py, coord_type pz, T& dx, T& dy, T& dz);
+
+        /// get/set interpolator
+        //void getInterpolator(InterpolatorType*& interp, size_t outDim);
+        //void setInterpolator(InterpolatorType* interp, size_t outDim);
+
+        /// get/set deformation field
+        void getDeformationField(DeformationFieldType*& deform, size_t outDim);
+        DeformationFieldType& getDeformationField(size_t outDim) { GADGET_DEBUG_CHECK_THROW(outDim<D); return this->deform_field_[outDim]; }
+
+        void setDeformationField(const DeformationFieldType& deform, size_t outDim);
+
+        /// serialize/deserialize the transformation
+        virtual bool serialize(char*& buf, size_t& len) const ;
+        virtual bool deserialize(char* buf, size_t& len);
+
+        virtual void print(std::ostream& os) const;
+
+        virtual std::string transformationName() const;
+
+        using BaseClass::gt_timer1_;
+        using BaseClass::gt_timer2_;
+        using BaseClass::gt_timer3_;
+        using BaseClass::performTiming_;
+        using BaseClass::gt_exporter_;
+        using BaseClass::debugFolder_;
+
+    protected:
+
+        DeformationFieldType deform_field_[D];
+
+        //InterpolatorType* interp_[D];
+
+        DefaultInterpolatorType* interp_default_[D];
+        DefaultBoundHanlderType* bh_default_[D];
+    };
+
+    template <typename ValueType, unsigned int D> 
+    hoImageRegDeformationField<ValueType, D>::hoImageRegDeformationField() : BaseClass()
+    {
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            //interp_[ii] = NULL;
+            bh_default_[ii] = new DefaultBoundHanlderType(deform_field_[ii]);
+            interp_default_[ii] = new DefaultInterpolatorType(deform_field_[ii], *(bh_default_[ii]));
+        }
+    }
+
+    template <typename ValueType, unsigned int D> 
+    hoImageRegDeformationField<ValueType, D>::
+    hoImageRegDeformationField(const std::vector<size_t>& dimensions) : BaseClass()
+    {
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            deform_field_[ii].create(dimensions);
+            memset(deform_field_[ii].get_data_ptr(), 0, deform_field_[ii].get_number_of_elements()*sizeof(T));
+
+            //interp_[ii] = NULL;
+
+            bh_default_[ii] = new DefaultBoundHanlderType(deform_field_[ii]);
+            interp_default_[ii] = new DefaultInterpolatorType(deform_field_[ii], *(bh_default_[ii]));
+        }
+    }
+
+    template <typename ValueType, unsigned int D> 
+    hoImageRegDeformationField<ValueType, D>::
+    hoImageRegDeformationField(const std::vector<size_t>& dimensions, const std::vector<coord_type>& pixelSize, const std::vector<coord_type>& origin, const axis_type& axis) : BaseClass()
+    {
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            deform_field_[ii].create(dimensions, pixelSize, origin, axis);
+            memset(deform_field_[ii].get_data_ptr(), 0, deform_field_[ii].get_number_of_elements()*sizeof(T));
+
+            //interp_[ii] = NULL;
+
+            bh_default_[ii] = new DefaultBoundHanlderType(deform_field_[ii]);
+            interp_default_[ii] = new DefaultInterpolatorType(deform_field_[ii], *(bh_default_[ii]));
+        }
+    }
+
+    template <typename ValueType, unsigned int D> 
+    hoImageRegDeformationField<ValueType, D>::hoImageRegDeformationField(const hoNDImage<ValueType, D>& im) : BaseClass()
+    {
+        std::vector<size_t> dim;
+        im.get_dimensions(dim);
+
+        std::vector<coord_type> pixelSize;
+        im.get_pixel_size(pixelSize);
+
+        std::vector<coord_type> origin;
+        im.get_origin(origin);
+
+        axis_type axis;
+        im.get_axis(axis);
+
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            deform_field_[ii].create(dim, pixelSize, origin, axis);
+            memset(deform_field_[ii].get_data_ptr(), 0, deform_field_[ii].get_number_of_elements()*sizeof(T));
+
+            //interp_[ii] = NULL;
+
+            bh_default_[ii] = new DefaultBoundHanlderType(deform_field_[ii]);
+            interp_default_[ii] = new DefaultInterpolatorType(deform_field_[ii], *(bh_default_[ii]));
+        }
+    }
+
+    template <typename ValueType, unsigned int D> 
+    hoImageRegDeformationField<ValueType, D>::
+    ~hoImageRegDeformationField()
+    {
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            delete bh_default_[ii];
+            delete interp_default_[ii];
+        }
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline bool hoImageRegDeformationField<ValueType, D>::invertTransformation()
+    {
+        /// to be implemented ... 
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline bool hoImageRegDeformationField<ValueType, D>::setIdentity()
+    {
+        try
+        {
+            unsigned int ii;
+            for ( ii=0; ii<D; ii++ )
+            {
+                memset(deform_field_[ii].get_data_ptr(), 0, deform_field_[ii].get_number_of_elements()*sizeof(T));
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationField<ValueType, D>::setIdentity() ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline bool hoImageRegDeformationField<ValueType, D>::update()
+    {
+        try
+        {
+            unsigned int ii;
+            for ( ii=0; ii<D; ii++ )
+            {
+                interp_default_[ii]->setArray(deform_field_[ii]);
+                interp_default_[ii]->setBoundaryHandler(*bh_default_[ii]);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationField<ValueType, D>::update() ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline bool hoImageRegDeformationField<ValueType, D>::transform(const T* pt_in, T* pt_out) const
+    {
+        try
+        {
+            std::vector<coord_type> pos(D);
+
+            int ii;
+            for ( ii=0; ii<(int)D; ii++ )
+            {
+                pos[ii] = pt_in[ii];
+            }
+
+            #pragma omp parallel for default(none) private(ii) shared(pos, pt_in, pt_out)
+            for ( ii=0; ii<(int)D; ii++ )
+            {
+                pt_out[ii] = pt_in[ii] + this->interp_default_[ii]->operator()(pos);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationField<ValueType, D>::transform(T* pt_in, T* pt_out) ... ");
+            return false;
+        }
+
+        return true;
+    }
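+
+    /// Editorial illustration, not part of the imported source: the transform above only adds the
+    /// linearly interpolated displacement, stored in pixel units, to the input pixel coordinate.
+    /// A minimal 1D sketch with a raw displacement array; the example_* name is hypothetical.
+    inline double example_apply_deformation_1d(const double* deform, size_t len, double x)
+    {
+        // assumes len >= 1; clamp to the grid, then interpolate the displacement linearly
+        if ( x < 0 ) x = 0;
+        if ( x > double(len-1) ) x = double(len-1);
+
+        const size_t x0 = (size_t)x;
+        const size_t x1 = ( x0+1 < len ) ? x0+1 : x0;
+        const double w = x - double(x0);
+
+        const double d = (1-w)*deform[x0] + w*deform[x1];
+        return x + d; // transformed coordinate, still in pixel units
+    }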
+
+    template <typename ValueType, unsigned int D> 
+    inline bool hoImageRegDeformationField<ValueType, D>::transform(const T& xi, const T& yi, T& xo, T& yo) const
+    {
+        try
+        {
+            xo = xi + (*interp_default_[0])(xi, yi);
+            yo = yi + (*interp_default_[1])(xi, yi);
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationField<ValueType, D>::transform(const T& xi, const T& yi, T& xo, T& yo) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline bool hoImageRegDeformationField<ValueType, D>::transform(const T& xi, const T& yi, const T& zi, T& xo, T& yo, T& zo) const
+    {
+        try
+        {
+            xo = xi + (*interp_default_[0])(xi, yi, zi);
+            yo = yi + (*interp_default_[1])(xi, yi, zi);
+            zo = zi + (*interp_default_[2])(xi, yi, zi);
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationField<ValueType, D>::transform(const T& xi, const T& yi, const T& zi, T& xo, T& yo, T& zo) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline bool hoImageRegDeformationField<ValueType, D>::transform(const size_t* pt_in, T* pt_out) const
+    {
+        try
+        {
+            unsigned int ii;
+            for ( ii=0; ii<D; ii++ )
+            {
+                pt_out[ii] = pt_in[ii] + this->deform_field_[ii](pt_in);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationField<ValueType, D>::transform(size_t* pt_in, T* pt_out) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline bool hoImageRegDeformationField<ValueType, D>::transform(const size_t* pt_in, size_t N, T* pt_out) const
+    {
+        try
+        {
+            long long n;
+            #pragma omp parallel for default(none) private(n) shared(N, pt_in, pt_out)
+            for( n=0; n<(long long)N; n++ )
+            {
+                this->transform(pt_in+n*D, pt_out+n*D);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationField<ValueType, D>::transform(size_t* pt_in, size_t N, T* pt_out) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline bool hoImageRegDeformationField<ValueType, D>::transform(const size_t& xi, const size_t& yi, T& xo, T& yo) const
+    {
+        try
+        {
+            xo = xi + this->deform_field_[0](xi, yi);
+            yo = yi + this->deform_field_[1](xi, yi);
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationField<ValueType, D>::transform(const size_t& xi, const size_t& yi, T& xo, T& yo) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline bool hoImageRegDeformationField<ValueType, D>::transform(const size_t* xi, const size_t* yi, size_t N, T* xo, T* yo) const
+    {
+        try
+        {
+            long long n;
+            #pragma omp parallel for default(none) private(n) shared(N, xi, yi, xo, yo)
+            for( n=0; n<(long long)N; n++ )
+            {
+                this->transform(xi[n], yi[n], xo[n], yo[n]);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationField<ValueType, D>::transform(size_t* xi, size_t* yi, size_t N, T* xo, T* yo) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline bool hoImageRegDeformationField<ValueType, D>::transform(const size_t& xi, const size_t& yi, const size_t& zi, T& xo, T& yo, T& zo) const
+    {
+        try
+        {
+            xo = xi + this->deform_field_[0](xi, yi, zi);
+            yo = yi + this->deform_field_[1](xi, yi, zi);
+            zo = zi + this->deform_field_[2](xi, yi, zi);
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationField<ValueType, D>::transform(const size_t& xi, const size_t& yi, const size_t& zi, T& xo, T& yo, T& zo) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline bool hoImageRegDeformationField<ValueType, D>::transform(const size_t* xi, const size_t* yi, const size_t* zi, size_t N, T* xo, T* yo, T* zo) const
+    {
+        try
+        {
+            long long n;
+            #pragma omp parallel for default(none) private(n) shared(N, xi, yi, zi, xo, yo, zo)
+            for( n=0; n<(long long)N; n++ )
+            {
+                this->transform(xi[n], yi[n], zi[n], xo[n], yo[n], zo[n]);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationField<ValueType, D>::transform(size_t* xi, size_t* yi, size_t* zi, size_t N, T* xo, T* yo, T* zo) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline bool hoImageRegDeformationField<ValueType, D>::jacobianPosition(const input_point_type& pos, jacobian_position_type& jac)
+    {
+        try
+        {
+            jac.createMatrix(D, D);
+
+            T delta = 0.5;
+            T deltaReciprocal = T(1.0)/(T(2.0)*delta);
+
+            std::vector<coord_type> pixelSize(D);
+
+            deform_field_[0].get_pixel_size(pixelSize);
+
+            size_t din, dout;
+            for ( dout=0; dout<D; dout++ )
+            {
+                for ( din=0; din<D; din++ )
+                {
+                    input_point_type pos_positive(pos);
+                    input_point_type pos_negative(pos);
+
+                    pos_positive[din] += delta;
+                    pos_negative[din] -= delta;
+
+                    T v_positive = (*interp_default_[dout])(pos_positive.begin());
+                    T v_negative = (*interp_default_[dout])(pos_negative.begin());
+
+                    jac(dout, din) = (v_positive-v_negative)*deltaReciprocal;
+
+                    if ( dout != din )
+                    {
+                        // scaled for non-isotropic pixel sizes
+                        jac(dout, din) *= ( pixelSize[dout]/pixelSize[din] );
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationField<ValueType, D>::jacobianPosition(const input_point_type& pos, jacobian_position_type& jac) ... ");
+            return false;
+        }
+
+        return true;
+    }
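+
+    /// Editorial illustration, not part of the imported source: the jacobian above is the central
+    /// difference of each deformation component along each pixel direction, with the off-diagonal
+    /// entries rescaled by the pixel-size ratio so that non-isotropic voxels are handled consistently.
+    /// A minimal 2D sketch; Field is any functor returning the deformation component d(dim, x, y)
+    /// in pixels, and the example_* name is hypothetical.
+    template <typename Field> 
+    inline void example_jacobian_2d(Field d, double x, double y, double sx, double sy, double jac[2][2])
+    {
+        const double h = 0.5;
+        const double r = 1.0/(2.0*h);
+
+        jac[0][0] = ( d(0, x+h, y) - d(0, x-h, y) )*r;
+        jac[1][1] = ( d(1, x, y+h) - d(1, x, y-h) )*r;
+
+        // off-diagonal terms are scaled for non-isotropic pixel sizes sx, sy
+        jac[0][1] = ( d(0, x, y+h) - d(0, x, y-h) )*r*(sx/sy);
+        jac[1][0] = ( d(1, x+h, y) - d(1, x-h, y) )*r*(sy/sx);
+    }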
+
+    template <typename ValueType, unsigned int D> 
+    inline bool hoImageRegDeformationField<ValueType, D>::jacobianPosition(hoNDArray<T>& jac, unsigned int borderWidth)
+    {
+        DeformationFieldType* deform_field[D];
+
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            deform_field[ii] = &deform_field_[ii];
+        }
+
+        return this->jacobianPosition(jac, deform_field, borderWidth);
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline bool hoImageRegDeformationField<ValueType, D>::jacobianPosition(hoNDArray<T>& jac, DeformationFieldType* deform_field[D], unsigned int borderWidth)
+    {
+        try
+        {
+            std::vector<size_t> dim;
+            deform_field[0]->get_dimensions(dim);
+
+            std::vector<size_t> dimJac(D+2, D);
+            memcpy(&dimJac[0]+2, &dim[0], sizeof(size_t)*D);
+
+            jac.create(&dimJac);
+            Gadgetron::clear(&jac);
+
+            std::vector<size_t> offset(D);
+            deform_field[0]->get_offset_factor(offset);
+
+            std::vector<coord_type> pixelSize(D);
+            deform_field[0]->get_pixel_size(pixelSize);
+
+            T delta = 1.0;
+            T deltaReciprocal = T(1.0)/(T(2.0)*delta);
+
+            size_t N = deform_field[0]->get_number_of_elements();
+
+            long long n;
+
+            #pragma omp parallel default(none) private(n) shared(N, jac, dim, offset, pixelSize, borderWidth, deltaReciprocal, deform_field)
+            {
+
+                std::vector<size_t> ind(D);
+
+                hoNDArray<T> jacCurr(D, D);
+
+                #pragma omp for 
+                for ( n=0; n<(long long)N; n++ )
+                {
+                    ind = deform_field[0]->calculate_index( n );
+
+                    bool inRange = true;
+
+                    size_t din, dout;
+
+                    for ( dout=0; dout<D; dout++ )
+                    {
+                        if ( ind[dout]<borderWidth || ind[dout]>=dim[dout]-borderWidth )
+                        {
+                            inRange = false;
+                            break;
+                        }
+                    }
+
+                    if ( inRange )
+                    {
+                        for ( dout=0; dout<D; dout++ )
+                        {
+                            for ( din=0; din<D; din++ )
+                            {
+                                size_t offset_positive = n + offset[din];
+                                size_t offset_negative = n - offset[din];
+
+                                T v_positive = (*deform_field[dout])(offset_positive);
+                                T v_negative = (*deform_field[dout])(offset_negative);
+
+                                jacCurr(dout, din) = (v_positive-v_negative)*deltaReciprocal;
+
+                                if ( dout != din )
+                                {
+                                    // scaled for non-isotropic pixel sizes
+                                    jacCurr(dout, din) *= ( pixelSize[dout]/pixelSize[din] );
+                                }
+                            }
+                        }
+
+                        memcpy(jac.begin()+n*D*D, jacCurr.begin(), sizeof(T)*D*D);
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationField<ValueType, D>::jacobianPosition(hoNDArray<T>& jac, DeformationFieldType* deform_field[D], unsigned int borderWidth) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline bool hoImageRegDeformationField<ValueType, D>::
+    analyzeJacobianAndDeformation(const hoNDArray<T>& jac, T& meanDeform, T& maxDeform, T& meanLogJac, T& maxLogJac, unsigned int borderWidth)
+    {
+        DeformationFieldType* deform_field[D];
+
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            deform_field[ii] = &deform_field_[ii];
+        }
+
+        return this->analyzeJacobianAndDeformation(jac, deform_field, meanDeform, maxDeform, meanLogJac, maxLogJac, borderWidth);
+    }
+
+    template <typename ValueType, unsigned int D> 
+    bool hoImageRegDeformationField<ValueType, D>::
+    analyzeJacobianAndDeformation(const hoNDArray<T>& jac, DeformationFieldType* deform_field[D], T& meanDeform, T& maxDeform, T& meanLogJac, T& maxLogJac, unsigned int borderWidth)
+    {
+        try
+        {
+            std::vector<size_t> dim;
+            deform_field[0]->get_dimensions(dim);
+
+            std::vector<coord_type> pixelSize(D);
+            deform_field[0]->get_pixel_size(pixelSize);
+
+            size_t N = deform_field[0]->get_number_of_elements();
+
+            meanDeform = 0;
+            maxDeform = -1;
+            meanLogJac = 0;
+            maxLogJac = -1;
+
+            hoNDArray<T> deformNorm(dim);
+            Gadgetron::clear(deformNorm);
+
+            hoNDArray<T> logJac(dim);
+            Gadgetron::clear(logJac);
+
+            long long n;
+            #pragma omp parallel default(none) private(n) shared(N, borderWidth, jac, deformNorm, logJac, dim, pixelSize, deform_field)
+            {
+                std::vector<size_t> ind(D);
+                hoMatrix<T> jacCurr(D, D);
+                unsigned int ii;
+
+                #pragma omp for 
+                for ( n=0; n<(long long)N; n++ )
+                {
+                    ind = deform_field[0]->calculate_index( n );
+
+                    bool inRange = true;
+
+                    size_t dout;
+
+                    for ( dout=0; dout<D; dout++ )
+                    {
+                        if ( ind[dout]<borderWidth || ind[dout]>=dim[dout]-borderWidth )
+                        {
+                            inRange = false;
+                            break;
+                        }
+                    }
+
+                    if ( inRange )
+                    {
+                        memcpy(jacCurr.begin(), jac.begin()+n*D*D, sizeof(T)*D*D);
+
+                        T deformMag(0), v, det;
+
+                        for ( ii=0; ii<D; ii++ )
+                        {
+                            jacCurr(ii, ii) += 1.0;
+
+                            v = (*deform_field[ii])(n)*pixelSize[ii];
+                            deformMag += v*v;
+                        }
+
+                        deformNorm(n) = std::sqrt(deformMag);
+
+                        if ( D == 2 )
+                        {
+                            det = jacCurr(0, 0)*jacCurr(1, 1) - jacCurr(0, 1)*jacCurr(1, 0);
+                        }
+                        else if ( D == 3 )
+                        {
+                            det = jacCurr(0, 0)*jacCurr(1, 1)*jacCurr(2, 2) 
+                                + jacCurr(0, 1)*jacCurr(1, 2)*jacCurr(2, 0)
+                                + jacCurr(0, 2)*jacCurr(2, 1)*jacCurr(1, 0)
+                                - jacCurr(0, 2)*jacCurr(1, 1)*jacCurr(2, 0) 
+                                - jacCurr(0, 1)*jacCurr(1, 0)*jacCurr(2, 2) 
+                                - jacCurr(0, 0)*jacCurr(2, 1)*jacCurr(1, 2);
+                        }
+
+                        if ( std::abs(det) < FLT_EPSILON ) det = FLT_EPSILON;
+                        logJac(n) = std::log(det);
+                    }
+                }
+            }
+
+            size_t ind;
+            Gadgetron::maxAbsolute(deformNorm, maxDeform, ind);
+            Gadgetron::maxAbsolute(logJac, maxLogJac, ind);
+
+            Gadgetron::norm1(deformNorm, meanDeform);
+            meanDeform /= N;
+
+            Gadgetron::norm1(logJac, meanLogJac);
+            meanLogJac /= N;
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in analyzeJacobianAndDeformation(const hoNDArray<T>& jac, DeformationFieldType* deform_field[D], T& meanDeform, T& maxDeform, T& meanLogJac, T& maxLogJac, unsigned int borderWidth) ... ");
+            return false;
+        }
+
+        return true;
+    }
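+
+    /// Editorial illustration, not part of the imported source: the analysis above adds the identity
+    /// to the deformation jacobian and evaluates the determinant of the local transformation; its
+    /// logarithm is 0 for a locally volume-preserving deformation and grows in magnitude with local
+    /// expansion or compression. A minimal 2D sketch; the example_* name is hypothetical.
+    inline double example_log_jacobian_2d(const double J[2][2])
+    {
+        const double det = (1.0 + J[0][0])*(1.0 + J[1][1]) - J[0][1]*J[1][0];
+
+        double a = std::abs(det);
+        if ( a < FLT_EPSILON ) a = FLT_EPSILON;
+
+        return std::log(a); // log of the absolute determinant
+    }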
+
+    template <typename ValueType, unsigned int D> 
+    inline ValueType& hoImageRegDeformationField<ValueType, D>::operator()( size_t idx[D], size_t outDim )
+    {
+        GADGET_DEBUG_CHECK_THROW(outDim<D);
+        return this->deform_field_[outDim](idx);
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline const ValueType& hoImageRegDeformationField<ValueType, D>::operator()( size_t idx[D], size_t outDim ) const 
+    {
+        GADGET_DEBUG_CHECK_THROW(outDim<D);
+        return this->deform_field_[outDim](idx);
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline void hoImageRegDeformationField<ValueType, D>::get(size_t idx[D], T deform[D])
+    {
+        size_t offset = this->deform_field_[0].calculate_offset(idx);
+
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            deform[ii] = this->deform_field_[ii](offset);
+        }
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline void hoImageRegDeformationField<ValueType, D>::get(size_t x, size_t y, T& dx, T& dy)
+    {
+        size_t offset = this->deform_field_[0].calculate_offset(x, y);
+        dx = this->deform_field_[0](offset);
+        dy = this->deform_field_[1](offset);
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline void hoImageRegDeformationField<ValueType, D>::get(size_t x, size_t y, size_t z, T& dx, T& dy, T& dz)
+    {
+        size_t offset = this->deform_field_[0].calculate_offset(x, y, z);
+        dx = this->deform_field_[0](offset);
+        dy = this->deform_field_[1](offset);
+        dz = this->deform_field_[2](offset);
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline void hoImageRegDeformationField<ValueType, D>::set(size_t idx[D], T deform[D])
+    {
+        size_t offset = this->deform_field_[0].calculate_offset(idx);
+
+        unsigned int ii;
+        for ( ii=0; ii<D; ii++ )
+        {
+            this->deform_field_[ii](offset) = deform[ii];
+        }
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline void hoImageRegDeformationField<ValueType, D>::set(size_t x, size_t y, T dx, T dy)
+    {
+        size_t offset = this->deform_field_[0].calculate_offset(x, y);
+        this->deform_field_[0](offset) = dx;
+        this->deform_field_[1](offset) = dy;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline void hoImageRegDeformationField<ValueType, D>::set(size_t x, size_t y, size_t z, T dx, T dy, T dz)
+    {
+        size_t offset = this->deform_field_[0].calculate_offset(x, y, z);
+        this->deform_field_[0](offset) = dx;
+        this->deform_field_[1](offset) = dy;
+        this->deform_field_[2](offset) = dz;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline ValueType hoImageRegDeformationField<ValueType, D>::operator()( coord_type pos[D], size_t outDim )
+    {
+        GADGET_DEBUG_CHECK_THROW(outDim<D);
+        return (*interp_default_[outDim])(pos);
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline void hoImageRegDeformationField<ValueType, D>::get(coord_type pos[D], T deform[D])
+    {
+        unsigned int ii;
+        for (ii=0; ii<D; ii++ )
+        {
+            deform[ii] = (*interp_default_[ii])(pos);
+        }
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline void hoImageRegDeformationField<ValueType, D>::get(coord_type px, coord_type py, T& dx, T& dy)
+    {
+        dx = (*interp_default_[0])(px, py);
+        dy = (*interp_default_[1])(px, py);
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline void hoImageRegDeformationField<ValueType, D>::get(coord_type px, coord_type py, coord_type pz, T& dx, T& dy, T& dz)
+    {
+        dx = (*interp_default_[0])(px, py, pz);
+        dy = (*interp_default_[1])(px, py, pz);
+        dz = (*interp_default_[2])(px, py, pz);
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline void hoImageRegDeformationField<ValueType, D>::getDeformationField(DeformationFieldType*& deform, size_t outDim)
+    {
+        GADGET_DEBUG_CHECK_THROW(outDim<D);
+        deform = &(this->deform_field_[outDim]);
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline void hoImageRegDeformationField<ValueType, D>::setDeformationField(const DeformationFieldType& deform, size_t outDim)
+    {
+        GADGET_DEBUG_CHECK_THROW(outDim<D);
+        this->deform_field_[outDim] = deform;
+        this->update();
+    }
+
+    template <typename ValueType, unsigned int D> 
+    bool hoImageRegDeformationField<ValueType, D>::serialize(char*& buf, size_t& len) const 
+    {
+        try
+        {
+            if ( buf != NULL ) delete[] buf;
+
+            char* bufInternal[D];
+            size_t lenInternal[D];
+
+            // serialize every dimension
+
+            size_t totalLen = 0;
+
+            unsigned int ii;
+            for ( ii=0; ii<D; ii++ )
+            {
+                GADGET_CHECK_RETURN_FALSE(this->deform_field_[ii].serialize(bufInternal[ii], lenInternal[ii]));
+                totalLen += lenInternal[ii];
+            }
+
+            // total length : number of dimensions + the serialized deformation field of every dimension (each blob already contains its dimensions, pixel size, origin, axis and contents)
+            len = sizeof(unsigned int) + totalLen;
+
+            buf = new char[len];
+            GADGET_CHECK_RETURN_FALSE(buf!=NULL);
+
+            unsigned int NDim=D;
+
+            size_t offset = 0;
+            memcpy(buf, &NDim, sizeof(unsigned int));
+            offset += sizeof(unsigned int);
+
+            if ( NDim > 0 )
+            {
+                for ( ii=0; ii<D; ii++ )
+                {
+                    memcpy(buf+offset, bufInternal[ii], lenInternal[ii]);
+                    offset += lenInternal[ii];
+                }
+
+                for ( ii=0; ii<D; ii++ )
+                {
+                    delete [] bufInternal[ii];
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationField<ValueType, D>::serialize(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
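+
+    /// Editorial illustration, not part of the imported source: the serialized buffer above is simply
+    /// the number of dimensions followed by the per-component blobs produced by the deformation field
+    /// images, back to back. A minimal sketch of that layout for two already-serialized blobs; the
+    /// example_* name is hypothetical and the caller owns the returned buffer.
+    inline char* example_concatenate_blobs(const char* blob0, size_t len0, const char* blob1, size_t len1, size_t& len)
+    {
+        const unsigned int ndim = 2;
+        len = sizeof(unsigned int) + len0 + len1;
+
+        char* buf = new char[len];
+
+        size_t offset = 0;
+        memcpy(buf+offset, &ndim, sizeof(unsigned int)); offset += sizeof(unsigned int);
+        memcpy(buf+offset, blob0, len0);                 offset += len0;
+        memcpy(buf+offset, blob1, len1);
+
+        return buf;
+    }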
+
+    template <typename ValueType, unsigned int D> 
+    bool hoImageRegDeformationField<ValueType, D>::deserialize(char* buf, size_t& len)
+    {
+        try
+        {
+            unsigned int NDim;
+            memcpy(&NDim, buf, sizeof(unsigned int));
+            if ( NDim != D )
+            {
+                GERROR_STREAM("hoImageRegDeformationField<ValueType, D>::deserialize(...) : number of image dimensions does not match ... ");
+                return false;
+            }
+
+            size_t offset = sizeof(unsigned int);
+
+            unsigned int ii;
+
+            if ( NDim > 0 )
+            {
+                for ( ii=0; ii<D; ii++ )
+                {
+                    size_t lenInternal;
+                    GADGET_CHECK_RETURN_FALSE(this->deform_field_[ii].deserialize(buf+offset, lenInternal));
+                    offset += lenInternal;
+                }
+            }
+
+            len = offset;
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegDeformationField<ValueType, D>::deserialize(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    void hoImageRegDeformationField<ValueType, D>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "--------------Gagdgetron deformation field geometry transformation -------------" << endl;
+        os << "Deformation field dimension is : " << D << endl;
+
+        std::string elemTypeName = std::string(typeid(T).name());
+        os << "Transformation data type is : " << elemTypeName << std::endl;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    std::string hoImageRegDeformationField<ValueType, D>::transformationName() const
+    {
+        return std::string("hoImageRegDeformationField"); 
+    }
+}
diff --git a/toolboxes/registration/optical_flow/cpu/transformation/hoImageRegHomogenousTransformation.h b/toolboxes/registration/optical_flow/cpu/transformation/hoImageRegHomogenousTransformation.h
new file mode 100644
index 0000000..ec96814
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/transformation/hoImageRegHomogenousTransformation.h
@@ -0,0 +1,475 @@
+/** \file   hoImageRegHomogenousTransformation.h
+    \brief  Define the class for the homogenous geometry transformation in gadgetron registration
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoImageRegParametricTransformation.h"
+#include "hoMatrix.h"
+
+namespace Gadgetron
+{
+    /// Homogenous transformation
+    template<typename ValueType, unsigned int D> 
+    class hoImageRegHomogenousTransformation : public hoImageRegParametricTransformation<ValueType, D, D>
+    {
+    public:
+
+        typedef hoImageRegParametricTransformation<ValueType, D, D> BaseClass;
+        typedef hoImageRegHomogenousTransformation<ValueType, D> Self;
+
+        typedef ValueType T;
+
+        typedef typename BaseClass::input_point_type input_point_type;
+        typedef typename BaseClass::output_point_type output_point_type;
+
+        typedef typename BaseClass::jacobian_parameter_type jacobian_parameter_type;
+        typedef typename BaseClass::jacobian_position_type jacobian_position_type;
+
+        typedef typename BaseClass::ParaStatus ParaStatus;
+        typedef typename BaseClass::ParaStatusType ParaStatusType;
+
+        hoImageRegHomogenousTransformation();
+        virtual ~hoImageRegHomogenousTransformation();
+
+        // get/set the ith parameter
+        virtual ValueType get_parameter(size_t i) const;
+        virtual void set_parameter(size_t i, ValueType v);
+
+        virtual bool invertTransformation();
+
+        virtual bool setIdentity();
+
+        virtual bool transform(const T* pt_in, T* pt_out) const;
+
+        virtual bool transform(const T& xi, const T& yi, T& xo, T& yo) const;
+
+        virtual bool transform(const T& xi, const T& yi, const T& zi, T& xo, T& yo, T& zo) const;
+
+        virtual bool transform(const size_t* pt_in, T* pt_out) const;
+        virtual bool transform(const size_t* pt_in, size_t N, T* pt_out) const;
+        virtual bool transform(const size_t& xi, const size_t& yi, T& xo, T& yo) const;
+        virtual bool transform(const size_t* xi, const size_t* yi, size_t N, T* xo, T* yo) const;
+        virtual bool transform(const size_t& xi, const size_t& yi, const size_t& zi, T& xo, T& yo, T& zo) const;
+        virtual bool transform(const size_t* xi, const size_t* yi, const size_t* zi, size_t N, T* xo, T* yo, T* zo) const;
+
+        /// compute jacobian matrix to parameters
+        /// D*num_parameters_ matrix
+        virtual bool jacobianParameter(const input_point_type& pos, jacobian_parameter_type& jac);
+
+        /// compute jacobian matrix to spatial position
+        /// D*D matrix
+        virtual bool jacobianPosition(const input_point_type& pos, jacobian_position_type& jac);
+
+        virtual void print(std::ostream& os) const;
+        virtual void printTransform(std::ostream& os) const;
+
+        virtual std::string transformationName() const
+        {
+            return std::string("hoImageRegHomogenousTransformation"); 
+        }
+
+        using BaseClass::gt_timer1_;
+        using BaseClass::gt_timer2_;
+        using BaseClass::gt_timer3_;
+        using BaseClass::performTiming_;
+        using BaseClass::gt_exporter_;
+        using BaseClass::debugFolder_;
+
+    protected:
+
+        using BaseClass::num_parameters_;
+        using BaseClass::para_status_;
+
+        /// transformation matrix
+        hoMatrix<ValueType> matrix_;
+    };
+
+    template <typename ValueType, unsigned int D> 
+    hoImageRegHomogenousTransformation<ValueType, D>::hoImageRegHomogenousTransformation() : BaseClass()
+    {
+        num_parameters_ = D*(D+1);
+        para_status_.resize(num_parameters_, BaseClass::Active);
+
+        GADGET_CHECK_THROW(matrix_.createMatrix(D+1, D+1));
+        GADGET_CHECK_THROW(matrix_.setIdentity());
+    }
+
+    template <typename ValueType, unsigned int D> 
+    hoImageRegHomogenousTransformation<ValueType, D>::~hoImageRegHomogenousTransformation()
+    {
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline ValueType hoImageRegHomogenousTransformation<ValueType, D>::get_parameter(size_t i) const
+    {
+        GADGET_DEBUG_CHECK_THROW(i<num_parameters_);
+        return matrix_( i/(D+1), i%(D+1) );
+    }
+
+    template <typename ValueType, unsigned int D> 
+    inline void hoImageRegHomogenousTransformation<ValueType, D>::set_parameter(size_t i, ValueType v)
+    {
+        GADGET_DEBUG_CHECK_THROW(i<num_parameters_);
+        matrix_( i/(D+1), i%(D+1) ) = v;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    bool hoImageRegHomogenousTransformation<ValueType, D>::invertTransformation()
+    {
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE( Gadgetron::getri(matrix_) );
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    bool hoImageRegHomogenousTransformation<ValueType, D>::setIdentity()
+    {
+        GADGET_CHECK_RETURN_FALSE( matrix_.setIdentity() );
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    bool hoImageRegHomogenousTransformation<ValueType, D>::transform(const T* pt_in, T* pt_out) const
+    {
+        try
+        {
+            unsigned int ii, jj;
+            for ( ii=0; ii<D; ii++ )
+            {
+                pt_out[ii] = 0;
+                for ( jj=0; jj<D; jj++ )
+                {
+                    pt_out[ii] += matrix_(ii, jj) * pt_in[jj];
+                }
+                pt_out[ii] += matrix_(ii, D);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in hoImageRegHomogenousTransformation<ValueType, D>::transform(const T* pt_in, T* pt_out) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    bool hoImageRegHomogenousTransformation<ValueType, D>::transform(const T& xi, const T& yi, T& xo, T& yo) const
+    {
+        try
+        {
+            xo = matrix_(0, 0)*xi + matrix_(0, 1)*yi + matrix_(0, 2);
+            yo = matrix_(1, 0)*xi + matrix_(1, 1)*yi + matrix_(1, 2);
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in hoImageRegHomogenousTransformation<ValueType, D>::transform(const T& xi, const T& yi, T& xo, T& yo) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    bool hoImageRegHomogenousTransformation<ValueType, D>::transform(const T& xi, const T& yi, const T& zi, T& xo, T& yo, T& zo) const
+    {
+        try
+        {
+            xo = matrix_(0, 0)*xi + matrix_(0, 1)*yi + matrix_(0, 2)*zi + matrix_(0, 3);
+            yo = matrix_(1, 0)*xi + matrix_(1, 1)*yi + matrix_(1, 2)*zi + matrix_(1, 3);
+            zo = matrix_(2, 0)*xi + matrix_(2, 1)*yi + matrix_(2, 2)*zi + matrix_(2, 3);
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in hoImageRegHomogenousTransformation<ValueType, D>::transform(const T& xi, const T& yi, const T& zi, T& xo, T& yo, T& zo) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    bool hoImageRegHomogenousTransformation<ValueType, D>::transform(const size_t* pt_in, T* pt_out) const
+    {
+        try
+        {
+            unsigned int ii, jj;
+            for ( ii=0; ii<D; ii++ )
+            {
+                pt_out[ii] = 0;
+                for ( jj=0; jj<D; jj++ )
+                {
+                    pt_out[ii] += matrix_(ii, jj) * pt_in[jj];
+                }
+                pt_out[ii] += matrix_(ii, D);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in hoImageRegHomogenousTransformation<ValueType, D>::transform(const size_t* pt_in, T* pt_out) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    bool hoImageRegHomogenousTransformation<ValueType, D>::transform(const size_t* pt_in, size_t N, T* pt_out) const
+    {
+        try
+        {
+            long long ii;
+
+            #pragma omp parallel for default(none) private(ii) shared(pt_in, pt_out, N)
+            for ( ii=0; ii<(long long)N; ii++ )
+            {
+                this->transform(pt_in+ii*D, pt_out+ii*D);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happen in hoImageRegHomogenousTransformation<ValueType, D>::transform(const size_t* pt_in, size_t N, T* pt_out) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    bool hoImageRegHomogenousTransformation<ValueType, D>::transform(const size_t& xi, const size_t& yi, T& xo, T& yo) const
+    {
+        try
+        {
+            xo = matrix_(0, 0)*xi + matrix_(0, 1)*yi + matrix_(0, 2);
+            yo = matrix_(1, 0)*xi + matrix_(1, 1)*yi + matrix_(1, 2);
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in hoImageRegHomogenousTransformation<ValueType, D>::transform(const size_t& xi, const size_t& yi, T& xo, T& yo) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    bool hoImageRegHomogenousTransformation<ValueType, D>::transform(const size_t* xi, const size_t* yi, size_t N, T* xo, T* yo) const
+    {
+        try
+        {
+            long long ii;
+
+            #pragma omp parallel for default(none) private(ii) shared(xi, yi, xo, yo, N)
+            for ( ii=0; ii<(long long)N; ii++ )
+            {
+                this->transform(xi[ii], yi[ii], xo[ii], yo[ii]);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happen in hoImageRegHomogenousTransformation<ValueType, D>::transform(const size_t* xi, const size_t* yi, size_t N, T* xo, T* yo) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    bool hoImageRegHomogenousTransformation<ValueType, D>::transform(const size_t& xi, const size_t& yi, const size_t& zi, T& xo, T& yo, T& zo) const
+    {
+        try
+        {
+            xo = matrix_(0, 0)*xi + matrix_(0, 1)*yi + matrix_(0, 2)*zi + matrix_(0, 3);
+            yo = matrix_(1, 0)*xi + matrix_(1, 1)*yi + matrix_(1, 2)*zi + matrix_(1, 3);
+            zo = matrix_(2, 0)*xi + matrix_(2, 1)*yi + matrix_(2, 2)*zi + matrix_(2, 3);
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in hoImageRegHomogenousTransformation<ValueType, D>::transform(const size_t& xi, const size_t& yi, const size_t& zi, T& xo, T& yo, T& zo) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    bool hoImageRegHomogenousTransformation<ValueType, D>::transform(const size_t* xi, const size_t* yi, const size_t* zi, size_t N, T* xo, T* yo, T* zo) const
+    {
+        try
+        {
+            long long ii;
+
+            #pragma omp parallel for default(none) private(ii) shared(xi, yi, zi, xo, yo, zo, N)
+            for ( ii=0; ii<(long long)N; ii++ )
+            {
+                this->transform(xi[ii], yi[ii], zi[ii], xo[ii], yo[ii], zo[ii]);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happen in hoImageRegHomogenousTransformation<ValueType, D>::transform(const size_t* xi, const size_t* yi, const size_t* zi, size_t N, T* xo, T* yo, T* zo) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    bool hoImageRegHomogenousTransformation<ValueType, D>::jacobianParameter(const input_point_type& pos, jacobian_parameter_type& jac)
+    {
+        try
+        {
+            jac.createMatrix(D, num_parameters_);
+            Gadgetron::clear(jac);
+
+            if ( D == 2 )
+            {
+                jac(0, 0) = pos(0);
+                jac(0, 1) = pos(1);
+                jac(0, 2) = 1;
+
+                jac(1, 3) = pos(0);
+                jac(1, 4) = pos(1);
+                jac(1, 5) = 1;
+            }
+            else if ( D == 3 )
+            {
+                jac(0, 0) = pos(0);
+                jac(0, 1) = pos(1);
+                jac(0, 2) = pos(2);
+                jac(0, 3) = 1;
+
+                jac(1, 4) = pos(0);
+                jac(1, 5) = pos(1);
+                jac(1, 6) = pos(2);
+                jac(1, 7) = 1;
+
+                jac(2, 8)  = pos(0);
+                jac(2, 9)  = pos(1);
+                jac(2, 10) = pos(2);
+                jac(2, 11) = 1;
+            }
+            else
+            {
+                unsigned int ii, jj;
+                for ( ii=0; ii<D; ii++ )
+                {
+                    for ( jj=0; jj<D; jj++ )
+                    {
+                        jac(ii, ii*(D+1)+jj) = pos(jj);
+                    }
+
+                    jac(ii, ii*(D+1)+D) = 1;
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happen in hoImageRegHomogenousTransformation<ValueType, D>::jacobianParameter(const input_point_type& pos, jacobian_parameter_type& jac) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    bool hoImageRegHomogenousTransformation<ValueType, D>::jacobianPosition(const input_point_type& pos, jacobian_position_type& jac)
+    {
+        try
+        {
+            jac.createMatrix(D, D);
+            Gadgetron::clear(jac);
+
+            if ( D == 2 )
+            {
+                jac(0, 0) = matrix_(0, 0);
+                jac(0, 1) = matrix_(0, 1);
+                jac(1, 0) = matrix_(1, 0);
+                jac(1, 1) = matrix_(1, 1);
+            }
+            else if ( D == 3 )
+            {
+                jac(0, 0) = matrix_(0, 0);
+                jac(0, 1) = matrix_(0, 1);
+                jac(0, 2) = matrix_(0, 2);
+
+                jac(1, 0) = matrix_(1, 0);
+                jac(1, 1) = matrix_(1, 1);
+                jac(1, 2) = matrix_(1, 2);
+
+                jac(2, 0) = matrix_(2, 0);
+                jac(2, 1) = matrix_(2, 1);
+                jac(2, 2) = matrix_(2, 2);
+            }
+            else
+            {
+                unsigned int ii, jj;
+                for ( ii=0; ii<D; ii++ )
+                {
+                    for ( jj=0; jj<D; jj++ )
+                    {
+                        jac(ii, jj) = matrix_(ii, jj);
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happen in hoImageRegHomogenousTransformation<ValueType, D>::jacobianPosition(const input_point_type& pos, jacobian_position_type& jac) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType, unsigned int D> 
+    void hoImageRegHomogenousTransformation<ValueType, D>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "--------------Gadgetron homogenous transformation -------------" << endl;
+        os << "Input dimension is : " << D << endl;
+        os << "Output dimension is : " << D << endl;
+
+        std::string elemTypeName = std::string(typeid(T).name());
+        os << "Transformation data type is : " << elemTypeName << std::endl;
+        os << "Number of parameters is : " << num_parameters_ << endl;
+
+        size_t i;
+        os << "Status of parameters: " << endl;
+        for ( i=0; i<this->num_parameters_; i++ )
+        {
+            os << "Para " << i << " : \t";
+            if ( para_status_[i] == BaseClass::Active )
+            {
+                os << "Active";
+            }
+            else if ( para_status_[i] == BaseClass::Inactive )
+            {
+                os << "Inactive";
+            }
+            else
+            {
+                os << "Unknown";
+            }
+            os << endl;
+        }
+
+        os << "Transformation: " << endl;
+        this->printTransform(os);
+    }
+
+    template <typename ValueType, unsigned int D> 
+    void hoImageRegHomogenousTransformation<ValueType, D>::printTransform(std::ostream& os) const
+    {
+        using namespace std;
+
+        size_t i;
+        os << "[ ";
+        for ( i=0; i<this->num_parameters_; i++ )
+        {
+            os << this->get_parameter(i) << " \t";
+        }
+        os << " ]" << endl;
+    }
+}
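
A short usage sketch for the homogeneous transformation above (illustrative only; parameter index i maps to matrix entry (i/(D+1), i%(D+1)), so for D = 2 indices 2 and 5 are the x and y translations):

    #include "hoImageRegHomogenousTransformation.h"
    using namespace Gadgetron;

    void homogenous_transform_example()
    {
        hoImageRegHomogenousTransformation<double, 2> tr;   // D = 2 -> D*(D+1) = 6 parameters
        tr.setIdentity();
        tr.set_parameter(2, 10.0);                          // matrix_(0, 2): translation along x
        tr.set_parameter(5, -5.0);                          // matrix_(1, 2): translation along y

        double xo = 0, yo = 0;
        tr.transform(3.0, 4.0, xo, yo);                     // xo = 13, yo = -1

        tr.invertTransformation();                          // in-place inversion via Gadgetron::getri
    }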
diff --git a/toolboxes/registration/optical_flow/cpu/transformation/hoImageRegNonParametricTransformation.h b/toolboxes/registration/optical_flow/cpu/transformation/hoImageRegNonParametricTransformation.h
new file mode 100644
index 0000000..68f36fb
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/transformation/hoImageRegNonParametricTransformation.h
@@ -0,0 +1,82 @@
+/** \file   hoImageRegNonParametricTransformation.h
+    \brief  Define the base class for the non-parametric geometry transformation in gadgetron registration
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoImageRegTransformation.h"
+
+namespace Gadgetron
+{
+    /// non-parametric transformation, e.g. deformation field
+    template<typename ValueType, unsigned int DIn, unsigned int DOut> 
+    class hoImageRegNonParametricTransformation : public hoImageRegTransformation<ValueType, DIn, DOut>
+    {
+    public:
+
+        typedef hoImageRegTransformation<ValueType, DIn, DOut> BaseClass;
+        typedef hoImageRegNonParametricTransformation<ValueType, DIn, DOut> Self;
+
+        typedef ValueType T;
+
+        typedef typename BaseClass::input_point_type input_point_type;
+        typedef typename BaseClass::output_point_type output_point_type;
+
+        typedef typename BaseClass::jacobian_position_type jacobian_position_type;
+
+        hoImageRegNonParametricTransformation() : BaseClass() {}
+        virtual ~hoImageRegNonParametricTransformation() {}
+
+        virtual bool invertTransformation() = 0;
+
+        virtual bool setIdentity() = 0;
+
+        virtual bool transform(const T* pt_in, T* pt_out) const = 0;
+
+        virtual bool transform(const T& xi, const T& yi, T& xo, T& yo) const = 0;
+
+        virtual bool transform(const T& xi, const T& yi, const T& zi, T& xo, T& yo, T& zo) const = 0;
+
+        virtual bool transform(const size_t* pt_in, T* pt_out) const = 0;
+        virtual bool transform(const size_t* pt_in, size_t N, T* pt_out) const = 0;
+        virtual bool transform(const size_t& xi, const size_t& yi, T& xo, T& yo) const = 0;
+        virtual bool transform(const size_t* xi, const size_t* yi, size_t N, T* xo, T* yo) const = 0;
+        virtual bool transform(const size_t& xi, const size_t& yi, const size_t& zi, T& xo, T& yo, T& zo) const = 0;
+        virtual bool transform(const size_t* xi, const size_t* yi, const size_t* zi, size_t N, T* xo, T* yo, T* zo) const = 0;
+
+        /// compute jacobian matrix to spatial position
+        /// DOut*DIn matrix
+        virtual bool jacobianPosition(const input_point_type& /*pos*/, jacobian_position_type& jac)
+        {
+            jac.createMatrix(DOut, DIn);
+            jac.setIdentity();
+            return true;
+        }
+
+        virtual void print(std::ostream& os) const
+        {
+            using namespace std;
+            os << "--------------Gadgetron non-parametric geometry transformation -------------" << endl;
+            os << "Input dimension is : " << DIn << endl;
+            os << "Output dimension is : " << DOut << endl;
+
+            std::string elemTypeName = std::string(typeid(T).name());
+            os << "Transformation data type is : " << elemTypeName << std::endl;
+        }
+
+        virtual std::string transformationName() const
+        {
+            return std::string("hoImageRegNonParametricTransformation"); 
+        }
+
+        using BaseClass::gt_timer1_;
+        using BaseClass::gt_timer2_;
+        using BaseClass::gt_timer3_;
+        using BaseClass::performTiming_;
+        using BaseClass::gt_exporter_;
+        using BaseClass::debugFolder_;
+
+    protected:
+    };
+}
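
The class above is a pure interface; concrete non-parametric transformations (such as the deformation field earlier in this patch) derive from it, and the default jacobianPosition simply returns an identity-initialized DOut x DIn matrix. A hedged shell of such a derivation (class name and bodies are illustrative; the remaining pure-virtual transform overloads must still be implemented before the class can be instantiated):

    #include "hoImageRegNonParametricTransformation.h"

    namespace Gadgetron
    {
        template <typename ValueType>
        class MyWarpTransformation : public hoImageRegNonParametricTransformation<ValueType, 2, 2>
        {
        public:
            typedef hoImageRegNonParametricTransformation<ValueType, 2, 2> BaseClass;

            virtual bool setIdentity() { return true; }            // e.g. reset an internal warp table
            virtual bool invertTransformation() { return false; }  // a generic warp need not be invertible

            // transform(...) overloads omitted; every pure virtual of BaseClass must be provided.
        };
    }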
diff --git a/toolboxes/registration/optical_flow/cpu/transformation/hoImageRegParametricTransformation.h b/toolboxes/registration/optical_flow/cpu/transformation/hoImageRegParametricTransformation.h
new file mode 100644
index 0000000..dbfdeea
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/transformation/hoImageRegParametricTransformation.h
@@ -0,0 +1,227 @@
+/** \file   hoImageRegParametricTransformation.h
+    \brief  Define the base class for the parametric geometry transformation in gadgetron registration
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoImageRegTransformation.h"
+
+namespace Gadgetron
+{
+    /// parametric transformation, e.g. rigid and affine transformation or Free-Form Deformation
+    template<typename ValueType, unsigned int DIn, unsigned int DOut> 
+    class hoImageRegParametricTransformation : public hoImageRegTransformation<ValueType, DIn, DOut>
+    {
+    public:
+
+        typedef hoImageRegTransformation<ValueType, DIn, DOut> BaseClass;
+        typedef hoImageRegParametricTransformation<ValueType, DIn, DOut> Self;
+
+        typedef ValueType T;
+
+        typedef typename BaseClass::input_point_type input_point_type;
+        typedef typename BaseClass::output_point_type output_point_type;
+
+        typedef typename BaseClass::jacobian_parameter_type jacobian_parameter_type;
+        typedef typename BaseClass::jacobian_position_type jacobian_position_type;
+
+        /// every parameter can be active or inactive
+        /// if inactive, this parameter will not be changed during optimization
+        typedef enum { Inactive=0, Active, Unknown } ParaStatus;
+        typedef std::vector<ParaStatus> ParaStatusType;
+
+        hoImageRegParametricTransformation() : BaseClass(), num_parameters_(0) {}
+        virtual ~hoImageRegParametricTransformation() {}
+
+        size_t get_number_of_parameters() const { return num_parameters_; }
+        void set_number_of_parameters(size_t num) { num_parameters_ = num; para_status_.resize(num, Active); }
+
+        // get/set the ith parameter
+        virtual ValueType get_parameter(size_t i) const = 0;
+        virtual void set_parameter(size_t i, ValueType v) = 0;
+
+        ParaStatus get_para_status(size_t i) { GADGET_CHECK_THROW(i<num_parameters_); return this->para_status_[i]; }
+        void set_para_status(size_t i, ParaStatus status) { GADGET_CHECK_THROW(i<num_parameters_); para_status_[i] = status; }
+
+        virtual bool invertTransformation() = 0;
+
+        virtual bool setIdentity() = 0;
+
+        virtual bool transform(const T* pt_in, T* pt_out) const = 0;
+
+        virtual bool transform(const T& xi, const T& yi, T& xo, T& yo) const = 0;
+
+        virtual bool transform(const T& xi, const T& yi, const T& zi, T& xo, T& yo, T& zo) const = 0;
+
+        virtual bool transform(const size_t* pt_in, T* pt_out) const = 0;
+        virtual bool transform(const size_t* pt_in, size_t N, T* pt_out) const = 0;
+        virtual bool transform(const size_t& xi, const size_t& yi, T& xo, T& yo) const = 0;
+        virtual bool transform(const size_t* xi, const size_t* yi, size_t N, T* xo, T* yo) const = 0;
+        virtual bool transform(const size_t& xi, const size_t& yi, const size_t& zi, T& xo, T& yo, T& zo) const = 0;
+        virtual bool transform(const size_t* xi, const size_t* yi, const size_t* zi, size_t N, T* xo, T* yo, T* zo) const = 0;
+
+        /// adjust transformation for the resolution pyramid, if the image coordinate is used
+        /// sourceI2W and targetI2W: source and target image to world transformation matrix
+        virtual bool adjustForResolutionPyramid(const hoMatrix<ValueType>& sourceI2W, const hoMatrix<ValueType>& targetI2W)
+        {
+            /// by default, the transformation is not changed
+            return true;
+        }
+
+        /// compute jacobian matrix to parameters
+        /// DOut*num_parameters_ matrix
+        virtual bool jacobianParameter(const input_point_type& /*pos*/, jacobian_parameter_type& jac)
+        {
+            jac.createMatrix(DOut, num_parameters_);
+            jac.setIdentity();
+            return true;
+        }
+
+        /// compute jacobian matrix to spatial position
+        /// DOut*DIn matrix
+        virtual bool jacobianPosition(const input_point_type& /*pos*/, jacobian_position_type& jac)
+        {
+            jac.createMatrix(DOut, DIn);
+            jac.setIdentity();
+            return true;
+        }
+
+        /// serialize/deserialize the transformation
+        virtual bool serialize(char*& buf, size_t& len) const;
+        virtual bool deserialize(char* buf, size_t& len);
+
+        virtual void print(std::ostream& os) const
+        {
+            using namespace std;
+            os << "--------------Gadgetron parametric geometry transformation -------------" << endl;
+            os << "Input dimension is : " << DIn << endl;
+            os << "Output dimension is : " << DOut << endl;
+
+            std::string elemTypeName = std::string(typeid(T).name());
+            os << "Transformation data type is : " << elemTypeName << std::endl;
+            os << "Number of parameters is : " << num_parameters_ << endl;
+
+            size_t i;
+            os << "Status of parameters: " << endl;
+            for ( i=0; i<this->num_parameters_; i++ )
+            {
+                os << "Para " << i << " : \t";
+                if ( para_status_[i] == Active )
+                {
+                    os << "Active";
+                }
+                else if ( para_status_[i] == Inactive )
+                {
+                    os << "Inactive";
+                }
+                else
+                {
+                    os << "Unknown";
+                }
+                os << endl;
+            }
+        }
+
+        virtual void printTransform(std::ostream& os) const
+        {
+            using namespace std;
+
+            size_t i;
+            size_t maxNum = 12;
+
+            if ( this->num_parameters_< maxNum )
+            {
+                os << "[ ";
+                for ( i=0; i<this->num_parameters_; i++ )
+                {
+                    os << this->get_parameter(i) << " \t";
+                }
+                os << " ]" << endl;
+            }
+            else
+            {
+                os << "[ ";
+                for ( i=0; i<maxNum; i++ )
+                {
+                    os << this->get_parameter(i) << " \t";
+                }
+                os << " ... ]" << endl;
+            }
+        }
+
+        virtual std::string transformationName() const
+        {
+            return std::string("hoImageRegParametricTransformation"); 
+        }
+
+        using BaseClass::gt_timer1_;
+        using BaseClass::gt_timer2_;
+        using BaseClass::gt_timer3_;
+        using BaseClass::performTiming_;
+        using BaseClass::gt_exporter_;
+        using BaseClass::debugFolder_;
+
+    protected:
+
+        size_t num_parameters_;
+
+        ParaStatusType para_status_;
+    };
+
+    template<typename ValueType, unsigned int DIn, unsigned int DOut> 
+    bool hoImageRegParametricTransformation<ValueType, DIn, DOut>::serialize(char*& buf, size_t& len) const 
+    {
+        try
+        {
+            if ( buf != NULL ) delete[] buf;
+
+            size_t numOfPara = this->get_number_of_parameters();
+            size_t totalLen = sizeof(ValueType)*numOfPara;
+            len = totalLen;
+            buf = new char[totalLen];
+            GADGET_CHECK_RETURN_FALSE(buf!=NULL);
+
+            ValueType currPara;
+            size_t ii, offset(0);
+            for ( ii=0; ii<numOfPara; ii++ )
+            {
+                currPara = this->get_parameter(ii);
+                memcpy(buf+offset, &currPara, sizeof(ValueType));
+                offset += sizeof(ValueType);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegParametricTransformation<ValueType, DIn, DOut>::serialize(char*& buf, size_t& len) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, unsigned int DIn, unsigned int DOut> 
+    bool hoImageRegParametricTransformation<ValueType, DIn, DOut>::deserialize(char* buf, size_t& len)
+    {
+        try
+        {
+            size_t numOfPara = this->get_number_of_parameters();
+
+            ValueType currPara;
+            size_t ii, offset(0);
+            for ( ii=0; ii<numOfPara; ii++ )
+            {
+                memcpy(&currPara, buf+offset, sizeof(ValueType));
+                offset += sizeof(ValueType);
+                this->set_parameter(ii, currPara);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegParametricTransformation<ValueType, DIn, DOut>::deserialize(char* buf, size_t& len) ... ");
+            return false;
+        }
+
+        return true;
+    }
+}
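
The serialize/deserialize pair above packs the raw parameter vector, sizeof(ValueType)*num_parameters_ bytes in parameter order with no header. A hedged round-trip sketch using the homogeneous transformation defined earlier in this patch (values are illustrative only):

    #include "hoImageRegHomogenousTransformation.h"
    using namespace Gadgetron;

    void parametric_serialize_roundtrip()
    {
        hoImageRegHomogenousTransformation<double, 2> tr;   // 6 parameters
        tr.setIdentity();
        tr.set_parameter(2, 2.0);                           // x translation

        char* buf = NULL;                                   // must be NULL or heap-allocated: serialize() deletes it first
        size_t len = 0;
        tr.serialize(buf, len);                             // buf now holds 6 doubles in parameter order

        hoImageRegHomogenousTransformation<double, 2> tr2;
        tr2.deserialize(buf, len);                          // parameters are read back via set_parameter(i, ...)
        delete [] buf;
    }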
diff --git a/toolboxes/registration/optical_flow/cpu/transformation/hoImageRegRigid2DTransformation.h b/toolboxes/registration/optical_flow/cpu/transformation/hoImageRegRigid2DTransformation.h
new file mode 100644
index 0000000..348012a
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/transformation/hoImageRegRigid2DTransformation.h
@@ -0,0 +1,380 @@
+/** \file   hoImageRegRigid2DTransformation.h
+    \brief  Define the class for the rigid 2D transformation in gadgetron registration
+            Three parameters are translation along x and y and rotation along z (tx, ty, rz)
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoImageRegHomogenousTransformation.h"
+#include <cmath>
+
+namespace Gadgetron
+{
+    /// Rigid 2D transformation, parameterized by (tx, ty, rz)
+    template<typename ValueType> 
+    class hoImageRegRigid2DTransformation : public hoImageRegHomogenousTransformation<ValueType, 2>
+    {
+    public:
+
+        typedef hoImageRegParametricTransformation<ValueType, 2, 2> ParaTransformBaseClass;
+        typedef hoImageRegHomogenousTransformation<ValueType, 2> BaseClass;
+        typedef hoImageRegRigid2DTransformation<ValueType> Self;
+
+        typedef ValueType T;
+
+        typedef typename BaseClass::input_point_type input_point_type;
+        typedef typename BaseClass::output_point_type output_point_type;
+
+        typedef typename BaseClass::jacobian_parameter_type jacobian_parameter_type;
+        typedef typename BaseClass::jacobian_position_type jacobian_position_type;
+
+        typedef typename BaseClass::ParaStatus ParaStatus;
+        typedef typename BaseClass::ParaStatusType ParaStatusType;
+
+        hoImageRegRigid2DTransformation();
+        virtual ~hoImageRegRigid2DTransformation();
+
+        // get/set the ith parameter
+        virtual ValueType get_parameter(size_t i) const;
+        virtual void set_parameter(size_t i, ValueType v);
+
+        virtual bool invertTransformation();
+
+        virtual bool setIdentity();
+
+        // get/set the translation and rotation
+        ValueType get_tx() const;
+        ValueType get_ty() const;
+        ValueType get_rz() const;
+
+        void set_tx(ValueType tx);
+        void set_ty(ValueType ty);
+        void set_rz(ValueType rz);
+
+        void set_tx_ty(ValueType tx, ValueType ty);
+        void set_tx_ty_rz(ValueType tx, ValueType ty, ValueType rz);
+
+        /// compute the transformation matrix
+        bool updateTransformationMatrix(ValueType tx, ValueType ty, ValueType rz, hoMatrix<T>& matrix);
+        bool extractParametersFromTransformationMatrix(const hoMatrix<T>& matrix, ValueType& tx, ValueType& ty, ValueType& rz);
+
+        virtual bool adjustForResolutionPyramid(const hoMatrix<ValueType>& sourceI2W, const hoMatrix<ValueType>& targetI2W);
+
+        /// compute jacobian matrix to parameters
+        /// D*num_parameters_ matrix
+        virtual bool jacobianParameter(const input_point_type& pos, jacobian_parameter_type& jac);
+
+        virtual void print(std::ostream& os) const;
+        virtual void printTransform(std::ostream& os) const;
+
+        virtual std::string transformationName() const
+        {
+            return std::string("hoImageRegRigid2DTransformation"); 
+        }
+
+        using BaseClass::gt_timer1_;
+        using BaseClass::gt_timer2_;
+        using BaseClass::gt_timer3_;
+        using BaseClass::performTiming_;
+        using BaseClass::gt_exporter_;
+        using BaseClass::debugFolder_;
+
+    protected:
+
+        using BaseClass::num_parameters_;
+        using BaseClass::para_status_;
+        using BaseClass::matrix_;
+
+        /// translation along x and y
+        ValueType tx_;
+        ValueType ty_;
+        /// rotation along z, in degree
+        ValueType rz_;
+    };
+
+    template <typename ValueType> 
+    hoImageRegRigid2DTransformation<ValueType>::hoImageRegRigid2DTransformation() : BaseClass()
+    {
+        num_parameters_ = 3;
+        para_status_.resize(num_parameters_, ParaTransformBaseClass::Active);
+
+        GADGET_CHECK_THROW(matrix_.createMatrix(3, 3));
+        GADGET_CHECK_THROW(matrix_.setIdentity());
+
+        tx_ = 0;
+        ty_ = 0;
+        rz_ = 0;
+    }
+
+    template <typename ValueType> 
+    hoImageRegRigid2DTransformation<ValueType>::~hoImageRegRigid2DTransformation()
+    {
+    }
+
+    template <typename ValueType> 
+    inline ValueType hoImageRegRigid2DTransformation<ValueType>::get_parameter(size_t i) const
+    {
+        GADGET_DEBUG_CHECK_THROW(i<num_parameters_);
+        if ( i == 0 )
+        {
+            return tx_;
+        }
+        else if ( i == 1 )
+        {
+            return ty_;
+        }
+        else
+        {
+            return rz_;
+        }
+    }
+
+    template <typename ValueType> 
+    inline void hoImageRegRigid2DTransformation<ValueType>::set_parameter(size_t i, ValueType v)
+    {
+        GADGET_DEBUG_CHECK_THROW(i<num_parameters_);
+        if ( i == 0 )
+        {
+            tx_ = v;
+        }
+        else if ( i == 1 )
+        {
+            ty_ = v;
+        }
+        else
+        {
+            rz_ = v;
+        }
+
+        GADGET_CHECK_THROW(this->updateTransformationMatrix(tx_, ty_, rz_, matrix_));
+    }
+
+    template <typename ValueType> 
+    inline bool hoImageRegRigid2DTransformation<ValueType>::invertTransformation()
+    {
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE( Gadgetron::getri(matrix_) );
+        GADGET_CHECK_RETURN_FALSE( this->extractParametersFromTransformationMatrix(matrix_, tx_, ty_, rz_) );
+        return true;
+    }
+
+    template <typename ValueType> 
+    inline bool hoImageRegRigid2DTransformation<ValueType>::setIdentity()
+    {
+        GADGET_CHECK_RETURN_FALSE( matrix_.setIdentity() );
+        tx_ = 0;
+        ty_ = 0;
+        rz_ = 0;
+        return true;
+    }
+
+    template <typename ValueType> 
+    inline ValueType hoImageRegRigid2DTransformation<ValueType>::get_tx() const
+    {
+        return tx_;
+    }
+
+    template <typename ValueType> 
+    inline ValueType hoImageRegRigid2DTransformation<ValueType>::get_ty() const
+    {
+        return ty_;
+    }
+
+    template <typename ValueType> 
+    inline ValueType hoImageRegRigid2DTransformation<ValueType>::get_rz() const
+    {
+        return rz_;
+    }
+
+    template <typename ValueType> 
+    inline void hoImageRegRigid2DTransformation<ValueType>::set_tx(ValueType tx)
+    {
+        tx_ = tx;
+        GADGET_CHECK_THROW(this->updateTransformationMatrix(tx_, ty_, rz_, matrix_));
+    }
+
+    template <typename ValueType> 
+    inline void hoImageRegRigid2DTransformation<ValueType>::set_ty(ValueType ty)
+    {
+        ty_ = ty;
+        GADGET_CHECK_THROW(this->updateTransformationMatrix(tx_, ty_, rz_, matrix_));
+    }
+
+    template <typename ValueType> 
+    inline void hoImageRegRigid2DTransformation<ValueType>::set_rz(ValueType rz)
+    {
+        rz_ = rz;
+        GADGET_CHECK_THROW(this->updateTransformationMatrix(tx_, ty_, rz_, matrix_));
+    }
+
+    template <typename ValueType> 
+    inline void hoImageRegRigid2DTransformation<ValueType>::set_tx_ty(ValueType tx, ValueType ty)
+    {
+        tx_ = tx;
+        ty_ = ty;
+        GADGET_CHECK_THROW(this->updateTransformationMatrix(tx_, ty_, rz_, matrix_));
+    }
+
+    template <typename ValueType> 
+    inline void hoImageRegRigid2DTransformation<ValueType>::set_tx_ty_rz(ValueType tx, ValueType ty, ValueType rz)
+    {
+        tx_ = tx;
+        ty_ = ty;
+        rz_ = rz;
+        GADGET_CHECK_THROW(this->updateTransformationMatrix(tx_, ty_, rz_, matrix_));
+    }
+
+    template <typename ValueType> 
+    bool hoImageRegRigid2DTransformation<ValueType>::updateTransformationMatrix(ValueType tx, ValueType ty, ValueType rz, hoMatrix<T>& matrix)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE( matrix.createMatrix(3, 3) );
+
+            ValueType cosrz = std::cos(rz*M_PI/180.0);
+            ValueType sinrz = std::sin(rz*M_PI/180.0);
+
+            matrix(0, 0) = cosrz;  matrix(0, 1) = sinrz; matrix(0, 2) = tx;
+            matrix(1, 0) = -sinrz; matrix(1, 1) = cosrz; matrix(1, 2) = ty;
+            matrix(2, 0) = 0;      matrix(2, 1) = 0;     matrix(2, 2) = 1;
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happen in hoImageRegRigid2DTransformation<ValueType>::updateTransformationMatrix(ValueType tx, ValueType ty, ValueType rz, hoMatrix<T>& matrix) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType> 
+    bool hoImageRegRigid2DTransformation<ValueType>::extractParametersFromTransformationMatrix(const hoMatrix<T>& matrix, ValueType& tx, ValueType& ty, ValueType& rz)
+    {
+        try
+        {
+            double cosrz = matrix(0, 0);
+            double sinrz = matrix(0, 1);
+
+            if ( cosrz >= 0 ) // rz is [-PI/2 PI/2]
+            {
+                rz = std::asin(sinrz);
+            }
+            else
+            {
+                rz = std::acos(cosrz);
+                if ( sinrz < 0) rz *= -1; // [-PI -PI/2]
+            }
+
+            tx = matrix(0, 2);
+            ty = matrix(1, 2);
+            rz *= 180.0/M_PI;
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happen in hoImageRegRigid2DTransformation<ValueType>::extractParametersFromTransformationMatrix(const hoMatrix<T>& matrix, ValueType& tx, ValueType& ty, ValueType& rz) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType> 
+    bool hoImageRegRigid2DTransformation<ValueType>::jacobianParameter(const input_point_type& pos, jacobian_parameter_type& jac)
+    {
+        try
+        {
+            jac.createMatrix(2, num_parameters_);
+            Gadgetron::clear(jac);
+
+            double cosrz = matrix_(0, 0);
+            double sinrz = matrix_(0, 1);
+
+            jac(0, 0) = 1;
+            jac(0, 1) = 0;
+            jac(0, 2) = -sinrz*pos(0) + cosrz*pos(1);
+
+            jac(1, 0) = 0;
+            jac(1, 1) = 1;
+            jac(1, 2) = -cosrz*pos(0) - sinrz*pos(1);
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happen in hoImageRegRigid2DTransformation<ValueType>::jacobianParameter(const input_point_type& pos, jacobian_parameter_type& jac) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType> 
+    bool hoImageRegRigid2DTransformation<ValueType>::adjustForResolutionPyramid(const hoMatrix<ValueType>& sourceI2W, const hoMatrix<ValueType>& targetI2W)
+    {
+        try
+        {
+            hoNDImage<ValueType, 2> source;
+            source.set_image_to_world_matrix(sourceI2W);
+
+            hoNDImage<ValueType, 2> target;
+            target.set_image_to_world_matrix(targetI2W);
+
+            tx_ *= source.get_pixel_size(0)/target.get_pixel_size(0);
+            ty_ *= source.get_pixel_size(1)/target.get_pixel_size(1);
+
+            GADGET_CHECK_THROW(this->updateTransformationMatrix(tx_, ty_, rz_, matrix_));
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in hoImageRegRigid2DTransformation<ValueType>::adjustForResolutionPyramid(const hoMatrix<ValueType>& sourceI2W, const hoMatrix<ValueType>& targetI2W) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType> 
+    void hoImageRegRigid2DTransformation<ValueType>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "--------------Gadgetron rigid 2D transformation -------------" << endl;
+        std::string elemTypeName = std::string(typeid(T).name());
+        os << "Transformation data type is : " << elemTypeName << std::endl;
+        os << "Number of parameters is : " << num_parameters_ << endl;
+
+        size_t i;
+        os << "Status of parameters [tx ty rz] : " << endl;
+        for ( i=0; i<this->num_parameters_; i++ )
+        {
+            os << "Para " << i << " : \t";
+            if ( para_status_[i] == ParaTransformBaseClass::Active )
+            {
+                os << "Active";
+            }
+            else if ( para_status_[i] == ParaTransformBaseClass::Inactive )
+            {
+                os << "Inactive";
+            }
+            else
+            {
+                os << "Unknown";
+            }
+            os << endl;
+        }
+
+        os << "Transformation: " << endl;
+        this->printTransform(os);
+    }
+
+    template <typename ValueType> 
+    void hoImageRegRigid2DTransformation<ValueType>::printTransform(std::ostream& os) const
+    {
+        using namespace std;
+
+        size_t i;
+        os << "[tx ty rz] = [ ";
+        for ( i=0; i<this->num_parameters_; i++ )
+        {
+            os << this->get_parameter(i) << " \t";
+        }
+        os << " ]" << endl;
+    }
+}
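
A brief sketch of the rigid 2D parameterization above (angles are in degrees; with the convention matrix(0,1) = sin and matrix(1,0) = -sin used in updateTransformationMatrix, a +90 degree rz maps (1, 0) to roughly (0, -1)); the values are illustrative only:

    #include "hoImageRegRigid2DTransformation.h"
    using namespace Gadgetron;

    void rigid2d_example()
    {
        hoImageRegRigid2DTransformation<double> tr;
        tr.set_tx_ty_rz(0.0, 0.0, 90.0);    // every setter rebuilds the 3x3 matrix

        double xo = 0, yo = 0;
        tr.transform(1.0, 0.0, xo, yo);     // xo ~= 0, yo ~= -1

        tr.invertTransformation();          // inverts the matrix and re-extracts tx_, ty_, rz_ (in degrees)
    }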
diff --git a/toolboxes/registration/optical_flow/cpu/transformation/hoImageRegRigid3DTransformation.h b/toolboxes/registration/optical_flow/cpu/transformation/hoImageRegRigid3DTransformation.h
new file mode 100644
index 0000000..bd9b793
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/transformation/hoImageRegRigid3DTransformation.h
@@ -0,0 +1,491 @@
+/** \file   hoImageRegRigid3DTransformation.h
+    \brief  Define the class for the rigid 3D transformation in gadgetron registration
+            Six parameters are translation along x, y and z and rotation along x, y and z (tx, ty, tz, rx, ry, rz)
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoImageRegHomogenousTransformation.h"
+#include <cmath>
+
+namespace Gadgetron
+{
+    /// Rigid 3D transformation, parameterized by (tx, ty, tz, rx, ry, rz)
+    template<typename ValueType> 
+    class hoImageRegRigid3DTransformation : public hoImageRegHomogenousTransformation<ValueType, 3>
+    {
+    public:
+
+        typedef hoImageRegParametricTransformation<ValueType, 3, 3> ParaTransformBaseClass;
+        typedef hoImageRegHomogenousTransformation<ValueType, 3> BaseClass;
+        typedef hoImageRegRigid3DTransformation<ValueType> Self;
+
+        typedef ValueType T;
+
+        typedef typename BaseClass::input_point_type input_point_type;
+        typedef typename BaseClass::output_point_type output_point_type;
+
+        typedef typename BaseClass::jacobian_parameter_type jacobian_parameter_type;
+        typedef typename BaseClass::jacobian_position_type jacobian_position_type;
+
+        typedef typename BaseClass::ParaStatus ParaStatus;
+        typedef typename BaseClass::ParaStatusType ParaStatusType;
+
+        hoImageRegRigid3DTransformation();
+        virtual ~hoImageRegRigid3DTransformation();
+
+        // get/set the ith parameter
+        virtual ValueType get_parameter(size_t i) const;
+        virtual void set_parameter(size_t i, ValueType v);
+
+        virtual bool invertTransformation();
+
+        virtual bool setIdentity();
+
+        // get/set the translation and rotation
+        ValueType get_tx() const;
+        ValueType get_ty() const;
+        ValueType get_tz() const;
+        ValueType get_rx() const;
+        ValueType get_ry() const;
+        ValueType get_rz() const;
+
+        void set_tx(ValueType tx);
+        void set_ty(ValueType ty);
+        void set_tz(ValueType tz);
+        void set_rx(ValueType rx);
+        void set_ry(ValueType ry);
+        void set_rz(ValueType rz);
+
+        void set_tx_ty_tz(ValueType tx, ValueType ty, ValueType tz);
+        void set_rx_ry_rz(ValueType rx, ValueType ry, ValueType rz);
+
+        /// compute the transformation matrix
+        bool updateTransformationMatrix(ValueType tx, ValueType ty, ValueType tz, ValueType rx, ValueType ry, ValueType rz, hoMatrix<T>& matrix);
+        bool extractParametersFromTransformationMatrix(const hoMatrix<T>& matrix, ValueType& tx, ValueType& ty, ValueType& tz, ValueType& rx, ValueType& ry, ValueType& rz);
+
+        virtual bool adjustForResolutionPyramid(const hoMatrix<ValueType>& sourceI2W, const hoMatrix<ValueType>& targetI2W);
+
+        /// compute jacobian matrix to parameters
+        /// D*num_parameters_ matrix
+        virtual bool jacobianParameter(const input_point_type& pos, jacobian_parameter_type& jac);
+
+        virtual void print(std::ostream& os) const;
+        virtual void printTransform(std::ostream& os) const;
+
+        virtual std::string transformationName() const
+        {
+            return std::string("hoImageRegRigid3DTransformation"); 
+        }
+
+        using BaseClass::gt_timer1_;
+        using BaseClass::gt_timer2_;
+        using BaseClass::gt_timer3_;
+        using BaseClass::performTiming_;
+        using BaseClass::gt_exporter_;
+        using BaseClass::debugFolder_;
+
+    protected:
+
+        using BaseClass::num_parameters_;
+        using BaseClass::para_status_;
+        using BaseClass::matrix_;
+
+        /// translation along x, y and z
+        ValueType tx_;
+        ValueType ty_;
+        ValueType tz_;
+        /// rotation along x, y, and z, in degree
+        ValueType rx_;
+        ValueType ry_;
+        ValueType rz_;
+    };
+
+    template <typename ValueType> 
+    hoImageRegRigid3DTransformation<ValueType>::hoImageRegRigid3DTransformation() : BaseClass()
+    {
+        num_parameters_ = 6;
+        para_status_.resize(num_parameters_, BaseClass::Active);
+
+        GADGET_CHECK_THROW(matrix_.createMatrix(4, 4));
+        GADGET_CHECK_THROW(matrix_.setIdentity());
+
+        tx_ = 0;
+        ty_ = 0;
+        tz_ = 0;
+        rx_ = 0;
+        ry_ = 0;
+        rz_ = 0;
+    }
+
+    template <typename ValueType> 
+    hoImageRegRigid3DTransformation<ValueType>::~hoImageRegRigid3DTransformation()
+    {
+    }
+
+    template <typename ValueType> 
+    inline ValueType hoImageRegRigid3DTransformation<ValueType>::get_parameter(size_t i) const
+    {
+        GADGET_DEBUG_CHECK_THROW(i<num_parameters_);
+        if ( i == 0 )
+        {
+            return tx_;
+        }
+        else if ( i == 1 )
+        {
+            return ty_;
+        }
+        else if ( i == 2 )
+        {
+            return tz_;
+        }
+        else if ( i == 3 )
+        {
+            return rx_;
+        }
+        else if ( i == 4 )
+        {
+            return ry_;
+        }
+        else if ( i == 5 )
+        {
+            return rz_;
+        }
+
+        return 0;
+    }
+
+    template <typename ValueType> 
+    inline void hoImageRegRigid3DTransformation<ValueType>::set_parameter(size_t i, ValueType v)
+    {
+        GADGET_DEBUG_CHECK_THROW(i<num_parameters_);
+        if ( i == 0 )
+        {
+            tx_ = v;
+        }
+        else if ( i == 1 )
+        {
+            ty_ = v;
+        }
+        else if ( i == 2 )
+        {
+            tz_ = v;
+        }
+        else if ( i == 3 )
+        {
+            rx_ = v;
+        }
+        else if ( i == 4 )
+        {
+            ry_ = v;
+        }
+        else if ( i == 5 )
+        {
+            rz_ = v;
+        }
+
+        GADGET_CHECK_THROW(this->updateTransformationMatrix(tx_, ty_, tz_, rx_, ry_, rz_, matrix_));
+    }
+
+    template <typename ValueType> 
+    inline bool hoImageRegRigid3DTransformation<ValueType>::invertTransformation()
+    {
+        GADGET_CHECK_EXCEPTION_RETURN_FALSE( Gadgetron::getri(matrix_) );
+        GADGET_CHECK_RETURN_FALSE( this->extractParametersFromTransformationMatrix(matrix_, tx_, ty_, tz_, rx_, ry_, rz_) );
+        return true;
+    }
+
+    template <typename ValueType> 
+    inline bool hoImageRegRigid3DTransformation<ValueType>::setIdentity()
+    {
+        GADGET_CHECK_RETURN_FALSE( matrix_.setIdentity() );
+        tx_ = 0;
+        ty_ = 0;
+        tz_ = 0;
+        rx_ = 0;
+        ry_ = 0;
+        rz_ = 0;
+        return true;
+    }
+
+    template <typename ValueType> 
+    inline ValueType hoImageRegRigid3DTransformation<ValueType>::get_tx() const
+    {
+        return tx_;
+    }
+
+    template <typename ValueType> 
+    inline ValueType hoImageRegRigid3DTransformation<ValueType>::get_ty() const
+    {
+        return ty_;
+    }
+
+    template <typename ValueType> 
+    inline ValueType hoImageRegRigid3DTransformation<ValueType>::get_tz() const
+    {
+        return tz_;
+    }
+
+    template <typename ValueType> 
+    inline ValueType hoImageRegRigid3DTransformation<ValueType>::get_rx() const
+    {
+        return rx_;
+    }
+
+    template <typename ValueType> 
+    inline ValueType hoImageRegRigid3DTransformation<ValueType>::get_ry() const
+    {
+        return ry_;
+    }
+
+    template <typename ValueType> 
+    inline ValueType hoImageRegRigid3DTransformation<ValueType>::get_rz() const
+    {
+        return rz_;
+    }
+
+    template <typename ValueType> 
+    inline void hoImageRegRigid3DTransformation<ValueType>::set_tx(ValueType tx)
+    {
+        tx_ = tx;
+        GADGET_CHECK_THROW(this->updateTransformationMatrix(tx_, ty_, tz_, rx_, ry_, rz_, matrix_));
+    }
+
+    template <typename ValueType> 
+    inline void hoImageRegRigid3DTransformation<ValueType>::set_ty(ValueType ty)
+    {
+        ty_ = ty;
+        GADGET_CHECK_THROW(this->updateTransformationMatrix(tx_, ty_, tz_, rx_, ry_, rz_, matrix_));
+    }
+
+    template <typename ValueType> 
+    inline void hoImageRegRigid3DTransformation<ValueType>::set_tz(ValueType tz)
+    {
+        tz_ = tz;
+        GADGET_CHECK_THROW(this->updateTransformationMatrix(tx_, ty_, tz_, rx_, ry_, rz_, matrix_));
+    }
+
+    template <typename ValueType> 
+    inline void hoImageRegRigid3DTransformation<ValueType>::set_rx(ValueType rx)
+    {
+        rx_ = rx;
+        GADGET_CHECK_THROW(this->updateTransformationMatrix(tx_, ty_, tz_, rx_, ry_, rz_, matrix_));
+    }
+
+    template <typename ValueType> 
+    inline void hoImageRegRigid3DTransformation<ValueType>::set_ry(ValueType ry)
+    {
+        ry_ = ry;
+        GADGET_CHECK_THROW(this->updateTransformationMatrix(tx_, ty_, tz_, rx_, ry_, rz_, matrix_));
+    }
+
+    template <typename ValueType> 
+    inline void hoImageRegRigid3DTransformation<ValueType>::set_rz(ValueType rz)
+    {
+        rz_ = rz;
+        GADGET_CHECK_THROW(this->updateTransformationMatrix(tx_, ty_, tz_, rx_, ry_, rz_, matrix_));
+    }
+
+    template <typename ValueType> 
+    inline void hoImageRegRigid3DTransformation<ValueType>::set_tx_ty_tz(ValueType tx, ValueType ty, ValueType tz)
+    {
+        tx_ = tx;
+        ty_ = ty;
+        tz_ = tz;
+        GADGET_CHECK_THROW(this->updateTransformationMatrix(tx_, ty_, tz_, rx_, ry_, rz_, matrix_));
+    }
+
+    template <typename ValueType> 
+    inline void hoImageRegRigid3DTransformation<ValueType>::set_rx_ry_rz(ValueType rx, ValueType ry, ValueType rz)
+    {
+        rx_ = rx;
+        ry_ = ry;
+        rz_ = rz;
+        GADGET_CHECK_THROW(this->updateTransformationMatrix(tx_, ty_, tz_, rx_, ry_, rz_, matrix_));
+    }
+
+    template <typename ValueType> 
+    bool hoImageRegRigid3DTransformation<ValueType>::updateTransformationMatrix(ValueType tx, ValueType ty, ValueType tz, ValueType rx, ValueType ry, ValueType rz, hoMatrix<T>& matrix)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE( matrix.createMatrix(4, 4) );
+
+            double cosrx = std::cos(rx*M_PI/180.0);
+            double sinrx = std::sin(rx*M_PI/180.0);
+
+            double cosry = std::cos(ry*M_PI/180.0);
+            double sinry = std::sin(ry*M_PI/180.0);
+
+            double cosrz = std::cos(rz*M_PI/180.0);
+            double sinrz = std::sin(rz*M_PI/180.0);
+
+            matrix(0, 0) = cosry*cosrz;                         matrix(0, 1) = cosry*sinrz;                             matrix(0, 2) = -sinry;           matrix(0, 3) = tx;
+            matrix(1, 0) = sinrx*sinry*cosrz-cosrx*sinrz;       matrix(1, 1) = sinrx*sinry*sinrz+cosrx*cosrz;           matrix(1, 2) = sinrx*cosry;      matrix(1, 3) = ty;
+            matrix(2, 0) = cosrx*sinry*cosrz+sinrx*sinrz;       matrix(2, 1) = cosrx*sinry*sinrz-sinrx*cosrz;           matrix(2, 2) = cosrx*cosry;      matrix(2, 3) = tz;
+            matrix(3, 0) = 0;                                   matrix(3, 1) = 0;                                       matrix(3, 2) = 0;                matrix(3, 3) = 1;
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happen in hoImageRegRigid3DTransformation<ValueType>::updateTransformationMatrix(ValueType tx, ValueType ty, ValueType tz, ValueType rx, ValueType ry, ValueType rz, hoMatrix<T>& matrix) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType> 
+    bool hoImageRegRigid3DTransformation<ValueType>::extractParametersFromTransformationMatrix(const hoMatrix<T>& matrix, ValueType& tx, ValueType& ty, ValueType& tz, ValueType& rx, ValueType& ry, ValueType& rz)
+    {
+        try
+        {
+            ry = std::asin(-1 * matrix(0, 2));
+
+            if ( std::abs( std::cos(ry) ) > 1e-6 )
+            {
+                rx = std::atan2(matrix(1, 2), matrix(2, 2));
+                rz = std::atan2(matrix(0, 1), matrix(0, 0));
+            }
+            else
+            {
+                rx = std::atan2(-1.0*matrix(0, 2)*matrix(1, 0), -1.0*matrix(0, 2)*matrix(2, 0));
+                rz = 0;
+            }
+
+            tx = matrix(0, 3);
+            ty = matrix(1, 3);
+            tz = matrix(2, 3);
+            rx *= 180.0/M_PI;
+            ry *= 180.0/M_PI;
+            rz *= 180.0/M_PI;
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happen in hoImageRegRigid3DTransformation<ValueType>::extractParametersFromTransformationMatrix(const hoMatrix<T>& matrix, ValueType& tx, ValueType& ty, ValueType& tz, ValueType& rx, ValueType& ry, ValueType& rz) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType> 
+    bool hoImageRegRigid3DTransformation<ValueType>::jacobianParameter(const input_point_type& pos, jacobian_parameter_type& jac)
+    {
+        try
+        {
+            jac.createMatrix(3, num_parameters_);
+            Gadgetron::clear(jac);
+
+            double cosrx = std::cos(rx_*M_PI/180.0);
+            double sinrx = std::sin(rx_*M_PI/180.0);
+
+            double cosry = std::cos(ry_*M_PI/180.0);
+            double sinry = std::sin(ry_*M_PI/180.0);
+
+            double cosrz = std::cos(rz_*M_PI/180.0);
+            double sinrz = std::sin(rz_*M_PI/180.0);
+
+            jac(0, 0) = 1;
+            jac(0, 1) = 0;
+            jac(0, 2) = 0;
+            jac(0, 3) = 0;
+            jac(0, 4) = -sinry*cosrz*pos(0)-sinry*sinrz*pos(1)-cosry*pos(2);
+            jac(0, 5) = -cosry*sinrz*pos(0)+cosry*cosrz*pos(1);
+
+            jac(1, 0) = 0;
+            jac(1, 1) = 1;
+            jac(1, 2) = 0;
+            jac(1, 3) = (cosrx*sinry*cosrz+sinrx*sinrz) *pos(0)             + (cosrx*sinry*sinrz-sinrx*cosrz)   *pos(1)        + cosrx*cosry*pos(2);
+            jac(1, 4) = (sinrx*cosry*cosrz)             *pos(0)             + (sinrx*cosry*sinrz)               *pos(1)        - sinrx*sinry*pos(2);
+            jac(1, 5) = (-sinrx*sinry*sinrz-cosrx*cosrz)*pos(0)             + (sinrx*sinry*cosrz-cosrx*sinrz)   *pos(1);
+
+            jac(2, 0) = 0;
+            jac(2, 1) = 0;
+            jac(2, 2) = 1;
+            jac(2, 3) = (-sinrx*sinry*cosrz+cosrx*sinrz)*pos(0)             + (-sinrx*sinry*sinrz-cosrx*cosrz)  *pos(1)         - sinrx*cosry*pos(2);
+            jac(2, 4) = (cosrx*cosry*cosrz)             *pos(0)             + (cosrx*cosry*sinrz)               *pos(1)         - cosrx*sinry*pos(2);
+            jac(2, 5) = (cosrx*sinry*-sinrz+sinrx*cosrz)*pos(0)             + (cosrx*sinry*cosrz+sinrx*sinrz)   *pos(1);
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happen in hoImageRegRigid3DTransformation<ValueType>::jacobianParameter(const input_point_type& pos, jacobian_parameter_type& jac) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType> 
+    bool hoImageRegRigid3DTransformation<ValueType>::adjustForResolutionPyramid(const hoMatrix<ValueType>& sourceI2W, const hoMatrix<ValueType>& targetI2W)
+    {
+        try
+        {
+            hoNDImage<ValueType, 3> source;
+            source.set_image_to_world_matrix(sourceI2W);
+
+            hoNDImage<ValueType, 3> target;
+            target.set_image_to_world_matrix(targetI2W);
+
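+            // scale the translations by the pixel-size ratio between the two resolution levels; the rotation angles are left unchanged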
+            tx_ *= source.get_pixel_size(0)/target.get_pixel_size(0);
+            ty_ *= source.get_pixel_size(1)/target.get_pixel_size(1);
+            tz_ *= source.get_pixel_size(2)/target.get_pixel_size(2);
+
+            GADGET_CHECK_THROW(this->updateTransformationMatrix(tx_, ty_, tz_, rx_, ry_, rz_, matrix_));
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Error happened in hoImageRegRigid3DTransformation<ValueType>::adjustForResolutionPyramid(const hoMatrix<ValueType>& sourceI2W, const hoMatrix<ValueType>& targetI2W) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <typename ValueType> 
+    void hoImageRegRigid3DTransformation<ValueType>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "--------------Gagdgetron rigid 3D transformation -------------" << endl;
+        std::string elemTypeName = std::string(typeid(T).name());
+        os << "Transformation data type is : " << elemTypeName << std::endl;
+        os << "Number of parameters is : " << num_parameters_ << endl;
+
+        size_t i;
+        os << "Status of parameters [tx ty tz rx ry rz] : " << endl;
+        for ( i=0; i<this->num_parameters_; i++ )
+        {
+            os << "Para " << i << " : \t";
+            if ( para_status_[i] == ParaTransformBaseClass::Active )
+            {
+                os << "Active";
+            }
+            else if ( para_status_[i] == ParaTransformBaseClass::Inactive )
+            {
+                os << "Inactive";
+            }
+            else
+            {
+                os << "Unknown";
+            }
+            os << endl;
+        }
+
+        os << "Transformation: " << endl;
+        this->printTransform(os);
+    }
+
+    template <typename ValueType> 
+    void hoImageRegRigid3DTransformation<ValueType>::printTransform(std::ostream& os) const
+    {
+        using namespace std;
+
+        size_t i;
+        os << "[tx ty tz rx ry rz] = [ ";
+        for ( i=0; i<this->num_parameters_; i++ )
+        {
+            os << this->get_parameter(i) << " \t";
+        }
+        os << " ]" << endl;
+    }
+}
diff --git a/toolboxes/registration/optical_flow/cpu/transformation/hoImageRegTransformation.h b/toolboxes/registration/optical_flow/cpu/transformation/hoImageRegTransformation.h
new file mode 100644
index 0000000..5978643
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/transformation/hoImageRegTransformation.h
@@ -0,0 +1,408 @@
+/** \file   hoImageRegTransformation.h
+    \brief  Define the base class for the geometric transformation in gadgetron registration
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoNDArray.h"
+#include "hoNDImage.h"
+#include "hoMatrix.h"
+#include "hoNDInterpolator.h"
+#include "hoNDBoundaryHandler.h"
+#include "hoMatrix.h"
+#include "hoNDArray_utils.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDImage_util.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "GtPrepUtil.h"
+
+#ifdef USE_OMP
+    #include <omp.h>
+#endif // USE_OMP
+
+namespace Gadgetron
+{
+    enum GT_IMAGE_REG_TRANSFORMATION
+    {
+        GT_IMAGE_REG_TRANSFORMATION_RIGID,
+        GT_IMAGE_REG_TRANSFORMATION_AFFINE,
+        GT_IMAGE_REG_TRANSFORMATION_DEFORMATION_FIELD,
+        GT_IMAGE_REG_TRANSFORMATION_DEFORMATION_FIELD_BIDIRECTIONAL
+    };
+
+    inline std::string getImageRegTransformationName(GT_IMAGE_REG_TRANSFORMATION v)
+    {
+        std::string name;
+
+        switch (v)
+        {
+            case GT_IMAGE_REG_TRANSFORMATION_RIGID:
+                name = "Rigid";
+                break;
+
+            case GT_IMAGE_REG_TRANSFORMATION_AFFINE:
+                name = "Affine";
+                break;
+
+            case GT_IMAGE_REG_TRANSFORMATION_DEFORMATION_FIELD:
+                name = "DeformationField";
+                break;
+
+            case GT_IMAGE_REG_TRANSFORMATION_DEFORMATION_FIELD_BIDIRECTIONAL:
+                name = "DeformationFieldBidirectional";
+                break;
+
+            default:
+                GERROR_STREAM("Unrecognized image registration transformation type : " << v);
+        }
+
+        return name;
+    }
+
+    inline GT_IMAGE_REG_TRANSFORMATION getImageRegTransformationType(const std::string& name)
+    {
+        GT_IMAGE_REG_TRANSFORMATION v;
+
+        if ( name == "Rigid" )
+        {
+            v = GT_IMAGE_REG_TRANSFORMATION_RIGID;
+        }
+        else if ( name == "Affine" )
+        {
+            v = GT_IMAGE_REG_TRANSFORMATION_AFFINE;
+        }
+        else if ( name == "DeformationField" )
+        {
+            v = GT_IMAGE_REG_TRANSFORMATION_DEFORMATION_FIELD;
+        }
+        else if ( name == "DeformationFieldBidirectional" )
+        {
+            v = GT_IMAGE_REG_TRANSFORMATION_DEFORMATION_FIELD_BIDIRECTIONAL;
+        }
+        else
+        {
+            GERROR_STREAM("Unrecognized image registration transformation name : " << name);
+        }
+
+        return v;
+    }
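+
+    /// note: the two helpers above are inverses of each other for the supported names, e.g.
+    /// getImageRegTransformationName(getImageRegTransformationType("Rigid")) == "Rigid"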
+
+    /// transform a spatial position to another spatial position
+    /// input and output can have different dimensions
+    /// the input has DIn dimensions and the output has DOut dimensions
+    /// a transformation is defined as a DOut*1 vector function
+    /// [T1; T2; T3; ...; TDOut] = T( [x1; x2; x3; ...; xDIn], [a1, a2, a3, ..., ak])
+    /// i.e. it maps a DIn-dimensional point to a DOut-dimensional point using k parameters
+    /// therefore, the jacobian matrix to the parameters (Jac_parameter) is a DOut*k matrix
+    /// the jacobian matrix to the spatial position (Jac_position) is a DOut*DIn matrix
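+    /// e.g. for a rigid 3D transformation, DIn = DOut = 3 and k = 6 ([tx ty tz rx ry rz]),
+    /// so Jac_parameter is a 3*6 matrix and Jac_position is a 3*3 matrix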
+    template<typename ValueType, unsigned int DIn, unsigned int DOut> 
+    class hoImageRegTransformation
+    {
+    public:
+
+        typedef hoImageRegTransformation<ValueType, DIn, DOut> Self;
+
+        typedef ValueType T;
+        typedef ValueType element_type;
+        typedef ValueType value_type;
+
+        typedef hoNDPoint<T, DIn> input_point_type;
+        typedef hoNDPoint<T, DOut> output_point_type;
+
+        /// there are two types of jacobian for transformations
+        /// one is the jacobian with respect to the transformation parameters
+        /// Jacobian matrix to parameters: a DOut*k matrix
+        typedef hoMatrix<T> jacobian_parameter_type;
+
+        /// Jacobian matrix to spatial position DOut*DIn matrix
+        typedef hoMatrix<T> jacobian_position_type;
+
+        hoImageRegTransformation() : performTiming_(false) 
+        {
+            gt_timer1_.set_timing_in_destruction(false);
+            gt_timer2_.set_timing_in_destruction(false);
+            gt_timer3_.set_timing_in_destruction(false); 
+        }
+
+        virtual ~hoImageRegTransformation() {}
+
+        /// invert the transformation; after calling this, the transformation is replaced by its inverse
+        virtual bool invertTransformation() = 0;
+
+        /// set the transformation to be the identity transformation
+        virtual bool setIdentity() = 0;
+
+        /// transform a point
+        /// pt_in, pt_out store a point as an array
+        virtual bool transform(const T* pt_in, T* pt_out) const = 0;
+        /// transform a point
+        virtual bool transform( const input_point_type& in, output_point_type& out ) const;
+        /// transform a group of points
+        virtual bool transform( input_point_type* in, size_t N, output_point_type* out ) const;
+        /// hoNDArray stores input and output points
+        /// pt_in: [DIn N]; pt_out: [DOut N]
+        virtual bool transform(const hoNDArray<T>& pt_in, hoNDArray<T>& pt_out) const;
+        /// pt_in, pt_out store the points as an array
+        virtual bool transform(const T* pt_in, size_t N, T* pt_out) const;
+        /// for the case DIn==DOut
+        virtual bool transform(T* pt_inout, size_t N) const;
+
+        /// for 2D - 2D transformation
+        virtual bool transform(const T& xi, const T& yi, T& xo, T& yo) const = 0;
+        virtual bool transform(const T* xi, const T* yi, size_t N, T* xo, T* yo) const;
+        virtual bool transform(T* x_inout, T* y_inout, size_t N) const;
+
+        /// for 3D - 3D transformation
+        virtual bool transform(const T& xi, const T& yi, const T& zi, T& xo, T& yo, T& zo) const = 0;
+        virtual bool transform(const T* xi, const T* yi, const T* zi, size_t N, T* xo, T* yo, T* zo) const;
+        virtual bool transform(T* x_inout, T* y_inout, T* z_inout, size_t N) const;
+
+        /// transform a point
+        /// the point is given as integer image pixel indexes
+        /// the image interpolator is not used
+        /// pt_in, pt_out store a point as an array
+        virtual bool transform(const size_t* pt_in, T* pt_out) const = 0;
+        virtual bool transform(const size_t* pt_in, size_t N, T* pt_out) const = 0;
+
+        /// for 2D - 2D transformation
+        virtual bool transform(const size_t& xi, const size_t& yi, T& xo, T& yo) const = 0;
+        virtual bool transform(const size_t* xi, const size_t* yi, size_t N, T* xo, T* yo) const = 0;
+
+        /// for 3D - 3D transformation
+        virtual bool transform(const size_t& xi, const size_t& yi, const size_t& zi, T& xo, T& yo, T& zo) const = 0;
+        virtual bool transform(const size_t* xi, const size_t* yi, const size_t* zi, size_t N, T* xo, T* yo, T* zo) const = 0;
+
+        /// serialize/deserialize the transformation
+        virtual bool serialize(char*& buf, size_t& len) const = 0;
+        virtual bool deserialize(char* buf, size_t& len) = 0;
+
+        virtual void print(std::ostream& os) const
+        {
+            using namespace std;
+            os << "--------------Gagdgetron geometric transformation -------------" << endl;
+            os << "Input dimension is : " << DIn << endl;
+            os << "Output dimension is : " << DOut << endl;
+
+            std::string elemTypeName = std::string(typeid(T).name());
+            os << "Transformation data type is : " << elemTypeName << std::endl;
+        }
+
+        virtual std::string transformationName() const
+        {
+            return std::string("hoImageRegTransformation"); 
+        }
+
+        // ----------------------------------
+        // debug and timing
+        // ----------------------------------
+        // clock for timing
+        Gadgetron::GadgetronTimer gt_timer1_;
+        Gadgetron::GadgetronTimer gt_timer2_;
+        Gadgetron::GadgetronTimer gt_timer3_;
+
+        bool performTiming_;
+
+        // exporter
+        Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+        // debug folder
+        std::string debugFolder_;
+    };
+
+    template<typename ValueType, unsigned int DIn, unsigned int DOut> 
+    inline bool hoImageRegTransformation<ValueType, DIn, DOut>::
+    transform( const input_point_type& in, output_point_type& out ) const
+    {
+        return this->transform(in.begin(), out.begin());
+    }
+
+    template<typename ValueType, unsigned int DIn, unsigned int DOut> 
+    inline bool hoImageRegTransformation<ValueType, DIn, DOut>::
+    transform( input_point_type* in, size_t N, output_point_type* out ) const
+    {
+        try
+        {
+            long long ii;
+
+            #pragma omp parallel for default(none) private(ii) shared(in, out, N)
+            for ( ii=0; ii<(long long)N; ii++ )
+            {
+                this->transform(in[ii].begin(), out[ii].begin());
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happen in hoImageRegTransformation<ValueType, DIn, DOut>::transform( input_point_type* in, size_t N, output_point_type* out ) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, unsigned int DIn, unsigned int DOut> 
+    inline bool hoImageRegTransformation<ValueType, DIn, DOut>::
+    transform(const hoNDArray<T>& pt_in, hoNDArray<T>& pt_out) const
+    {
+        const T* pIn = pt_in.begin();
+        T* pOut = pt_out.begin();
+        size_t N = pt_in.get_size(1);
+
+        return this->transform(pIn, N, pOut);
+    }
+
+    template<typename ValueType, unsigned int DIn, unsigned int DOut> 
+    inline bool hoImageRegTransformation<ValueType, DIn, DOut>::
+    transform(const T* pt_in, size_t N, T* pt_out) const
+    {
+        try
+        {
+            long long ii;
+
+            #pragma omp parallel for default(none) private(ii) shared(pt_in, N, pt_out)
+            for ( ii=0; ii<(long long)N; ii++ )
+            {
+                this->transform(pt_in+ii*DIn, pt_out+ii*DOut);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happen in hoImageRegTransformation<ValueType, DIn, DOut>::transform(T* pt_in, size_t N, T* pt_out) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, unsigned int DIn, unsigned int DOut> 
+    inline bool hoImageRegTransformation<ValueType, DIn, DOut>::
+    transform(T* pt_inout, size_t N) const
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(DIn>=DOut);
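+            // in-place transform: DIn >= DOut is required so that the DOut transformed components fit back into the input buffer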
+
+            long long ii;
+
+            #pragma omp parallel default(none) private(ii) shared(pt_inout, N)
+            {
+                T pt_out[DOut];
+
+                #pragma omp for 
+                for ( ii=0; ii<(long long)N; ii++ )
+                {
+                    this->transform(pt_inout+ii*DIn, pt_out);
+                    memcpy(pt_inout+ii*DIn, pt_out, sizeof(T)*DOut);
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happen in hoImageRegTransformation<ValueType, DIn, DOut>::transform(T* pt_inout, size_t N) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, unsigned int DIn, unsigned int DOut> 
+    inline bool hoImageRegTransformation<ValueType, DIn, DOut>::
+    transform(const T* xi, const T* yi, size_t N, T* xo, T* yo) const
+    {
+        try
+        {
+            long long ii;
+
+            #pragma omp parallel for default(none) private(ii) shared(xi, yi, xo, yo, N)
+            for ( ii=0; ii<(long long)N; ii++ )
+            {
+                this->transform(xi[ii], yi[ii], xo[ii], yo[ii]);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happen in hoImageRegTransformation<ValueType, DIn, DOut>::transform(T* xi, T* yi, size_t N, T* xo, T* yo) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, unsigned int DIn, unsigned int DOut> 
+    inline bool hoImageRegTransformation<ValueType, DIn, DOut>::
+    transform(T* x_inout, T* y_inout, size_t N) const
+    {
+        try
+        {
+            long long ii;
+
+            T xo, yo;
+
+            #pragma omp parallel for default(none) private(ii, xo, yo) shared(x_inout, y_inout, N)
+            for ( ii=0; ii<(long long)N; ii++ )
+            {
+                this->transform(x_inout[ii], y_inout[ii], xo, yo);
+                x_inout[ii] = xo;
+                y_inout[ii] = yo;
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happen in hoImageRegTransformation<ValueType, DIn, DOut>::transform(T* x_inout, T* y_inout, size_t N) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, unsigned int DIn, unsigned int DOut> 
+    inline bool hoImageRegTransformation<ValueType, DIn, DOut>::
+    transform(const T* xi, const T* yi, const T* zi, size_t N, T* xo, T* yo, T* zo) const
+    {
+        try
+        {
+            long long ii;
+
+            #pragma omp parallel for default(none) private(ii) shared(xi, yi, zi, xo, yo, zo, N)
+            for ( ii=0; ii<(long long)N; ii++ )
+            {
+                this->transform(xi[ii], yi[ii], zi[ii], xo[ii], yo[ii], zo[ii]);
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happen in hoImageRegTransformation<ValueType, DIn, DOut>::transform(T* xi, T* yi, T* zi, size_t N, T* xo, T* yo, T* zo) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, unsigned int DIn, unsigned int DOut> 
+    inline bool hoImageRegTransformation<ValueType, DIn, DOut>::
+    transform(T* x_inout, T* y_inout, T* z_inout, size_t N) const
+    {
+        try
+        {
+            long long ii;
+
+            T xo, yo, zo;
+
+            #pragma omp parallel for default(none) private(ii, xo, yo, zo) shared(x_inout, y_inout, z_inout, N)
+            for ( ii=0; ii<(long long)N; ii++ )
+            {
+                this->transform(x_inout[ii], y_inout[ii], z_inout[ii], xo, yo, zo);
+                x_inout[ii] = xo;
+                y_inout[ii] = yo;
+                z_inout[ii] = zo;
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happen in hoImageRegTransformation<ValueType, DIn, DOut>::transform(T* x_inout, T* y_inout, T* z_inout, size_t N) ... ");
+            return false;
+        }
+
+        return true;
+    }
+}
diff --git a/toolboxes/registration/optical_flow/cpu/warper/hoImageRegWarper.h b/toolboxes/registration/optical_flow/cpu/warper/hoImageRegWarper.h
new file mode 100644
index 0000000..ecbe443
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/warper/hoImageRegWarper.h
@@ -0,0 +1,529 @@
+/** \file   hoImageRegWarper.h
+    \brief  Define the class to perform image warping using the geometric transformation in gadgetron registration
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoNDArray.h"
+#include "hoNDImage.h"
+#include "hoNDInterpolator.h"
+#include "hoNDBoundaryHandler.h"
+#include "hoMatrix.h"
+#include "hoNDArray_utils.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDImage_util.h"
+
+#include "hoImageRegTransformation.h"
+#include "hoImageRegDeformationField.h"
+#include "GtPrepUtil.h"
+
+#ifdef USE_OMP
+    #include <omp.h>
+#endif // USE_OMP
+
+namespace Gadgetron
+{
+    /// warp the source image onto the grid of the target image under a transformation
+    /// both image-domain warping and world-coordinate warping are implemented
+    /// for image-domain warping, points are addressed in image grid coordinates
+    /// input and output can have different dimensions
+    /// the input has DIn dimensions and the output has DOut dimensions
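+    ///
+    /// a minimal usage sketch (assuming the transformation, interpolator and images are created elsewhere):
+    ///     hoImageRegWarper<float, double, 3, 3> warper;
+    ///     warper.setTransformation(transform);
+    ///     warper.setInterpolator(interp);
+    ///     warper.setBackgroundValue(0);
+    ///     warper.warp(target, source, true, warped);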
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    class hoImageRegWarper
+    {
+    public:
+
+        typedef hoImageRegWarper<ValueType, CoordType, DIn, DOut> Self;
+
+        typedef hoNDImage<ValueType, DOut> TargetType;
+        typedef hoNDImage<ValueType, DIn> SourceType;
+
+        typedef hoNDImage<ValueType, 2> Target2DType;
+        typedef Target2DType Source2DType;
+
+        typedef hoNDImage<ValueType, 3> Target3DType;
+        typedef Target3DType Source3DType;
+
+        typedef hoNDInterpolator<SourceType> InterpolatorType;
+
+        typedef hoImageRegTransformation<CoordType, DIn, DOut> TransformationType;
+        typedef hoImageRegDeformationField<CoordType, DIn> DeformTransformationType;
+
+        typedef ValueType T;
+        typedef ValueType element_type;
+        typedef ValueType value_type;
+
+        typedef CoordType coord_type;
+
+        typedef typename TransformationType::input_point_type input_point_type;
+        typedef typename TransformationType::output_point_type output_point_type;
+
+        typedef typename TransformationType::jacobian_parameter_type jacobian_parameter_type;
+        typedef typename TransformationType::jacobian_position_type jacobian_position_type;
+
+        hoImageRegWarper(ValueType bg_values = 0);
+        virtual ~hoImageRegWarper();
+
+        void setTransformation(TransformationType& transform);
+        void setInterpolator(InterpolatorType& interp);
+        void setBackgroundValue(ValueType bg_value);
+
+        virtual bool warp(const TargetType& target, const SourceType& source, bool useWorldCoordinate, TargetType& warped);
+        //virtual bool warp(const Target2DType& target, const Source2DType& source, bool useWorldCoordinate, Target2DType& warped);
+        //virtual bool warp(const Target3DType& target, const Source3DType& source, bool useWorldCoordinate, Target3DType& warped);
+
+        /// warp on the target image grid using the DeformationField transformation
+        /// the DeformationField takes the target pixel indexes and returns the deformation, in world coordinates, to be added to the corresponding world position
+        /// the deformation field grid should match the target image grid
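+        /// i.e. warped(x) = source( world_to_image( image_to_world(x) + deformation(x) ) ), evaluated through the interpolator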
+        virtual bool warpWithDeformationFieldWorldCoordinate(const TargetType& target, const SourceType& source, TargetType& warped);
+
+        virtual void print(std::ostream& os) const;
+
+        // ----------------------------------
+        // debug and timing
+        // ----------------------------------
+        // clock for timing
+        Gadgetron::GadgetronTimer gt_timer1_;
+        Gadgetron::GadgetronTimer gt_timer2_;
+        Gadgetron::GadgetronTimer gt_timer3_;
+
+        bool performTiming_;
+
+        // exporter
+        Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+        // debug folder
+        std::string debugFolder_;
+
+    protected:
+
+        TransformationType* transform_;
+        InterpolatorType* interp_;
+
+        /// background value, used to mark regions in the target image that will not be warped
+        ValueType bg_value_;
+    };
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    hoImageRegWarper<ValueType, CoordType, DIn, DOut>::hoImageRegWarper(ValueType bg_value) : transform_(NULL), interp_(NULL), performTiming_(false), bg_value_(bg_value)
+    {
+        gt_timer1_.set_timing_in_destruction(false);
+        gt_timer2_.set_timing_in_destruction(false);
+        gt_timer3_.set_timing_in_destruction(false);
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    hoImageRegWarper<ValueType, CoordType, DIn, DOut>::~hoImageRegWarper()
+    {
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    inline void hoImageRegWarper<ValueType, CoordType, DIn, DOut>::setTransformation(TransformationType& transform)
+    {
+        transform_ = &transform;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    inline void hoImageRegWarper<ValueType, CoordType, DIn, DOut>::setInterpolator(InterpolatorType& interp)
+    {
+        interp_ = &interp;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    inline void hoImageRegWarper<ValueType, CoordType, DIn, DOut>::setBackgroundValue(ValueType bg_value)
+    {
+        bg_value_ = bg_value;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    bool hoImageRegWarper<ValueType, CoordType, DIn, DOut>::
+    warp(const TargetType& target, const SourceType& source, bool useWorldCoordinate, TargetType& warped)
+    {
+        try
+        {
+            GADGET_DEBUG_CHECK_RETURN_FALSE(transform_!=NULL);
+
+            if ( useWorldCoordinate )
+            {
+                // if the transformation is a deformation field, the specialized warp is called
+                DeformTransformationType* transformDeformField = dynamic_cast<DeformTransformationType*>(transform_);
+                if( transformDeformField != NULL )
+                {
+                    return this->warpWithDeformationFieldWorldCoordinate(target, source, warped);
+                }
+            }
+
+            GADGET_DEBUG_CHECK_RETURN_FALSE(interp_!=NULL);
+            interp_->setArray( const_cast<SourceType&>(source) );
+
+            warped = target;
+
+            if ( DIn==2 && DOut==2 )
+            {
+                size_t sx = target.get_size(0);
+                size_t sy = target.get_size(1);
+
+                long long y;
+
+                if ( useWorldCoordinate )
+                {
+                    // #pragma omp parallel private(y) shared(sx, sy, target, source, warped) num_threads(2)
+                    {
+                        coord_type px, py, px_source, py_source, ix_source, iy_source;
+
+                        // #pragma omp for 
+                        for ( y=0; y<(long long)sy; y++ )
+                        {
+                            for ( size_t x=0; x<sx; x++ )
+                            {
+                                size_t offset = x + y*sx;
+
+                                if ( target( offset ) != bg_value_ )
+                                {
+                                    // target to world
+                                    target.image_to_world(x, size_t(y), px, py);
+
+                                    // transform the point
+                                    transform_->transform(px, py, px_source, py_source);
+
+                                    // world to source
+                                    source.world_to_image(px_source, py_source, ix_source, iy_source);
+
+                                    // interpolate the source
+                                    warped( offset ) = (*interp_)(ix_source, iy_source);
+                                }
+                            }
+                        }
+                    }
+                }
+                else
+                {
+                    // #pragma omp parallel private(y) shared(sx, sy, target, source, warped) num_threads(2)
+                    {
+                        coord_type ix_source, iy_source;
+
+                        // #pragma omp for 
+                        for ( y=0; y<(long long)sy; y++ )
+                        {
+                            for ( size_t x=0; x<sx; x++ )
+                            {
+                                size_t offset = x + y*sx;
+
+                                if ( target( offset ) != bg_value_ )
+                                {
+                                    // transform the point
+                                    transform_->transform(x, size_t(y), ix_source, iy_source);
+
+                                    // interpolate the source
+                                    warped( offset ) = (*interp_)(ix_source, iy_source);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            else if ( DIn==3 && DOut==3 )
+            {
+                size_t sx = target.get_size(0);
+                size_t sy = target.get_size(1);
+                size_t sz = target.get_size(2);
+
+                long long z;
+
+                if ( useWorldCoordinate )
+                {
+                    #pragma omp parallel private(z) shared(sx, sy, sz, target, source, warped)
+                    {
+                        coord_type px, py, pz, px_source, py_source, pz_source, ix_source, iy_source, iz_source;
+
+                        #pragma omp for 
+                        for ( z=0; z<(long long)sz; z++ )
+                        {
+                            for ( size_t y=0; y<sy; y++ )
+                            {
+                                size_t offset = y*sx + z*sx*sy;
+
+                                for ( size_t x=0; x<sx; x++ )
+                                {
+                                    if ( target( x+offset ) != bg_value_ )
+                                    {
+                                        // target to world
+                                        target.image_to_world(x, y, size_t(z), px, py, pz);
+
+                                        // transform the point
+                                        transform_->transform(px, py, pz, px_source, py_source, pz_source);
+
+                                        // world to source
+                                        source.world_to_image(px_source, py_source, pz_source, ix_source, iy_source, iz_source);
+
+                                        // interpolate the source
+                                        warped( x+offset ) = (*interp_)(ix_source, iy_source, iz_source);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                else
+                {
+                    #pragma omp parallel private(z) shared(sx, sy, sz, target, source, warped)
+                    {
+                        coord_type ix_source, iy_source, iz_source;
+
+                        #pragma omp for 
+                        for ( z=0; z<(long long)sz; z++ )
+                        {
+                            for ( size_t y=0; y<sy; y++ )
+                            {
+                                size_t offset = y*sx + z*sx*sy;
+
+                                for ( size_t x=0; x<sx; x++ )
+                                {
+                                    if ( target( x+offset ) != bg_value_ )
+                                    {
+                                        // transform the point
+                                        transform_->transform(x, y, size_t(z), ix_source, iy_source, iz_source);
+
+                                        // interpolate the source
+                                        warped( x+offset ) = (*interp_)(ix_source, iy_source, iz_source);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            else
+            {
+                size_t numOfPixels = target.get_number_of_elements();
+
+                long long n;
+
+                if ( useWorldCoordinate )
+                {
+                    #pragma omp parallel private(n) shared(numOfPixels, target, source, warped)
+                    {
+                        size_t ind_target[DIn];
+                        coord_type pt_target[DIn];
+                        coord_type pt_source[DOut];
+                        coord_type ind_source[DOut];
+
+                        #pragma omp for 
+                        for ( n=0; n<(long long)numOfPixels; n++ )
+                        {
+                            if ( target( size_t(n) ) != bg_value_ )
+                            {
+                                // target to world
+                                target.calculate_index( size_t(n), ind_target );
+
+                                target.image_to_world(ind_target, pt_target);
+
+                                // transform the point
+                                transform_->transform(pt_target, pt_source);
+
+                                // world to source
+                                source.world_to_image(pt_source, ind_source);
+
+                                // interpolate the source
+                                warped( size_t(n) ) = (*interp_)(ind_source);
+                            }
+                        }
+                    }
+                }
+                else
+                {
+                    #pragma omp parallel private(n) shared(numOfPixels, target, source, warped)
+                    {
+                        coord_type pt_target[DIn];
+                        coord_type pt_source[DOut];
+
+                        #pragma omp for 
+                        for ( n=0; n<(long long)numOfPixels; n++ )
+                        {
+                            if ( target( size_t(n) ) != bg_value_ )
+                            {
+                                target.calculate_index( size_t(n), pt_target );
+
+                                // transform the point
+                                this->transform_->transform(pt_target, pt_source);
+
+                                // interpolate the source
+                                warped( size_t(n) ) = (*interp_)(pt_source);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegWarper<ValueType, CoordType, DIn, DOut>::\
+                                    warp(const TargetType& target, const SourceType& source, bool useWorldCoordinate, TargetType& warped) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    bool hoImageRegWarper<ValueType, CoordType, DIn, DOut>::
+    warpWithDeformationFieldWorldCoordinate(const TargetType& target, const SourceType& source, TargetType& warped)
+    {
+        try
+        {
+            GADGET_DEBUG_CHECK_RETURN_FALSE(DIn==DOut);
+            GADGET_DEBUG_CHECK_RETURN_FALSE(transform_!=NULL);
+
+            DeformTransformationType* transformDeformField = dynamic_cast<DeformTransformationType*>(transform_);
+            GADGET_DEBUG_CHECK_RETURN_FALSE(transformDeformField!=NULL);
+
+            GADGET_DEBUG_CHECK_RETURN_FALSE(interp_!=NULL);
+            interp_->setArray( const_cast<SourceType&>(source) );
+
+            warped = target;
+
+            if ( DIn==2 && DOut==2 )
+            {
+                size_t sx = target.get_size(0);
+                size_t sy = target.get_size(1);
+
+                long long y;
+
+                // #pragma omp parallel private(y) shared(sx, sy, target, source, warped) num_threads(2)
+                {
+                    coord_type px, py, dx, dy, ix_source, iy_source;
+
+                    // #pragma omp for 
+                    for ( y=0; y<(long long)sy; y++ )
+                    {
+                        for ( size_t x=0; x<sx; x++ )
+                        {
+                            size_t offset = x + y*sx;
+
+                            if ( target( offset ) != bg_value_ )
+                            {
+                                // target to world
+                                target.image_to_world(x, size_t(y), px, py);
+
+                                // transform the point
+                                transformDeformField->get(x, size_t(y), dx, dy);
+
+                                // world to source
+                                source.world_to_image(px+dx, py+dy, ix_source, iy_source);
+
+                                // interpolate the source
+                                warped( offset ) = (*interp_)(ix_source, iy_source);
+                            }
+                        }
+                    }
+                }
+            }
+            else if ( DIn==3 && DOut==3 )
+            {
+                size_t sx = target.get_size(0);
+                size_t sy = target.get_size(1);
+                size_t sz = target.get_size(2);
+
+                long long z;
+
+                #pragma omp parallel private(z) shared(sx, sy, sz, target, source, warped)
+                {
+                    coord_type px, py, pz, dx, dy, dz, ix_source, iy_source, iz_source;
+
+                    #pragma omp for 
+                    for ( z=0; z<(long long)sz; z++ )
+                    {
+                        for ( size_t y=0; y<sy; y++ )
+                        {
+                            size_t offset = y*sx + z*sx*sy;
+
+                            for ( size_t x=0; x<sx; x++ )
+                            {
+                                if ( target( x+offset ) != bg_value_ )
+                                {
+                                    // target to world
+                                    target.image_to_world(x, y, size_t(z), px, py, pz);
+
+                                    // transform the point
+                                    transformDeformField->get(x, y, size_t(z), dx, dy, dz);
+
+                                    // world to source
+                                    source.world_to_image(px+dx, py+dy, pz+dz, ix_source, iy_source, iz_source);
+
+                                    // interpolate the source
+                                    warped( x+offset ) = (*interp_)(ix_source, iy_source, iz_source);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            else
+            {
+                size_t numOfPixels = target.get_number_of_elements();
+
+                long long n;
+
+                #pragma omp parallel private(n) shared(numOfPixels, target, source, warped)
+                {
+                    size_t ind_target[DIn];
+                    coord_type pt_target[DIn];
+                    coord_type pt_source[DOut];
+                    coord_type ind_source[DOut];
+
+                    unsigned int ii;
+
+                    #pragma omp for 
+                    for ( n=0; n<(long long)numOfPixels; n++ )
+                    {
+                        if ( target( size_t(n) ) != bg_value_ )
+                        {
+                            // target to world
+                            target.calculate_index( size_t(n), ind_target );
+
+                            target.image_to_world(ind_target, pt_target);
+
+                            // transform the point
+                            transformDeformField->get(ind_target, pt_source);
+
+                            for ( ii=0; ii<DIn; ii++ )
+                            {
+                                pt_source[ii] += pt_target[ii];
+                            }
+
+                            // world to source
+                            source.world_to_image(pt_source, ind_source);
+
+                            // interpolate the source
+                            warped( size_t(n) ) = (*interp_)(ind_source);
+                        }
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GERROR_STREAM("Errors happened in hoImageRegWarper<ValueType, CoordType, DIn, DOut>::\
+                                    warpWithDeformationFieldWorldCoordinate(const TargetType& target, const SourceType& source, TargetType& warped) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    template<typename ValueType, typename CoordType, unsigned int DIn, unsigned int DOut> 
+    void hoImageRegWarper<ValueType, CoordType, DIn, DOut>::print(std::ostream& os) const
+    {
+        using namespace std;
+        os << "--------------Gagdgetron image warper -------------" << endl;
+        os << "Input dimension is : " << DIn << endl;
+        os << "Output dimension is : " << DOut << endl;
+
+        std::string elemTypeName = std::string(typeid(ValueType).name());
+        os << "Image data type is : " << elemTypeName << std::endl;
+
+        elemTypeName = std::string(typeid(CoordType).name());
+        os << "Transformation coordinate data type is : " << elemTypeName << std::endl;
+    }
+}
diff --git a/toolboxes/registration/optical_flow/gpu/CMakeLists.txt b/toolboxes/registration/optical_flow/gpu/CMakeLists.txt
new file mode 100644
index 0000000..7a38931
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/CMakeLists.txt
@@ -0,0 +1,39 @@
+if (WIN32)
+ADD_DEFINITIONS(-D__BUILD_GADGETRON_GPUREG__)
+endif (WIN32)
+
+include_directories(   
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow
+  ${CUDA_INCLUDE_DIRS}
+)
+
+cuda_add_library(gadgetron_toolbox_gpureg SHARED 
+  cuOpticalFlowSolver.cu 
+  cuHSOpticalFlowSolver.cu 
+  cuCKOpticalFlowSolver.cu 
+  cuResampleOperator.cu 
+  cuLinearResampleOperator.cu
+#  cuRegistration_utils.cu
+  )
+
+set_target_properties(gadgetron_toolbox_gpureg PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(gadgetron_toolbox_gpureg 
+  gadgetron_toolbox_gpucore
+  gadgetron_toolbox_log
+  ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES}
+  )
+
+install(TARGETS gadgetron_toolbox_gpureg DESTINATION lib COMPONENT main)
+
+install(FILES
+  cuOpticalFlowSolver.h
+  cuHSOpticalFlowSolver.h
+  cuCKOpticalFlowSolver.h
+  gpureg_export.h
+  cuResampleOperator.h
+  cuLinearResampleOperator.h
+#  cuRegistration_utils.h
+  cuCGHSOFSolver.h
+  DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
diff --git a/toolboxes/registration/optical_flow/gpu/cuCGHSOFSolver.h b/toolboxes/registration/optical_flow/gpu/cuCGHSOFSolver.h
new file mode 100644
index 0000000..39b2ba9
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuCGHSOFSolver.h
@@ -0,0 +1,67 @@
+#pragma once
+
+#include "opticalFlowOperator.h"
+#include "cuPartialDerivativeOperator.h"
+#include "cuCgSolver.h"
+namespace Gadgetron{
+
+
+template<class T, unsigned int D> class cuCGHSOFSolver : public multiresRegistrationSolver<cuNDArray<T>, D>{
+public:
+	cuCGHSOFSolver(){
+
+		OF = boost::shared_ptr<OFOp>(new OFOp);
+		solver = boost::shared_ptr< cuCgSolver<T> >(new cuCgSolver<T>);
+		solver->set_encoding_operator(OF);
+		for (unsigned int i = 0; i < D; i++){
+			boost::shared_ptr<cuPartialDerivativeOperator<T,D> > dx(new cuPartialDerivativeOperator<T,D>(i));
+			solver->add_regularization_operator(dx);
+			ops.push_back(dx);
+		}
+	}
+
+	virtual ~cuCGHSOFSolver(){};
+	typedef opticalFlowOperator<cuNDArray<T>,cuPartialDerivativeOperator<T,D>,D> OFOp;
+
+	virtual void compute( cuNDArray<T> *fixed_image, cuNDArray<T> *moving_image, cuNDArray<T> *stencil_image, boost::shared_ptr<cuNDArray<T> > &result )
+  {
+		std::vector<size_t> dims = *fixed_image->get_dimensions();
+		OF->set_codomain_dimensions(&dims);    
+		OF->set_images(fixed_image,moving_image);
+
+		for (int i = 0; i < ops.size(); i++){
+				ops[i]->set_domain_dimensions(&dims);
+				ops[i]->set_codomain_dimensions(&dims);
+				ops[i]->set_weight(_alpha);
+		}
+
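+		// the unknown displacement field has D components per voxel, so the OF operator's domain is the image dimensions plus one extra dimension of size D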
+		dims.push_back(D);
+		OF->set_domain_dimensions(&dims);
+		cuNDArray<T> It(*fixed_image);
+		It -= *moving_image;
+		boost::shared_ptr<cuNDArray<T> > resOp = solver->solve(&It);
+
+		if (result.get()) *result += *resOp;
+		else result = resOp;
+	}
+
+	void set_alpha(T alpha){
+		_alpha = alpha;
+	}
+
+	boost::shared_ptr< cuCgSolver<T> > get_solver(){
+		return solver;
+	}
+
+protected:
+
+	T _alpha;
+	boost::shared_ptr< cuCgSolver<T> > solver;
+	boost::shared_ptr<OFOp> OF;
+	std::vector<boost::shared_ptr<cuPartialDerivativeOperator<T,D> >  >ops;
+};
+
+
+
+
+}
diff --git a/toolboxes/registration/optical_flow/gpu/cuCKOpticalFlowSolver.cu b/toolboxes/registration/optical_flow/gpu/cuCKOpticalFlowSolver.cu
new file mode 100644
index 0000000..729eab4
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuCKOpticalFlowSolver.cu
@@ -0,0 +1,340 @@
+#include "cuCKOpticalFlowSolver.h"
+#include "vector_td_utilities.h"
+
+namespace Gadgetron{
+
+  //
+  // Kernel prototype declarations
+  //
+
+  template<class REAL, unsigned int D> __global__ 
+  void CorneliusKanade_kernel(const REAL*,const REAL*,const REAL*,REAL*,typename uintd<D>::Type,unsigned int,REAL,REAL,REAL,unsigned int*);
+
+  //
+  // Reference to shared memory
+  //
+
+  extern __shared__ char _shared_mem[];
+
+  //
+  // Implementation
+  //
+
+  template<class T, unsigned int D> boost::shared_ptr< cuNDArray<T> >
+  cuCKOpticalFlowSolver<T,D>::core_solver( cuNDArray<T> *gradient_image, cuNDArray<T> *stencil_image )
+  {
+
+    // Sanity checks
+    //
+  
+    if( !gradient_image ){
+      throw std::runtime_error("cuCKOpticalFlowSolver::core_solver(): illegal input gradient image received.");
+    }
+
+    if( gradient_image->get_number_of_dimensions() <= D ){
+      throw std::runtime_error("cuCKOpticalFlowSolver::core_solver(): number of gradient image dimensions is too small.");
+    }
+  
+    // The dimensions of the displacement field should match the gradient field
+    //
+  
+    boost::shared_ptr< std::vector<size_t> > disp_dims = gradient_image->get_dimensions();
+
+    boost::shared_ptr< cuNDArray<T> > displacements_ping( new cuNDArray<T>(disp_dims.get()));
+    boost::shared_ptr< cuNDArray<T> > displacements_pong( new cuNDArray<T>(disp_dims.get()));
+  
+    clear(displacements_ping.get());
+    clear(displacements_pong.get());
+    
+    // Setup grid
+    //
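+    // (the CK solver estimates D displacement components plus one intensity attenuation component per pixel, hence D+1 below)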
+
+    typename uint64d<D>::Type matrix_size = from_std_vector<size_t,D>( *gradient_image->get_dimensions() );  
+    unsigned int number_of_elements = prod(matrix_size);
+    unsigned int number_of_batches = 1;
+
+    for( unsigned int d=D; d<gradient_image->get_number_of_dimensions()-1; d++ ){
+      number_of_batches *= gradient_image->get_size(d);
+    }
+  
+    dim3 blockDim; dim3 gridDim;
+    this->setup_grid( &blockDim, &gridDim, number_of_elements, number_of_batches*(D+1), true, D+1 );
+  
+    // Allocate continuation flag (used for early Jacobi termination by the kernel)
+    //
+  
+    unsigned int *continue_flag;
+    if( cudaMalloc((void**)&continue_flag, sizeof(unsigned int) ) != cudaSuccess ) {
+      throw std::runtime_error("cuCKOpticalFlowSolver::core_solver(): failed to allocate continuation flag.");
+    }
+    
+    unsigned int iteration_no = 0;
+    cuNDArray<T> *ping = displacements_ping.get();
+    cuNDArray<T> *pong = displacements_pong.get();
+
+    if( this->output_mode_ >= cuOpticalFlowSolver<T,D>::OUTPUT_VERBOSE ) {
+      GDEBUG_STREAM(std::endl);
+    }
+
+    //
+    // Main Jacobi loop
+    //
+
+    while(true){
+    
+      if( this->output_mode_ >= cuOpticalFlowSolver<T,D>::OUTPUT_VERBOSE ) {
+        GDEBUG_STREAM("."; std::cout.flush());
+      }
+    
+      // Clear termination flag
+      //
+    
+      unsigned int _continue_flag = 0;
+      if( cudaMemcpy( continue_flag, &_continue_flag, sizeof(unsigned int), cudaMemcpyHostToDevice ) != cudaSuccess ) {
+        throw std::runtime_error("cuCKOpticalFlowSolver::core_solver(): failed to set continuation flag.");
+      }
+    
+      // Invoke kernel
+      //
+    
+      CorneliusKanade_kernel<T,D><<< gridDim, blockDim, (blockDim.x*blockDim.y)*sizeof(T) >>>
+        ( gradient_image->get_data_ptr(), (stencil_image) ? stencil_image->get_data_ptr() : 0x0,
+          ping->get_data_ptr(), pong->get_data_ptr(), 
+          vector_td<unsigned int,D>(matrix_size), number_of_batches, alpha_, beta_, this->limit_*this->limit_, continue_flag );
+      
+      CHECK_FOR_CUDA_ERROR();
+
+      // Swap in/out buffers
+      //
+    
+      cuNDArray<T> *tmp = ping;
+      ping = pong;
+      pong = tmp;
+
+      // Check termination criteria
+      //
+
+      if( cudaMemcpy(&_continue_flag, continue_flag, sizeof(unsigned int), cudaMemcpyDeviceToHost) != cudaSuccess ) {
+        throw std::runtime_error("cuHSOpticalFlowSolver::core_solver(): failed to evaluate the continuation flag.");
+      }
+    
+      if( _continue_flag == 0 ){
+        if( this->output_mode_ >= cuOpticalFlowSolver<T,D>::OUTPUT_VERBOSE ) {
+          GDEBUG_STREAM(std::endl << "Break after " << iteration_no+1 << " iterations" << std::endl);
+        }
+        break;
+      }
+      
+      if( iteration_no > this->max_num_iterations_per_level_ ) 
+        break;    
+      
+      iteration_no++;
+    }
+  
+    if( cudaFree(continue_flag) != cudaSuccess ) {
+      throw std::runtime_error("cuCKOpticalFlowSolver::core_solver(): failed to free continuation flag.");
+    }
+  
+    if( ping == displacements_ping.get() )   
+      return displacements_ping;
+    else
+      return displacements_pong;
+  }
+
+  // Helpers
+  //
+  
+  template<unsigned int D> __device__ 
+  bool is_border_pixel_for_stride( typename intd<D>::Type stride, typename uintd<D>::Type co, typename uintd<D>::Type dims )
+  {
+    for( unsigned int d=0; d<D; d++ ){
+      if( stride.vec[d] == -1 ){
+        if( co.vec[d] == 0 ){
+          return true;
+        }
+      }
+      else if( stride.vec[d] == 1 ){
+        if( co.vec[d] == (dims.vec[d]-1) ){
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+  
+  template<unsigned int i, unsigned int j> struct Pow
+  {
+    enum { Value = i*Pow<i,j-1>::Value };
+  };
+  
+  template <unsigned int i> struct Pow<i,1>
+  {
+    enum { Value = i };
+  };
+  
+  // Cornelius-Kanade / Jacobi iteration
+  //
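+  // each thread handles one flow component of one pixel: it averages the neighbouring displacements in shared memory,
+  // performs one Jacobi update, and raises the continue flag when the change exceeds the convergence threshold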
+
+  template<class REAL, unsigned int D> __global__ void
+  CorneliusKanade_kernel( const REAL * __restrict__ gradient_image, const REAL * __restrict__ stencil_image,
+                          const REAL * __restrict__ in_disp, REAL * __restrict__ out_disp,
+                          typename uintd<D>::Type matrix_size, unsigned int num_batches,
+                          REAL alpha, REAL beta, REAL disp_thresh_sqr, unsigned int * __restrict__ continue_signal )
+  {  
+    
+    // The overall flow dimension corresponding to this thread
+    const unsigned int dim = threadIdx.y;
+
+    // The thread idx relative to the flow dimension
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+    
+    // Number of elements per batch
+    const unsigned int num_elements_per_batch = prod(matrix_size);
+  
+    // Number of elements per dim
+    const unsigned int num_elements_per_dim = num_elements_per_batch*num_batches;
+    
+    // We use shared memory to hold the averaged displacements
+    REAL *shared_mem = (REAL*) _shared_mem;
+  
+    //
+    // Find the average velocities (shared memory)
+    //
+  
+    // Batch idx (second slowest varying dimension)   
+    const unsigned int batch_idx = idx/num_elements_per_batch;
+    
+    // Local index to the image (or batch in our terminology)
+    const unsigned int idx_in_batch = idx-batch_idx*num_elements_per_batch;
+
+    // All threads (even out-of-range ones) must reach the synchronization point below
+    //
+
+    bool legal_idx = (idx < num_elements_per_dim);
+  
+    if( legal_idx && stencil_image && stencil_image[idx_in_batch] > REAL(0) )
+      legal_idx = false;
+
+    if( legal_idx ){
+
+      // Local co to the image
+      const typename uintd<D>::Type co = idx_to_co<D>( idx_in_batch, matrix_size );
+    
+      const typename intd<D>::Type zeros (0);
+      const typename intd<D>::Type ones(1);
+      const typename intd<D>::Type threes(3);
+    
+      const int num_neighbors = Pow<3,D>::Value;
+      REAL num_contribs = REAL(0);
+    
+      // Idx local to the shared memory
+      const unsigned int shared_idx = threadIdx.y*blockDim.x+threadIdx.x;
+    
+      shared_mem[shared_idx] = REAL(0);
+    
+      // Compute average of neighbors
+      //
+    
+      for( int i=0; i<num_neighbors; i++ ){
+      
+        // Find the stride of the neighbor {-1, 0, 1}^D
+        const typename intd<D>::Type stride = idx_to_co<D>( i, threes ) - ones;
+        
+        unsigned int neighbor_idx;
+        
+        const unsigned int base_offset = dim*num_elements_per_dim + batch_idx*num_elements_per_batch;
+        
+        // Verify that the neighbor is not out of bounds (and not the thread itself)
+        if( !is_border_pixel_for_stride<D>( stride, co, matrix_size ) && !(stride==zeros) ){	
+          neighbor_idx = (unsigned int) co_to_idx<D>( vector_td<int,D>(co)+stride, vector_td<int,D>(matrix_size)) + base_offset;
+        }
+        else{
+          neighbor_idx = idx_in_batch + base_offset;
+        }
+        
+        shared_mem[shared_idx] += in_disp[neighbor_idx];
+        num_contribs += REAL(1);
+      }
+      
+      // Normalize
+      shared_mem[shared_idx] /= num_contribs;
+    }
+  
+    // Block until all averages have been computed (we need all d dims below)
+    __syncthreads();
+  
+    if( legal_idx ){
+    
+      //
+      // Update displacement field (Jacobi iteration)
+      //
+    
+      REAL phi = REAL(0);
+      REAL norm = REAL(0);
+    
+      typename reald<REAL,D>::Type derivatives;
+    
+      // Contributions from the spatial dimensions
+      //
+      
+      for( unsigned int d=0; d<D; d++ ){
+        derivatives.vec[d] = gradient_image[d*num_elements_per_dim+idx];
+        const unsigned int shared_idx = d*blockDim.x+threadIdx.x;
+        phi += (shared_mem[shared_idx]*derivatives.vec[d]);
+        norm += (derivatives.vec[d]*derivatives.vec[d]);
+      }
+      
+      // Contributions from the temporal dimension
+      //
+      
+      phi += gradient_image[D*num_elements_per_dim+idx];
+    
+      // Contribution from the intensity attenuation estimation
+      //
+    
+      phi -= shared_mem[D*blockDim.x+threadIdx.x];
+    
+      // Normalize
+      //
+    
+      phi /= ((alpha/beta)*(alpha/beta)+alpha*alpha+norm);
+    
+      // Form result displacement
+      //
+    
+      const unsigned int shared_idx = dim*blockDim.x+threadIdx.x;
+      REAL result;
+    
+      if( dim<D )
+        result = shared_mem[shared_idx]-derivatives.vec[dim]*phi;
+      else
+        result = shared_mem[D*blockDim.x+threadIdx.x]+(alpha/beta)*(alpha/beta)*phi;
+      
+      // Clear the "termination" flag if the displacement field has changed above the threshold
+      //
+      
+      REAL delta = result-in_disp[dim*num_elements_per_dim+idx];
+      if( dim < D && delta*delta > disp_thresh_sqr )
+        continue_signal[0] = 1;
+      
+      // Output result
+      //
+      
+      out_disp[dim*num_elements_per_dim+idx] = result;
+    }
+  }
+
+  // 
+  // Template instantiation
+  //
+
+  template class EXPORTGPUREG cuCKOpticalFlowSolver<float,1>;
+  template class EXPORTGPUREG cuCKOpticalFlowSolver<float,2>;
+  template class EXPORTGPUREG cuCKOpticalFlowSolver<float,3>;
+  template class EXPORTGPUREG cuCKOpticalFlowSolver<float,4>;
+
+  template class EXPORTGPUREG cuCKOpticalFlowSolver<double,1>;
+  template class EXPORTGPUREG cuCKOpticalFlowSolver<double,2>;
+  template class EXPORTGPUREG cuCKOpticalFlowSolver<double,3>;
+  template class EXPORTGPUREG cuCKOpticalFlowSolver<double,4>;  
+}
diff --git a/toolboxes/registration/optical_flow/gpu/cuCKOpticalFlowSolver.h b/toolboxes/registration/optical_flow/gpu/cuCKOpticalFlowSolver.h
new file mode 100644
index 0000000..f1caafc
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuCKOpticalFlowSolver.h
@@ -0,0 +1,55 @@
+/** \file cuCKOpticalFlowSolver.h
+    \brief GPU-based Cornelius-Kanade optical flow registration solver.
+
+    References to the solver implementation and some usage scenarios can be found in:
+
+    An optimised multi-baseline approach for on-line MR-temperature monitoring on commodity graphics hardware
+    BD de Senneville, KØ Noe, M Ries, M Pedersen, CTW Moonen, TS Sørensen.
+    5th IEEE International Symposium on Biomedical Imaging: From Nano to Macro, 2008. ISBI 2008. pp. 1513-1516.
+
+    Acceleration and validation of optical flow based deformable registration for image-guided radiotherapy.
+    KØ Noe, BD de Senneville, UV Elstrøm, K Tanderup, TS Sørensen.
+    Acta Oncologica 2008; 47(7): 1286-1293.
+
+    Retrospective reconstruction of high temporal resolution cine images from real‐time MRI using iterative motion correction
+    MS Hansen, TS Sørensen, AE Arai, P Kellman.
+    Magnetic Resonance in Medicine 2012; 68(3): 741-750.
+*/
+
+#pragma once
+
+#include "cuOpticalFlowSolver.h"
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> class EXPORTGPUREG cuCKOpticalFlowSolver 
+    : public cuOpticalFlowSolver<T, D>
+  {
+  
+  public:
+
+    // Constructors / destructors
+    //
+  
+    cuCKOpticalFlowSolver() : cuOpticalFlowSolver<T,D>(){ 
+      alpha_ = T(0.05); 
+      beta_ = T(1.0); 
+    } 
+  
+    virtual ~cuCKOpticalFlowSolver() {}
+  
+    // Set the regularization weight
+    //
+  
+    inline void set_alpha( T alpha ) { alpha_ = alpha; }
+    inline void set_beta( T beta ) { beta_ = beta; }
+  
+  protected:  
+    virtual boost::shared_ptr< cuNDArray<T> > 
+      core_solver( cuNDArray<T> *gradient_image, cuNDArray<T> *stencil_image );  
+    
+  protected:
+    T alpha_;
+    T beta_;
+  };
+}
diff --git a/toolboxes/registration/optical_flow/gpu/cuHSOpticalFlowSolver.cu b/toolboxes/registration/optical_flow/gpu/cuHSOpticalFlowSolver.cu
new file mode 100644
index 0000000..0b3c15d
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuHSOpticalFlowSolver.cu
@@ -0,0 +1,326 @@
+#include "cuHSOpticalFlowSolver.h"
+#include "vector_td_utilities.h"
+
+namespace Gadgetron{
+
+  //
+  // Kernel prototype declarations
+  //
+
+  template<class REAL, unsigned int D> __global__ 
+  void HornSchunk_kernel(const REAL*,const REAL*,const REAL*,REAL*,typename uintd<D>::Type,unsigned int,REAL,REAL,unsigned int*);
+
+  //
+  // Reference to shared memory
+  //
+
+  extern __shared__ char _shared_mem[];
+
+  //
+  // Implementation
+  //
+
+  template<class T, unsigned int D> boost::shared_ptr< cuNDArray<T> >
+  cuHSOpticalFlowSolver<T,D>::core_solver( cuNDArray<T> *gradient_image, cuNDArray<T> *stencil_image )
+  {
+    // Sanity checks
+    //
+  
+    if( !gradient_image ){
+      throw std::runtime_error("cuHSOpticalFlowSolver::core_solver(): illegal input gradient image received.");
+    }
+  
+    if( gradient_image->get_number_of_dimensions() <= D ){
+      throw std::runtime_error("cuHSOpticalFlowSolver::core_solver(): number of gradient image dimensions is too small.");
+    }
+    
+    // The dimensions of the displacement field should match the gradient field
+    // - when removing the temporal gradient component (replacing D+1 with D)
+    //
+  
+    boost::shared_ptr< std::vector<size_t> > disp_dims = gradient_image->get_dimensions();
+    disp_dims->pop_back(); disp_dims->push_back(D);
+
+    boost::shared_ptr< cuNDArray<T> > displacements_ping(new cuNDArray<T>(disp_dims.get()));
+    boost::shared_ptr< cuNDArray<T> > displacements_pong(new cuNDArray<T>(disp_dims.get()));
+  
+    clear(displacements_ping.get());
+    clear(displacements_pong.get());
+      
+    // Setup grid
+    //
+
+    typename uint64d<D>::Type matrix_size = from_std_vector<size_t,D>( *gradient_image->get_dimensions() );  
+    unsigned int number_of_elements = prod(matrix_size);
+    unsigned int number_of_batches = 1;
+  
+    for( unsigned int d=D; d<gradient_image->get_number_of_dimensions()-1; d++ ){
+      number_of_batches *= gradient_image->get_size(d);
+    }
+  
+    dim3 blockDim; dim3 gridDim;
+    this->setup_grid( &blockDim, &gridDim, number_of_elements, number_of_batches*D, true, D );
+  
+    // Allocate continuation flag (used for early Jacobi termination by the kernel)
+    //
+  
+    unsigned int *continue_flag;
+    if( cudaMalloc((void**)&continue_flag, sizeof(unsigned int) ) != cudaSuccess ) {
+      throw std::runtime_error("cuHSOpticalFlowSolver::core_solver(): failed to allocate continuation flag.");
+    }
+  
+    unsigned int iteration_no = 0;
+    cuNDArray<T> *ping = displacements_ping.get();
+    cuNDArray<T> *pong = displacements_pong.get();
+
+    if( this->output_mode_ >= cuOpticalFlowSolver<T,D>::OUTPUT_VERBOSE ) {
+      GDEBUG_STREAM(std::endl);
+    }
+
+    //
+    // Main Jacobi loop
+    //
+
+    while(true){
+    
+      if( this->output_mode_ >= cuOpticalFlowSolver<T,D>::OUTPUT_VERBOSE ) {
+        GDEBUG_STREAM("."; std::cout.flush());
+      }
+    
+      // Clear termination flag
+      //
+    
+      unsigned int _continue_flag = 0;
+      if( cudaMemcpy( continue_flag, &_continue_flag, sizeof(unsigned int), cudaMemcpyHostToDevice ) != cudaSuccess ) {
+        throw std::runtime_error("cuHSOpticalFlowSolver::core_solver(): failed to set continuation flag.");
+      }
+    
+      // Invoke kernel
+      //
+    
+      HornSchunk_kernel<T,D><<< gridDim, blockDim, (blockDim.x*blockDim.y)*sizeof(T) >>>
+        ( gradient_image->get_data_ptr(), (stencil_image) ? stencil_image->get_data_ptr() : 0x0,
+          ping->get_data_ptr(), pong->get_data_ptr(),
+          vector_td<unsigned int,D>(matrix_size), number_of_batches, alpha_, this->limit_*this->limit_, continue_flag );
+    
+      CHECK_FOR_CUDA_ERROR();
+
+      // Swap in/out buffers
+      //
+    
+      cuNDArray<T> *tmp = ping;
+      ping = pong;
+      pong = tmp;
+
+      // Check termination criteria
+      //
+
+      if( cudaMemcpy(&_continue_flag, continue_flag, sizeof(unsigned int), cudaMemcpyDeviceToHost) != cudaSuccess ) {
+        throw std::runtime_error("cuHSOpticalFlowSolver::core_solver(): failed to evaluate the continuation flag.");
+      }
+    
+      if( _continue_flag == 0 ){
+        if( this->output_mode_ >= cuOpticalFlowSolver<T,D>::OUTPUT_VERBOSE ) {
+          GDEBUG_STREAM(std::endl << "Break after " << iteration_no+1 << " iterations" << std::endl);
+        }
+        break;
+      }
+    
+      if( iteration_no > this->max_num_iterations_per_level_ ) 
+        break;    
+    
+      iteration_no++;
+    }
+  
+    if( cudaFree(continue_flag) != cudaSuccess ) {
+      throw std::runtime_error("cuHSOpticalFlowSolver::core_solver(): failed to free continuation flag.");
+    }
+    
+    if( ping == displacements_ping.get() )   
+      return displacements_ping;
+    else
+      return displacements_pong;
+  }
+  
+  // Helpers
+  //
+  
+  template<unsigned int D> __device__ 
+  bool is_border_pixel_for_stride( typename intd<D>::Type stride, typename uintd<D>::Type co, typename uintd<D>::Type dims )
+  {
+    for( unsigned int d=0; d<D; d++ ){
+      if( stride.vec[d] == -1 ){
+        if( co.vec[d] == 0 ){
+          return true;
+        }
+      }
+      else if( stride.vec[d] == 1 ){
+        if( co.vec[d] == (dims.vec[d]-1) ){
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+  
+  template<unsigned int i, unsigned int j> struct Pow
+  {
+    enum { Value = i*Pow<i,j-1>::Value };
+  };
+  
+  template <unsigned int i> struct Pow<i,1>
+  {
+    enum { Value = i };
+  };
+  
+  // Horn-Schunck / Jacobi iteration
+  //
+  
+  template<class REAL, unsigned int D> __global__ void
+  HornSchunk_kernel( const REAL * __restrict__ gradient_image, const REAL * __restrict__ stencil_image,
+                     const REAL * __restrict__ in_disp, REAL * __restrict__ out_disp,
+                     typename uintd<D>::Type matrix_size, unsigned int num_batches,
+                     REAL alpha, REAL disp_thresh_sqr, unsigned int * __restrict__ continue_signal )
+  {  
+    
+    // The overall flow dimension corresponding to this thread
+    const unsigned int dim = threadIdx.y;
+    
+    // The thread idx relative to the flow dimension
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+    
+    // Number of elements per batch
+    const unsigned int num_elements_per_batch = prod(matrix_size);
+    
+    // Number of elements per dim
+    const unsigned int num_elements_per_dim = num_elements_per_batch*num_batches;
+    
+    // We use shared memory to hold the averaged displacements
+    REAL *shared_mem = (REAL*) _shared_mem;
+    
+    //
+    // Find the average velocities (shared memory)
+    //
+    
+    // Batch idx (second slowest varying dimension)   
+    const unsigned int batch_idx = idx/num_elements_per_batch;
+    
+    // Local index to the image (or batch in our terminology)
+    const unsigned int idx_in_batch = idx-batch_idx*num_elements_per_batch;
+    
+    // All threads (even out-of-range ones) must reach the synchronization point below
+    //
+    
+    bool legal_idx = (idx < num_elements_per_dim);
+    
+    if( legal_idx && stencil_image && stencil_image[idx_in_batch] > REAL(0) )
+      legal_idx = false;
+    
+    if( legal_idx ){
+      
+      // Local co to the image
+      const typename uintd<D>::Type co = idx_to_co<D>( idx_in_batch, matrix_size );
+      
+      const typename intd<D>::Type zeros(0);
+      const typename intd<D>::Type ones(1);
+      const typename intd<D>::Type threes(3);
+      
+      const int num_neighbors = Pow<3,D>::Value;
+      REAL num_contribs = REAL(0);
+      
+      // Idx local to the shared memory
+      const unsigned int shared_idx = threadIdx.y*blockDim.x+threadIdx.x;
+      
+      shared_mem[shared_idx] = REAL(0);
+      
+      for( int i=0; i<num_neighbors; i++ ){
+	
+        // Find the stride of the neighbor {-1, 0, 1}^D
+        const typename intd<D>::Type stride = idx_to_co<D>( i, threes ) - ones;
+	
+        // Verify that the neighbor is not out of bounds (and not the thread itself)
+        if( !is_border_pixel_for_stride<D>( stride, co, matrix_size ) && !(stride==zeros) ){
+	  
+          // Compute average of neighbors
+          //
+	  
+          const unsigned int base_offset = dim*num_elements_per_dim + batch_idx*num_elements_per_batch;
+          const unsigned int neighbor_idx = (unsigned int) co_to_idx<D>( vector_td<int,D>(co)+stride, vector_td<int,D>(matrix_size)) + base_offset;
+	  
+          shared_mem[shared_idx] += in_disp[neighbor_idx];
+          num_contribs += REAL(1);
+        }
+      }
+      
+      // Normalize
+      shared_mem[shared_idx] /= num_contribs;       	
+    }
+    
+    // Block until all averages have been computed (we need all d dims below)
+    __syncthreads();
+    
+    if( legal_idx ){
+      
+      //
+      // Update displacement field (Jacobi iteration)
+      //
+      
+      REAL phi = REAL(0);
+      REAL norm = REAL(0);
+      
+      typename reald<REAL,D>::Type derivatives;
+      
+      // Contributions from the spatial dimensions
+      //
+      
+      for( unsigned int d=0; d<D; d++ ){
+        derivatives.vec[d] = gradient_image[d*num_elements_per_dim+idx];
+        const unsigned int shared_idx = d*blockDim.x+threadIdx.x;
+        phi += (shared_mem[shared_idx]*derivatives.vec[d]);
+        norm += (derivatives.vec[d]*derivatives.vec[d]);
+      }
+      
+      // Contributions from the temporal dimension
+      //
+      
+      phi += gradient_image[D*num_elements_per_dim+idx];
+      
+      // Normalize
+      //
+      
+      phi /= (alpha*alpha+norm);
+      
+      // Form result displacement
+      //
+      
+      const unsigned int shared_idx = dim*blockDim.x+threadIdx.x;
+      REAL result = shared_mem[shared_idx]-derivatives.vec[dim]*phi;
+      
+      // Clear the "termination" flag if the displacement field has changed above the threshold
+      //
+      
+      REAL delta = result-in_disp[dim*num_elements_per_dim+idx];
+      if( delta*delta > disp_thresh_sqr )
+        continue_signal[0] = 1;
+      
+      // Output result
+      //
+      
+      out_disp[dim*num_elements_per_dim+idx] = result;
+    }
+  }
+  
+  // 
+  // Template instantiation
+  //
+  
+  template class EXPORTGPUREG cuHSOpticalFlowSolver<float,1>;
+  template class EXPORTGPUREG cuHSOpticalFlowSolver<float,2>;
+  template class EXPORTGPUREG cuHSOpticalFlowSolver<float,3>;
+  template class EXPORTGPUREG cuHSOpticalFlowSolver<float,4>;
+  
+  template class EXPORTGPUREG cuHSOpticalFlowSolver<double,1>;
+  template class EXPORTGPUREG cuHSOpticalFlowSolver<double,2>;
+  template class EXPORTGPUREG cuHSOpticalFlowSolver<double,3>;
+  template class EXPORTGPUREG cuHSOpticalFlowSolver<double,4>;
+}
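
Written out, the per-thread update performed by HornSchunk_kernel above is the classical Horn-Schunck Jacobi step (this restates the code, it is not an addition to it):

\[
\phi = \frac{\sum_{d} I_{d}\,\bar{u}_{d} + I_{t}}{\alpha^{2} + \sum_{d} I_{d}^{2}},
\qquad
u_{d}^{k+1} = \bar{u}_{d}^{k} - I_{d}\,\phi ,
\]

where \(\bar{u}_{d}\) is the neighborhood average of displacement component d held in shared memory, \(I_{d}\) and \(I_{t}\) are the spatial and temporal derivatives read from gradient_image, and \(\alpha\) is the regularization weight.
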
diff --git a/toolboxes/registration/optical_flow/gpu/cuHSOpticalFlowSolver.h b/toolboxes/registration/optical_flow/gpu/cuHSOpticalFlowSolver.h
new file mode 100644
index 0000000..3a5f73f
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuHSOpticalFlowSolver.h
@@ -0,0 +1,52 @@
+/** \file cuHSOpticalFlowSolver.h
+    \brief GPU-based Horn-Schunck optical flow registration solver.
+
+    References to the solver implementation and some usage scenarios can be found in:
+
+    An optimised multi-baseline approach for on-line MR-temperature monitoring on commodity graphics hardware
+    BD de Senneville, KØ Noe, M Ries, M Pedersen, CTW Moonen, TS Sørensen.
+    5th IEEE International Symposium on Biomedical Imaging: From Nano to Macro, 2008. ISBI 2008. pp. 1513-1516.
+
+    Acceleration and validation of optical flow based deformable registration for image-guided radiotherapy.
+    KØ Noe, BD de Senneville, UV Elstrøm, K Tanderup, TS Sørensen.
+    Acta Oncologica 2008; 47(7): 1286-1293.
+
+    Retrospective reconstruction of high temporal resolution cine images from real-time MRI using iterative motion correction
+    MS Hansen, TS Sørensen, AE Arai, P Kellman.
+    Magnetic Resonance in Medicine 2012; 68(3): 741-750.
+*/
+
+#pragma once
+
+#include "cuOpticalFlowSolver.h"
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> class EXPORTGPUREG cuHSOpticalFlowSolver 
+    : public cuOpticalFlowSolver<T, D>
+  {
+  
+  public:
+
+    // Constructors / destructors
+    //
+  
+    cuHSOpticalFlowSolver() : cuOpticalFlowSolver<T,D>(){ 
+      alpha_ = T(0.1); 
+    } 
+  
+    virtual ~cuHSOpticalFlowSolver() {}
+  
+    // Set the regularization weight
+    //
+  
+    inline void set_alpha( T alpha ) { alpha_ = alpha; }
+  
+  protected:  
+    virtual boost::shared_ptr< cuNDArray<T> > 
+      core_solver( cuNDArray<T> *gradient_image, cuNDArray<T> *stencil_image );
+    
+  protected:
+    T alpha_;
+  };
+}
diff --git a/toolboxes/registration/optical_flow/gpu/cuLinearResampleOperator.cu b/toolboxes/registration/optical_flow/gpu/cuLinearResampleOperator.cu
new file mode 100644
index 0000000..0154082
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuLinearResampleOperator.cu
@@ -0,0 +1,265 @@
+#include "cuLinearResampleOperator.h"
+#include "cuNDArray_reductions.h"
+#include "cuResampleOperator_macros.h"
+#include "vector_td_utilities.h"
+#include "check_CUDA.h"
+#include "setup_grid.h"
+
+namespace Gadgetron{
+
+  //
+  // Check if all neighbors required for the linear interpolation exist
+  // 
+
+  template<class REAL, unsigned int D> __device__ 
+  bool is_border_pixel( typename reald<REAL,D>::Type co, typename uintd<D>::Type dims )
+  {
+    for( unsigned int dim=0; dim<D; dim++ ){
+      if( dims[dim] > 1 && ( co[dim] < REAL(0) || co[dim] >= (REAL(dims[dim])-REAL(1)) ) )
+        return true;
+    }
+    return false;
+  }
+  
+  template<unsigned int D> static __inline__ __host__ __device__ 
+  unsigned int _get_num_neighbors()
+  {
+    return 1 << D;
+  }
+
+  template<class T, unsigned int D> unsigned int
+  cuLinearResampleOperator<T,D>::get_num_neighbors()
+  {
+    return _get_num_neighbors<D>();
+  }
+
+  //
+  // Linear interpolation
+  //
+
+  template<class T, unsigned int D> __device__ 
+  T interpolate( unsigned int batch_no, 
+                 typename reald<typename realType<T>::Type,D>::Type co, 
+                 typename uintd<D>::Type matrix_size, 
+                 const T * __restrict__ image )
+  {
+    typedef typename realType<T>::Type REAL;
+
+    // We will only proceed if all neighbors exist
+    //
+
+    if( is_border_pixel<REAL,D>(co, matrix_size) )
+      return T(0);
+
+    // To hold the result
+    //
+
+    T res = T(0);
+
+    // Iterate over all neighbors
+    //
+
+    const typename uintd<D>::Type twos(2);
+    const unsigned int num_neighbors = _get_num_neighbors<D>();
+  
+    for( unsigned int i=0; i<num_neighbors; i++ ){
+    
+      // Determine image coordinate of current neighbor
+      //
+
+      const typename uintd<D>::Type stride = idx_to_co<D>( i, twos );
+
+      if( weak_greater_equal( stride, matrix_size ) ) continue; // For dimensions of size 1
+
+      typename reald<REAL,D>::Type co_stride;
+
+      for( unsigned int dim=0; dim<D; dim++ ){
+        if( stride.vec[dim] == 0 ){
+          co_stride.vec[dim] = ::floor(co.vec[dim]);
+        }
+        else{
+          co_stride.vec[dim] = ::ceil(co.vec[dim]);
+          if( co_stride.vec[dim] == co.vec[dim] )
+            co_stride.vec[dim] += REAL(1.0);
+        }
+      }
+      
+      // Read corresponding pixel value
+      //
+    
+      T image_value = image[co_to_idx<D>(vector_td<unsigned int,D>(co_stride), matrix_size) + batch_no*prod(matrix_size)];
+    
+      // Determine weight
+      //
+
+      REAL weight = REAL(1);
+
+      for( unsigned int dim=0; dim<D; dim++ ){
+
+        if( stride.vec[dim] == 0 ){
+          weight *= (REAL(1.0)-(co.vec[dim]-co_stride.vec[dim]));
+        }
+        else{
+          weight *= (REAL(1.0)-(co_stride.vec[dim]-co.vec[dim]));
+        }
+      }
+      
+      // Accumulate result
+      //
+    
+      res += (weight * image_value);
+    }
+
+    // All done, return result
+    //
+
+    return res;
+  }
+
+  template<class REAL, unsigned int D> __global__ void
+  write_sort_arrays_kernel( typename uintd<D>::Type matrix_size, unsigned int extended_size, const REAL * __restrict__ displacements,
+                            unsigned int * __restrict__ sort_keys,  unsigned int * __restrict__ sort_values_indices, REAL * __restrict__ sort_values_weights )
+  {
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+    const unsigned int num_elements_mat = prod(matrix_size);
+    const unsigned int num_elements_ext = prod(matrix_size)*extended_size;
+  
+    if( idx < num_elements_ext ){
+
+      const unsigned int batch_no = idx/num_elements_mat;
+      const unsigned int idx_in_batch = idx-batch_no*num_elements_mat;
+    
+      const typename uintd<D>::Type co = idx_to_co<D>( idx_in_batch, matrix_size );
+
+      typename reald<REAL,D>::Type co_disp = vector_td<REAL,D>(co);
+      for( unsigned int dim=0; dim<D; dim++ )
+        co_disp.vec[dim] +=  displacements[dim*num_elements_ext+batch_no*num_elements_mat+idx_in_batch];
+    
+      // Determine the number of neighbors
+      //
+    
+      const typename uintd<D>::Type twos(2);
+      const unsigned int num_neighbors = _get_num_neighbors<D>();
+
+      // Weights are non-zero only if all neighbors exist
+      //
+    
+      bool non_zero = !is_border_pixel<REAL,D>(co_disp, matrix_size);
+
+      // Iterate over all neighbors
+      //
+    
+      for( unsigned int i=0; i<num_neighbors; i++ ){
+      
+        // Write out the sort values/indices
+        //
+        
+        sort_values_indices[idx+i*num_elements_ext] = idx;
+        
+        // Determine image coordinate of current neighbor
+        //
+        
+        const typename uintd<D>::Type stride = idx_to_co<D>( i, twos );
+        
+        if( weak_greater_equal( stride, matrix_size ) ) non_zero = false; // For dimensions of size 1
+        
+        typename reald<REAL,D>::Type co_stride;
+        
+        if( non_zero ){
+          for( unsigned int dim=0; dim<D; dim++ ){
+            if( stride.vec[dim] == 0 ){
+              co_stride.vec[dim] = ::floor(co_disp.vec[dim]);
+            }
+            else{
+              co_stride.vec[dim] = ::ceil(co_disp.vec[dim]);
+              if( co_stride.vec[dim] == co_disp.vec[dim] )
+                co_stride.vec[dim] += REAL(1.0);
+            }
+          }
+          
+          // Write out sort keys (moving image resampling indices).
+          //
+          
+          sort_keys[idx+i*num_elements_ext] = co_to_idx<D>(vector_td<unsigned int,D>(co_stride), matrix_size) + batch_no*num_elements_mat;
+        }
+        else{
+          sort_keys[idx+i*num_elements_ext] = idx; // Could be anything, weight is zero
+        }
+        
+        // Determine weight
+        //
+        
+        REAL weight = (non_zero) ? REAL(1) : REAL(0);
+        
+        if( non_zero ){
+          for( unsigned int dim=0; dim<D; dim++ ){	  
+            if( stride.vec[dim] == 0 ){
+              weight *= (REAL(1.0)-(co_disp.vec[dim]-co_stride.vec[dim])); }
+            else{
+              weight *= (REAL(1.0)-(co_stride.vec[dim]-co_disp.vec[dim])); }
+          }
+        }
+        
+        // Write out the sort values/weights
+        //
+
+        sort_values_weights[idx+i*num_elements_ext] = weight;
+      }
+    }
+  };
+
+  template<class T, unsigned int D> void 
+  cuLinearResampleOperator<T,D>::write_sort_arrays( thrust::device_vector<unsigned int> &sort_keys )
+  {
+    typename uint64d<D>::Type matrix_size = from_std_vector<size_t,D>(*this->offsets_->get_dimensions().get());
+    int surplus = this->offsets_->get_number_of_dimensions()-D;
+    unsigned int extended_dim = (surplus == 1) ? 1 : this->offsets_->get_size(D);
+  
+    dim3 blockDim, gridDim;
+    setup_grid( prod(matrix_size)*extended_dim, &blockDim, &gridDim );
+    
+    write_sort_arrays_kernel<typename realType<T>::Type,D><<< gridDim, blockDim >>>
+      ( vector_td<unsigned int,D>(matrix_size), extended_dim, this->offsets_->get_data_ptr(),
+        raw_pointer_cast(&(sort_keys[0])),
+        raw_pointer_cast(&(this->indices_)[0]),
+        raw_pointer_cast(&(this->weights_)[0]) );
+    
+    CHECK_FOR_CUDA_ERROR();
+  };
+  
+  // This macro is a workaround for Cuda's missing support for pure virtual functions.
+  // It defines mult_M and mult_MH and is intended to be shared among all classes derived from cuResampleOperator.
+  //
+  // 'cu' is automatically prepended to the macro argument (a workaround for the workaround).
+  //
+  
+  DECLARE_CU_RESAMPLE_OPERATOR_SUPPORT(LinearResampleOperator)
+  
+  // 
+  // Instantiation
+  //
+
+  template class EXPORTGPUREG cuLinearResampleOperator<float,1>;
+  template class EXPORTGPUREG cuLinearResampleOperator<float_complext,1>;
+
+  template class EXPORTGPUREG cuLinearResampleOperator<float,2>;
+  template class EXPORTGPUREG cuLinearResampleOperator<float_complext,2>;
+
+  template class EXPORTGPUREG cuLinearResampleOperator<float,3>;
+  template class EXPORTGPUREG cuLinearResampleOperator<float_complext,3>;
+
+  template class EXPORTGPUREG cuLinearResampleOperator<float,4>;
+  template class EXPORTGPUREG cuLinearResampleOperator<float_complext,4>;
+
+  template class EXPORTGPUREG cuLinearResampleOperator<double,1>;
+  template class EXPORTGPUREG cuLinearResampleOperator<double_complext,1>;
+
+  template class EXPORTGPUREG cuLinearResampleOperator<double,2>;
+  template class EXPORTGPUREG cuLinearResampleOperator<double_complext,2>;
+
+  template class EXPORTGPUREG cuLinearResampleOperator<double,3>;
+  template class EXPORTGPUREG cuLinearResampleOperator<double_complext,3>;
+
+  template class EXPORTGPUREG cuLinearResampleOperator<double,4>;
+  template class EXPORTGPUREG cuLinearResampleOperator<double_complext,4>;
+}
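
The interpolate<T,D>() device function above forms the multilinear estimate by visiting the 2^D corner neighbors (floor of the coordinate for stride 0, ceil for stride 1) and weighting each corner by one minus the distance along every axis. A hedged host-side 2D reference of that weighting, purely for illustration (the degenerate dims-of-size-1 handling via weak_greater_equal is omitted):

// Host-side 2D reference of the weighting in interpolate<T,D>(): visit the
// 2^D corners (floor per axis for stride 0, ceil for stride 1) and weight
// each by (1 - distance along every axis). Illustrative only.
#include <cmath>

static float bilinear_sample( const float *image, int nx, int ny, float x, float y )
{
  // Mirrors is_border_pixel(): all four corners must exist.
  if( x < 0.0f || y < 0.0f || x >= nx-1.0f || y >= ny-1.0f )
    return 0.0f;

  float res = 0.0f;
  for( int corner = 0; corner < 4; corner++ ){
    int sx = corner & 1, sy = (corner >> 1) & 1;     // stride in {0,1}^2
    float cx = sx ? std::ceil(x) : std::floor(x);
    float cy = sy ? std::ceil(y) : std::floor(y);
    if( sx && cx == x ) cx += 1.0f;                  // same tie-break as the kernel
    if( sy && cy == y ) cy += 1.0f;
    float weight = (sx ? 1.0f-(cx-x) : 1.0f-(x-cx)) *
                   (sy ? 1.0f-(cy-y) : 1.0f-(y-cy));
    res += weight * image[(int)cy*nx + (int)cx];
  }
  return res;
}
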
diff --git a/toolboxes/registration/optical_flow/gpu/cuLinearResampleOperator.h b/toolboxes/registration/optical_flow/gpu/cuLinearResampleOperator.h
new file mode 100644
index 0000000..ba5a5da
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuLinearResampleOperator.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include "cuResampleOperator.h"
+
+namespace Gadgetron{
+
+  template <class T, unsigned int D>
+  class EXPORTGPUREG cuLinearResampleOperator : public cuResampleOperator<T,D>
+  {  
+  public:
+  
+    cuLinearResampleOperator() : cuResampleOperator<T,D>() {}
+    virtual ~cuLinearResampleOperator() {}
+  
+    virtual void mult_M( cuNDArray<T> *in, cuNDArray<T> *out, bool accumulate = false);
+    virtual void mult_MH( cuNDArray<T> *in, cuNDArray<T> *out, bool accumulate = false);
+  
+
+  protected:
+    virtual unsigned int get_num_neighbors();
+    virtual void write_sort_arrays( thrust::device_vector<unsigned int> &sort_keys );
+  };
+}
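
A hedged usage sketch of the operator declared above; set_displacement_field() is assumed to be the base-class setter behind the offsets_ member used in the .cu file, since the resampleOperator base class is not part of this hunk:

#include "cuLinearResampleOperator.h"

using namespace Gadgetron;

// Deform a 2D image with a dense displacement field.
void deform_image( cuNDArray<float> *image,
                   boost::shared_ptr< cuNDArray<float> > displacement_field,
                   cuNDArray<float> *deformed )
{
  cuLinearResampleOperator<float,2> resampler;
  resampler.set_displacement_field(displacement_field);  // assumed base-class setter
  resampler.mult_M(image, deformed);                     // forward resampling
  // resampler.mult_MH_preprocess();                     // required before mult_MH (adjoint)
}
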
diff --git a/toolboxes/registration/optical_flow/gpu/cuOpticalFlowSolver.cu b/toolboxes/registration/optical_flow/gpu/cuOpticalFlowSolver.cu
new file mode 100644
index 0000000..e94238f
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuOpticalFlowSolver.cu
@@ -0,0 +1,303 @@
+#include "cuOpticalFlowSolver.h"
+#include "vector_td_utilities.h"
+#include "check_CUDA.h"
+
+#include <stdexcept>
+
+namespace Gadgetron{
+
+  //
+  // Kernel prototype declarations
+  //
+
+  template<class REAL, unsigned int D> __global__ 
+  void spatial_grad_kernel(const REAL*, const REAL*,REAL*,typename uint64d<D>::Type,unsigned int,unsigned int);
+
+  template<class REAL, unsigned int D> __global__ 
+  void temporal_grad_kernel(const REAL*, const REAL*,REAL*,typename uint64d<D>::Type,unsigned int,unsigned int);
+
+  // Cuda's min/max definitions can clash with the host compiler's, so we provide our own.
+  //
+
+  template <class T> __host__ __device__ const T& _cuOF_max (const T& a, const T& b) {
+    return (a<b)?b:a;
+  }
+
+  template <class T> __host__ __device__ const T& _cuOF_min (const T& a, const T& b) {
+    return (a>b)?b:a;
+  }
+
+  template<class T, unsigned int D> void
+  cuOpticalFlowSolver<T,D>::setup_grid( dim3 *blockDim, dim3* gridDim, 
+					   unsigned int number_of_elements, 
+					   unsigned int num_batches, 
+					   bool use_2d_blocks, 
+					   unsigned int num_unknowns )
+  {
+    int device;
+    cudaDeviceProp deviceProp; 
+  
+    if( cudaGetDevice( &device ) != cudaSuccess) {
+      throw std::runtime_error("cuOpticalFlowSolver::setup_grid(): unable to determine current device");
+    }
+    
+    if( cudaGetDeviceProperties( &deviceProp, device ) != cudaSuccess) {
+      throw std::runtime_error("cuOpticalFlowSolver::setup_grid(): unable to query current device");
+    }
+    
+    int max_blockdim = deviceProp.maxThreadsDim[0];
+    int max_griddim  = deviceProp.maxGridSize[0];
+    int warp_size    = deviceProp.warpSize;
+    
+    // For small arrays we keep the block dimension fairly small
+    if( use_2d_blocks )
+      *blockDim = dim3(((256/num_unknowns)/warp_size)*warp_size, num_unknowns);
+    else
+      *blockDim = dim3(256);
+  
+    *gridDim = dim3((number_of_elements+(blockDim->x*blockDim->y)-1)/(blockDim->x*blockDim->y), num_batches);
+
+    // Extend block/grid dimensions for large arrays
+    if( gridDim->x > max_griddim ){
+      if( use_2d_blocks )
+        blockDim->x = ((max_blockdim/num_unknowns)/warp_size)*warp_size;
+      else
+        blockDim->x = max_blockdim;
+    
+      gridDim->x = (number_of_elements+(blockDim->x*blockDim->y)-1)/(blockDim->x*blockDim->y);
+    }
+
+    if( gridDim->x > max_griddim ){
+      gridDim->x = ((unsigned int)std::sqrt((T)number_of_elements)+(blockDim->x*blockDim->y)-1)/(blockDim->x*blockDim->y);
+      gridDim->y *= ((number_of_elements+(blockDim->x*blockDim->y)*gridDim->x-1)/((blockDim->x*blockDim->y)*gridDim->x));
+    }
+   
+    if( gridDim->x > max_griddim || gridDim->y > max_griddim ){      
+      throw std::runtime_error("cuOpticalFlowSolver::setup_grid(): maximum grid dimensions exceeded");
+    }
+  }
+  
+  template<class T, unsigned int D> void
+  cuOpticalFlowSolver<T,D>::core_grad_spatial( T *fixed_image, T *moving_image, T *gradient_image, 
+						  typename uint64d<D>::Type matrix_size_moving, 
+						  size_t number_of_batches_fixed, 
+						  size_t number_of_batches_moving )
+  {        
+    unsigned int number_of_elements = prod(matrix_size_moving);
+    dim3 blockDim; dim3 gridDim;
+
+    setup_grid( &blockDim, &gridDim, number_of_elements, _cuOF_max(number_of_batches_moving, number_of_batches_fixed)*D );
+    
+    // Invoke kernel (spatial partial derivatives)
+    spatial_grad_kernel<T,D><<< gridDim, blockDim >>>
+      ( fixed_image, moving_image, gradient_image, matrix_size_moving, number_of_batches_fixed, number_of_batches_moving );
+    
+    CHECK_FOR_CUDA_ERROR();
+  }
+  
+  template<class T, unsigned int D> void
+  cuOpticalFlowSolver<T,D>::core_grad_temporal( T *fixed_image, T *moving_image, T *gradient_image, 
+						   typename uint64d<D>::Type matrix_size_moving, 
+						   size_t number_of_batches_fixed, 
+						   size_t number_of_batches_moving )
+  {        
+    unsigned int number_of_elements = prod(matrix_size_moving);
+    dim3 blockDim; dim3 gridDim;
+    
+    setup_grid( &blockDim, &gridDim, number_of_elements, _cuOF_max(number_of_batches_moving, number_of_batches_fixed) );
+    
+    // Invoke kernel (temporal partial derivative)
+    temporal_grad_kernel<T,D><<< gridDim, blockDim >>>
+      ( fixed_image, moving_image, gradient_image,
+        matrix_size_moving, number_of_batches_fixed, number_of_batches_moving );
+    
+    CHECK_FOR_CUDA_ERROR();
+  }
+  
+  // Helpers
+  //
+
+  template<unsigned int D> __device__ 
+  typename uint64d<D>::Type compute_stride( unsigned int dim )
+  {
+    typename uint64d<D>::Type res;
+  
+    for( unsigned int d=0; d<D; d++ ){
+      res.vec[d] = (d==dim) ? 1 : 0;
+    }
+    return res;
+  }
+
+  template<unsigned int D> __device__ 
+  bool is_border_pixel_in_stride_dim_before( unsigned int dim, typename uint64d<D>::Type co, typename uint64d<D>::Type dims )
+  {
+    if( co.vec[dim] == 0 )
+      return true;
+    else
+      return false;
+  }
+
+  template<unsigned int D> __device__ 
+  bool is_border_pixel_in_stride_dim_after( unsigned int dim, typename uint64d<D>::Type co, typename uint64d<D>::Type dims )
+  {
+    if( co.vec[dim] == (dims.vec[dim]-1) )
+      return true;
+    else
+      return false;
+  }
+
+  // Spatial partial derivatives
+  //
+
+  template<class REAL, unsigned int D> __global__ void
+  spatial_grad_kernel( const REAL * __restrict__ fixed_image, const REAL * __restrict__ moving_image, REAL * __restrict__ gradient_image,
+                       typename uint64d<D>::Type matrix_size, 
+                       unsigned int num_batches_fixed, unsigned int num_batches_moving )
+  {
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+
+    // Number of elements per partial derivative
+    const unsigned int num_elements_per_batch = prod(matrix_size);
+    const unsigned int num_elements_per_pdev_fixed = num_elements_per_batch*num_batches_fixed;
+    const unsigned int num_elements_per_pdev_moving = num_elements_per_batch*num_batches_moving;
+
+    // Total number of elements for all partial derivatives
+    const unsigned int num_elements_total = _cuOF_max(num_elements_per_pdev_fixed, num_elements_per_pdev_moving)*D;
+  
+    if( idx < num_elements_total ){
+    
+      // The (minimum) index in the slowest varying output dimension determines which partial derivative to compute 
+      const unsigned int stride_dim_fixed = idx/(num_elements_per_pdev_fixed);
+      const unsigned int stride_dim_moving = idx/(num_elements_per_pdev_moving);
+      const unsigned int stride_dim = _cuOF_min(stride_dim_fixed, stride_dim_moving);
+
+      // Local index to the partial derivative
+      const unsigned int idx_in_pdev_fixed = idx-stride_dim_fixed*num_elements_per_pdev_fixed;
+      const unsigned int idx_in_pdev_moving = idx-stride_dim_moving*num_elements_per_pdev_moving;
+
+      // Batch idx (second slowest varying dimension)   
+      const unsigned int batch_idx_fixed = idx_in_pdev_fixed/num_elements_per_batch;
+      const unsigned int batch_idx_moving = idx_in_pdev_moving/num_elements_per_batch;
+
+      // Local index to the batch (should be identical for the fixed/moving image)
+      const size_t idx_in_batch = idx_in_pdev_moving-batch_idx_moving*num_elements_per_batch;
+
+      // Local co to the image
+      const typename uint64d<D>::Type co = idx_to_co<D>( idx_in_batch, matrix_size );
+ 
+      REAL res;
+      unsigned int count = 0;
+
+      //
+      // Find partial derivatives using central differences
+      //
+    
+      typename uint64d<D>::Type stride = compute_stride<D>(stride_dim);
+    
+      const unsigned int base_idx_moving = batch_idx_moving*num_elements_per_batch;
+      const unsigned int base_idx_fixed = batch_idx_fixed*num_elements_per_batch;
+
+      unsigned int stride_base_idx, fixed_idx, moving_idx;
+     
+      // Neighbor "plus stride" side
+      if( !is_border_pixel_in_stride_dim_after<D>( stride_dim, co, matrix_size )){
+        stride_base_idx = co_to_idx<D>(co+stride, matrix_size);
+        count++;
+      }
+      else{
+        stride_base_idx = idx_in_batch;
+      }
+    
+      fixed_idx = stride_base_idx+base_idx_fixed;
+      moving_idx = stride_base_idx+base_idx_moving;
+    
+      res = (fixed_image[fixed_idx]+moving_image[moving_idx])*REAL(0.5);
+
+      // Neighbor "minus stride" side
+      if( !is_border_pixel_in_stride_dim_before<D>( stride_dim, co, matrix_size )){
+        stride_base_idx = co_to_idx<D>(co-stride, matrix_size);
+        count++;
+      }
+      else{
+        stride_base_idx = co_to_idx<D>(co, matrix_size);
+      }
+    
+      fixed_idx = stride_base_idx+base_idx_fixed;
+      moving_idx = stride_base_idx+base_idx_moving;
+    
+      res -= (fixed_image[fixed_idx]+moving_image[moving_idx])*REAL(0.5);
+
+      if( count == 2 ) // Both neighbors exist
+        res /= REAL(2);
+
+      // Output result
+      //
+    
+      gradient_image[idx] = res;
+    }
+  }
+
+  // Temporal partial derivatives
+  //
+
+  template<class REAL, unsigned int D> __global__ void
+  temporal_grad_kernel( const REAL * __restrict__ fixed_image, const REAL * __restrict__ moving_image, REAL * __restrict__ gradient_image,
+                        typename uint64d<D>::Type matrix_size, 
+                        unsigned int num_batches_fixed, unsigned int num_batches_moving )
+  { 
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+
+    // Number of elements per partial derivative
+    const unsigned int num_elements_per_batch = prod(matrix_size);
+    const unsigned int num_elements_per_pdev_fixed = num_elements_per_batch*num_batches_fixed;
+    const unsigned int num_elements_per_pdev_moving = num_elements_per_batch*num_batches_moving;
+
+    // Total number of elements for all partial derivatives
+    const unsigned int num_elements_total = _cuOF_max(num_elements_per_pdev_fixed, num_elements_per_pdev_moving);
+  
+    if( idx < num_elements_total ){
+    
+      const unsigned int stride_dim_fixed = idx/(num_elements_per_pdev_fixed);
+      const unsigned int stride_dim_moving = idx/(num_elements_per_pdev_moving);
+
+      // Local index to the partial derivative
+      const unsigned int idx_in_pdev_fixed = idx-stride_dim_fixed*num_elements_per_pdev_fixed;
+      const unsigned int idx_in_pdev_moving = idx-stride_dim_moving*num_elements_per_pdev_moving;
+
+      // Batch idx (second slowest varying dimension)   
+      const unsigned int batch_idx_fixed = idx_in_pdev_fixed/num_elements_per_batch;
+      const unsigned int batch_idx_moving = idx_in_pdev_moving/num_elements_per_batch;
+
+      // Local index to the batch (should be identical for the fixed/moving image)
+      const unsigned int idx_in_batch = idx_in_pdev_moving-batch_idx_moving*num_elements_per_batch;
+
+      const unsigned int base_idx_fixed = batch_idx_fixed*num_elements_per_batch;
+      const unsigned int base_idx_moving = batch_idx_moving*num_elements_per_batch;
+    
+      // Ctr pixel
+      const unsigned int fixed_idx = idx_in_batch+base_idx_fixed;
+      const unsigned int moving_idx = idx_in_batch+base_idx_moving;
+    
+      const REAL res = moving_image[moving_idx]-fixed_image[fixed_idx];
+    
+      // Output result
+      //
+    
+      gradient_image[idx] = res;        
+    }    
+  }
+
+  // 
+  // Template instantiation
+  //
+
+  template class EXPORTGPUREG cuOpticalFlowSolver<float,1>;
+  template class EXPORTGPUREG cuOpticalFlowSolver<float,2>;
+  template class EXPORTGPUREG cuOpticalFlowSolver<float,3>;
+  template class EXPORTGPUREG cuOpticalFlowSolver<float,4>;
+
+  template class EXPORTGPUREG cuOpticalFlowSolver<double,1>;
+  template class EXPORTGPUREG cuOpticalFlowSolver<double,2>;
+  template class EXPORTGPUREG cuOpticalFlowSolver<double,3>;
+  template class EXPORTGPUREG cuOpticalFlowSolver<double,4>;
+}
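
spatial_grad_kernel above estimates each spatial partial derivative from the average of the fixed and moving images, using a central difference in the interior and a one-sided difference at the borders (hence the division by two only when both neighbors exist). A host-side 1D reference of that scheme, for illustration only:

// Host-side 1D reference of the scheme in spatial_grad_kernel: average the
// fixed and moving images, central difference in the interior, one-sided
// difference at the two borders. Illustrative only.
#include <vector>

static std::vector<float> spatial_gradient_1d( const std::vector<float> &fixed,
                                               const std::vector<float> &moving )
{
  const int n = (int)fixed.size();
  std::vector<float> grad(n);
  for( int i = 0; i < n; i++ ){
    int ip = (i+1 < n) ? i+1 : i;   // "plus stride" neighbor, or self at the border
    int im = (i > 0) ? i-1 : i;     // "minus stride" neighbor, or self at the border
    float res = 0.5f*(fixed[ip]+moving[ip]) - 0.5f*(fixed[im]+moving[im]);
    if( ip != i && im != i )        // both neighbors exist
      res /= 2.0f;
    grad[i] = res;
  }
  return grad;
}
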
diff --git a/toolboxes/registration/optical_flow/gpu/cuOpticalFlowSolver.h b/toolboxes/registration/optical_flow/gpu/cuOpticalFlowSolver.h
new file mode 100644
index 0000000..0ee98cf
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuOpticalFlowSolver.h
@@ -0,0 +1,50 @@
+/** \file cuOpticalFlowSolver.h
+    \brief Abstract class for a GPU-based optical flow registration solver.
+
+    cuOpticalFlowSolver is derived from class opticalFlowSolver 
+    and implements the computation of the spatial and temporal gradients.
+    A pure virtual function is expected to implement the specific algorithm (Horn-Schunck, Cornelius-Kanade).
+*/
+
+#pragma once
+
+#include "cuNDArray.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_blas.h"
+#include "opticalFlowSolver.h"
+#include "gpureg_export.h"
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> class EXPORTGPUREG cuOpticalFlowSolver 
+    : public opticalFlowSolver< cuNDArray<T>,D >
+  {  
+  public:
+  
+    cuOpticalFlowSolver() : opticalFlowSolver< cuNDArray<T>,D >() {}   
+    virtual ~cuOpticalFlowSolver() {}
+    
+  protected:
+
+    // General tool to set up the block/grid dimensions
+    //
+
+    void setup_grid( dim3 *blockDim, dim3* gridDim, unsigned int number_of_elements, 
+                     unsigned int num_batches = 1, bool use_2d_blocks = false, unsigned int num_unknowns = D);  
+ 
+    // GPU-based computation of the spatial and temporal image gradient
+    //
+    
+    virtual void core_grad_spatial( T *fixed_image, T *moving_image, T *gradient_image, 
+                                    typename uint64d<D>::Type matrix_size_moving, 
+                                    size_t number_of_batches_fixed, 
+                                    size_t number_of_batches_moving );
+    
+    virtual void core_grad_temporal( T *fixed_image, T *moving_image, T *gradient_image, 
+                                     typename uint64d<D>::Type matrix_size_moving, 
+                                     size_t number_of_batches_fixed, 
+                                     size_t number_of_batches_moving );
+  };  
+}
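
cuOpticalFlowSolver supplies the grid setup and the GPU gradient computations; a concrete algorithm only has to provide core_solver(), as cuHSOpticalFlowSolver and cuCKOpticalFlowSolver do above. A hedged skeleton of a new variant (class name and method body are illustrative only):

#include "cuOpticalFlowSolver.h"

namespace Gadgetron{

  template<class T, unsigned int D> class cuMyOpticalFlowSolver
    : public cuOpticalFlowSolver<T,D>
  {
  public:
    cuMyOpticalFlowSolver() : cuOpticalFlowSolver<T,D>() {}
    virtual ~cuMyOpticalFlowSolver() {}

  protected:
    // Receives the concatenated spatial+temporal gradients and an optional
    // stencil image; must return the estimated displacement field.
    virtual boost::shared_ptr< cuNDArray<T> >
      core_solver( cuNDArray<T> *gradient_image, cuNDArray<T> *stencil_image );
  };
}
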
diff --git a/toolboxes/registration/optical_flow/gpu/cuResampleOperator.cu b/toolboxes/registration/optical_flow/gpu/cuResampleOperator.cu
new file mode 100644
index 0000000..a2731c1
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuResampleOperator.cu
@@ -0,0 +1,107 @@
+#include "cuResampleOperator.h"
+
+#include <thrust/host_vector.h>
+#include <thrust/generate.h>
+#include <thrust/pair.h>
+#include <thrust/sort.h> 
+#include <thrust/binary_search.h>
+#include <thrust/iterator/counting_iterator.h>
+
+using namespace thrust;
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> void 
+  cuResampleOperator<T,D>::mult_MH_preprocess()
+  {
+    this->preprocessed_ = false;
+  
+    // Check if a displacement field has been provided
+    //
+  
+    if( !this->offsets_.get() ){
+      throw cuda_error("cuResampleOperator::mult_MH_preprocess(): displacement field not set.");
+    }
+
+    // Make a device vector wrap of the displacement field
+    //
+
+    std::vector<size_t> _dims_disp = *this->offsets_->get_dimensions(); _dims_disp.pop_back(); 
+    unsigned int num_elements_disp = D;
+    while(!_dims_disp.empty()){
+      num_elements_disp *= _dims_disp.back();
+      _dims_disp.pop_back();
+    }
+  
+    device_vector<REAL> displacements
+      ( device_pointer_cast<REAL>(this->offsets_->get_data_ptr()), 
+        device_pointer_cast<REAL>(this->offsets_->get_data_ptr()+num_elements_disp) );
+  
+    // Make sort keys/values array from the deformation field
+    //
+
+    unsigned int num_elements_sort = num_elements_disp/D;
+  
+    this->lower_bounds_ = device_vector<unsigned int>(num_elements_sort);
+    this->upper_bounds_ = device_vector<unsigned int>(num_elements_sort);
+  
+    this->indices_ = device_vector<unsigned int>(get_num_neighbors()*num_elements_sort);
+    this->weights_ = device_vector<REAL>(get_num_neighbors()*num_elements_sort);
+
+    device_vector<unsigned int> sort_keys = device_vector<unsigned int>
+      (get_num_neighbors()*num_elements_sort);
+  
+    // Fill arrays
+    //
+
+    write_sort_arrays(sort_keys);
+    
+    // Make copy of sort_keys before the sort modifies it
+    //
+
+    device_vector<unsigned int> sort_keys_copy(sort_keys);
+  
+    // Sort (twice since we have two value arrays)
+    //
+
+    sort_by_key(sort_keys.begin(), sort_keys.end(), this->indices_.begin() );
+    sort_by_key(sort_keys_copy.begin(), sort_keys_copy.end(), this->weights_.begin() );
+  
+    // Find start/end indices (buckets) in the two values arrays
+    //
+  
+    counting_iterator<unsigned int> search_begin(0);
+    
+    lower_bound( sort_keys.begin(), sort_keys.end(), 
+		 search_begin, search_begin + num_elements_sort, this->lower_bounds_.begin() );
+  
+    upper_bound( sort_keys.begin(), sort_keys.end(), 
+		 search_begin, search_begin + num_elements_sort, this->upper_bounds_.begin() );
+    
+    this->preprocessed_ = true;
+  }
+
+  template class EXPORTGPUREG cuResampleOperator<float,1>;
+  template class EXPORTGPUREG cuResampleOperator<float_complext,1>;
+
+  template class EXPORTGPUREG cuResampleOperator<float,2>;
+  template class EXPORTGPUREG cuResampleOperator<float_complext,2>;
+
+  template class EXPORTGPUREG cuResampleOperator<float,3>;
+  template class EXPORTGPUREG cuResampleOperator<float_complext,3>;
+
+  template class EXPORTGPUREG cuResampleOperator<float,4>;
+  template class EXPORTGPUREG cuResampleOperator<float_complext,4>;
+
+  template class EXPORTGPUREG cuResampleOperator<double,1>;
+  template class EXPORTGPUREG cuResampleOperator<double_complext,1>;
+
+  template class EXPORTGPUREG cuResampleOperator<double,2>;
+  template class EXPORTGPUREG cuResampleOperator<double_complext,2>;
+
+  template class EXPORTGPUREG cuResampleOperator<double,3>;
+  template class EXPORTGPUREG cuResampleOperator<double_complext,3>;
+
+  template class EXPORTGPUREG cuResampleOperator<double,4>;
+  template class EXPORTGPUREG cuResampleOperator<double_complext,4>;
+}
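
mult_MH_preprocess() above turns the forward scatter into a gather for the adjoint: it sorts the (target index, source index, weight) tuples by target index and then uses lower_bound/upper_bound over a counting iterator to find each output pixel's bucket of contributions. A minimal, self-contained sketch of that Thrust pattern on toy data:

// Sketch of the sort + lower_bound/upper_bound bucketing used by
// mult_MH_preprocess(). Toy data; illustrative only.
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/binary_search.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/copy.h>
#include <iostream>

int main()
{
  // Each entry scatters a weight onto a target pixel (keys) from a source index (values).
  unsigned int k[6] = {2, 0, 2, 1, 0, 2};
  unsigned int v[6] = {10, 11, 12, 13, 14, 15};
  thrust::device_vector<unsigned int> keys(6), values(6);
  thrust::copy(k, k+6, keys.begin());
  thrust::copy(v, v+6, values.begin());

  // Sort the values by target pixel, as in mult_MH_preprocess().
  thrust::sort_by_key(keys.begin(), keys.end(), values.begin());

  // For each output pixel, find the [lower, upper) range of contributions.
  const unsigned int num_pixels = 3;
  thrust::device_vector<unsigned int> lower(num_pixels), upper(num_pixels);
  thrust::counting_iterator<unsigned int> search_begin(0);
  thrust::lower_bound(keys.begin(), keys.end(),
                      search_begin, search_begin + num_pixels, lower.begin());
  thrust::upper_bound(keys.begin(), keys.end(),
                      search_begin, search_begin + num_pixels, upper.begin());

  for( unsigned int p = 0; p < num_pixels; p++ )
    std::cout << "pixel " << p << ": contributions [" << lower[p]
              << ", " << upper[p] << ")" << std::endl;
  return 0;
}
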
diff --git a/toolboxes/registration/optical_flow/gpu/cuResampleOperator.h b/toolboxes/registration/optical_flow/gpu/cuResampleOperator.h
new file mode 100644
index 0000000..e73f1be
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuResampleOperator.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include "cuNDArray_math.h"
+#include "resampleOperator.h"
+#include "gpureg_export.h"
+
+#include <thrust/device_vector.h>
+
+namespace Gadgetron{
+
+  template <class T, unsigned int D>
+  class EXPORTGPUREG cuResampleOperator : public resampleOperator< cuNDArray<typename realType<T>::Type>, cuNDArray<T> >
+  {    
+  public:
+
+    typedef typename realType<T>::Type REAL;
+    
+    cuResampleOperator() : resampleOperator< cuNDArray<REAL>, cuNDArray<T> >() {}
+    virtual ~cuResampleOperator() {}
+  
+    virtual void reset()
+    {
+      lower_bounds_ = thrust::device_vector<unsigned int>();
+      upper_bounds_ = thrust::device_vector<unsigned int>();
+      indices_ = thrust::device_vector<unsigned int>();
+      weights_ = thrust::device_vector<REAL>();
+      resampleOperator< cuNDArray<typename realType<T>::Type>, cuNDArray<T> >::reset();
+    }
+    
+    virtual void mult_MH_preprocess();
+  
+  protected:
+    virtual unsigned int get_num_neighbors() = 0;
+    virtual void write_sort_arrays( thrust::device_vector<unsigned int> &sort_keys ) = 0;
+    
+  protected:
+    thrust::device_vector<unsigned int> lower_bounds_;
+    thrust::device_vector<unsigned int> upper_bounds_;
+    thrust::device_vector<unsigned int> indices_;
+    thrust::device_vector<REAL> weights_;
+  };
+}
diff --git a/toolboxes/registration/optical_flow/gpu/cuResampleOperator_macros.h b/toolboxes/registration/optical_flow/gpu/cuResampleOperator_macros.h
new file mode 100644
index 0000000..26072e8
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuResampleOperator_macros.h
@@ -0,0 +1,248 @@
+#pragma once
+
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+#include "complext.h"
+
+/* 
+   This macro definition is a workaround 
+   for missing pure virtual device function support in Cuda.
+   
+   We provide this macro to avoid explicitly duplicating 
+   the code below in every "cuResampleOperator-inherited" class.
+*/
+
+#define DECLARE_CU_RESAMPLE_OPERATOR_SUPPORT(COMPONENT)                 \
+                                                                        \
+  template<class T, unsigned int D> __global__ void                     \
+  mult_M_kernel_batch( T *in, T *out,                                   \
+                       typename realType<T>::Type *displacements,       \
+                       typename uintd<D>::Type matrix_size, unsigned int num_batches ) \
+  {                                                                     \
+    typedef typename realType<T>::Type REAL;                            \
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x; \
+    const unsigned int num_elements = prod(matrix_size);                \
+                                                                        \
+    if( idx < num_elements*num_batches ){                               \
+                                                                        \
+      const unsigned int batch_no = idx/num_elements;                   \
+      const unsigned int idx_in_batch = idx-batch_no*num_elements;      \
+      const typename uintd<D>::Type co = idx_to_co<D>( idx_in_batch, matrix_size ); \
+                                                                        \
+      typename reald<REAL,D>::Type co_disp = vector_td<REAL,D>(co); \
+      for( unsigned int dim=0; dim<D; dim++ )                           \
+        co_disp.vec[dim] +=  displacements[dim*num_elements+idx_in_batch]; \
+                                                                        \
+      out[idx] = interpolate<T,D>( batch_no, co_disp, matrix_size, in ); \
+    }                                                                   \
+  }                                                                     \
+                                                                        \
+  template<class T, unsigned int D> __global__ void                     \
+  mult_M_kernel_extended( T *in, T *out,                                \
+                          typename realType<T>::Type *displacements,    \
+                          typename uintd<D>::Type matrix_size,          \
+                          unsigned int num_elements_in,                 \
+                          unsigned int extended_size )                  \
+  {                                                                     \
+    typedef typename realType<T>::Type REAL;                            \
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x; \
+    const unsigned int num_elements_mat = prod(matrix_size);            \
+    const unsigned int num_elements_ext = prod(matrix_size)*extended_size; \
+                                                                        \
+    if( idx < num_elements_ext ){                                       \
+                                                                        \
+      const unsigned int batch_no = idx/num_elements_mat;               \
+      const unsigned int idx_in_batch = idx-batch_no*num_elements_mat;	\
+                                                                        \
+      const typename uintd<D>::Type co = idx_to_co<D>( idx_in_batch, matrix_size ); \
+                                                                        \
+      typename reald<REAL,D>::Type co_disp = vector_td<REAL,D>(co); \
+      for( unsigned int dim=0; dim<D; dim++ )                           \
+        co_disp.vec[dim] +=  displacements[dim*num_elements_ext+batch_no*num_elements_mat+idx_in_batch]; \
+                                                                        \
+      out[idx] = interpolate<T,D>( (idx >= num_elements_in) ? 0 : batch_no, \
+                                   co_disp, matrix_size, in );          \
+    }                                                                   \
+  }                                                                     \
+                                                                        \
+  template<class T, unsigned int D> void                                \
+  cu##COMPONENT<T,D>::mult_M( cuNDArray<T> *in, cuNDArray<T> *out, bool accumulate ) \
+  {                                                                     \
+    if( !in || !out ){                                                  \
+      throw cuda_error("cuResampleOperator::mult_M(): illegal input/output array."); \
+    }                                                                   \
+                                                                        \
+    if( !this->offsets_.get() ){                                        \
+      throw cuda_error("cuResampleOperator::mult_M(): displacement field not set."); \
+    }                                                                   \
+                                                                        \
+    cuNDArray<T> tmp;                                                   \
+    if( accumulate ){                                                   \
+      tmp = *out;                                                       \
+    }                                                                   \
+                                                                        \
+    unsigned int num_disp_vectors = this->get_number_of_displacement_vectors(); \
+    int surplus = this->offsets_->get_number_of_dimensions()-D;         \
+                                                                        \
+    if( !( surplus == 1 || surplus == 2) || this->offsets_->get_size(D-1+surplus) < D ){ \
+      throw cuda_error("cuResampleOperator::mult_M(): unexpected dimensions of displacement field."); \
+    }                                                                   \
+                                                                        \
+    if( surplus == 1 ){                                                 \
+      if( in->get_number_of_elements() != out->get_number_of_elements() ){ \
+        throw cuda_error("cuResampleOperator::mult_M(): in/out array dimensions mismatch (1)."); \
+      }                                                                 \
+      if( (in->get_number_of_elements() % num_disp_vectors ) != 0 ){    \
+        throw cuda_error("cuResampleOperator::mult_M(): in/out array dimensions mismatch displacement field."); \
+      }                                                                 \
+    }                                                                   \
+                                                                        \
+    if( surplus == 2 ){                                                 \
+      if( (out->get_number_of_elements() % in->get_number_of_elements()) != 0 ){ \
+        throw cuda_error("cuResampleOperator::mult_M(): in/out array dimensions mismatch (2)."); \
+      }                                                                 \
+      if( out->get_number_of_dimensions() != (D+1) || out->get_number_of_elements() != num_disp_vectors ){ \
+        throw cuda_error("cuResampleOperator::mult_M(): output array dimensions mismatch displacement field."); \
+      }                                                                 \
+    }                                                                   \
+                                                                        \
+    typename uint64d<D>::Type matrix_size = from_std_vector<size_t,D>(*in->get_dimensions().get()); \
+    unsigned int num_elements_mat = prod(matrix_size);                  \
+    unsigned int num_batches = (surplus == 2) ? 1 : in->get_number_of_elements() / num_elements_mat; \
+    unsigned int extended_dim = (surplus == 1) ? 1 : out->get_size(D);	\
+                                                                        \
+    dim3 blockDim, gridDim;                                             \
+                                                                        \
+    if( surplus == 1 ){                                                 \
+      setup_grid( num_elements_mat, &blockDim, &gridDim, num_batches ); \
+    }                                                                   \
+    else{                                                               \
+      setup_grid( num_elements_mat*extended_dim, &blockDim, &gridDim ); \
+    }                                                                   \
+                                                                        \
+    if( surplus == 1 ) {                                                \
+      mult_M_kernel_batch<T,D><<< gridDim, blockDim >>>                 \
+        ( in->get_data_ptr(), out->get_data_ptr(),                      \
+          this->offsets_->get_data_ptr(), vector_td<unsigned int,D>(matrix_size), num_batches ); \
+    }                                                                   \
+    else{                                                               \
+      mult_M_kernel_extended<T,D><<< gridDim, blockDim >>>              \
+        ( in->get_data_ptr(), out->get_data_ptr(), this->offsets_->get_data_ptr(), \
+          vector_td<unsigned int,D>(matrix_size), in->get_number_of_elements(), extended_dim ); \
+    }                                                                   \
+                                                                        \
+    CHECK_FOR_CUDA_ERROR();                                             \
+                                                                        \
+    if( accumulate ){                                                   \
+      *out += tmp;                                                      \
+    }                                                                   \
+  }                                                                     \
+                                                                        \
+  template<class T, unsigned int D> __global__ void                     \
+  mult_MH_kernel( T *in, T *out, typename realType<T>::Type *weights,   \
+                  unsigned int *indices, unsigned int *lower_bounds, unsigned int *upper_bounds, \
+                  unsigned int num_elements, unsigned int num_batches ) \
+  {                                                                     \
+    typedef typename realType<T>::Type REAL;                            \
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x; \
+                                                                        \
+    if( idx < num_elements*num_batches ){                               \
+                                                                        \
+      const unsigned int batch_no = idx/num_elements;                   \
+      const unsigned int idx_in_batch = idx-batch_no*num_elements;      \
+                                                                        \
+      const unsigned int lower_bound = lower_bounds[idx_in_batch];      \
+      const unsigned int upper_bound = upper_bounds[idx_in_batch];      \
+                                                                        \
+      T val = T(0);                                                     \
+                                                                        \
+      if( lower_bound > upper_bound ||                                  \
+          lower_bound >= (_get_num_neighbors<D>()*num_elements) ||      \
+          upper_bound >= (_get_num_neighbors<D>()*num_elements) ){      \
+                                                                        \
+        out[idx] = T(0);                                                \
+        return;                                                         \
+      }                                                                 \
+                                                                        \
+      for( unsigned int i=lower_bound; i<upper_bound; i++ ){            \
+        unsigned int in_idx = indices[i];                               \
+        if( in_idx >= num_elements ){                                   \
+          val = T(0);                                                   \
+          continue;                                                     \
+        }                                                               \
+        REAL weight = weights[i];                                       \
+        val += (in[in_idx+batch_no*num_elements]*weight);               \
+      }                                                                 \
+      out[idx] = val;                                                   \
+    }                                                                   \
+  }                                                                     \
+                                                                        \
+  template<class T, unsigned int D> void                                \
+  cu##COMPONENT<T,D>::mult_MH( cuNDArray<T> *in, cuNDArray<T> *out, bool accumulate ) \
+  {                                                                     \
+    if( !in || !out ){                                                  \
+      throw cuda_error("cuResampleOperator::mult_MH(): illegal input/output array."); \
+    }                                                                   \
+                                                                        \
+    if( !this->preprocessed_ ){                                         \
+      throw cuda_error("cuResampleOperator::mult_MH(): preprocessing has not been performed."); \
+    }                                                                   \
+                                                                        \
+    cuNDArray<T> tmp;                                                   \
+    if( accumulate ){                                                   \
+      tmp = *out;                                                       \
+    }                                                                   \
+                                                                        \
+    unsigned int num_disp_vectors = this->get_number_of_displacement_vectors(); \
+    int surplus = this->offsets_->get_number_of_dimensions()-D;         \
+                                                                        \
+    if( surplus == 1 ){                                                 \
+      if( in->get_number_of_elements() != out->get_number_of_elements() ){ \
+        throw cuda_error("cuResampleOperator::mult_MH(): in/out array dimensions mismatch (1)."); \
+      }                                                                 \
+      if( (in->get_number_of_elements() % num_disp_vectors ) != 0 ){    \
+        throw cuda_error("cuResampleOperator::mult_MH(): in/out array dimensions mismatch displacement field (1)."); \
+      }                                                                 \
+    }                                                                   \
+                                                                        \
+    if( surplus == 2 ){                                                 \
+      if( (in->get_number_of_elements() % out->get_number_of_elements()) != 0 ){ \
+        throw cuda_error("cuResampleOperator::mult_MH(): in/out array dimensions mismatch (2)."); \
+      }                                                                 \
+      if( in->get_number_of_dimensions() != (D+1) || in->get_number_of_elements() != num_disp_vectors ){ \
+        throw cuda_error("cuResampleOperator::mult_MH(): output array dimensions mismatch displacement field."); \
+      }                                                                 \
+    }                                                                   \
+                                                                        \
+    cuNDArray<T> *tmp_out = out; bool mod_out = false;                  \
+    if( surplus == 2 && (in->get_number_of_elements()/out->get_number_of_elements()) > 1 ){ \
+      mod_out = true;                                                   \
+      tmp_out = new cuNDArray<T>(in->get_dimensions().get());           \
+    }                                                                   \
+                                                                        \
+    typename uint64d<D>::Type matrix_size = from_std_vector<size_t,D>( *this->offsets_->get_dimensions().get() ); \
+    unsigned int num_batches = (surplus == 2) ? 1 : in->get_number_of_elements() / prod(matrix_size); \
+    unsigned int extended_dim = (surplus == 1) ? 1 : in->get_size(D);   \
+    unsigned int num_elements = prod(matrix_size)*extended_dim;         \
+                                                                        \
+    dim3 blockDim, gridDim;                                             \
+                                                                        \
+    setup_grid( num_elements, &blockDim, &gridDim, num_batches );       \
+    mult_MH_kernel<T,D><<< gridDim, blockDim >>>                        \
+      ( in->get_data_ptr(), tmp_out->get_data_ptr(),                    \
+        raw_pointer_cast(&this->weights_[0]), raw_pointer_cast(&this->indices_[0]), \
+        raw_pointer_cast(&this->lower_bounds_[0]), raw_pointer_cast(&this->upper_bounds_[0]), \
+        num_elements, num_batches );                                    \
+                                                                        \
+    if( mod_out ){                                                      \
+      *out = *sum<T>( tmp_out, D );                                     \
+      delete tmp_out;                                                   \
+    }                                                                   \
+                                                                        \
+    CHECK_FOR_CUDA_ERROR();                                             \
+                                                                        \
+    if( accumulate ){                                                   \
+      *out += tmp;                                                      \
+    }                                                                   \
+  }
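
Stripped of the CUDA grid bookkeeping and bounds checks, mult_MH_kernel performs, for every output voxel, a gather of the precomputed neighbour contributions between that voxel's lower and upper bound into a weighted sum, i.e. it evaluates one row of the adjoint resampling matrix. A scalar CPU sketch of that gather (an illustration only, not part of the toolbox):

    #include <cstddef>
    #include <vector>

    // out[k] = sum over i in [lb[k], ub[k]) of w[i] * in[idx[i]] -- the per-voxel
    // gather done by mult_MH_kernel, without batching or device specifics.
    void adjoint_gather(const std::vector<float>&        in,
                        const std::vector<float>&        w,
                        const std::vector<unsigned int>& idx,
                        const std::vector<unsigned int>& lb,
                        const std::vector<unsigned int>& ub,
                        std::vector<float>&              out)
    {
      for (std::size_t k = 0; k < out.size(); ++k) {
        float val = 0.0f;
        for (unsigned int i = lb[k]; i < ub[k]; ++i)
          val += w[i] * in[idx[i]];
        out[k] = val;
      }
    }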
diff --git a/toolboxes/registration/optical_flow/gpu/gpureg_export.h b/toolboxes/registration/optical_flow/gpu/gpureg_export.h
new file mode 100644
index 0000000..6690a4c
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/gpureg_export.h
@@ -0,0 +1,14 @@
+#ifndef _GPUREG_EXPORT_H_
+#define _GPUREG_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_GPUREG__) || defined (gpureg_EXPORTS)
+#define EXPORTGPUREG __declspec(dllexport)
+#else
+#define EXPORTGPUREG __declspec(dllimport)
+#endif
+#else
+#define EXPORTGPUREG
+#endif
+
+#endif /* _GPUREG_EXPORT_H_ */
diff --git a/toolboxes/registration/optical_flow/multiresRegistrationSolver.h b/toolboxes/registration/optical_flow/multiresRegistrationSolver.h
new file mode 100644
index 0000000..4c0c6bc
--- /dev/null
+++ b/toolboxes/registration/optical_flow/multiresRegistrationSolver.h
@@ -0,0 +1,263 @@
+/** \file multiresRegistrationSolver.h
+    Abstract class defining a multiresolution registration solver.
+    Pure virtual functions are expected to do the actual work.
+*/
+
+#pragma once
+
+#include "registrationSolver.h"
+#include "vector_td_utilities.h"
+#include "vector_td_operators.h"
+
+namespace Gadgetron{
+
+  template<class ARRAY_TYPE_REAL, unsigned int D> class multiresRegistrationSolver
+    : public registrationSolver<ARRAY_TYPE_REAL>
+  {
+  protected:
+    typedef typename ARRAY_TYPE_REAL::element_type REAL;
+
+  public:
+
+    multiresRegistrationSolver() : registrationSolver<ARRAY_TYPE_REAL>(){
+      num_multires_levels_ = 0;
+      max_num_iterations_per_level_ = 500;
+    }
+
+    virtual ~multiresRegistrationSolver() {}
+
+    // Utilities to specify the registration settings
+    //
+
+    virtual void set_num_multires_levels( unsigned int levels ) {
+      num_multires_levels_ = levels; }
+
+    virtual void set_max_num_iterations_per_level( unsigned int iterations ) {
+      max_num_iterations_per_level_ = iterations; }
+
+    //
+    // The main solver interface
+    //
+
+    virtual boost::shared_ptr<ARRAY_TYPE_REAL> solve( registrationData<ARRAY_TYPE_REAL> *rd )
+    {
+      return registrationSolver<ARRAY_TYPE_REAL>::solve(rd);
+    }
+  
+    virtual boost::shared_ptr<ARRAY_TYPE_REAL> solve(
+                                                     ARRAY_TYPE_REAL *fixed_image,
+                                                     ARRAY_TYPE_REAL *moving_image,
+                                                     bool input_normalization_allowed = false  )
+    {
+      // Some initial validity tests
+      //
+
+      if( !fixed_image || !moving_image ){
+        throw std::runtime_error("multiresRegistrationSolver::solve : invalid input pointer.");
+      }
+
+      if( !this->interpolator_.get() ){
+        throw std::runtime_error("multiresRegistrationSolver::solve : interpolator not set.");
+      }
+
+      typename uint64d<D>::Type fixed_dims = from_std_vector<size_t,D>(*moving_image->get_dimensions());
+      typename uint64d<D>::Type moving_dims = from_std_vector<size_t,D>(*fixed_image->get_dimensions());
+
+      if(!(fixed_dims == moving_dims)){
+        throw std::runtime_error("multiresRegistrationSolver::solve : fixed/moving image base dimensions mismatch.");
+      }
+
+      if( weak_less_equal(fixed_dims>>num_multires_levels_, vector_td<size_t, D>(1)) ){
+        throw std::runtime_error("multiresRegistrationSolver::solve : too many multiresolution levels for image dimensionality.");
+      }
+
+      // Normalize the input
+      //
+
+      ARRAY_TYPE_REAL *normalized_fixed;
+      ARRAY_TYPE_REAL *normalized_moving;
+
+      boost::shared_ptr<ARRAY_TYPE_REAL> garbage_collector_fixed, garbage_collector_moving;
+      bool use_padding = padding_required(fixed_dims);
+
+      if( input_normalization_allowed ){
+        if( use_padding ){
+          throw std::runtime_error("multiresRegistrationSolver::solve : input normalization not possible as image padding is required.");
+        }
+        else{
+          normalized_fixed = fixed_image;
+          normalized_moving = moving_image;
+        }
+      }
+      else{
+        if( use_padding ){
+          garbage_collector_fixed = pad<REAL,D>(round_pow2(fixed_dims), fixed_image);
+          garbage_collector_moving = pad<REAL,D>(round_pow2(moving_dims), moving_image);
+          normalized_fixed = garbage_collector_fixed.get();
+          normalized_moving = garbage_collector_moving.get();
+        }
+        else{
+          normalized_fixed = new ARRAY_TYPE_REAL(*fixed_image);
+          normalized_moving = new ARRAY_TYPE_REAL(*moving_image);
+          garbage_collector_fixed = boost::shared_ptr<ARRAY_TYPE_REAL>(normalized_fixed);
+          garbage_collector_moving = boost::shared_ptr<ARRAY_TYPE_REAL>(normalized_moving);
+        }
+      }
+
+      normalize(normalized_fixed, REAL(1));
+      normalize(normalized_moving, REAL(1));
+
+      // Invoke multi-resolution solver
+      //
+
+      if( this->output_mode_ >= registrationSolver<ARRAY_TYPE_REAL>::OUTPUT_VERBOSE ) {
+        GDEBUG_STREAM(std::endl << "Starting multiresolution registration " <<  std::endl);
+      }
+
+      boost::shared_ptr<ARRAY_TYPE_REAL> result =
+        solveMultiRes( num_multires_levels_, normalized_fixed, normalized_moving, this->stencil_.get() );
+
+      if( use_padding ){
+        result = crop<REAL,D>( (round_pow2(fixed_dims)-fixed_dims)>>2, fixed_dims, result.get());
+      }
+
+      return result;
+    }
+
+  protected:
+
+    // Pure virtual functions to be implemented in a subclass
+    //
+
+    virtual void compute( ARRAY_TYPE_REAL *fixed_image, ARRAY_TYPE_REAL *moving_image, ARRAY_TYPE_REAL *stencil_image, 
+                          boost::shared_ptr<ARRAY_TYPE_REAL> &result ) = 0;
+
+    // The recursive multi-resolution solver
+    //
+
+    virtual boost::shared_ptr<ARRAY_TYPE_REAL> solveMultiRes(
+                                                             unsigned int res_level,
+                                                             ARRAY_TYPE_REAL *fixed_image,
+                                                             ARRAY_TYPE_REAL *moving_image,
+                                                             ARRAY_TYPE_REAL *stencil_image )
+    {
+      boost::shared_ptr<ARRAY_TYPE_REAL> result;
+
+      if (res_level>0){
+
+        //
+        // We are not yet at the end of the multi-resolution chain
+        //
+
+        // Downsample input images (and stencil if provided)
+        //
+
+        boost::shared_ptr<ARRAY_TYPE_REAL> fixed_image_lowres = downsample<REAL,D>(fixed_image);
+        boost::shared_ptr<ARRAY_TYPE_REAL> moving_image_lowres = downsample<REAL,D>(moving_image);
+        boost::shared_ptr<ARRAY_TYPE_REAL> stencil_image_lowres =
+          ((stencil_image) ? downsample<REAL,D>(stencil_image) : boost::shared_ptr<ARRAY_TYPE_REAL>());
+
+        // Compute displacement field at the downsampled resolution
+        //
+
+        boost::shared_ptr<ARRAY_TYPE_REAL> result_lowres =
+          solveMultiRes( res_level-1, fixed_image_lowres.get(), moving_image_lowres.get(), stencil_image_lowres.get() );
+
+        // Clean up low resolution image data
+        //
+
+        fixed_image_lowres.reset();
+        moving_image_lowres.reset();
+        stencil_image_lowres.reset();
+
+        // Upsample lowres results to current resolution
+        //
+
+        result = upsample<REAL,D>(result_lowres.get());
+        *result *= REAL(2); // Scale the flow vectors since the resolution is now twice as high
+
+        // Clean up low resolution result
+        //
+
+        result_lowres.reset();
+
+        // Some output to track our progress at runtime
+        //
+
+        if( this->output_mode_ >= registrationSolver<ARRAY_TYPE_REAL>::OUTPUT_VERBOSE ) {
+          GDEBUG_STREAM(std::endl << "Multiresolution level " << res_level);
+        }
+
+        // Use estimated (lowres) motion to compute displacements at the current resolution
+        //
+
+        boost::shared_ptr<ARRAY_TYPE_REAL> def_moving_image = this->deform( moving_image, result );
+      
+        // Compute registration at the current multiresolution level
+        //
+
+        compute( fixed_image, def_moving_image.get(), stencil_image, result );
+      }	
+      else{
+
+        //
+        // We are now at the end of the multi-resolution chain
+        //
+
+        // Some output to track our progress at runtime
+        //
+
+        if( this->output_mode_ >= registrationSolver<ARRAY_TYPE_REAL>::OUTPUT_VERBOSE ) {
+          GDEBUG_STREAM(std::endl << "Multiresolution level " << res_level << " (lowest)");
+        }
+
+        // Compute displacements at the current resolution (no estimate can be provided)
+        //
+
+        compute( fixed_image, moving_image, stencil_image, result );
+      }
+
+      return result;
+    }
+
+    virtual bool padding_required( typename uint64d<D>::Type dims )
+    {
+      bool padding_required = false;
+      typename uint64d<D>::Type ones(1);
+      typename uint64d<D>::Type twos(2);
+
+      for( unsigned int i=0; i<num_multires_levels_; i++ ){
+
+        dims /= (size_t)2;
+
+        if( weak_less( dims, (size_t)12*ones ) ){
+          throw std::runtime_error("multiresRegistrationSolver::padding_required : resolution too low. Too many multiresolution levels specified?");
+        }
+
+        if( weak_equal(dims%twos, ones) ){
+          padding_required = true;
+        }
+      }
+      return padding_required;
+    }
+
+  protected:
+    unsigned int num_multires_levels_;
+    unsigned int max_num_iterations_per_level_;
+
+  private:
+    typename uint64d<D>::Type round_pow2(typename uint64d<D>::Type v)
+    {
+      typename uint64d<D>::Type ones(1);
+      typename uint64d<D>::Type out = v-ones;
+      for( unsigned int d=0; d<D; d++ ){
+        out[d] |= out[d] >> 1;
+        out[d] |= out[d] >> 2;
+        out[d] |= out[d] >> 4;
+        out[d] |= out[d] >> 8;
+        out[d] |= out[d] >> 16;
+      }
+      return out+ones;
+    }
+  };
+}
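
The private round_pow2() helper above rounds each image dimension up to the next power of two by smearing the highest set bit into all lower bits and adding one; padding_required() triggers this whenever a downsampled dimension would become odd. A minimal standalone sketch of the same bit trick on a single value (illustration only):

    #include <cstddef>
    #include <iostream>

    // Round v (>= 1) up to the next power of two, mirroring
    // multiresRegistrationSolver::round_pow2() for a single dimension.
    static std::size_t round_pow2(std::size_t v)
    {
      std::size_t out = v - 1;
      out |= out >> 1;
      out |= out >> 2;
      out |= out >> 4;
      out |= out >> 8;
      out |= out >> 16;
      return out + 1;
    }

    int main()
    {
      // 192 -> 256, while 256 is already a power of two and stays unchanged.
      std::cout << round_pow2(192) << " " << round_pow2(256) << std::endl;
      return 0;
    }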
diff --git a/toolboxes/registration/optical_flow/opticalFlowOperator.h b/toolboxes/registration/optical_flow/opticalFlowOperator.h
new file mode 100644
index 0000000..761f6f1
--- /dev/null
+++ b/toolboxes/registration/optical_flow/opticalFlowOperator.h
@@ -0,0 +1,68 @@
+#pragma once
+#include "linearOperator.h"
+#include <numeric>
+#include <functional>
+namespace Gadgetron {
+
+  template<class ARRAY_TYPE,class partialDerivOp, unsigned int D> class opticalFlowOperator : public linearOperator<ARRAY_TYPE >{
+  public:
+    typedef typename ARRAY_TYPE::element_type T;
+
+    opticalFlowOperator(){};
+
+    opticalFlowOperator(ARRAY_TYPE* moving,ARRAY_TYPE* stat){
+      set_images(moving,stat);
+    }
+
+    virtual ~opticalFlowOperator(){};
+
+    virtual void mult_M(ARRAY_TYPE* in,ARRAY_TYPE* out,bool accumulate){
+
+      if (!accumulate) clear(out);
+      std::vector<size_t> dims = *in->get_dimensions();
+      if (dims.back() != D) throw std::runtime_error("Input array for optical flow has the wrong last dimensions");
+      dims.pop_back();
+
+      size_t elements = std::accumulate(dims.begin(),dims.end(),1u,std::multiplies<size_t>());
+
+      for (int i = 0; i < D; i++){
+	ARRAY_TYPE tmp(&dims,in->get_data_ptr()+elements*i);
+	ARRAY_TYPE tmp2(tmp);
+	tmp2 *= *Ix[i];
+	*out += tmp2;
+      }
+    }
+
+    virtual void mult_MH(ARRAY_TYPE* in,ARRAY_TYPE* out,bool accumulate){
+
+      if (!accumulate) clear(out);
+      std::vector<size_t> dims = *out->get_dimensions();
+      if (dims.back() != D) throw std::runtime_error("Output array for optical flow has the wrong last dimensions");
+      dims.pop_back();
+      size_t elements = std::accumulate(dims.begin(),dims.end(),1u,std::multiplies<size_t>());
+
+      for (int i = 0; i < D; i++){
+	ARRAY_TYPE out_view(&dims,out->get_data_ptr()+elements*i);
+	ARRAY_TYPE tmp2(in);
+	tmp2 *= *Ix[i];
+	out_view += tmp2;
+      }
+    }
+
+    void set_images(ARRAY_TYPE* moving,ARRAY_TYPE* stat){
+      Ix = std::vector< boost::shared_ptr<ARRAY_TYPE> >();
+
+      for (int i=0; i < D; i++){
+	partialDerivOp op(i);
+	boost::shared_ptr<ARRAY_TYPE> I(new ARRAY_TYPE(moving->get_dimensions()));
+	op.mult_M(moving,I.get());
+	op.mult_M(stat,I.get(),true);
+	*I /= T(2);
+	Ix.push_back(I);
+      }
+    }
+
+  protected:
+    std::vector< boost::shared_ptr<ARRAY_TYPE> > Ix; //Gradient along different directions
+  };
+}
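
mult_M() above treats its input as a D-component displacement field with the component index as the slowest-varying dimension and forms, per pixel, the dot product with the averaged image gradients stored in Ix. A scalar sketch of that forward model for D = 2 on plain std::vector buffers (illustration only; the operator itself works on the NDArray types):

    #include <cstddef>
    #include <vector>

    // out[p] += dx[p]*Ix[p] + dy[p]*Iy[p] for every pixel p: the D = 2 case of
    // opticalFlowOperator::mult_M with the two displacement components stored back to back.
    void optical_flow_forward(const std::vector<float>& disp,  // size 2*n: [dx..., dy...]
                              const std::vector<float>& Ix,    // size n
                              const std::vector<float>& Iy,    // size n
                              std::vector<float>&       out)   // size n
    {
      const std::size_t n = Ix.size();
      for (std::size_t p = 0; p < n; ++p)
        out[p] += disp[p] * Ix[p] + disp[n + p] * Iy[p];
    }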
diff --git a/toolboxes/registration/optical_flow/opticalFlowSolver.h b/toolboxes/registration/optical_flow/opticalFlowSolver.h
new file mode 100644
index 0000000..255116c
--- /dev/null
+++ b/toolboxes/registration/optical_flow/opticalFlowSolver.h
@@ -0,0 +1,176 @@
+/** \file opticalFlowSolver.h
+    \brief Abstract class defining an optical flow registration solver.
+
+    Pure virtual functions are expected to do the actual work
+    in the CPU and GPU implementations, respectively.
+*/
+
+#pragma once
+
+#include "multiresRegistrationSolver.h"
+#include "resampleOperator.h"
+#include "vector_td_utilities.h"
+
+#include <algorithm>
+
+namespace Gadgetron{
+
+  template<class ARRAY_TYPE_REAL, unsigned int D> class opticalFlowSolver 
+    : public multiresRegistrationSolver<ARRAY_TYPE_REAL,D>
+  {  
+  protected:
+    typedef typename ARRAY_TYPE_REAL::element_type REAL;
+
+  public:
+
+    opticalFlowSolver() : multiresRegistrationSolver<ARRAY_TYPE_REAL,D>(){ 
+      limit_ = REAL(0.01);
+    } 
+
+    virtual ~opticalFlowSolver() {}
+
+    // Set termination threshold
+    inline void set_limit( REAL limit ) { limit_ = limit; }
+
+  protected:
+
+    // Inherited from the multiresolution solver
+    //
+
+    virtual void compute( ARRAY_TYPE_REAL *fixed_image, ARRAY_TYPE_REAL *moving_image, ARRAY_TYPE_REAL *stencil_image, 
+                          boost::shared_ptr<ARRAY_TYPE_REAL> &result_in_out )
+    {
+      // Test the validity of the input images
+      //
+
+      if( !fixed_image || !moving_image ){
+        throw std::runtime_error("opticalFlowSolver::compute(): illegal input array received.");
+      }
+
+      if( prod(from_std_vector<size_t,D>(*fixed_image->get_dimensions().get())) != 
+          prod(from_std_vector<size_t,D>(*moving_image->get_dimensions().get())) ){
+        throw std::runtime_error("opticalFlowSolver::compute(): core image dimensions (excluding batches) mismatch.");
+      }
+
+      if( stencil_image && 
+          prod(from_std_vector<size_t,D>(*fixed_image->get_dimensions().get())) != 
+          prod(from_std_vector<size_t,D>(*stencil_image->get_dimensions().get())) ){
+        throw std::runtime_error("opticalFlowSolver::compute(): stencil image dimensions mismatch fixed/moving image dimensions.");
+      }
+
+      if( result_in_out.get() && 
+          !( result_in_out->get_number_of_dimensions() > D ||
+             result_in_out->get_size(result_in_out->get_number_of_dimensions()-1) == D )){
+        throw std::runtime_error("opticalFlowSolver::compute(): input displacements dimensionality mismatch");
+      }
+
+
+      boost::shared_ptr<ARRAY_TYPE_REAL> grad_image = grad( fixed_image, moving_image );
+
+
+      // Invoke core solver (e.g. Horn-Schunck, Cornelius-Kanade, ...)
+      //
+
+      boost::shared_ptr<ARRAY_TYPE_REAL> displacements = core_solver( grad_image.get(), stencil_image );
+
+      // If an input vector field was provided then our result should be added element-wise
+      // 
+
+      if( result_in_out.get() ){
+        *result_in_out += *displacements;
+      }
+      else{    
+        result_in_out = displacements;
+      }
+    }
+
+    // Compute the gradient
+    //
+
+    virtual boost::shared_ptr<ARRAY_TYPE_REAL> grad( ARRAY_TYPE_REAL *fixed_image, ARRAY_TYPE_REAL *moving_image )
+    {
+      // Sanity checks
+      //
+
+      if( !fixed_image || !moving_image ){
+        throw std::runtime_error("opticalFlowSolver::grad(): illegal input received.");
+      }
+
+      if( !((moving_image->get_number_of_elements() % fixed_image->get_number_of_elements()) == 0 ||
+            (fixed_image->get_number_of_elements() % moving_image->get_number_of_elements()) == 0 )){
+        throw std::runtime_error("opticalFlowSolver::grad(): fixed/moving image dimensions mismatch.");
+      }
+
+      // Determine dimension size of the gradient field:
+      // D spatial dimensions plus one temporal dimension
+      //
+
+      std::vector<size_t> grad_dims;
+
+      grad_dims = ( fixed_image->get_number_of_elements() < moving_image->get_number_of_elements() )
+        ? *moving_image->get_dimensions() : *fixed_image->get_dimensions();
+
+      grad_dims.push_back(D+1); 
+
+      boost::shared_ptr<ARRAY_TYPE_REAL> grad_image(new ARRAY_TYPE_REAL(&grad_dims));
+
+      // Setup for the spatial partial derivatives
+      //
+
+      typename uint64d<D>::Type matrix_size_fixed = from_std_vector<size_t,D>( *fixed_image->get_dimensions() );
+      typename uint64d<D>::Type matrix_size_moving = from_std_vector<size_t,D>( *moving_image->get_dimensions() );
+
+      if( matrix_size_fixed != matrix_size_moving ){
+        throw std::runtime_error("opticalFlowSolver::grad(): fixed/moving image dimensions mismatch (2).");
+      }
+
+      // Ignoring the batch dimensions, the fixed and moving images have the same number of elements
+      //
+
+      size_t number_of_elements = prod(matrix_size_moving);
+      size_t number_of_batches_fixed = 1;
+      size_t number_of_batches_moving = 1;
+
+      for( size_t d=D; d<fixed_image->get_number_of_dimensions(); d++ ){
+        number_of_batches_fixed *= fixed_image->get_size(d);
+      }
+
+      for( size_t d=D; d<moving_image->get_number_of_dimensions(); d++ ){
+        number_of_batches_moving *= moving_image->get_size(d);
+      }
+
+      // Compute spatial partial derivatives
+      //
+
+      core_grad_spatial( fixed_image->get_data_ptr(), moving_image->get_data_ptr(), grad_image->get_data_ptr(), 
+                         matrix_size_moving, number_of_batches_fixed, number_of_batches_moving );
+
+      // Compute temporal partial derivatives
+      //
+
+      core_grad_temporal( fixed_image->get_data_ptr(), moving_image->get_data_ptr(), 
+                          grad_image->get_data_ptr()+number_of_elements*std::max(number_of_batches_moving, number_of_batches_fixed)*D, 
+                          matrix_size_moving, number_of_batches_fixed, number_of_batches_moving );
+
+      return grad_image;
+    }
+
+    // The actual work is being done in these functions, to be implemented on both host and device
+    //
+
+    virtual boost::shared_ptr<ARRAY_TYPE_REAL> core_solver( ARRAY_TYPE_REAL *gradient_image, ARRAY_TYPE_REAL *stencil_image ) = 0;      
+
+    virtual void core_grad_spatial( REAL *fixed_image, REAL *moving_image, REAL *gradient_image, 
+                                    typename uint64d<D>::Type matrix_size_moving, 
+                                    size_t number_of_batches_fixed, 
+                                    size_t number_of_batches_moving ) = 0;
+
+    virtual void core_grad_temporal( REAL *fixed_image, REAL *moving_image, REAL *gradient_image, 
+                                     typename uint64d<D>::Type matrix_size_moving, 
+                                     size_t number_of_batches_fixed, 
+                                     size_t number_of_batches_moving ) = 0;
+
+  protected:
+    REAL limit_;
+  };
+}
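
The gradient array assembled by grad() above stores the D spatial partial derivatives first and the temporal derivative last, which is exactly the data needed for the optical flow constraint that core solvers such as Horn-Schunck linearize. Writing the displacement components as u_d, the standard data term (stated here only for reference) reads

    \sum_{d=1}^{D} \frac{\partial I}{\partial x_d}\, u_d \;+\; \frac{\partial I}{\partial t} \;=\; 0 .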
diff --git a/toolboxes/registration/optical_flow/registrationSolver.h b/toolboxes/registration/optical_flow/registrationSolver.h
new file mode 100644
index 0000000..9e59ef1
--- /dev/null
+++ b/toolboxes/registration/optical_flow/registrationSolver.h
@@ -0,0 +1,103 @@
+#pragma once
+
+#include "solver.h"
+#include "resampleOperator.h"
+
+namespace Gadgetron{
+  
+  template <class ARRAY_TYPE> class registrationData
+  {
+  public:
+    registrationData( ARRAY_TYPE *fixed_image, ARRAY_TYPE *moving_image )
+    {
+      fixed_image_ = fixed_image;
+      moving_image_ = moving_image;
+    }
+
+    virtual ~registrationData() {}
+  
+    inline ARRAY_TYPE* get_fixed_image () { return fixed_image_; }
+    inline ARRAY_TYPE* get_moving_image () { return moving_image_; }
+  
+  protected:
+    ARRAY_TYPE *fixed_image_;
+    ARRAY_TYPE *moving_image_;
+  };
+
+  template <class ARRAY_TYPE> class registrationSolver 
+    : public solver<registrationData<ARRAY_TYPE>, ARRAY_TYPE >
+  {
+  public:
+
+    // Constructor/destructor
+    //
+
+    registrationSolver() : solver<registrationData<ARRAY_TYPE>,ARRAY_TYPE>() {}
+    virtual ~registrationSolver() {}
+
+    // Set interpolator for resampling
+    //
+  
+    inline void set_interpolator( boost::shared_ptr< resampleOperator<ARRAY_TYPE,ARRAY_TYPE> > interpolator )
+    {
+      interpolator_ = interpolator;
+    }
+  
+    // Set zero deformation boundary condition as a stencil image
+    //
+  
+    inline void set_stencil( boost::shared_ptr<ARRAY_TYPE> stencil )
+    {
+      stencil_ = stencil;
+    }
+  
+    //
+    // The solver adds a dimension to ARRAY_TYPE to hold the vector result.
+    // I.e. the vector field dimension is the slowest varying.
+    //
+  
+    virtual boost::shared_ptr<ARRAY_TYPE> 
+    solve( ARRAY_TYPE *fixed_image, ARRAY_TYPE *moving_image, bool input_normalization_allowed = false ) = 0;
+  
+    virtual boost::shared_ptr<ARRAY_TYPE> 
+    solve( registrationData< ARRAY_TYPE> *rd )
+    {
+      return solve( rd->get_fixed_image(), rd->get_moving_image() );
+    }
+  
+    // Deform image based on displacement field
+    //
+
+    virtual boost::shared_ptr<ARRAY_TYPE> 
+    deform( ARRAY_TYPE *image, boost::shared_ptr<ARRAY_TYPE> displacements )
+    {
+      if( !interpolator_.get() ){
+	    throw std::runtime_error("registrationSolver::deform() : interpolator not set");;
+      }
+    
+      boost::shared_ptr<ARRAY_TYPE> out(new ARRAY_TYPE);
+      std::vector<size_t> out_dims = *displacements->get_dimensions().get(); out_dims.pop_back();    
+      out->create(&out_dims);
+    
+      interpolator_->set_displacement_field( displacements );
+      interpolator_->mult_M( image, out.get() );
+      interpolator_->reset();
+    
+      return out;
+    }
+  
+    // Deform image based on an invocation of the registration solver
+    //
+  
+    virtual boost::shared_ptr<ARRAY_TYPE> 
+    deform( ARRAY_TYPE *fixed_image, ARRAY_TYPE *moving_image )
+    {
+      boost::shared_ptr<ARRAY_TYPE> displacements = solve( fixed_image, moving_image );
+      return deform( moving_image, displacements );
+    }
+  
+  protected:
+    boost::shared_ptr< resampleOperator<ARRAY_TYPE,ARRAY_TYPE> > interpolator_;
+    boost::shared_ptr<ARRAY_TYPE> stencil_;
+  };
+}
diff --git a/toolboxes/registration/optical_flow/resampleOperator.h b/toolboxes/registration/optical_flow/resampleOperator.h
new file mode 100644
index 0000000..4d0139a
--- /dev/null
+++ b/toolboxes/registration/optical_flow/resampleOperator.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include "linearOperator.h"
+
+namespace Gadgetron{
+
+  template <class ARRAY_TYPE_REAL, class ARRAY_TYPE_ELEMENT> 
+  class resampleOperator : public linearOperator<ARRAY_TYPE_ELEMENT>
+  {
+  public:
+  
+    resampleOperator() : linearOperator<ARRAY_TYPE_ELEMENT>(), preprocessed_(false) {}
+    virtual ~resampleOperator() {}
+
+    virtual void reset(){ preprocessed_ = false; }
+  
+    // Expected format: the vector field dimension should be the slowest varying
+    //
+
+    virtual void set_displacement_field( boost::shared_ptr<ARRAY_TYPE_REAL> offsets )
+    {
+      offsets_ = offsets;
+    }
+  
+    virtual boost::shared_ptr<ARRAY_TYPE_REAL> get_displacement_field()
+    {
+      return offsets_;
+    }
+
+    virtual size_t get_number_of_displacement_vectors() 
+    {
+      if( !offsets_.get() ) return 0;
+      return offsets_->get_number_of_elements()/offsets_->get_size(offsets_->get_number_of_dimensions()-1);
+    }
+
+    virtual bool is_preprocessed(){ return preprocessed_; }
+
+  protected:
+    bool preprocessed_;
+    boost::shared_ptr<ARRAY_TYPE_REAL> offsets_;
+  };
+}
diff --git a/toolboxes/solvers/CMakeLists.txt b/toolboxes/solvers/CMakeLists.txt
new file mode 100644
index 0000000..30baf6a
--- /dev/null
+++ b/toolboxes/solvers/CMakeLists.txt
@@ -0,0 +1,35 @@
+include_directories(
+  ${Boost_INCLUDE_DIR}
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  )
+
+install(FILES 	
+  solver.h
+  linearOperatorSolver.h
+  cgSolver.h
+  nlcgSolver.h
+  lbfgsSolver.h
+  sbSolver.h
+  sbcSolver.h
+  cgCallback.h	
+  cgPreconditioner.h
+  lwSolver.h
+  lbfgsSolver.h
+  lsqrSolver.h
+  gpSolver.h
+  gpBbSolver.h
+  eigenTester.h
+  osMOMSolver.h
+  osSPSSolver.h
+  osLALMSolver.h
+  DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
+
+IF(ARMADILLO_FOUND)
+  add_subdirectory(cpu)
+ENDIF(ARMADILLO_FOUND)
+
+IF( CUDA_FOUND)
+  add_subdirectory(gpu)
+ENDIF (CUDA_FOUND)
diff --git a/toolboxes/solvers/cgCallback.h b/toolboxes/solvers/cgCallback.h
new file mode 100644
index 0000000..07edbf0
--- /dev/null
+++ b/toolboxes/solvers/cgCallback.h
@@ -0,0 +1,198 @@
+/** \file cgCallback.h
+    \brief Class to specify the termination criteria for the conjugate gradient solver through a callback mechanism.
+*/
+
+#pragma once
+
+#include "real_utilities.h"
+#include "cgSolver.h"
+
+namespace Gadgetron{
+
+  template <class ARRAY_TYPE> class cgSolver;
+
+  template <class ARRAY_TYPE> class cgTerminationCallback
+  {
+
+  public:
+
+    typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL;
+    cgTerminationCallback() {}
+    virtual ~cgTerminationCallback() {}
+  
+    virtual bool initialize( cgSolver<ARRAY_TYPE> *cg ){cg_ = cg; return true;}
+    virtual bool iterate( unsigned int iteration, REAL *tc_metric, bool *tc_terminate ) = 0;
+
+  protected:
+
+    cgSolver<ARRAY_TYPE> *cg_;
+
+    REAL get_rq(){
+      return cg_->rq_;
+    }
+
+    REAL get_rq0(){
+      return cg_->rq0_;
+    }
+
+    REAL get_alpha(){
+      return cg_->alpha_;
+    }
+
+    boost::shared_ptr<ARRAY_TYPE> get_x(){
+      return cg_->x_;
+    }
+
+    boost::shared_ptr<ARRAY_TYPE> get_p(){
+      return cg_->p_;
+    }
+
+    boost::shared_ptr<ARRAY_TYPE> get_r(){
+      return cg_->r_;
+    }
+  };
+
+  template <class ARRAY_TYPE> class relativeResidualTCB
+    : public cgTerminationCallback<ARRAY_TYPE>
+  {
+
+  protected:
+    typedef cgTerminationCallback<ARRAY_TYPE> cgTC;
+
+  public:
+
+    typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL;
+
+    relativeResidualTCB() : cgTerminationCallback<ARRAY_TYPE>() {
+      rq_0_ = REAL(0); 
+      tc_last_ = get_max<REAL>();
+    }
+  
+    virtual ~relativeResidualTCB() {}
+  
+    virtual bool initialize( cgSolver<ARRAY_TYPE> *cg )
+    {
+      cgTC::initialize(cg);
+      tc_last_ = get_max<REAL>();
+      rq_0_ = cgTC::get_rq0();
+      return true;
+    }
+  
+    virtual bool iterate( unsigned int iteration, REAL *tc_metric, bool *tc_terminate )
+    {
+      *tc_metric = cgTC::get_rq()/rq_0_;
+    
+      if( cgTC::cg_->get_output_mode() >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_WARNINGS ) {
+	if( cgTC::cg_->get_output_mode() >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ) {
+	  GDEBUG_STREAM("Iteration " << iteration << ". rq/rq_0 = " << *tc_metric << std::endl);
+	}
+	if( (tc_last_-(*tc_metric)) < REAL(0) ){
+	  GDEBUG_STREAM("Warning: conjugate gradient residual increase." << std::endl);
+	}
+      }
+    
+      *tc_terminate = ( *tc_metric < cgTC::cg_->get_tc_tolerance() );
+      tc_last_ = *tc_metric;
+      return true;
+    }
+  
+  protected:
+    REAL rq_0_;
+    REAL tc_last_;
+  };
+
+  template <class ARRAY_TYPE> class residualTCB
+    : public cgTerminationCallback<ARRAY_TYPE>
+  {
+
+  protected:
+
+    typedef cgTerminationCallback<ARRAY_TYPE> cgTC;
+
+  public:
+
+    typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL;
+
+    residualTCB() : cgTerminationCallback<ARRAY_TYPE>() {
+      tc_last_ = get_max<REAL>();
+    }
+
+    virtual ~residualTCB() {}
+
+    virtual bool initialize( cgSolver<ARRAY_TYPE> *cg )
+    {
+      cgTC::initialize(cg);
+      tc_last_ = get_max<REAL>();
+      return true;
+    }
+
+    virtual bool iterate( unsigned int iteration, REAL *tc_metric, bool *tc_terminate )
+    {
+      *tc_metric = cgTC::get_rq();
+      if( cgTC::cg_->get_output_mode() >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_WARNINGS ) {
+        if( cgTC::cg_->get_output_mode() >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ) {
+	  GDEBUG_STREAM("Iteration " << iteration << ". rq = " << *tc_metric << std::endl);
+        }
+        if( (tc_last_-(*tc_metric)) < REAL(0) ){
+	  GDEBUG_STREAM("----- Warning: CG residual increase. Stability problem! -----" << std::endl);
+        }
+      }
+      *tc_terminate = ( *tc_metric < cgTC::cg_->get_tc_tolerance() );
+      tc_last_ = *tc_metric;
+      return true;
+    }
+
+  protected:
+
+    REAL tc_last_;
+  };
+
+  template <class ARRAY_TYPE> class updateTCB
+    : public cgTerminationCallback<ARRAY_TYPE>
+  {
+
+  protected:
+    typedef cgTerminationCallback<ARRAY_TYPE> cgTC;
+
+  public:
+    typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL;
+
+    updateTCB() : cgTerminationCallback<ARRAY_TYPE>() {
+
+      tc_last_ = get_max<REAL>();
+    }
+
+    virtual ~updateTCB() {}
+
+    virtual bool initialize( cgSolver<ARRAY_TYPE> *cg )
+    {
+      cgTC::initialize(cg);
+      tc_last_ = get_max<REAL>();
+      return true;
+    }
+
+    virtual bool iterate( unsigned int iteration, REAL *tc_metric, bool *tc_terminate )
+    {
+      *tc_metric = cgTC::cg_->solver_dot(cgTC::get_p().get(),cgTC::get_p().get());
+      if( cgTC::cg_->get_output_mode() >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_WARNINGS ) {
+	if( cgTC::cg_->get_output_mode() >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ) {
+	  GDEBUG_STREAM("Iteration " << iteration << ". dot(p,p) = " << *tc_metric << std::endl);
+	}
+	if( (tc_last_-(*tc_metric)) < REAL(0) ){
+	  GDEBUG_STREAM("----- Warning: CG residual increase. Stability problem! -----" << std::endl);
+	}
+      }
+      *tc_terminate = ( *tc_metric < cgTC::cg_->get_tc_tolerance() );
+      tc_last_ = *tc_metric;
+      return true;
+    }
+
+  protected:
+
+    REAL tc_last_;
+  };
+}
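
Custom termination criteria only need to override iterate(); the protected accessors expose the internals set up by cgSolver. A minimal sketch of a callback that stops once the residual quantity rq falls below a fixed value (the class name absoluteResidualTCB is hypothetical and not part of this import; relativeResidualTCB above remains the solver default):

    #include "cgCallback.h"

    namespace Gadgetron {

      // Hypothetical callback: terminate when the residual quantity rq drops below 'threshold'.
      template <class ARRAY_TYPE> class absoluteResidualTCB
        : public cgTerminationCallback<ARRAY_TYPE>
      {
      public:
        typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
        typedef typename realType<ELEMENT_TYPE>::Type REAL;

        absoluteResidualTCB( REAL threshold ) : threshold_(threshold) {}
        virtual ~absoluteResidualTCB() {}

        virtual bool iterate( unsigned int iteration, REAL *tc_metric, bool *tc_terminate )
        {
          *tc_metric = this->get_rq();                 // current residual quantity
          *tc_terminate = ( *tc_metric < threshold_ );
          return true;
        }

      protected:
        REAL threshold_;
      };
    }

Such a callback would be attached through cgSolver::set_termination_callback(), shown in the cgSolver.h file below.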
diff --git a/toolboxes/solvers/cgPreconditioner.h b/toolboxes/solvers/cgPreconditioner.h
new file mode 100644
index 0000000..04f36bd
--- /dev/null
+++ b/toolboxes/solvers/cgPreconditioner.h
@@ -0,0 +1,46 @@
+/** \file cgPreconditioner.h
+    \brief Base class for preconditioners for the cgSolver class.
+*/
+
+#ifndef CGPRECONDITIONER_H
+#define CGPRECONDITIONER_H
+#pragma once
+
+#include <boost/shared_ptr.hpp>
+
+namespace Gadgetron{
+
+  template <class ARRAY_TYPE> class cgPreconditioner
+  {
+  public:
+    
+    cgPreconditioner() {}
+    virtual ~cgPreconditioner() {}
+    
+    virtual void set_weights( boost::shared_ptr<ARRAY_TYPE> w ){
+      weights_ = w;
+    }
+
+    virtual void apply( ARRAY_TYPE *in, ARRAY_TYPE *out )
+    {
+      if( !weights_.get() ){
+	throw std::runtime_error( "cgPreconditioner::apply(): weights not set");
+      }
+      
+      if ( !in || !out || in->get_number_of_elements() != out->get_number_of_elements()) {
+	throw std::runtime_error("cgPreconditioner::apply(): input and output dimensions mismatch");
+      }
+      
+      if (in->get_number_of_elements() % weights_->get_number_of_elements()) {
+	throw std::runtime_error( "cgPreconditioner::apply(): unexpected dimensionality of computed weights" );
+      }
+      *out = *in;
+      *out *= *weights_;
+    };
+
+  protected:
+    boost::shared_ptr<ARRAY_TYPE> weights_;    
+  };
+}
+
+#endif //CGPRECONDITIONER_H
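
The weights define a purely diagonal preconditioner: apply() scales each element of the input by the corresponding weight. Note that cgSolver (next file) applies the preconditioner twice in a row, so the effective preconditioning matrix is the square of the supplied weights; with W = diag(w_i),

    z_k = W\,(W\, r_k) = W^2 r_k ,

i.e. the supplied weights act as the square root of the intended preconditioner. This is a reading of the code as imported here, not a documented guarantee.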
diff --git a/toolboxes/solvers/cgSolver.h b/toolboxes/solvers/cgSolver.h
new file mode 100644
index 0000000..26b2627
--- /dev/null
+++ b/toolboxes/solvers/cgSolver.h
@@ -0,0 +1,412 @@
+/** \file cgSolver.h
+    \brief Base class for the conjugate gradient solvers.
+
+    The file cgSolver.h is a device-independent implementation of the conjugate gradient solver.
+    For the actual instantiations we refer to
+    - the class(/file) hoCgSolver(/.h) for a CPU-based solver using the hoNDArray class
+    - the class(/file) cuCgSolver(/.h) for a GPU-based solver using the cuNDArray class
+    - the class(/file) hoCuCgSolver(/.h) for a GPU-based solver using a host memory interface.
+
+    The latter version is intended for large reconstructions in which device memory cannot hold
+    the entire data from the image and encoded image domains.
+    In the "hoCu" scenario, suitable encoding and regularization operators
+    capable of batching their mult_M and mult_MH_M functions should be chosen.
+
+    In all cases, the encoding and regularization operators added to the solver 
+    must adhere to the underlying instantiation of the NDArray data type.
+*/
+
+#pragma once
+
+#include "linearOperatorSolver.h"
+#include "cgCallback.h"
+#include "cgPreconditioner.h"
+#include "real_utilities.h"
+#include "complext.h"
+
+#include <vector>
+#include <iostream>
+#include <limits>
+
+namespace Gadgetron{
+
+  template <class ARRAY_TYPE> class cgSolver : public linearOperatorSolver<ARRAY_TYPE>
+  {
+  
+  public:
+
+    // Convenient typedefs
+    //
+
+    typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL;
+    friend class cgTerminationCallback<ARRAY_TYPE>;
+
+
+    // Constructor
+    //
+
+    cgSolver() : linearOperatorSolver<ARRAY_TYPE>() {
+      alpha_ = std::numeric_limits<ELEMENT_TYPE>::quiet_NaN();
+      iterations_ = 10;
+      tc_tolerance_ = (REAL)1e-3;
+      cb_ = boost::shared_ptr< relativeResidualTCB<ARRAY_TYPE> >( new relativeResidualTCB<ARRAY_TYPE>() );
+    }
+  
+
+    // Destructor
+    //
+
+    virtual ~cgSolver() {}
+
+
+    // Set preconditioner
+    //
+
+    virtual void set_preconditioner( boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond ) {
+      precond_ = precond;
+    }
+  
+
+    // Set termination callback
+    //
+
+    virtual void set_termination_callback( boost::shared_ptr< cgTerminationCallback<ARRAY_TYPE> > cb ){
+      cb_ = cb;
+    }
+  
+
+    // Set/get the maximum allowed number of iterations
+    //
+
+    virtual void set_max_iterations( unsigned int iterations ) { iterations_ = iterations; }
+    virtual unsigned int get_max_iterations() { return iterations_; }  
+
+
+    // Set/get tolerance threshold for the termination criterion
+    //
+
+    virtual void set_tc_tolerance( REAL tolerance ) { tc_tolerance_ = tolerance; }
+    virtual REAL get_tc_tolerance() { return tc_tolerance_; }
+  
+
+    // Virtual function that is provided with the intermediate solution at each solver iteration.
+    // The default behaviour is to do nothing with this array,
+    // but customization is possible by specialization of the virtual function in a derived class.
+    //
+
+    virtual void solver_dump( ARRAY_TYPE* ) {}
+
+
+    //
+    // Main solver interface
+    //
+
+    virtual boost::shared_ptr<ARRAY_TYPE> solve( ARRAY_TYPE *d )
+    {
+    
+      // Compute right hand side...
+      //
+      
+      boost::shared_ptr<ARRAY_TYPE> rhs = compute_rhs( d );
+
+      // ... and the result
+      //
+
+      boost::shared_ptr<ARRAY_TYPE> result =  solve_from_rhs( rhs.get() );
+      return result;
+    }
+
+
+    // Alternative solver interface when given the right hand side
+    //
+
+    virtual boost::shared_ptr<ARRAY_TYPE> solve_from_rhs( ARRAY_TYPE *rhs ) 
+    {
+      // For zero iterations we simply return the right hand side
+      //
+
+      if( iterations_ == 0 ){
+        return boost::shared_ptr<ARRAY_TYPE>( new ARRAY_TYPE(*rhs) );
+      }
+
+      // Initialize
+      //
+
+      initialize(rhs);
+
+      // Iterate
+      //
+
+      if( this->output_mode_ >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ){
+        GDEBUG_STREAM("Iterating..." << std::endl);
+      }
+    
+      for( unsigned int it=0; it<iterations_; it++ ){
+
+        REAL tc_metric;
+        bool tc_terminate;
+      
+        this->iterate( it, &tc_metric, &tc_terminate );
+
+        solver_dump( x_.get());
+      
+        if( tc_terminate )
+          break;
+      }
+    
+      // Clean up and we are done
+      //
+
+      boost::shared_ptr<ARRAY_TYPE> tmpx = x_;
+      deinitialize();
+      return tmpx;
+    }
+
+
+    // Compute right hand side
+    //
+
+    virtual boost::shared_ptr<ARRAY_TYPE> compute_rhs( ARRAY_TYPE *d )
+    {
+    
+      if( this->encoding_operator_.get() == 0 ){
+      	throw std::runtime_error( "Error: cgSolver::compute_rhs : no encoding operator is set" );
+      } 
+        
+      // Get image space dimensions from the encoding operator
+      //
+
+      boost::shared_ptr< std::vector<size_t> > image_dims = this->encoding_operator_->get_domain_dimensions();
+      if( image_dims->size() == 0 ){
+      	throw std::runtime_error( "Error: cgSolver::compute_rhs : encoding operator has not set domain dimension" );
+      }
+
+      // Create result array and clear
+      //
+
+      boost::shared_ptr<ARRAY_TYPE> result = boost::shared_ptr<ARRAY_TYPE>(new ARRAY_TYPE(image_dims.get()));
+      clear(result.get());
+    
+      // Create temporary array
+      //
+
+      ARRAY_TYPE tmp(image_dims.get() );
+
+      // Compute operator adjoint
+      //
+
+      this->encoding_operator_->mult_MH( d, &tmp );
+    
+      // Apply weight
+      //
+
+      axpy(ELEMENT_TYPE(this->encoding_operator_->get_weight()), &tmp, result.get() );
+    
+      return result;
+    }
+
+  protected:
+  
+    //
+    // Everything beyond this point is internal to the implementation
+    // and not intended to be exposed as a public interface
+    //
+
+    // Initialize solver
+    //
+
+    virtual void initialize( ARRAY_TYPE *rhs )
+    {
+      // Input validity test
+      //
+
+      if( !rhs || rhs->get_number_of_elements() == 0 ){
+      	throw std::runtime_error( "Error: cgSolver::initialize : empty or NULL rhs provided" );
+      }
+    
+      // Result, x
+      //
+
+      x_ = boost::shared_ptr<ARRAY_TYPE>( new ARRAY_TYPE(rhs->get_dimensions()) );
+    
+    
+      // Initialize r,p,x
+      //
+
+      r_ = boost::shared_ptr<ARRAY_TYPE>( new ARRAY_TYPE(*rhs) );
+      p_ = boost::shared_ptr<ARRAY_TYPE>( new ARRAY_TYPE(*r_) );
+    
+      if( !this->get_x0().get() ){ // no starting image provided      
+	clear(x_.get());
+      }
+
+      // Apply preconditioning, twice (should change preconditioners to do this)
+      //
+      
+      if( precond_.get() ) {	
+        precond_->apply( p_.get(), p_.get() );
+        precond_->apply( p_.get(), p_.get() );
+      }
+
+      rq0_ = real(dot( r_.get(), p_.get() ));
+
+      if (this->get_x0().get()){
+	
+        if( !this->get_x0()->dimensions_equal( rhs )){
+          throw std::runtime_error( "Error: cgSolver::initialize : RHS and initial guess must have same dimensions" );
+        }
+	
+        *x_ = *(this->get_x0());
+        
+        ARRAY_TYPE mhmX( rhs->get_dimensions());
+
+        if( this->output_mode_ >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ) {
+          GDEBUG_STREAM("Preparing guess..." << std::endl);
+        }
+        
+        mult_MH_M( this->get_x0().get(), &mhmX );
+        
+        *r_ -= mhmX;
+        *p_ = *r_;
+        
+        // Apply preconditioning, twice (should change preconditioners to do this)
+        //
+        
+        if( precond_.get() ){
+          precond_->apply( p_.get(), p_.get() );
+          precond_->apply( p_.get(), p_.get() );
+        }
+      }
+      
+      rq_ = real( dot( r_.get(), p_.get() ));
+      
+      // Invoke termination callback initialization
+      //
+    
+      cb_->initialize(this);
+    }
+  
+    // Clean up
+    //
+
+    virtual void deinitialize()
+    {
+      p_.reset();
+      r_.reset();
+      x_.reset();
+    }
+
+    // Perform full cg iteration
+    //
+
+    virtual void iterate( unsigned int iteration, REAL *tc_metric, bool *tc_terminate )
+    {
+      ARRAY_TYPE q = ARRAY_TYPE(x_->get_dimensions());
+
+      // Perform one iteration of the solver
+      //
+
+      mult_MH_M( p_.get(), &q );
+    
+      // Update solution
+      //
+
+      alpha_ = rq_/dot( p_.get(), &q );
+      axpy( alpha_, p_.get(), x_.get());
+
+      // Update residual
+      //
+
+      axpy( -alpha_, &q, r_.get());
+
+      // Apply preconditioning
+      //
+
+      if( precond_.get() ){
+
+        precond_->apply( r_.get(), &q );
+        precond_->apply( &q, &q );
+        
+        REAL tmp_rq = real(dot( r_.get(), &q ));      
+        *p_ *= ELEMENT_TYPE((tmp_rq/rq_));
+        axpy( ELEMENT_TYPE(1), &q, p_.get() );
+        rq_ = tmp_rq;
+      } 
+      else{
+        
+        REAL tmp_rq = real(dot( r_.get(), r_.get()) );
+        *p_ *= ELEMENT_TYPE((tmp_rq/rq_));           
+        axpy( ELEMENT_TYPE(1), r_.get(), p_.get() );
+        rq_ = tmp_rq;      
+      }
+      
+      // Invoke termination callback iteration
+      //
+
+      if( !cb_->iterate( iteration, tc_metric, tc_terminate ) ){
+        throw std::runtime_error( "Error: cgSolver::iterate : termination callback iteration failed" );
+      }    
+    }
+    
+    // Perform mult_MH_M of the encoding and regularization matrices
+    //
+
+    void mult_MH_M( ARRAY_TYPE *in, ARRAY_TYPE *out )
+    {
+      // Basic validity checks
+      //
+
+      if( !in || !out ){
+        throw std::runtime_error( "Error: cgSolver::mult_MH_M : invalid input pointer(s)" );
+      }
+
+      if( in->get_number_of_elements() != out->get_number_of_elements() ){
+        throw std::runtime_error( "Error: cgSolver::mult_MH_M : array dimensionality mismatch" );
+      }
+    
+      // Intermediate storage
+      //
+
+      ARRAY_TYPE q = ARRAY_TYPE(in->get_dimensions());
+
+      // Start by clearing the output
+      //
+      clear(out);
+
+      // Apply encoding operator
+      //
+
+      this->encoding_operator_->mult_MH_M( in, &q, false );
+      axpy( this->encoding_operator_->get_weight(), &q, out );
+
+      // Iterate over regularization operators
+      //
+
+      for( unsigned int i=0; i<this->regularization_operators_.size(); i++ ){      
+        this->regularization_operators_[i]->mult_MH_M( in, &q, false );
+        axpy( this->regularization_operators_[i]->get_weight(), &q, out );
+      }      
+    }
+    
+  protected:
+
+    // Preconditioner
+    boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond_;
+
+    // Termination criterion callback
+    boost::shared_ptr< cgTerminationCallback<ARRAY_TYPE> > cb_;
+
+    // Termination criterion threshold
+    REAL tc_tolerance_;
+
+    // Maximum number of iterations
+    unsigned int iterations_;
+
+    // Internal variables. 
+    REAL rq_;
+    REAL rq0_;
+    ELEMENT_TYPE alpha_;
+    boost::shared_ptr<ARRAY_TYPE> x_, p_, r_;
+  };
+}
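
A typical reconstruction configures the solver and then calls solve(), which forms the right hand side E^H d internally and runs the iterations. A hedged sketch using the CPU instantiation hoCgSolver defined further down in this import; set_encoding_operator() is assumed to come from the linearOperatorSolver base class (not shown in this excerpt), and E/data stand for a caller-supplied encoding operator and measured data:

    #include "hoCgSolver.h"
    #include "linearOperator.h"

    using namespace Gadgetron;

    // 'E' is a caller-supplied encoding operator and 'data' the measured right hand side.
    boost::shared_ptr< hoNDArray<float> >
    reconstruct( boost::shared_ptr< linearOperator< hoNDArray<float> > > E,
                 hoNDArray<float> *data )
    {
      hoCgSolver<float> cg;
      cg.set_encoding_operator( E );   // assumed to be provided by the linearOperatorSolver base
      cg.set_max_iterations( 20 );
      cg.set_tc_tolerance( 1e-6f );    // relative residual threshold (relativeResidualTCB default)

      return cg.solve( data );         // returns the reconstructed image
    }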
diff --git a/toolboxes/solvers/cpu/CMakeLists.txt b/toolboxes/solvers/cpu/CMakeLists.txt
new file mode 100644
index 0000000..534ec11
--- /dev/null
+++ b/toolboxes/solvers/cpu/CMakeLists.txt
@@ -0,0 +1,16 @@
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_CPUSOLVERS__)
+endif (WIN32)
+
+include_directories(
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/math
+  )
+
+install(FILES 	
+  hoCgSolver.h
+  hoSbCgSolver.h
+  hoGpBbSolver.h
+  hoCgPreconditioner.h
+  hoSolverUtils.h
+  DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
diff --git a/toolboxes/solvers/cpu/hoCgPreconditioner.h b/toolboxes/solvers/cpu/hoCgPreconditioner.h
new file mode 100644
index 0000000..35ee14b
--- /dev/null
+++ b/toolboxes/solvers/cpu/hoCgPreconditioner.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "hoNDArray_math.h"
+#include "cgPreconditioner.h"
+
+namespace Gadgetron{
+
+  template<class T> class hoCgPreconditioner : public cgPreconditioner< hoNDArray<T> >
+  {
+  public:    
+    hoCgPreconditioner() : cgPreconditioner< hoNDArray<T> >() {}
+    virtual ~hoCgPreconditioner() {}
+  };
+}
diff --git a/toolboxes/solvers/cpu/hoCgSolver.h b/toolboxes/solvers/cpu/hoCgSolver.h
new file mode 100644
index 0000000..cde6f33
--- /dev/null
+++ b/toolboxes/solvers/cpu/hoCgSolver.h
@@ -0,0 +1,28 @@
+/** \file hoCgSolver.h
+    \brief Instantiation of the conjugate gradient solver on the CPU.
+
+    The file hoCgSolver.h is a convenience wrapper for the device-independent cgSolver class.
+    The class hoCgSolver instantiates the cgSolver for the hoNDArray class
+    and the header otherwise includes the other necessary header files.
+*/
+
+#pragma once
+
+#include "cgSolver.h"
+#include "hoNDArray_math.h"
+
+namespace Gadgetron{
+
+  /** \class hoCgSolver
+      \brief Instantiation of the conjugate gradient solver on the CPU.
+
+      The class hoCgSolver is a convenience wrapper for the device-independent cgSolver class.
+      hoCgSolver instantiates the cgSolver for type hoNDArray<T>.
+  */
+  template <class T> class hoCgSolver : public cgSolver< hoNDArray<T> >
+  {
+  public:
+    hoCgSolver() : cgSolver<hoNDArray<T> >() {}
+    virtual ~hoCgSolver() {}
+  };
+}
diff --git a/toolboxes/solvers/cpu/hoGpBbSolver.h b/toolboxes/solvers/cpu/hoGpBbSolver.h
new file mode 100644
index 0000000..5349a8c
--- /dev/null
+++ b/toolboxes/solvers/cpu/hoGpBbSolver.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "gpBbSolver.h"
+#include "hoNDArray_math.h"
+#include "real_utilities.h"
+#include "vector_td_utilities.h"
+
+
+
+namespace Gadgetron{
+
+  template <class T> class hoGpBbSolver : public gpBbSolver< hoNDArray<T> >
+  {  
+  public:
+
+    hoGpBbSolver() : gpBbSolver< hoNDArray<T> >() {};
+    virtual ~hoGpBbSolver() {};
+        
+
+  };
+}
diff --git a/toolboxes/solvers/cpu/hoSbCgSolver.h b/toolboxes/solvers/cpu/hoSbCgSolver.h
new file mode 100644
index 0000000..8d84929
--- /dev/null
+++ b/toolboxes/solvers/cpu/hoSbCgSolver.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "hoCgSolver.h"
+#include "sbSolver.h"
+
+#include "complext.h"
+
+namespace Gadgetron{
+
+  template <class T> class hoSbCgSolver : public sbSolver< hoNDArray<typename realType<T>::Type >, hoNDArray<T>, hoCgSolver<T> >
+  {
+  public:    
+    hoSbCgSolver() : sbSolver<hoNDArray<typename realType<T>::Type >, hoNDArray<T>, hoCgSolver<T> >() {}    
+    virtual ~hoSbCgSolver() {}
+  };
+}
diff --git a/toolboxes/solvers/cpu/hoSolverUtils.h b/toolboxes/solvers/cpu/hoSolverUtils.h
new file mode 100644
index 0000000..a495078
--- /dev/null
+++ b/toolboxes/solvers/cpu/hoSolverUtils.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include "hoNDArray.h"
+#include "hoNDArray_math.h"
+#include "complext.h"
+
+#ifdef USE_OMP
+#include <omp.h>
+#endif
+
+namespace Gadgetron {
+template<class T> void solver_non_negativity_filter(hoNDArray<T> *xdata, hoNDArray<T> *gdata)
+{
+	typedef typename realType<T>::Type REAL;
+
+	T* x = xdata->get_data_ptr();
+	T* g = gdata->get_data_ptr();
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+	for( int i=0; i < xdata->get_number_of_elements(); i++ )
+		if( (real(x[i]) <= REAL(0)) && (real(g[i]) > 0) )
+			g[i]=T(0);
+}
+}
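
solver_non_negativity_filter() implements the projection behind gpBbSolver's non-negativity constraint: wherever the current estimate is already non-positive and the gradient is positive (so a descent step x -= alpha*g would push the value further negative), the gradient component is zeroed. A small self-contained check of that behaviour (illustration only; element access via operator[] as used elsewhere in this import):

    #include "hoNDArray.h"
    #include "hoSolverUtils.h"
    #include <iostream>
    #include <vector>

    int main()
    {
      using namespace Gadgetron;

      std::vector<size_t> dims(1, 3);
      hoNDArray<float> x(&dims), g(&dims);

      x[0] = 1.0f;  x[1] = 0.0f;  x[2] = -0.5f;
      g[0] = 2.0f;  g[1] = 3.0f;  g[2] = -1.0f;

      solver_non_negativity_filter( &x, &g );

      // Expected: g[1] is zeroed (x at the bound, positive gradient);
      // g[0] (x positive) and g[2] (negative gradient) are left untouched.
      std::cout << g[0] << " " << g[1] << " " << g[2] << std::endl;
      return 0;
    }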
diff --git a/toolboxes/solvers/eigenTester.h b/toolboxes/solvers/eigenTester.h
new file mode 100644
index 0000000..b9b4ef2
--- /dev/null
+++ b/toolboxes/solvers/eigenTester.h
@@ -0,0 +1,157 @@
+#pragma once
+#include "complext.h"
+#include "diagonalOperator.h"
+#include "identityOperator.h"
+
+#include <boost/make_shared.hpp>
+
+namespace Gadgetron{
+template <class ARRAY_TYPE> class eigenTester {
+
+
+public:
+	typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL;
+  eigenTester(){
+	  tolerance = REAL(1e-8);
+  }
+  virtual ~eigenTester(){}
+
+  ELEMENT_TYPE get_dominant_eigenvalue(){
+	  boost::shared_ptr<ARRAY_TYPE> eigenVector = get_dominant_eigenvector();
+	  return get_eigenvalue_from_vector(eigenVector.get());
+  }
+
+
+  ELEMENT_TYPE get_smallest_eigenvalue(ELEMENT_TYPE dominant_eigenvalue){
+	  ELEMENT_TYPE beta = dominant_eigenvalue*2;
+	  auto id_operator = boost::make_shared<identityOperator<ARRAY_TYPE>>();
+	  id_operator->set_weight(-abs(beta));
+
+
+	  regularization_operators_.push_back(id_operator);
+	  std::cout << "ID operator weight " << beta << std::endl;
+	  ELEMENT_TYPE eig1 = get_dominant_eigenvalue();
+	  regularization_operators_.pop_back();
+	  return eig1+beta;
+
+
+  }
+  ELEMENT_TYPE get_smallest_eigenvalue(){
+  	  ELEMENT_TYPE eig = get_dominant_eigenvalue();
+  	  return get_smallest_eigenvalue(eig);
+    }
+  // Add encoding operator to solver (only one allowed)
+   inline bool add_encoding_operator( boost::shared_ptr< linearOperator<ARRAY_TYPE> > op)
+   {
+     if( !op.get() ){
+       std::cout << "Error: linearSolver::add_matrix_operator : NULL operator provided" << std::endl;
+       return false;
+     }
+
+     encoding_operator_ = op;
+
+     return true;
+   }
+   inline void set_tolerance(REAL tolerance){
+	   this->tolerance = tolerance;
+   }
+   // Add linear operator to solver (in addition to the encoding operator)
+   inline bool add_linear_operator( boost::shared_ptr< linearOperator<ARRAY_TYPE> > op)
+     {
+       if( !op.get() ){
+    	   std::cout << "Error: linearSolver::add_matrix_operator : NULL operator provided"  << std::endl;
+         return false;
+       }
+
+       regularization_operators_.push_back(op);
+
+       return true;
+     }
+	protected:
+	 bool mult_MH_M( ARRAY_TYPE *in, ARRAY_TYPE *out )
+	  {
+	    // Basic validity checks
+	    if( !in || !out ){
+	      std::cout << "Error: cgSolver::mult_MH_M : invalid input pointer(s)" << std::endl;
+	      return false;
+	    }
+	    if( in->get_number_of_elements() != out->get_number_of_elements() ){
+	    	std::cout << "Error: cgSolver::mult_MH_M : array dimensionality mismatch"<< std::endl;
+	      return false;
+	    }
+
+	    // Intermediate storage
+	    ARRAY_TYPE q( in->get_dimensions());
+	   // Start by clearing the output
+	    clear( out );
+
+	    //Use encoding operator
+
+	    this->encoding_operator_->mult_MH_M( in, &q, false );
+	    axpy( this->encoding_operator_->get_weight(), &q, out );
+
+	    // Iterate over regularization operators
+	    for( unsigned int i=0; i<this->regularization_operators_.size(); i++ ){
+		  this->regularization_operators_[i]->mult_MH_M( in, &q, false );
+	      axpy( this->regularization_operators_[i]->get_weight(), &q, out );
+	    }
+
+	    return true;
+	  }
+	 ELEMENT_TYPE get_eigenvalue_from_vector(ARRAY_TYPE* eigenVector){
+		 ARRAY_TYPE out(*eigenVector);
+		 clear(&out);
+		 mult_MH_M(eigenVector,&out);
+		 size_t eigMax = amax(eigenVector);
+		 ELEMENT_TYPE dom1 = eigenVector->at(eigMax);
+		 size_t oMax = amax(&out);
+		 ELEMENT_TYPE dom2 = out[oMax];
+		 return dom2/dom1;
+
+	 }
+
+	  boost::shared_ptr<ARRAY_TYPE> get_dominant_eigenvector(){
+		  std::cout << "Starting dominant eigenvector calculations "<< tolerance << std::endl;
+		  ELEMENT_TYPE norm = ELEMENT_TYPE(1);
+		  ELEMENT_TYPE norm_old = ELEMENT_TYPE(2);
+
+		  ARRAY_TYPE* in = new ARRAY_TYPE;
+		  std::vector<size_t> image_dims = *this->encoding_operator_->get_domain_dimensions();
+
+		  in->create(&image_dims);
+
+		  fill(in,ELEMENT_TYPE(1));
+
+		  ARRAY_TYPE* out = new ARRAY_TYPE;
+		  out->create(&image_dims);
+
+		  while (abs(norm-norm_old)/abs(norm)> tolerance){
+			  norm_old=norm;
+			  mult_MH_M(in,out);
+			  std::cout << dot(in,out) << std::endl;
+
+			  norm = nrm2(out);
+
+			  *out /= norm;
+			  ARRAY_TYPE* tmp = in;
+			  in = out;
+			  out = tmp;
+
+			  }
+		  std::cout << "Done" << std::endl;
+		  delete in;
+		  return boost::shared_ptr<ARRAY_TYPE>(out);
+		}
+
+
+	protected:
+
+	  // Single encoding operator
+	  boost::shared_ptr< linearOperator< ARRAY_TYPE> > encoding_operator_;
+	  REAL tolerance;
+	  // Vector of linear regularization operators
+	  std::vector< boost::shared_ptr< linearOperator< ARRAY_TYPE> > > regularization_operators_;
+
+};
+}
diff --git a/toolboxes/solvers/gpBbSolver.h b/toolboxes/solvers/gpBbSolver.h
new file mode 100644
index 0000000..724777d
--- /dev/null
+++ b/toolboxes/solvers/gpBbSolver.h
@@ -0,0 +1,199 @@
+#pragma once
+
+#include "gpSolver.h"
+#include "real_utilities.h"
+#include "complext.h"
+#include "cgPreconditioner.h"
+#include <vector>
+#include <iostream>
+
+namespace Gadgetron{
+
+/* Using adaptive step size from Zhou et al, 2006, Computational Optimization and Applications,
+ * DOI: 10.1007/s10589-006-6446-0
+ */
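+
+/* The active code below uses the plain Barzilai-Borwein step
+ *   nabla = (s^T s)/(s^T y), with s = x_{k-1} - x_k and y = g_{k-1} - g_k;
+ * the adaptive rule of the paper above, switching to (s^T y)/(y^T y) when the ratio of the
+ * two step sizes drops below 0.5, is kept in a commented-out block inside solve().
+ */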
+
+template <class ARRAY_TYPE> class gpBbSolver : public gpSolver<ARRAY_TYPE>
+{
+protected:
+	typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+	typedef typename realType<ELEMENT_TYPE>::Type REAL;
+	typedef ARRAY_TYPE ARRAY_CLASS;
+
+public:
+
+	gpBbSolver(): gpSolver<ARRAY_TYPE>() {
+		iterations_ = 10;
+		tc_tolerance_ = (REAL)1e-6;
+		non_negativity_constraint_=false;
+		dump_residual = false;
+		threshold= REAL(1e-8);
+	}
+
+	virtual ~gpBbSolver(){}
+
+	virtual boost::shared_ptr<ARRAY_TYPE> solve(ARRAY_TYPE* in)
+	{
+		if( this->encoding_operator_.get() == 0 ){
+			throw std::runtime_error("Error: gpBbSolver::solve : no encoding operator is set" );
+		}
+
+		// Get image space dimensions from the encoding operator
+		//
+
+		boost::shared_ptr< std::vector<size_t> > image_dims = this->encoding_operator_->get_domain_dimensions();
+		if( image_dims->size() == 0 ){
+			throw std::runtime_error("Error: gpBbSolver::solve : encoding operator has not set domain dimension" );
+		}
+
+		ARRAY_TYPE * x = new ARRAY_TYPE;
+		x->create(image_dims.get());
+
+		ARRAY_TYPE x_old(image_dims.get());
+
+		ARRAY_TYPE * g = new ARRAY_TYPE;
+		g->create(image_dims.get());
+		ARRAY_TYPE *  g_old = new ARRAY_TYPE;
+		g_old->create(image_dims.get());
+
+		if (this->x0_.get()){
+			*x = *(this->x0_.get());
+		} else  {
+			clear(x);
+		}
+
+		ARRAY_TYPE encoding_space;
+		REAL reg_res,data_res;
+		encoding_space.create(in->get_dimensions().get());
+		if( this->output_mode_ >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ){
+			GDEBUG_STREAM("Iterating..." << std::endl);
+		}
+		for (int i = 0; i < iterations_; i++){
+			if ((i==0) && (!this->x0_.get())){
+				clear(g);
+
+				this->encoding_operator_->mult_MH(in,g);
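+				// Note: the preconditioner is applied twice, so the gradient is scaled by the
+				// squared preconditioner (assumption: cgPreconditioner holds the square root of
+				// the intended scaling, as in the conjugate gradient solver).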
+				if (precond_.get()) {
+					precond_->apply(g,g);
+					precond_->apply(g,g);
+				}
+
+				*g *=  -this->encoding_operator_->get_weight();
+				data_res = real(dot(in,in));
+				reg_res=REAL(0);
+			} else {
+				this->encoding_operator_->mult_M(x,&encoding_space);
+				axpy(REAL(-1),in,&encoding_space);
+				data_res = real(dot(&encoding_space,&encoding_space));
+				this->encoding_operator_->mult_MH(&encoding_space,g);
+				if (precond_.get()) {
+					precond_->apply(g,g);
+					precond_->apply(g,g);
+				}
+				*g *=  this->encoding_operator_->get_weight();
+			}
+
+			this->add_gradient(x,g); // Adds the gradient from all the regularization operators
+
+			if( this->output_mode_ >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ){
+				GDEBUG_STREAM("Data residual: " << data_res << std::endl);
+			}
+
+			if (non_negativity_constraint_) solver_non_negativity_filter(x,g);
+			ELEMENT_TYPE nabla;
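+			// Step size: on the first iteration an exact line search along the steepest-descent
+			// direction of the linear data term; afterwards the Barzilai-Borwein step
+			// nabla = (s^T s)/(s^T y) computed from the previous x and g differences.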
+			if (i==0){
+				ARRAY_TYPE tmp_encoding = *in;
+				this->encoding_operator_->mult_M(g,&tmp_encoding);
+				if (this->x0_.get()){
+					nabla = dot(&encoding_space,&tmp_encoding)/dot(&tmp_encoding,&tmp_encoding);
+				} else {
+					nabla = -dot(in,&tmp_encoding)/dot(&tmp_encoding,&tmp_encoding);
+				}
+			} else {
+				x_old -= *x;
+				*g_old -= *g;
+				ELEMENT_TYPE xx = dot(&x_old,&x_old);
+				ELEMENT_TYPE gx = dot(g_old,&x_old);
+
+				ELEMENT_TYPE nabla1 = xx/gx;
+
+				/* This is the code that enables the adaptive step size.
+				REAL gg = dot(g_old,&x_old);
+				REAL nabla2 = gx/gg;
+				if ((nabla2/nabla1) < 0.5) nabla = nabla2;
+				else nabla = nabla1; */
+				nabla = nabla1;
+			}
+
+			ARRAY_TYPE * tmp;
+			tmp=g_old;
+			g_old=g;
+			g=tmp;
+
+			x_old = *x;
+			REAL grad_norm = nrm2(g_old);
+
+			if( this->output_mode_ >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ){
+				GDEBUG_STREAM("Iteration " <<i << ". Gradient norm: " <<  grad_norm << std::endl);
+			}
+			iteration_callback(x,i,data_res,reg_res);
+			axpy(-nabla,g_old,x);
+			if (non_negativity_constraint_) clamp_min(x,REAL(0));
+			if (grad_norm < tc_tolerance_)  break;
+		}
+		delete g;
+		delete g_old;
+
+		return boost::shared_ptr<ARRAY_TYPE>(x);
+	}
+
+	// Set preconditioner
+	//
+	/*virtual void set_preconditioner( boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond ) {
+      precond_ = precond;
+      }*/
+
+	// Set/get maximally allowed number of iterations
+	//
+	virtual void set_max_iterations( unsigned int iterations ) { iterations_ = iterations; }
+	virtual unsigned int get_max_iterations() { return iterations_; }
+
+	// Set/get tolerance threshold for termination criterion
+	//
+	virtual void set_tc_tolerance( REAL tolerance ) { tc_tolerance_ = tolerance; }
+	virtual REAL get_tc_tolerance() { return tc_tolerance_; }
+
+	virtual void set_non_negativity_constraint(bool non_negativity_constraint){
+		non_negativity_constraint_=non_negativity_constraint;
+	}
+
+	virtual void set_dump_residual(bool dump_res){
+		dump_residual = dump_res;
+	}
+	// Set preconditioner
+	//
+
+	virtual void set_preconditioner( boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond ) {
+		precond_ = precond;
+	}
+
+protected:
+	typedef typename std::vector<boost::shared_ptr<linearOperator<ARRAY_TYPE> > >::iterator  csIterator;
+	typedef typename std::vector< std::vector<boost::shared_ptr<linearOperator<ARRAY_TYPE> > > >::iterator csGroupIterator;
+
+	virtual void iteration_callback(ARRAY_TYPE*,int i,REAL,REAL){};
+
+protected:
+
+	// Preconditioner
+	//boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond_;
+	// Maximum number of iterations
+	unsigned int iterations_;
+	bool non_negativity_constraint_;
+	REAL tc_tolerance_;
+	REAL threshold;
+	bool dump_residual;
+	// Preconditioner
+	boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond_;
+};
+}
diff --git a/toolboxes/solvers/gpSolver.h b/toolboxes/solvers/gpSolver.h
new file mode 100644
index 0000000..27cb3f0
--- /dev/null
+++ b/toolboxes/solvers/gpSolver.h
@@ -0,0 +1,318 @@
+#pragma once
+
+#include "linearOperatorSolver.h"
+#include "real_utilities.h"
+#include "complext.h"
+
+#include <vector>
+#include <iostream>
+
+namespace Gadgetron{
+
+  /* Using adaptive step size from Zhou et al, 2006, Computational Optimization and Applications,
+   * DOI: 10.1007/s10589-006-6446-0
+   */
+
+  template <class ARRAY_TYPE> class gpSolver : public linearOperatorSolver<ARRAY_TYPE>
+  {
+  protected:
+    typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL;
+
+  public:
+
+    virtual void set_domain_dimensions(std::vector<size_t> *dims ){
+      for (int i = 0;  i < operators.size(); i++) operators[i]->set_domain_dimensions(dims);
+    }
+    virtual ~gpSolver(){}
+
+    virtual void add_nonlinear_operator(boost::shared_ptr< generalOperator<ARRAY_TYPE> > op ){
+      operators.push_back(op);
+    }
+
+    virtual void add_regularization_operator(boost::shared_ptr< linearOperator<ARRAY_TYPE> > op ){
+      add_regularization_operator(op,2);
+    }
+    virtual void add_regularization_operator(boost::shared_ptr< linearOperator<ARRAY_TYPE> > op, int L_norm ){
+      if (L_norm==1){
+        operators.push_back(boost::shared_ptr<gpRegularizationOperator>(new l1GPRegularizationOperator(op)));
+      }else{
+        operators.push_back(op);
+      }
+    }
+
+    virtual void add_regularization_operator(boost::shared_ptr< linearOperator<ARRAY_TYPE> > op, boost::shared_ptr<ARRAY_TYPE> prior, int L_norm=2 ){
+      if (L_norm==1){
+        operators.push_back(boost::shared_ptr<gpRegularizationOperator>(new l1GPRegularizationOperator(op,prior)));
+      }else{
+        operators.push_back(boost::shared_ptr<gpRegularizationOperator>(new l2GPRegularizationOperator(op,prior)));
+      }
+    }
+
+    virtual void add_regularization_group_operator ( boost::shared_ptr< linearOperator<ARRAY_TYPE> > op )
+    {
+      current_group.push_back(op);
+    }
+
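+    // Operators added with add_regularization_group_operator are collected in current_group;
+    // add_group then turns the collected operators into a single regularization term, either as
+    // independent L2 penalties or (default) as one joint L1 "group" penalty across the operators.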
+    virtual void add_group(int L_norm=1)
+    {
+      if(current_group.size()==0){
+        throw std::runtime_error( "Error: gpSolver::add_group : no regularization group operators added" );
+      }
+      if (L_norm==2){
+        for (int i =0; i < current_group.size(); i++){
+          add_regularization_operator(current_group[i]);
+        }
+
+      } else {
+
+        boost::shared_ptr<gpRegularizationOperator> new_group(new l1GroupGPRegularizationOperator(current_group));
+        operators.push_back(new_group);
+      }
+      current_group = std::vector<boost::shared_ptr<linearOperator<ARRAY_TYPE> > >();
+    }
+
+    virtual void add_group(boost::shared_ptr<ARRAY_TYPE> prior, int L_norm=1)
+    {
+      if(current_group.size()==0){
+        throw std::runtime_error( "Error: gpSolver::add_group : no regularization group operators added" );
+
+      }
+      if (L_norm==2){
+        for (int i =0; i < current_group.size(); i++){
+          add_regularization_operator(current_group[i],prior);
+        }
+
+      } else {
+
+        boost::shared_ptr<gpRegularizationOperator> new_group(new l1GroupGPRegularizationOperator(current_group,prior));
+        operators.push_back(new_group);
+      }
+      current_group = std::vector<boost::shared_ptr<linearOperator<ARRAY_TYPE> > >();
+    }
+
+  protected:
+
+
+    virtual void add_gradient(ARRAY_TYPE* x, ARRAY_TYPE* g){
+      for (int i = 0; i < operators.size(); i++){
+        boost::shared_ptr<generalOperator<ARRAY_TYPE> > op = operators[i];
+        op->gradient(x,g,true);
+      }
+
+    }
+
+
+
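+    // The nested classes below wrap linear operators as generalOperators, so the gradient-based
+    // solvers only need gradient() and magnitude(): an L2 penalty, an L1 penalty and a joint L1
+    // group penalty, each optionally evaluated on (x - prior).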
+    class gpRegularizationOperator : public generalOperator<ARRAY_TYPE> {
+    public:
+      gpRegularizationOperator() : generalOperator<ARRAY_TYPE>(){
+      }
+
+      gpRegularizationOperator(std::vector<size_t> *dims): generalOperator<ARRAY_TYPE>(){this->set_domain_dimensions(dims);};
+      gpRegularizationOperator(
+                               boost::shared_ptr<ARRAY_TYPE> _prior): generalOperator<ARRAY_TYPE>(){
+        prior = _prior;
+      }
+
+      gpRegularizationOperator(boost::shared_ptr<ARRAY_TYPE> _prior,std::vector<size_t> *dims): generalOperator<ARRAY_TYPE>(){
+        prior = _prior;
+        set_domain_dimensions(dims);
+      }
+
+
+
+    protected:
+
+      boost::shared_ptr<ARRAY_TYPE> prior;
+
+    };
+
+
+    class l2GPRegularizationOperator : public gpRegularizationOperator {
+    public:
+      l2GPRegularizationOperator(boost::shared_ptr<linearOperator<ARRAY_TYPE> > _op){
+        op = _op;
+      }
+      l2GPRegularizationOperator(
+                                 boost::shared_ptr<linearOperator<ARRAY_TYPE> > _op,
+                                 boost::shared_ptr<ARRAY_TYPE> _prior): gpRegularizationOperator(_prior){op = _op;}
+      virtual void gradient(ARRAY_TYPE* x, ARRAY_TYPE* g,bool accumulate=false){
+        ARRAY_TYPE* x2 = x;
+        if (this->prior.get()){
+          x2 = new ARRAY_TYPE(*x);
+          axpy(REAL(-1),this->prior.get(),x2);
+        }
+        // Evaluate the gradient at the prior-subtracted image and release the temporary.
+        op->gradient(x2,g,accumulate);
+        if (this->prior.get()) delete x2;
+      }
+
+      virtual REAL magnitude(ARRAY_TYPE* x){
+        ARRAY_TYPE tmp(op->get_codomain_dimensions());
+        ARRAY_TYPE* x2 = x;
+        if (this->prior.get()){
+          x2 = new ARRAY_TYPE(*x);
+          *x2 -= *this->prior;
+        }
+        else clear(&tmp);
+        op->mult_M(x2,&tmp);
+        if (this->prior.get()) delete x2;
+        return std::sqrt(op->get_weight())*real(dot(&tmp,&tmp));
+      }
+    protected:
+      boost::shared_ptr<linearOperator<ARRAY_TYPE> > op;
+    };
+
+    class l1GPRegularizationOperator : public gpRegularizationOperator {
+    public:
+      l1GPRegularizationOperator(boost::shared_ptr<linearOperator<ARRAY_TYPE> > _op){
+
+        op = _op;
+      }
+      l1GPRegularizationOperator(
+                                 boost::shared_ptr<linearOperator<ARRAY_TYPE> > _op,
+                                 boost::shared_ptr<ARRAY_TYPE> _prior): gpRegularizationOperator(_prior){op = _op;}
+
+
+      virtual void gradient(ARRAY_TYPE* x, ARRAY_TYPE* g, bool accumulate=false){
+        ARRAY_TYPE tmp(op->get_codomain_dimensions());
+        ARRAY_TYPE q(op->get_domain_dimensions());
+        ARRAY_TYPE* x2 = x;
+
+        if (!accumulate) clear(g);
+        if (this->prior.get()){
+          x2 = new ARRAY_TYPE;
+          *x2 = *x;
+          axpy(REAL(-1),this->prior.get(),x2);
+
+        }
+        op->mult_M(x2,&tmp);
+        sgn_inplace(&tmp);
+        op->mult_MH(&tmp,&q,false);
+        axpy(op->get_weight(),&q,g);
+        if (this->prior.get()) delete x2;
+      }
+
+      virtual REAL magnitude(ARRAY_TYPE* x){
+        ARRAY_TYPE tmp(op->get_codomain_dimensions());
+        ARRAY_TYPE* x2 = x;
+        if (this->prior.get()){
+          x2 = new ARRAY_TYPE(*x);
+          *x2 -= *this->prior;
+        }
+        op->mult_M(x2,&tmp);
+        if (this->prior.get()) delete x2;
+        return op->get_weight()*asum(&tmp);
+      }
+
+
+      virtual void set_domain_dimensions(std::vector<size_t> *dims){
+        generalOperator<ARRAY_TYPE>::set_domain_dimensions(dims);
+        op->set_domain_dimensions(dims);
+        if (op->get_codomain_dimensions()->size() == 0){
+          GDEBUG_STREAM("WARNING: Codomain dimension not set. Setting to domain_dimension" << std::endl);
+          op->set_codomain_dimensions(dims);
+        }
+      }
+      boost::shared_ptr<linearOperator<ARRAY_TYPE> > op;
+    };
+
+    class l1GroupGPRegularizationOperator : public gpRegularizationOperator {
+    public:
+      l1GroupGPRegularizationOperator(std::vector<boost::shared_ptr<linearOperator<ARRAY_TYPE> > >_ops){
+        group = _ops;
+        threshold = REAL(1e-8);
+
+      }
+      l1GroupGPRegularizationOperator(std::vector<boost::shared_ptr<linearOperator<ARRAY_TYPE> > >_ops,
+                                      boost::shared_ptr<ARRAY_TYPE> _prior): gpRegularizationOperator(_prior){
+        group = _ops;
+        threshold = REAL(1e-8);
+
+      }
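+      // Gradient of sum_voxels sqrt( sum_i |L_i x|^2 ): each L_i x is divided voxel-wise by the
+      // group magnitude (clamped from below by `threshold` to avoid division by zero) and mapped
+      // back through L_i^H, weighted by the operator weight.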
+      virtual void gradient(ARRAY_TYPE* x, ARRAY_TYPE* g,bool accumulate=false){
+        std::vector<boost::shared_ptr<ARRAY_TYPE> > data;
+        ARRAY_TYPE gData(group.front()->get_codomain_dimensions());
+        clear(&gData);
+
+        if (!accumulate) clear(g);
+        ARRAY_TYPE* x2 = x;
+        if (this->prior.get()){
+          x2 = new ARRAY_TYPE(*x);
+          *x2 -= *this->prior;
+        }
+
+        for (int i = 0; i < group.size(); i++ ){
+          boost::shared_ptr<linearOperator<ARRAY_TYPE> > op = group[i];
+          boost::shared_ptr<ARRAY_TYPE> tmp(new ARRAY_TYPE(op->get_codomain_dimensions().get()));
+          op->mult_M(x2,tmp.get());
+          data.push_back(tmp);
+          ARRAY_TYPE tmp2 = *tmp;
+          tmp2 *= *tmp; //Square data
+          gData += tmp2;
+        }
+        if (this->prior.get()){
+          delete x2;
+        }
+        sqrt_inplace(&gData);
+        //REAL cost = group.front()->get_weight()*asum(&gData);
+        clamp_min(&gData,threshold);
+        reciprocal_inplace(&gData);
+
+        ARRAY_TYPE q(group.front()->get_domain_dimensions());
+
+        for (int i = 0; i < group.size(); i++ ){
+          boost::shared_ptr<linearOperator<ARRAY_TYPE> > op = group[i];
+          boost::shared_ptr<ARRAY_TYPE> tmp = data[i];
+          *tmp *= gData;
+          op->mult_MH(tmp.get(),&q,false);
+          axpy(op->get_weight(),&q,g);
+        }
+      }
+
+
+      virtual REAL magnitude(ARRAY_TYPE* x){
+        ARRAY_TYPE gData(group.front()->get_codomain_dimensions());
+        clear(&gData);
+        ARRAY_TYPE* x2 =x;
+        if (this->prior.get()){
+          x2 = new ARRAY_TYPE(*x);
+          *x2 -= *this->prior;
+        }
+        for (int i = 0; i < group.size(); i++ ){
+          boost::shared_ptr<linearOperator<ARRAY_TYPE> > op = group[i];
+          ARRAY_TYPE tmp(op->get_codomain_dimensions().get());
+          op->mult_M(x2,&tmp);
+          tmp *= tmp;
+          gData += tmp;
+        }
+        if (this->prior.get()) delete x2;
+        sqrt_inplace(&gData);
+        return group.front()->get_weight()*asum(&gData);
+      }
+
+      void set_threshold(REAL _threshold){
+        threshold = _threshold;
+      }
+
+    protected:
+
+      std::vector<boost::shared_ptr<linearOperator<ARRAY_TYPE> > > group;
+      REAL threshold;
+
+      virtual void set_domain_dimensions(std::vector<size_t> *dims){
+        generalOperator<ARRAY_TYPE>::set_domain_dimensions(dims);
+        for (int i = 0; i < group.size(); i++ ){
+          boost::shared_ptr<linearOperator<ARRAY_TYPE> > op = group[i];
+          op->set_domain_dimensions(dims);
+          if (op->get_codomain_dimensions()->size() == 0){
+            GDEBUG_STREAM("WARNING: Codomain dimension not set. Setting to domain_dimension" << std::endl);
+            op->set_codomain_dimensions(dims);
+          }
+        }
+      }
+    };
+
+    std::vector< boost::shared_ptr< generalOperator<ARRAY_TYPE> > > operators;
+    std::vector< boost::shared_ptr<linearOperator<ARRAY_TYPE> > >  current_group;
+  };
+}
diff --git a/toolboxes/solvers/gpu/CMakeLists.txt b/toolboxes/solvers/gpu/CMakeLists.txt
new file mode 100644
index 0000000..115acb2
--- /dev/null
+++ b/toolboxes/solvers/gpu/CMakeLists.txt
@@ -0,0 +1,51 @@
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_GPUSOLVERS__)
+endif (WIN32)
+
+if(WIN32)
+  link_directories(${Boost_LIBRARY_DIRS})
+endif(WIN32)
+
+include_directories(
+  ${CUDA_INCLUDE_DIRS}
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers/cpu
+  )
+
+cuda_add_library(gadgetron_toolbox_gpusolvers SHARED 
+    gpusolvers_export.h
+    cuSolverUtils.cu
+  )
+
+set_target_properties(gadgetron_toolbox_gpusolvers PROPERTIES VERSION ${GADGETRON_VERSION_STRING} SOVERSION ${GADGETRON_SOVERSION})
+
+target_link_libraries(gadgetron_toolbox_gpusolvers 
+  gadgetron_toolbox_gpucore
+  gadgetron_toolbox_log
+  ${Boost_LIBRARIES}
+  ${CUDA_LIBRARIES}
+  ${CUDA_CUBLAS_LIBRARIES} 
+  )
+
+install(TARGETS gadgetron_toolbox_gpusolvers DESTINATION lib COMPONENT main)
+
+install(FILES 	
+  cuSbCgSolver.h
+  cuSbcCgSolver.h
+  cuCgPreconditioner.h
+  cuLwSolver.h
+  cuLbfgsSolver.h
+  cuSbLwSolver.h
+  cuSbcLwSolver.h
+  cuCgSolver.h
+  cuNlcgSolver.h
+  cuGpBbSolver.h
+  hoCuCgSolver.h
+  hoCuNlcgSolver.h
+  hoCuSbcCgSolver.h
+  hoCuGpBbSolver.h
+  cuSolverUtils.h
+  gpusolvers_export.h
+  DESTINATION ${GADGETRON_INSTALL_INCLUDE_PATH} COMPONENT main)
diff --git a/toolboxes/solvers/gpu/cuCgPreconditioner.h b/toolboxes/solvers/gpu/cuCgPreconditioner.h
new file mode 100644
index 0000000..808d9e2
--- /dev/null
+++ b/toolboxes/solvers/gpu/cuCgPreconditioner.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "cuNDArray_operators.h"
+#include "cgPreconditioner.h"
+
+namespace Gadgetron{
+
+  template<class T> class cuCgPreconditioner : public cgPreconditioner< cuNDArray<T> >
+  {
+  public:    
+    cuCgPreconditioner() : cgPreconditioner< cuNDArray<T> >() {}
+    virtual ~cuCgPreconditioner() {}
+  };
+}
diff --git a/toolboxes/solvers/gpu/cuCgSolver.h b/toolboxes/solvers/gpu/cuCgSolver.h
new file mode 100644
index 0000000..071a8ec
--- /dev/null
+++ b/toolboxes/solvers/gpu/cuCgSolver.h
@@ -0,0 +1,30 @@
+/** \file cuCgSolver.h
+    \brief Instantiation of the conjugate gradient solver on the GPU.
+
+    The file cuCgSolver.h is a convenience wrapper for the device independent cgSolver class.
+    The class cuCgSolver instantiates the cgSolver for the cuNDArray
+    and the header otherwise includes other necessary header files.
+*/
+
+#pragma once
+
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "cgSolver.h"
+
+namespace Gadgetron{
+  
+  /** \class cuCgSolver
+      \brief Instantiation of the conjugate gradient solver on the GPU.
+
+      The class cuCgSolver is a convenience wrapper for the device independent cgSolver class.
+      cuCgSolver instantiates the cgSolver for type cuNDArray<T>.
+  */
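+  /* Illustrative usage sketch (not part of the upstream sources; "E" and "data" are
+   * hypothetical placeholders for a configured linear operator and a cuNDArray of samples):
+   *
+   *   cuCgSolver<float_complext> cg;
+   *   cg.set_encoding_operator( E );   // boost::shared_ptr< linearOperator< cuNDArray<float_complext> > >
+   *   cg.set_max_iterations( 20 );
+   *   cg.set_tc_tolerance( 1e-6f );
+   *   boost::shared_ptr< cuNDArray<float_complext> > result = cg.solve( &data );
+   */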
+  template <class T> class cuCgSolver : public cgSolver< cuNDArray<T> >
+  {
+  public:    
+    cuCgSolver() : cgSolver<cuNDArray<T> >() {}
+    virtual ~cuCgSolver() {}
+  };
+}
diff --git a/toolboxes/solvers/gpu/cuGpBbSolver.h b/toolboxes/solvers/gpu/cuGpBbSolver.h
new file mode 100644
index 0000000..3def796
--- /dev/null
+++ b/toolboxes/solvers/gpu/cuGpBbSolver.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "gpBbSolver.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "real_utilities.h"
+#include "vector_td_utilities.h"
+
+
+#include "cuSolverUtils.h"
+
+namespace Gadgetron{
+
+  template <class T> class cuGpBbSolver : public gpBbSolver<cuNDArray<T> >
+  {
+  public:
+
+    cuGpBbSolver() : gpBbSolver<cuNDArray<T> >() {}
+    virtual ~cuGpBbSolver() {}
+  };
+}
diff --git a/toolboxes/solvers/gpu/cuLbfgsSolver.h b/toolboxes/solvers/gpu/cuLbfgsSolver.h
new file mode 100644
index 0000000..f7803a9
--- /dev/null
+++ b/toolboxes/solvers/gpu/cuLbfgsSolver.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include "lbfgsSolver.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "real_utilities.h"
+#include "vector_td_utilities.h"
+#include "gpusolvers_export.h"
+
+
+#include <fstream>
+#include "cuSolverUtils.h"
+
+namespace Gadgetron{
+  
+  template <class T> class cuLbfgsSolver : public lbfgsSolver<cuNDArray<T> >
+  {
+  public:
+    
+    cuLbfgsSolver() : lbfgsSolver<cuNDArray<T> >() {}
+    virtual ~cuLbfgsSolver() {}
+/*
+    virtual void iteration_callback(cuNDArray<T>* x ,int iteration,typename realType<T>::Type value){
+  	  if (iteration == 0){
+  		  std::ofstream textFile("residual.txt",std::ios::trunc);
+  	  	  textFile << value << std::endl;
+  	  } else{
+  		  std::ofstream textFile("residual.txt",std::ios::app);
+  		  textFile << value << std::endl;
+  	  }
+
+    };
+    */
+  };
+}
diff --git a/toolboxes/solvers/gpu/cuLwSolver.h b/toolboxes/solvers/gpu/cuLwSolver.h
new file mode 100644
index 0000000..5ad7d5a
--- /dev/null
+++ b/toolboxes/solvers/gpu/cuLwSolver.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include "lwSolver.h"
+#include "cuNDArray.h"
+#include "cuNDArray_blas.h"
+
+#include <iostream>
+
+namespace Gadgetron{
+
+  template <class T> class cuLwSolver
+    : public lwSolver<cuNDArray<T> >
+  {
+  public:
+  
+    cuLwSolver() : lwSolver< cuNDArray<T> >() { set_device(-1); }
+    virtual ~cuLwSolver() {}
+  
+    virtual bool set_device( int device )
+    { 
+      device_ = device;
+    
+      if( device<0 ){
+
+        int old_device;
+
+        if( cudaGetDevice( &old_device ) != cudaSuccess ){
+          std::cerr << "cuLwSolver::set_device: unable to get current device." << std::endl;
+          return false;
+        }
+
+        device_ = old_device;
+      }
+    
+      return true;
+    }
+    
+  protected:
+    int device_;
+    int old_device_;
+  };
+}
diff --git a/toolboxes/solvers/gpu/cuNlcgSolver.h b/toolboxes/solvers/gpu/cuNlcgSolver.h
new file mode 100644
index 0000000..16670f5
--- /dev/null
+++ b/toolboxes/solvers/gpu/cuNlcgSolver.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include "nlcgSolver.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "real_utilities.h"
+#include "vector_td_utilities.h"
+#include "gpusolvers_export.h"
+
+#include <thrust/device_vector.h>
+#include <thrust/transform.h>
+#include <thrust/functional.h>
+#include "cuSolverUtils.h"
+
+namespace Gadgetron{
+  
+  template <class T> class cuNlcgSolver : public nlcgSolver<cuNDArray<T> >
+  {
+  public:
+    cuNlcgSolver() : nlcgSolver<cuNDArray<T> >() {}
+    virtual ~cuNlcgSolver() {}
+  };
+}
diff --git a/toolboxes/solvers/gpu/cuSbCgSolver.h b/toolboxes/solvers/gpu/cuSbCgSolver.h
new file mode 100644
index 0000000..7fa12c8
--- /dev/null
+++ b/toolboxes/solvers/gpu/cuSbCgSolver.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "cuCgSolver.h"
+#include "sbSolver.h"
+
+#include "complext.h"
+
+namespace Gadgetron{
+
+  template <class T> class cuSbCgSolver : public sbSolver< cuNDArray<typename realType<T>::Type >, cuNDArray<T>, cuCgSolver<T> >
+  {
+  public:    
+    cuSbCgSolver() : sbSolver<cuNDArray<typename realType<T>::Type >, cuNDArray<T>, cuCgSolver<T> >() {}    
+    virtual ~cuSbCgSolver() {}
+  };
+}
diff --git a/toolboxes/solvers/gpu/cuSbLwSolver.h b/toolboxes/solvers/gpu/cuSbLwSolver.h
new file mode 100644
index 0000000..c54c5e3
--- /dev/null
+++ b/toolboxes/solvers/gpu/cuSbLwSolver.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include "sbSolver.h"
+#include "cuLwSolver.h"
+#include "cuNDArray.h"
+#include "real_utilities.h"
+#include "vector_td_utilities.h"
+#include "ndarray_vector_td_utilities.h"
+#include "encodingOperatorContainer.h"
+
+namespace Gadgetron{
+
+template <class T> class cuSbLwSolver
+  : public sbSolver<cuNDArray<typename realType<T>::Type>, cuNDArray<T>, cuLwSolver<T> >
+{
+public:
+  
+  cuSbLwSolver() : sbSolver< cuNDArray<typename realType<T>::Type>, cuNDArray<T>, cuLwSolver<T> >() {
+    set_device(-1); 
+  }
+
+  virtual ~cuSbLwSolver() {}
+  
+#include "cuSbSolver_macros.h"
+  
+protected:
+  int device_;
+  int old_device_;
+};
+}
diff --git a/toolboxes/solvers/gpu/cuSbcCgSolver.h b/toolboxes/solvers/gpu/cuSbcCgSolver.h
new file mode 100644
index 0000000..78ece19
--- /dev/null
+++ b/toolboxes/solvers/gpu/cuSbcCgSolver.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "cuCgSolver.h"
+#include "sbcSolver.h"
+
+namespace Gadgetron{
+  
+  template <class T> class cuSbcCgSolver : public sbcSolver< cuNDArray<typename realType<T>::Type >, cuNDArray<T>, cuCgSolver<T> >
+  {
+  public:    
+    cuSbcCgSolver() : sbcSolver<cuNDArray<typename realType<T>::Type >, cuNDArray<T>, cuCgSolver<T> >() {}
+    virtual ~cuSbcCgSolver() {}    
+  };
+}
diff --git a/toolboxes/solvers/gpu/cuSbcLwSolver.h b/toolboxes/solvers/gpu/cuSbcLwSolver.h
new file mode 100644
index 0000000..6dcc218
--- /dev/null
+++ b/toolboxes/solvers/gpu/cuSbcLwSolver.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "sbcSolver.h"
+#include "cuLwSolver.h"
+#include "cuNDArray.h"
+#include "real_utilities.h"
+#include "vector_td_utilities.h"
+#include "ndarray_vector_td_utilities.h"
+#include "encodingOperatorContainer.h"
+
+namespace Gadgetron{
+template <class T> class cuSbcLwSolver
+  : public sbcSolver< cuNDArray<typename realType<T>::Type>, cuNDArray<T>, cuLwSolver<T> >
+{
+public:
+  
+  cuSbcLwSolver() : sbcSolver<cuNDArray<typename realType<T>::Type>, cuNDArray<T>, cuLwSolver<T> >() {
+    set_device(-1); 
+  }
+
+  virtual ~cuSbcLwSolver() {}
+
+#include "cuSbSolver_macros.h"
+
+protected:
+  int device_;
+  int old_device_;
+};
+}
diff --git a/toolboxes/solvers/gpu/cuSolverUtils.cu b/toolboxes/solvers/gpu/cuSolverUtils.cu
new file mode 100644
index 0000000..e26ec4d
--- /dev/null
+++ b/toolboxes/solvers/gpu/cuSolverUtils.cu
@@ -0,0 +1,111 @@
+#include "complext.h"
+#include "cuSolverUtils.h"
+#include <thrust/transform.h>
+#include <thrust/iterator/zip_iterator.h>
+#include "cuNDArray_math.h"
+#define MAX_THREADS_PER_BLOCK 512
+
+using namespace Gadgetron;
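+
+// filter_kernel zeroes gradient entries that would push an already non-positive voxel further
+// negative (used by the solvers' non-negativity constraint). The complext overload tests the
+// real parts and additionally always clears the imaginary part of the gradient.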
+template <class T> __global__ static void filter_kernel(T* x, T* g, int elements){
+	const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+	if (idx < elements){
+		if ( x[idx] <= T(0) && g[idx] > 0) g[idx]=T(0);
+	}
+}
+
+template <class REAL> __global__ static void filter_kernel(complext<REAL>* x, complext<REAL>* g, int elements){
+	const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+	if (idx < elements){
+		if ( real(x[idx]) <= REAL(0) && real(g[idx]) > 0) g[idx].vec[0] = REAL(0);
+		g[idx].vec[1]=REAL(0);
+	}
+}
+
+template <class T> void EXPORTGPUSOLVERS Gadgetron::solver_non_negativity_filter(cuNDArray<T>* x , cuNDArray<T>* g)
+{
+	int elements = g->get_number_of_elements();
+
+	int threadsPerBlock = std::min(elements,MAX_THREADS_PER_BLOCK);
+	dim3 dimBlock( threadsPerBlock);
+	// Round up so that a partial final block still covers the remaining elements.
+	int totalBlocksPerGrid = (elements+threadsPerBlock-1)/threadsPerBlock;
+	dim3 dimGrid(totalBlocksPerGrid);
+
+	filter_kernel<typename realType<T>::Type><<<dimGrid,dimBlock>>>(x->get_data_ptr(),g->get_data_ptr(),elements);
+}
+
+
+
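+// updateF rescales every sample by 1/(1+alpha*sigma) and then clips its magnitude to at most one;
+// updateFgroup does the same per voxel using the joint magnitude across all arrays in `datas`.
+// (Purely element-wise helpers; the calling solver supplies alpha and sigma.)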
+template<class T> struct updateF_functor{
+
+	typedef typename realType<T>::Type REAL;
+	updateF_functor(REAL alpha_, REAL sigma_){
+
+		alpha= alpha_;
+		sigma = sigma_;
+	}
+	__device__ __inline__ T operator() (T val){
+		return val/(1+alpha*sigma)/max(REAL(1),abs(val/(1+alpha*sigma)));
+	}
+	typename realType<T>::Type alpha, sigma;
+};
+
+template<class T>
+inline void Gadgetron::updateF(cuNDArray<T>& data,
+		typename realType<T>::Type alpha, typename realType<T>::Type sigma) {
+	thrust::transform(data.begin(),data.end(),data.begin(),updateF_functor<T>(alpha,sigma));
+}
+
+
+template<class T> struct updateFgroup_functor {
+
+	typedef typename realType<T>::Type REAL;
+	updateFgroup_functor(REAL alpha_, REAL sigma_) : alpha(alpha_), sigma(sigma_){
+	}
+
+	__device__ __inline__ T operator() (thrust::tuple<T,typename realType<T>::Type> tup){
+			return thrust::get<0>(tup)/(1+alpha*sigma)/max(REAL(1),thrust::get<1>(tup)/(1+alpha*sigma));
+	}
+
+	typename realType<T>::Type alpha, sigma;
+};
+
+template<class T> struct add_square_functor{
+
+	__device__ __inline__ typename realType<T>::Type operator() (thrust::tuple<T,typename realType<T>::Type> tup){
+		T val = thrust::get<0>(tup);
+		return thrust::get<1>(tup)+norm(val);
+	}
+};
+template<class T>
+inline void Gadgetron::updateFgroup(std::vector<cuNDArray<T> >& datas,
+		typename realType<T>::Type alpha, typename realType<T>::Type sigma) {
+
+	cuNDArray<typename realType<T>::Type> squares(datas.front().get_dimensions());
+	clear(&squares);
+	for (int i = 0; i < datas.size(); i++)
+		thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(datas[i].begin(),squares.begin())),
+				thrust::make_zip_iterator(thrust::make_tuple(datas[i].end(),squares.end())), squares.begin(), add_square_functor<T>());
+
+	sqrt_inplace(&squares);
+	for (int i = 0 ; i < datas.size(); i++){
+		thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(datas[i].begin(),squares.begin())),
+				thrust::make_zip_iterator(thrust::make_tuple(datas[i].end(),squares.end())), datas[i].begin(), updateFgroup_functor<T>(alpha,sigma));
+	}
+}
+
+template void EXPORTGPUSOLVERS Gadgetron::updateF<float>(cuNDArray<float>& data, float alpha, float sigma);
+template void EXPORTGPUSOLVERS Gadgetron::updateF<double>(cuNDArray<double>& data,double alpha, double sigma);
+template void EXPORTGPUSOLVERS Gadgetron::updateF<float_complext>(cuNDArray<float_complext>& data, float alpha, float sigma);
+template void EXPORTGPUSOLVERS Gadgetron::updateF<double_complext>(cuNDArray<double_complext>& data, double alpha, double sigma);
+
+template void EXPORTGPUSOLVERS Gadgetron::updateFgroup<float>(std::vector<cuNDArray<float> >& data, float alpha, float sigma);
+template void EXPORTGPUSOLVERS Gadgetron::updateFgroup<double>(std::vector<cuNDArray<double> >& data,double alpha, double sigma);
+template void EXPORTGPUSOLVERS Gadgetron::updateFgroup<float_complext>(std::vector<cuNDArray<float_complext> >& data, float alpha, float sigma);
+template void EXPORTGPUSOLVERS Gadgetron::updateFgroup<double_complext>(std::vector<cuNDArray<double_complext> >& data, double alpha, double sigma);
+
+
+template void EXPORTGPUSOLVERS Gadgetron::solver_non_negativity_filter<float>(cuNDArray<float>*, cuNDArray<float>*);
+template void EXPORTGPUSOLVERS Gadgetron::solver_non_negativity_filter<double>(cuNDArray<double>*, cuNDArray<double>*);
+template void EXPORTGPUSOLVERS Gadgetron::solver_non_negativity_filter<float_complext>(cuNDArray<float_complext>*, cuNDArray<float_complext>*);
+template void EXPORTGPUSOLVERS Gadgetron::solver_non_negativity_filter<double_complext>(cuNDArray<double_complext>*, cuNDArray<double_complext>*);
+
diff --git a/toolboxes/solvers/gpu/cuSolverUtils.h b/toolboxes/solvers/gpu/cuSolverUtils.h
new file mode 100644
index 0000000..b7d701a
--- /dev/null
+++ b/toolboxes/solvers/gpu/cuSolverUtils.h
@@ -0,0 +1,16 @@
+#pragma once
+#include "complext.h"
+#include "hoCuNDArray.h"
+#include "cuNDArray.h"
+#include "gpusolvers_export.h"
+
+namespace Gadgetron{
+
+template<class T> void EXPORTGPUSOLVERS solver_non_negativity_filter(cuNDArray<T>* x , cuNDArray<T>* g);
+
+
+template<class T> void EXPORTGPUSOLVERS updateF(cuNDArray<T>& data, typename realType<T>::Type alpha ,typename realType<T>::Type sigma);
+
+template<class T> void EXPORTGPUSOLVERS updateFgroup(std::vector<cuNDArray<T> >& datas, typename realType<T>::Type alpha ,typename realType<T>::Type sigma);
+
+}
diff --git a/toolboxes/solvers/gpu/gpusolvers_export.h b/toolboxes/solvers/gpu/gpusolvers_export.h
new file mode 100644
index 0000000..1dc1c3e
--- /dev/null
+++ b/toolboxes/solvers/gpu/gpusolvers_export.h
@@ -0,0 +1,18 @@
+/** \file gpusolvers_export.h
+    \brief Required definitions for Windows, importing/exporting dll symbols 
+*/
+
+#ifndef GPUSOLVERS_EXPORT_H_
+#define GPUSOLVERS_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_GPUSOLVERS__) || defined (gpusolvers_EXPORTS)
+#define EXPORTGPUSOLVERS __declspec(dllexport)
+#else
+#define EXPORTGPUSOLVERS __declspec(dllimport)
+#endif
+#else
+#define EXPORTGPUSOLVERS
+#endif
+
+#endif /* GPUSOLVERS_EXPORT_H_ */
diff --git a/toolboxes/solvers/gpu/hoCuCgSolver.h b/toolboxes/solvers/gpu/hoCuCgSolver.h
new file mode 100644
index 0000000..1995b3c
--- /dev/null
+++ b/toolboxes/solvers/gpu/hoCuCgSolver.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include "cgSolver.h"
+
+#include "cgSolver.h"
+#include "hoNDArray_math.h"
+#include "hoCuNDArray_math.h"
+
+namespace Gadgetron{
+
+  /** \class hoCuCgSolver
+      \brief Instantiation of the conjugate gradient solver for host arrays (hoCuNDArray).
+
+      The class hoCuCgSolver is a convenience wrapper for the device independent cgSolver class.
+      hoCuCgSolver instantiates the cgSolver for type hoCuNDArray<T>.
+  */
+  template <class T> class hoCuCgSolver : public cgSolver< hoCuNDArray<T> >
+  {
+  public:
+    hoCuCgSolver() : cgSolver<hoCuNDArray<T> >(), _it(0) {}
+    virtual ~hoCuCgSolver() {}
+
+    /* TSS: This is too expensive to do in general. Move responsibility of dumping to the apps.
+    virtual void solver_dump(hoCuNDArray<T>* x){
+    	std::stringstream ss;
+			ss << "iteration-" << _it << ".real";
+			write_nd_array(x,ss.str().c_str());
+			_it++;
+      }*/
+
+  private:
+    int _it;
+  };
+}
diff --git a/toolboxes/solvers/gpu/hoCuGpBbSolver.h b/toolboxes/solvers/gpu/hoCuGpBbSolver.h
new file mode 100644
index 0000000..4db1aee
--- /dev/null
+++ b/toolboxes/solvers/gpu/hoCuGpBbSolver.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include "gpBbSolver.h"
+#include "hoNDArray_math.h"
+#include "real_utilities.h"
+#include "vector_td_utilities.h"
+#include "hoSolverUtils.h"
+
+#ifdef USE_OMP
+#include <omp.h>
+#endif
+
+namespace Gadgetron{
+
+  template <class T> class hoCuGpBbSolver : public gpBbSolver< hoCuNDArray<T> >
+  {  
+  public:
+
+    hoCuGpBbSolver() : gpBbSolver< hoCuNDArray<T> >() {};
+    virtual ~hoCuGpBbSolver() {};
+
+  };
+}
diff --git a/toolboxes/solvers/gpu/hoCuNlcgSolver.h b/toolboxes/solvers/gpu/hoCuNlcgSolver.h
new file mode 100644
index 0000000..9ab5f7d
--- /dev/null
+++ b/toolboxes/solvers/gpu/hoCuNlcgSolver.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "hoNDArray_math.h"
+#include "hoCuNDArray_math.h"
+#include "hoNDArray_fileio.h"
+#include "complext.h"
+#include "nlcgSolver.h"
+#include "hoSolverUtils.h"
+
+namespace Gadgetron{
+
+template<class T> class hoCuNlcgSolver: public nlcgSolver<hoCuNDArray<T> >{
+	typedef typename realType<T>::Type REAL;
+public:
+	hoCuNlcgSolver():nlcgSolver<hoCuNDArray<T> >(){
+
+	}
+
+	virtual ~hoCuNlcgSolver(){};
+
+  virtual void iteration_callback(hoCuNDArray<T>* x,int i,REAL data_res,REAL reg_res){
+	  /*
+	  if (i == 0){
+		  std::ofstream textFile("residual.txt",std::ios::trunc);
+	  	  textFile << data_res << std::endl;
+	  } else{
+		  std::ofstream textFile("residual.txt",std::ios::app);
+		  textFile << data_res << std::endl;
+	  }
+	  std::stringstream ss;
+	  ss << "iteration-" << i << ".real";
+	  write_nd_array(x,ss.str().c_str());*/
+  };
+};
+}
diff --git a/toolboxes/solvers/gpu/hoCuSbcCgSolver.h b/toolboxes/solvers/gpu/hoCuSbcCgSolver.h
new file mode 100644
index 0000000..ff29e6f
--- /dev/null
+++ b/toolboxes/solvers/gpu/hoCuSbcCgSolver.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "hoCuCgSolver.h"
+#include "sbcSolver.h"
+
+#include "complext.h"
+
+namespace Gadgetron{
+
+  template <class T> class hoCuSbcCgSolver : public sbcSolver< hoCuNDArray<typename realType<T>::Type >, hoCuNDArray<T>, hoCuCgSolver<T> >
+  {
+  public:
+    hoCuSbcCgSolver() : sbcSolver<hoCuNDArray<typename realType<T>::Type >, hoCuNDArray<T>, hoCuCgSolver<T> >() {}
+    virtual ~hoCuSbcCgSolver() {}
+  };
+}
diff --git a/toolboxes/solvers/lbfgsSolver.h b/toolboxes/solvers/lbfgsSolver.h
new file mode 100644
index 0000000..e04daaf
--- /dev/null
+++ b/toolboxes/solvers/lbfgsSolver.h
@@ -0,0 +1,824 @@
+#pragma once
+
+#include "gpSolver.h"
+#include "linearOperatorSolver.h"
+#include "real_utilities.h"
+#include "complext.h"
+#include "cgPreconditioner.h"
+
+#include <vector>
+#include <iostream>
+#include <numeric>
+#include <list>
+
+namespace Gadgetron{
+/** Limited-memory BFGS (L-BFGS) solver, adapted from Numerical Optimization (Nocedal and Wright, 1999).
+ *
+ */
+
+template <class ARRAY_TYPE> class lbfgsSolver : public gpSolver<ARRAY_TYPE>
+{
+
+
+protected:
+	typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+	typedef typename realType<ELEMENT_TYPE>::Type REAL;
+	typedef ARRAY_TYPE ARRAY_CLASS;
+	typedef gpSolver<ARRAY_TYPE> GP;
+	typedef typename gpSolver<ARRAY_TYPE>::l1GPRegularizationOperator l1GPRegularizationOperator;
+
+public:
+
+	lbfgsSolver(): gpSolver<ARRAY_TYPE>() {
+		iterations_ = 10;
+		tc_tolerance_ = (REAL)1e-7;
+		non_negativity_constraint_=false;
+		dump_residual = false;
+		threshold= REAL(1e-7);
+		m_ = 3;
+		rho = 0.5f;
+	}
+
+	virtual ~lbfgsSolver(){}
+
+
+	virtual void set_rho(REAL _rho){
+		rho = _rho;
+	}
+
+	/***
+	 * @brief Sets the number of previous update pairs used to estimate the inverse Hessian. Memory usage increases linearly with m.
+	 * @param m
+	 */
+	virtual void set_m(unsigned int m){
+		m_ = m;
+	}
+
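+	// solve() minimizes the weighted data-fidelity term plus the linear and nonlinear
+	// regularization penalties using L-BFGS search directions and a Hager-Zhang style
+	// approximate-Wolfe line search; when the non-negativity constraint is enabled the
+	// accepted step is projected onto x >= 0 and the line search is repeated along the
+	// projected direction.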
+	virtual boost::shared_ptr<ARRAY_TYPE> solve(ARRAY_TYPE* in)
+	{
+		if( this->encoding_operator_.get() == 0 ){
+			throw std::runtime_error("Error: lbfgsSolver::solve : no encoding operator is set" );
+		}
+
+		// Get image space dimensions from the encoding operator
+		//
+
+		boost::shared_ptr< std::vector<size_t> > image_dims = this->encoding_operator_->get_domain_dimensions();
+		if( image_dims->size() == 0 ){
+			throw std::runtime_error("Error: lbfgsSolver::solve : encoding operator has not set domain dimension" );
+		}
+
+		ARRAY_TYPE * x = new ARRAY_TYPE(image_dims.get()); //The image. Will be returned inside a shared_ptr
+
+		ARRAY_TYPE g(image_dims.get()); //Contains the gradient of the current step
+		ARRAY_TYPE g_old(image_dims.get()); //Contains the gradient of the previous step
+
+
+		ARRAY_TYPE g_linear(image_dims.get()); //Contains the linear part of the gradient;
+
+		//If a prior image was given, use it for the initial guess.
+		if (this->x0_.get()){
+			*x = *(this->x0_.get());
+		} else  {
+			clear(x);
+		}
+
+		// Contains the encoding space of the linear regularization operators
+		std::vector<ARRAY_TYPE> regEnc;
+
+		//Initialize encoding space
+		for (int i = 0; i < this->regularization_operators_.size(); i++){
+			regEnc.push_back(ARRAY_TYPE(this->regularization_operators_[i]->get_codomain_dimensions()));
+			if (reg_priors[i].get()){
+				regEnc.back() = *reg_priors[i];
+				regEnc.back() *= -std::sqrt(this->regularization_operators_[i]->get_weight());
+			}
+
+		}
+		std::vector<ARRAY_TYPE> regEnc2 = regEnc;
+
+		ARRAY_TYPE d(image_dims.get()); //Search direction.
+		clear(&d);
+
+		ARRAY_TYPE encoding_space(in->get_dimensions().get()); //Contains the encoding space, or, equivalently, the residual vector
+
+		ARRAY_TYPE g_step(image_dims.get()); //Linear part of the gradient of the step d will be stored here
+
+		ARRAY_TYPE encoding_space2(in->get_dimensions().get());
+		REAL reg_res,data_res;
+
+
+		std::list<bfgsPair> subspace;
+
+		if( this->output_mode_ >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ){
+			GDEBUG_STREAM("Iterating..." << std::endl);
+		}
+		REAL grad_norm0;
+
+		for (int i = 0; i < iterations_; i++){
+			if (i==0){
+				if (this->x0_.get()){
+					this->encoding_operator_->mult_M(x,&encoding_space);
+
+				} else clear(&encoding_space);
+				encoding_space -= *in;
+				this->encoding_operator_->mult_MH(&encoding_space,&g_linear);
+
+				g_linear *=  this->encoding_operator_->get_weight();
+				data_res = std::sqrt(this->encoding_operator_->get_weight())*real(dot(&encoding_space,&encoding_space));
+
+				calc_regMultM(x,regEnc);
+				for (int n = 0; n < regEnc.size(); n++)
+					if (reg_priors[n].get())
+						axpy(-std::sqrt(this->regularization_operators_[n]->get_weight()),reg_priors[n].get(),&regEnc[n]);
+				add_linear_gradient(regEnc,&g_linear);
+				g = g_linear;
+				this->add_gradient(x,&g);
+
+				reg_res=REAL(0);
+
+			}else {
+				data_res = real(dot(&encoding_space,&encoding_space));
+			}
+
+
+
+			if (non_negativity_constraint_) solver_non_negativity_filter(x,&g);
+			if (i==0) grad_norm0=nrm2(&g);
+			REAL grad_norm = nrm2(&g);
+			if( this->output_mode_ >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ){
+
+				GDEBUG_STREAM("Iteration " <<i << ". Relative gradient norm: " <<  grad_norm/grad_norm0 << std::endl);
+			}
+
+			lbfgs_update(&g,&d,subspace);
+
+			if (this->precond_.get()){
+				this->precond_->apply(&d,&d);
+				this->precond_->apply(&d,&d);
+			}
+
+
+			this->encoding_operator_->mult_M(&d,&encoding_space2);
+
+			calc_regMultM(&d,regEnc2);
+
+
+
+			this->encoding_operator_->mult_MH(&encoding_space2,&g_step);
+			g_step *= this->encoding_operator_->get_weight();
+
+
+			add_linear_gradient(regEnc2,&g_step);
+
+			REAL gd = real(dot(&g,&d));
+
+			REAL alpha0=REAL(1);
+
+			//In the linear or semi-linear case, we can calculate the ideal step size.
+			if (this->operators.size() == 0) alpha0 = -real(dot(&encoding_space,&encoding_space2)+calc_dot(regEnc,regEnc2))/real(dot(&encoding_space2,&encoding_space2)+calc_dot(regEnc2,regEnc2));
+
+			REAL alpha;
+			REAL old_norm = functionValue(&encoding_space,regEnc,x);
+
+
+
+			g_old = g;
+
+
+
+
+			{
+				FunctionEstimator f(&encoding_space,&encoding_space2,&regEnc,&regEnc2,x,&d,&g_linear,&g_step,this);
+				//alpha=backtracking(f,alpha0,gd,rho,old_norm);
+					alpha=cg_linesearch(f,alpha0,gd,old_norm);
+				if (alpha == 0) {
+					std::cerr << "Linesearch failed, returning current iteration" << std::endl;
+					return boost::shared_ptr<ARRAY_TYPE>(x);
+				}
+			}
+
+			GDEBUG_STREAM("Alpha : " << alpha << std::endl);
+
+
+
+			if (non_negativity_constraint_){
+				//Restore encoding space and gradient. Why not keep a copy? Memory!
+				axpy(-alpha,&encoding_space2,&encoding_space);
+				reg_axpy(-alpha,regEnc2,regEnc);
+				axpy(-alpha,&g_step,&g_linear);
+
+				ARRAY_TYPE x2 = *x;
+				axpy(alpha,&d,&x2);
+
+				clamp_min(&x2,REAL(0));
+
+				d = x2;
+				d -= *x;
+				gd = real(dot(&g,&d));
+				x2 = *x;
+				alpha0 = 1;
+				this->encoding_operator_->mult_M(&d,&encoding_space2);
+				calc_regMultM(&d,regEnc2);
+
+
+				this->encoding_operator_->mult_MH(&encoding_space2,&g_step);
+				g_step *= this->encoding_operator_->get_weight();
+				add_linear_gradient(regEnc2,&g_step);
+
+				FunctionEstimator f(&encoding_space,&encoding_space2,&regEnc,&regEnc2,x,&d,&g_linear,&g_step,this);
+				//alpha=gold(f,0,alpha0*1.5);
+				//alpha = wolfesearch(f,alpha0,gd,rho,old_norm);
+				//alpha = backtracking(f,alpha0,gd,rho,old_norm);
+
+				alpha = cg_linesearch(f,alpha0,gd,old_norm);
+				axpy(alpha,&d,x);
+				if (alpha == 0){
+					std::cerr << "Linesearch failed, returning current iteration" << std::endl;
+					return boost::shared_ptr<ARRAY_TYPE>(x);
+				}
+			} else {
+				axpy(alpha,&d,x);
+
+			}
+
+
+
+
+
+
+			REAL f = functionValue(&encoding_space,regEnc,x);
+			GDEBUG_STREAM("Function value: " << f << std::endl);
+
+			g = g_linear;
+
+			this->add_gradient(x,&g);
+
+
+			//Expand current BFGS subspace with new pair
+			bfgsPair pair;
+			if (subspace.size() == m_){
+				pair=subspace.back();
+				subspace.pop_back();
+				*(pair.s) = d;
+				*(pair.y) = g;
+			} else {
+				pair.s = boost::shared_ptr<ARRAY_TYPE>(new ARRAY_TYPE(d));
+				pair.y = boost::shared_ptr<ARRAY_TYPE>(new ARRAY_TYPE(g));
+			}
+			*(pair.s) *= alpha;
+			*(pair.y) -= g_old;
+
+			pair.rho = dot(pair.s.get(),pair.y.get());
+
+			subspace.push_front(pair);
+
+
+			iteration_callback(x,i,f);
+
+
+			if (grad_norm/grad_norm0 < tc_tolerance_)  break;
+
+		}
+
+		return boost::shared_ptr<ARRAY_TYPE>(x);
+	}
+
+
+
+	// Set preconditioner
+	//
+	/*virtual void set_preconditioner( boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond ) {
+      precond_ = precond;
+      }*/
+
+	// Set/get maximally allowed number of iterations
+	//
+	virtual void set_max_iterations( unsigned int iterations ) { iterations_ = iterations; }
+	virtual unsigned int get_max_iterations() { return iterations_; }
+
+	// Set/get tolerance threshold for termination criterion
+	//
+	virtual void set_tc_tolerance( REAL tolerance ) { tc_tolerance_ = tolerance; }
+	virtual REAL get_tc_tolerance() { return tc_tolerance_; }
+
+	virtual void set_non_negativity_constraint(bool non_negativity_constraint){
+		non_negativity_constraint_=non_negativity_constraint;
+	}
+
+	virtual void set_dump_residual(bool dump_res){
+		dump_residual = dump_res;
+	}
+	// Set preconditioner
+	//
+
+	virtual void set_preconditioner( boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond ) {
+		precond_ = precond;
+	}
+
+	virtual void add_regularization_operator( boost::shared_ptr< linearOperator< ARRAY_TYPE> > op)
+	{
+		if( !op.get() ){
+			throw std::runtime_error( "Error: linearOperatorSolver::add_regularization_operator : NULL operator provided" );
+		}
+		this->regularization_operators_.push_back(op);
+		reg_priors.push_back(boost::shared_ptr<ARRAY_TYPE>((ARRAY_TYPE*)0));
+	}
+
+	virtual void add_regularization_operator( boost::shared_ptr< linearOperator< ARRAY_TYPE> > op,boost::shared_ptr<ARRAY_TYPE> prior)
+	{
+		if( !op.get() ){
+			throw std::runtime_error( "Error: linearOperatorSolver::add_regularization_operator : NULL operator provided" );
+		}
+
+		this->regularization_operators_.push_back(op);
+		reg_priors.push_back(prior);
+	}
+
+	virtual void add_regularization_operator(boost::shared_ptr< linearOperator<ARRAY_TYPE> > op, int L_norm ){
+		if (L_norm==1){
+
+			this->operators.push_back(boost::shared_ptr< l1GPRegularizationOperator>(new l1GPRegularizationOperator(op)));
+		}else{
+			add_regularization_operator(op);
+		}
+	}
+
+
+	virtual void add_regularization_operator(boost::shared_ptr< linearOperator<ARRAY_TYPE> > op, boost::shared_ptr<ARRAY_TYPE> prior, int L_norm ){
+		if (L_norm==1){
+			this->operators.push_back(boost::shared_ptr<l1GPRegularizationOperator>(new l1GPRegularizationOperator(op,prior)));
+		}else{
+			add_regularization_operator(op,prior);
+		}
+	}
+
+
+protected:
+	typedef typename std::vector<boost::shared_ptr<linearOperator<ARRAY_TYPE> > >::iterator  csIterator;
+	typedef typename std::vector< std::vector<boost::shared_ptr<linearOperator<ARRAY_TYPE> > > >::iterator csGroupIterator;
+
+	virtual void iteration_callback(ARRAY_TYPE* x ,int iteration,REAL value){};
+
+
+
+
+	ELEMENT_TYPE calc_dot(std::vector<ARRAY_TYPE>& x,std::vector<ARRAY_TYPE>& y){
+		ELEMENT_TYPE res(0);
+		for (int  i = 0; i < x.size(); i++)
+			res += dot(&x[i],&y[i]);
+		return res;
+	}
+
+	void add_linear_gradient(std::vector<ARRAY_TYPE>& elems, ARRAY_TYPE* g){
+		ARRAY_TYPE tmp(g->get_dimensions());
+		for (int i = 0; i <elems.size(); i++){
+			this->regularization_operators_[i]->mult_MH(&elems[i],&tmp);
+			axpy(std::sqrt(this->regularization_operators_[i]->get_weight()),&tmp,g);
+		}
+	}
+
+	void calc_regMultM(ARRAY_TYPE* x,std::vector<ARRAY_TYPE>& elems){
+		for (int i = 0; i <elems.size(); i++){
+			this->regularization_operators_[i]->mult_M(x,&elems[i]);
+			elems[i] *= std::sqrt(this->regularization_operators_[i]->get_weight());
+		}
+	}
+
+	void reg_axpy(REAL alpha, std::vector<ARRAY_TYPE>& x, std::vector<ARRAY_TYPE>& y){
+		for (int i = 0; i <x.size(); i++){
+			axpy(alpha,&x[i],&y[i]);
+
+		}
+	}
+	struct bfgsPair{
+		boost::shared_ptr<ARRAY_TYPE> s;
+		boost::shared_ptr<ARRAY_TYPE> y;
+		ELEMENT_TYPE rho;
+	};
+
+	/***
+	 * @brief L-BFGS update, following algorithm 9.2 in Numerical Optimization
+	 * @param[in] g gradient
+	 * @param[out] d search direction
+	 * @param[in] pairs
+	 */
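+	// Two-loop recursion: for the stored pairs (s_i, y_i), newest first,
+	//   alpha_i = (s_i . q)/(s_i . y_i);  q -= alpha_i y_i;
+	// then q is scaled by (s . y)/(y . y) for the newest pair, and on the way back
+	//   beta_i = (y_i . q)/(s_i . y_i);  q += (alpha_i - beta_i) s_i.
+	// Note that pair.rho stores s . y itself, not its reciprocal, and the result is negated
+	// at the end to give a descent direction.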
+	void lbfgs_update(ARRAY_TYPE* g, ARRAY_TYPE* d, std::list<bfgsPair>& pairs){
+		*d = *g;
+
+		if (pairs.size() > 0){
+			std::list<ELEMENT_TYPE> alpha_list;
+			for (typename std::list<bfgsPair>::iterator it = pairs.begin(); it != pairs.end(); ++it){
+				ELEMENT_TYPE alpha = dot(it->s.get(),d)/it->rho;
+				axpy(-alpha,it->y.get(),d);
+				alpha_list.push_back(alpha);
+			}
+
+			bfgsPair front = pairs.front();
+			ELEMENT_TYPE gamma = front.rho/dot(front.y.get(),front.y.get());
+			*d *= gamma;
+
+			typename std::list<ELEMENT_TYPE>::reverse_iterator alpha_it = alpha_list.rbegin();
+			//Reverse iteration
+			for (typename std::list<bfgsPair>::reverse_iterator it = pairs.rbegin(); it != pairs.rend(); ++it, ++alpha_it){
+				ELEMENT_TYPE beta = dot(it->y.get(),d)/it->rho;
+				ELEMENT_TYPE alpha = *alpha_it;
+				axpy(alpha-beta,it->s.get(),d);
+			}
+		}
+		*d *= REAL(-1);
+
+	}
+
+
+
+
+
+	class FunctionEstimator{
+	public:
+
+		FunctionEstimator(ARRAY_TYPE* _encoding_space,ARRAY_TYPE* _encoding_step,std::vector<ARRAY_TYPE>* _regEnc,std::vector<ARRAY_TYPE>* _regEnc_step, ARRAY_TYPE * _x, ARRAY_TYPE * _d, ARRAY_TYPE * _g, ARRAY_TYPE * _g_step, lbfgsSolver<ARRAY_TYPE> * _parent)
+	{
+			encoding_step = _encoding_step;
+			encoding_space = _encoding_space;
+			regEnc = _regEnc;
+			regEnc_step = _regEnc_step;
+			x = _x;
+			xtmp = *x;
+			d = _d;
+			parent = _parent;
+			alpha_old = 0;
+			g = _g;
+			g_step = _g_step;
+
+	}
+
+
+
+		REAL operator () (REAL alpha){
+			axpy(alpha-alpha_old,encoding_step,encoding_space);
+
+			axpy(alpha-alpha_old,g_step,g);
+			parent->reg_axpy(alpha-alpha_old,*regEnc_step,*regEnc);
+			axpy(alpha-alpha_old,d,&xtmp);
+
+			alpha_old = alpha;
+			REAL res = parent->functionValue(encoding_space,*regEnc,&xtmp);
+			return res;
+
+		}
+
+		ELEMENT_TYPE dir_deriv(){
+			ARRAY_TYPE g_tmp = *g;
+			parent->add_gradient(&xtmp,&g_tmp);
+			return dot(d,&g_tmp);
+		}
+
+
+
+
+
+
+	private:
+
+		REAL alpha_old;
+		ARRAY_TYPE* encoding_step;
+		ARRAY_TYPE * encoding_space;
+		std::vector<ARRAY_TYPE>* regEnc;
+		std::vector<ARRAY_TYPE>* regEnc_step;
+		ARRAY_TYPE* x, *d;
+		ARRAY_TYPE* g, *g_step;
+
+		lbfgsSolver<ARRAY_TYPE>* parent;
+		ARRAY_TYPE xtmp;
+
+
+	};
+	friend class FunctionEstimator;
+
+	/***
+	 * @brief Golden-section search algorithm. Only works with unimodal functions, which we assume we are dealing with, at least locally.
+	 * @param f Functor to calculate the function to minimize
+	 * @param a Start of the bracketing
+	 * @param d End of bracketing
+	 * @return Value minimizing the function f.
+	 */
+	REAL gold(FunctionEstimator& f, REAL a, REAL d){
+		// Golden-ratio conjugate 2/(1+sqrt(5)) ~= 0.618; b and c divide [a,d] in golden-section proportions.
+		const REAL gold = 2.0/(1.0+std::sqrt(5.0));
+
+		REAL b = d-(d-a)*gold;
+		REAL c = a+(d-a)*gold;
+
+		REAL fa = f(a);
+		REAL fb = f(b);
+		REAL fc = f(c);
+		REAL fd = f(d);
+		REAL tol = 1e-6;
+
+		while (abs(a-d) > tol*(abs(b)+abs(c))){
+			if (fb > fc){
+				a = b;
+				fa = fb;
+				b = c;
+				fb = fc;
+				c= b*gold+(1.0-gold)*d;
+				fc = f(c);
+			} else {
+				d = c;
+				fd = fc;
+				c = b;
+				fc = fb;
+				b = c*gold+(1-gold)*a;
+				fb = f(b);
+			}
+		}
+		if (fb < fc){
+			f(b);
+			return b;
+		}else {
+			f(c);
+			return c;
+		}
+	}
+
+	/***
+	 * Armijo type linesearch
+	 * @param f
+	 * @param alpha0
+	 * @param gd
+	 * @param rho
+	 * @param old_norm
+	 * @return
+	 */
+	REAL backtracking(FunctionEstimator& f, const REAL alpha0, const REAL gd, const REAL rho, const REAL old_norm){
+		REAL alpha;
+		REAL delta=1e-4;
+		REAL sigma=0.9;
+		//REAL precision = 0.0003; //Estimated precision of function evaluation
+		REAL precision = 1e-4f; //Estimated precision of function evaluation
+		bool wolfe = false;
+		int  k=0;
+
+		while (not wolfe){
+			alpha=alpha0*std::pow(rho,k);
+			//if (f(alpha) <= old_norm+alpha*delta*gd) wolfe = true;//Strong Wolfe condition..
+			REAL fa = f(alpha);
+			ELEMENT_TYPE dir_deriv = f.dir_deriv();
+			if (((2*delta-1.0)*real(gd) >= real(dir_deriv)) && (fa < (old_norm+precision))) wolfe=true; //Approx Wolfe condition from Hager, W. and Zhang, H., SIAM Journal on Optimization 2005, 16:1, 170-192
+			if (abs(dir_deriv) > sigma*abs(gd)) wolfe = false;//Strong Wolfe condition..
+			k++;
+			if (alpha == 0){
+				//GDEBUG_STREAM("Backtracking search failed, switching to slow wolfe-search" << std::endl);
+				//return wolfesearch(f,alpha0,gd,rho,old_norm);
+				return 0;
+			}
+		}
+
+		return alpha;
+
+	}
+
+	/***
+	 * Line search taken from Numerical Optimization (Nocedal and Wright, 1999).
+	 * Adapted from the scipy optimize algorithm.
+	 * Like the golden-section method it works quite poorly in practice.
+	 * @param f
+	 * @param alpha0
+	 * @param gd
+	 * @param rho
+	 * @param old_norm
+	 * @return
+	 */
+	REAL wolfesearch(FunctionEstimator& f, const REAL alpha_init, const REAL gd, const REAL rho, const REAL old_norm){
+		using std::sqrt;
+		using std::abs;
+		REAL delta=0.01;
+		unsigned int k=0;
+		REAL alpha0 = alpha_init;
+		REAL f0 = f(alpha0);
+
+		if (f0 <= old_norm+alpha0*delta*gd){//Strong Wolfe condition..
+			return alpha0;
+		}
+
+
+		REAL alpha1 = -gd*alpha0*alpha0/2.0/(f0-old_norm-gd*alpha0);
+		//GDEBUG_STREAM("F0 " <<f0 << " old " << old_norm << " gd " << gd <<std::endl);
+		GDEBUG_STREAM("Alpha0: "  << alpha0 << std::endl);
+		//GDEBUG_STREAM("Alpha1: "  << alpha1 << std::endl);
+		REAL f1 = f(alpha1);
+
+
+		if (f1 <= old_norm+alpha1*delta*gd){//Strong Wolfe condition..
+			return alpha1;
+		}
+
+
+		while (alpha1 > 0){
+			double factor = alpha0*alpha0*alpha1*alpha1*(alpha1-alpha0);
+			double a = alpha0*alpha0*(f1-old_norm-gd*alpha1) - alpha1*alpha1*(f0-old_norm-gd*alpha0);
+			a /= factor;
+
+			double b = -alpha0*alpha0*alpha0*(f1-old_norm-gd*alpha1) + alpha1*alpha1*alpha1*(f0-old_norm-gd*alpha0);
+			b /= factor;
+
+			double alpha2 = (-b+std::sqrt(std::abs(b*b-3*a*gd)))/(3*a);
+			REAL f2 = f(alpha2);
+			//GDEBUG_STREAM("a " << a << "b " << b << std::endl);
+			GDEBUG_STREAM("Alpha1: "  << alpha1 << std::endl);
+			GDEBUG_STREAM("Alpha2: "  << alpha2 << std::endl);
+			if (f2 < old_norm+alpha2*delta*gd){//Strong Wolfe condition..
+				return alpha2;
+			}
+
+			if (((alpha1-alpha2) > (alpha1/2.0)) || ((1.0-alpha2/alpha1) < 0.96)){
+				alpha2 = alpha1 / 2.0;
+			}
+
+			alpha0 = alpha1;
+			alpha1 = alpha2;
+			f0 = f1;
+			f1 = f2;
+			k++;
+
+
+		}
+
+		throw std::runtime_error("Wolfe line search failed");
+
+
+	}
+
+
+
+	/***
+	 * CG linesearch adapted from Hager, W. and Zhang, H., SIAM Journal on Optimization 2005, 16:1, 170-192
+	 * @param f
+	 * @param alpha0
+	 * @param gd
+	 * @param old_norm
+	 * @return
+	 */
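+	// The acceptance test is the approximate Wolfe condition of Hager and Zhang:
+	//   (2*delta-1)*phi'(0) >= phi'(alpha)  and  phi(alpha) <= phi(0) + precision,
+	// combined with the curvature condition |phi'(alpha)| <= sigma*|phi'(0)|, where phi is the
+	// objective along the search direction. secant2/interval_update shrink the bracket [a,b].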
+	REAL cg_linesearch(FunctionEstimator& f, const REAL alpha0, const REAL gd, const REAL old_norm){
+		REAL delta=0.1;
+		REAL sigma=0.9;
+		REAL nabla=0.66;
+		//REAL precision = 0.0003; //Estimated precision of function evaluation
+		REAL precision = 1e-4f; //Estimated precision of function evaluation
+
+
+
+
+		REAL a=0;
+		REAL b = alpha0;
+
+		REAL ak = a;
+		REAL bk = b;
+		REAL fa = old_norm;
+		ELEMENT_TYPE a_deriv = gd;
+		REAL fb = f(alpha0);
+		ELEMENT_TYPE b_deriv = f.dir_deriv();
+
+		while (abs(a-b) > 0){
+			if ((((2*delta-1.0)*real(gd) >= real(b_deriv)) && (fb < old_norm+precision)) && //Check Approximate Wolfe conditions
+					(abs(b_deriv) <= sigma*abs(gd))){
+				f(b);
+				return b;
+			}
+
+			if ((((2*delta-1.0)*real(gd) >= real(a_deriv)) && (fa < old_norm+precision)) && //Check Approximate Wolfe conditions
+					(abs(a_deriv) <= sigma*abs(gd))){
+				f(a);
+				return a;
+			}
+
+			secant2(a,b,f,old_norm+precision);
+			if ((b-a) > nabla*(bk-ak)) {
+				REAL c = (a+b)/2;
+				interval_update(a,b,c,f,old_norm);
+			}
+			if (a != ak){
+				fa = f(a);
+				a_deriv = f.dir_deriv();
+			}
+
+			if (b != bk){
+				fb = f(b);
+				b_deriv = f.dir_deriv();
+			}
+
+			ak = a;
+			bk = b;
+
+			GDEBUG_STREAM("a: " << a << " b: " << b << std::endl);
+		}
+		return 0;
+		//throw std::runtime_error("CG_linesearch failed");
+
+	}
+
+
+	void secant2(REAL& a, REAL& b,FunctionEstimator& f,REAL old_norm){
+		REAL fa = f(a);
+		ELEMENT_TYPE dfa = f.dir_deriv();
+		REAL fb = f(b);
+		ELEMENT_TYPE dfb = f.dir_deriv();
+
+		REAL c= real((a*dfb-b*dfa)/(dfb-dfa));
+
+		REAL fc = f(c);
+		ELEMENT_TYPE dfc = f.dir_deriv();
+
+		REAL A=a;
+		REAL B = b;
+
+		interval_update(A,B,c,f,old_norm);
+
+		if (c == B){
+			c= real((b*dfc-c*dfb)/(dfc-dfb));
+			interval_update(A,B,c,f,old_norm);
+		} if (c == A){
+			c= real((a*dfc-c*dfa)/(dfc-dfa));
+			interval_update(A,B,c,f,old_norm);
+		}
+
+		a= A;
+		b = B;
+	}
+
+	void interval_update(REAL & a, REAL & b, REAL c,FunctionEstimator& f,REAL old_norm){
+		REAL theta = 0.5;
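+		// Interval update step of the Hager-Zhang bracketing; theta = 0.5 makes the inner loop a bisection.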
+		if (c < a || c > b) return; // C not in interval
+		REAL fc = f(c);
+		ELEMENT_TYPE dfc = f.dir_deriv();
+
+		if (real(dfc) >= 0){
+			b =c;
+			return;
+		}
+		if (fc < old_norm){
+			a = c;
+			return;
+		}
+		b =c;
+		while(true){
+			REAL d = (1-theta)*a+theta*b;
+			REAL fd = f(d);
+			ELEMENT_TYPE dfd = f.dir_deriv();
+
+			if (real(dfd) >= 0){
+				b = d;
+				return;
+			}
+			if (fd < old_norm){
+				a = d;
+			} else 	b = d;
+
+			GDEBUG_STREAM("Interval a: " << a << " b: " << b << std::endl);
+
+		}
+
+
+
+
+	}
+
+	REAL functionValue(ARRAY_TYPE* encoding_space,std::vector<ARRAY_TYPE>& regEnc, ARRAY_TYPE * x){
+		REAL res= std::sqrt(this->encoding_operator_->get_weight())*abs(dot(encoding_space,encoding_space));
+
+		for (int i = 0; i  < this->operators.size(); i++){
+			res += this->operators[i]->magnitude(x);
+		}
+
+		res += abs(calc_dot(regEnc,regEnc));
+		return res;
+
+	}
+
+
+
+
+
+
+protected:
+
+	// Preconditioner
+	//boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond_;
+	// Maximum number of iterations
+	unsigned int iterations_;
+	bool non_negativity_constraint_;
+	REAL tc_tolerance_;
+	REAL threshold;
+	bool dump_residual;
+	REAL rho;
+
+	unsigned int m_; // Number of copies to use.
+
+	// Preconditioner
+
+	std::vector<boost::shared_ptr<ARRAY_TYPE> > reg_priors;
+	boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond_;
+
+};
+}
diff --git a/toolboxes/solvers/linearOperatorSolver.h b/toolboxes/solvers/linearOperatorSolver.h
new file mode 100644
index 0000000..bbfbd7f
--- /dev/null
+++ b/toolboxes/solvers/linearOperatorSolver.h
@@ -0,0 +1,75 @@
+/** \file linearOperatorSolver.h
+    \brief Base class for all of Gadgetron's solvers operating on linear operators.
+*/
+
+#pragma once
+
+#include "solver.h"
+#include "linearOperator.h"
+
+#include <vector>
+#include <iostream>
+#include <stdexcept>
+
+namespace Gadgetron{
+
+  template <class ARRAY_TYPE> class linearOperatorSolver : public solver<ARRAY_TYPE, ARRAY_TYPE>
+  {
+    
+  public:
+
+    // Constructor
+    linearOperatorSolver() : solver<ARRAY_TYPE,ARRAY_TYPE>() {}
+  
+    // Destructor
+    virtual ~linearOperatorSolver() {}
+
+    // Add encoding operator to solver (only one allowed)
+    virtual void set_encoding_operator( boost::shared_ptr< linearOperator<ARRAY_TYPE> > op)
+    {
+      if( !op.get() ){
+        throw std::runtime_error( "Error: linearOperatorSolver::set_encoding_operator : NULL operator provided" );
+      }     
+      encoding_operator_ = op;    
+    }
+  
+    virtual boost::shared_ptr< linearOperator<ARRAY_TYPE> >
+    get_encoding_operator()
+    {
+      return encoding_operator_;
+    }  
+  
+    // Add linear operator to solver (in addition to the encoding operator)
+    virtual void add_regularization_operator( boost::shared_ptr< linearOperator< ARRAY_TYPE> > op)
+    {
+      if( !op.get() ){
+        throw std::runtime_error( "Error: linearOperatorSolver::add_regularization_operator : NULL operator provided" );
+      }    
+      regularization_operators_.push_back(op);
+    }
+  
+    virtual boost::shared_ptr< linearOperator< ARRAY_TYPE> >
+    get_regularization_operator( unsigned int i )
+    {
+      if( i >= get_number_of_regularization_operators() ){
+        throw std::runtime_error( "Error: linearOperatorSolver::get_regularization_operator : index out of range" );
+      }    
+      return regularization_operators_[i];
+    }  
+  
+    virtual unsigned int get_number_of_regularization_operators()
+    {
+      return regularization_operators_.size();
+    }
+    
+  protected:
+  
+    // Single encoding operator
+    boost::shared_ptr< linearOperator<ARRAY_TYPE> > encoding_operator_;
+  
+    // Vector of linear regularization operators
+    std::vector< boost::shared_ptr< linearOperator<ARRAY_TYPE> > > regularization_operators_;
+    typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL;
+  };
+}
diff --git a/toolboxes/solvers/lsqrSolver.h b/toolboxes/solvers/lsqrSolver.h
new file mode 100644
index 0000000..9831144
--- /dev/null
+++ b/toolboxes/solvers/lsqrSolver.h
@@ -0,0 +1,173 @@
+#pragma once
+
+
+#include "linearOperator.h"
+#include "linearOperatorSolver.h"
+#include "cgPreconditioner.h"
+#include "real_utilities.h"
+
+#include <vector>
+#include <iostream>
+#include "encodingOperatorContainer.h"
+
+namespace Gadgetron {
+template <class ARRAY_TYPE> class lsqrSolver: public linearOperatorSolver<ARRAY_TYPE>
+{
+protected:
+	typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+	typedef typename realType<ELEMENT_TYPE>::Type REAL;
+public:
+
+	lsqrSolver()  {
+		iterations_ = 10;
+		tc_tolerance_ = (REAL)1e-3;
+
+	}
+
+	virtual ~lsqrSolver() {}
+/*
+	virtual int set_preconditioner( boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond ) {
+		precond_ = precond;
+		return 0;
+	}
+*/
+	virtual void set_tc_tolerance( REAL tolerance ) { tc_tolerance_ = tolerance; }
+	virtual REAL get_tc_tolerance() { return tc_tolerance_; }
+
+	virtual void set_max_iterations( unsigned int iterations ) { iterations_ = iterations; }
+	virtual unsigned int get_max_iterations() { return iterations_; }
+
+
+
+
+
+	virtual boost::shared_ptr<ARRAY_TYPE> solve( ARRAY_TYPE *b )
+  {
+
+		boost::shared_ptr< std::vector<size_t> > image_dims = this->encoding_operator_->get_domain_dimensions();
+		if( image_dims->size() == 0 ){
+			throw std::runtime_error( "Error: lsqrSolver::solve : encoding operator has not set domain dimension" );
+		}
+
+
+
+
+		ARRAY_TYPE * x = new ARRAY_TYPE(image_dims);
+		clear(x);
+
+
+		encodingOperatorContainer<ARRAY_TYPE> enc_op;
+		boost::shared_ptr<ARRAY_TYPE> u;
+
+		{
+			enc_op.add_operator(this->encoding_operator_);
+			for (unsigned int i =0; i < this->regularization_operators_.size(); i++)
+				enc_op.add_operator(this->regularization_operators_[i]);
+			std::vector<ARRAY_TYPE*> encspace(this->regularization_operators_.size()+1,NULL);
+			encspace[0] = b;
+			u = enc_op.create_codomain(encspace);
+		}
+
+
+
+		//Initialise u vector
+		REAL beta = 0;
+
+		beta = nrm2(u.get());
+		*u *= REAL(1)/beta;
+
+		//Initialise v vector
+		REAL alpha = 0;
+		ARRAY_TYPE v(*x); //v vector is in image space
+
+		clear(&v);
+
+
+		enc_op.mult_MH(u.get(),&v);
+
+
+		alpha = nrm2(&v);
+
+		v *= REAL(1)/alpha;
+
+		//Initialise w vector
+		ARRAY_TYPE w(v);
+
+		//Initialise phibar
+		REAL phibar = beta;
+		REAL phibar0 = phibar;
+
+		//Initialise rhobar
+		REAL rhobar = alpha;
+		REAL rhobar0 = alpha;
+
+		REAL cg_res = alpha;
+
+		REAL rnorm = beta;
+
+		REAL xnorm = 0;
+		REAL anorm = 0;
+		REAL arnorm = alpha*beta;
+
+
+		for (int it = 0; it < iterations_; it ++){
+			beta = REAL(0);
+
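+			// One Golub-Kahan bidiagonalization step: beta*u = E*v - alpha*u, then alpha*v = E'*u - beta*v (E is the stacked operator).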
+			*u *= -alpha;
+
+			enc_op.mult_M(&v,u.get(),true);
+
+			beta =nrm2(u.get());
+			*u *= REAL(1)/beta;
+
+			v *= -beta;
+
+			enc_op.mult_MH(u.get(),&v,true);
+			alpha = nrm2(&v);
+
+			v *= REAL(1)/alpha;
+
+
+			//Construct and apply next orthogonal transformation
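+			// (Givens rotation of the bidiagonal system, as in Paige & Saunders' LSQR.)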
+			REAL rho = std::sqrt(norm(rhobar)+norm(beta));
+			REAL c = rhobar/rho;
+			REAL s = beta/rho;
+			REAL theta = s*alpha;
+			rhobar = -c*alpha;
+			REAL phi = c*phibar;
+			phibar *= s;
+
+
+			//Update x, w
+			axpy(phi/rho,&w,x);  //x = x + phi/rho * w
+
+			w *= -theta/rho;
+			w += v;
+
+			//Check for convergence
+
+			//rhobar is a good approximation of the Euclidean norm of the residual, so we check for that
+
+			if( this->output_mode_ >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ){
+				GDEBUG_STREAM("Iteration " <<it << ". Relative residual: " <<  rhobar/rhobar0 << std::endl);
+			}
+
+		}
+
+
+
+		return boost::shared_ptr<ARRAY_TYPE>(x);
+
+}
+
+
+
+protected:
+
+	//boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond_;
+	unsigned int iterations_;
+	REAL tc_tolerance_;
+
+};
+
+}
diff --git a/toolboxes/solvers/lwSolver.h b/toolboxes/solvers/lwSolver.h
new file mode 100644
index 0000000..0ad5d58
--- /dev/null
+++ b/toolboxes/solvers/lwSolver.h
@@ -0,0 +1,214 @@
+/*
+  An implementation of the "Generalized Landweber Solver" based on the paper
+  "Theory and methods related to the singular-function expansion and Landweber's iteration..."
+  by O.N. Strand, SIAM J. Numer. Anal. 1974;11(4):798-825.
+*/
+
+#pragma once
+
+#include "linearOperatorSolver.h"
+
+#include <vector>
+#include <iostream>
+
+namespace Gadgetron{
+
+  template <class ARRAY_TYPE> class lwSolver
+    : public linearOperatorSolver< ARRAY_TYPE>
+  {
+
+  protected:
+    typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL;
+
+  public:
+
+    // Constructor
+    lwSolver() : linearOperatorSolver<ARRAY_TYPE>() {
+      iterations_ = 3;
+      alpha_ = REAL(1);
+    }
+  
+    // Destructor
+    virtual ~lwSolver() {}
+  
+    // Set/get maximally allowed number of iterations
+    virtual void set_max_iterations( unsigned int iterations ) { iterations_ = iterations; }
+    virtual unsigned int get_max_iterations() { return iterations_; }  
+
+    // Set/get alpha.
+    // Optimally set alpha to 1/(sigma^2), sigma being the largest singular value of the "sum of operators"
+    virtual void set_alpha( REAL alpha ) { alpha_ = alpha; }
+    virtual REAL get_alpha() { return alpha_; }  
+
+    // Inherited solver interface
+    virtual boost::shared_ptr<ARRAY_TYPE> solve( ARRAY_TYPE *b )
+    {   
+      // Initial validity checks
+      //
+
+      std::vector<unsigned int> image_dims = *this->encoding_operator_->get_domain_dimensions();
+
+      if( image_dims.size() == 0 ){
+	throw std::runtime_error("Error: lwSolver::solve : domain dimensions not set on encoding operator" );
+      }
+        
+      // Allocate solution array.
+      // Clear or set to x0 if provided
+      //
+
+      boost::shared_ptr<ARRAY_TYPE> x( new ARRAY_TYPE() );
+      if( this->get_x0().get() ){
+	*x = *(this->get_x0());
+      }
+      else{
+    	x->create( &image_dims );
+    	x->clear();
+      }    
+
+      ARRAY_TYPE x_prev;
+
+      // Main solver iteration loop
+      //
+    
+      for( unsigned int iteration=0; iteration<iterations_; iteration++ ){
+      
+	// Keep previous x for convergence reporting
+	// 
+
+	if( this->output_mode_ >= solver<ARRAY_TYPE, ARRAY_TYPE>::OUTPUT_VERBOSE ){
+	  x_prev = *x;
+
+	}
+      
+	// Compute residual image, i.e. A^T(b-Ax_k)
+	//
+      
+	boost::shared_ptr<ARRAY_TYPE> r = compute_residual_image( x.get(), b );
+
+	// Multiply residual with shaping matrix
+	//
+
+	boost::shared_ptr<ARRAY_TYPE> rr = apply_shaping_matrix( r.get() );
+
+	// Update x
+	//
+	axpy( get_alpha(), rr.get(), x.get() );
+      
+	if( this->output_mode_ >= solver<ARRAY_TYPE, ARRAY_TYPE>::OUTPUT_VERBOSE ){
+	  axpy( ELEMENT_TYPE(-1), x.get(), &x_prev );
+	  GDEBUG_STREAM(" iteration: " << iteration << ", delta x: " << solver_asum(&x_prev) << std::endl);
+	}      
+      }
+    
+      return x;
+    }
+
+  protected:
+    virtual boost::shared_ptr<ARRAY_TYPE> compute_residual_image( ARRAY_TYPE *x, ARRAY_TYPE *b )
+    {    
+      // Allocate some temporary storage and the result array
+      //
+    
+
+      boost::shared_ptr<ARRAY_TYPE> res = boost::shared_ptr<ARRAY_TYPE>(new ARRAY_TYPE(x->get_dimensions()));
+
+      ARRAY_TYPE tmp_M(b->get_dimensions());
+      ARRAY_TYPE tmp_acc(b->get_dimensions());
+        
+      // Clear accumulation buffer to b
+      tmp_acc = *b;
+    
+      // Apply encoding operator to current solution
+      this->encoding_operator_->mult_M( x, &tmp_M );
+    
+      // Find residual
+      axpy(REAL(-1), &tmp_M, &tmp_acc );
+    
+      // Adjoint residual    
+      this->encoding_operator_->mult_MH( &tmp_acc, res.get());
+      // Apply encoding operator weight
+      *res *= this->encoding_operator_->get_weight();
+    
+      return res;
+    }
+  
+    boost::shared_ptr<ARRAY_TYPE> apply_shaping_matrix( ARRAY_TYPE *r )
+    {
+      //
+      // Apply 6th order polynomial F(lambda) -- see paper referenced at top
+      //
+    
+      // The input residual r is modified (it is an internal implementation variable anyway)
+      //
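+      // Each apply_shape_matrix_mult_MH_M call below adds w*(alpha*M^H M) applied to the current r
+      // to res and then advances r one power, so the polynomial is accumulated term by term.
+      //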
+
+      // Memory allocation
+      std::vector<unsigned int> image_dims = *this->encoding_operator_->get_domain_dimensions();
+      boost::shared_ptr<ARRAY_TYPE> res = boost::shared_ptr<ARRAY_TYPE>(new ARRAY_TYPE(&image_dims ));
+    
+      // Handle 0th order   
+      *res = *r;
+      *res *=  REAL(31.5);
+
+      // Handle 1st order
+      apply_shape_matrix_mult_MH_M( r, res.get(), REAL(-315) );
+    
+      // Handle 2nd order
+      apply_shape_matrix_mult_MH_M( r, res.get(), REAL(1443.75) );
+
+      // Handle 3rd order
+      apply_shape_matrix_mult_MH_M( r, res.get(), REAL(-3465) );
+    
+      // Handle 4th order
+      apply_shape_matrix_mult_MH_M( r, res.get(), REAL(4504.5) );
+    
+      // Handle 5th order
+      apply_shape_matrix_mult_MH_M( r, res.get(), REAL(-3003) );
+    
+      // Handle 6th order
+      apply_shape_matrix_mult_MH_M( r, res.get(), REAL(804.375) );
+    
+      // Return result
+      return res;
+    }
+
+    void apply_shape_matrix_mult_MH_M( ARRAY_TYPE *r, ARRAY_TYPE *acc, REAL w )
+    {
+      // Temporary storage
+      std::vector<unsigned int> image_dims = *this->encoding_operator_->get_domain_dimensions();
+      ARRAY_TYPE tmp_MH_M(&image_dims), tmp_acc(&image_dims);
+    
+      // Apply encoding operator
+      this->encoding_operator_->mult_MH_M( r, &tmp_MH_M );
+    
+      // Accumulate for overall result
+      axpy(get_alpha()*w*this->encoding_operator_->get_weight(), &tmp_MH_M, acc );
+
+      // Accumulate for intermediate (MH_M)^i
+      tmp_acc = tmp_MH_M;
+      tmp_acc *= get_alpha()*this->encoding_operator_->get_weight();
+    
+      // Loop over operators
+      for( unsigned int i=0; i<this->regularization_operators_.size(); i++){
+      
+	// Compute operator mult_MH_M
+	this->regularization_operators_[i]->mult_MH_M( r, &tmp_MH_M );
+      
+	// Accumulate
+	axpy(get_alpha()*w*this->regularization_operators_[i]->get_weight(), &tmp_MH_M, acc );
+
+	// Accumulate for intermediate (MH_M)^i
+	axpy(get_alpha()*this->encoding_operator_->get_weight(), &tmp_MH_M, &tmp_acc );
+      }
+    
+      // Update r
+      *r = tmp_acc;
+    }
+  
+  protected:
+  
+    // Maximum number of iterations
+    unsigned int iterations_;
+    REAL alpha_;
+  };
+}
diff --git a/toolboxes/solvers/nlcgSolver.h b/toolboxes/solvers/nlcgSolver.h
new file mode 100644
index 0000000..4f609bc
--- /dev/null
+++ b/toolboxes/solvers/nlcgSolver.h
@@ -0,0 +1,776 @@
+#pragma once
+
+#include "gpSolver.h"
+#include "linearOperatorSolver.h"
+#include "real_utilities.h"
+#include "complext.h"
+#include "cgPreconditioner.h"
+
+#include <vector>
+#include <iostream>
+#include <numeric>
+
+namespace Gadgetron{
+/** Nonlinear conjugate gradient solver.
+ * Adapted from Y.H. Dai & Y. Yuan 2001 "An Efficient Hybrid Conjugate Gradient Method for Unconstrained Optimization"
+ * Annals of Operations Research, March 2001, Volume 103, Issue 1-4, pp 33-47
+ *
+ */
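+// A minimal usage sketch (illustrative only; assumes an array type such as hoNDArray<float> and
+// an encoding operator E held in a boost::shared_ptr with its domain dimensions already set):
+//
+//   nlcgSolver< hoNDArray<float> > solver;
+//   solver.set_encoding_operator(E);
+//   solver.set_max_iterations(20);
+//   solver.set_tc_tolerance(1e-6f);
+//   boost::shared_ptr< hoNDArray<float> > x = solver.solve(&measured_data);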
+
+template <class ARRAY_TYPE> class nlcgSolver : public gpSolver<ARRAY_TYPE>
+{
+
+
+protected:
+	typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+	typedef typename realType<ELEMENT_TYPE>::Type REAL;
+	typedef ARRAY_TYPE ARRAY_CLASS;
+	typedef gpSolver<ARRAY_TYPE> GP;
+	typedef typename gpSolver<ARRAY_TYPE>::l1GPRegularizationOperator l1GPRegularizationOperator;
+
+public:
+
+	nlcgSolver(): gpSolver<ARRAY_TYPE>() {
+		iterations_ = 10;
+		tc_tolerance_ = (REAL)1e-7;
+		non_negativity_constraint_=false;
+		dump_residual = false;
+		threshold= REAL(1e-7);
+
+		rho = 0.5f;
+	}
+
+	virtual ~nlcgSolver(){}
+
+
+	virtual void set_rho(REAL _rho){
+		rho = _rho;
+	}
+
+	virtual boost::shared_ptr<ARRAY_TYPE> solve(ARRAY_TYPE* in)
+	{
+		if( this->encoding_operator_.get() == 0 ){
+			throw std::runtime_error("Error: nlcgSolver::solve : no encoding operator is set" );
+		}
+
+		// Get image space dimensions from the encoding operator
+		//
+
+		boost::shared_ptr< std::vector<size_t> > image_dims = this->encoding_operator_->get_domain_dimensions();
+		if( image_dims->size() == 0 ){
+			throw std::runtime_error("Error: nlcgSolver::solve : encoding operator has not set domain dimension" );
+		}
+
+		ARRAY_TYPE * x = new ARRAY_TYPE(image_dims.get()); //The image. Will be returned inside a shared_ptr
+
+		ARRAY_TYPE g(image_dims.get()); //Contains the gradient of the current step
+		ARRAY_TYPE g_old(image_dims.get()); //Contains the gradient of the previous step
+
+
+		ARRAY_TYPE g_linear(image_dims.get()); //Contains the linear part of the gradient;
+
+		//If a prior image was given, use it for the initial guess.
+		if (this->x0_.get()){
+			*x = *(this->x0_.get());
+		} else  {
+			clear(x);
+		}
+
+		// Contains the encoding space of the linear regularization operators
+		std::vector<ARRAY_TYPE> regEnc;
+
+		//Initialize encoding space
+		for (int i = 0; i < this->regularization_operators_.size(); i++){
+			regEnc.push_back(ARRAY_TYPE(this->regularization_operators_[i]->get_codomain_dimensions()));
+			if (reg_priors[i].get()){
+				regEnc.back() = *reg_priors[i];
+				regEnc.back() *= -std::sqrt(this->regularization_operators_[i]->get_weight());
+			}
+
+		}
+		std::vector<ARRAY_TYPE> regEnc2 = regEnc;
+
+		ARRAY_TYPE d(image_dims.get()); //Search direction.
+		clear(&d);
+
+		ARRAY_TYPE encoding_space(in->get_dimensions().get()); //Contains the encoding space, or, equivalently, the residual vector
+
+		ARRAY_TYPE g_step(image_dims.get()); //Linear part of the gradient of the step d will be stored here
+
+		ARRAY_TYPE encoding_space2(in->get_dimensions().get());
+		REAL reg_res,data_res;
+
+		if( this->output_mode_ >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ){
+			GDEBUG_STREAM("Iterating..." << std::endl);
+		}
+		REAL grad_norm0;
+
+		for (int i = 0; i < iterations_; i++){
+			if (i==0){
+				if (this->x0_.get()){
+					this->encoding_operator_->mult_M(x,&encoding_space);
+
+				} else clear(&encoding_space);
+				encoding_space -= *in;
+				this->encoding_operator_->mult_MH(&encoding_space,&g_linear);
+
+				g_linear *=  this->encoding_operator_->get_weight();
+				data_res = std::sqrt(this->encoding_operator_->get_weight())*real(dot(&encoding_space,&encoding_space));
+
+				calc_regMultM(x,regEnc);
+				for (int n = 0; n < regEnc.size(); n++)
+					if (reg_priors[n].get())
+						axpy(-std::sqrt(this->regularization_operators_[n]->get_weight()),reg_priors[n].get(),&regEnc[n]);
+				add_linear_gradient(regEnc,&g_linear);
+				g = g_linear;
+				this->add_gradient(x,&g);
+
+				reg_res=REAL(0);
+
+			}else {
+				data_res = real(dot(&encoding_space,&encoding_space));
+			}
+
+
+
+			if (non_negativity_constraint_) solver_non_negativity_filter(x,&g);
+			if (i==0) grad_norm0=nrm2(&g);
+			REAL grad_norm = nrm2(&g);
+			if( this->output_mode_ >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ){
+
+				GDEBUG_STREAM("Iteration " <<i << ". Relative gradient norm: " <<  grad_norm/grad_norm0 << std::endl);
+			}
+
+			if (i == 0){
+				d -= g;
+				if (this->precond_.get()){
+					this->precond_->apply(&d,&d);
+					this->precond_->apply(&d,&d);
+				}
+
+			} else {
+
+				g_step = g; //Not using g_step for anything right now, so let's use it for our beta calculation
+				if (this->precond_.get()){
+					this->precond_->apply(&g_step,&g_step); //Perform first half of the preconditioning
+					this->precond_->apply(&g_old,&g_old);
+				}
+
+				ELEMENT_TYPE g_old_norm = dot(&g_old,&g_old);
+				ELEMENT_TYPE ggold = dot(&g_step,&g_old);
+				g_old -= g_step;
+				REAL gg = real(dot(&g_step,&g_step));
+				ELEMENT_TYPE gy = -dot(&d,&g_old);
+				//ELEMENT_TYPE beta = -dot(g,g_old)/g_old_norm; //PRP step
+				//ELEMENT_TYPE theta = gy/g_old_norm;
+
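+				// With y = g - g_old (note g_old now holds the negative of y), betaDy = ||g||^2/(d'y) is the
+				// Dai-Yuan formula and betaHS = (g'y)/(d'y) the Hestenes-Stiefel formula; the hybrid step
+				// below takes max(0, min(betaDy, betaHS)).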
+				REAL betaDy = -gg/real(dot(&d,&g_old));
+				REAL betaHS = real(dot(&g_step,&g_old))/real(dot(&d,&g_old));
+				REAL beta = std::max(REAL(0),std::min(betaDy,betaHS)); //Hybrid step size from Dai and Yuan 2001
+
+				d *= beta;
+
+				if (this->precond_.get()) this->precond_->apply(&g_step,&g_step); //Perform the rest of the preconditioning
+
+				d -= g_step;
+				GDEBUG_STREAM("Beta " << beta << std::endl);
+			}
+
+			this->encoding_operator_->mult_M(&d,&encoding_space2);
+
+			calc_regMultM(&d,regEnc2);
+
+
+
+			this->encoding_operator_->mult_MH(&encoding_space2,&g_step);
+			g_step *= this->encoding_operator_->get_weight();
+
+
+			add_linear_gradient(regEnc2,&g_step);
+
+			REAL gd = real(dot(&g,&d));
+
+			REAL alpha0=REAL(1);
+
+			//In the linear or semi-linear case, we can calculate the ideal step size.
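+			// Exact minimizer of the quadratic terms along d: alpha0 = -(<r,A d> + <r_reg,R d>) / (<A d,A d> + <R d,R d>).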
+			if (this->operators.size() == 0) alpha0 = -real(dot(&encoding_space,&encoding_space2)+calc_dot(regEnc,regEnc2))/real(dot(&encoding_space2,&encoding_space2)+calc_dot(regEnc2,regEnc2));
+
+			REAL alpha;
+			REAL old_norm = functionValue(&encoding_space,regEnc,x);
+
+
+
+			g_old = g;
+
+
+
+
+			{
+				FunctionEstimator f(&encoding_space,&encoding_space2,&regEnc,&regEnc2,x,&d,&g_linear,&g_step,this);
+				alpha=backtracking(f,alpha0,gd,rho,old_norm);
+				//alpha=cg_linesearch(f,alpha0,gd,old_norm);
+				if (alpha == 0) {
+					std::cerr << "Linesearch failed, returning current iteration" << std::endl;
+					return boost::shared_ptr<ARRAY_TYPE>(x);
+				}
+			}
+
+			GDEBUG_STREAM("Alpha: " << alpha << std::endl);
+
+
+
+			if (non_negativity_constraint_){
+				//Restore encoding space and gradient. Why not keep a copy? Memory!
+				axpy(-alpha,&encoding_space2,&encoding_space);
+				reg_axpy(-alpha,regEnc2,regEnc);
+				axpy(-alpha,&g_step,&g_linear);
+
+				ARRAY_TYPE x2 = *x;
+				axpy(alpha,&d,&x2);
+
+				clamp_min(&x2,REAL(0));
+
+				d = x2;
+				d -= *x;
+				gd = real(dot(&g,&d));
+				x2 = *x;
+				alpha0 = 1;
+				this->encoding_operator_->mult_M(&d,&encoding_space2);
+				calc_regMultM(&d,regEnc2);
+
+
+				this->encoding_operator_->mult_MH(&encoding_space2,&g_step);
+				g_step *= this->encoding_operator_->get_weight();
+				add_linear_gradient(regEnc2,&g_step);
+
+				FunctionEstimator f(&encoding_space,&encoding_space2,&regEnc,&regEnc2,x,&d,&g_linear,&g_step,this);
+				//alpha=gold(f,0,alpha0*1.5);
+				//alpha = wolfesearch(f,alpha0,gd,rho,old_norm);
+				alpha = backtracking(f,alpha0,gd,rho,old_norm);
+
+				//alpha = cg_linesearch(f,alpha0,gd,old_norm);
+				axpy(alpha,&d,x);
+				if (alpha == 0){
+					std::cerr << "Linesearch failed, returning current iteration" << std::endl;
+					return boost::shared_ptr<ARRAY_TYPE>(x);
+				}
+			} else {
+				axpy(alpha,&d,x);
+
+			}
+
+
+
+			GDEBUG_STREAM("Function value: " << functionValue(&encoding_space,regEnc,x) << std::endl);
+
+			g = g_linear;
+
+			this->add_gradient(x,&g);
+
+
+			iteration_callback(x,i,data_res,reg_res);
+
+
+			if (grad_norm/grad_norm0 < tc_tolerance_)  break;
+
+		}
+
+		return boost::shared_ptr<ARRAY_TYPE>(x);
+	}
+
+
+
+	// Set preconditioner
+	//
+	/*virtual void set_preconditioner( boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond ) {
+      precond_ = precond;
+      }*/
+
+	// Set/get maximally allowed number of iterations
+	//
+	virtual void set_max_iterations( unsigned int iterations ) { iterations_ = iterations; }
+	virtual unsigned int get_max_iterations() { return iterations_; }
+
+	// Set/get tolerance threshold for termination criterium
+	//
+	virtual void set_tc_tolerance( REAL tolerance ) { tc_tolerance_ = tolerance; }
+	virtual REAL get_tc_tolerance() { return tc_tolerance_; }
+
+	virtual void set_non_negativity_constraint(bool non_negativity_constraint){
+		non_negativity_constraint_=non_negativity_constraint;
+	}
+
+	virtual void set_dump_residual(bool dump_res){
+		dump_residual = dump_res;
+	}
+	// Set preconditioner
+	//
+
+	virtual void set_preconditioner( boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond ) {
+		precond_ = precond;
+	}
+
+	virtual void add_regularization_operator( boost::shared_ptr< linearOperator< ARRAY_TYPE> > op)
+	{
+		if( !op.get() ){
+			throw std::runtime_error( "Error: linearOperatorSolver::add_regularization_operator : NULL operator provided" );
+		}
+		this->regularization_operators_.push_back(op);
+		reg_priors.push_back(boost::shared_ptr<ARRAY_TYPE>((ARRAY_TYPE*)0));
+	}
+
+	virtual void add_regularization_operator( boost::shared_ptr< linearOperator< ARRAY_TYPE> > op,boost::shared_ptr<ARRAY_TYPE> prior)
+	{
+		if( !op.get() ){
+			throw std::runtime_error( "Error: linearOperatorSolver::add_regularization_operator : NULL operator provided" );
+		}
+
+		this->regularization_operators_.push_back(op);
+		reg_priors.push_back(prior);
+	}
+
+	virtual void add_regularization_operator(boost::shared_ptr< linearOperator<ARRAY_TYPE> > op, int L_norm ){
+		if (L_norm==1){
+
+			this->operators.push_back(boost::shared_ptr< l1GPRegularizationOperator>(new l1GPRegularizationOperator(op)));
+		}else{
+			add_regularization_operator(op);
+		}
+	}
+
+
+	virtual void add_regularization_operator(boost::shared_ptr< linearOperator<ARRAY_TYPE> > op, boost::shared_ptr<ARRAY_TYPE> prior, int L_norm ){
+		if (L_norm==1){
+			this->operators.push_back(boost::shared_ptr<l1GPRegularizationOperator>(new l1GPRegularizationOperator(op,prior)));
+		}else{
+			add_regularization_operator(op,prior);
+		}
+	}
+
+
+protected:
+	typedef typename std::vector<boost::shared_ptr<linearOperator<ARRAY_TYPE> > >::iterator  csIterator;
+	typedef typename std::vector< std::vector<boost::shared_ptr<linearOperator<ARRAY_TYPE> > > >::iterator csGroupIterator;
+
+	virtual void iteration_callback(ARRAY_TYPE*,int i,REAL,REAL){};
+
+
+
+
+	ELEMENT_TYPE calc_dot(std::vector<ARRAY_TYPE>& x,std::vector<ARRAY_TYPE>& y){
+		ELEMENT_TYPE res(0);
+		for (int  i = 0; i < x.size(); i++)
+			res += dot(&x[i],&y[i]);
+		return res;
+	}
+
+	void add_linear_gradient(std::vector<ARRAY_TYPE>& elems, ARRAY_TYPE* g){
+		ARRAY_TYPE tmp(g->get_dimensions());
+		for (int i = 0; i <elems.size(); i++){
+			this->regularization_operators_[i]->mult_MH(&elems[i],&tmp);
+			axpy(std::sqrt(this->regularization_operators_[i]->get_weight()),&tmp,g);
+		}
+	}
+
+	void calc_regMultM(ARRAY_TYPE* x,std::vector<ARRAY_TYPE>& elems){
+		for (int i = 0; i <elems.size(); i++){
+			this->regularization_operators_[i]->mult_M(x,&elems[i]);
+			elems[i] *= std::sqrt(this->regularization_operators_[i]->get_weight());
+		}
+	}
+
+	void reg_axpy(REAL alpha, std::vector<ARRAY_TYPE>& x, std::vector<ARRAY_TYPE>& y){
+		for (int i = 0; i <x.size(); i++){
+			axpy(alpha,&x[i],&y[i]);
+
+		}
+	}
+
+
+	class FunctionEstimator{
+	public:
+
+		FunctionEstimator(ARRAY_TYPE* _encoding_space,ARRAY_TYPE* _encoding_step,std::vector<ARRAY_TYPE>* _regEnc,std::vector<ARRAY_TYPE>* _regEnc_step, ARRAY_TYPE * _x, ARRAY_TYPE * _d, ARRAY_TYPE * _g, ARRAY_TYPE * _g_step, nlcgSolver<ARRAY_TYPE> * _parent)
+	{
+			encoding_step = _encoding_step;
+			encoding_space = _encoding_space;
+			regEnc = _regEnc;
+			regEnc_step = _regEnc_step;
+			x = _x;
+			xtmp = *x;
+			d = _d;
+			parent = _parent;
+			alpha_old = 0;
+			g = _g;
+			g_step = _g_step;
+
+	}
+
+
+
+		REAL operator () (REAL alpha){
+			axpy(alpha-alpha_old,encoding_step,encoding_space);
+
+			axpy(alpha-alpha_old,g_step,g);
+			parent->reg_axpy(alpha-alpha_old,*regEnc_step,*regEnc);
+			axpy(alpha-alpha_old,d,&xtmp);
+
+			alpha_old = alpha;
+			REAL res = parent->functionValue(encoding_space,*regEnc,&xtmp);
+			return res;
+
+		}
+
+		ELEMENT_TYPE dir_deriv(){
+			ARRAY_TYPE g_tmp = *g;
+			parent->add_gradient(&xtmp,&g_tmp);
+			return dot(d,&g_tmp);
+		}
+
+
+
+
+
+
+	private:
+
+		REAL alpha_old;
+		ARRAY_TYPE* encoding_step;
+		ARRAY_TYPE * encoding_space;
+		std::vector<ARRAY_TYPE>* regEnc;
+		std::vector<ARRAY_TYPE>* regEnc_step;
+		ARRAY_TYPE* x, *d;
+		ARRAY_TYPE* g, *g_step;
+
+		nlcgSolver<ARRAY_TYPE>* parent;
+		ARRAY_TYPE xtmp;
+
+
+	};
+	friend class FunctionEstimator;
+
+	/***
+	 * @brief Golden-section search algorithm. Only works for unimodal functions, which we assume we are dealing with, at least locally.
+	 * @param f Functor to calculate the function to minimize
+	 * @param a Start of the bracketing
+	 * @param d End of bracketing
+	 * @return Value minimizing the function f.
+	 */
+	REAL gold(FunctionEstimator& f, REAL a, REAL d){
+		const REAL gold = 1.0/(1.0+std::sqrt(5.0))/2;
+
+		REAL b = d-(d-a)*gold;
+		REAL c = (d-a)*gold-a;
+
+		REAL fa = f(a);
+		REAL fb = f(b);
+		REAL fc = f(c);
+		REAL fd = f(d);
+		REAL tol = 1e-6;
+
+		while (abs(a-d) > tol*(abs(b)+abs(c))){
+			if (fb > fc){
+				a = b;
+				fa = fb;
+				b = c;
+				fb = fc;
+				c= b*gold+(1.0-gold)*d;
+				fc = f(c);
+			} else {
+				d = c;
+				fd = fc;
+				c = b;
+				fc = fb;
+				b = c*gold+(1-gold)*a;
+				fb = f(b);
+			}
+		}
+		if (fb < fc){
+			f(b);
+			return b;
+		}else {
+			f(c);
+			return c;
+		}
+	}
+
+	/***
+	 * Armijo type linesearch
+	 * @param f
+	 * @param alpha0
+	 * @param gd
+	 * @param rho
+	 * @param old_norm
+	 * @return
+	 */
+	REAL backtracking(FunctionEstimator& f, const REAL alpha0, const REAL gd, const REAL rho, const REAL old_norm){
+		REAL alpha;
+		REAL delta=0.1;
+		REAL sigma=0.9;
+		//REAL precision = 0.0003; //Estimated precision of function evaluation
+		REAL precision = 1e-4f; //Estimated precision of function evaluation
+		bool wolfe = false;
+		int  k=0;
+
+		while ( !wolfe)
+        {
+			alpha=alpha0*std::pow(rho,k);
+			//if (f(alpha) <= old_norm+alpha*delta*gd) wolfe = true;//Strong Wolfe condition..
+			REAL fa = f(alpha);
+			ELEMENT_TYPE dir_deriv = f.dir_deriv();
+			if (((2*delta-1.0)*real(gd) >= real(dir_deriv)) && (fa < (old_norm+precision))) wolfe=true; //Approx Wolfe condition from Hager, W. and Zhang, H.SIAM Journal on Optimization 2005 16:1, 170-192
+			if (abs(dir_deriv) > sigma*abs(gd)) wolfe = false;//Strong Wolfe condition..
+			k++;
+			if (alpha == 0){
+				//GDEBUG_STREAM("Backtracking search failed, switching to slow wolfe-search" << std::endl);
+				//return wolfesearch(f,alpha0,gd,rho,old_norm);
+				return 0;
+			}
+		}
+
+		return alpha;
+
+	}
+
+	/***
+	 * Line search taken from Numerical Optimization (Wright and Nocedal 1999).
+	 * Adapted from the scipy optimize algorithm.
+	 * Like the golden-section method it works quite poorly in practice.
+	 * @param f
+	 * @param alpha_init
+	 * @param gd
+	 * @param rho
+	 * @param old_norm
+	 * @return
+	 */
+	REAL wolfesearch(FunctionEstimator& f, const REAL alpha_init, const REAL gd, const REAL rho, const REAL old_norm){
+		using std::sqrt;
+		using std::abs;
+		REAL delta=0.01;
+		unsigned int k=0;
+		REAL alpha0 = alpha_init;
+		REAL f0 = f(alpha0);
+
+		if (f0 <= old_norm+alpha0*delta*gd){//Sufficient decrease (Armijo) condition
+			return alpha0;
+		}
+
+
+		REAL alpha1 = -gd*alpha0*alpha0/2.0/(f0-old_norm-gd*alpha0);
+		//GDEBUG_STREAM("F0 " <<f0 << " old " << old_norm << " gd " << gd <<std::endl);
+		GDEBUG_STREAM("Alpha0: "  << alpha0 << std::endl);
+		//GDEBUG_STREAM("Alpha1: "  << alpha1 << std::endl);
+		REAL f1 = f(alpha1);
+
+
+		if (f1 <= old_norm+alpha1*delta*gd){//Sufficient decrease (Armijo) condition
+			return alpha1;
+		}
+
+
+		while (alpha1 > 0){
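+			// Cubic interpolation: fit a cubic through f(0), f'(0)=gd, f(alpha0) and f(alpha1),
+			// then step to its minimizer (same interpolation as the scipy line search).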
+			double factor = alpha0*alpha0*alpha1*alpha1*(alpha1-alpha0);
+			double a = alpha0*alpha0*(f1-old_norm-gd*alpha1) - alpha1*alpha1*(f0-old_norm-gd*alpha0);
+			a /= factor;
+
+			double b = -alpha0*alpha0*alpha0*(f1-old_norm-gd*alpha1) + alpha1*alpha1*alpha1*(f0-old_norm-gd*alpha0);
+			b /= factor;
+
+			double alpha2 = (-b+std::sqrt(std::abs(b*b-3*a*gd)))/(3*a);
+			REAL f2 = f(alpha2);
+			//GDEBUG_STREAM("a " << a << "b " << b << std::endl);
+			GDEBUG_STREAM("Alpha1: "  << alpha1 << std::endl);
+			GDEBUG_STREAM("Alpha2: "  << alpha2 << std::endl);
+			if (f2 < old_norm+alpha2*delta*gd){//Sufficient decrease (Armijo) condition
+				return alpha2;
+			}
+
+			if (((alpha1-alpha2) > (alpha1/2.0)) || ((1.0-alpha2/alpha1) < 0.96)){
+				alpha2 = alpha1 / 2.0;
+			}
+
+			alpha0 = alpha1;
+			alpha1 = alpha2;
+			f0 = f1;
+			f1 = f2;
+			k++;
+
+
+		}
+
+		throw std::runtime_error("Wolfe line search failed");
+
+
+	}
+
+
+
+	/***
+	 * CG linesearch adapted from Hager, W. and Zhang, H., SIAM Journal on Optimization 2005, 16:1, 170-192.
+	 * @param f
+	 * @param alpha0
+	 * @param gd
+	 * @param old_norm
+	 * @return
+	 */
+	REAL cg_linesearch(FunctionEstimator& f, const REAL alpha0, const REAL gd, const REAL old_norm){
+		REAL delta=0.1;
+		REAL sigma=0.9;
+		REAL nabla=0.66;
+		//REAL precision = 0.0003; //Estimated precision of function evaluation
+		REAL precision = 1e-4f; //Estimated precision of function evaluation
+
+
+
+
+		REAL a=0;
+		REAL b = alpha0;
+
+		REAL ak = a;
+		REAL bk = b;
+		REAL fa = old_norm;
+		ELEMENT_TYPE a_deriv = gd;
+		REAL fb = f(alpha0);
+		ELEMENT_TYPE b_deriv = f.dir_deriv();
+
+		while (abs(a-b) > 0){
+			if ((((2*delta-1.0)*real(gd) >= real(b_deriv)) && (fb < old_norm+precision)) && //Check Approximate Wolfe conditions
+					(abs(b_deriv) <= sigma*abs(gd))){
+				f(b);
+				return b;
+			}
+
+			if ((((2*delta-1.0)*real(gd) >= real(a_deriv)) && (fa < old_norm+precision)) && //Check Approximate Wolfe conditions
+					(abs(a_deriv) <= sigma*abs(gd))){
+				f(a);
+				return a;
+			}
+
+			secant2(a,b,f,old_norm+precision);
+			if ((b-a) > nabla*(bk-ak)) {
+				REAL c = (a+b)/2;
+				interval_update(a,b,c,f,old_norm);
+			}
+			if (a != ak){
+				fa = f(a);
+				a_deriv = f.dir_deriv();
+			}
+
+			if (b != bk){
+				fb = f(b);
+				b_deriv = f.dir_deriv();
+			}
+
+			ak = a;
+			bk = b;
+
+			GDEBUG_STREAM("a: " << a << " b: " << b << std::endl);
+		}
+		return 0;
+		//throw std::runtime_error("CG_linesearch failed");
+
+	}
+
+
+	void secant2(REAL& a, REAL& b,FunctionEstimator& f,REAL old_norm){
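+		// Secant step c = (a*f'(b) - b*f'(a)) / (f'(b) - f'(a)) of the Hager-Zhang bracketing,
+		// refined once more if c lands on an endpoint of the updated interval.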
+		REAL fa = f(a);
+		ELEMENT_TYPE dfa = f.dir_deriv();
+		REAL fb = f(b);
+		ELEMENT_TYPE dfb = f.dir_deriv();
+
+		REAL c= real((a*dfb-b*dfa)/(dfb-dfa));
+
+		REAL fc = f(c);
+		ELEMENT_TYPE dfc = f.dir_deriv();
+
+		REAL A=a;
+		REAL B = b;
+
+		interval_update(A,B,c,f,old_norm);
+
+		if (c == B){
+			c= real((b*dfc-c*dfb)/(dfc-dfb));
+			interval_update(A,B,c,f,old_norm);
+		} if (c == A){
+			c= real((a*dfc-c*dfa)/(dfc-dfa));
+			interval_update(A,B,c,f,old_norm);
+		}
+
+		a= A;
+		b = B;
+	}
+
+	void interval_update(REAL & a, REAL & b, REAL c,FunctionEstimator& f,REAL old_norm){
+		REAL theta = 0.5;
+		if (c < a || c > b) return; // C not in interval
+		REAL fc = f(c);
+		ELEMENT_TYPE dfc = f.dir_deriv();
+
+		if (real(dfc) >= 0){
+			b =c;
+			return;
+		}
+		if (fc < old_norm){
+			a = c;
+			return;
+		}
+		b =c;
+		while(true){
+			REAL d = (1-theta)*a+theta*b;
+			REAL fd = f(d);
+			ELEMENT_TYPE dfd = f.dir_deriv();
+
+			if (real(dfd) >= 0){
+				b = d;
+				return;
+			}
+			if (fd < old_norm){
+				a = d;
+			} else 	b = d;
+
+			GDEBUG_STREAM("Interval a: " << a << " b: " << b << std::endl);
+
+		}
+
+
+
+
+	}
+
+	REAL functionValue(ARRAY_TYPE* encoding_space,std::vector<ARRAY_TYPE>& regEnc, ARRAY_TYPE * x){
+		REAL res= this->encoding_operator_->get_weight()*abs(dot(encoding_space,encoding_space));
+
+		for (int i = 0; i  < this->operators.size(); i++){
+			res += this->operators[i]->magnitude(x);
+		}
+
+		res += abs(calc_dot(regEnc,regEnc));
+		return res;
+
+	}
+
+
+
+
+
+
+protected:
+
+	// Preconditioner
+	//boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond_;
+	// Maximum number of iterations
+	unsigned int iterations_;
+	bool non_negativity_constraint_;
+	REAL tc_tolerance_;
+	REAL threshold;
+	bool dump_residual;
+	REAL rho;
+
+	// Preconditioner
+
+	std::vector<boost::shared_ptr<ARRAY_TYPE> > reg_priors;
+	boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond_;
+
+};
+}
diff --git a/toolboxes/solvers/osLALMSolver.h b/toolboxes/solvers/osLALMSolver.h
new file mode 100644
index 0000000..0d97b8f
--- /dev/null
+++ b/toolboxes/solvers/osLALMSolver.h
@@ -0,0 +1,279 @@
+/**
+ * Implements the "Fast X-Ray CT Image Reconstruction Using a Linearized Augmented Lagrangian Method with Ordered Subsets" method by Hung Nien and Jeffrey A. Fessler.
+ */
+#pragma once
+#include "subsetOperator.h"
+#include "solver.h"
+#include <numeric>
+#include <vector>
+#include <functional>
+#include <boost/iterator/counting_iterator.hpp>
+#include <boost/make_shared.hpp>
+#include <boost/math/constants/constants.hpp>
+#include <initializer_list>
+namespace Gadgetron{
+template <class ARRAY_TYPE> class osLALMSolver : public solver< ARRAY_TYPE,ARRAY_TYPE> {
+	typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+	typedef typename realType<ELEMENT_TYPE>::Type REAL;
+public:
+	osLALMSolver() :solver< ARRAY_TYPE,ARRAY_TYPE>() {
+		_iterations=10;
+		non_negativity_=false;
+		inner_iterations=1;
+		alpha = 0;
+	}
+	virtual ~osLALMSolver(){};
+
+	void set_max_iterations(int i){_iterations=i;}
+	int get_max_iterations(){return _iterations;}
+	void set_non_negativity_constraint(bool neg=true){non_negativity_=neg;}
+
+	void set_regularization_iterations(int reg_it){
+		inner_iterations = reg_it;
+	}
+
+
+	void set_alpha(REAL a){
+		alpha = a;
+	}
+	/**
+	 * Sets the preconditioning image. In most cases this is not needed, and the preconditioning is calculated based on the system transform
+	 * @param precon_image
+	 */
+	void set_preconditioning_image(boost::shared_ptr<ARRAY_TYPE> precon_image){
+		this->preconditioning_image_ = precon_image;
+	}
+
+
+	void set_reg_steps(unsigned int reg_steps){ reg_steps_ = reg_steps;}
+
+	boost::shared_ptr<ARRAY_TYPE> solve(ARRAY_TYPE* in){
+		//boost::shared_ptr<ARRAY_TYPE> rhs = compute_rhs(in);
+		if( this->encoding_operator_.get() == 0 ){
+			throw std::runtime_error( "Error: osLALMSolver::solve : no encoding operator is set" );
+			return boost::shared_ptr<ARRAY_TYPE>();
+		}
+
+		// Get image space dimensions from the encoding operator
+		//
+
+		boost::shared_ptr< std::vector<size_t> > image_dims = this->encoding_operator_->get_domain_dimensions();
+		if( image_dims->size() == 0 ){
+			throw std::runtime_error( "Error: osLALMSolver::solve : encoding operator has not set domain dimension" );
+			return boost::shared_ptr<ARRAY_TYPE>();
+		}
+
+		ARRAY_TYPE * x = new ARRAY_TYPE(*image_dims);
+		if (this->x0_.get()){
+			*x = *(this->x0_.get());
+		} else  {
+			clear(x);
+		}
+		std::vector<boost::shared_ptr<ARRAY_TYPE> > subsets = this->encoding_operator_->projection_subsets(in);
+
+		ARRAY_TYPE tmp_projection(in->get_dimensions());
+		std::vector<boost::shared_ptr<ARRAY_TYPE> > tmp_projections = this->encoding_operator_->projection_subsets(&tmp_projection);
+
+		boost::shared_ptr<ARRAY_TYPE> precon_image;
+		if (preconditioning_image_)
+			precon_image = preconditioning_image_;
+		else {
+			precon_image = boost::make_shared<ARRAY_TYPE>(image_dims.get());
+			fill(precon_image.get(),ELEMENT_TYPE(1));
+			this->encoding_operator_->mult_M(precon_image.get(),&tmp_projection,false);
+			this->encoding_operator_->mult_MH(&tmp_projection,precon_image.get(),false);
+			clamp_min(precon_image.get(),REAL(1e-6));
+			reciprocal_inplace(precon_image.get());
+			//ones_image *= (ELEMENT_TYPE) this->encoding_operator_->get_number_of_subsets();
+		}
+		ARRAY_TYPE s(image_dims.get());
+		ARRAY_TYPE g(image_dims);
+		clear(&g);
+
+
+		if( this->output_mode_ >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ){
+			std::cout << "osLALM setup done, starting iterations:" << std::endl;
+		}
+
+		REAL rho = 1;
+		const REAL rho_min = 1e-3;
+		const REAL pi = boost::math::constants::pi<REAL>();
+		REAL t = 1;
+		unsigned int L = 1;
+		REAL avg_lambda = calc_avg_lambda();
+		std::vector<int> isubsets(boost::counting_iterator<int>(0), boost::counting_iterator<int>(this->encoding_operator_->get_number_of_subsets()));
+
+		{
+			this->encoding_operator_->mult_M(x,tmp_projections[0].get(),0,false);
+			*tmp_projections[0] -= *subsets[0];
+			this->encoding_operator_->mult_MH(tmp_projections[0].get(),&s,0,false);
+			s*= REAL(this->encoding_operator_->get_number_of_subsets());
+			g = s;
+
+		}
+
+
+		for (int i =0; i < _iterations; i++){
+			for (int isubset = 0; isubset < this->encoding_operator_->get_number_of_subsets(); isubset++){
+				int subset = isubsets[isubset];
+
+				s *= rho;
+				axpy((1-rho),&g,&s);
+				s *= -t/rho;
+				s *= *precon_image;
+				s+= *x;
+				if (regularization_operators.empty() && regularization_groups.empty()){
+					*x = s;
+				} else {
+					//denoise(*x,s,t/rho,avg_lambda);
+					denoise(*x,s,*precon_image,t/rho,avg_lambda);
+				}
+				if (non_negativity_){
+					clamp_min(x,REAL(0));
+				}
+
+
+				this->encoding_operator_->mult_M(x,tmp_projections[subset].get(),subset,false);
+				*tmp_projections[subset] -= *subsets[subset];
+				if( this->output_mode_ >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ){
+					std::cout << "Iteration " <<i << " Subset " << subset << " Update norm: " << nrm2(tmp_projections[subset].get()) << std::endl;
+				}
+				this->encoding_operator_->mult_MH(tmp_projections[subset].get(),&s,subset,false);
+				s*= REAL(this->encoding_operator_->get_number_of_subsets());
+
+				g *= 1/(1+rho);
+
+				axpy(rho/(rho+1),&s,&g);
+
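+				// Decreasing AL penalty parameter (downward-continuation schedule from the OS-LALM paper), floored at rho_min.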
+				rho = std::max(rho_min,pi/(L+1)*std::sqrt(1-(pi*pi/(4*(L+1)*(L+1)))));
+
+				//s *= REAL(-1);
+
+				L++;
+
+			}
+		}
+
+
+		return boost::shared_ptr<ARRAY_TYPE>(x);
+	}
+
+	void set_encoding_operator(boost::shared_ptr<subsetOperator<ARRAY_TYPE> > encoding_operator){ encoding_operator_ = encoding_operator; }
+
+	virtual void add_regularization_operator(boost::shared_ptr<linearOperator<ARRAY_TYPE>> op){
+		regularization_operators.push_back(op);
+	}
+
+	virtual void add_regularization_group(std::initializer_list<boost::shared_ptr<linearOperator<ARRAY_TYPE>>> ops){
+		regularization_groups.push_back(std::vector<boost::shared_ptr<linearOperator<ARRAY_TYPE>>>(ops));
+	}
+
+	virtual void add_regularization_group(std::vector<boost::shared_ptr<linearOperator<ARRAY_TYPE>>> ops){
+		regularization_groups.push_back(ops);
+	}
+
+
+
+protected:
+
+	/**
+	 * Solves an image denoising problem, using the regularization operators for denoising.
+	 * This is done via the AHMOD algorithm (see "A First-Order Primal-Dual Algorithm for Convex Problems with Applications to Imaging" - Antonin Chambolle & Thomas Pock, 2010)
+	 * @param x
+	 * @param s
+	 * @param scaling
+	 */
+	void denoise(ARRAY_TYPE& x, ARRAY_TYPE& s, ARRAY_TYPE& precon,REAL scaling,REAL avg_lambda ){
+		REAL tau=1.0;
+		REAL gam=0.35/(scaling*avg_lambda)/(precon.get_number_of_elements()/asum(&precon));
+		REAL sigma = 1;
+		ARRAY_TYPE g(x.get_dimensions());
+
+		for (auto it = 0u; it < inner_iterations; it++){
+			clear(&g);
+			for (auto reg_op : regularization_operators){
+				ARRAY_TYPE data(reg_op->get_codomain_dimensions());
+				reg_op->mult_M(&x,&data);
+				data *= sigma*reg_op->get_weight()/avg_lambda;
+				//updateF is the resolvent operator on the regularization
+				updateF(data, alpha, sigma);
+				data *= reg_op->get_weight()/avg_lambda;
+				reg_op->mult_MH(&data,&g,true);
+			}
+
+			for (auto & reg_group : regularization_groups){
+				std::vector<ARRAY_TYPE> datas(reg_group.size());
+				REAL val = 0;
+				for (auto i = 0u; i < reg_group.size(); i++){
+					datas[i] = ARRAY_TYPE(reg_group[i]->get_codomain_dimensions());
+					reg_group[i]->mult_M(&x,&datas[i]);
+					datas[i] *= sigma*reg_group[i]->get_weight()/avg_lambda;
+				}
+				//updateFgroup is the resolvent operators on the group
+				updateFgroup(datas,alpha,sigma);
+
+				for (auto i = 0u; i < reg_group.size(); i++){
+					datas[i] *= reg_group[i]->get_weight()/avg_lambda;
+					reg_group[i]->mult_MH(&datas[i],&g,true);
+
+				}
+
+			}
+			//updateG is the resolvent operator on the |x-s| part of the optimization
+			axpy(-tau,&g,&x);
+			g = s;
+			g /= precon;
+
+			axpy(tau/(scaling*avg_lambda),&g,&x);
+
+			g = precon;
+
+			reciprocal_inplace(&g);
+			g *= tau/(scaling*avg_lambda);
+			g += REAL(1);
+			x /= g;
+			//x *= 1/(1+tau/(scaling*avg_lambda));
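+			// Step-size update of the accelerated primal-dual scheme (cf. Chambolle & Pock, Alg. 2):
+			// theta = 1/sqrt(1+2*gamma*tau), tau <- tau*theta, sigma <- sigma/theta.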
+			REAL theta = 1/std::sqrt(1+2*gam*tau);
+			tau *= theta;
+			sigma /= theta;
+
+		}
+
+
+
+	};
+
+	REAL calc_avg_lambda(){
+		REAL result = 0;
+		auto num = 0u;
+		for (auto op : regularization_operators){
+			auto w = op->get_weight();
+			result += w;
+			num++;
+		}
+
+		for (auto & group : regularization_groups)
+			for (auto op : group){
+				auto w = op->get_weight();
+				std::cout << "Weight " << w << std::endl;
+				result += w;
+				num++;
+			}
+
+		result /= num;
+
+		return result;
+
+	}
+	int _iterations;
+	int inner_iterations;
+	bool non_negativity_;
+	unsigned int reg_steps_;
+	REAL alpha;
+	boost::shared_ptr<subsetOperator<ARRAY_TYPE> > encoding_operator_;
+	boost::shared_ptr<ARRAY_TYPE> preconditioning_image_;
+	std::vector<std::vector<boost::shared_ptr<linearOperator<ARRAY_TYPE>>>> regularization_groups;
+	std::vector<boost::shared_ptr<linearOperator<ARRAY_TYPE> >> regularization_operators;
+
+};
+}
diff --git a/toolboxes/solvers/osMOMSolver.h b/toolboxes/solvers/osMOMSolver.h
new file mode 100644
index 0000000..0e759b9
--- /dev/null
+++ b/toolboxes/solvers/osMOMSolver.h
@@ -0,0 +1,216 @@
+/*
+ * osMOMSolver.h
+ *
+ *  Created on: Mar 23, 2015
+ *      Author: u051747
+ */
+//Based on Kim, D., Ramani, S. and Fessler, J.A., "Combining Ordered Subsets and Momentum for Accelerated X-Ray CT Image Reconstruction".
+#pragma once
+#include "subsetOperator.h"
+#include "solver.h"
+#include <numeric>
+#include <vector>
+#include <functional>
+#include <boost/iterator/counting_iterator.hpp>
+#include <boost/make_shared.hpp>
+
+namespace Gadgetron{
+template <class ARRAY_TYPE> class osMOMSolver : public solver< ARRAY_TYPE,ARRAY_TYPE> {
+	typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+	typedef typename realType<ELEMENT_TYPE>::Type REAL;
+public:
+	osMOMSolver() :solver< ARRAY_TYPE,ARRAY_TYPE>() {
+		_iterations=10;
+		_beta = REAL(1);
+		_alpha = 0.2;
+		_gamma = 0;
+		non_negativity_=false;
+		reg_steps_=2;
+		_kappa = REAL(1);
+	}
+	virtual ~osMOMSolver(){};
+
+	void set_max_iterations(int i){_iterations=i;}
+	int get_max_iterations(){return _iterations;}
+	void set_non_negativity_constraint(bool neg=true){non_negativity_=neg;}
+	/**
+	 * @brief Sets the weight of each step in the SART iteration
+	 * @param beta
+	 */
+	void set_beta(REAL beta){_beta = beta;}
+	void set_gamma(REAL gamma){_gamma = gamma;}
+	void set_kappa(REAL kappa){_kappa = kappa;}
+
+	/**
+	 * Sets the preconditioning image. In most cases this is not needed, and the preconditioning is calculated based on the system transform
+	 * @param precon_image
+	 */
+	void set_preconditioning_image(boost::shared_ptr<ARRAY_TYPE> precon_image){
+		this->preconditioning_image_ = precon_image;
+	}
+
+
+	void set_reg_steps(unsigned int reg_steps){ reg_steps_ = reg_steps;}
+
+	boost::shared_ptr<ARRAY_TYPE> solve(ARRAY_TYPE* in){
+		//boost::shared_ptr<ARRAY_TYPE> rhs = compute_rhs(in);
+		if( this->encoding_operator_.get() == 0 ){
+			throw std::runtime_error( "Error: osMOMSolver::solve : no encoding operator is set" );
+			return boost::shared_ptr<ARRAY_TYPE>();
+		}
+
+		// Get image space dimensions from the encoding operator
+		//
+
+		boost::shared_ptr< std::vector<size_t> > image_dims = this->encoding_operator_->get_domain_dimensions();
+		if( image_dims->size() == 0 ){
+			throw std::runtime_error( "Error: osMOMSolver::solve : encoding operator has not set domain dimension" );
+			return boost::shared_ptr<ARRAY_TYPE>();
+		}
+
+		ARRAY_TYPE * z = new ARRAY_TYPE(*image_dims);
+		if (this->x0_.get()){
+			*z = *(this->x0_.get());
+		} else  {
+			clear(z);
+		}
+
+		ARRAY_TYPE * x = new ARRAY_TYPE(*z);
+		ARRAY_TYPE * xold = new ARRAY_TYPE(*z);
+		std::cout <<"DEBUG DINGO 1" << std::endl;
+		std::vector<boost::shared_ptr<ARRAY_TYPE> > subsets = this->encoding_operator_->projection_subsets(in);
+
+		std::cout <<"DEBUG DINGO 2" << std::endl;
+		ARRAY_TYPE tmp_projection(in->get_dimensions());
+		std::vector<boost::shared_ptr<ARRAY_TYPE> > tmp_projections = this->encoding_operator_->projection_subsets(&tmp_projection);
+
+		std::cout <<"DEBUG DINGO 3" << std::endl;
+		boost::shared_ptr<ARRAY_TYPE> precon_image;
+		if (preconditioning_image_)
+			precon_image = preconditioning_image_;
+		else {
+			precon_image = boost::make_shared<ARRAY_TYPE>(image_dims.get());
+			fill(precon_image.get(),ELEMENT_TYPE(1));
+			this->encoding_operator_->mult_M(precon_image.get(),&tmp_projection,false);
+			this->encoding_operator_->mult_MH(&tmp_projection,precon_image.get(),false);
+			clamp_min(precon_image.get(),REAL(1e-6));
+			reciprocal_inplace(precon_image.get());
+			//ones_image *= (ELEMENT_TYPE) this->encoding_operator_->get_number_of_subsets();
+		}
+		ARRAY_TYPE tmp_image(image_dims.get());
+
+
+		std::cout <<"DEBUG DINGO 4" << std::endl;
+		REAL t = 1;
+		REAL told = 1;
+		if( this->output_mode_ >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ){
+			std::cout << "osMOM setup done, starting iterations:" << std::endl;
+		}
+
+		std::vector<int> isubsets(boost::counting_iterator<int>(0), boost::counting_iterator<int>(this->encoding_operator_->get_number_of_subsets()));
+		REAL kappa_int = _kappa;
+		REAL step_size;
+		for (int i =0; i < _iterations; i++){
+			for (int isubset = 0; isubset < this->encoding_operator_->get_number_of_subsets(); isubset++){
+
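+				// Momentum coefficient update, t_{k+1} = (1 + sqrt(1 + 4*t_k^2))/2, as in FISTA-type acceleration.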
+				t = 0.5*(1+std::sqrt(1+4*t*t));
+				int subset = isubsets[isubset];
+				this->encoding_operator_->mult_M(x,tmp_projections[subset].get(),subset,false);
+				*tmp_projections[subset] -= *subsets[subset];
+				*tmp_projections[subset] *= ELEMENT_TYPE(-1);
+				if( this->output_mode_ >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ){
+					std::cout << "Iteration " <<i << " Subset " << subset << " Update norm: " << nrm2(tmp_projections[subset].get()) << std::endl;
+				}
+
+				this->encoding_operator_->mult_MH(tmp_projections[subset].get(),&tmp_image,subset,false);
+				tmp_image *= *precon_image;
+				axpy(REAL(_beta/(1+_gamma*i))*this->encoding_operator_->get_number_of_subsets(),&tmp_image,x);
+				if (i ==0 && isubset == 0)
+					step_size = _alpha*nrm2(x)/this->encoding_operator_->get_number_of_subsets();
+
+				//axpy(REAL(_beta),&tmp_image,x);
+				if (non_negativity_){
+					clamp_min(x,REAL(0));
+				}
+
+				for (auto op : regularization_operators){
+					/*
+					for (auto i = 0u; i < reg_steps_; i++){
+						op->gradient(x,&tmp_image);
+						tmp_image *= REAL(1)/nrm2(&tmp_image);
+						axpy(-step_size*op->get_weight(),&tmp_image,x);
+					}
+					 */
+					op->gradient(x,&tmp_image);
+					tmp_image /= nrm2(&tmp_image);
+					auto reg_val = op->magnitude(x);
+					std::cout << "Reg val: " << reg_val << std::endl;
+					ARRAY_TYPE y = *x;
+					axpy(-kappa_int,&tmp_image,&y);
+
+
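+					// Backtracking on the regularization step: halve kappa_int and pull y halfway back
+					// (leaving y = x - kappa_int*direction) until the regularization value stops increasing.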
+					while(op->magnitude(&y) > reg_val){
+
+						kappa_int /= 2;
+						axpy(kappa_int,&tmp_image,&y);
+						std::cout << "Kappa: " << kappa_int << std::endl;
+					}
+					reg_val = op->magnitude(&y);
+					*x = y;
+
+				}
+
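+				// Momentum extrapolation: z = x + ((t_old - 1)/t) * (x - x_old).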
+				*z = *x;
+				*z *= 1+(told-1)/t;
+				axpy(-(told-1)/t,xold,z);
+				std::swap(x,xold);
+				*x = *z;
+
+				told = t;
+
+
+				//step_size *= 0.99;
+
+			}
+			//std::reverse(isubsets.begin(),isubsets.end());
+			//std::random_shuffle(isubsets.begin(),isubsets.end());
+			/*
+			ARRAY_TYPE tmp_proj(*in);
+			clear(&tmp_proj);
+			this->encoding_operator_->mult_M(x,&tmp_proj,false);
+			tmp_proj -= *in;
+
+
+			std::stringstream ss;
+			ss << "osMOM-" << i << ".real";
+
+			write_nd_array<ELEMENT_TYPE>(x,ss.str().c_str());
+
+			//calc_regMultM(x,regEnc);
+			//REAL f = functionValue(&tmp_proj,regEnc,x);
+			std::cout << "Function value: " << dot(&tmp_proj,&tmp_proj) << std::endl;
+			 */
+		}
+		delete x; delete xold;
+
+
+		return boost::shared_ptr<ARRAY_TYPE>(z);
+	}
+
+	void set_encoding_operator(boost::shared_ptr<subsetOperator<ARRAY_TYPE> > encoding_operator){ encoding_operator_ = encoding_operator; }
+	virtual void add_nonlinear_operator(boost::shared_ptr< generalOperator<ARRAY_TYPE> > op ){
+		regularization_operators.push_back(op);
+	}
+
+
+protected:
+	int _iterations;
+	REAL _beta, _gamma, _alpha, _kappa;
+	bool non_negativity_;
+	unsigned int reg_steps_;
+	boost::shared_ptr<subsetOperator<ARRAY_TYPE> > encoding_operator_;
+	std::vector<boost::shared_ptr<generalOperator<ARRAY_TYPE>>> regularization_operators;
+	boost::shared_ptr<ARRAY_TYPE> preconditioning_image_;
+
+};
+}
diff --git a/toolboxes/solvers/osSPSSolver.h b/toolboxes/solvers/osSPSSolver.h
new file mode 100644
index 0000000..c12dbb2
--- /dev/null
+++ b/toolboxes/solvers/osSPSSolver.h
@@ -0,0 +1,180 @@
+#pragma once
+#include "subsetOperator.h"
+#include "solver.h"
+#include <numeric>
+#include <vector>
+#include <functional>
+#include <boost/iterator/counting_iterator.hpp>
+#include <boost/make_shared.hpp>
+
+namespace Gadgetron{
+template <class ARRAY_TYPE> class osSPSSolver : public solver< ARRAY_TYPE,ARRAY_TYPE> {
+	typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+	typedef typename realType<ELEMENT_TYPE>::Type REAL;
+public:
+	osSPSSolver() :solver< ARRAY_TYPE,ARRAY_TYPE>() {
+		_iterations=10;
+		_beta = REAL(1);
+		_alpha = 0.2;
+		_gamma = 0;
+		non_negativity_=false;
+		reg_steps_=2;
+		_kappa = REAL(1);
+	}
+	virtual ~osSPSSolver(){};
+
+	void set_max_iterations(int i){_iterations=i;}
+	int get_max_iterations(){return _iterations;}
+	void set_non_negativity_constraint(bool neg=true){non_negativity_=neg;}
+	/**
+	 * @brief Sets the weight of each step in the SART iteration
+	 * @param beta
+	 */
+	void set_beta(REAL beta){_beta = beta;}
+	void set_gamma(REAL gamma){_gamma = gamma;}
+	void set_kappa(REAL kappa){_kappa = kappa;}
+
+	/**
+	 * Sets the preconditioning image. In most cases this is not needed, and the preconditioning is calculated based on the system transform
+	 * @param precon_image
+	 */
+	void set_preconditioning_image(boost::shared_ptr<ARRAY_TYPE> precon_image){
+		this->preconditioning_image_ = precon_image;
+	}
+
+
+	void set_reg_steps(unsigned int reg_steps){ reg_steps_ = reg_steps;}
+
+	boost::shared_ptr<ARRAY_TYPE> solve(ARRAY_TYPE* in){
+		//boost::shared_ptr<ARRAY_TYPE> rhs = compute_rhs(in);
+		if( this->encoding_operator_.get() == 0 ){
+			throw std::runtime_error( "Error: osSPSSolver::solve : no encoding operator is set" );
+			return boost::shared_ptr<ARRAY_TYPE>();
+		}
+
+		// Get image space dimensions from the encoding operator
+		//
+
+		boost::shared_ptr< std::vector<size_t> > image_dims = this->encoding_operator_->get_domain_dimensions();
+		if( image_dims->size() == 0 ){
+			throw std::runtime_error( "Error: osSPSSolver::solve : encoding operator has not set domain dimension" );
+			return boost::shared_ptr<ARRAY_TYPE>();
+		}
+
+		ARRAY_TYPE * x = new ARRAY_TYPE(*image_dims);
+		if (this->x0_.get()){
+			*x = *(this->x0_.get());
+		} else  {
+			clear(x);
+		}
+
+		std::vector<boost::shared_ptr<ARRAY_TYPE> > subsets = this->encoding_operator_->projection_subsets(in);
+
+		ARRAY_TYPE tmp_projection(in->get_dimensions());
+		std::vector<boost::shared_ptr<ARRAY_TYPE> > tmp_projections = this->encoding_operator_->projection_subsets(&tmp_projection);
+
+		boost::shared_ptr<ARRAY_TYPE> precon_image;
+		if (preconditioning_image_)
+			precon_image = preconditioning_image_;
+		else {
+			precon_image = boost::make_shared<ARRAY_TYPE>(image_dims.get());
+			fill(precon_image.get(),ELEMENT_TYPE(1));
+			this->encoding_operator_->mult_M(precon_image.get(),&tmp_projection,false);
+			this->encoding_operator_->mult_MH(&tmp_projection,precon_image.get(),false);
+			abs_inplace(precon_image.get());
+			clamp_min(precon_image.get(),REAL(1e-6));
+
+			reciprocal_inplace(precon_image.get());
+			//ones_image *= (ELEMENT_TYPE) this->encoding_operator_->get_number_of_subsets();
+		}
+		ARRAY_TYPE tmp_image(image_dims.get());
+
+		if( this->output_mode_ >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ){
+			std::cout << "osSPS setup done, starting iterations:" << std::endl;
+		}
+
+		std::vector<int> isubsets(boost::counting_iterator<int>(0), boost::counting_iterator<int>(this->encoding_operator_->get_number_of_subsets()));
+		REAL kappa_int = _kappa;
+		REAL step_size;
+		for (int i =0; i < _iterations; i++){
+			for (int isubset = 0; isubset < this->encoding_operator_->get_number_of_subsets(); isubset++){
+				int subset = isubsets[isubset];
+				this->encoding_operator_->mult_M(x,tmp_projections[subset].get(),subset,false);
+				*tmp_projections[subset] -= *subsets[subset];
+				*tmp_projections[subset] *= ELEMENT_TYPE(-1);
+				if( this->output_mode_ >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ){
+					std::cout << "Iteration " <<i << " Subset " << subset << " Update norm: " << nrm2(tmp_projections[subset].get()) << std::endl;
+				}
+
+				this->encoding_operator_->mult_MH(tmp_projections[subset].get(),&tmp_image,subset,false);
+				tmp_image *= *precon_image;
+				axpy(REAL(_beta/(1+_gamma*i))*this->encoding_operator_->get_number_of_subsets(),&tmp_image,x);
+				if (i ==0 && isubset == 0)
+					step_size = _alpha*nrm2(x)/this->encoding_operator_->get_number_of_subsets();
+			//axpy(REAL(_beta),&tmp_image,x);
+				if (non_negativity_){
+					clamp_min(x,REAL(0));
+				}
+
+				for (auto op : regularization_operators){
+					op->gradient(x,&tmp_image);
+					tmp_image /= nrm2(&tmp_image);
+					auto reg_val = op->magnitude(x);
+					ARRAY_TYPE y = *x;
+					axpy(-kappa_int,&tmp_image,&y);
+
+
+					while(op->magnitude(&y) > reg_val){
+
+						kappa_int /= 2;
+						axpy(kappa_int,&tmp_image,&y);
+					}
+					reg_val = op->magnitude(&y);
+					*x = y;
+
+				}
+
+				//step_size *= 0.99;
+
+			}
+			//std::reverse(isubsets.begin(),isubsets.end());
+			//std::random_shuffle(isubsets.begin(),isubsets.end());
+			/*
+			ARRAY_TYPE tmp_proj(*in);
+			clear(&tmp_proj);
+			this->encoding_operator_->mult_M(x,&tmp_proj,false);
+			tmp_proj -= *in;
+
+
+			std::stringstream ss;
+			ss << "osSPS-" << i << ".real";
+
+			write_nd_array<ELEMENT_TYPE>(x,ss.str().c_str());
+
+			//calc_regMultM(x,regEnc);
+			//REAL f = functionValue(&tmp_proj,regEnc,x);
+			std::cout << "Function value: " << dot(&tmp_proj,&tmp_proj) << std::endl;
+			 */
+		}
+
+
+		return boost::shared_ptr<ARRAY_TYPE>(x);
+	}
+
+	void set_encoding_operator(boost::shared_ptr<subsetOperator<ARRAY_TYPE> > encoding_operator){ encoding_operator_ = encoding_operator; }
+	virtual void add_nonlinear_operator(boost::shared_ptr< generalOperator<ARRAY_TYPE> > op ){
+		regularization_operators.push_back(op);
+	}
+
+
+protected:
+	int _iterations;
+	REAL _beta, _gamma, _alpha, _kappa;
+	bool non_negativity_;
+	unsigned int reg_steps_;
+	boost::shared_ptr<subsetOperator<ARRAY_TYPE> > encoding_operator_;
+	std::vector<boost::shared_ptr<generalOperator<ARRAY_TYPE>>> regularization_operators;
+	boost::shared_ptr<ARRAY_TYPE> preconditioning_image_;
+
+};
+}
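
For orientation, here is a minimal usage sketch for the osSPSSolver defined above. It assumes hoNDArray<float> as the array type and takes the projection data plus an already configured subsetOperator as inputs; the function name and variables (reconstruct, projections, E) are illustrative placeholders, not part of this commit.

// Sketch only: "reconstruct", "projections" and "E" are assumed names.
#include "osSPSSolver.h"
#include "hoNDArray.h"

using namespace Gadgetron;

boost::shared_ptr< hoNDArray<float> > reconstruct( hoNDArray<float>* projections,
                                                   boost::shared_ptr< subsetOperator< hoNDArray<float> > > E )
{
  osSPSSolver< hoNDArray<float> > solver;
  solver.set_encoding_operator(E);            // subset operator mapping image space to projection space
  solver.set_max_iterations(10);              // number of sweeps over all subsets
  solver.set_beta(1.0f);                      // relaxation weight of each subset update
  solver.set_gamma(0.0f);                     // step decay, i.e. beta/(1+gamma*i)
  solver.set_non_negativity_constraint(true); // clamp the image to >= 0 after each update
  return solver.solve(projections);           // reconstructed image
}

set_kappa() and add_nonlinear_operator() would only come into play when a regularization term is added.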
diff --git a/toolboxes/solvers/sbSolver.h b/toolboxes/solvers/sbSolver.h
new file mode 100644
index 0000000..ff89c74
--- /dev/null
+++ b/toolboxes/solvers/sbSolver.h
@@ -0,0 +1,842 @@
+/*
+  An implementation of the "Generalized Split Bregman Algorithm" - sec. 3.2. of the paper
+  "The Split Bregman Method for L1-Regularized Problems" by Tom Goldstein and Stanley Osher. 
+  Siam J. Imaging Sciences. Vol. 2, No. 2, pp. 323-343.
+ */
+
+#pragma once
+
+#include "linearOperatorSolver.h"
+#include "vector_td_utilities.h"
+#include "encodingOperatorContainer.h"
+#include "identityOperator.h"
+
+#include <vector>
+#include <iostream>
+#include <set>
+
+namespace Gadgetron{
+
+template< class ARRAY_TYPE_REAL,
+class ARRAY_TYPE_ELEMENT,
+class INNER_SOLVER >
+
+class sbSolver : public linearOperatorSolver<ARRAY_TYPE_ELEMENT>
+{
+
+protected:
+
+	typedef typename ARRAY_TYPE_REAL::element_type REAL;
+	typedef typename ARRAY_TYPE_ELEMENT::element_type ELEMENT_TYPE;
+
+	class sbRegularizationOperator{
+
+	public:
+
+		sbRegularizationOperator() {}
+		sbRegularizationOperator(boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > op) { reg_op=op; }
+		virtual ~sbRegularizationOperator(){}
+
+		virtual void initialize(boost::shared_ptr< std::vector<size_t> > image_dims, REAL normalization_factor = REAL(1))
+		{
+			d_k = boost::shared_ptr<ARRAY_TYPE_ELEMENT>(new ARRAY_TYPE_ELEMENT(reg_op->get_codomain_dimensions()));
+			b_k = boost::shared_ptr<ARRAY_TYPE_ELEMENT>(new ARRAY_TYPE_ELEMENT(reg_op->get_codomain_dimensions()));
+			clear(d_k.get());
+			clear(b_k.get());
+			if(prior.get()){
+				p_M = boost::shared_ptr<ARRAY_TYPE_ELEMENT>(new ARRAY_TYPE_ELEMENT(reg_op->get_codomain_dimensions()));
+				reg_op->mult_M(prior.get(),p_M.get());
+				*p_M *= normalization_factor;
+			}
+		}
+
+		virtual void update_encoding_space(ARRAY_TYPE_ELEMENT* encoding_space)
+		{
+			*encoding_space = *d_k;
+			*encoding_space -= *b_k;
+			if(prior.get())
+				*encoding_space += *p_M;
+		}
+
+		virtual void deinitialize()
+		{
+			d_k.reset();
+			b_k.reset();
+			p_M.reset();
+		}
+
+		REAL get_weight(){ return reg_op->get_weight(); }
+		void set_weight(REAL weight){ reg_op->set_weight(weight); }
+
+		virtual void update_dk(ARRAY_TYPE_ELEMENT*) = 0;
+		virtual void update_dk_bk(ARRAY_TYPE_ELEMENT*) = 0;
+
+		virtual boost::shared_ptr< std::vector<size_t> > get_codomain_dimensions(){
+			return reg_op->get_codomain_dimensions();
+		}
+
+		virtual void set_prior(boost::shared_ptr<ARRAY_TYPE_ELEMENT> image){ prior=image; }
+
+		boost::shared_ptr< linearOperator< ARRAY_TYPE_ELEMENT> > reg_op;
+		boost::shared_ptr<ARRAY_TYPE_ELEMENT> d_k;
+		boost::shared_ptr<ARRAY_TYPE_ELEMENT> b_k;
+		boost::shared_ptr<ARRAY_TYPE_ELEMENT> p_M;
+		boost::shared_ptr<ARRAY_TYPE_ELEMENT> prior;
+	};
+
+	class sbL1RegularizationOperator : public sbRegularizationOperator
+	{
+	public:
+		sbL1RegularizationOperator(boost::shared_ptr< linearOperator< ARRAY_TYPE_ELEMENT> > op) : sbRegularizationOperator(op) {}
+
+		virtual void update_dk(ARRAY_TYPE_ELEMENT* u_k)
+		{
+			ARRAY_TYPE_ELEMENT tmp(*this->b_k);
+			this->reg_op->mult_M(u_k,&tmp,true);
+			if (this->prior.get())
+				tmp -= *(this->p_M);
+			shrink1(&tmp,REAL(1)/this->reg_op->get_weight(),this->d_k.get());
+		}
+
+		virtual void update_dk_bk(ARRAY_TYPE_ELEMENT* u_k)
+		{
+			this->reg_op->mult_M(u_k,this->b_k.get(),true);
+			if (this->prior.get())
+				*(this->b_k) -= *(this->p_M);
+			shrink1(this->b_k.get(),REAL(1)/this->reg_op->get_weight(),this->d_k.get());
+			*this->b_k -= *this->d_k;
+		}
+	};
+
+
+	class sbL0RegularizationOperator : public sbRegularizationOperator
+	{
+	public:
+		sbL0RegularizationOperator(boost::shared_ptr< linearOperator< ARRAY_TYPE_ELEMENT> > op,REAL _p = REAL(0.5)) : sbRegularizationOperator(op), p(_p) {}
+
+		virtual void update_dk(ARRAY_TYPE_ELEMENT* u_k)
+		{
+			ARRAY_TYPE_ELEMENT tmp(*this->b_k);
+			this->reg_op->mult_M(u_k,&tmp,true);
+			if (this->prior.get())
+				tmp -= *(this->p_M);
+			pshrink(&tmp,REAL(1)/this->reg_op->get_weight(),p,this->d_k.get());
+		}
+
+		virtual void update_dk_bk(ARRAY_TYPE_ELEMENT* u_k)
+		{
+			this->reg_op->mult_M(u_k,this->b_k.get(),true);
+			if (this->prior.get())
+				*(this->b_k) -= *(this->p_M);
+			pshrink(this->b_k.get(),REAL(1)/this->reg_op->get_weight(),p,this->d_k.get());
+			*this->b_k -= *this->d_k;
+		}
+	protected:
+		REAL p;
+	};
+
+
+	class sbL1GroupRegularizationOperator : public sbRegularizationOperator
+	{
+	public:
+		sbL1GroupRegularizationOperator(std::vector<boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > > group)
+	: sbRegularizationOperator()
+	{
+			op_cont = boost::shared_ptr<encodingOperatorContainer<ARRAY_TYPE_ELEMENT> >
+			(new encodingOperatorContainer<ARRAY_TYPE_ELEMENT>);
+			for (int i = 0; i < group.size(); i++)
+				op_cont->add_operator(group[i]);
+			reg_ops = group;
+			this->reg_op = op_cont;
+	}
+
+		virtual void update_encoding_space(ARRAY_TYPE_ELEMENT* encoding_space)
+		{
+			for (int i=0; i < reg_ops.size(); i++){
+				ARRAY_TYPE_ELEMENT tmp(codom_dims,encoding_space->get_data_ptr()+op_cont->get_offset(i));
+				tmp = *d_ks[i];
+				tmp -= *b_ks[i];
+				if (this->prior.get())
+					tmp += *p_Ms[i];
+			}
+		}
+
+		virtual void initialize(boost::shared_ptr< std::vector<size_t> > image_dims, REAL normalization_factor = REAL(1))
+		{
+			codom_dims = reg_ops.front()->get_codomain_dimensions();
+			d_ks = std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> >(reg_ops.size());
+			b_ks = std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> >(reg_ops.size());
+			if (this->prior.get())
+				p_Ms = std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> >(reg_ops.size());
+			for (int i=0; i<reg_ops.size(); i++){
+				d_ks[i] = boost::shared_ptr<ARRAY_TYPE_ELEMENT>(new ARRAY_TYPE_ELEMENT(codom_dims));
+				clear(d_ks[i].get());
+				b_ks[i] = boost::shared_ptr<ARRAY_TYPE_ELEMENT>(new ARRAY_TYPE_ELEMENT(codom_dims));
+				clear(b_ks[i].get());
+				if (this->prior.get()){
+					p_Ms[i] = boost::shared_ptr<ARRAY_TYPE_ELEMENT>(new ARRAY_TYPE_ELEMENT(codom_dims));
+					reg_ops[i]->mult_M(this->prior.get(),p_Ms[i].get());
+					*p_Ms[i] *= normalization_factor;
+				}
+			}
+		}
+
+		virtual void deinitialize()
+		{
+			d_ks.clear();
+			b_ks.clear();
+			p_Ms.clear();
+		}
+
+		virtual void update_dk(ARRAY_TYPE_ELEMENT* u_k)
+		{
+			ARRAY_TYPE_REAL s_k(codom_dims);
+			ARRAY_TYPE_ELEMENT *tmp = new ARRAY_TYPE_ELEMENT[reg_ops.size()];
+			for (int i=0; i<reg_ops.size(); i++) {
+				tmp[i] = *b_ks[i];
+				this->reg_ops[i]->mult_M(u_k,&tmp[i],true);
+				if (this->prior.get())
+					tmp[i] -= *p_Ms[i];
+				(i==0) ? s_k = *abs_square<ELEMENT_TYPE>(&tmp[i]) : s_k += *abs_square<ELEMENT_TYPE>(&tmp[i]);
+			}
+			sqrt_inplace(&s_k);
+			for (int i=0; i<reg_ops.size(); i++) {
+				shrinkd(&tmp[i],&s_k,REAL(1)/reg_ops[i]->get_weight(),d_ks[i].get());
+			}
+			delete[] tmp;
+		}
+
+		virtual void update_dk_bk(ARRAY_TYPE_ELEMENT* u_k)
+		{
+			ARRAY_TYPE_REAL s_k(codom_dims);
+			for (int i=0; i<reg_ops.size(); i++) {
+				this->reg_ops[i]->mult_M(u_k,b_ks[i].get(),true);
+				if (this->prior.get())
+					*b_ks[i] -= *p_Ms[i];
+				(i==0) ? s_k = *abs_square(b_ks[i].get()) : s_k += *abs_square(b_ks[i].get());
+			}
+			sqrt_inplace(&s_k);
+			for (int i=0; i<reg_ops.size(); i++) {
+				shrinkd(b_ks[i].get(),&s_k,REAL(1)/reg_ops[i]->get_weight(),d_ks[i].get());
+				*b_ks[i] -= *d_ks[i];
+			}
+		}
+
+		virtual boost::shared_ptr< std::vector<size_t> > get_codomain_dimensions(){
+			return reg_ops.front()->get_codomain_dimensions();
+		}
+
+	protected:
+		std::vector<boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > > reg_ops;
+		std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> > d_ks;
+		std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> > b_ks;
+		std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> > p_Ms;
+		boost::shared_ptr<encodingOperatorContainer<ARRAY_TYPE_ELEMENT> > op_cont;
+		boost::shared_ptr< std::vector<size_t> > codom_dims;
+	};
+
+
+	class sbL0GroupRegularizationOperator : public sbRegularizationOperator
+	{
+	public:
+		sbL0GroupRegularizationOperator(std::vector<boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > > group, REAL _p = REAL(0.5))
+	: sbRegularizationOperator(), p(_p)
+	{
+			op_cont = boost::shared_ptr<encodingOperatorContainer<ARRAY_TYPE_ELEMENT> >
+			(new encodingOperatorContainer<ARRAY_TYPE_ELEMENT>);
+			for (int i = 0; i < group.size(); i++)
+				op_cont->add_operator(group[i]);
+			reg_ops = group;
+			this->reg_op = op_cont;
+	}
+
+		virtual void update_encoding_space(ARRAY_TYPE_ELEMENT* encoding_space)
+		{
+			for (int i=0; i < reg_ops.size(); i++){
+				ARRAY_TYPE_ELEMENT tmp(codom_dims,encoding_space->get_data_ptr()+op_cont->get_offset(i));
+				tmp = *d_ks[i];
+				tmp -= *b_ks[i];
+				if (this->prior.get())
+					tmp += *p_Ms[i];
+			}
+		}
+
+		virtual void initialize(boost::shared_ptr< std::vector<size_t> > image_dims, REAL normalization_factor = REAL(1))
+		{
+			codom_dims = reg_ops.front()->get_codomain_dimensions();
+			d_ks = std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> >(reg_ops.size());
+			b_ks = std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> >(reg_ops.size());
+			if (this->prior.get())
+				p_Ms = std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> >(reg_ops.size());
+			for (int i=0; i<reg_ops.size(); i++){
+				d_ks[i] = boost::shared_ptr<ARRAY_TYPE_ELEMENT>(new ARRAY_TYPE_ELEMENT(codom_dims));
+				clear(d_ks[i].get());
+				b_ks[i] = boost::shared_ptr<ARRAY_TYPE_ELEMENT>(new ARRAY_TYPE_ELEMENT(codom_dims));
+				clear(b_ks[i].get());
+				if (this->prior.get()){
+					p_Ms[i] = boost::shared_ptr<ARRAY_TYPE_ELEMENT>(new ARRAY_TYPE_ELEMENT(codom_dims));
+					reg_ops[i]->mult_M(this->prior.get(),p_Ms[i].get());
+					*p_Ms[i] *= normalization_factor;
+				}
+			}
+		}
+
+		virtual void deinitialize()
+		{
+			d_ks.clear();
+			b_ks.clear();
+			p_Ms.clear();
+		}
+
+		virtual void update_dk(ARRAY_TYPE_ELEMENT* u_k)
+		{
+			ARRAY_TYPE_REAL s_k(codom_dims);
+			ARRAY_TYPE_ELEMENT *tmp = new ARRAY_TYPE_ELEMENT[reg_ops.size()];
+			for (int i=0; i<reg_ops.size(); i++) {
+				tmp[i] = *b_ks[i];
+				this->reg_ops[i]->mult_M(u_k,&tmp[i],true);
+				if (this->prior.get())
+					tmp[i] -= *p_Ms[i];
+				(i==0) ? s_k = *abs_square<ELEMENT_TYPE>(&tmp[i]) : s_k += *abs_square<ELEMENT_TYPE>(&tmp[i]);
+			}
+			sqrt_inplace(&s_k);
+			for (int i=0; i<reg_ops.size(); i++) {
+				pshrinkd(&tmp[i],&s_k,REAL(1)/reg_ops[i]->get_weight(),p,d_ks[i].get());
+			}
+			delete[] tmp;
+		}
+
+		virtual void update_dk_bk(ARRAY_TYPE_ELEMENT* u_k)
+		{
+			ARRAY_TYPE_REAL s_k(codom_dims);
+			for (int i=0; i<reg_ops.size(); i++) {
+				this->reg_ops[i]->mult_M(u_k,b_ks[i].get(),true);
+				if (this->prior.get())
+					*b_ks[i] -= *p_Ms[i];
+				(i==0) ? s_k = *abs_square(b_ks[i].get()) : s_k += *abs_square(b_ks[i].get());
+			}
+			sqrt_inplace(&s_k);
+			for (int i=0; i<reg_ops.size(); i++) {
+				pshrinkd(b_ks[i].get(),&s_k,REAL(1)/reg_ops[i]->get_weight(),p,d_ks[i].get());
+				*b_ks[i] -= *d_ks[i];
+			}
+		}
+
+		virtual boost::shared_ptr< std::vector<size_t> > get_codomain_dimensions(){
+			return reg_ops.front()->get_codomain_dimensions();
+		}
+
+	protected:
+		std::vector<boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > > reg_ops;
+		std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> > d_ks;
+		std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> > b_ks;
+		std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> > p_Ms;
+		boost::shared_ptr<encodingOperatorContainer<ARRAY_TYPE_ELEMENT> > op_cont;
+		boost::shared_ptr< std::vector<size_t> > codom_dims;
+		REAL p;
+	};
+
+	class sbL2RegularizationOperator : public sbRegularizationOperator
+	{
+	public:
+
+		sbL2RegularizationOperator(boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > op) : sbRegularizationOperator(op) {}
+
+		virtual void update_dk(ARRAY_TYPE_ELEMENT* u_k)
+		{
+			*this->d_k = *this->b_k;
+			this->reg_op->mult_M(u_k,this->d_k.get(),true);
+			if (this->prior.get()){
+				*this->d_k -= *this->p_M;
+			}
+			*(this->d_k) *= REAL(1)/(1+this->reg_op->get_weight());
+		}
+
+		virtual void update_dk_bk(ARRAY_TYPE_ELEMENT* u_k){
+			update_dk(u_k);
+			*(this->b_k) = *(this->d_k);
+			*(this->b_k) *= this->reg_op->get_weight();
+		}
+	};
+
+	class sbNonNegativityOperator : public sbRegularizationOperator
+	{
+	public:
+		sbNonNegativityOperator(): sbRegularizationOperator(){
+			this->reg_op = boost::shared_ptr<identityOperator<ARRAY_TYPE_ELEMENT> >(new identityOperator<ARRAY_TYPE_ELEMENT>);
+		}
+
+		virtual void initialize(boost::shared_ptr< std::vector<size_t> > image_dims,
+				REAL normalization_factor = REAL(1))
+		{
+			this->reg_op->set_domain_dimensions(image_dims.get());
+			this->reg_op->set_codomain_dimensions(image_dims.get());
+			sbRegularizationOperator::initialize( image_dims, normalization_factor);
+		}
+
+		virtual void update_encoding_space(ARRAY_TYPE_ELEMENT* encoding_space){
+			*encoding_space = *(this->d_k);
+			clamp_min(encoding_space,REAL(0));
+			*encoding_space += *(this->b_k);
+		}
+
+		virtual void update_dk(ARRAY_TYPE_ELEMENT* u_k){
+			*(this->d_k) = *u_k;
+			*(this->d_k) -= (*(this->b_k));
+			clamp_min(this->d_k.get(),REAL(0));
+		}
+
+		virtual void update_dk_bk(ARRAY_TYPE_ELEMENT* u_k){
+			update_dk(u_k);
+			*(this->b_k) += *(this->d_k);
+			*(this->b_k) -= *u_k;
+		}
+	};
+
+	public:
+
+	// Constructor
+	//
+
+	sbSolver() : linearOperatorSolver<ARRAY_TYPE_ELEMENT>()
+	{
+		normalization_mode_ = SB_NORMALIZE_TO_IMAGE_SPACE_IDENTITY;
+		tolerance_ = REAL(0);
+		outer_iterations_ = 10;
+		inner_iterations_ = 1;
+		num_reg_operators_ = 0;
+		inner_solver_ = boost::shared_ptr<INNER_SOLVER>( new INNER_SOLVER() );
+		non_negativity_filter_weight_ = REAL(0);
+		use_x0_ = false;
+	}
+
+	// Destructor
+	//
+
+	virtual ~sbSolver() {}
+
+	// Add regularization operator to group (for isotropic regularization)
+	//
+
+	virtual void add_regularization_group_operator( boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > op )
+	{
+		if( !op.get() ){
+			throw std::runtime_error( "Error: sbSolver::add_regularization_group_operator : NULL operator provided" );
+		}
+		current_group_.push_back(op);
+	}
+
+	// Add isotropic regularization group (multiple groups allowed)
+	//
+
+	virtual void add_group(int L_norm=1)
+	{
+		if(current_group_.size()==0){
+			throw std::runtime_error( "Error: sbSolver::add_group : no regularization group operators added" );
+		}
+		if (L_norm==2){
+			for (int i=0; i<current_group_.size(); i++){
+				regularization_operators_.push_back(boost::shared_ptr<sbL2RegularizationOperator>(new sbL2RegularizationOperator(current_group_[i])));
+			}
+		} else if (L_norm==0){
+			boost::shared_ptr<sbL0GroupRegularizationOperator> group(new sbL0GroupRegularizationOperator(current_group_));
+			regularization_operators_.push_back(group);
+		}else if (L_norm ==1){
+			boost::shared_ptr<sbL1GroupRegularizationOperator> group(new sbL1GroupRegularizationOperator(current_group_));
+			regularization_operators_.push_back(group);
+		} else throw std::runtime_error("Illegal L-norm used in add_group");
+		current_group_.clear();
+	}
+
+	virtual void add_group( boost::shared_ptr<ARRAY_TYPE_ELEMENT> prior, int L_norm=1 )
+	{
+		if(current_group_.size()==0){
+			throw std::runtime_error( "Error: sbSolver::add_group : no regularization group operators added" );
+		}
+		if (L_norm==2){
+			for (int i=0; i<current_group_.size(); i++){
+				regularization_operators_.push_back(boost::shared_ptr<sbL2RegularizationOperator>(new sbL2RegularizationOperator(current_group_[i])));
+				regularization_operators_.back()->set_prior(prior);
+			}
+		} else if (L_norm==0){
+			boost::shared_ptr<sbL0GroupRegularizationOperator> group(new sbL0GroupRegularizationOperator(current_group_));
+			group->set_prior(prior);
+			regularization_operators_.push_back(group);
+		} else if (L_norm==1){
+			boost::shared_ptr<sbL1GroupRegularizationOperator> group(new sbL1GroupRegularizationOperator(current_group_));
+			group->set_prior(prior);
+			regularization_operators_.push_back(group);
+		} else throw std::runtime_error("Illegal L-norm used in add_group");
+		current_group_.clear();
+	}
+
+	virtual void add_regularization_operator(boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > op, int L_norm=1 ){
+		if (L_norm==1){
+			regularization_operators_.push_back(boost::shared_ptr<sbL1RegularizationOperator>(new sbL1RegularizationOperator(op)));
+		}else if (L_norm == 0){
+			regularization_operators_.push_back(boost::shared_ptr<sbL0RegularizationOperator>(new sbL0RegularizationOperator(op)));
+		}else{
+			regularization_operators_.push_back(boost::shared_ptr<sbL2RegularizationOperator>(new sbL2RegularizationOperator(op)));
+		}
+	}
+
+	virtual void add_regularization_operator(boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > op, boost::shared_ptr<ARRAY_TYPE_ELEMENT> prior, int L_norm=1 ){
+		if (L_norm==1){
+			regularization_operators_.push_back(boost::shared_ptr<sbL1RegularizationOperator>(new sbL1RegularizationOperator(op)));
+			regularization_operators_.back()->set_prior(prior);
+		}else if (L_norm == 0){
+			regularization_operators_.push_back(boost::shared_ptr<sbL0RegularizationOperator>(new sbL0RegularizationOperator(op)));
+			regularization_operators_.back()->set_prior(prior);
+		}else{
+			regularization_operators_.push_back(boost::shared_ptr<sbL2RegularizationOperator>(new sbL2RegularizationOperator(op)));
+			regularization_operators_.back()->set_prior(prior);
+		}
+	}
+
+	// Specify normalization mode.
+	// The default mode is to use image space normalization.
+
+	enum SB_normalization_mode{
+		SB_NO_NORMALIZATION,
+		SB_NORMALIZE_TO_IMAGE_SPACE_IDENTITY
+	};
+
+	virtual void set_normalization_mode( SB_normalization_mode mode ){
+		normalization_mode_ = mode;
+	}
+
+	// Set termination criterion tolerance
+	//
+
+	virtual void set_tc_tolerance( REAL tolerance )
+	{
+		if( tolerance < REAL(0) )
+			this->solver_warning( "Warning: sbSolver::set_tc_tolerance : tolerance cannot be negative. Ignored." );
+		else tolerance_ = tolerance;
+	}
+
+	virtual void set_non_negativity_filter(REAL nnf){
+		non_negativity_filter_weight_ = nnf;
+	}
+
+	// Set/get maximum number of outer Split-Bregman iterations
+	//
+
+	virtual void set_max_outer_iterations( unsigned int iterations ) { outer_iterations_ = iterations; }
+	virtual unsigned int get_max_outer_iterations() { return outer_iterations_; }
+
+	// Set/get maximum number of inner Split-Bregman iterations
+	//
+
+	virtual void set_max_inner_iterations( unsigned int iterations ) { inner_iterations_ = iterations; }
+	virtual unsigned int get_max_inner_iterations() { return inner_iterations_; }
+
+	virtual void set_use_inner_x0(bool use){ use_x0_=use; }
+
+	// Get the inner solver
+	//
+
+	virtual boost::shared_ptr<INNER_SOLVER> get_inner_solver() { return inner_solver_; }
+
+	// Provide the user with an option to access u_k right after its update.
+	//
+
+	virtual bool post_linear_solver_callback( ARRAY_TYPE_ELEMENT* ) { return true; }
+
+	//
+	// Main solver interface
+	//
+
+	virtual boost::shared_ptr<ARRAY_TYPE_ELEMENT> solve( ARRAY_TYPE_ELEMENT *_f )
+	{
+		// Check that operators etc. have been provided and are consistent in dimensionality
+		//
+		validate_solver();
+
+		// Define u_k
+		//
+		boost::shared_ptr<ARRAY_TYPE_ELEMENT> u_k( new ARRAY_TYPE_ELEMENT(this->encoding_operator_->get_domain_dimensions()) );
+
+		// Use x0 (if provided) as starting solution estimate
+		//
+		if( this->get_x0().get() )
+			*u_k = *(this->get_x0());
+		else
+			clear(u_k.get());
+
+		// Normalize and _then_ initialize (the order matters)
+		boost::shared_ptr<ARRAY_TYPE_ELEMENT> f(new ARRAY_TYPE_ELEMENT(*_f));
+		REAL normalization_factor = normalize_data( f.get() );
+		initialize( normalization_factor );
+
+		// Invoke the core solver
+		//
+		core( tolerance_, outer_iterations_, inner_iterations_, f, u_k);
+
+		// Clean up memory occupied by the operator container and inner solver
+		deinitialize();
+
+		// Undo normalization
+		*u_k /= normalization_factor;
+
+		// ... and return the result
+		//
+		return u_k;
+	}
+
+	protected:
+
+	//
+	// Everything beyond this point is internal to the implementation
+	// and not intended to be exposed as a public interface
+	//
+
+	// Validate operator
+	//
+
+	virtual void validate_encoding_operator()
+	{
+		boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > op = this->get_encoding_operator();
+
+		if( !op.get() ){
+			throw std::runtime_error( "Error: sbSolver::validate_encoding_operator : operator not set" );
+		}
+
+		boost::shared_ptr< std::vector<size_t> > op_dims = op->get_domain_dimensions();
+		if( op_dims->size() == 0 ){
+			throw std::runtime_error( "Error: sbSolver::validate_encoding_operator : encoding operator must have specified domain dimensions" );
+		}
+
+		op_dims = op->get_codomain_dimensions();
+		if( op_dims->size() == 0 ){
+			throw std::runtime_error( "Error: sbSolver::validate_encoding_operator : encoding operator must have specified codomain dimensions" );
+		}
+	}
+
+	// Validate regularization operator
+	//
+
+	virtual void validate_regularization_operators( std::vector<size_t> *image_dims )
+	{
+		if( image_dims->size() == 0 ){
+			throw std::runtime_error( "Error: sbSolver::validate_regularization_operators : empty dimensions vector provided" );
+		}
+
+		for( unsigned int i=0; i<this->regularization_operators_.size(); i++ ){
+
+			boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > op = regularization_operators_[i]->reg_op;
+
+			if( !op.get() ){
+				throw std::runtime_error( "Error: sbSolver::validate_regularization_operators : invalid operator provided" );
+			}
+
+			boost::shared_ptr< std::vector<size_t> > op_dims = op->get_domain_dimensions();
+			boost::shared_ptr< std::vector<size_t> > op_codims = op->get_codomain_dimensions();
+			if (!op_codims.get()){
+				throw std::runtime_error("Error: sbSolver::validate_regularization_operators : operator codomain dimensions not set");
+			}
+
+			if( *op_dims != *image_dims ){
+				throw std::runtime_error( "Error: sbSolver::validate_regularization_operators : operator domain dimensions mismatch between encoding and regularization operators" );
+			}
+		}
+	}
+
+	// Check that the solver is set up properly
+	virtual void validate_solver()
+	{
+		// Some tests to check if we are ready to go...
+		//
+
+		validate_encoding_operator();
+		boost::shared_ptr< std::vector<size_t> > op_dims = this->encoding_operator_->get_domain_dimensions();
+		validate_regularization_operators(op_dims.get());
+	}
+
+	// Initialize solver
+	virtual void initialize( REAL normalization_factor = REAL(1) )
+	{
+		// Get image dimensions
+		boost::shared_ptr< std::vector<size_t> > image_dims =
+				this->encoding_operator_->get_domain_dimensions();
+
+		if (non_negativity_filter_weight_ > REAL(0)){
+			regularization_operators_.push_back(boost::shared_ptr<sbNonNegativityOperator>(new sbNonNegativityOperator));
+			regularization_operators_.back()->set_weight(non_negativity_filter_weight_);
+		}
+
+		// Set up inner solver
+		//
+
+		enc_op_container_ = boost::shared_ptr<encodingOperatorContainer<ARRAY_TYPE_ELEMENT> >( new encodingOperatorContainer<ARRAY_TYPE_ELEMENT>() );
+		inner_solver_->set_encoding_operator( enc_op_container_ );
+		enc_op_container_->add_operator( this->encoding_operator_ );
+
+		// Invoke initialization on all regularization operators
+		//
+
+		for (int i=0; i < regularization_operators_.size(); i++){
+			regularization_operators_[i]->initialize(image_dims, normalization_factor);
+			enc_op_container_->add_operator( regularization_operators_[i]->reg_op );
+		}
+	}
+
+	// Clean up operator memory in the inner solver
+	// and release the regularization operators' temporary buffers
+
+	virtual void deinitialize()
+	{
+		enc_op_container_ = boost::shared_ptr<encodingOperatorContainer<ARRAY_TYPE_ELEMENT> >( new encodingOperatorContainer<ARRAY_TYPE_ELEMENT>);
+		inner_solver_->set_encoding_operator( enc_op_container_ );
+		for (int i=0; i < regularization_operators_.size(); i++){
+			regularization_operators_[i]->deinitialize();
+		}
+		if (non_negativity_filter_weight_ > REAL(0)){
+			regularization_operators_.pop_back();
+		}
+	}
+
+	// The core of the Split Bregman solver.
+	//
+
+	virtual void core( REAL tolerance, unsigned int outer_iterations, unsigned int inner_iterations,
+			boost::shared_ptr<ARRAY_TYPE_ELEMENT> f,
+			boost::shared_ptr<ARRAY_TYPE_ELEMENT> u_k )
+	{
+		// Image space dimensions
+		boost::shared_ptr< std::vector<size_t> > image_dims =
+				this->encoding_operator_->get_domain_dimensions();
+
+		// Keep a copy of the "previous" u_k to compute the outer loop change of u_k
+		//
+
+		ARRAY_TYPE_ELEMENT u_k_prev;
+		if( tolerance > REAL(0) || this->output_mode_ >= solver<ARRAY_TYPE_ELEMENT, ARRAY_TYPE_ELEMENT>::OUTPUT_VERBOSE ){
+			u_k_prev = *u_k;
+		}
+
+		//
+		// Outer loop
+		//
+
+		for( unsigned int outer_iteration=0; outer_iteration<outer_iterations; outer_iteration++ ) {
+
+			if( this->output_mode_ >= solver<ARRAY_TYPE_ELEMENT, ARRAY_TYPE_ELEMENT>::OUTPUT_MAX )
+				GDEBUG_STREAM(std::endl << "SB outer loop iteration " << outer_iteration << std::endl << std::endl);
+
+			//
+			// Inner loop
+			//
+
+			for( unsigned int inner_iteration=0; inner_iteration<inner_iterations; inner_iteration++ ) {
+
+				if( this->output_mode_ >= solver<ARRAY_TYPE_ELEMENT, ARRAY_TYPE_ELEMENT>::OUTPUT_MAX )
+					GDEBUG_STREAM(std::endl << "SB inner loop iteration " << inner_iteration << std::endl << std::endl);
+
+				{ // Brackets used to free 'data' below as soon as it goes out of scope
+
+					// Setup input vector to the encoding operator container (argument to the inner solver's solve)
+					//
+
+					ARRAY_TYPE_ELEMENT data(enc_op_container_->get_codomain_dimensions());
+					ARRAY_TYPE_ELEMENT tmp(f->get_dimensions().get(), data.get_data_ptr() );
+
+					tmp = *f;
+
+					// Next add the regularization operators' data, d_k - b_k
+					//
+
+					for( unsigned int i=0; i< regularization_operators_.size(); i++ ){
+						boost::shared_ptr<sbRegularizationOperator > op = regularization_operators_[i];
+						tmp.create( op->get_codomain_dimensions(), data.get_data_ptr()+enc_op_container_->get_offset(i+1) );
+						op->update_encoding_space(&tmp);
+					}
+
+					// Solve for u_k
+					//
+
+					{
+						if (use_x0_){
+							get_inner_solver()->set_x0(u_k);
+						}
+
+						boost::shared_ptr<ARRAY_TYPE_ELEMENT> tmp_u_k =
+								get_inner_solver()->solve( &data );
+
+						// Invoke the post inner solver callback
+						post_linear_solver_callback( tmp_u_k.get() );
+
+						// Compute change in u_k
+						if( this->output_mode_ >= solver<ARRAY_TYPE_ELEMENT, ARRAY_TYPE_ELEMENT>::OUTPUT_VERBOSE ){
+							*u_k -= *tmp_u_k;
+							GDEBUG_STREAM("u_k delta l2-norm (inner loop): " << nrm2(u_k.get()) << std::endl);
+						}
+
+						// Update u_k
+						*u_k = *tmp_u_k;
+					}
+				}
+
+				// Update d_k (and b_k in final inner iteration)
+				//
+
+				for( unsigned int i=0; i< regularization_operators_.size(); i++ ){
+					boost::shared_ptr<sbRegularizationOperator > op = regularization_operators_[i];
+					if( inner_iteration < inner_iterations-1 )
+						op->update_dk(u_k.get());
+					else
+						op->update_dk_bk(u_k.get());
+				}
+			} // end of inner loop
+
+			// Output change in u_k
+			if( tolerance > REAL(0) || this->output_mode_ >= solver<ARRAY_TYPE_ELEMENT, ARRAY_TYPE_ELEMENT>::OUTPUT_VERBOSE ){
+				u_k_prev *= ELEMENT_TYPE(-1);
+				u_k_prev += *u_k;
+				REAL delta = nrm2(&u_k_prev);
+
+				if( this->output_mode_ >= solver<ARRAY_TYPE_ELEMENT, ARRAY_TYPE_ELEMENT>::OUTPUT_VERBOSE )
+					GDEBUG_STREAM("u_k delta l2-norm (outer loop): " << delta << std::endl << std::endl);
+
+				if( delta < tolerance )
+					break;
+
+				u_k_prev = *u_k;
+			}
+		} // end of outer loop
+	}
+
+	virtual REAL normalize_data( ARRAY_TYPE_ELEMENT *f )
+	{
+		REAL image_scale = REAL(1);
+
+		if( normalization_mode_ == SB_NORMALIZE_TO_IMAGE_SPACE_IDENTITY ){
+
+			//
+			// Normalize to an average energy of "one intensity unit per image element"
+			//
+
+			boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > op = this->encoding_operator_;
+			ARRAY_TYPE_ELEMENT tmp( op->get_domain_dimensions() );
+			op->mult_MH( f, &tmp );
+			REAL sum = asum( &tmp );
+			image_scale = REAL(tmp.get_number_of_elements())/sum;
+			*f *= image_scale;
+		}
+
+		return image_scale;
+	}
+
+	protected:
+	SB_normalization_mode normalization_mode_;
+	REAL tolerance_;
+	unsigned int outer_iterations_, inner_iterations_;
+	unsigned int num_reg_operators_;
+	std::vector< boost::shared_ptr<sbRegularizationOperator> > regularization_operators_;
+	std::vector< boost::shared_ptr<linearOperator<ARRAY_TYPE_ELEMENT> > > current_group_;
+	boost::shared_ptr<INNER_SOLVER> inner_solver_;
+	boost::shared_ptr<encodingOperatorContainer<ARRAY_TYPE_ELEMENT> > enc_op_container_;
+	std::vector<unsigned int> weights_backup_;
+	REAL non_negativity_filter_weight_;
+	bool use_x0_;
+};
+}
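
A hedged usage sketch for the sbSolver above: a total-variation style reconstruction where two partial-derivative operators are grouped for isotropic L1 shrinkage, with cgSolver (assumed available in this toolbox) as the inner solver. hoNDArray<float>, the operator arguments and the function name tv_reconstruct are assumptions for illustration; set_encoding_operator is inherited from the linearOperatorSolver base class.

// Sketch only: E, Dx and Dy are assumed to be configured elsewhere (dimensions and weights set).
#include "sbSolver.h"
#include "cgSolver.h"
#include "hoNDArray.h"

using namespace Gadgetron;

typedef hoNDArray<float> ARR;

boost::shared_ptr<ARR> tv_reconstruct( ARR* data,
                                       boost::shared_ptr< linearOperator<ARR> > E,   // encoding operator
                                       boost::shared_ptr< linearOperator<ARR> > Dx,  // partial derivative, x
                                       boost::shared_ptr< linearOperator<ARR> > Dy ) // partial derivative, y
{
  sbSolver< ARR, ARR, cgSolver<ARR> > sb;
  sb.set_encoding_operator(E);

  // Group the derivative operators so their coefficients are shrunk jointly (isotropic TV, L1 norm).
  sb.add_regularization_group_operator(Dx);
  sb.add_regularization_group_operator(Dy);
  sb.add_group(1);

  sb.set_max_outer_iterations(20);
  sb.set_max_inner_iterations(1);
  sb.get_inner_solver()->set_max_iterations(10); // CG iterations per u_k update (assuming cgSolver exposes this setter)

  return sb.solve(data);
}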
diff --git a/toolboxes/solvers/sbcSolver.h b/toolboxes/solvers/sbcSolver.h
new file mode 100644
index 0000000..fe1daf2
--- /dev/null
+++ b/toolboxes/solvers/sbcSolver.h
@@ -0,0 +1,96 @@
+/*
+  An implementation of the constrained solver of the paper
+  "The Split Bregman Method for L1-Regularized Problems" by Tom Goldstein and Stanley Osher. 
+  Siam J. Imaging Sciences. Vol. 2, No. 2, pp. 323-343.
+*/
+
+#pragma once
+
+#include "sbSolver.h"
+
+namespace Gadgetron{
+
+  template<class ARRAY_TYPE_REAL,
+	   class ARRAY_TYPE_ELEMENT, 
+	   class INNER_SOLVER>
+  class sbcSolver : public sbSolver<ARRAY_TYPE_REAL, ARRAY_TYPE_ELEMENT, INNER_SOLVER>
+  {
+  protected:
+    typedef typename ARRAY_TYPE_REAL::element_type REAL;
+    
+  public:
+  
+    sbcSolver() : sbSolver<ARRAY_TYPE_REAL, ARRAY_TYPE_ELEMENT, INNER_SOLVER>() {}
+    virtual ~sbcSolver() {}
+    
+    virtual boost::shared_ptr<ARRAY_TYPE_ELEMENT> solve( ARRAY_TYPE_ELEMENT *_f )
+    {
+      // Check if everything is set up right
+      //
+      this->validate_solver();
+
+      // Define u_k
+      //
+      boost::shared_ptr<ARRAY_TYPE_ELEMENT> u_k( new ARRAY_TYPE_ELEMENT(this->encoding_operator_->get_domain_dimensions()));
+
+      // Use x0 (if provided) as starting estimate
+      if(this->get_x0().get())
+	*u_k = *(this->get_x0());
+      else 
+	clear(u_k.get());
+
+
+      // Normalize and _then_ initialize (the order matters)
+      //
+      
+      boost::shared_ptr<ARRAY_TYPE_ELEMENT> f(new ARRAY_TYPE_ELEMENT(*_f));
+      REAL normalization_factor = this->normalize_data( f.get() );
+      boost::shared_ptr<ARRAY_TYPE_ELEMENT> f_k(new ARRAY_TYPE_ELEMENT(*f));
+      this->initialize( normalization_factor );
+        
+      // Outer loop
+      //
+
+      for( unsigned int outer_iteration=0; outer_iteration<this->outer_iterations_; outer_iteration++ ) {
+      
+	if( this->output_mode_ >= solver<ARRAY_TYPE_ELEMENT, ARRAY_TYPE_ELEMENT>::OUTPUT_MAX )
+	  GDEBUG_STREAM(std::endl << "SBC outer loop iteration " << outer_iteration << std::endl << std::endl);
+	
+	// Invoke the core solver
+	//
+	
+	this->core( this->tolerance_, this->inner_iterations_, 1, f_k, u_k );
+
+	// Update f_k
+	//
+
+	ARRAY_TYPE_ELEMENT encoded_image(f->get_dimensions());
+	this->encoding_operator_->mult_M( u_k.get(), &encoded_image );
+	encoded_image -= *f;
+
+	if( this->tolerance_ > REAL(0) || this->output_mode_ >= solver<ARRAY_TYPE_ELEMENT, ARRAY_TYPE_ELEMENT>::OUTPUT_VERBOSE ){
+	
+	  REAL delta = nrm2(&encoded_image);
+	
+	  if( this->output_mode_ >= solver<ARRAY_TYPE_ELEMENT, ARRAY_TYPE_ELEMENT>::OUTPUT_VERBOSE )
+	    GDEBUG_STREAM("Squared residual norm (outer loop): " << delta*delta << std::endl << std::endl);
+	  
+	  if( delta < this->tolerance_ )
+	    break;
+	}
+
+	*f_k -= encoded_image;
+      
+      } // end of outer loop
+        
+      // Clean up memory occupied by the operator container and inner solver
+      this->deinitialize();
+
+      // Undo normalization
+      *u_k /= normalization_factor;
+      
+      // ... and return the result
+      return u_k;
+    }  
+  };
+}
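
The constrained variant above simply wraps the unconstrained Split Bregman core in an outer Bregman loop that feeds the data-consistency residual back into the right-hand side. In the notation of the code (E the encoding operator, f the normalized data, u_k the image returned by the inner call), the update applied at the end of each outer iteration is

    f_{k+1} = f_k + (f - E\,u_k),

which is what the lines computing encoded_image = E u_k - f followed by *f_k -= encoded_image implement; the loop exits early once ||E u_k - f||_2 falls below the tolerance.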
diff --git a/toolboxes/solvers/solver.h b/toolboxes/solvers/solver.h
new file mode 100644
index 0000000..f6ea9e0
--- /dev/null
+++ b/toolboxes/solvers/solver.h
@@ -0,0 +1,47 @@
+/** \file solver.h
+    \brief Base class for all Gadgetron solvers.
+*/
+
+#pragma once
+
+#include <boost/shared_ptr.hpp>
+#include <string>
+#include <iostream>
+#include "log.h"
+namespace Gadgetron
+{
+
+  template <class ARRAY_TYPE_IN, class ARRAY_TYPE_OUT> class solver
+  {
+  public:
+
+    // Constructor/destructor
+    solver() { output_mode_ = OUTPUT_SILENT; }
+    virtual ~solver() {}
+  
+    // Output modes
+    enum solverOutputModes { OUTPUT_SILENT = 0, OUTPUT_WARNINGS = 1, OUTPUT_VERBOSE = 2, OUTPUT_MAX = 3 };
+  
+    // Set/get output mode
+    virtual int get_output_mode() { return output_mode_; }
+    virtual void set_output_mode( int output_mode ) {
+      if( output_mode >= OUTPUT_SILENT && output_mode <= OUTPUT_MAX )
+	output_mode_ = output_mode;
+    }
+  
+    // Set/get starting solution/estimate for solver
+    virtual void set_x0( boost::shared_ptr<ARRAY_TYPE_OUT> x0 ){ x0_ = x0; }
+    virtual boost::shared_ptr<ARRAY_TYPE_OUT> get_x0(){ return x0_; }
+
+    virtual void solver_warning(std::string warn){
+      GDEBUG_STREAM(warn << std::endl);
+    }
+
+    // Invoke solver
+    virtual boost::shared_ptr<ARRAY_TYPE_OUT> solve( ARRAY_TYPE_IN* ) = 0;
+
+  protected:
+    int output_mode_;
+    boost::shared_ptr<ARRAY_TYPE_OUT> x0_;
+  };
+}
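
Finally, to illustrate the minimal contract imposed by the solver base class above, a toy subclass sketch; identitySolver is hypothetical and exists only for this example.

// Sketch only: a trivial solver that returns a copy of its input.
#include "solver.h"
#include "hoNDArray.h"

namespace Gadgetron {

  class identitySolver : public solver< hoNDArray<float>, hoNDArray<float> >
  {
  public:
    virtual boost::shared_ptr< hoNDArray<float> > solve( hoNDArray<float>* in )
    {
      if( this->get_output_mode() >= OUTPUT_VERBOSE )
        GDEBUG_STREAM("identitySolver::solve : returning a copy of the input" << std::endl);
      return boost::shared_ptr< hoNDArray<float> >( new hoNDArray<float>(*in) );
    }
  };
}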

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/gadgetron.git



More information about the debian-med-commit mailing list